LCOV - code coverage report
Current view: top level - proxy/src/control_plane/client - neon.rs (source / functions) Coverage Total Hit
Test: 6fa910d1c9aea142e54ede6987809ef55544c500.info Lines: 7.5 % 280 21
Test Date: 2024-11-19 23:07:42 Functions: 10.8 % 37 4

            Line data    Source code
       1              : //! Production console backend.
       2              : 
       3              : use std::sync::Arc;
       4              : use std::time::Duration;
       5              : 
       6              : use ::http::header::AUTHORIZATION;
       7              : use ::http::HeaderName;
       8              : use futures::TryFutureExt;
       9              : use tokio::time::Instant;
      10              : use tokio_postgres::config::SslMode;
      11              : use tracing::{debug, info, info_span, warn, Instrument};
      12              : 
      13              : use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute};
      14              : use crate::auth::backend::jwt::AuthRule;
      15              : use crate::auth::backend::ComputeUserInfo;
      16              : use crate::cache::Cached;
      17              : use crate::context::RequestMonitoring;
      18              : use crate::control_plane::caches::ApiCaches;
      19              : use crate::control_plane::errors::{
      20              :     ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
      21              : };
      22              : use crate::control_plane::locks::ApiLocks;
      23              : use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
      24              : use crate::control_plane::{
      25              :     AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
      26              : };
      27              : use crate::metrics::{CacheOutcome, Metrics};
      28              : use crate::rate_limiter::WakeComputeRateLimiter;
      29              : use crate::types::{EndpointCacheKey, EndpointId};
      30              : use crate::{compute, http, scram};
      31              : 
/// Name of the HTTP header used to attach the request id to outgoing control plane requests.
const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
      33              : 
/// Client for the control plane HTTP API.
///
/// Bundles the HTTP endpoint with the shared caches, the per-endpoint locks and the
/// wake-compute rate limiter that the request methods in this file consult.
#[derive(Clone)]
pub struct NeonControlPlaneClient {
    /// HTTP endpoint used to build and execute the control plane requests.
    endpoint: http::Endpoint,
    /// Shared caches; this file reads `endpoints_cache`, `project_info` and `node_info`.
    pub caches: &'static ApiCaches,
    /// Locks keyed by endpoint cache key; a permit is acquired before a wake-compute request.
    pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
    /// Rate limiter checked per endpoint before each wake-compute request is sent.
    pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
    // put in a shared ref so we don't copy secrets all over in memory
    /// Token sent as `Authorization: Bearer <jwt>` with every request.
    jwt: Arc<str>,
}
      43              : 
      44              : impl NeonControlPlaneClient {
      45              :     /// Construct an API object containing the auth parameters.
      46            0 :     pub fn new(
      47            0 :         endpoint: http::Endpoint,
      48            0 :         jwt: Arc<str>,
      49            0 :         caches: &'static ApiCaches,
      50            0 :         locks: &'static ApiLocks<EndpointCacheKey>,
      51            0 :         wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
      52            0 :     ) -> Self {
      53            0 :         Self {
      54            0 :             endpoint,
      55            0 :             caches,
      56            0 :             locks,
      57            0 :             wake_compute_endpoint_rate_limiter,
      58            0 :             jwt,
      59            0 :         }
      60            0 :     }
      61              : 
      62            0 :     pub(crate) fn url(&self) -> &str {
      63            0 :         self.endpoint.url().as_str()
      64            0 :     }
      65              : 
      66            0 :     async fn do_get_auth_info(
      67            0 :         &self,
      68            0 :         ctx: &RequestMonitoring,
      69            0 :         user_info: &ComputeUserInfo,
      70            0 :     ) -> Result<AuthInfo, GetAuthInfoError> {
      71            0 :         if !self
      72            0 :             .caches
      73            0 :             .endpoints_cache
      74            0 :             .is_valid(ctx, &user_info.endpoint.normalize())
      75              :         {
      76            0 :             info!("endpoint is not valid, skipping the request");
      77            0 :             return Ok(AuthInfo::default());
      78            0 :         }
      79            0 :         let request_id = ctx.session_id().to_string();
      80            0 :         let application_name = ctx.console_application_name();
      81            0 :         async {
      82            0 :             let request = self
      83            0 :                 .endpoint
      84            0 :                 .get_path("proxy_get_role_secret")
      85            0 :                 .header(X_REQUEST_ID, &request_id)
      86            0 :                 .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
      87            0 :                 .query(&[("session_id", ctx.session_id())])
      88            0 :                 .query(&[
      89            0 :                     ("application_name", application_name.as_str()),
      90            0 :                     ("project", user_info.endpoint.as_str()),
      91            0 :                     ("role", user_info.user.as_str()),
      92            0 :                 ])
      93            0 :                 .build()?;
      94              : 
      95            0 :             info!(url = request.url().as_str(), "sending http request");
      96            0 :             let start = Instant::now();
      97            0 :             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
      98            0 :             let response = self.endpoint.execute(request).await?;
      99            0 :             drop(pause);
     100            0 :             info!(duration = ?start.elapsed(), "received http response");
     101            0 :             let body = match parse_body::<GetRoleSecret>(response).await {
     102            0 :                 Ok(body) => body,
     103              :                 // Error 404 is special: it's ok not to have a secret.
     104              :                 // TODO(anna): retry
     105            0 :                 Err(e) => {
     106            0 :                     return if e.get_reason().is_not_found() {
     107            0 :                         Ok(AuthInfo::default())
     108              :                     } else {
     109            0 :                         Err(e.into())
     110              :                     }
     111              :                 }
     112              :             };
     113              : 
     114            0 :             let secret = if body.role_secret.is_empty() {
     115            0 :                 None
     116              :             } else {
     117            0 :                 let secret = scram::ServerSecret::parse(&body.role_secret)
     118            0 :                     .map(AuthSecret::Scram)
     119            0 :                     .ok_or(GetAuthInfoError::BadSecret)?;
     120            0 :                 Some(secret)
     121              :             };
     122            0 :             let allowed_ips = body.allowed_ips.unwrap_or_default();
     123            0 :             Metrics::get()
     124            0 :                 .proxy
     125            0 :                 .allowed_ips_number
     126            0 :                 .observe(allowed_ips.len() as f64);
     127            0 :             Ok(AuthInfo {
     128            0 :                 secret,
     129            0 :                 allowed_ips,
     130            0 :                 project_id: body.project_id,
     131            0 :             })
     132            0 :         }
     133            0 :         .map_err(crate::error::log_error)
     134            0 :         .instrument(info_span!("http", id = request_id))
     135            0 :         .await
     136            0 :     }
     137              : 
     138            0 :     async fn do_get_endpoint_jwks(
     139            0 :         &self,
     140            0 :         ctx: &RequestMonitoring,
     141            0 :         endpoint: EndpointId,
     142            0 :     ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
     143            0 :         if !self
     144            0 :             .caches
     145            0 :             .endpoints_cache
     146            0 :             .is_valid(ctx, &endpoint.normalize())
     147              :         {
     148            0 :             return Err(GetEndpointJwksError::EndpointNotFound);
     149            0 :         }
     150            0 :         let request_id = ctx.session_id().to_string();
     151            0 :         async {
     152            0 :             let request = self
     153            0 :                 .endpoint
     154            0 :                 .get_with_url(|url| {
     155            0 :                     url.path_segments_mut()
     156            0 :                         .push("endpoints")
     157            0 :                         .push(endpoint.as_str())
     158            0 :                         .push("jwks");
     159            0 :                 })
     160            0 :                 .header(X_REQUEST_ID, &request_id)
     161            0 :                 .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
     162            0 :                 .query(&[("session_id", ctx.session_id())])
     163            0 :                 .build()
     164            0 :                 .map_err(GetEndpointJwksError::RequestBuild)?;
     165              : 
     166            0 :             info!(url = request.url().as_str(), "sending http request");
     167            0 :             let start = Instant::now();
     168            0 :             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
     169            0 :             let response = self
     170            0 :                 .endpoint
     171            0 :                 .execute(request)
     172            0 :                 .await
     173            0 :                 .map_err(GetEndpointJwksError::RequestExecute)?;
     174            0 :             drop(pause);
     175            0 :             info!(duration = ?start.elapsed(), "received http response");
     176              : 
     177            0 :             let body = parse_body::<EndpointJwksResponse>(response).await?;
     178              : 
     179            0 :             let rules = body
     180            0 :                 .jwks
     181            0 :                 .into_iter()
     182            0 :                 .map(|jwks| AuthRule {
     183            0 :                     id: jwks.id,
     184            0 :                     jwks_url: jwks.jwks_url,
     185            0 :                     audience: jwks.jwt_audience,
     186            0 :                     role_names: jwks.role_names,
     187            0 :                 })
     188            0 :                 .collect();
     189            0 : 
     190            0 :             Ok(rules)
     191            0 :         }
     192            0 :         .map_err(crate::error::log_error)
     193            0 :         .instrument(info_span!("http", id = request_id))
     194            0 :         .await
     195            0 :     }
     196              : 
     197            0 :     async fn do_wake_compute(
     198            0 :         &self,
     199            0 :         ctx: &RequestMonitoring,
     200            0 :         user_info: &ComputeUserInfo,
     201            0 :     ) -> Result<NodeInfo, WakeComputeError> {
     202            0 :         let request_id = ctx.session_id().to_string();
     203            0 :         let application_name = ctx.console_application_name();
     204            0 :         async {
     205            0 :             let mut request_builder = self
     206            0 :                 .endpoint
     207            0 :                 .get_path("proxy_wake_compute")
     208            0 :                 .header("X-Request-ID", &request_id)
     209            0 :                 .header("Authorization", format!("Bearer {}", &self.jwt))
     210            0 :                 .query(&[("session_id", ctx.session_id())])
     211            0 :                 .query(&[
     212            0 :                     ("application_name", application_name.as_str()),
     213            0 :                     ("project", user_info.endpoint.as_str()),
     214            0 :                 ]);
     215            0 : 
     216            0 :             let options = user_info.options.to_deep_object();
     217            0 :             if !options.is_empty() {
     218            0 :                 request_builder = request_builder.query(&options);
     219            0 :             }
     220              : 
     221            0 :             let request = request_builder.build()?;
     222              : 
     223            0 :             info!(url = request.url().as_str(), "sending http request");
     224            0 :             let start = Instant::now();
     225            0 :             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
     226            0 :             let response = self.endpoint.execute(request).await?;
     227            0 :             drop(pause);
     228            0 :             info!(duration = ?start.elapsed(), "received http response");
     229            0 :             let body = parse_body::<WakeCompute>(response).await?;
     230              : 
     231              :             // Unfortunately, ownership won't let us use `Option::ok_or` here.
     232            0 :             let (host, port) = match parse_host_port(&body.address) {
     233            0 :                 None => return Err(WakeComputeError::BadComputeAddress(body.address)),
     234            0 :                 Some(x) => x,
     235            0 :             };
     236            0 : 
     237            0 :             // Don't set anything but host and port! This config will be cached.
     238            0 :             // We'll set username and such later using the startup message.
     239            0 :             // TODO: add more type safety (in progress).
     240            0 :             let mut config = compute::ConnCfg::new();
     241            0 :             config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
     242            0 : 
     243            0 :             let node = NodeInfo {
     244            0 :                 config,
     245            0 :                 aux: body.aux,
     246            0 :                 allow_self_signed_compute: false,
     247            0 :             };
     248            0 : 
     249            0 :             Ok(node)
     250            0 :         }
     251            0 :         .map_err(crate::error::log_error)
     252            0 :         .instrument(info_span!("http", id = request_id))
     253            0 :         .await
     254            0 :     }
     255              : }
     256              : 
     257              : impl super::ControlPlaneApi for NeonControlPlaneClient {
     258            0 :     #[tracing::instrument(skip_all)]
     259              :     async fn get_role_secret(
     260              :         &self,
     261              :         ctx: &RequestMonitoring,
     262              :         user_info: &ComputeUserInfo,
     263              :     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
     264              :         let normalized_ep = &user_info.endpoint.normalize();
     265              :         let user = &user_info.user;
     266              :         if let Some(role_secret) = self
     267              :             .caches
     268              :             .project_info
     269              :             .get_role_secret(normalized_ep, user)
     270              :         {
     271              :             return Ok(role_secret);
     272              :         }
     273              :         let auth_info = self.do_get_auth_info(ctx, user_info).await?;
     274              :         if let Some(project_id) = auth_info.project_id {
     275              :             let normalized_ep_int = normalized_ep.into();
     276              :             self.caches.project_info.insert_role_secret(
     277              :                 project_id,
     278              :                 normalized_ep_int,
     279              :                 user.into(),
     280              :                 auth_info.secret.clone(),
     281              :             );
     282              :             self.caches.project_info.insert_allowed_ips(
     283              :                 project_id,
     284              :                 normalized_ep_int,
     285              :                 Arc::new(auth_info.allowed_ips),
     286              :             );
     287              :             ctx.set_project_id(project_id);
     288              :         }
     289              :         // When we just got a secret, we don't need to invalidate it.
     290              :         Ok(Cached::new_uncached(auth_info.secret))
     291              :     }
     292              : 
     293            0 :     async fn get_allowed_ips_and_secret(
     294            0 :         &self,
     295            0 :         ctx: &RequestMonitoring,
     296            0 :         user_info: &ComputeUserInfo,
     297            0 :     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
     298            0 :         let normalized_ep = &user_info.endpoint.normalize();
     299            0 :         if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
     300            0 :             Metrics::get()
     301            0 :                 .proxy
     302            0 :                 .allowed_ips_cache_misses
     303            0 :                 .inc(CacheOutcome::Hit);
     304            0 :             return Ok((allowed_ips, None));
     305            0 :         }
     306            0 :         Metrics::get()
     307            0 :             .proxy
     308            0 :             .allowed_ips_cache_misses
     309            0 :             .inc(CacheOutcome::Miss);
     310            0 :         let auth_info = self.do_get_auth_info(ctx, user_info).await?;
     311            0 :         let allowed_ips = Arc::new(auth_info.allowed_ips);
     312            0 :         let user = &user_info.user;
     313            0 :         if let Some(project_id) = auth_info.project_id {
     314            0 :             let normalized_ep_int = normalized_ep.into();
     315            0 :             self.caches.project_info.insert_role_secret(
     316            0 :                 project_id,
     317            0 :                 normalized_ep_int,
     318            0 :                 user.into(),
     319            0 :                 auth_info.secret.clone(),
     320            0 :             );
     321            0 :             self.caches.project_info.insert_allowed_ips(
     322            0 :                 project_id,
     323            0 :                 normalized_ep_int,
     324            0 :                 allowed_ips.clone(),
     325            0 :             );
     326            0 :             ctx.set_project_id(project_id);
     327            0 :         }
     328            0 :         Ok((
     329            0 :             Cached::new_uncached(allowed_ips),
     330            0 :             Some(Cached::new_uncached(auth_info.secret)),
     331            0 :         ))
     332            0 :     }
     333              : 
    /// Fetch the JWKS-based auth rules for `endpoint`.
    ///
    /// Traced wrapper that delegates to `do_get_endpoint_jwks` and returns its
    /// result unchanged.
    #[tracing::instrument(skip_all)]
    async fn get_endpoint_jwks(
        &self,
        ctx: &RequestMonitoring,
        endpoint: EndpointId,
    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
        self.do_get_endpoint_jwks(ctx, endpoint).await
    }
     342              : 
    /// Return connection info for the compute node of `user_info`, waking it through
    /// the control plane when no cached entry exists.
    ///
    /// Steps, in order:
    /// 1. Look up the node info cache; a cached entry is either node info or a
    ///    previously cached control plane error, and both are returned as-is.
    /// 2. Acquire a permit for the endpoint key and, if the permit says so, check the
    ///    cache again.
    /// 3. Check the wake-compute rate limiter (`WakeComputeError::TooManyConnections`
    ///    when it refuses).
    /// 4. Call `do_wake_compute` and cache the outcome: successes are stored with
    ///    `ColdStartInfo::WarmCached`, non-retryable control plane errors are stored
    ///    for 30 seconds, everything else is returned without caching.
    #[tracing::instrument(skip_all)]
    async fn wake_compute(
        &self,
        ctx: &RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
        let key = user_info.endpoint_cache_key();

        // Returns from `wake_compute` when the cache holds an entry for `key`:
        // `Err` for a cached control plane error, `Ok` for cached node info.
        // Falls through when there is no entry.
        macro_rules! check_cache {
            () => {
                if let Some(cached) = self.caches.node_info.get(&key) {
                    let (cached, info) = cached.take_value();
                    let info = info.map_err(|c| {
                        info!(key = &*key, "found cached wake_compute error");
                        WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
                    })?;

                    debug!(key = &*key, "found cached compute node info");
                    ctx.set_project(info.aux.clone());
                    return Ok(cached.map(|()| info));
                }
            };
        }

        // Every time we do a wakeup http request, the compute node will stay up
        // for some time (highly depends on the console's scale-to-zero policy);
        // The connection info remains the same during that period of time,
        // which means that we might cache it to reduce the load and latency.
        check_cache!();

        let permit = self.locks.get_permit(&key).await?;

        // after getting back a permit - it's possible the cache was filled
        // double check
        if permit.should_check_cache() {
            check_cache!();
        }

        // check rate limit
        if !self
            .wake_compute_endpoint_rate_limiter
            .check(user_info.endpoint.normalize_intern(), 1)
        {
            return Err(WakeComputeError::TooManyConnections);
        }

        // The request result is handed to the permit before it is inspected here.
        let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
        match node {
            Ok(node) => {
                ctx.set_project(node.aux.clone());
                debug!(key = &*key, "created a cache entry for woken compute node");

                let mut stored_node = node.clone();
                // store the cached node as 'warm_cached'
                stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;

                let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node));

                // The caller gets the freshly woken node (with its original cold start
                // info), not the `WarmCached` copy stored in the cache.
                Ok(cached.map(|()| node))
            }
            Err(err) => match err {
                WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
                    // Without a status there is no reason to inspect: return uncached.
                    let Some(status) = &err.status else {
                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
                            err,
                        )));
                    };

                    // A missing error info is treated as `Reason::Unknown`.
                    let reason = status
                        .details
                        .error_info
                        .map_or(Reason::Unknown, |x| x.reason);

                    // if we can retry this error, do not cache it.
                    if reason.can_retry() {
                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
                            err,
                        )));
                    }

                    // at this point, we should only have quota errors.
                    debug!(
                        key = &*key,
                        "created a cache entry for the wake compute error"
                    );

                    // Cache the error for 30 seconds.
                    self.caches.node_info.insert_ttl(
                        key,
                        Err(err.clone()),
                        Duration::from_secs(30),
                    );

                    Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
                        err,
                    )))
                }
                // Any other error kind is returned without caching.
                err => return Err(err),
            },
        }
    }
     443              : }
     444              : 
     445              : /// Parse http response body, taking status code into account.
     446            0 : async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
     447            0 :     response: http::Response,
     448            0 : ) -> Result<T, ControlPlaneError> {
     449            0 :     let status = response.status();
     450            0 :     if status.is_success() {
     451              :         // We shouldn't log raw body because it may contain secrets.
     452            0 :         info!("request succeeded, processing the body");
     453            0 :         return Ok(response.json().await?);
     454            0 :     }
     455            0 :     let s = response.bytes().await?;
     456              :     // Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
     457            0 :     info!("response_error plaintext: {:?}", s);
     458              : 
     459              :     // Don't throw an error here because it's not as important
     460              :     // as the fact that the request itself has failed.
     461            0 :     let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
     462            0 :         warn!("failed to parse error body: {e}");
     463            0 :         ControlPlaneErrorMessage {
     464            0 :             error: "reason unclear (malformed error message)".into(),
     465            0 :             http_status_code: status,
     466            0 :             status: None,
     467            0 :         }
     468            0 :     });
     469            0 :     body.http_status_code = status;
     470            0 : 
     471            0 :     warn!("console responded with an error ({status}): {body:?}");
     472            0 :     Err(ControlPlaneError::Message(Box::new(body)))
     473            0 : }
     474              : 
     475            3 : fn parse_host_port(input: &str) -> Option<(&str, u16)> {
     476            3 :     let (host, port) = input.rsplit_once(':')?;
     477            3 :     let ipv6_brackets: &[_] = &['[', ']'];
     478            3 :     Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
     479            3 : }
     480              : 
     481              : #[cfg(test)]
     482              : mod tests {
     483              :     use super::*;
     484              : 
     485              :     #[test]
     486            1 :     fn test_parse_host_port_v4() {
     487            1 :         let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
     488            1 :         assert_eq!(host, "127.0.0.1");
     489            1 :         assert_eq!(port, 5432);
     490            1 :     }
     491              : 
     492              :     #[test]
     493            1 :     fn test_parse_host_port_v6() {
     494            1 :         let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
     495            1 :         assert_eq!(host, "2001:db8::1");
     496            1 :         assert_eq!(port, 5432);
     497            1 :     }
     498              : 
     499              :     #[test]
     500            1 :     fn test_parse_host_port_url() {
     501            1 :         let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
     502            1 :             .expect("failed to parse");
     503            1 :         assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
     504            1 :         assert_eq!(port, 5432);
     505            1 :     }
     506              : }
        

Generated by: LCOV version 2.1-beta