LCOV - befab6bd7a1d43d9d0f7dc96f75bdc872dffa42a.info

LCOV - code coverage report

Current view:	top level - proxy/src/binary - proxy.rs (source / functions)		Coverage	Total	Hit
Test:	befab6bd7a1d43d9d0f7dc96f75bdc872dffa42a.info	Lines:	7.1 %	562	40
Test Date:	2025-06-14 11:52:36	Functions:	17.2 %	151	26

            Line data    Source code

       1              : #[cfg(any(test, feature = "testing"))]
       2              : use std::env;
       3              : use std::net::SocketAddr;
       4              : use std::path::PathBuf;
       5              : use std::pin::pin;
       6              : use std::sync::Arc;
       7              : use std::time::Duration;
       8              : 
       9              : #[cfg(any(test, feature = "testing"))]
      10              : use anyhow::Context;
      11              : use anyhow::{bail, ensure};
      12              : use arc_swap::ArcSwapOption;
      13              : use futures::future::Either;
      14              : use itertools::{Itertools, Position};
      15              : use rand::{Rng, thread_rng};
      16              : use remote_storage::RemoteStorageConfig;
      17              : use tokio::net::TcpListener;
      18              : use tokio::task::JoinSet;
      19              : use tokio_util::sync::CancellationToken;
      20              : use tracing::{Instrument, error, info, warn};
      21              : use utils::sentry_init::init_sentry;
      22              : use utils::{project_build_tag, project_git_version};
      23              : 
      24              : use crate::auth::backend::jwt::JwkCache;
      25              : use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned};
      26              : use crate::cancellation::{CancellationHandler, handle_cancel_messages};
      27              : use crate::config::{
      28              :     self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions,
      29              :     ProxyConfig, ProxyProtocolV2, remote_storage_from_toml,
      30              : };
      31              : use crate::context::parquet::ParquetUploadArgs;
      32              : use crate::http::health_server::AppMetrics;
      33              : use crate::metrics::Metrics;
      34              : use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
      35              : use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
      36              : use crate::redis::kv_ops::RedisKVClient;
      37              : use crate::redis::{elasticache, notifications};
      38              : use crate::scram::threadpool::ThreadPool;
      39              : use crate::serverless::GlobalConnPoolOptions;
      40              : use crate::serverless::cancel_set::CancelSet;
      41              : use crate::tls::client_config::compute_client_config_with_root_certs;
      42              : #[cfg(any(test, feature = "testing"))]
      43              : use crate::url::ApiUrl;
      44              : use crate::{auth, control_plane, http, serverless, usage_metrics};
      45              : 
      46              : project_git_version!(GIT_VERSION);
      47              : project_build_tag!(BUILD_TAG);
      48              : 
      49              : use clap::{Parser, ValueEnum};
      50              : 
      51              : #[derive(Clone, Debug, ValueEnum)]
      52              : #[clap(rename_all = "kebab-case")]
      53              : enum AuthBackendType {
      54              :     #[clap(alias("cplane-v1"))]
      55              :     ControlPlane,
      56              : 
      57              :     #[clap(alias("link"))]
      58              :     ConsoleRedirect,
      59              : 
      60              :     #[cfg(any(test, feature = "testing"))]
      61              :     Postgres,
      62              : }
      63              : 
      64              : /// Neon proxy/router
      65              : #[derive(Parser)]
      66              : #[command(version = GIT_VERSION, about)]
      67              : struct ProxyCliArgs {
      68              :     /// Name of the region this proxy is deployed in
      69            1 :     #[clap(long, default_value_t = String::new())]
      70            0 :     region: String,
      71              :     /// listen for incoming client connections on ip:port
      72              :     #[clap(short, long, default_value = "127.0.0.1:4432")]
      73            0 :     proxy: SocketAddr,
      74            1 :     #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
      75            0 :     auth_backend: AuthBackendType,
      76              :     /// listen for management callback connection on ip:port
      77              :     #[clap(short, long, default_value = "127.0.0.1:7000")]
      78            0 :     mgmt: SocketAddr,
      79              :     /// listen for incoming http connections (metrics, etc) on ip:port
      80              :     #[clap(long, default_value = "127.0.0.1:7001")]
      81            0 :     http: SocketAddr,
      82              :     /// listen for incoming wss connections on ip:port
      83              :     #[clap(long)]
      84              :     wss: Option<SocketAddr>,
      85              :     /// redirect unauthenticated users to the given uri in case of console redirect auth
      86              :     #[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
      87            0 :     uri: String,
      88              :     /// cloud API endpoint for authenticating users
      89              :     #[clap(
      90              :         short,
      91              :         long,
      92              :         default_value = "http://localhost:3000/authenticate_proxy_request/"
      93              :     )]
      94            0 :     auth_endpoint: String,
      95              :     /// JWT used to connect to control plane.
      96              :     #[clap(
      97              :         long,
      98              :         value_name = "JWT",
      99              :         default_value = "",
     100              :         env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN"
     101              :     )]
     102            0 :     control_plane_token: Arc<str>,
     103              :     /// if this is not local proxy, this toggles whether we accept jwt or passwords for http
     104            1 :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     105            0 :     is_auth_broker: bool,
     106              :     /// path to TLS key for client postgres connections
     107              :     ///
     108              :     /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
     109              :     #[clap(short = 'k', long, alias = "ssl-key")]
     110              :     tls_key: Option<PathBuf>,
     111              :     /// path to TLS cert for client postgres connections
     112              :     ///
     113              :     /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
     114              :     #[clap(short = 'c', long, alias = "ssl-cert")]
     115              :     tls_cert: Option<PathBuf>,
     116              :     /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`.
     117              :     #[clap(long, alias = "allow-ssl-keylogfile")]
     118            0 :     allow_tls_keylogfile: bool,
     119              :     /// path to directory with TLS certificates for client postgres connections
     120              :     #[clap(long)]
     121              :     certs_dir: Option<PathBuf>,
     122              :     /// timeout for the TLS handshake
     123              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     124            0 :     handshake_timeout: tokio::time::Duration,
     125              :     /// http endpoint to receive periodic metric updates
     126              :     #[clap(long)]
     127              :     metric_collection_endpoint: Option<String>,
     128              :     /// how often metrics should be sent to a collection endpoint
     129              :     #[clap(long)]
     130              :     metric_collection_interval: Option<String>,
     131              :     /// cache for `wake_compute` api method (use `size=0` to disable)
     132              :     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     133            0 :     wake_compute_cache: String,
     134              :     /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
     135              :     #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
     136            0 :     wake_compute_lock: String,
     137              :     /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
     138              :     #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
     139            0 :     connect_compute_lock: String,
     140              :     #[clap(flatten)]
     141              :     sql_over_http: SqlOverHttpArgs,
     142              :     /// timeout for scram authentication protocol
     143              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     144            0 :     scram_protocol_timeout: tokio::time::Duration,
     145              :     /// size of the threadpool for password hashing
     146            1 :     #[clap(long, default_value_t = 4)]
     147            0 :     scram_thread_pool_size: u8,
     148              :     /// Endpoint rate limiter max number of requests per second.
     149              :     ///
     150              :     /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
     151              :     /// Can be given multiple times for different bucket sizes.
     152            4 :     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
     153            1 :     endpoint_rps_limit: Vec<RateBucketInfo>,
     154              :     /// Wake compute rate limiter max number of requests per second.
     155            4 :     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
     156            1 :     wake_compute_limit: Vec<RateBucketInfo>,
     157              :     /// Redis rate limiter max number of requests per second.
     158            3 :     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
     159            1 :     redis_rps_limit: Vec<RateBucketInfo>,
     160              :     /// Cancellation channel size (max queue size for redis kv client)
     161            1 :     #[clap(long, default_value_t = 1024)]
     162            0 :     cancellation_ch_size: usize,
     163              :     /// Cancellation ops batch size for redis
     164            1 :     #[clap(long, default_value_t = 8)]
     165            0 :     cancellation_batch_size: usize,
     166              :     /// cache for `allowed_ips` (use `size=0` to disable)
     167              :     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     168            0 :     allowed_ips_cache: String,
     169              :     /// cache for `role_secret` (use `size=0` to disable)
     170              :     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     171            0 :     role_secret_cache: String,
     172              :     /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
     173              :     #[clap(long)]
     174              :     redis_notifications: Option<String>,
     175              :     /// which of the available authentication types to use for the regional redis. Supported values are "irsa" and "plain".
     176              :     #[clap(long, default_value = "irsa")]
     177            0 :     redis_auth_type: String,
     178              :     /// redis host for streaming connections (might be different from the notifications host)
     179              :     #[clap(long)]
     180              :     redis_host: Option<String>,
     181              :     /// redis port for streaming connections (might be different from the notifications host)
     182              :     #[clap(long)]
     183              :     redis_port: Option<u16>,
     184              :     /// redis cluster name, used in aws elasticache
     185              :     #[clap(long)]
     186              :     redis_cluster_name: Option<String>,
     187              :     /// redis user_id, used in aws elasticache
     188              :     #[clap(long)]
     189              :     redis_user_id: Option<String>,
     190              :     /// aws region to retrieve credentials
     191            1 :     #[clap(long, default_value_t = String::new())]
     192            0 :     aws_region: String,
     193              :     /// cache for `project_info` (use `size=0` to disable)
     194              :     #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
     195            0 :     project_info_cache: String,
     196              :     /// cache for all valid endpoints
     197              :     #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
     198            0 :     endpoint_cache_config: String,
     199              :     #[clap(flatten)]
     200              :     parquet_upload: ParquetUploadArgs,
     201              : 
     202              :     /// interval for backup metric collection
     203              :     #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
     204            0 :     metric_backup_collection_interval: std::time::Duration,
     205              :     /// remote storage configuration for backup metric collection
     206              :     /// Encoded as toml (same format as pageservers), eg
     207              :     /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
     208              :     #[clap(long, value_parser = remote_storage_from_toml)]
     209              :     metric_backup_collection_remote_storage: Option<RemoteStorageConfig>,
     210              :     /// chunk size for backup metric collection
     211              :     /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
     212              :     #[clap(long, default_value = "4194304")]
     213            0 :     metric_backup_collection_chunk_size: usize,
     214              :     /// Whether to retry the connection to the compute node
     215              :     #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
     216            0 :     connect_to_compute_retry: String,
     217              :     /// Whether to retry the wake_compute request
     218              :     #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
     219            0 :     wake_compute_retry: String,
     220              : 
     221              :     /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist
     222            1 :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     223            0 :     is_private_access_proxy: bool,
     224              : 
     225              :     /// Configure whether all incoming requests have a Proxy Protocol V2 packet.
     226            1 :     #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Rejected)]
     227            0 :     proxy_protocol_v2: ProxyProtocolV2,
     228              : 
     229              :     /// Time the proxy waits for the webauth session to be confirmed by the control plane.
     230              :     // TODO: rename to `console_redirect_confirmation_timeout`.
     231              :     #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
     232            0 :     webauth_confirmation_timeout: std::time::Duration,
     233              : 
     234              :     #[clap(flatten)]
     235              :     pg_sni_router: PgSniRouterArgs,
     236              : }
     237              : 
     238              : #[derive(clap::Args, Clone, Copy, Debug)]
     239              : struct SqlOverHttpArgs {
     240              :     /// timeout for http connection requests
     241              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     242            0 :     sql_over_http_timeout: tokio::time::Duration,
     243              : 
     244              :     /// Whether the SQL over http pool is opt-in
     245            1 :     #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     246            0 :     sql_over_http_pool_opt_in: bool,
     247              : 
     248              :     /// How many connections to pool for each endpoint. Excess connections are discarded
     249            1 :     #[clap(long, default_value_t = 20)]
     250            0 :     sql_over_http_pool_max_conns_per_endpoint: usize,
     251              : 
     252              :     /// How many connections to pool in total across all endpoints. Excess connections are discarded
     253            1 :     #[clap(long, default_value_t = 20000)]
     254            0 :     sql_over_http_pool_max_total_conns: usize,
     255              : 
     256              :     /// How long pooled connections should remain idle for before closing
     257              :     #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
     258            0 :     sql_over_http_idle_timeout: tokio::time::Duration,
     259              : 
     260              :     /// Duration each shard will wait on average before a GC sweep.
     261              :     /// A longer time will cause sweeps to take longer but will interfere less frequently.
     262              :     #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
     263            0 :     sql_over_http_pool_gc_epoch: tokio::time::Duration,
     264              : 
     265              :     /// How many shards should the global pool have. Must be a power of two.
     266              :     /// More shards will introduce less contention for pool operations, but can
     267              :     /// increase memory used by the pool
     268            1 :     #[clap(long, default_value_t = 128)]
     269            0 :     sql_over_http_pool_shards: usize,
     270              : 
     271            1 :     #[clap(long, default_value_t = 10000)]
     272            0 :     sql_over_http_client_conn_threshold: u64,
     273              : 
     274            1 :     #[clap(long, default_value_t = 64)]
     275            0 :     sql_over_http_cancel_set_shards: usize,
     276              : 
     277            1 :     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
     278            0 :     sql_over_http_max_request_size_bytes: usize,
     279              : 
     280            1 :     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
     281            0 :     sql_over_http_max_response_size_bytes: usize,
     282              : }
     283              : 
     284              : #[derive(clap::Args, Clone, Debug)]
     285              : struct PgSniRouterArgs {
     286              :     /// listen for incoming client connections on ip:port
     287              :     #[clap(id = "sni-router-listen", long, default_value = "127.0.0.1:4432")]
     288            0 :     listen: SocketAddr,
     289              :     /// listen for incoming client connections on ip:port, requiring TLS to compute
     290              :     #[clap(id = "sni-router-listen-tls", long, default_value = "127.0.0.1:4433")]
     291            0 :     listen_tls: SocketAddr,
     292              :     /// path to TLS key for client postgres connections
     293              :     #[clap(id = "sni-router-tls-key", long)]
     294              :     tls_key: Option<PathBuf>,
     295              :     /// path to TLS cert for client postgres connections
     296              :     #[clap(id = "sni-router-tls-cert", long)]
     297              :     tls_cert: Option<PathBuf>,
     298              :     /// append this domain zone to the SNI hostname to get the destination address
     299              :     #[clap(id = "sni-router-destination", long)]
     300              :     dest: Option<String>,
     301              : }
     302              : 
     303            0 : pub async fn run() -> anyhow::Result<()> {
     304            0 :     let _logging_guard = crate::logging::init().await?;
     305            0 :     let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
     306            0 :     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
     307            0 : 
     308            0 :     // TODO: refactor these to use labels
     309            0 :     info!("Version: {GIT_VERSION}");
     310            0 :     info!("Build_tag: {BUILD_TAG}");
     311            0 :     let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
     312            0 :         revision: GIT_VERSION,
     313            0 :         build_tag: BUILD_TAG,
     314            0 :     });
     315              : 
     316            0 :     let jemalloc = match crate::jemalloc::MetricRecorder::new() {
     317            0 :         Ok(t) => Some(t),
     318            0 :         Err(e) => {
     319            0 :             error!(error = ?e, "could not start jemalloc metrics loop");
     320            0 :             None
     321              :         }
     322              :     };
     323              : 
     324            0 :     let args = ProxyCliArgs::parse();
     325            0 :     let config = build_config(&args)?;
     326            0 :     let auth_backend = build_auth_backend(&args)?;
     327              : 
     328            0 :     match auth_backend {
     329            0 :         Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"),
     330            0 :         Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
     331              :     }
     332            0 :     info!("Using region: {}", args.aws_region);
     333            0 :     let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?;
     334              : 
     335              :     // Check that we can bind to address before further initialization
     336            0 :     info!("Starting http on {}", args.http);
     337            0 :     let http_listener = TcpListener::bind(args.http).await?.into_std()?;
     338              : 
     339            0 :     info!("Starting mgmt on {}", args.mgmt);
     340            0 :     let mgmt_listener = TcpListener::bind(args.mgmt).await?;
     341              : 
     342            0 :     let proxy_listener = if args.is_auth_broker {
     343            0 :         None
     344              :     } else {
     345            0 :         info!("Starting proxy on {}", args.proxy);
     346            0 :         Some(TcpListener::bind(args.proxy).await?)
     347              :     };
     348              : 
     349            0 :     let sni_router_listeners = {
     350            0 :         let args = &args.pg_sni_router;
     351            0 :         if args.dest.is_some() {
     352            0 :             ensure!(
     353            0 :                 args.tls_key.is_some(),
     354            0 :                 "sni-router-tls-key must be provided"
     355              :             );
     356            0 :             ensure!(
     357            0 :                 args.tls_cert.is_some(),
     358            0 :                 "sni-router-tls-cert must be provided"
     359              :             );
     360              : 
     361            0 :             info!(
     362            0 :                 "Starting pg-sni-router on {} and {}",
     363              :                 args.listen, args.listen_tls
     364              :             );
     365              : 
     366              :             Some((
     367            0 :                 TcpListener::bind(args.listen).await?,
     368            0 :                 TcpListener::bind(args.listen_tls).await?,
     369              :             ))
     370              :         } else {
     371            0 :             None
     372              :         }
     373              :     };
     374              : 
     375              :     // TODO: rename the argument to something like serverless.
     376              :     // It now covers more than just websockets, it also covers SQL over HTTP.
     377            0 :     let serverless_listener = if let Some(serverless_address) = args.wss {
     378            0 :         info!("Starting wss on {serverless_address}");
     379            0 :         Some(TcpListener::bind(serverless_address).await?)
     380            0 :     } else if args.is_auth_broker {
     381            0 :         bail!("wss arg must be present for auth-broker")
     382              :     } else {
     383            0 :         None
     384              :     };
     385              : 
     386            0 :     let cancellation_token = CancellationToken::new();
     387            0 : 
     388            0 :     let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
     389            0 :     RateBucketInfo::validate(redis_rps_limit)?;
     390              : 
     391            0 :     let redis_kv_client = regional_redis_client
     392            0 :         .as_ref()
     393            0 :         .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit));
     394            0 : 
     395            0 :     // channel size should be higher than redis client limit to avoid blocking
     396            0 :     let cancel_ch_size = args.cancellation_ch_size;
     397            0 :     let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size);
     398            0 :     let cancellation_handler = Arc::new(CancellationHandler::new(
     399            0 :         &config.connect_to_compute,
     400            0 :         Some(tx_cancel),
     401            0 :     ));
     402            0 : 
     403            0 :     let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
     404            0 :         RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit)
     405            0 :             .unwrap_or(EndpointRateLimiter::DEFAULT),
     406            0 :         64,
     407            0 :     ));
     408            0 : 
     409            0 :     // client facing tasks. these will exit on error or on cancellation
     410            0 :     // cancellation returns Ok(())
     411            0 :     let mut client_tasks = JoinSet::new();
     412            0 :     match auth_backend {
     413            0 :         Either::Left(auth_backend) => {
     414            0 :             if let Some(proxy_listener) = proxy_listener {
     415            0 :                 client_tasks.spawn(crate::proxy::task_main(
     416            0 :                     config,
     417            0 :                     auth_backend,
     418            0 :                     proxy_listener,
     419            0 :                     cancellation_token.clone(),
     420            0 :                     cancellation_handler.clone(),
     421            0 :                     endpoint_rate_limiter.clone(),
     422            0 :                 ));
     423            0 :             }
     424              : 
     425            0 :             if let Some(serverless_listener) = serverless_listener {
     426            0 :                 client_tasks.spawn(serverless::task_main(
     427            0 :                     config,
     428            0 :                     auth_backend,
     429            0 :                     serverless_listener,
     430            0 :                     cancellation_token.clone(),
     431            0 :                     cancellation_handler.clone(),
     432            0 :                     endpoint_rate_limiter.clone(),
     433            0 :                 ));
     434            0 :             }
     435              :         }
     436            0 :         Either::Right(auth_backend) => {
     437            0 :             if let Some(proxy_listener) = proxy_listener {
     438            0 :                 client_tasks.spawn(crate::console_redirect_proxy::task_main(
     439            0 :                     config,
     440            0 :                     auth_backend,
     441            0 :                     proxy_listener,
     442            0 :                     cancellation_token.clone(),
     443            0 :                     cancellation_handler.clone(),
     444            0 :                 ));
     445            0 :             }
     446              :         }
     447              :     }
     448              : 
     449              :     // spawn pg-sni-router mode.
     450            0 :     if let Some((listen, listen_tls)) = sni_router_listeners {
     451            0 :         let args = args.pg_sni_router;
     452            0 :         let dest = args.dest.expect("already asserted it is set");
     453            0 :         let key_path = args.tls_key.expect("already asserted it is set");
     454            0 :         let cert_path = args.tls_cert.expect("already asserted it is set");
     455              : 
     456            0 :         let tls_config = super::pg_sni_router::parse_tls(&key_path, &cert_path)?;
     457              : 
     458            0 :         let dest = Arc::new(dest);
     459            0 : 
     460            0 :         client_tasks.spawn(super::pg_sni_router::task_main(
     461            0 :             dest.clone(),
     462            0 :             tls_config.clone(),
     463            0 :             None,
     464            0 :             listen,
     465            0 :             cancellation_token.clone(),
     466            0 :         ));
     467            0 : 
     468            0 :         client_tasks.spawn(super::pg_sni_router::task_main(
     469            0 :             dest,
     470            0 :             tls_config,
     471            0 :             Some(config.connect_to_compute.tls.clone()),
     472            0 :             listen_tls,
     473            0 :             cancellation_token.clone(),
     474            0 :         ));
     475            0 :     }
     476              : 
     477            0 :     client_tasks.spawn(crate::context::parquet::worker(
     478            0 :         cancellation_token.clone(),
     479            0 :         args.parquet_upload,
     480            0 :     ));
     481            0 : 
     482            0 :     // maintenance tasks. these never return unless there's an error
     483            0 :     let mut maintenance_tasks = JoinSet::new();
     484            0 :     maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), || {}));
     485            0 :     maintenance_tasks.spawn(http::health_server::task_main(
     486            0 :         http_listener,
     487            0 :         AppMetrics {
     488            0 :             jemalloc,
     489            0 :             neon_metrics,
     490            0 :             proxy: crate::metrics::Metrics::get(),
     491            0 :         },
     492            0 :     ));
     493            0 :     maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener));
     494              : 
     495            0 :     if let Some(metrics_config) = &config.metric_collection {
     496            0 :         // TODO: Add gc regardless of the metric collection being enabled.
     497            0 :         maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
     498            0 :     }
     499              : 
     500              :     #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))]
     501            0 :     if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend {
     502            0 :         if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
     503            0 :             match (redis_notifications_client, regional_redis_client.clone()) {
     504            0 :                 (None, None) => {}
     505            0 :                 (client1, client2) => {
     506            0 :                     let cache = api.caches.project_info.clone();
     507            0 :                     if let Some(client) = client1 {
     508            0 :                         maintenance_tasks.spawn(notifications::task_main(
     509            0 :                             client,
     510            0 :                             cache.clone(),
     511            0 :                             args.region.clone(),
     512            0 :                         ));
     513            0 :                     }
     514            0 :                     if let Some(client) = client2 {
     515            0 :                         maintenance_tasks.spawn(notifications::task_main(
     516            0 :                             client,
     517            0 :                             cache.clone(),
     518            0 :                             args.region.clone(),
     519            0 :                         ));
     520            0 :                     }
     521            0 :                     maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
     522            0 :                 }
     523              :             }
     524              : 
     525              :             // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
     526              :             // This prevents immediate exit and pod restart,
     527              :             // which can cause hammering of the redis in case of connection issues.
     528            0 :             if let Some(mut redis_kv_client) = redis_kv_client {
     529            0 :                 for attempt in (0..3).with_position() {
     530            0 :                     match redis_kv_client.try_connect().await {
     531              :                         Ok(()) => {
     532            0 :                             info!("Connected to Redis KV client");
     533            0 :                             maintenance_tasks.spawn(async move {
     534            0 :                                 handle_cancel_messages(
     535            0 :                                     &mut redis_kv_client,
     536            0 :                                     rx_cancel,
     537            0 :                                     args.cancellation_batch_size,
     538            0 :                                 )
     539            0 :                                 .await?;
     540              : 
     541            0 :                                 drop(redis_kv_client);
     542            0 : 
     543            0 :                                 // `handle_cancel_messages` was terminated due to the tx_cancel
     544            0 :                                 // being dropped. this is not worthy of an error, and this task can only return `Err`,
     545            0 :                                 // so let's wait forever instead.
     546            0 :                                 std::future::pending().await
     547            0 :                             });
     548            0 :                             break;
     549              :                         }
     550            0 :                         Err(e) => {
     551            0 :                             error!("Failed to connect to Redis KV client: {e}");
     552            0 :                             if matches!(attempt, Position::Last(_)) {
     553            0 :                                 bail!(
     554            0 :                                     "Failed to connect to Redis KV client after {} attempts",
     555            0 :                                     attempt.into_inner()
     556            0 :                                 );
     557            0 :                             }
     558            0 :                             let jitter = thread_rng().gen_range(0..100);
     559            0 :                             tokio::time::sleep(Duration::from_millis(1000 + jitter)).await;
     560              :                         }
     561              :                     }
     562              :                 }
     563            0 :             }
     564              : 
     565            0 :             if let Some(regional_redis_client) = regional_redis_client {
     566            0 :                 let cache = api.caches.endpoints_cache.clone();
     567            0 :                 let con = regional_redis_client;
     568            0 :                 let span = tracing::info_span!("endpoints_cache");
     569            0 :                 maintenance_tasks.spawn(
     570            0 :                     async move { cache.do_read(con, cancellation_token.clone()).await }
     571            0 :                         .instrument(span),
     572            0 :                 );
     573            0 :             }
     574            0 :         }
     575            0 :     }
     576              : 
     577              :     let maintenance = loop {
     578              :         // get one complete task
     579            0 :         match futures::future::select(
     580            0 :             pin!(maintenance_tasks.join_next()),
     581            0 :             pin!(client_tasks.join_next()),
     582            0 :         )
     583            0 :         .await
     584              :         {
     585              :             // exit immediately on maintenance task completion
     586            0 :             Either::Left((Some(res), _)) => break crate::error::flatten_err(res)?,
     587              :             // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
     588            0 :             Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
     589              :             // exit immediately on client task error
     590            0 :             Either::Right((Some(res), _)) => crate::error::flatten_err(res)?,
     591              :             // exit if all our client tasks have shutdown gracefully
     592            0 :             Either::Right((None, _)) => return Ok(()),
     593              :         }
     594              :     };
     595              : 
     596              :     // maintenance tasks return Infallible success values, this is an impossible value
     597              :     // so this match statically ensures that there are no possibilities for that value
     598              :     match maintenance {}
     599            0 : }
     600              : 
     601              : /// ProxyConfig is created at proxy startup, and lives forever.
     602            0 : fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     603            0 :     let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
     604            0 :     Metrics::install(thread_pool.metrics.clone());
     605              : 
     606            0 :     let tls_config = match (&args.tls_key, &args.tls_cert) {
     607            0 :         (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
     608            0 :             key_path,
     609            0 :             cert_path,
     610            0 :             args.certs_dir.as_deref(),
     611            0 :             args.allow_tls_keylogfile,
     612            0 :         )?),
     613            0 :         (None, None) => None,
     614            0 :         _ => bail!("either both or neither tls-key and tls-cert must be specified"),
     615              :     };
     616            0 :     let tls_config = ArcSwapOption::from(tls_config.map(Arc::new));
     617            0 : 
     618            0 :     let backup_metric_collection_config = config::MetricBackupCollectionConfig {
     619            0 :         remote_storage_config: args.metric_backup_collection_remote_storage.clone(),
     620            0 :         chunk_size: args.metric_backup_collection_chunk_size,
     621            0 :     };
     622              : 
     623            0 :     let metric_collection = match (
     624            0 :         &args.metric_collection_endpoint,
     625            0 :         &args.metric_collection_interval,
     626              :     ) {
     627            0 :         (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
     628            0 :             endpoint: endpoint.parse()?,
     629            0 :             interval: humantime::parse_duration(interval)?,
     630            0 :             backup_metric_collection_config,
     631              :         }),
     632            0 :         (None, None) => None,
     633            0 :         _ => bail!(
     634            0 :             "either both or neither metric-collection-endpoint \
     635            0 :              and metric-collection-interval must be specified"
     636            0 :         ),
     637              :     };
     638              : 
     639              :     let config::ConcurrencyLockOptions {
     640            0 :         shards,
     641            0 :         limiter,
     642            0 :         epoch,
     643            0 :         timeout,
     644            0 :     } = args.connect_compute_lock.parse()?;
     645            0 :     info!(
     646              :         ?limiter,
     647              :         shards,
     648              :         ?epoch,
     649            0 :         "Using NodeLocks (connect_compute)"
     650              :     );
     651            0 :     let connect_compute_locks = control_plane::locks::ApiLocks::new(
     652            0 :         "connect_compute_lock",
     653            0 :         limiter,
     654            0 :         shards,
     655            0 :         timeout,
     656            0 :         epoch,
     657            0 :         &Metrics::get().proxy.connect_compute_lock,
     658            0 :     );
     659            0 : 
     660            0 :     let http_config = HttpConfig {
     661            0 :         accept_websockets: !args.is_auth_broker,
     662            0 :         pool_options: GlobalConnPoolOptions {
     663            0 :             max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
     664            0 :             gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
     665            0 :             pool_shards: args.sql_over_http.sql_over_http_pool_shards,
     666            0 :             idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
     667            0 :             opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
     668            0 :             max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
     669            0 :         },
     670            0 :         cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
     671            0 :         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
     672            0 :         max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
     673            0 :         max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
     674            0 :     };
     675            0 :     let authentication_config = AuthenticationConfig {
     676            0 :         jwks_cache: JwkCache::default(),
     677            0 :         thread_pool,
     678            0 :         scram_protocol_timeout: args.scram_protocol_timeout,
     679            0 :         ip_allowlist_check_enabled: !args.is_private_access_proxy,
     680            0 :         is_vpc_acccess_proxy: args.is_private_access_proxy,
     681            0 :         is_auth_broker: args.is_auth_broker,
     682            0 :         accept_jwts: args.is_auth_broker,
     683            0 :         console_redirect_confirmation_timeout: args.webauth_confirmation_timeout,
     684            0 :     };
     685              : 
     686            0 :     let compute_config = ComputeConfig {
     687            0 :         retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?,
     688            0 :         tls: Arc::new(compute_client_config_with_root_certs()?),
     689            0 :         timeout: Duration::from_secs(2),
     690              :     };
     691              : 
     692            0 :     let config = ProxyConfig {
     693            0 :         tls_config,
     694            0 :         metric_collection,
     695            0 :         http_config,
     696            0 :         authentication_config,
     697            0 :         proxy_protocol_v2: args.proxy_protocol_v2,
     698            0 :         handshake_timeout: args.handshake_timeout,
     699            0 :         region: args.region.clone(),
     700            0 :         wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
     701            0 :         connect_compute_locks,
     702            0 :         connect_to_compute: compute_config,
     703            0 :     };
     704            0 : 
     705            0 :     let config = Box::leak(Box::new(config));
     706            0 : 
     707            0 :     tokio::spawn(config.connect_compute_locks.garbage_collect_worker());
     708            0 : 
     709            0 :     Ok(config)
     710            0 : }
     711              : 
     712              : /// auth::Backend is created at proxy startup, and lives forever.
     713            0 : fn build_auth_backend(
     714            0 :     args: &ProxyCliArgs,
     715            0 : ) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
     716            0 :     match &args.auth_backend {
     717              :         AuthBackendType::ControlPlane => {
     718            0 :             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
     719            0 :             let project_info_cache_config: ProjectInfoCacheOptions =
     720            0 :                 args.project_info_cache.parse()?;
     721            0 :             let endpoint_cache_config: config::EndpointCacheConfig =
     722            0 :                 args.endpoint_cache_config.parse()?;
     723              : 
     724            0 :             info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
     725            0 :             info!(
     726            0 :                 "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
     727              :             );
     728            0 :             info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
     729            0 :             let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
     730            0 :                 wake_compute_cache_config,
     731            0 :                 project_info_cache_config,
     732            0 :                 endpoint_cache_config,
     733            0 :             )));
     734              : 
     735              :             let config::ConcurrencyLockOptions {
     736            0 :                 shards,
     737            0 :                 limiter,
     738            0 :                 epoch,
     739            0 :                 timeout,
     740            0 :             } = args.wake_compute_lock.parse()?;
     741            0 :             info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
     742            0 :             let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
     743            0 :                 "wake_compute_lock",
     744            0 :                 limiter,
     745            0 :                 shards,
     746            0 :                 timeout,
     747            0 :                 epoch,
     748            0 :                 &Metrics::get().wake_compute_lock,
     749            0 :             )));
     750            0 :             tokio::spawn(locks.garbage_collect_worker());
     751              : 
     752            0 :             let url: crate::url::ApiUrl = args.auth_endpoint.parse()?;
     753              : 
     754            0 :             let endpoint = http::Endpoint::new(url, http::new_client());
     755            0 : 
     756            0 :             let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
     757            0 :             RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
     758            0 :             let wake_compute_endpoint_rate_limiter =
     759            0 :                 Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
     760            0 : 
     761            0 :             let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
     762            0 :                 endpoint,
     763            0 :                 args.control_plane_token.clone(),
     764            0 :                 caches,
     765            0 :                 locks,
     766            0 :                 wake_compute_endpoint_rate_limiter,
     767            0 :             );
     768            0 : 
     769            0 :             let api = control_plane::client::ControlPlaneClient::ProxyV1(api);
     770            0 :             let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
     771            0 :             let config = Box::leak(Box::new(auth_backend));
     772            0 : 
     773            0 :             Ok(Either::Left(config))
     774              :         }
     775              : 
     776              :         #[cfg(any(test, feature = "testing"))]
     777              :         AuthBackendType::Postgres => {
     778            0 :             let mut url: ApiUrl = args.auth_endpoint.parse()?;
     779            0 :             if url.password().is_none() {
     780            0 :                 let password = env::var("PGPASSWORD")
     781            0 :                     .with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?;
     782            0 :                 url.set_password(Some(&password))
     783            0 :                     .expect("Failed to set password");
     784            0 :             }
     785            0 :             let api = control_plane::client::mock::MockControlPlane::new(
     786            0 :                 url,
     787            0 :                 !args.is_private_access_proxy,
     788            0 :             );
     789            0 :             let api = control_plane::client::ControlPlaneClient::PostgresMock(api);
     790            0 : 
     791            0 :             let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
     792            0 : 
     793            0 :             let config = Box::leak(Box::new(auth_backend));
     794            0 : 
     795            0 :             Ok(Either::Left(config))
     796              :         }
     797              : 
     798              :         AuthBackendType::ConsoleRedirect => {
     799            0 :             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
     800            0 :             let project_info_cache_config: ProjectInfoCacheOptions =
     801            0 :                 args.project_info_cache.parse()?;
     802            0 :             let endpoint_cache_config: config::EndpointCacheConfig =
     803            0 :                 args.endpoint_cache_config.parse()?;
     804              : 
     805            0 :             info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
     806            0 :             info!(
     807            0 :                 "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
     808              :             );
     809            0 :             info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
     810            0 :             let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
     811            0 :                 wake_compute_cache_config,
     812            0 :                 project_info_cache_config,
     813            0 :                 endpoint_cache_config,
     814            0 :             )));
     815              : 
     816              :             let config::ConcurrencyLockOptions {
     817            0 :                 shards,
     818            0 :                 limiter,
     819            0 :                 epoch,
     820            0 :                 timeout,
     821            0 :             } = args.wake_compute_lock.parse()?;
     822            0 :             info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
     823            0 :             let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
     824            0 :                 "wake_compute_lock",
     825            0 :                 limiter,
     826            0 :                 shards,
     827            0 :                 timeout,
     828            0 :                 epoch,
     829            0 :                 &Metrics::get().wake_compute_lock,
     830            0 :             )));
     831              : 
     832            0 :             let url = args.uri.clone().parse()?;
     833            0 :             let ep_url: crate::url::ApiUrl = args.auth_endpoint.parse()?;
     834            0 :             let endpoint = http::Endpoint::new(ep_url, http::new_client());
     835            0 :             let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
     836            0 :             RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
     837            0 :             let wake_compute_endpoint_rate_limiter =
     838            0 :                 Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
     839            0 : 
     840            0 :             // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter
     841            0 :             // and locks are not used in ConsoleRedirectBackend,
     842            0 :             // but they are required by the NeonControlPlaneClient
     843            0 :             let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
     844            0 :                 endpoint,
     845            0 :                 args.control_plane_token.clone(),
     846            0 :                 caches,
     847            0 :                 locks,
     848            0 :                 wake_compute_endpoint_rate_limiter,
     849            0 :             );
     850            0 : 
     851            0 :             let backend = ConsoleRedirectBackend::new(url, api);
     852            0 :             let config = Box::leak(Box::new(backend));
     853            0 : 
     854            0 :             Ok(Either::Right(config))
     855              :         }
     856              :     }
     857            0 : }
     858              : 
     859            0 : async fn configure_redis(
     860            0 :     args: &ProxyCliArgs,
     861            0 : ) -> anyhow::Result<(
     862            0 :     Option<ConnectionWithCredentialsProvider>,
     863            0 :     Option<ConnectionWithCredentialsProvider>,
     864            0 : )> {
     865              :     // TODO: untangle the config args
     866            0 :     let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
     867            0 :         ("plain", redis_url) => match redis_url {
     868              :             None => {
     869            0 :                 bail!("plain auth requires redis_notifications to be set");
     870              :             }
     871            0 :             Some(url) => {
     872            0 :                 Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone()))
     873              :             }
     874              :         },
     875            0 :         ("irsa", _) => match (&args.redis_host, args.redis_port) {
     876            0 :             (Some(host), Some(port)) => Some(
     877            0 :                 ConnectionWithCredentialsProvider::new_with_credentials_provider(
     878            0 :                     host.clone(),
     879            0 :                     port,
     880            0 :                     elasticache::CredentialsProvider::new(
     881            0 :                         args.aws_region.clone(),
     882            0 :                         args.redis_cluster_name.clone(),
     883            0 :                         args.redis_user_id.clone(),
     884            0 :                     )
     885            0 :                     .await,
     886              :                 ),
     887              :             ),
     888              :             (None, None) => {
     889              :                 // todo: upgrade to error?
     890            0 :                 warn!(
     891            0 :                     "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"
     892              :                 );
     893            0 :                 None
     894              :             }
     895              :             _ => {
     896            0 :                 bail!("redis-host and redis-port must be specified together");
     897              :             }
     898              :         },
     899              :         _ => {
     900            0 :             bail!("unknown auth type given");
     901              :         }
     902              :     };
     903              : 
     904            0 :     let redis_notifications_client = if let Some(url) = &args.redis_notifications {
     905            0 :         Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url))
     906              :     } else {
     907            0 :         regional_redis_client.clone()
     908              :     };
     909              : 
     910            0 :     Ok((regional_redis_client, redis_notifications_client))
     911            0 : }
     912              : 
     913              : #[cfg(test)]
     914              : mod tests {
     915              :     use std::time::Duration;
     916              : 
     917              :     use clap::Parser;
     918              : 
     919              :     use crate::rate_limiter::RateBucketInfo;
     920              : 
     921              :     #[test]
     922            1 :     fn parse_endpoint_rps_limit() {
     923            1 :         let config = super::ProxyCliArgs::parse_from([
     924            1 :             "proxy",
     925            1 :             "--endpoint-rps-limit",
     926            1 :             "100@1s",
     927            1 :             "--endpoint-rps-limit",
     928            1 :             "20@30s",
     929            1 :         ]);
     930            1 : 
     931            1 :         assert_eq!(
     932            1 :             config.endpoint_rps_limit,
     933            1 :             vec![
     934            1 :                 RateBucketInfo::new(100, Duration::from_secs(1)),
     935            1 :                 RateBucketInfo::new(20, Duration::from_secs(30)),
     936            1 :             ]
     937            1 :         );
     938            1 :     }
     939              : }

Generated by: LCOV version 2.1-beta