LCOV - code coverage report
Current view: top level - proxy/src/binary - proxy.rs (source / functions) Coverage Total Hit
Test: 1e20c4f2b28aa592527961bb32170ebbd2c9172f.info Lines: 3.3 % 399 13
Test Date: 2025-07-16 12:29:03 Functions: 11.1 % 9 1

            Line data    Source code
       1              : #[cfg(any(test, feature = "testing"))]
       2              : use std::env;
       3              : use std::net::SocketAddr;
       4              : use std::path::PathBuf;
       5              : use std::pin::pin;
       6              : use std::sync::Arc;
       7              : use std::time::Duration;
       8              : 
       9              : #[cfg(any(test, feature = "testing"))]
      10              : use anyhow::Context;
      11              : use anyhow::{bail, ensure};
      12              : use arc_swap::ArcSwapOption;
      13              : #[cfg(any(test, feature = "testing"))]
      14              : use camino::Utf8PathBuf;
      15              : use futures::future::Either;
      16              : use itertools::{Itertools, Position};
      17              : use rand::{Rng, thread_rng};
      18              : use remote_storage::RemoteStorageConfig;
      19              : use tokio::net::TcpListener;
      20              : #[cfg(any(test, feature = "testing"))]
      21              : use tokio::sync::Notify;
      22              : use tokio::task::JoinSet;
      23              : use tokio_util::sync::CancellationToken;
      24              : use tracing::{error, info, warn};
      25              : use utils::sentry_init::init_sentry;
      26              : use utils::{project_build_tag, project_git_version};
      27              : 
      28              : use crate::auth::backend::jwt::JwkCache;
      29              : #[cfg(any(test, feature = "testing"))]
      30              : use crate::auth::backend::local::LocalBackend;
      31              : use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned};
      32              : use crate::batch::BatchQueue;
      33              : use crate::cancellation::{CancellationHandler, CancellationProcessor};
      34              : #[cfg(any(test, feature = "testing"))]
      35              : use crate::config::refresh_config_loop;
      36              : use crate::config::{
      37              :     self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions,
      38              :     ProxyConfig, ProxyProtocolV2, remote_storage_from_toml,
      39              : };
      40              : use crate::context::parquet::ParquetUploadArgs;
      41              : use crate::http::health_server::AppMetrics;
      42              : use crate::metrics::Metrics;
      43              : use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
      44              : use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
      45              : use crate::redis::kv_ops::RedisKVClient;
      46              : use crate::redis::{elasticache, notifications};
      47              : use crate::scram::threadpool::ThreadPool;
      48              : use crate::serverless::GlobalConnPoolOptions;
      49              : use crate::serverless::cancel_set::CancelSet;
      50              : use crate::tls::client_config::compute_client_config_with_root_certs;
      51              : #[cfg(any(test, feature = "testing"))]
      52              : use crate::url::ApiUrl;
      53              : use crate::{auth, control_plane, http, serverless, usage_metrics};
      54              : 
      55              : project_git_version!(GIT_VERSION);
      56              : project_build_tag!(BUILD_TAG);
      57              : 
      58              : use clap::{Parser, ValueEnum};
      59              : 
      60              : #[derive(Clone, Debug, ValueEnum)]
      61              : #[clap(rename_all = "kebab-case")]
      62              : enum AuthBackendType {
      63              :     #[clap(alias("cplane-v1"))]
      64              :     ControlPlane,
      65              : 
      66              :     #[clap(alias("link"))]
      67              :     ConsoleRedirect,
      68              : 
      69              :     #[cfg(any(test, feature = "testing"))]
      70              :     Postgres,
      71              : 
      72              :     #[cfg(any(test, feature = "testing"))]
      73              :     Local,
      74              : }
      75              : 
      76              : /// Neon proxy/router
      77              : #[derive(Parser)]
      78              : #[command(version = GIT_VERSION, about)]
      79              : struct ProxyCliArgs {
      80              :     /// Name of the region this proxy is deployed in
      81              :     #[clap(long, default_value_t = String::new())]
      82              :     region: String,
      83              :     /// listen for incoming client connections on ip:port
      84              :     #[clap(short, long, default_value = "127.0.0.1:4432")]
      85              :     proxy: SocketAddr,
      86              :     #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
      87              :     auth_backend: AuthBackendType,
      88              :     /// Path of the local proxy config file (used for local-file auth backend)
      89              :     #[clap(long, default_value = "./local_proxy.json")]
      90              :     #[cfg(any(test, feature = "testing"))]
      91              :     config_path: Utf8PathBuf,
      92              :     /// listen for management callback connection on ip:port
      93              :     #[clap(short, long, default_value = "127.0.0.1:7000")]
      94              :     mgmt: SocketAddr,
      95              :     /// listen for incoming http connections (metrics, etc) on ip:port
      96              :     #[clap(long, default_value = "127.0.0.1:7001")]
      97              :     http: SocketAddr,
      98              :     /// listen for incoming wss connections on ip:port
      99              :     #[clap(long)]
     100              :     wss: Option<SocketAddr>,
     101              :     /// redirect unauthenticated users to the given uri in case of console redirect auth
     102              :     #[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
     103              :     uri: String,
     104              :     /// cloud API endpoint for authenticating users
     105              :     #[clap(
     106              :         short,
     107              :         long,
     108              :         default_value = "http://localhost:3000/authenticate_proxy_request/"
     109              :     )]
     110              :     auth_endpoint: String,
     111              :     /// JWT used to connect to control plane.
     112              :     #[clap(
     113              :         long,
     114              :         value_name = "JWT",
     115              :         default_value = "",
     116              :         env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN"
     117              :     )]
     118              :     control_plane_token: Arc<str>,
     119              :     /// if this is not local proxy, this toggles whether we accept jwt or passwords for http
     120              :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     121              :     is_auth_broker: bool,
     122              :     /// path to TLS key for client postgres connections
     123              :     ///
     124              :     /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
     125              :     #[clap(short = 'k', long, alias = "ssl-key")]
     126              :     tls_key: Option<PathBuf>,
     127              :     /// path to TLS cert for client postgres connections
     128              :     ///
     129              :     /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
     130              :     #[clap(short = 'c', long, alias = "ssl-cert")]
     131              :     tls_cert: Option<PathBuf>,
     132              :     /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`.
     133              :     #[clap(long, alias = "allow-ssl-keylogfile")]
     134              :     allow_tls_keylogfile: bool,
     135              :     /// path to directory with TLS certificates for client postgres connections
     136              :     #[clap(long)]
     137              :     certs_dir: Option<PathBuf>,
     138              :     /// timeout for the TLS handshake
     139              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     140              :     handshake_timeout: tokio::time::Duration,
     141              :     /// cache for `wake_compute` api method (use `size=0` to disable)
     142              :     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     143              :     wake_compute_cache: String,
     144              :     /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
     145              :     #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
     146              :     wake_compute_lock: String,
     147              :     /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
     148              :     #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
     149              :     connect_compute_lock: String,
     150              :     #[clap(flatten)]
     151              :     sql_over_http: SqlOverHttpArgs,
     152              :     /// timeout for scram authentication protocol
     153              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     154              :     scram_protocol_timeout: tokio::time::Duration,
     155              :     /// size of the threadpool for password hashing
     156              :     #[clap(long, default_value_t = 4)]
     157              :     scram_thread_pool_size: u8,
     158              :     /// Endpoint rate limiter max number of requests per second.
     159              :     ///
     160              :     /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
     161              :     /// Can be given multiple times for different bucket sizes.
     162              :     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
     163              :     endpoint_rps_limit: Vec<RateBucketInfo>,
     164              :     /// Wake compute rate limiter max number of requests per second.
     165              :     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
     166              :     wake_compute_limit: Vec<RateBucketInfo>,
     167              :     /// Cancellation channel size (max queue size for redis kv client)
     168              :     #[clap(long, default_value_t = 1024)]
     169              :     cancellation_ch_size: usize,
     170              :     /// Cancellation ops batch size for redis
     171              :     #[clap(long, default_value_t = 8)]
     172              :     cancellation_batch_size: usize,
     173              :     /// redis url for plain authentication
     174              :     #[clap(long, alias("redis-notifications"))]
     175              :     redis_plain: Option<String>,
     176              :     /// what from the available authentications type to use for redis. Supported are "irsa" and "plain".
     177              :     #[clap(long, default_value = "irsa")]
     178              :     redis_auth_type: String,
     179              :     /// redis host for irsa authentication
     180              :     #[clap(long)]
     181              :     redis_host: Option<String>,
     182              :     /// redis port for irsa authentication
     183              :     #[clap(long)]
     184              :     redis_port: Option<u16>,
     185              :     /// redis cluster name for irsa authentication
     186              :     #[clap(long)]
     187              :     redis_cluster_name: Option<String>,
     188              :     /// redis user_id for irsa authentication
     189              :     #[clap(long)]
     190              :     redis_user_id: Option<String>,
     191              :     /// aws region for irsa authentication
     192              :     #[clap(long, default_value_t = String::new())]
     193              :     aws_region: String,
     194              :     /// cache for `project_info` (use `size=0` to disable)
     195              :     #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
     196              :     project_info_cache: String,
     197              :     /// cache for all valid endpoints
     198              :     // TODO: remove after a couple of releases.
     199              :     #[clap(long, default_value_t = String::new())]
     200              :     #[deprecated]
     201              :     endpoint_cache_config: String,
     202              :     #[clap(flatten)]
     203              :     parquet_upload: ParquetUploadArgs,
     204              : 
     205              :     /// http endpoint to receive periodic metric updates
     206              :     #[clap(long)]
     207              :     metric_collection_endpoint: Option<String>,
     208              :     /// how often metrics should be sent to a collection endpoint
     209              :     #[clap(long)]
     210              :     metric_collection_interval: Option<String>,
     211              :     /// interval for backup metric collection
     212              :     #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
     213              :     metric_backup_collection_interval: std::time::Duration,
     214              :     /// remote storage configuration for backup metric collection
     215              :     /// Encoded as toml (same format as pageservers), eg
     216              :     /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
     217              :     #[clap(long, value_parser = remote_storage_from_toml)]
     218              :     metric_backup_collection_remote_storage: Option<RemoteStorageConfig>,
     219              :     /// chunk size for backup metric collection
     220              :     /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
     221              :     #[clap(long, default_value = "4194304")]
     222              :     metric_backup_collection_chunk_size: usize,
     223              : 
     224              :     /// Whether to retry the connection to the compute node
     225              :     #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
     226              :     connect_to_compute_retry: String,
     227              :     /// Whether to retry the wake_compute request
     228              :     #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
     229              :     wake_compute_retry: String,
     230              : 
     231              :     /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist
     232              :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     233              :     is_private_access_proxy: bool,
     234              : 
     235              :     /// Configure whether all incoming requests have a Proxy Protocol V2 packet.
     236              :     #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Rejected)]
     237              :     proxy_protocol_v2: ProxyProtocolV2,
     238              : 
     239              :     /// Time the proxy waits for the webauth session to be confirmed by the control plane.
     240              :     // TODO: rename to `console_redirect_confirmation_timeout`.
     241              :     #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
     242              :     webauth_confirmation_timeout: std::time::Duration,
     243              : 
     244              :     #[clap(flatten)]
     245              :     pg_sni_router: PgSniRouterArgs,
     246              : 
     247              :     /// if this is not local proxy, this toggles whether we accept Postgres REST requests
     248              :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     249              :     is_rest_broker: bool,
     250              : 
     251              :     /// cache for `db_schema_cache` introspection (use `size=0` to disable)
     252              :     #[clap(long, default_value = "size=1000,ttl=1h")]
     253              :     db_schema_cache: String,
     254              : }
     255              : 
     256              : #[derive(clap::Args, Clone, Copy, Debug)]
     257              : struct SqlOverHttpArgs {
     258              :     /// timeout for http connection requests
     259              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     260              :     sql_over_http_timeout: tokio::time::Duration,
     261              : 
     262              :     /// Whether the SQL over http pool is opt-in
     263              :     #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     264              :     sql_over_http_pool_opt_in: bool,
     265              : 
     266              :     /// How many connections to pool for each endpoint. Excess connections are discarded
     267              :     #[clap(long, default_value_t = 20)]
     268              :     sql_over_http_pool_max_conns_per_endpoint: usize,
     269              : 
     270              :     /// How many connections to pool for each endpoint. Excess connections are discarded
     271              :     #[clap(long, default_value_t = 20000)]
     272              :     sql_over_http_pool_max_total_conns: usize,
     273              : 
     274              :     /// How long pooled connections should remain idle for before closing
     275              :     #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
     276              :     sql_over_http_idle_timeout: tokio::time::Duration,
     277              : 
     278              :     /// Duration each shard will wait on average before a GC sweep.
     279              :     /// A longer time will causes sweeps to take longer but will interfere less frequently.
     280              :     #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
     281              :     sql_over_http_pool_gc_epoch: tokio::time::Duration,
     282              : 
     283              :     /// How many shards should the global pool have. Must be a power of two.
     284              :     /// More shards will introduce less contention for pool operations, but can
     285              :     /// increase memory used by the pool
     286              :     #[clap(long, default_value_t = 128)]
     287              :     sql_over_http_pool_shards: usize,
     288              : 
     289              :     #[clap(long, default_value_t = 10000)]
     290              :     sql_over_http_client_conn_threshold: u64,
     291              : 
     292              :     #[clap(long, default_value_t = 64)]
     293              :     sql_over_http_cancel_set_shards: usize,
     294              : 
     295              :     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
     296              :     sql_over_http_max_request_size_bytes: usize,
     297              : 
     298              :     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
     299              :     sql_over_http_max_response_size_bytes: usize,
     300              : }
     301              : 
     302              : #[derive(clap::Args, Clone, Debug)]
     303              : struct PgSniRouterArgs {
     304              :     /// listen for incoming client connections on ip:port
     305              :     #[clap(id = "sni-router-listen", long, default_value = "127.0.0.1:4432")]
     306              :     listen: SocketAddr,
     307              :     /// listen for incoming client connections on ip:port, requiring TLS to compute
     308              :     #[clap(id = "sni-router-listen-tls", long, default_value = "127.0.0.1:4433")]
     309              :     listen_tls: SocketAddr,
     310              :     /// path to TLS key for client postgres connections
     311              :     #[clap(id = "sni-router-tls-key", long)]
     312              :     tls_key: Option<PathBuf>,
     313              :     /// path to TLS cert for client postgres connections
     314              :     #[clap(id = "sni-router-tls-cert", long)]
     315              :     tls_cert: Option<PathBuf>,
     316              :     /// append this domain zone to the SNI hostname to get the destination address
     317              :     #[clap(id = "sni-router-destination", long)]
     318              :     dest: Option<String>,
     319              : }
     320              : 
     321            0 : pub async fn run() -> anyhow::Result<()> {
     322            0 :     let _logging_guard = crate::logging::init().await?;
     323            0 :     let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
     324            0 :     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
     325              : 
     326              :     // TODO: refactor these to use labels
     327            0 :     info!("Version: {GIT_VERSION}");
     328            0 :     info!("Build_tag: {BUILD_TAG}");
     329            0 :     let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
     330            0 :         revision: GIT_VERSION,
     331            0 :         build_tag: BUILD_TAG,
     332            0 :     });
     333              : 
     334            0 :     let jemalloc = match crate::jemalloc::MetricRecorder::new() {
     335            0 :         Ok(t) => Some(t),
     336            0 :         Err(e) => {
     337            0 :             error!(error = ?e, "could not start jemalloc metrics loop");
     338            0 :             None
     339              :         }
     340              :     };
     341              : 
     342            0 :     let args = ProxyCliArgs::parse();
     343            0 :     let config = build_config(&args)?;
     344            0 :     let auth_backend = build_auth_backend(&args)?;
     345              : 
     346            0 :     match auth_backend {
     347            0 :         Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"),
     348            0 :         Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
     349              :     }
     350            0 :     info!("Using region: {}", args.aws_region);
     351            0 :     let redis_client = configure_redis(&args).await?;
     352              : 
     353              :     // Check that we can bind to address before further initialization
     354            0 :     info!("Starting http on {}", args.http);
     355            0 :     let http_listener = TcpListener::bind(args.http).await?.into_std()?;
     356              : 
     357            0 :     info!("Starting mgmt on {}", args.mgmt);
     358            0 :     let mgmt_listener = TcpListener::bind(args.mgmt).await?;
     359              : 
     360            0 :     let proxy_listener = if args.is_auth_broker {
     361            0 :         None
     362              :     } else {
     363            0 :         info!("Starting proxy on {}", args.proxy);
     364            0 :         Some(TcpListener::bind(args.proxy).await?)
     365              :     };
     366              : 
     367            0 :     let sni_router_listeners = {
     368            0 :         let args = &args.pg_sni_router;
     369            0 :         if args.dest.is_some() {
     370            0 :             ensure!(
     371            0 :                 args.tls_key.is_some(),
     372            0 :                 "sni-router-tls-key must be provided"
     373              :             );
     374            0 :             ensure!(
     375            0 :                 args.tls_cert.is_some(),
     376            0 :                 "sni-router-tls-cert must be provided"
     377              :             );
     378              : 
     379            0 :             info!(
     380            0 :                 "Starting pg-sni-router on {} and {}",
     381              :                 args.listen, args.listen_tls
     382              :             );
     383              : 
     384              :             Some((
     385            0 :                 TcpListener::bind(args.listen).await?,
     386            0 :                 TcpListener::bind(args.listen_tls).await?,
     387              :             ))
     388              :         } else {
     389            0 :             None
     390              :         }
     391              :     };
     392              : 
     393              :     // TODO: rename the argument to something like serverless.
     394              :     // It now covers more than just websockets, it also covers SQL over HTTP.
     395            0 :     let serverless_listener = if let Some(serverless_address) = args.wss {
     396            0 :         info!("Starting wss on {serverless_address}");
     397            0 :         Some(TcpListener::bind(serverless_address).await?)
     398            0 :     } else if args.is_auth_broker {
     399            0 :         bail!("wss arg must be present for auth-broker")
     400              :     } else {
     401            0 :         None
     402              :     };
     403              : 
     404            0 :     let cancellation_token = CancellationToken::new();
     405              : 
     406            0 :     let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute));
     407              : 
     408            0 :     let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
     409            0 :         RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit)
     410            0 :             .unwrap_or(EndpointRateLimiter::DEFAULT),
     411              :         64,
     412              :     ));
     413              : 
     414              :     #[cfg(any(test, feature = "testing"))]
     415            0 :     let refresh_config_notify = Arc::new(Notify::new());
     416              :     // client facing tasks. these will exit on error or on cancellation
     417              :     // cancellation returns Ok(())
     418            0 :     let mut client_tasks = JoinSet::new();
     419            0 :     match auth_backend {
     420            0 :         Either::Left(auth_backend) => {
     421            0 :             if let Some(proxy_listener) = proxy_listener {
     422            0 :                 client_tasks.spawn(crate::pglb::task_main(
     423            0 :                     config,
     424            0 :                     auth_backend,
     425            0 :                     proxy_listener,
     426            0 :                     cancellation_token.clone(),
     427            0 :                     cancellation_handler.clone(),
     428            0 :                     endpoint_rate_limiter.clone(),
     429            0 :                 ));
     430            0 :             }
     431              : 
     432            0 :             if let Some(serverless_listener) = serverless_listener {
     433            0 :                 client_tasks.spawn(serverless::task_main(
     434            0 :                     config,
     435            0 :                     auth_backend,
     436            0 :                     serverless_listener,
     437            0 :                     cancellation_token.clone(),
     438            0 :                     cancellation_handler.clone(),
     439            0 :                     endpoint_rate_limiter.clone(),
     440            0 :                 ));
     441            0 :             }
     442              : 
     443              :             // if auth backend is local, we need to load the config file
     444              :             #[cfg(any(test, feature = "testing"))]
     445            0 :             if let auth::Backend::Local(_) = &auth_backend {
     446            0 :                 refresh_config_notify.notify_one();
     447            0 :                 tokio::spawn(refresh_config_loop(
     448            0 :                     config,
     449            0 :                     args.config_path,
     450            0 :                     refresh_config_notify.clone(),
     451            0 :                 ));
     452            0 :             }
     453              :         }
     454            0 :         Either::Right(auth_backend) => {
     455            0 :             if let Some(proxy_listener) = proxy_listener {
     456            0 :                 client_tasks.spawn(crate::console_redirect_proxy::task_main(
     457            0 :                     config,
     458            0 :                     auth_backend,
     459            0 :                     proxy_listener,
     460            0 :                     cancellation_token.clone(),
     461            0 :                     cancellation_handler.clone(),
     462            0 :                 ));
     463            0 :             }
     464              :         }
     465              :     }
     466              : 
     467              :     // spawn pg-sni-router mode.
     468            0 :     if let Some((listen, listen_tls)) = sni_router_listeners {
     469            0 :         let args = args.pg_sni_router;
     470            0 :         let dest = args.dest.expect("already asserted it is set");
     471            0 :         let key_path = args.tls_key.expect("already asserted it is set");
     472            0 :         let cert_path = args.tls_cert.expect("already asserted it is set");
     473              : 
     474            0 :         let tls_config = super::pg_sni_router::parse_tls(&key_path, &cert_path)?;
     475              : 
     476            0 :         let dest = Arc::new(dest);
     477              : 
     478            0 :         client_tasks.spawn(super::pg_sni_router::task_main(
     479            0 :             dest.clone(),
     480            0 :             tls_config.clone(),
     481            0 :             None,
     482            0 :             listen,
     483            0 :             cancellation_token.clone(),
     484              :         ));
     485              : 
     486            0 :         client_tasks.spawn(super::pg_sni_router::task_main(
     487            0 :             dest,
     488            0 :             tls_config,
     489            0 :             Some(config.connect_to_compute.tls.clone()),
     490            0 :             listen_tls,
     491            0 :             cancellation_token.clone(),
     492              :         ));
     493            0 :     }
     494              : 
     495            0 :     client_tasks.spawn(crate::context::parquet::worker(
     496            0 :         cancellation_token.clone(),
     497            0 :         args.parquet_upload,
     498            0 :         args.region,
     499              :     ));
     500              : 
     501              :     // maintenance tasks. these never return unless there's an error
     502            0 :     let mut maintenance_tasks = JoinSet::new();
     503              : 
     504            0 :     maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), {
     505            0 :         move || {
     506              :             #[cfg(any(test, feature = "testing"))]
     507            0 :             refresh_config_notify.notify_one();
     508            0 :         }
     509              :     }));
     510            0 :     maintenance_tasks.spawn(http::health_server::task_main(
     511            0 :         http_listener,
     512            0 :         AppMetrics {
     513            0 :             jemalloc,
     514            0 :             neon_metrics,
     515            0 :             proxy: crate::metrics::Metrics::get(),
     516            0 :         },
     517              :     ));
     518            0 :     maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener));
     519              : 
     520            0 :     if let Some(metrics_config) = &config.metric_collection {
     521            0 :         // TODO: Add gc regardless of the metric collection being enabled.
     522            0 :         maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
     523            0 :     }
     524              : 
     525            0 :     if let Some(client) = redis_client {
     526              :         // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
     527              :         // This prevents immediate exit and pod restart,
     528              :         // which can cause hammering of the redis in case of connection issues.
     529              :         // cancellation key management
     530            0 :         let mut redis_kv_client = RedisKVClient::new(client.clone());
     531            0 :         for attempt in (0..3).with_position() {
     532            0 :             match redis_kv_client.try_connect().await {
     533              :                 Ok(()) => {
     534            0 :                     info!("Connected to Redis KV client");
     535            0 :                     cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor {
     536            0 :                         client: redis_kv_client,
     537            0 :                         batch_size: args.cancellation_batch_size,
     538            0 :                     }));
     539              : 
     540            0 :                     break;
     541              :                 }
     542            0 :                 Err(e) => {
     543            0 :                     error!("Failed to connect to Redis KV client: {e}");
     544            0 :                     if matches!(attempt, Position::Last(_)) {
     545            0 :                         bail!(
     546            0 :                             "Failed to connect to Redis KV client after {} attempts",
     547            0 :                             attempt.into_inner()
     548              :                         );
     549            0 :                     }
     550            0 :                     let jitter = thread_rng().gen_range(0..100);
     551            0 :                     tokio::time::sleep(Duration::from_millis(1000 + jitter)).await;
     552              :                 }
     553              :             }
     554              :         }
     555              : 
     556              :         #[allow(irrefutable_let_patterns)]
     557            0 :         if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
     558            0 :             && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
     559              :         {
     560              :             // project info cache and invalidation of that cache.
     561            0 :             let cache = api.caches.project_info.clone();
     562            0 :             maintenance_tasks.spawn(notifications::task_main(client, cache.clone()));
     563            0 :             maintenance_tasks.spawn(async move { cache.gc_worker().await });
     564            0 :         }
     565            0 :     }
     566              : 
     567              :     let maintenance = loop {
     568              :         // get one complete task
     569            0 :         match futures::future::select(
     570            0 :             pin!(maintenance_tasks.join_next()),
     571            0 :             pin!(client_tasks.join_next()),
     572              :         )
     573            0 :         .await
     574              :         {
     575              :             // exit immediately on maintenance task completion
     576            0 :             Either::Left((Some(res), _)) => break crate::error::flatten_err(res)?,
     577              :             // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
     578            0 :             Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
     579              :             // exit immediately on client task error
     580            0 :             Either::Right((Some(res), _)) => crate::error::flatten_err(res)?,
     581              :             // exit if all our client tasks have shutdown gracefully
     582            0 :             Either::Right((None, _)) => return Ok(()),
     583              :         }
     584              :     };
     585              : 
     586              :     // maintenance tasks return Infallible success values, this is an impossible value
     587              :     // so this match statically ensures that there are no possibilities for that value
     588              :     match maintenance {}
     589            0 : }
     590              : 
     591              : /// Builds the ProxyConfig from the CLI args and leaks it to obtain a `&'static` reference. ProxyConfig is created at proxy startup, and lives forever.
     592            0 : fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     593            0 :     let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
     594            0 :     Metrics::install(thread_pool.metrics.clone());
     595              : 
     596            0 :     let tls_config = match (&args.tls_key, &args.tls_cert) { // key and cert are only accepted as a pair
     597            0 :         (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
     598            0 :             key_path,
     599            0 :             cert_path,
     600            0 :             args.certs_dir.as_deref(),
     601            0 :             args.allow_tls_keylogfile,
     602            0 :         )?),
     603            0 :         (None, None) => None, // neither given: no TLS config
     604            0 :         _ => bail!("either both or neither tls-key and tls-cert must be specified"),
     605              :     };
     606            0 :     let tls_config = ArcSwapOption::from(tls_config.map(Arc::new)); // NOTE(review): presumably ArcSwapOption so the TLS config can be replaced at runtime - confirm
     607              : 
     608            0 :     let backup_metric_collection_config = config::MetricBackupCollectionConfig {
     609            0 :         remote_storage_config: args.metric_backup_collection_remote_storage.clone(),
     610            0 :         chunk_size: args.metric_backup_collection_chunk_size,
     611            0 :     };
     612              : 
     613            0 :     let metric_collection = match ( // endpoint and interval are only accepted as a pair
     614            0 :         &args.metric_collection_endpoint,
     615            0 :         &args.metric_collection_interval,
     616              :     ) {
     617            0 :         (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
     618            0 :             endpoint: endpoint.parse()?,
     619            0 :             interval: humantime::parse_duration(interval)?,
     620            0 :             backup_metric_collection_config,
     621              :         }),
     622            0 :         (None, None) => None, // neither given: metric collection is off
     623            0 :         _ => bail!(
     624            0 :             "either both or neither metric-collection-endpoint \
     625            0 :              and metric-collection-interval must be specified"
     626              :         ),
     627              :     };
     628              : 
     629              :     let config::ConcurrencyLockOptions {
     630            0 :         shards,
     631            0 :         limiter,
     632            0 :         epoch,
     633            0 :         timeout,
     634            0 :     } = args.connect_compute_lock.parse()?;
     635            0 :     info!(
     636              :         ?limiter,
     637              :         shards,
     638              :         ?epoch,
     639            0 :         "Using NodeLocks (connect_compute)"
     640              :     );
     641            0 :     let connect_compute_locks = control_plane::locks::ApiLocks::new(
     642              :         "connect_compute_lock",
     643            0 :         limiter,
     644            0 :         shards,
     645            0 :         timeout,
     646            0 :         epoch,
     647            0 :         &Metrics::get().proxy.connect_compute_lock,
     648              :     );
     649              : 
     650            0 :     let http_config = HttpConfig {
     651            0 :         accept_websockets: !args.is_auth_broker, // websockets are turned off when running as auth broker
     652            0 :         pool_options: GlobalConnPoolOptions {
     653            0 :             max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
     654            0 :             gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
     655            0 :             pool_shards: args.sql_over_http.sql_over_http_pool_shards,
     656            0 :             idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
     657            0 :             opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
     658            0 :             max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
     659            0 :         },
     660            0 :         cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
     661            0 :         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
     662            0 :         max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
     663            0 :         max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
     664            0 :     };
     665            0 :     let authentication_config = AuthenticationConfig {
     666            0 :         jwks_cache: JwkCache::default(),
     667            0 :         thread_pool,
     668            0 :         scram_protocol_timeout: args.scram_protocol_timeout,
     669            0 :         ip_allowlist_check_enabled: !args.is_private_access_proxy, // inverse of the flag used on the next line
     670            0 :         is_vpc_acccess_proxy: args.is_private_access_proxy, // NOTE(review): field name is spelled "acccess"; it is declared outside this view
     671            0 :         is_auth_broker: args.is_auth_broker,
     672            0 :         accept_jwts: args.is_auth_broker, // JWTs are accepted exactly when running as auth broker
     673            0 :         console_redirect_confirmation_timeout: args.webauth_confirmation_timeout,
     674            0 :     };
     675              : 
     676            0 :     let compute_config = ComputeConfig {
     677            0 :         retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?,
     678            0 :         tls: Arc::new(compute_client_config_with_root_certs()?),
     679            0 :         timeout: Duration::from_secs(2), // hard-coded; not read from the CLI args
     680              :     };
     681              : 
     682            0 :     let config = ProxyConfig {
     683            0 :         tls_config,
     684            0 :         metric_collection,
     685            0 :         http_config,
     686            0 :         authentication_config,
     687            0 :         proxy_protocol_v2: args.proxy_protocol_v2,
     688            0 :         handshake_timeout: args.handshake_timeout,
     689            0 :         wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
     690            0 :         connect_compute_locks,
     691            0 :         connect_to_compute: compute_config,
     692              :         #[cfg(feature = "testing")]
     693              :         disable_pg_session_jwt: false,
     694              :     };
     695              : 
     696            0 :     let config = Box::leak(Box::new(config)); // intentionally leaked so the config can be handed out as &'static
     697              : 
     698            0 :     tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); // detached: the JoinHandle is dropped. NOTE(review): tokio::spawn needs a running Tokio runtime - confirm the caller provides one
     699              : 
     700            0 :     Ok(config)
     701            0 : }
     702              : 
     703              : /// Builds the backend selected by `args.auth_backend` and leaks it: `Either::Left` holds an auth::Backend, `Either::Right` a ConsoleRedirectBackend. auth::Backend is created at proxy startup, and lives forever.
     704            0 : fn build_auth_backend(
     705            0 :     args: &ProxyCliArgs,
     706            0 : ) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
     707            0 :     match &args.auth_backend {
     708              :         AuthBackendType::ControlPlane => { // control-plane client (ProxyV1) wrapped in auth::Backend::ControlPlane
     709            0 :             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
     710            0 :             let project_info_cache_config: ProjectInfoCacheOptions =
     711            0 :                 args.project_info_cache.parse()?;
     712              : 
     713            0 :             info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
     714            0 :             info!(
     715            0 :                 "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
     716              :             );
     717              : 
     718            0 :             let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( // leaked: lives for the rest of the process
     719            0 :                 wake_compute_cache_config,
     720            0 :                 project_info_cache_config,
     721              :             )));
     722              : 
     723              :             let config::ConcurrencyLockOptions {
     724            0 :                 shards,
     725            0 :                 limiter,
     726            0 :                 epoch,
     727            0 :                 timeout,
     728            0 :             } = args.wake_compute_lock.parse()?;
     729            0 :             info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
     730            0 :             let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( // leaked: lives for the rest of the process
     731              :                 "wake_compute_lock",
     732            0 :                 limiter,
     733            0 :                 shards,
     734            0 :                 timeout,
     735            0 :                 epoch,
     736            0 :                 &Metrics::get().wake_compute_lock,
     737              :             )));
     738            0 :             tokio::spawn(locks.garbage_collect_worker()); // detached: the JoinHandle is dropped
     739              : 
     740            0 :             let url: crate::url::ApiUrl = args.auth_endpoint.parse()?;
     741              : 
     742            0 :             let endpoint = http::Endpoint::new(url, http::new_client());
     743              : 
     744            0 :             let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
     745            0 :             RateBucketInfo::validate(&mut wake_compute_rps_limit)?; // takes &mut, so it may modify the list; NOTE(review): confirm what it changes
     746            0 :             let wake_compute_endpoint_rate_limiter =
     747            0 :                 Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
     748              : 
     749            0 :             let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
     750            0 :                 endpoint,
     751            0 :                 args.control_plane_token.clone(),
     752            0 :                 caches,
     753            0 :                 locks,
     754            0 :                 wake_compute_endpoint_rate_limiter,
     755              :             );
     756              : 
     757            0 :             let api = control_plane::client::ControlPlaneClient::ProxyV1(api);
     758            0 :             let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
     759            0 :             let config = Box::leak(Box::new(auth_backend));
     760              : 
     761            0 :             Ok(Either::Left(config))
     762              :         }
     763              : 
     764              :         #[cfg(any(test, feature = "testing"))]
     765              :         AuthBackendType::Postgres => { // test/testing builds only: uses MockControlPlane
     766            0 :             let mut url: ApiUrl = args.auth_endpoint.parse()?;
     767            0 :             if url.password().is_none() { // no password in the URL: take it from the PGPASSWORD environment variable
     768            0 :                 let password = env::var("PGPASSWORD")
     769            0 :                     .with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?;
     770            0 :                 url.set_password(Some(&password))
     771            0 :                     .expect("Failed to set password");
     772            0 :             }
     773            0 :             let api = control_plane::client::mock::MockControlPlane::new(
     774            0 :                 url,
     775            0 :                 !args.is_private_access_proxy,
     776              :             );
     777            0 :             let api = control_plane::client::ControlPlaneClient::PostgresMock(api);
     778              : 
     779            0 :             let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
     780              : 
     781            0 :             let config = Box::leak(Box::new(auth_backend));
     782              : 
     783            0 :             Ok(Either::Left(config))
     784              :         }
     785              : 
     786              :         AuthBackendType::ConsoleRedirect => { // the only arm that returns Either::Right
     787            0 :             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
     788            0 :             let project_info_cache_config: ProjectInfoCacheOptions =
     789            0 :                 args.project_info_cache.parse()?;
     790              : 
     791            0 :             info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
     792            0 :             info!(
     793            0 :                 "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
     794              :             );
     795              : 
     796            0 :             let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
     797            0 :                 wake_compute_cache_config,
     798            0 :                 project_info_cache_config,
     799              :             )));
     800              : 
     801              :             let config::ConcurrencyLockOptions {
     802            0 :                 shards,
     803            0 :                 limiter,
     804            0 :                 epoch,
     805            0 :                 timeout,
     806            0 :             } = args.wake_compute_lock.parse()?;
     807            0 :             info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
     808            0 :             let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
     809              :                 "wake_compute_lock",
     810            0 :                 limiter,
     811            0 :                 shards,
     812            0 :                 timeout,
     813            0 :                 epoch,
     814            0 :                 &Metrics::get().wake_compute_lock,
     815              :             )));
     816              : 
     817            0 :             let url = args.uri.clone().parse()?; // passed to ConsoleRedirectBackend::new below
     818            0 :             let ep_url: crate::url::ApiUrl = args.auth_endpoint.parse()?;
     819            0 :             let endpoint = http::Endpoint::new(ep_url, http::new_client());
     820            0 :             let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
     821            0 :             RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
     822            0 :             let wake_compute_endpoint_rate_limiter =
     823            0 :                 Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
     824              : 
     825              :             // ConsoleRedirectBackend only uses get_allowed_ips_and_secret(), so
     826              :             // wake_compute_endpoint_rate_limiter and locks are not used here,
     827              :             // but they are still required to construct the NeonControlPlaneClient.
     828            0 :             let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
     829            0 :                 endpoint,
     830            0 :                 args.control_plane_token.clone(),
     831            0 :                 caches,
     832            0 :                 locks, // unlike the ControlPlane arm, no garbage_collect_worker is spawned for these locks
     833            0 :                 wake_compute_endpoint_rate_limiter,
     834              :             );
     835              : 
     836            0 :             let backend = ConsoleRedirectBackend::new(url, api);
     837            0 :             let config = Box::leak(Box::new(backend));
     838              : 
     839            0 :             Ok(Either::Right(config))
     840              :         }
     841              : 
     842              :         #[cfg(any(test, feature = "testing"))]
     843              :         AuthBackendType::Local => { // test/testing builds only
     844            0 :             let postgres: SocketAddr = "127.0.0.1:7432".parse()?; // hard-coded loopback address
     845            0 :             let compute_ctl: ApiUrl = "http://127.0.0.1:3081/".parse()?; // hard-coded loopback URL
     846            0 :             let auth_backend = crate::auth::Backend::Local(
     847            0 :                 crate::auth::backend::MaybeOwned::Owned(LocalBackend::new(postgres, compute_ctl)),
     848            0 :             );
     849              : 
     850            0 :             let config = Box::leak(Box::new(auth_backend));
     851              : 
     852            0 :             Ok(Either::Left(config))
     853              :         }
     854              :     }
     855            0 : }
     856              : 
     857            0 : async fn configure_redis(
     858            0 :     args: &ProxyCliArgs,
     859            0 : ) -> anyhow::Result<Option<ConnectionWithCredentialsProvider>> {
     860              :     // Builds the optional Redis connection provider according to `args.redis_auth_type`. TODO: untangle the config args
     861            0 :     let redis_client = match &*args.redis_auth_type { // recognised values: "plain" and "irsa"; anything else is an error
     862            0 :         "plain" => match &args.redis_plain { // static credentials taken from the redis_plain URL
     863              :             None => {
     864            0 :                 bail!("plain auth requires redis_plain to be set");
     865              :             }
     866            0 :             Some(url) => {
     867            0 :                 Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone()))
     868              :             }
     869              :         },
     870            0 :         "irsa" => match (&args.redis_host, args.redis_port) { // host and port must be given together
     871            0 :             (Some(host), Some(port)) => Some(
     872            0 :                 ConnectionWithCredentialsProvider::new_with_credentials_provider(
     873            0 :                     host.clone(),
     874            0 :                     port,
     875            0 :                     elasticache::CredentialsProvider::new(
     876            0 :                         args.aws_region.clone(),
     877            0 :                         args.redis_cluster_name.clone(),
     878            0 :                         args.redis_user_id.clone(),
     879            0 :                     )
     880            0 :                     .await,
     881              :                 ),
     882              :             ),
     883              :             (None, None) => {
     884              :                 // todo: upgrade to error? (for now this only warns and yields no client)
     885            0 :                 warn!(
     886            0 :                     "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"
     887              :                 );
     888            0 :                 None
     889              :             }
     890              :             _ => {
     891            0 :                 bail!("redis-host and redis-port must be specified together");
     892              :             }
     893              :         },
     894            0 :         auth_type => {
     895            0 :             bail!("unknown auth type {auth_type:?} given")
     896              :         }
     897              :     };
     898              : 
     899            0 :     Ok(redis_client) // None only for "irsa" with neither host nor port set
     900            0 : }
     901              : 
     902              : #[cfg(test)]
     903              : mod tests {
     904              :     use std::time::Duration;
     905              : 
     906              :     use clap::Parser;
     907              : 
     908              :     use crate::rate_limiter::RateBucketInfo;
     909              : 
     910              :     #[test]
     911            1 :     fn parse_endpoint_rps_limit() { // a repeated --endpoint-rps-limit flag yields one RateBucketInfo per occurrence, in order
     912            1 :         let config = super::ProxyCliArgs::parse_from([
     913            1 :             "proxy",
     914            1 :             "--endpoint-rps-limit",
     915            1 :             "100@1s",
     916            1 :             "--endpoint-rps-limit",
     917            1 :             "20@30s",
     918            1 :         ]);
     919              : 
     920            1 :         assert_eq!(
     921              :             config.endpoint_rps_limit,
     922            1 :             vec![
     923            1 :                 RateBucketInfo::new(100, Duration::from_secs(1)), // parsed from "100@1s"
     924            1 :                 RateBucketInfo::new(20, Duration::from_secs(30)), // parsed from "20@30s"
     925              :             ]
     926              :         );
     927            1 :     }
     928              : }
        

Generated by: LCOV version 2.1-beta