LCOV - 32f4a56327bc9da697706839ed4836b2a00a408f.info

LCOV - code coverage report

Current view:	top level - proxy/src/bin - proxy.rs (source / functions)		Coverage	Total	Hit
Test:	32f4a56327bc9da697706839ed4836b2a00a408f.info	Lines:	84.2 %	240	202
Test Date:	2024-02-07 07:37:29	Functions:	29.9 %	107	32

            Line data    Source code

       1              : use futures::future::Either;
       2              : use proxy::auth;
       3              : use proxy::auth::backend::MaybeOwned;
       4              : use proxy::config::AuthenticationConfig;
       5              : use proxy::config::CacheOptions;
       6              : use proxy::config::HttpConfig;
       7              : use proxy::config::ProjectInfoCacheOptions;
       8              : use proxy::console;
       9              : use proxy::context::parquet::ParquetUploadArgs;
      10              : use proxy::http;
      11              : use proxy::rate_limiter::EndpointRateLimiter;
      12              : use proxy::rate_limiter::RateBucketInfo;
      13              : use proxy::rate_limiter::RateLimiterConfig;
      14              : use proxy::redis::notifications;
      15              : use proxy::serverless::GlobalConnPoolOptions;
      16              : use proxy::usage_metrics;
      17              : 
      18              : use anyhow::bail;
      19              : use proxy::config::{self, ProxyConfig};
      20              : use proxy::serverless;
      21              : use std::net::SocketAddr;
      22              : use std::pin::pin;
      23              : use std::sync::Arc;
      24              : use tokio::net::TcpListener;
      25              : use tokio::task::JoinSet;
      26              : use tokio_util::sync::CancellationToken;
      27              : use tracing::info;
      28              : use tracing::warn;
      29              : use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
      30              : 
      31              : project_git_version!(GIT_VERSION);
      32              : project_build_tag!(BUILD_TAG);
      33              : 
      34              : use clap::{Parser, ValueEnum};
      35              : 
      36              : #[global_allocator]
      37         4654 : static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
      38              : 
      39          229 : #[derive(Clone, Debug, ValueEnum)]
      40              : enum AuthBackend {
      41              :     Console,
      42              :     #[cfg(feature = "testing")]
      43              :     Postgres,
      44              :     Link,
      45              : }
      46              : 
      47              : /// Neon proxy/router
      48           25 : #[derive(Parser)]
      49              : #[command(version = GIT_VERSION, about)]
      50              : struct ProxyCliArgs {
      51              :     /// Name of the region this proxy is deployed in
      52           25 :     #[clap(long, default_value_t = String::new())]
      53            0 :     region: String,
      54              :     /// listen for incoming client connections on ip:port
      55              :     #[clap(short, long, default_value = "127.0.0.1:4432")]
      56            0 :     proxy: String,
      57           25 :     #[clap(value_enum, long, default_value_t = AuthBackend::Link)]
      58            0 :     auth_backend: AuthBackend,
      59              :     /// listen for management callback connection on ip:port
      60              :     #[clap(short, long, default_value = "127.0.0.1:7000")]
      61            0 :     mgmt: String,
      62              :     /// listen for incoming http connections (metrics, etc) on ip:port
      63              :     #[clap(long, default_value = "127.0.0.1:7001")]
      64            0 :     http: String,
      65              :     /// listen for incoming wss connections on ip:port
      66              :     #[clap(long)]
      67              :     wss: Option<String>,
      68              :     /// redirect unauthenticated users to the given uri in case of link auth
      69              :     #[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
      70            0 :     uri: String,
      71              :     /// cloud API endpoint for authenticating users
      72              :     #[clap(
      73              :         short,
      74              :         long,
      75              :         default_value = "http://localhost:3000/authenticate_proxy_request/"
      76              :     )]
      77            0 :     auth_endpoint: String,
      78              :     /// path to TLS key for client postgres connections
      79              :     ///
      80              :     /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
      81              :     #[clap(short = 'k', long, alias = "ssl-key")]
      82              :     tls_key: Option<String>,
      83              :     /// path to TLS cert for client postgres connections
      84              :     ///
      85              :     /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
      86              :     #[clap(short = 'c', long, alias = "ssl-cert")]
      87              :     tls_cert: Option<String>,
      88              :     /// path to directory with TLS certificates for client postgres connections
      89              :     #[clap(long)]
      90              :     certs_dir: Option<String>,
      91              :     /// http endpoint to receive periodic metric updates
      92              :     #[clap(long)]
      93              :     metric_collection_endpoint: Option<String>,
      94              :     /// how often metrics should be sent to a collection endpoint
      95              :     #[clap(long)]
      96              :     metric_collection_interval: Option<String>,
      97              :     /// cache for `wake_compute` api method (use `size=0` to disable)
      98              :     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
      99            0 :     wake_compute_cache: String,
     100              :     /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
     101              :     #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
     102            0 :     wake_compute_lock: String,
     103              :     /// Allow self-signed certificates for compute nodes (for testing)
     104           25 :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     105            0 :     allow_self_signed_compute: bool,
     106              :     #[clap(flatten)]
     107              :     sql_over_http: SqlOverHttpArgs,
     108              :     /// timeout for scram authentication protocol
     109              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     110            0 :     scram_protocol_timeout: tokio::time::Duration,
     111              :     /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
     112           25 :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     113            0 :     require_client_ip: bool,
     114              :     /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
     115           25 :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     116            0 :     disable_dynamic_rate_limiter: bool,
     117              :     /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
     118           25 :     #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
     119            0 :     rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm,
     120              :     /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error.
     121              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     122            0 :     rate_limiter_timeout: tokio::time::Duration,
     123              :     /// Endpoint rate limiter max number of requests per second.
     124              :     ///
     125              :     /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
     126              :     /// Can be given multiple times for different bucket sizes.
     127          125 :     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
     128           25 :     endpoint_rps_limit: Vec<RateBucketInfo>,
     129              :     /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
     130           25 :     #[clap(long, default_value_t = 100)]
     131            0 :     initial_limit: usize,
     132              :     #[clap(flatten)]
     133              :     aimd_config: proxy::rate_limiter::AimdConfig,
     134              :     /// cache for `allowed_ips` (use `size=0` to disable)
     135              :     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     136            0 :     allowed_ips_cache: String,
     137              :     /// cache for `role_secret` (use `size=0` to disable)
     138              :     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     139            0 :     role_secret_cache: String,
     140              :     /// disable ip check for http requests. If it is too time consuming, it could be turned off.
     141           25 :     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     142            0 :     disable_ip_check_for_http: bool,
     143              :     /// redis url for notifications.
     144              :     #[clap(long)]
     145              :     redis_notifications: Option<String>,
     146              :     /// cache for `project_info` (use `size=0` to disable)
     147              :     #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
     148            0 :     project_info_cache: String,
     149              : 
     150              :     #[clap(flatten)]
     151              :     parquet_upload: ParquetUploadArgs,
     152              : }
     153              : 
     154           25 : #[derive(clap::Args, Clone, Copy, Debug)]
     155              : struct SqlOverHttpArgs {
     156              :     /// timeout for http connection requests
     157              :     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     158            0 :     sql_over_http_timeout: tokio::time::Duration,
     159              : 
     160              :     /// Whether the SQL over http pool is opt-in
     161           25 :     #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     162            0 :     sql_over_http_pool_opt_in: bool,
     163              : 
     164              :     /// How many connections to pool for each endpoint. Excess connections are discarded
     165           25 :     #[clap(long, default_value_t = 20)]
     166            0 :     sql_over_http_pool_max_conns_per_endpoint: usize,
     167              : 
     168              :     /// How long pooled connections should remain idle for before closing
     169              :     #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
     170            0 :     sql_over_http_idle_timeout: tokio::time::Duration,
     171              : 
     172              :     /// Duration each shard will wait on average before a GC sweep.
     173              :     /// A longer time will causes sweeps to take longer but will interfere less frequently.
     174              :     #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
     175            0 :     sql_over_http_pool_gc_epoch: tokio::time::Duration,
     176              : 
     177              :     /// How many shards should the global pool have. Must be a power of two.
     178              :     /// More shards will introduce less contention for pool operations, but can
     179              :     /// increase memory used by the pool
     180           25 :     #[clap(long, default_value_t = 128)]
     181            0 :     sql_over_http_pool_shards: usize,
     182              : }
     183              : 
     184              : #[tokio::main]
     185           23 : async fn main() -> anyhow::Result<()> {
     186           23 :     let _logging_guard = proxy::logging::init().await?;
     187           23 :     let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
     188           23 :     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
     189           23 : 
     190           23 :     info!("Version: {GIT_VERSION}");
     191           23 :     info!("Build_tag: {BUILD_TAG}");
     192           23 :     ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
     193           23 : 
     194           23 :     match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
     195           23 :         Ok(t) => {
     196           23 :             t.start();
     197           23 :         }
     198            0 :         Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
     199              :     }
     200              : 
     201           23 :     let args = ProxyCliArgs::parse();
     202           23 :     let config = build_config(&args)?;
     203              : 
     204           23 :     info!("Authentication backend: {}", config.auth_backend);
     205              : 
     206              :     // Check that we can bind to address before further initialization
     207           23 :     let http_address: SocketAddr = args.http.parse()?;
     208           23 :     info!("Starting http on {http_address}");
     209           23 :     let http_listener = TcpListener::bind(http_address).await?.into_std()?;
     210              : 
     211           23 :     let mgmt_address: SocketAddr = args.mgmt.parse()?;
     212           23 :     info!("Starting mgmt on {mgmt_address}");
     213           23 :     let mgmt_listener = TcpListener::bind(mgmt_address).await?;
     214              : 
     215           23 :     let proxy_address: SocketAddr = args.proxy.parse()?;
     216           23 :     info!("Starting proxy on {proxy_address}");
     217           23 :     let proxy_listener = TcpListener::bind(proxy_address).await?;
     218           23 :     let cancellation_token = CancellationToken::new();
     219           23 : 
     220           23 :     let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
     221           23 : 
     222           23 :     // client facing tasks. these will exit on error or on cancellation
     223           23 :     // cancellation returns Ok(())
     224           23 :     let mut client_tasks = JoinSet::new();
     225           23 :     client_tasks.spawn(proxy::proxy::task_main(
     226           23 :         config,
     227           23 :         proxy_listener,
     228           23 :         cancellation_token.clone(),
     229           23 :         endpoint_rate_limiter.clone(),
     230           23 :     ));
     231              : 
     232              :     // TODO: rename the argument to something like serverless.
     233              :     // It now covers more than just websockets, it also covers SQL over HTTP.
     234           23 :     if let Some(serverless_address) = args.wss {
     235           23 :         let serverless_address: SocketAddr = serverless_address.parse()?;
     236           23 :         info!("Starting wss on {serverless_address}");
     237           23 :         let serverless_listener = TcpListener::bind(serverless_address).await?;
     238              : 
     239           23 :         client_tasks.spawn(serverless::task_main(
     240           23 :             config,
     241           23 :             serverless_listener,
     242           23 :             cancellation_token.clone(),
     243           23 :             endpoint_rate_limiter.clone(),
     244           23 :         ));
     245            0 :     }
     246              : 
     247           23 :     client_tasks.spawn(proxy::context::parquet::worker(
     248           23 :         cancellation_token.clone(),
     249           23 :         args.parquet_upload,
     250           23 :     ));
     251           23 : 
     252           23 :     // maintenance tasks. these never return unless there's an error
     253           23 :     let mut maintenance_tasks = JoinSet::new();
     254           23 :     maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
     255           23 :     maintenance_tasks.spawn(http::health_server::task_main(http_listener));
     256           23 :     maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
     257              : 
     258           23 :     if let Some(metrics_config) = &config.metric_collection {
     259            1 :         maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
     260           22 :     }
     261              : 
     262           23 :     if let auth::BackendType::Console(api, _) = &config.auth_backend {
     263           20 :         if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
     264            1 :             let cache = api.caches.project_info.clone();
     265            1 :             if let Some(url) = args.redis_notifications {
     266            0 :                 info!("Starting redis notifications listener ({url})");
     267            0 :                 maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
     268            1 :             }
     269            1 :             maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
     270           19 :         }
     271            3 :     }
     272              : 
     273              :     let maintenance = loop {
     274              :         // get one complete task
     275           92 :         match futures::future::select(
     276           92 :             pin!(maintenance_tasks.join_next()),
     277           92 :             pin!(client_tasks.join_next()),
     278           92 :         )
     279           53 :         .await
     280              :         {
     281              :             // exit immediately on maintenance task completion
     282            0 :             Either::Left((Some(res), _)) => break proxy::flatten_err(res)?,
     283              :             // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
     284            0 :             Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
     285              :             // exit immediately on client task error
     286           69 :             Either::Right((Some(res), _)) => proxy::flatten_err(res)?,
     287              :             // exit if all our client tasks have shutdown gracefully
     288           23 :             Either::Right((None, _)) => return Ok(()),
     289              :         }
     290              :     };
     291              : 
     292              :     // maintenance tasks return Infallible success values, this is an impossible value
     293              :     // so this match statically ensures that there are no possibilities for that value
     294              :     match maintenance {}
     295              : }
     296              : 
     297              : /// ProxyConfig is created at proxy startup, and lives forever.
     298           23 : fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     299           23 :     let tls_config = match (&args.tls_key, &args.tls_cert) {
     300           23 :         (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
     301           23 :             key_path,
     302           23 :             cert_path,
     303           23 :             args.certs_dir.as_ref(),
     304           23 :         )?),
     305            0 :         (None, None) => None,
     306            0 :         _ => bail!("either both or neither tls-key and tls-cert must be specified"),
     307              :     };
     308              : 
     309           23 :     if args.allow_self_signed_compute {
     310            3 :         warn!("allowing self-signed compute certificates");
     311           20 :     }
     312              : 
     313           23 :     let metric_collection = match (
     314           23 :         &args.metric_collection_endpoint,
     315           23 :         &args.metric_collection_interval,
     316              :     ) {
     317            1 :         (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
     318            1 :             endpoint: endpoint.parse()?,
     319            1 :             interval: humantime::parse_duration(interval)?,
     320              :         }),
     321           22 :         (None, None) => None,
     322            0 :         _ => bail!(
     323            0 :             "either both or neither metric-collection-endpoint \
     324            0 :              and metric-collection-interval must be specified"
     325            0 :         ),
     326              :     };
     327           23 :     let rate_limiter_config = RateLimiterConfig {
     328           23 :         disable: args.disable_dynamic_rate_limiter,
     329           23 :         algorithm: args.rate_limit_algorithm,
     330           23 :         timeout: args.rate_limiter_timeout,
     331           23 :         initial_limit: args.initial_limit,
     332           23 :         aimd_config: Some(args.aimd_config),
     333           23 :     };
     334              : 
     335           23 :     let auth_backend = match &args.auth_backend {
     336              :         AuthBackend::Console => {
     337            1 :             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
     338            1 :             let project_info_cache_config: ProjectInfoCacheOptions =
     339            1 :                 args.project_info_cache.parse()?;
     340              : 
     341            1 :             info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
     342            1 :             info!(
     343            1 :                 "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
     344            1 :             );
     345            1 :             let caches = Box::leak(Box::new(console::caches::ApiCaches::new(
     346            1 :                 wake_compute_cache_config,
     347            1 :                 project_info_cache_config,
     348            1 :             )));
     349              : 
     350              :             let config::WakeComputeLockOptions {
     351            1 :                 shards,
     352            1 :                 permits,
     353            1 :                 epoch,
     354            1 :                 timeout,
     355            1 :             } = args.wake_compute_lock.parse()?;
     356            1 :             info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
     357            1 :             let locks = Box::leak(Box::new(
     358            1 :                 console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
     359            1 :                     .unwrap(),
     360            1 :             ));
     361            1 :             tokio::spawn(locks.garbage_collect_worker(epoch));
     362              : 
     363            1 :             let url = args.auth_endpoint.parse()?;
     364            1 :             let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));
     365            1 : 
     366            1 :             let api = console::provider::neon::Api::new(endpoint, caches, locks);
     367            1 :             let api = console::provider::ConsoleBackend::Console(api);
     368            1 :             auth::BackendType::Console(MaybeOwned::Owned(api), ())
     369              :         }
     370              :         #[cfg(feature = "testing")]
     371              :         AuthBackend::Postgres => {
     372           19 :             let url = args.auth_endpoint.parse()?;
     373           19 :             let api = console::provider::mock::Api::new(url);
     374           19 :             let api = console::provider::ConsoleBackend::Postgres(api);
     375           19 :             auth::BackendType::Console(MaybeOwned::Owned(api), ())
     376              :         }
     377              :         AuthBackend::Link => {
     378            3 :             let url = args.uri.parse()?;
     379            3 :             auth::BackendType::Link(MaybeOwned::Owned(url))
     380              :         }
     381              :     };
     382           23 :     let http_config = HttpConfig {
     383           23 :         request_timeout: args.sql_over_http.sql_over_http_timeout,
     384           23 :         pool_options: GlobalConnPoolOptions {
     385           23 :             max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
     386           23 :             gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
     387           23 :             pool_shards: args.sql_over_http.sql_over_http_pool_shards,
     388           23 :             idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
     389           23 :             opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
     390           23 :         },
     391           23 :     };
     392           23 :     let authentication_config = AuthenticationConfig {
     393           23 :         scram_protocol_timeout: args.scram_protocol_timeout,
     394           23 :     };
     395           23 : 
     396           23 :     let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
     397           23 :     RateBucketInfo::validate(&mut endpoint_rps_limit)?;
     398              : 
     399           23 :     let config = Box::leak(Box::new(ProxyConfig {
     400           23 :         tls_config,
     401           23 :         auth_backend,
     402           23 :         metric_collection,
     403           23 :         allow_self_signed_compute: args.allow_self_signed_compute,
     404           23 :         http_config,
     405           23 :         authentication_config,
     406           23 :         require_client_ip: args.require_client_ip,
     407           23 :         disable_ip_check_for_http: args.disable_ip_check_for_http,
     408           23 :         endpoint_rps_limit,
     409           23 :         // TODO: add this argument
     410           23 :         region: args.region.clone(),
     411           23 :     }));
     412           23 : 
     413           23 :     Ok(config)
     414           23 : }
     415              : 
     416              : #[cfg(test)]
     417              : mod tests {
     418              :     use std::time::Duration;
     419              : 
     420              :     use clap::Parser;
     421              :     use proxy::rate_limiter::RateBucketInfo;
     422              : 
     423            2 :     #[test]
     424            2 :     fn parse_endpoint_rps_limit() {
     425            2 :         let config = super::ProxyCliArgs::parse_from([
     426            2 :             "proxy",
     427            2 :             "--endpoint-rps-limit",
     428            2 :             "100@1s",
     429            2 :             "--endpoint-rps-limit",
     430            2 :             "20@30s",
     431            2 :         ]);
     432            2 : 
     433            2 :         assert_eq!(
     434            2 :             config.endpoint_rps_limit,
     435            2 :             vec![
     436            2 :                 RateBucketInfo::new(100, Duration::from_secs(1)),
     437            2 :                 RateBucketInfo::new(20, Duration::from_secs(30)),
     438            2 :             ]
     439            2 :         );
     440            2 :     }
     441              : }

Generated by: LCOV version 2.1-beta