Line data Source code
1 : use std::str::FromStr;
2 : use std::sync::Arc;
3 : use std::time::Duration;
4 :
5 : use anyhow::{Context, Ok, bail, ensure};
6 : use arc_swap::ArcSwapOption;
7 : use camino::{Utf8Path, Utf8PathBuf};
8 : use clap::ValueEnum;
9 : use compute_api::spec::LocalProxySpec;
10 : use remote_storage::RemoteStorageConfig;
11 : use thiserror::Error;
12 : use tokio::sync::Notify;
13 : use tracing::{debug, error, info, warn};
14 :
15 : use crate::auth::backend::jwt::JwkCache;
16 : use crate::auth::backend::local::JWKS_ROLE_MAP;
17 : use crate::control_plane::locks::ApiLocks;
18 : use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
19 : use crate::ext::TaskExt;
20 : use crate::intern::RoleNameInt;
21 : use crate::rate_limiter::{RateLimitAlgorithm, RateLimiterConfig};
22 : use crate::scram::threadpool::ThreadPool;
23 : use crate::serverless::GlobalConnPoolOptions;
24 : use crate::serverless::cancel_set::CancelSet;
25 : #[cfg(feature = "rest_broker")]
26 : use crate::serverless::rest::DbSchemaCache;
27 : pub use crate::tls::server_config::{TlsConfig, configure_tls};
28 : use crate::types::{Host, RoleName};
29 :
30 : pub struct ProxyConfig { // Aggregate of the proxy's settings; the grouped sub-configs are the structs declared further down in this file.
31 : pub tls_config: ArcSwapOption<TlsConfig>, // atomically swappable: `refresh_config_inner` stores a freshly built `TlsConfig` here
32 : pub metric_collection: Option<MetricCollectionConfig>,
33 : pub http_config: HttpConfig,
34 : pub authentication_config: AuthenticationConfig,
35 : #[cfg(feature = "rest_broker")] // the next field only exists in `rest_broker` builds
36 : pub rest_config: RestConfig,
37 : pub proxy_protocol_v2: ProxyProtocolV2,
38 : pub handshake_timeout: Duration,
39 : pub wake_compute_retry_config: RetryConfig,
40 : pub connect_compute_locks: ApiLocks<Host>, // locks keyed by `Host`
41 : pub connect_to_compute: ComputeConfig,
42 : pub greetings: String, // Greeting message sent to the client after connection establishment; it contains the session_id.
43 : #[cfg(feature = "testing")] // the next field only exists in `testing` builds
44 : pub disable_pg_session_jwt: bool,
45 : }
46 :
47 : pub struct ComputeConfig { // Stored in `ProxyConfig::connect_to_compute`.
48 : pub retry: RetryConfig, // retry/backoff parameters, see `RetryConfig`
49 : pub tls: Arc<rustls::ClientConfig>, // shared rustls client-side configuration
50 : pub timeout: Duration, // NOTE(review): presumably the per-attempt connect timeout — confirm at the use site
51 : }
52 :
53 : #[derive(Copy, Clone, Debug, ValueEnum, PartialEq)] // `ValueEnum` lets clap parse the variant from a command-line value
54 : pub enum ProxyProtocolV2 { // How a connection is treated with respect to the PROXY protocol v2 header.
55 : /// Connection will error if PROXY protocol v2 header is missing
56 : Required,
57 : /// Connection will error if PROXY protocol v2 header is provided
58 : Rejected,
59 : }
60 :
61 : #[derive(Debug)]
62 : pub struct MetricCollectionConfig { // Stored as `ProxyConfig::metric_collection`; that field is optional.
63 : pub endpoint: reqwest::Url, // NOTE(review): presumably the URL collected metrics are sent to — confirm
64 : pub interval: Duration, // NOTE(review): presumably the period between collections — confirm
65 : pub backup_metric_collection_config: MetricBackupCollectionConfig,
66 : }
67 :
68 : pub struct HttpConfig { // Stored as `ProxyConfig::http_config`.
69 : pub accept_websockets: bool,
70 : pub pool_options: GlobalConnPoolOptions, // options type comes from `crate::serverless`
71 : pub cancel_set: CancelSet,
72 : pub client_conn_threshold: u64, // NOTE(review): presumably a cap on concurrent client connections — confirm
73 : pub max_request_size_bytes: usize, // size limit expressed in bytes
74 : pub max_response_size_bytes: usize, // size limit expressed in bytes
75 : }
76 :
77 : pub struct AuthenticationConfig { // Stored as `ProxyConfig::authentication_config`.
78 : pub thread_pool: Arc<ThreadPool>, // the `ThreadPool` type comes from `crate::scram::threadpool`
79 : pub scram_protocol_timeout: tokio::time::Duration,
80 : pub ip_allowlist_check_enabled: bool,
81 : pub is_vpc_acccess_proxy: bool, // NOTE(review): "acccess" has three c's; fixing it means renaming the field at every use site
82 : pub jwks_cache: JwkCache,
83 : pub is_auth_broker: bool,
84 : pub accept_jwts: bool,
85 : pub console_redirect_confirmation_timeout: tokio::time::Duration,
86 : }
87 :
88 : #[cfg(feature = "rest_broker")] // the whole struct only exists in `rest_broker` builds
89 : pub struct RestConfig { // Stored as `ProxyConfig::rest_config`.
90 : pub is_rest_broker: bool,
91 : pub db_schema_cache: Option<DbSchemaCache>, // optional; NOTE(review): presumably `None` means schema caching is off — confirm
92 : pub max_schema_size: usize, // NOTE(review): unit is not visible here (bytes?) — confirm
93 : pub hostname_prefix: String,
94 : }
95 :
96 : #[derive(Debug)]
97 : pub struct MetricBackupCollectionConfig { // Nested inside `MetricCollectionConfig`.
98 : pub remote_storage_config: Option<RemoteStorageConfig>, // same type that `remote_storage_from_toml` produces; NOTE(review): presumably `None` disables the backup — confirm
99 : pub chunk_size: usize, // NOTE(review): unit is not visible here — confirm
100 : }
101 :
102 1 : pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<RemoteStorageConfig> {
103 1 : RemoteStorageConfig::from_toml(&s.parse()?)
104 1 : }
105 :
106 : /// Helper for cmdline cache options parsing (comma-separated `key=value` pairs, see the `FromStr` impl).
107 : #[derive(Debug)]
108 : pub struct CacheOptions {
109 : /// Max number of entries (`size` key).
110 : pub size: usize,
111 : /// Entry's time-to-live (`ttl` key, humantime syntax such as `4m`); may be omitted when `size=0`.
112 : pub ttl: Duration,
113 : }
114 :
115 : impl CacheOptions {
116 : /// Default options for [`crate::control_plane::NodeInfoCache`].
117 : pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m";
118 :
119 : /// Parse cache options passed via cmdline.
120 : /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
121 4 : fn parse(options: &str) -> anyhow::Result<Self> {
122 4 : let mut size = None;
123 4 : let mut ttl = None;
124 :
125 7 : for option in options.split(',') {
126 7 : let (key, value) = option
127 7 : .split_once('=')
128 7 : .with_context(|| format!("bad key-value pair: {option}"))?;
129 :
130 7 : match key {
131 7 : "size" => size = Some(value.parse()?),
132 3 : "ttl" => ttl = Some(humantime::parse_duration(value)?),
133 0 : unknown => bail!("unknown key: {unknown}"),
134 : }
135 : }
136 :
137 : // TTL doesn't matter if cache is always empty.
138 4 : if let Some(0) = size {
139 2 : ttl.get_or_insert(Duration::default());
140 2 : }
141 :
142 4 : Ok(Self {
143 4 : size: size.context("missing `size`")?,
144 4 : ttl: ttl.context("missing `ttl`")?,
145 : })
146 4 : }
147 : }
148 :
149 : impl FromStr for CacheOptions {
150 : type Err = anyhow::Error;
151 :
152 4 : fn from_str(options: &str) -> Result<Self, Self::Err> {
153 4 : let error = || format!("failed to parse cache options '{options}'");
154 4 : Self::parse(options).with_context(error)
155 4 : }
156 : }
157 :
158 : /// Helper for cmdline cache options parsing (comma-separated `key=value` pairs, see the `FromStr` impl).
159 : #[derive(Debug)]
160 : pub struct ProjectInfoCacheOptions {
161 : /// Max number of entries (`size` key).
162 : pub size: usize,
163 : /// Entry's time-to-live (`ttl` key, humantime syntax); may be omitted when `size=0`.
164 : pub ttl: Duration,
165 : /// Max number of roles per endpoint (`max_roles` key).
166 : pub max_roles: usize,
167 : /// Gc interval (`gc_interval` key, humantime syntax).
168 : pub gc_interval: Duration,
169 : }
170 :
171 : impl ProjectInfoCacheOptions {
172 : /// Default options for [`crate::control_plane::NodeInfoCache`].
173 : pub const CACHE_DEFAULT_OPTIONS: &'static str =
174 : "size=10000,ttl=4m,max_roles=10,gc_interval=60m";
175 :
176 : /// Parse cache options passed via cmdline.
177 : /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
178 0 : fn parse(options: &str) -> anyhow::Result<Self> {
179 0 : let mut size = None;
180 0 : let mut ttl = None;
181 0 : let mut max_roles = None;
182 0 : let mut gc_interval = None;
183 :
184 0 : for option in options.split(',') {
185 0 : let (key, value) = option
186 0 : .split_once('=')
187 0 : .with_context(|| format!("bad key-value pair: {option}"))?;
188 :
189 0 : match key {
190 0 : "size" => size = Some(value.parse()?),
191 0 : "ttl" => ttl = Some(humantime::parse_duration(value)?),
192 0 : "max_roles" => max_roles = Some(value.parse()?),
193 0 : "gc_interval" => gc_interval = Some(humantime::parse_duration(value)?),
194 0 : unknown => bail!("unknown key: {unknown}"),
195 : }
196 : }
197 :
198 : // TTL doesn't matter if cache is always empty.
199 0 : if let Some(0) = size {
200 0 : ttl.get_or_insert(Duration::default());
201 0 : }
202 :
203 0 : Ok(Self {
204 0 : size: size.context("missing `size`")?,
205 0 : ttl: ttl.context("missing `ttl`")?,
206 0 : max_roles: max_roles.context("missing `max_roles`")?,
207 0 : gc_interval: gc_interval.context("missing `gc_interval`")?,
208 : })
209 0 : }
210 : }
211 :
212 : impl FromStr for ProjectInfoCacheOptions {
213 : type Err = anyhow::Error;
214 :
215 0 : fn from_str(options: &str) -> Result<Self, Self::Err> {
216 0 : let error = || format!("failed to parse cache options '{options}'");
217 0 : Self::parse(options).with_context(error)
218 0 : }
219 : }
220 :
221 : /// Retry configuration used for both connect-to-compute and wake-compute (see the two default strings on the impl).
222 : #[derive(Clone, Copy, Debug)]
223 : pub struct RetryConfig {
224 : /// Number of times we should retry (`num_retries` key).
225 : pub max_retries: u32,
226 : /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 (`base_retry_wait_duration` key).
227 : pub base_delay: tokio::time::Duration,
228 : /// Exponential base for retry wait duration (`retry_wait_exponent_base` key).
229 : pub backoff_factor: f64,
230 : }
231 :
232 : impl RetryConfig {
233 : // Default options for RetryConfig.
234 :
235 : /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s.
236 : pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
237 : "num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2";
238 : /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
239 : /// Cplane has timeout of 60s on each request. 8m7s in total.
240 : pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
241 : "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6";
242 :
243 : /// Parse retry options passed via cmdline.
244 : /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`].
245 0 : pub fn parse(options: &str) -> anyhow::Result<Self> {
246 0 : let mut num_retries = None;
247 0 : let mut base_retry_wait_duration = None;
248 0 : let mut retry_wait_exponent_base = None;
249 :
250 0 : for option in options.split(',') {
251 0 : let (key, value) = option
252 0 : .split_once('=')
253 0 : .with_context(|| format!("bad key-value pair: {option}"))?;
254 :
255 0 : match key {
256 0 : "num_retries" => num_retries = Some(value.parse()?),
257 0 : "base_retry_wait_duration" => {
258 0 : base_retry_wait_duration = Some(humantime::parse_duration(value)?);
259 : }
260 0 : "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
261 0 : unknown => bail!("unknown key: {unknown}"),
262 : }
263 : }
264 :
265 0 : Ok(Self {
266 0 : max_retries: num_retries.context("missing `num_retries`")?,
267 0 : base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?,
268 0 : backoff_factor: retry_wait_exponent_base
269 0 : .context("missing `retry_wait_exponent_base`")?,
270 : })
271 0 : }
272 : }
273 :
274 : /// Helper for cmdline lock options parsing; accepts `key=value` pairs or a JSON object (see `parse`).
275 : #[derive(serde::Deserialize)]
276 : pub struct ConcurrencyLockOptions {
277 : /// The number of shards the lock map should have
278 : pub shards: usize,
279 : /// The number of allowed concurrent requests for each endpoint.
280 : #[serde(flatten)] // the limiter's own fields sit at the top level of the JSON object
281 : pub limiter: RateLimiterConfig,
282 : /// Garbage collection epoch
283 : #[serde(deserialize_with = "humantime_serde::deserialize")] // JSON value is a humantime string such as "10m"
284 : pub epoch: Duration,
285 : /// Lock timeout
286 : #[serde(deserialize_with = "humantime_serde::deserialize")] // JSON value is a humantime string such as "1s"
287 : pub timeout: Duration,
288 : }
289 :
290 : impl ConcurrencyLockOptions {
291 : /// Default options for [`crate::control_plane::client::ApiLocks`].
292 : pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
293 : /// Default options for [`crate::control_plane::client::ApiLocks`].
294 : pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
295 : "shards=64,permits=100,epoch=10m,timeout=10ms";
296 :
297 : // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
298 :
299 : /// Parse lock options passed via cmdline.
300 : /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`].
301 4 : fn parse(options: &str) -> anyhow::Result<Self> {
302 4 : let options = options.trim();
303 4 : if options.starts_with('{') && options.ends_with('}') {
304 1 : return Ok(serde_json::from_str(options)?);
305 3 : }
306 :
307 3 : let mut shards = None;
308 3 : let mut permits = None;
309 3 : let mut epoch = None;
310 3 : let mut timeout = None;
311 :
312 9 : for option in options.split(',') {
313 9 : let (key, value) = option
314 9 : .split_once('=')
315 9 : .with_context(|| format!("bad key-value pair: {option}"))?;
316 :
317 9 : match key {
318 9 : "shards" => shards = Some(value.parse()?),
319 7 : "permits" => permits = Some(value.parse()?),
320 4 : "epoch" => epoch = Some(humantime::parse_duration(value)?),
321 2 : "timeout" => timeout = Some(humantime::parse_duration(value)?),
322 0 : unknown => bail!("unknown key: {unknown}"),
323 : }
324 : }
325 :
326 : // these dont matter if lock is disabled
327 3 : if let Some(0) = permits {
328 1 : timeout = Some(Duration::default());
329 1 : epoch = Some(Duration::default());
330 1 : shards = Some(2);
331 2 : }
332 :
333 3 : let permits = permits.context("missing `permits`")?;
334 3 : let out = Self {
335 3 : shards: shards.context("missing `shards`")?,
336 3 : limiter: RateLimiterConfig {
337 3 : algorithm: RateLimitAlgorithm::Fixed,
338 3 : initial_limit: permits,
339 3 : },
340 3 : epoch: epoch.context("missing `epoch`")?,
341 3 : timeout: timeout.context("missing `timeout`")?,
342 : };
343 :
344 3 : ensure!(out.shards > 1, "shard count must be > 1");
345 3 : ensure!(
346 3 : out.shards.is_power_of_two(),
347 0 : "shard count must be a power of two"
348 : );
349 :
350 3 : Ok(out)
351 4 : }
352 : }
353 :
354 : impl FromStr for ConcurrencyLockOptions {
355 : type Err = anyhow::Error;
356 :
357 4 : fn from_str(options: &str) -> Result<Self, Self::Err> {
358 4 : let error = || format!("failed to parse cache lock options '{options}'");
359 4 : Self::parse(options).with_context(error)
360 4 : }
361 : }
362 :
363 : #[derive(Error, Debug)]
364 : pub(crate) enum RefreshConfigError { // Failure modes of `refresh_config_inner`; every variant displays as its wrapped error.
365 : #[error(transparent)]
366 : Read(#[from] std::io::Error), // reading the config file failed
367 : #[error(transparent)]
368 : Parse(#[from] serde_json::Error), // the file contents did not deserialize as JSON
369 : #[error(transparent)]
370 : Validate(anyhow::Error), // a JWKS entry was rejected by validation
371 : #[error(transparent)]
372 : Tls(anyhow::Error), // building the TLS configuration failed
373 : }
374 :
375 0 : pub(crate) async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc<Notify>) {
376 0 : let mut init = true;
377 : loop {
378 0 : rx.notified().await;
379 :
380 0 : match refresh_config_inner(config, &path).await {
381 0 : std::result::Result::Ok(()) => {}
382 : // don't log for file not found errors if this is the first time we are checking
383 : // for computes that don't use local_proxy, this is not an error.
384 0 : Err(RefreshConfigError::Read(e))
385 0 : if init && e.kind() == std::io::ErrorKind::NotFound =>
386 : {
387 0 : debug!(error=?e, ?path, "could not read config file");
388 : }
389 0 : Err(RefreshConfigError::Tls(e)) => {
390 0 : error!(error=?e, ?path, "could not read TLS certificates");
391 : }
392 0 : Err(e) => {
393 0 : error!(error=?e, ?path, "could not read config file");
394 : }
395 : }
396 :
397 0 : init = false;
398 : }
399 : }
400 :
401 0 : pub(crate) async fn refresh_config_inner(
402 0 : config: &ProxyConfig,
403 0 : path: &Utf8Path,
404 0 : ) -> Result<(), RefreshConfigError> {
405 0 : let bytes = tokio::fs::read(&path).await?;
406 0 : let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
407 :
408 0 : let mut jwks_set = vec![];
409 :
410 0 : fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result<JwksSettings> {
411 0 : let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
412 :
413 0 : ensure!(
414 0 : jwks_url.has_authority()
415 0 : && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
416 0 : "Invalid JWKS url. Must be HTTP",
417 : );
418 :
419 0 : ensure!(
420 0 : jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
421 0 : "Invalid JWKS url. No domain listed",
422 : );
423 :
424 : // clear username, password and ports
425 0 : jwks_url
426 0 : .set_username("")
427 0 : .expect("url can be a base and has a valid host and is not a file. should not error");
428 0 : jwks_url
429 0 : .set_password(None)
430 0 : .expect("url can be a base and has a valid host and is not a file. should not error");
431 : // local testing is hard if we need to have a specific restricted port
432 0 : if cfg!(not(feature = "testing")) {
433 0 : jwks_url.set_port(None).expect(
434 0 : "url can be a base and has a valid host and is not a file. should not error",
435 0 : );
436 0 : }
437 :
438 : // clear query params
439 0 : jwks_url.set_fragment(None);
440 0 : jwks_url.query_pairs_mut().clear().finish();
441 :
442 0 : if jwks_url.scheme() != "https" {
443 : // local testing is hard if we need to set up https support.
444 0 : if cfg!(not(feature = "testing")) {
445 0 : jwks_url
446 0 : .set_scheme("https")
447 0 : .expect("should not error to set the scheme to https if it was http");
448 0 : } else {
449 0 : warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
450 : }
451 0 : }
452 :
453 0 : Ok(JwksSettings {
454 0 : id: jwks.id,
455 0 : jwks_url,
456 0 : _provider_name: jwks.provider_name,
457 0 : jwt_audience: jwks.jwt_audience,
458 0 : role_names: jwks
459 0 : .role_names
460 0 : .into_iter()
461 0 : .map(RoleName::from)
462 0 : .map(|s| RoleNameInt::from(&s))
463 0 : .collect(),
464 : })
465 0 : }
466 :
467 0 : for jwks in data.jwks.into_iter().flatten() {
468 0 : jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?);
469 : }
470 :
471 0 : info!("successfully loaded new config");
472 0 : JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
473 :
474 0 : if let Some(tls_config) = data.tls {
475 0 : let tls_config = tokio::task::spawn_blocking(move || {
476 0 : crate::tls::server_config::configure_tls(
477 0 : tls_config.key_path.as_ref(),
478 0 : tls_config.cert_path.as_ref(),
479 0 : None,
480 : false,
481 : )
482 0 : })
483 0 : .await
484 0 : .propagate_task_panic()
485 0 : .map_err(RefreshConfigError::Tls)?;
486 0 : config.tls_config.store(Some(Arc::new(tls_config)));
487 0 : }
488 :
489 0 : std::result::Result::Ok(())
490 0 : }
491 :
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rate_limiter::Aimd;

    /// `CacheOptions` accepts keys in any order and lets `ttl` be omitted for `size=0`.
    #[test]
    fn test_parse_cache_options() -> anyhow::Result<()> {
        let CacheOptions { size, ttl } = "size=4096,ttl=5min".parse()?;
        assert_eq!(size, 4096);
        assert_eq!(ttl, Duration::from_secs(5 * 60));

        let CacheOptions { size, ttl } = "ttl=4m,size=2".parse()?;
        assert_eq!(size, 2);
        assert_eq!(ttl, Duration::from_secs(4 * 60));

        let CacheOptions { size, ttl } = "size=0,ttl=1s".parse()?;
        assert_eq!(size, 0);
        assert_eq!(ttl, Duration::from_secs(1));

        let CacheOptions { size, ttl } = "size=0".parse()?;
        assert_eq!(size, 0);
        assert_eq!(ttl, Duration::default());

        Ok(())
    }

    /// The default project-info cache string parses, and `ttl` is optional only for `size=0`.
    #[test]
    fn test_parse_project_info_cache_options() -> anyhow::Result<()> {
        let ProjectInfoCacheOptions {
            size,
            ttl,
            max_roles,
            gc_interval,
        } = ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS.parse()?;
        assert_eq!(size, 10000);
        assert_eq!(ttl, Duration::from_secs(4 * 60));
        assert_eq!(max_roles, 10);
        assert_eq!(gc_interval, Duration::from_secs(60 * 60));

        let ProjectInfoCacheOptions { size, ttl, .. } =
            "size=0,max_roles=1,gc_interval=1m".parse()?;
        assert_eq!(size, 0);
        assert_eq!(ttl, Duration::default());

        assert!(
            "size=1,max_roles=1,gc_interval=1m"
                .parse::<ProjectInfoCacheOptions>()
                .is_err()
        );
        assert!("size=1,bogus=1".parse::<ProjectInfoCacheOptions>().is_err());

        Ok(())
    }

    /// Both default retry strings parse; incomplete or malformed input is rejected.
    #[test]
    fn test_parse_retry_config() -> anyhow::Result<()> {
        let RetryConfig {
            max_retries,
            base_delay,
            backoff_factor,
        } = RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?;
        assert_eq!(max_retries, 5);
        assert_eq!(base_delay, Duration::from_millis(200));
        assert_eq!(backoff_factor, 2.0);

        let RetryConfig {
            max_retries,
            base_delay,
            backoff_factor,
        } = RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?;
        assert_eq!(max_retries, 8);
        assert_eq!(base_delay, Duration::from_millis(100));
        assert_eq!(backoff_factor, 1.6);

        assert!(RetryConfig::parse("num_retries=5").is_err());
        assert!(RetryConfig::parse("num_retries").is_err());
        assert!(RetryConfig::parse("num_retries=5,bogus=1").is_err());

        Ok(())
    }

    /// Key-value lock options: any key order, and `permits=0` fills in the rest.
    #[test]
    fn test_parse_lock_options() -> anyhow::Result<()> {
        let ConcurrencyLockOptions {
            epoch,
            limiter,
            shards,
            timeout,
        } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
        assert_eq!(epoch, Duration::from_secs(10 * 60));
        assert_eq!(timeout, Duration::from_secs(1));
        assert_eq!(shards, 32);
        assert_eq!(limiter.initial_limit, 4);
        assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);

        let ConcurrencyLockOptions {
            epoch,
            limiter,
            shards,
            timeout,
        } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
        assert_eq!(epoch, Duration::from_secs(60));
        assert_eq!(timeout, Duration::from_millis(100));
        assert_eq!(shards, 16);
        assert_eq!(limiter.initial_limit, 8);
        assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);

        let ConcurrencyLockOptions {
            epoch,
            limiter,
            shards,
            timeout,
        } = "permits=0".parse()?;
        assert_eq!(epoch, Duration::ZERO);
        assert_eq!(timeout, Duration::ZERO);
        assert_eq!(shards, 2);
        assert_eq!(limiter.initial_limit, 0);
        assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);

        Ok(())
    }

    /// JSON lock options can select the AIMD algorithm with its parameters.
    #[test]
    fn test_parse_json_lock_options() -> anyhow::Result<()> {
        let ConcurrencyLockOptions {
            epoch,
            limiter,
            shards,
            timeout,
        } = r#"{"shards":32,"initial_limit":44,"aimd":{"min":5,"max":500,"inc":10,"dec":0.9,"utilisation":0.8},"epoch":"10m","timeout":"1s"}"#
            .parse()?;
        assert_eq!(epoch, Duration::from_secs(10 * 60));
        assert_eq!(timeout, Duration::from_secs(1));
        assert_eq!(shards, 32);
        assert_eq!(limiter.initial_limit, 44);
        assert_eq!(
            limiter.algorithm,
            RateLimitAlgorithm::Aimd {
                conf: Aimd {
                    min: 5,
                    max: 500,
                    dec: 0.9,
                    inc: 10,
                    utilisation: 0.8
                }
            },
        );

        Ok(())
    }
}
|