Line data Source code
1 : use camino::Utf8PathBuf;
2 :
3 : #[cfg(test)]
4 : mod tests;
5 :
6 : use const_format::formatcp;
/// Port used by [`DEFAULT_PG_LISTEN_ADDR`].
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
/// Loopback address on [`DEFAULT_PG_LISTEN_PORT`]; the default for `ConfigToml::listen_pg_addr`.
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
/// Port used by [`DEFAULT_HTTP_LISTEN_ADDR`].
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
/// Loopback address on [`DEFAULT_HTTP_LISTEN_PORT`]; the default for `ConfigToml::listen_http_addr`.
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
// TODO: gRPC is disabled by default for now, but the port is used in neon_local.
pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051
13 :
14 : use std::collections::HashMap;
15 : use std::num::{NonZeroU64, NonZeroUsize};
16 : use std::str::FromStr;
17 : use std::time::Duration;
18 :
19 : use postgres_backend::AuthType;
20 : use remote_storage::RemoteStorageConfig;
21 : use serde_with::serde_as;
22 : use utils::logging::LogFormat;
23 : use utils::postgres_client::PostgresClientProtocol;
24 :
25 : use crate::models::{ImageCompressionAlgorithm, LsnLease};
26 :
/// Certain metadata (e.g. externally-addressable name, AZ) is delivered
/// as a separate structure. This information is not needed by the pageserver
/// itself, it is only used for registering the pageserver with the control
/// plane and/or storage controller.
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
pub struct NodeMetadata {
    /// (De)serialized under the key `host`.
    #[serde(rename = "host")]
    pub postgres_host: String,
    /// (De)serialized under the key `port`.
    #[serde(rename = "port")]
    pub postgres_port: u16,
    pub http_host: String,
    pub http_port: u16,
    pub https_port: Option<u16>,

    // Deployment tools may write fields to the metadata file beyond what we
    // use in this type: this type intentionally only names the fields that we require.
    // Everything else is collected here (and written back out) via `serde(flatten)`.
    #[serde(flatten)]
    pub other: HashMap<String, serde_json::Value>,
}
47 :
/// PostHog integration config.
///
/// Carried as the optional `posthog_config` field of `ConfigToml`.
///
/// NOTE(review): `Debug` is derived, so debug-formatting this struct prints
/// `server_api_key` verbatim -- confirm it is never logged as-is.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct PostHogConfig {
    /// PostHog project ID
    pub project_id: String,
    /// Server-side (private) API key
    pub server_api_key: String,
    /// Client-side (public) API key
    pub client_api_key: String,
    /// Private API URL
    pub private_api_url: String,
    /// Public API URL
    pub public_api_url: String,
}
62 :
/// `pageserver.toml`
///
/// We use serde derive with `#[serde(default)]` to generate a deserializer
/// that fills in the default values for each config field.
///
/// If there cannot be a static default value because we need to make runtime
/// checks to determine the default, make it an `Option` (which defaults to None).
/// The runtime check should be done in the consuming crate, i.e., `pageserver`.
///
/// Unknown fields are silently ignored during deserialization.
/// The alternative, which we used in the past, was to set `deny_unknown_fields`,
/// which fails deserialization, and hence pageserver startup, if there is an unknown field.
/// The reason we don't do that anymore is that it complicates
/// usage of config fields for feature flagging, which we commonly do for
/// region-by-region rollouts.
/// The complications mainly arise because the `pageserver.toml` contents on a
/// prod server have a separate lifecycle from the pageserver binary.
/// For instance, `pageserver.toml` contents today are defined in the internal
/// infra repo, and thus introducing a new config field to pageserver and
/// rolling it out to prod servers are separate commits in separate repos
/// that can't be made or rolled back atomically.
/// Rollbacks in particular pose a risk with deny_unknown_fields because
/// the old pageserver binary may reject a new config field, resulting in
/// an outage unless the person doing the pageserver rollback remembers
/// to also revert the commit that added the config field into the
/// `pageserver.toml` templates in the internal infra repo.
/// (A pre-deploy config check would eliminate this risk during rollbacks,
/// cf [here](https://github.com/neondatabase/cloud/issues/24349).)
/// In addition to this compatibility problem during emergency rollbacks,
/// deny_unknown_fields adds further complications when decommissioning a feature
/// flag: with deny_unknown_fields, we can't remove a flag from the [`ConfigToml`]
/// until all prod servers' `pageserver.toml` files have been updated to a version
/// that doesn't specify the flag. Otherwise new software would fail to start up.
/// This adds the requirement for an intermediate step where the new config field
/// is accepted but ignored, prolonging the decommissioning process by an entire
/// release cycle.
/// By contrast, with unknown fields silently ignored, decommissioning a feature
/// flag is a one-step process: we can skip the intermediate step and straight
/// remove the field from the [`ConfigToml`]. We leave the field in the
/// `pageserver.toml` files on prod servers until we reach certainty that we
/// will not roll back to old software whose behavior was dependent on config.
/// Then we can remove the field from the templates in the internal infra repo.
/// This process is [documented internally](
/// https://docs.neon.build/storage/pageserver_configuration.html).
///
/// Note that the above relaxed compatibility for the config format does NOT APPLY
/// TO THE STORAGE FORMAT. As general guidance, when introducing storage format
/// changes, ensure that the potential rollback target version will be compatible
/// with the new format. This must hold regardless of what flags are set in the `pageserver.toml`:
/// any format version that exists in an environment must be compatible with the software that runs there.
/// Use a pageserver.toml flag only to gate whether software _writes_ the new format.
/// For more compatibility considerations, refer to [internal docs](
/// https://docs.neon.build/storage/compat.html?highlight=compat#format-versions--compatibility)
#[serde_as]
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
#[serde(default)]
pub struct ConfigToml {
    // types mapped 1:1 into the runtime PageServerConfig type
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
    pub listen_https_addr: Option<String>,
    pub listen_grpc_addr: Option<String>,
    pub ssl_key_file: Utf8PathBuf,
    pub ssl_cert_file: Utf8PathBuf,
    #[serde(with = "humantime_serde")]
    pub ssl_cert_reload_period: Duration,
    pub ssl_ca_file: Option<Utf8PathBuf>,
    pub availability_zone: Option<String>,
    #[serde(with = "humantime_serde")]
    pub wait_lsn_timeout: Duration,
    #[serde(with = "humantime_serde")]
    pub wal_redo_timeout: Duration,
    pub superuser: String,
    pub locale: String,
    pub page_cache_size: usize,
    pub max_file_descriptors: usize,
    pub pg_distrib_dir: Option<Utf8PathBuf>,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub http_auth_type: AuthType,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub pg_auth_type: AuthType,
    // NOTE(review): unlike `http_auth_type` / `pg_auth_type`, this field carries no
    // `serde_as(DisplayFromStr)` attribute, so it goes through `AuthType`'s own
    // serde impls -- confirm that this asymmetry is intended.
    pub grpc_auth_type: AuthType,
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,
    pub remote_storage: Option<RemoteStorageConfig>,
    /// Tenant-level settings, see [`TenantConfigToml`].
    pub tenant_config: TenantConfigToml,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub broker_endpoint: storage_broker::Uri,
    #[serde(with = "humantime_serde")]
    pub broker_keepalive_interval: Duration,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub log_format: LogFormat,
    pub concurrent_tenant_warmup: NonZeroUsize,
    pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
    #[serde(with = "humantime_serde")]
    pub metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<reqwest::Url>,
    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    #[serde(with = "humantime_serde")]
    pub synthetic_size_calculation_interval: Duration,
    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
    pub test_remote_failures: u64,
    pub ondemand_download_behavior_treat_error_as_warn: bool,
    #[serde(with = "humantime_serde")]
    pub background_task_maximum_delay: Duration,
    pub control_plane_api: Option<reqwest::Url>,
    pub control_plane_api_token: Option<String>,
    pub control_plane_emergency_mode: bool,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_upcall_api: Option<reqwest::Url>,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_upcall_api_token: Option<String>,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_aws_endpoint_url: Option<reqwest::Url>,
    pub heatmap_upload_concurrency: usize,
    pub secondary_download_concurrency: usize,
    pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
    pub ingest_batch_size: u64,
    pub max_vectored_read_bytes: MaxVectoredReadBytes,
    pub image_compression: ImageCompressionAlgorithm,
    pub timeline_offloading: bool,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
    // The `skip_serializing_if` fields below are omitted from serialized output while unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub no_sync: Option<bool>,
    pub wal_receiver_protocol: PostgresClientProtocol,
    pub page_service_pipelining: PageServicePipeliningConfig,
    pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
    pub enable_read_path_debugging: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub validate_wal_contiguity: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub load_previous_heatmap: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
    pub enable_tls_page_service_api: bool,
    pub dev_mode: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub posthog_config: Option<PostHogConfig>,
    pub timeline_import_config: TimelineImportConfig,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub basebackup_cache_config: Option<BasebackupCacheConfig>,
}
210 :
/// Settings for the optional `disk_usage_based_eviction` section of `ConfigToml`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct DiskUsageEvictionTaskConfig {
    pub max_usage_pct: utils::serde_percent::Percent,
    pub min_avail_bytes: u64,
    /// Parsed from / written as a human-readable duration string.
    #[serde(with = "humantime_serde")]
    pub period: Duration,
    /// Only present when the crate is built with the `testing` feature.
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<statvfs::mock::Behavior>,
    /// Select sorting for evicted layers
    ///
    /// Falls back to `EvictionOrder::default()` when absent from the config.
    #[serde(default)]
    pub eviction_order: EvictionOrder,
}
223 :
/// Value of `ConfigToml::page_service_pipelining`.
///
/// Serialized as an internally tagged enum: the variant is selected by a
/// `mode` key whose value is the kebab-case variant name.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum PageServicePipeliningConfig {
    Serial,
    /// Carries the additional settings in [`PageServicePipeliningConfigPipelined`].
    Pipelined(PageServicePipeliningConfigPipelined),
}
/// Settings carried by `PageServicePipeliningConfig::Pipelined`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct PageServicePipeliningConfigPipelined {
    /// Causes runtime errors if larger than max get_vectored batch size.
    pub max_batch_size: NonZeroUsize,
    pub execution: PageServiceProtocolPipelinedExecutionStrategy,
    // The default below is such that new versions of the software can start
    // with the old configuration.
    #[serde(default)]
    pub batching: PageServiceProtocolPipelinedBatchingStrategy,
}
240 :
/// Value of `PageServicePipeliningConfigPipelined::execution`.
///
/// Variant names are (de)serialized in kebab-case
/// (`concurrent-futures`, `tasks`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PageServiceProtocolPipelinedExecutionStrategy {
    ConcurrentFutures,
    Tasks,
}
247 :
/// Value of `PageServicePipeliningConfigPipelined::batching`.
///
/// Variant names are (de)serialized in kebab-case; `UniformLsn` is the
/// `Default`, which is what a config without a `batching` key gets.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PageServiceProtocolPipelinedBatchingStrategy {
    /// All get page requests in a batch will be at the same LSN
    #[default]
    UniformLsn,
    /// Get page requests in a batch may be at different LSN
    ///
    /// One key cannot be present more than once at different LSNs in
    /// the same batch.
    ScatteredLsn,
}
260 :
/// Value of `ConfigToml::get_vectored_concurrent_io`.
///
/// Serialized as an internally tagged enum: the variant is selected by a
/// `mode` key whose value is the kebab-case variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
    /// one after the other and IOs are issued and waited upon
    /// from the same task that traverses the layers.
    Sequential,
    /// The read path still traverses layers sequentially, and
    /// index blocks will be read into the PS PageCache from
    /// that task, with waiting.
    /// But data IOs are dispatched and waited upon from a sidecar
    /// task so that the traversing task can continue to traverse
    /// layers while the IOs are in flight.
    /// If the PS PageCache miss rate is low, this improves
    /// throughput dramatically.
    SidecarTask,
}
278 :
/// A fraction expressed as two integers; used for the `sampling_ratio`
/// fields of `Tracing` and `TenantConfigToml`.
///
/// NOTE(review): nothing in this type rejects `denominator == 0` -- confirm
/// that consumers validate it.
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct Ratio {
    pub numerator: usize,
    pub denominator: usize,
}
284 :
/// Exporter settings nested in `Tracing::export_config`.
///
/// Convertible into `tracing_utils::ExportConfig` through the `From` impl in this file.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct OtelExporterConfig {
    pub endpoint: String,
    pub protocol: OtelExporterProtocol,
    /// Parsed from / written as a human-readable duration string.
    #[serde(with = "humantime_serde")]
    pub timeout: Duration,
}
292 :
/// Value of `OtelExporterConfig::protocol`.
///
/// Variant names are (de)serialized in kebab-case
/// (`grpc`, `http-binary`, `http-json`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum OtelExporterProtocol {
    Grpc,
    HttpBinary,
    HttpJson,
}
300 :
/// Contents of the optional `tracing` section of `ConfigToml`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct Tracing {
    pub sampling_ratio: Ratio,
    pub export_config: OtelExporterConfig,
}
306 :
307 : impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
308 0 : fn from(val: &OtelExporterConfig) -> Self {
309 0 : tracing_utils::ExportConfig {
310 0 : endpoint: Some(val.endpoint.clone()),
311 0 : protocol: val.protocol.into(),
312 0 : timeout: val.timeout,
313 0 : }
314 0 : }
315 : }
316 :
317 : impl From<OtelExporterProtocol> for tracing_utils::Protocol {
318 0 : fn from(val: OtelExporterProtocol) -> Self {
319 0 : match val {
320 0 : OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc,
321 0 : OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson,
322 0 : OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary,
323 : }
324 0 : }
325 : }
326 :
/// Value of `ConfigToml::timeline_import_config`.
///
/// NOTE(review): the units of the size limit and of the checkpoint threshold
/// are not stated here -- confirm against the consumer before documenting them.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct TimelineImportConfig {
    pub import_job_concurrency: NonZeroUsize,
    pub import_job_soft_size_limit: NonZeroUsize,
    pub import_job_checkpoint_threshold: NonZeroUsize,
}
333 :
/// Contents of the optional `basebackup_cache_config` section of `ConfigToml`.
///
/// The struct is `#[serde(default)]`: keys missing from the section take
/// their value from this type's `Default` impl.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub struct BasebackupCacheConfig {
    /// Parsed from / written as a human-readable duration string.
    #[serde(with = "humantime_serde")]
    pub cleanup_period: Duration,
    // FIXME: Support max_size_bytes.
    // pub max_size_bytes: usize,
    pub max_size_entries: i64,
}
343 :
344 : impl Default for BasebackupCacheConfig {
345 0 : fn default() -> Self {
346 0 : Self {
347 0 : cleanup_period: Duration::from_secs(60),
348 0 : // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
349 0 : max_size_entries: 1000,
350 0 : }
351 0 : }
352 : }
353 :
/// Config types describing mocked `statvfs` results; referenced from
/// `DiskUsageEvictionTaskConfig::mock_statvfs` in `testing` builds.
pub mod statvfs {
    pub mod mock {
        /// What the mock should report. Serialized as an internally tagged
        /// enum keyed by `type`.
        #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
        #[serde(tag = "type")]
        pub enum Behavior {
            Success {
                blocksize: u64,
                total_blocks: u64,
                name_filter: Option<utils::serde_regex::Regex>,
            },
            /// Only available when built with the `testing` feature.
            #[cfg(feature = "testing")]
            Failure { mocked_error: MockedError },
        }

        /// Errors the mock can be told to produce (`testing` builds only).
        #[cfg(feature = "testing")]
        #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
        #[allow(clippy::upper_case_acronyms)]
        pub enum MockedError {
            EIO,
        }

        #[cfg(feature = "testing")]
        impl From<MockedError> for nix::Error {
            /// Maps the mocked error onto the `nix` error of the same name.
            fn from(e: MockedError) -> Self {
                match e {
                    MockedError::EIO => nix::Error::EIO,
                }
            }
        }
    }
}
385 :
/// Value of `DiskUsageEvictionTaskConfig::eviction_order`.
///
/// Serialized as an adjacently tagged enum: the variant name goes under
/// `type` and the variant's fields under `args`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", content = "args")]
pub enum EvictionOrder {
    RelativeAccessed {
        highest_layer_count_loses_first: bool,
    },
}
393 :
394 : impl Default for EvictionOrder {
395 1 : fn default() -> Self {
396 1 : Self::RelativeAccessed {
397 1 : highest_layer_count_loses_first: true,
398 1 : }
399 1 : }
400 : }
401 :
/// Newtype for `ConfigToml::max_vectored_read_bytes`.
///
/// `#[serde(transparent)]`: (de)serialized exactly like the wrapped
/// `NonZeroUsize`, so zero is rejected at deserialization time.
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
405 :
/// Tenant-level configuration values, used for various purposes.
///
/// The struct is `#[serde(default)]`: keys missing from the config take their
/// value from this type's `Default` impl. All `Duration` fields are parsed
/// from / written as human-readable duration strings.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub struct TenantConfigToml {
    /// Flush out an inmemory layer, if it's holding WAL older than this.
    /// This puts a backstop on how much WAL needs to be re-digested if the
    /// page server crashes.
    /// This parameter actually determines L0 layer file size.
    pub checkpoint_distance: u64,
    /// Inmemory layer is also flushed at least once in checkpoint_timeout to
    /// eventually upload WAL after activity is stopped.
    #[serde(with = "humantime_serde")]
    pub checkpoint_timeout: Duration,
    /// Target file size, when creating image and delta layers.
    /// This parameter determines L1 layer file size.
    pub compaction_target_size: u64,
    /// How often to check if there's compaction work to be done.
    /// Duration::ZERO means automatic compaction is disabled.
    #[serde(with = "humantime_serde")]
    pub compaction_period: Duration,
    /// Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
    /// Controls the amount of L0 included in a single compaction iteration.
    /// The unit is `checkpoint_distance`, i.e., a size.
    /// We add L0s to the set of layers to compact until their cumulative
    /// size exceeds `compaction_upper_limit * checkpoint_distance`.
    pub compaction_upper_limit: usize,
    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
    /// If true, enable shard ancestor compaction (enabled by default).
    pub compaction_shard_ancestor: bool,
    /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
    /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
    pub compaction_l0_first: bool,
    /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
    /// has an effect if `compaction_l0_first` is true. Defaults to true.
    pub compaction_l0_semaphore: bool,
    /// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long,
    /// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This
    /// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up.
    /// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold.
    pub l0_flush_delay_threshold: Option<usize>,
    /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
    /// to avoid deadlock. 0 to disable. Disabled by default.
    pub l0_flush_stall_threshold: Option<usize>,
    /// Determines how much history is retained, to allow
    /// branching and read replicas at an older point in time.
    /// The unit is #of bytes of WAL.
    /// Page versions older than this are garbage collected away.
    pub gc_horizon: u64,
    /// Interval at which garbage collection is triggered.
    /// Duration::ZERO means automatic GC is disabled
    #[serde(with = "humantime_serde")]
    pub gc_period: Duration,
    /// Delta layer churn threshold to create L1 image layers.
    pub image_creation_threshold: usize,
    /// Determines how much history is retained, to allow
    /// branching and read replicas at an older point in time.
    /// The unit is time.
    /// Page versions older than this are garbage collected away.
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Duration,
    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
    #[serde(with = "humantime_serde")]
    pub walreceiver_connect_timeout: Duration,
    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
    /// A stalled safekeeper will be changed to a newer one when it appears.
    #[serde(with = "humantime_serde")]
    pub lagging_wal_timeout: Duration,
    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
    pub eviction_policy: crate::models::EvictionPolicy,
    pub min_resident_size_override: Option<u64>,
    /// See the corresponding metric's help string.
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,

    /// If non-zero, the period between uploads of a heatmap from attached tenants. This
    /// may be disabled if a Tenant will not have secondary locations: only secondary
    /// locations will use the heatmap uploaded by attached locations.
    #[serde(with = "humantime_serde")]
    pub heatmap_period: Duration,

    /// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup
    pub lazy_slru_download: bool,

    pub timeline_get_throttle: crate::models::ThrottleConfig,

    /// How much WAL must be ingested before checking again whether a new image layer is required.
    /// Expressed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,

    /// How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction.
    /// Set to 0 to disable preemption.
    pub image_creation_preempt_threshold: usize,

    /// The length for an explicit LSN lease request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length: Duration,

    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length_for_ts: Duration,

    /// Enable auto-offloading of timelines.
    /// (either this flag or the pageserver-global one need to be set)
    pub timeline_offloading: bool,

    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,

    /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
    /// `index_part.json`, and it cannot be reversed.
    pub rel_size_v2_enabled: bool,

    // gc-compaction related configs
    /// Enable automatic gc-compaction trigger on this tenant.
    pub gc_compaction_enabled: bool,
    /// Enable verification of gc-compaction results.
    pub gc_compaction_verification: bool,
    /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
    /// gc-compaction will be triggered.
    pub gc_compaction_initial_threshold_kb: u64,
    /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
    /// is above this ratio, gc-compaction will be triggered.
    pub gc_compaction_ratio_percent: u64,
    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
    /// that will get perf sampling for the tenant.
    pub sampling_ratio: Option<Ratio>,

    /// Capacity of relsize snapshot cache (used by replicas).
    pub relsize_snapshot_cache_capacity: usize,

    /// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests.
    // FIXME: Remove skip_serializing_if when the feature is stable.
    // `std::ops::Not::not` as the predicate means: omit the key while the flag is `false`.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub basebackup_cache_enabled: bool,
}
546 :
/// Compile-time default values consumed by `ConfigToml`'s `Default` impl.
///
/// Durations are kept as strings in `humantime` syntax and parsed when the
/// default config is built.
pub mod defaults {
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

    use crate::models::ImageCompressionAlgorithm;

    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
    /// `"C"` when compiled for macOS, `"C.UTF-8"` everywhere else.
    pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
        "C"
    } else {
        "C.UTF-8"
    };

    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;

    pub const DEFAULT_LOG_FORMAT: &str = "plain";

    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;

    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

    /// Soft limit for the maximum size of a vectored read.
    ///
    /// This is determined by the largest NeonWalRecord that can exist (minus dbdir and reldir keys
    /// which are bounded by the blob io limits only). As of this writing, that is a `NeonWalRecord::ClogSetCommitted` record,
    /// with 32k xids. That's the max number of XIDS on a single CLOG page. The size of such a record
    /// is `sizeof(TransactionId) * 32768 + (some fixed overhead from `timestamp`, the Vec length and whatever extra serde serialization adds)`.
    /// That is, slightly above 128 kB.
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB

    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Zstd { level: Some(1) };

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;

    pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
        utils::postgres_client::PostgresClientProtocol::Vanilla;

    pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
    pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
603 :
604 : impl Default for ConfigToml {
605 127 : fn default() -> Self {
606 : use defaults::*;
607 :
608 : Self {
609 127 : listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
610 127 : listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
611 127 : listen_https_addr: (None),
612 127 : listen_grpc_addr: None, // TODO: default to 127.0.0.1:51051
613 127 : ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
614 127 : ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
615 127 : ssl_cert_reload_period: Duration::from_secs(60),
616 127 : ssl_ca_file: None,
617 127 : availability_zone: (None),
618 127 : wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
619 127 : .expect("cannot parse default wait lsn timeout")),
620 127 : wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
621 127 : .expect("cannot parse default wal redo timeout")),
622 127 : superuser: (DEFAULT_SUPERUSER.to_string()),
623 127 : locale: DEFAULT_LOCALE.to_string(),
624 127 : page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
625 127 : max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
626 127 : pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
627 127 : http_auth_type: (AuthType::Trust),
628 127 : pg_auth_type: (AuthType::Trust),
629 127 : grpc_auth_type: (AuthType::Trust),
630 127 : auth_validation_public_key_path: (None),
631 127 : remote_storage: None,
632 127 : broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
633 127 : .parse()
634 127 : .expect("failed to parse default broker endpoint")),
635 127 : broker_keepalive_interval: (humantime::parse_duration(
636 127 : storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
637 127 : )
638 127 : .expect("cannot parse default keepalive interval")),
639 127 : log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
640 127 :
641 127 : concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
642 127 : .expect("Invalid default constant")),
643 127 : concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
644 127 : DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
645 127 : )
646 127 : .unwrap(),
647 127 : metric_collection_interval: (humantime::parse_duration(
648 127 : DEFAULT_METRIC_COLLECTION_INTERVAL,
649 127 : )
650 127 : .expect("cannot parse default metric collection interval")),
651 127 : synthetic_size_calculation_interval: (humantime::parse_duration(
652 127 : DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
653 127 : )
654 127 : .expect("cannot parse default synthetic size calculation interval")),
655 127 : metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
656 127 :
657 127 : metric_collection_bucket: (None),
658 127 :
659 127 : disk_usage_based_eviction: (None),
660 127 :
661 127 : test_remote_failures: (0),
662 127 :
663 127 : ondemand_download_behavior_treat_error_as_warn: (false),
664 127 :
665 127 : background_task_maximum_delay: (humantime::parse_duration(
666 127 : DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
667 127 : )
668 127 : .unwrap()),
669 127 :
670 127 : control_plane_api: (None),
671 127 : control_plane_api_token: (None),
672 127 : control_plane_emergency_mode: (false),
673 127 :
674 127 : import_pgdata_upcall_api: (None),
675 127 : import_pgdata_upcall_api_token: (None),
676 127 : import_pgdata_aws_endpoint_url: (None),
677 127 :
678 127 : heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
679 127 : secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
680 127 :
681 127 : ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE),
682 127 :
683 127 : virtual_file_io_engine: None,
684 127 :
685 127 : max_vectored_read_bytes: (MaxVectoredReadBytes(
686 127 : NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
687 127 : )),
688 127 : image_compression: (DEFAULT_IMAGE_COMPRESSION),
689 127 : timeline_offloading: true,
690 127 : ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
691 127 : l0_flush: None,
692 127 : virtual_file_io_mode: None,
693 127 : tenant_config: TenantConfigToml::default(),
694 127 : no_sync: None,
695 127 : wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
696 127 : page_service_pipelining: PageServicePipeliningConfig::Pipelined(
697 127 : PageServicePipeliningConfigPipelined {
698 127 : max_batch_size: NonZeroUsize::new(32).unwrap(),
699 127 : execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
700 127 : batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
701 127 : },
702 127 : ),
703 127 : get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask,
704 127 : enable_read_path_debugging: if cfg!(feature = "testing") {
705 127 : Some(true)
706 : } else {
707 0 : None
708 : },
709 127 : validate_wal_contiguity: None,
710 127 : load_previous_heatmap: None,
711 127 : generate_unarchival_heatmap: None,
712 127 : tracing: None,
713 127 : enable_tls_page_service_api: false,
714 127 : dev_mode: false,
715 127 : timeline_import_config: TimelineImportConfig {
716 127 : import_job_concurrency: NonZeroUsize::new(128).unwrap(),
717 127 : import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
718 127 : import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
719 127 : },
720 127 : basebackup_cache_config: None,
721 127 : posthog_config: None,
722 127 : }
723 127 : }
724 : }
725 :
726 : pub mod tenant_conf_defaults {
727 :
728 : // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
729 : // would be more appropriate. But a low value forces the code to be exercised more,
730 : // which is good for now to trigger bugs.
731 : // This parameter actually determines L0 layer file size.
732 : pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
733 : pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
734 :
735 : // FIXME the below configs are only used by legacy algorithm. The new algorithm
736 : // has different parameters.
737 :
738 : // Target file size, when creating image and delta layers.
739 : // This parameter determines L1 layer file size.
740 : pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
741 :
742 : pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
743 : pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
744 : pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;
745 :
746 : // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
747 : // 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of
748 : // DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak
749 : // compaction usage of 15360MB.
750 : pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10;
751 : // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
752 : // read amp.
753 : pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
754 : pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
755 :
756 : pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
757 : crate::models::CompactionAlgorithm::Legacy;
758 :
759 : pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
760 :
761 : // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
762 : // If there's a need to decrease this value, first make sure that GC
763 : // doesn't hold a layer map write lock for non-trivial operations.
764 : // Relevant: https://github.com/neondatabase/neon/issues/3394
765 : pub const DEFAULT_GC_PERIOD: &str = "1 hr";
766 : pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
767 : // Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure
768 : // without looking at the exact number of L0 layers.
769 : // It was expected to have the following behavior:
770 : // > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
771 : // > layer creation will end immediately. Set to 0 to disable.
772 : pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
773 : pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
774 : pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
775 : pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
776 : // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
777 : // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
778 : // throughputs up to 1GiB/s per timeline.
779 : pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
780 : pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
781 : // By default ingest enough WAL for two new L0 layers before checking if new image
782 : // image layers should be created.
783 : pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
784 : pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
785 : pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
786 : pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
787 : pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
788 : pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000;
789 : }
790 :
791 : impl Default for TenantConfigToml {
792 127 : fn default() -> Self {
793 : use tenant_conf_defaults::*;
794 127 : Self {
795 127 : checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
796 127 : checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
797 127 : .expect("cannot parse default checkpoint timeout"),
798 127 : compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
799 127 : compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
800 127 : .expect("cannot parse default compaction period"),
801 127 : compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
802 127 : compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT,
803 127 : compaction_algorithm: crate::models::CompactionAlgorithmSettings {
804 127 : kind: DEFAULT_COMPACTION_ALGORITHM,
805 127 : },
806 127 : compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR,
807 127 : compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
808 127 : compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
809 127 : l0_flush_delay_threshold: None,
810 127 : l0_flush_stall_threshold: None,
811 127 : gc_horizon: DEFAULT_GC_HORIZON,
812 127 : gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
813 127 : .expect("cannot parse default gc period"),
814 127 : image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
815 127 : pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
816 127 : .expect("cannot parse default PITR interval"),
817 127 : walreceiver_connect_timeout: humantime::parse_duration(
818 127 : DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
819 127 : )
820 127 : .expect("cannot parse default walreceiver connect timeout"),
821 127 : lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
822 127 : .expect("cannot parse default walreceiver lagging wal timeout"),
823 127 : max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
824 127 : .expect("cannot parse default max walreceiver Lsn wal lag"),
825 127 : eviction_policy: crate::models::EvictionPolicy::NoEviction,
826 127 : min_resident_size_override: None,
827 127 : evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
828 127 : DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
829 127 : )
830 127 : .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
831 127 : heatmap_period: Duration::ZERO,
832 127 : lazy_slru_download: false,
833 127 : timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
834 127 : image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
835 127 : image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
836 127 : lsn_lease_length: LsnLease::DEFAULT_LENGTH,
837 127 : lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
838 127 : timeline_offloading: true,
839 127 : wal_receiver_protocol_override: None,
840 127 : rel_size_v2_enabled: false,
841 127 : gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
842 127 : gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
843 127 : gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
844 127 : gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
845 127 : sampling_ratio: None,
846 127 : relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
847 127 : basebackup_cache_enabled: false,
848 127 : }
849 127 : }
850 : }
|