Line data Source code
1 : use camino::Utf8PathBuf;
2 :
3 : #[cfg(test)]
4 : mod tests;
5 :
6 : use const_format::formatcp;
/// Port component of [`DEFAULT_PG_LISTEN_ADDR`].
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
/// Default value of `ConfigToml::listen_pg_addr`: loopback on [`DEFAULT_PG_LISTEN_PORT`],
/// assembled at compile time by `formatcp!`.
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
/// Port component of [`DEFAULT_HTTP_LISTEN_ADDR`].
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
/// Default value of `ConfigToml::listen_http_addr`: loopback on [`DEFAULT_HTTP_LISTEN_PORT`],
/// assembled at compile time by `formatcp!`.
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
11 :
12 : use std::collections::HashMap;
13 : use std::num::{NonZeroU64, NonZeroUsize};
14 : use std::str::FromStr;
15 : use std::time::Duration;
16 :
17 : use postgres_backend::AuthType;
18 : use remote_storage::RemoteStorageConfig;
19 : use serde_with::serde_as;
20 : use utils::logging::LogFormat;
21 : use utils::postgres_client::PostgresClientProtocol;
22 :
23 : use crate::models::{ImageCompressionAlgorithm, LsnLease};
24 :
/// Certain metadata (e.g. externally-addressable name, AZ) is delivered
/// as a separate structure.  This information is not needed by the pageserver
/// itself, it is only used for registering the pageserver with the control
/// plane and/or storage controller.
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
pub struct NodeMetadata {
    /// (De)serialized under the key `host`.
    #[serde(rename = "host")]
    pub postgres_host: String,
    /// (De)serialized under the key `port`.
    #[serde(rename = "port")]
    pub postgres_port: u16,
    pub http_host: String,
    pub http_port: u16,
    pub https_port: Option<u16>,

    /// Deployment tools may write fields to the metadata file beyond what we
    /// use in this type: this type intentionally only names the fields that we require.
    /// Every other key is captured here through `#[serde(flatten)]`.
    #[serde(flatten)]
    pub other: HashMap<String, serde_json::Value>,
}
45 :
46 : /// `pageserver.toml`
47 : ///
48 : /// We use serde derive with `#[serde(default)]` to generate a deserializer
49 : /// that fills in the default values for each config field.
50 : ///
51 : /// If there cannot be a static default value because we need to make runtime
52 : /// checks to determine the default, make it an `Option` (which defaults to None).
53 : /// The runtime check should be done in the consuming crate, i.e., `pageserver`.
54 : ///
55 : /// Unknown fields are silently ignored during deserialization.
56 : /// The alternative, which we used in the past, was to set `deny_unknown_fields`,
57 : /// which fails deserialization, and hence pageserver startup, if there is an unknown field.
58 : /// The reason we don't do that anymore is that it complicates
59 : /// usage of config fields for feature flagging, which we commonly do for
60 : /// region-by-region rollouts.
61 : /// The complications mainly arise because the `pageserver.toml` contents on a
62 : /// prod server have a separate lifecycle from the pageserver binary.
63 : /// For instance, `pageserver.toml` contents today are defined in the internal
64 : /// infra repo, and thus introducing a new config field to pageserver and
65 : /// rolling it out to prod servers are separate commits in separate repos
66 : /// that can't be made or rolled back atomically.
67 : /// Rollbacks in particular pose a risk with deny_unknown_fields because
68 : /// the old pageserver binary may reject a new config field, resulting in
69 : /// an outage unless the person doing the pageserver rollback remembers
70 : /// to also revert the commit that added the config field in to the
71 : /// `pageserver.toml` templates in the internal infra repo.
72 : /// (A pre-deploy config check would eliminate this risk during rollbacks,
73 : /// cf [here](https://github.com/neondatabase/cloud/issues/24349).)
74 : /// In addition to this compatibility problem during emergency rollbacks,
/// deny_unknown_fields adds further complications when decommissioning a feature
76 : /// flag: with deny_unknown_fields, we can't remove a flag from the [`ConfigToml`]
77 : /// until all prod servers' `pageserver.toml` files have been updated to a version
78 : /// that doesn't specify the flag. Otherwise new software would fail to start up.
79 : /// This adds the requirement for an intermediate step where the new config field
/// is accepted but ignored, prolonging the decommissioning process by an entire
/// release cycle.
/// By contrast with unknown fields silently ignored, decommissioning a feature
83 : /// flag is a one-step process: we can skip the intermediate step and straight
84 : /// remove the field from the [`ConfigToml`]. We leave the field in the
85 : /// `pageserver.toml` files on prod servers until we reach certainty that we
86 : /// will not roll back to old software whose behavior was dependent on config.
87 : /// Then we can remove the field from the templates in the internal infra repo.
88 : /// This process is [documented internally](
89 : /// https://docs.neon.build/storage/pageserver_configuration.html).
90 : ///
/// Note that the above relaxed compatibility for the config format does NOT APPLY
92 : /// TO THE STORAGE FORMAT. As general guidance, when introducing storage format
93 : /// changes, ensure that the potential rollback target version will be compatible
/// with the new format. This must hold regardless of what flags are set in the `pageserver.toml`:
95 : /// any format version that exists in an environment must be compatible with the software that runs there.
96 : /// Use a pageserver.toml flag only to gate whether software _writes_ the new format.
97 : /// For more compatibility considerations, refer to [internal docs](
98 : /// https://docs.neon.build/storage/compat.html?highlight=compat#format-versions--compatibility)
#[serde_as]
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
#[serde(default)]
pub struct ConfigToml {
    // types mapped 1:1 into the runtime PageServerConfig type
    /// Defaults to [`DEFAULT_PG_LISTEN_ADDR`].
    pub listen_pg_addr: String,
    /// Defaults to [`DEFAULT_HTTP_LISTEN_ADDR`].
    pub listen_http_addr: String,
    pub listen_https_addr: Option<String>,
    pub ssl_key_file: Utf8PathBuf,
    pub ssl_cert_file: Utf8PathBuf,
    /// Written in `humantime` notation (like every `humantime_serde` field below).
    #[serde(with = "humantime_serde")]
    pub ssl_cert_reload_period: Duration,
    pub ssl_ca_file: Option<Utf8PathBuf>,
    pub availability_zone: Option<String>,
    #[serde(with = "humantime_serde")]
    pub wait_lsn_timeout: Duration,
    #[serde(with = "humantime_serde")]
    pub wal_redo_timeout: Duration,
    pub superuser: String,
    pub locale: String,
    pub page_cache_size: usize,
    pub max_file_descriptors: usize,
    pub pg_distrib_dir: Option<Utf8PathBuf>,
    /// (De)serialized through the type's `Display` / `FromStr` implementations.
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub http_auth_type: AuthType,
    /// (De)serialized through the type's `Display` / `FromStr` implementations.
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub pg_auth_type: AuthType,
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,
    pub remote_storage: Option<RemoteStorageConfig>,
    /// Tenant-level settings; see [`TenantConfigToml`].
    pub tenant_config: TenantConfigToml,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub broker_endpoint: storage_broker::Uri,
    #[serde(with = "humantime_serde")]
    pub broker_keepalive_interval: Duration,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub log_format: LogFormat,
    pub concurrent_tenant_warmup: NonZeroUsize,
    pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
    #[serde(with = "humantime_serde")]
    pub metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<reqwest::Url>,
    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    #[serde(with = "humantime_serde")]
    pub synthetic_size_calculation_interval: Duration,
    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
    pub test_remote_failures: u64,
    pub ondemand_download_behavior_treat_error_as_warn: bool,
    #[serde(with = "humantime_serde")]
    pub background_task_maximum_delay: Duration,
    pub control_plane_api: Option<reqwest::Url>,
    pub control_plane_api_token: Option<String>,
    pub control_plane_emergency_mode: bool,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_upcall_api: Option<reqwest::Url>,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_upcall_api_token: Option<String>,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_aws_endpoint_url: Option<reqwest::Url>,
    pub heatmap_upload_concurrency: usize,
    pub secondary_download_concurrency: usize,
    pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
    pub ingest_batch_size: u64,
    pub max_vectored_read_bytes: MaxVectoredReadBytes,
    pub image_compression: ImageCompressionAlgorithm,
    pub timeline_offloading: bool,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
    /// Left out of the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub no_sync: Option<bool>,
    pub wal_receiver_protocol: PostgresClientProtocol,
    pub page_service_pipelining: PageServicePipeliningConfig,
    pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
    pub enable_read_path_debugging: Option<bool>,
    /// Left out of the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub validate_wal_contiguity: Option<bool>,
    /// Left out of the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub load_previous_heatmap: Option<bool>,
    /// Left out of the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
    pub enable_tls_page_service_api: bool,
}
185 :
/// Value type of `ConfigToml::disk_usage_based_eviction`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct DiskUsageEvictionTaskConfig {
    pub max_usage_pct: utils::serde_percent::Percent,
    pub min_avail_bytes: u64,
    /// Written in `humantime` notation.
    #[serde(with = "humantime_serde")]
    pub period: Duration,
    /// Only present when built with the `testing` feature.
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<statvfs::mock::Behavior>,
    /// Select sorting for evicted layers
    ///
    /// Falls back to `EvictionOrder::default()` when the key is absent.
    #[serde(default)]
    pub eviction_order: EvictionOrder,
}
198 :
/// Value type of `ConfigToml::page_service_pipelining`.
///
/// Internally tagged by a `mode` key whose values are the kebab-case variant
/// names (`serial`, `pipelined`).
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum PageServicePipeliningConfig {
    Serial,
    Pipelined(PageServicePipeliningConfigPipelined),
}
/// Payload of [`PageServicePipeliningConfig::Pipelined`].
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct PageServicePipeliningConfigPipelined {
    /// Causes runtime errors if larger than max get_vectored batch size.
    pub max_batch_size: NonZeroUsize,
    pub execution: PageServiceProtocolPipelinedExecutionStrategy,
}
211 :
/// Value type of `PageServicePipeliningConfigPipelined::execution`.
///
/// Serialized as a kebab-case string (`concurrent-futures`, `tasks`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PageServiceProtocolPipelinedExecutionStrategy {
    ConcurrentFutures,
    Tasks,
}
218 :
/// Value type of `ConfigToml::get_vectored_concurrent_io`.
///
/// Internally tagged by a `mode` key whose values are the kebab-case variant
/// names (`sequential`, `sidecar-task`).
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
    /// one after the other and IOs are issued and waited upon
    /// from the same task that traverses the layers.
    Sequential,
    /// The read path still traverses layers sequentially, and
    /// index blocks will be read into the PS PageCache from
    /// that task, with waiting.
    /// But data IOs are dispatched and waited upon from a sidecar
    /// task so that the traversing task can continue to traverse
    /// layers while the IOs are in flight.
    /// If the PS PageCache miss rate is low, this improves
    /// throughput dramatically.
    SidecarTask,
}
236 :
/// A fraction given as two integers; used for the sampling ratios in
/// [`Tracing`] and `TenantConfigToml::sampling_ratio`.
///
/// NOTE(review): nothing in this type rejects `denominator == 0`; presumably
/// consumers validate it — confirm.
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct Ratio {
    pub numerator: usize,
    pub denominator: usize,
}
242 :
/// Exporter settings carried by [`Tracing`]; convertible into
/// `tracing_utils::ExportConfig` via the `From` impl in this file.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct OtelExporterConfig {
    pub endpoint: String,
    pub protocol: OtelExporterProtocol,
    /// Written in `humantime` notation.
    #[serde(with = "humantime_serde")]
    pub timeout: Duration,
}
250 :
/// Value type of `OtelExporterConfig::protocol`.
///
/// Serialized as a kebab-case string (`grpc`, `http-binary`, `http-json`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum OtelExporterProtocol {
    Grpc,
    HttpBinary,
    HttpJson,
}
258 :
/// Value type of `ConfigToml::tracing`: a sampling ratio plus the exporter to send to.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct Tracing {
    pub sampling_ratio: Ratio,
    pub export_config: OtelExporterConfig,
}
264 :
265 : impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
266 0 : fn from(val: &OtelExporterConfig) -> Self {
267 0 : tracing_utils::ExportConfig {
268 0 : endpoint: Some(val.endpoint.clone()),
269 0 : protocol: val.protocol.into(),
270 0 : timeout: val.timeout,
271 0 : }
272 0 : }
273 : }
274 :
275 : impl From<OtelExporterProtocol> for tracing_utils::Protocol {
276 0 : fn from(val: OtelExporterProtocol) -> Self {
277 0 : match val {
278 0 : OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc,
279 0 : OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson,
280 0 : OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary,
281 : }
282 0 : }
283 : }
284 :
/// Types behind the `mock_statvfs` option of `DiskUsageEvictionTaskConfig`.
pub mod statvfs {
    pub mod mock {
        /// Mocked outcome, internally tagged by a `type` key.
        #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
        #[serde(tag = "type")]
        pub enum Behavior {
            Success {
                blocksize: u64,
                total_blocks: u64,
                name_filter: Option<utils::serde_regex::Regex>,
            },
            // This variant only exists when built with the `testing` feature.
            #[cfg(feature = "testing")]
            Failure { mocked_error: MockedError },
        }

        /// Error selectable for `Behavior::Failure`; only built with the `testing` feature.
        #[cfg(feature = "testing")]
        #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
        #[allow(clippy::upper_case_acronyms)]
        pub enum MockedError {
            EIO,
        }

        /// Maps each mocked error onto the `nix::Error` constant of the same name.
        #[cfg(feature = "testing")]
        impl From<MockedError> for nix::Error {
            fn from(e: MockedError) -> Self {
                match e {
                    MockedError::EIO => nix::Error::EIO,
                }
            }
        }
    }
}
316 :
/// Value type of `DiskUsageEvictionTaskConfig::eviction_order`.
///
/// Adjacently tagged: the variant name is stored under `type` and the
/// variant's fields under `args`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", content = "args")]
pub enum EvictionOrder {
    RelativeAccessed {
        highest_layer_count_loses_first: bool,
    },
}
324 :
325 : impl Default for EvictionOrder {
326 4 : fn default() -> Self {
327 4 : Self::RelativeAccessed {
328 4 : highest_layer_count_loses_first: true,
329 4 : }
330 4 : }
331 : }
332 :
/// Newtype for `ConfigToml::max_vectored_read_bytes`.
///
/// `#[serde(transparent)]`: (de)serialized exactly like the wrapped `NonZeroUsize`.
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
336 :
/// Tenant-level configuration values, used for various purposes.
///
/// `#[serde(default)]`: any key missing from the input takes its value from
/// the `Default` impl of this type.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub struct TenantConfigToml {
    /// Flush out an inmemory layer, if it's holding WAL older than this.
    /// This puts a backstop on how much WAL needs to be re-digested if the
    /// page server crashes.
    /// This parameter actually determines L0 layer file size.
    pub checkpoint_distance: u64,
    /// Inmemory layer is also flushed at least once in checkpoint_timeout to
    /// eventually upload WAL after activity is stopped.
    #[serde(with = "humantime_serde")]
    pub checkpoint_timeout: Duration,
    /// Target file size, when creating image and delta layers.
    /// This parameter determines L1 layer file size.
    pub compaction_target_size: u64,
    /// How often to check if there's compaction work to be done.
    /// Duration::ZERO means automatic compaction is disabled.
    #[serde(with = "humantime_serde")]
    pub compaction_period: Duration,
    /// Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
    /// Controls the amount of L0 included in a single compaction iteration.
    /// The unit is `checkpoint_distance`, i.e., a size.
    /// We add L0s to the set of layers to compact until their cumulative
    /// size exceeds `compaction_upper_limit * checkpoint_distance`.
    pub compaction_upper_limit: usize,
    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
    /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
    /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
    pub compaction_l0_first: bool,
    /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
    /// has an effect if `compaction_l0_first` is true. Defaults to true.
    pub compaction_l0_semaphore: bool,
    /// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long,
    /// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This
    /// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up.
    /// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold.
    pub l0_flush_delay_threshold: Option<usize>,
    /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
    /// to avoid deadlock. 0 to disable. Disabled by default.
    pub l0_flush_stall_threshold: Option<usize>,
    /// Determines how much history is retained, to allow
    /// branching and read replicas at an older point in time.
    /// The unit is #of bytes of WAL.
    /// Page versions older than this are garbage collected away.
    pub gc_horizon: u64,
    /// Interval at which garbage collection is triggered.
    /// Duration::ZERO means automatic GC is disabled.
    #[serde(with = "humantime_serde")]
    pub gc_period: Duration,
    /// Delta layer churn threshold to create L1 image layers.
    pub image_creation_threshold: usize,
    /// Determines how much history is retained, to allow
    /// branching and read replicas at an older point in time.
    /// The unit is time.
    /// Page versions older than this are garbage collected away.
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Duration,
    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
    #[serde(with = "humantime_serde")]
    pub walreceiver_connect_timeout: Duration,
    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
    /// A stalled safekeeper will be changed to a newer one when it appears.
    #[serde(with = "humantime_serde")]
    pub lagging_wal_timeout: Duration,
    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
    pub eviction_policy: crate::models::EvictionPolicy,
    pub min_resident_size_override: Option<u64>,
    /// See the corresponding metric's help string.
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,

    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
    /// may be disabled if a Tenant will not have secondary locations: only secondary
    /// locations will use the heatmap uploaded by attached locations.
    #[serde(with = "humantime_serde")]
    pub heatmap_period: Duration,

    /// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup
    pub lazy_slru_download: bool,

    pub timeline_get_throttle: crate::models::ThrottleConfig,

    /// How much WAL must be ingested before checking again whether a new image layer is required.
    /// Expressed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,

    /// How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction.
    /// Set to 0 to disable preemption.
    pub image_creation_preempt_threshold: usize,

    /// The length for an explicit LSN lease request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length: Duration,

    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length_for_ts: Duration,

    /// Enable auto-offloading of timelines.
    /// (either this flag or the pageserver-global one need to be set)
    pub timeline_offloading: bool,

    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,

    /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
    /// `index_part.json`, and it cannot be reversed.
    pub rel_size_v2_enabled: bool,

    // gc-compaction related configs
    /// Enable automatic gc-compaction trigger on this tenant.
    pub gc_compaction_enabled: bool,
    /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
    /// gc-compaction will be triggered.
    pub gc_compaction_initial_threshold_kb: u64,
    /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
    /// is above this ratio, gc-compaction will be triggered.
    pub gc_compaction_ratio_percent: u64,
    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
    /// that will get perf sampling for the tenant.
    pub sampling_ratio: Option<Ratio>,
}
465 :
/// Static default values consumed by `ConfigToml::default()`.
///
/// Durations are stored as `humantime` strings and parsed when the default
/// config is constructed.
pub mod defaults {
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

    use crate::models::ImageCompressionAlgorithm;

    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
    // The default locale depends on the compilation target: "C" on macOS, "C.UTF-8" elsewhere.
    pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
        "C"
    } else {
        "C.UTF-8"
    };

    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;

    pub const DEFAULT_LOG_FORMAT: &str = "plain";

    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;

    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

    /// Soft limit for the maximum size of a vectored read.
    ///
    /// This is determined by the largest NeonWalRecord that can exist (minus dbdir and reldir keys
    /// which are bounded by the blob io limits only). As of this writing, that is a `NeonWalRecord::ClogSetCommitted` record,
    /// with 32k xids. That's the max number of XIDS on a single CLOG page. The size of such a record
    /// is `sizeof(TransactionId) * 32768 + (some fixed overhead from `timestamp`, the Vec length and whatever extra serde serialization adds)`.
    /// That is, slightly above 128 kB.
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB

    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Zstd { level: Some(1) };

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;

    pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
        utils::postgres_client::PostgresClientProtocol::Vanilla;

    pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
    pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
522 :
523 : impl Default for ConfigToml {
524 496 : fn default() -> Self {
525 : use defaults::*;
526 :
527 : Self {
528 496 : listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
529 496 : listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
530 496 : listen_https_addr: (None),
531 496 : ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
532 496 : ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
533 496 : ssl_cert_reload_period: Duration::from_secs(60),
534 496 : ssl_ca_file: None,
535 496 : availability_zone: (None),
536 496 : wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
537 496 : .expect("cannot parse default wait lsn timeout")),
538 496 : wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
539 496 : .expect("cannot parse default wal redo timeout")),
540 496 : superuser: (DEFAULT_SUPERUSER.to_string()),
541 496 : locale: DEFAULT_LOCALE.to_string(),
542 496 : page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
543 496 : max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
544 496 : pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
545 496 : http_auth_type: (AuthType::Trust),
546 496 : pg_auth_type: (AuthType::Trust),
547 496 : auth_validation_public_key_path: (None),
548 496 : remote_storage: None,
549 496 : broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
550 496 : .parse()
551 496 : .expect("failed to parse default broker endpoint")),
552 496 : broker_keepalive_interval: (humantime::parse_duration(
553 496 : storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
554 496 : )
555 496 : .expect("cannot parse default keepalive interval")),
556 496 : log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
557 496 :
558 496 : concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
559 496 : .expect("Invalid default constant")),
560 496 : concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
561 496 : DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
562 496 : )
563 496 : .unwrap(),
564 496 : metric_collection_interval: (humantime::parse_duration(
565 496 : DEFAULT_METRIC_COLLECTION_INTERVAL,
566 496 : )
567 496 : .expect("cannot parse default metric collection interval")),
568 496 : synthetic_size_calculation_interval: (humantime::parse_duration(
569 496 : DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
570 496 : )
571 496 : .expect("cannot parse default synthetic size calculation interval")),
572 496 : metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
573 496 :
574 496 : metric_collection_bucket: (None),
575 496 :
576 496 : disk_usage_based_eviction: (None),
577 496 :
578 496 : test_remote_failures: (0),
579 496 :
580 496 : ondemand_download_behavior_treat_error_as_warn: (false),
581 496 :
582 496 : background_task_maximum_delay: (humantime::parse_duration(
583 496 : DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
584 496 : )
585 496 : .unwrap()),
586 496 :
587 496 : control_plane_api: (None),
588 496 : control_plane_api_token: (None),
589 496 : control_plane_emergency_mode: (false),
590 496 :
591 496 : import_pgdata_upcall_api: (None),
592 496 : import_pgdata_upcall_api_token: (None),
593 496 : import_pgdata_aws_endpoint_url: (None),
594 496 :
595 496 : heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
596 496 : secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
597 496 :
598 496 : ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE),
599 496 :
600 496 : virtual_file_io_engine: None,
601 496 :
602 496 : max_vectored_read_bytes: (MaxVectoredReadBytes(
603 496 : NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
604 496 : )),
605 496 : image_compression: (DEFAULT_IMAGE_COMPRESSION),
606 496 : timeline_offloading: true,
607 496 : ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
608 496 : l0_flush: None,
609 496 : virtual_file_io_mode: None,
610 496 : tenant_config: TenantConfigToml::default(),
611 496 : no_sync: None,
612 496 : wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
613 496 : page_service_pipelining: if !cfg!(test) {
614 496 : PageServicePipeliningConfig::Serial
615 : } else {
616 0 : PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
617 0 : max_batch_size: NonZeroUsize::new(32).unwrap(),
618 0 : execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
619 0 : })
620 : },
621 496 : get_vectored_concurrent_io: if !cfg!(test) {
622 496 : GetVectoredConcurrentIo::Sequential
623 : } else {
624 0 : GetVectoredConcurrentIo::SidecarTask
625 : },
626 496 : enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") {
627 496 : Some(true)
628 : } else {
629 0 : None
630 : },
631 496 : validate_wal_contiguity: None,
632 496 : load_previous_heatmap: None,
633 496 : generate_unarchival_heatmap: None,
634 496 : tracing: None,
635 496 : enable_tls_page_service_api: false,
636 496 : }
637 496 : }
638 : }
639 :
640 : pub mod tenant_conf_defaults {
641 :
642 : // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
643 : // would be more appropriate. But a low value forces the code to be exercised more,
644 : // which is good for now to trigger bugs.
645 : // This parameter actually determines L0 layer file size.
646 : pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
647 : pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
648 :
649 : // FIXME the below configs are only used by legacy algorithm. The new algorithm
650 : // has different parameters.
651 :
652 : // Target file size, when creating image and delta layers.
653 : // This parameter determines L1 layer file size.
654 : pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
655 :
656 : pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
657 : pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
658 :
659 : // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
660 : // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
661 : // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
662 : // with this config, we can get a maximum peak compaction usage of 9 GB.
663 : pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
664 : // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
665 : // read amp.
666 : pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
667 : pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
668 :
669 : pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
670 : crate::models::CompactionAlgorithm::Legacy;
671 :
672 : pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
673 :
674 : // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
675 : // If there's a need to decrease this value, first make sure that GC
676 : // doesn't hold a layer map write lock for non-trivial operations.
677 : // Relevant: https://github.com/neondatabase/neon/issues/3394
678 : pub const DEFAULT_GC_PERIOD: &str = "1 hr";
679 : pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
680 : // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
681 : // layer creation will end immediately. Set to 0 to disable.
682 : pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
683 : pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
684 : pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
685 : pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
686 : // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
687 : // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
688 : // throughputs up to 1GiB/s per timeline.
689 : pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
690 : pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
691 : // By default ingest enough WAL for two new L0 layers before checking if new
692 : // image layers should be created.
693 : pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
694 : pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
695 : pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
696 : pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
697 : }
698 :
699 : impl Default for TenantConfigToml {
700 496 : fn default() -> Self {
701 : use tenant_conf_defaults::*;
702 496 : Self {
703 496 : checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
704 496 : checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
705 496 : .expect("cannot parse default checkpoint timeout"),
706 496 : compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
707 496 : compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
708 496 : .expect("cannot parse default compaction period"),
709 496 : compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
710 496 : compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT,
711 496 : compaction_algorithm: crate::models::CompactionAlgorithmSettings {
712 496 : kind: DEFAULT_COMPACTION_ALGORITHM,
713 496 : },
714 496 : compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
715 496 : compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
716 496 : l0_flush_delay_threshold: None,
717 496 : l0_flush_stall_threshold: None,
718 496 : gc_horizon: DEFAULT_GC_HORIZON,
719 496 : gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
720 496 : .expect("cannot parse default gc period"),
721 496 : image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
722 496 : pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
723 496 : .expect("cannot parse default PITR interval"),
724 496 : walreceiver_connect_timeout: humantime::parse_duration(
725 496 : DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
726 496 : )
727 496 : .expect("cannot parse default walreceiver connect timeout"),
728 496 : lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
729 496 : .expect("cannot parse default walreceiver lagging wal timeout"),
730 496 : max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
731 496 : .expect("cannot parse default max walreceiver Lsn wal lag"),
732 496 : eviction_policy: crate::models::EvictionPolicy::NoEviction,
733 496 : min_resident_size_override: None,
734 496 : evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
735 496 : DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
736 496 : )
737 496 : .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
738 496 : heatmap_period: Duration::ZERO,
739 496 : lazy_slru_download: false,
740 496 : timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
741 496 : image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
742 496 : image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
743 496 : lsn_lease_length: LsnLease::DEFAULT_LENGTH,
744 496 : lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
745 496 : timeline_offloading: true,
746 496 : wal_receiver_protocol_override: None,
747 496 : rel_size_v2_enabled: false,
748 496 : gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
749 496 : gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
750 496 : gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
751 496 : sampling_ratio: None,
752 496 : }
753 496 : }
754 : }
|