Line data Source code
1 : use camino::Utf8PathBuf;
2 :
3 : #[cfg(test)]
4 : mod tests;
5 :
6 : use const_format::formatcp;
/// Port embedded in [`DEFAULT_PG_LISTEN_ADDR`].
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
/// Default value of `ConfigToml::listen_pg_addr`: loopback on [`DEFAULT_PG_LISTEN_PORT`].
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
/// Port embedded in [`DEFAULT_HTTP_LISTEN_ADDR`].
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
/// Default value of `ConfigToml::listen_http_addr`: loopback on [`DEFAULT_HTTP_LISTEN_PORT`].
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
// TODO: gRPC is disabled by default for now, but the port is used in neon_local.
pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051
13 :
14 : use std::collections::HashMap;
15 : use std::num::{NonZeroU64, NonZeroUsize};
16 : use std::str::FromStr;
17 : use std::time::Duration;
18 :
19 : use postgres_backend::AuthType;
20 : use remote_storage::RemoteStorageConfig;
21 : use serde_with::serde_as;
22 : use utils::logging::LogFormat;
23 :
24 : use crate::models::{ImageCompressionAlgorithm, LsnLease};
25 :
/// Node metadata used when registering the pageserver.
///
/// Certain metadata (e.g. externally-addressable name, AZ) is delivered
/// as a separate structure. This information is not needed by the pageserver
/// itself, it is only used for registering the pageserver with the control
/// plane and/or storage controller.
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
pub struct NodeMetadata {
    /// (De)serialized under the key `host`.
    #[serde(rename = "host")]
    pub postgres_host: String,
    /// (De)serialized under the key `port`.
    #[serde(rename = "port")]
    pub postgres_port: u16,
    pub http_host: String,
    pub http_port: u16,
    pub https_port: Option<u16>,

    // Deployment tools may write fields to the metadata file beyond what we
    // use in this type: this type intentionally only names the fields we require.
    // Because of `flatten`, every key not named above is collected into this map.
    #[serde(flatten)]
    pub other: HashMap<String, serde_json::Value>,
}
46 :
/// PostHog integration config.
///
/// Carried as the optional `posthog_config` field of [`ConfigToml`]. No
/// `#[serde(default)]` is applied here, so when the section is present every
/// field below has to be provided.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct PostHogConfig {
    /// PostHog project ID
    pub project_id: String,
    /// Server-side (private) API key
    pub server_api_key: String,
    /// Client-side (public) API key
    pub client_api_key: String,
    /// Private API URL
    pub private_api_url: String,
    /// Public API URL
    pub public_api_url: String,
}
61 :
/// `pageserver.toml`
///
/// We use serde derive with `#[serde(default)]` to generate a deserializer
/// that fills in the default values for each config field.
///
/// If there cannot be a static default value because we need to make runtime
/// checks to determine the default, make it an `Option` (which defaults to None).
/// The runtime check should be done in the consuming crate, i.e., `pageserver`.
///
/// Unknown fields are silently ignored during deserialization.
/// The alternative, which we used in the past, was to set `deny_unknown_fields`,
/// which fails deserialization, and hence pageserver startup, if there is an unknown field.
/// The reason we don't do that anymore is that it complicates
/// usage of config fields for feature flagging, which we commonly do for
/// region-by-region rollouts.
/// The complications mainly arise because the `pageserver.toml` contents on a
/// prod server have a separate lifecycle from the pageserver binary.
/// For instance, `pageserver.toml` contents today are defined in the internal
/// infra repo, and thus introducing a new config field to pageserver and
/// rolling it out to prod servers are separate commits in separate repos
/// that can't be made or rolled back atomically.
/// Rollbacks in particular pose a risk with deny_unknown_fields because
/// the old pageserver binary may reject a new config field, resulting in
/// an outage unless the person doing the pageserver rollback remembers
/// to also revert the commit that added the config field to the
/// `pageserver.toml` templates in the internal infra repo.
/// (A pre-deploy config check would eliminate this risk during rollbacks,
/// cf [here](https://github.com/neondatabase/cloud/issues/24349).)
/// In addition to this compatibility problem during emergency rollbacks,
/// deny_unknown_fields adds further complications when decommissioning a feature
/// flag: with deny_unknown_fields, we can't remove a flag from the [`ConfigToml`]
/// until all prod servers' `pageserver.toml` files have been updated to a version
/// that doesn't specify the flag. Otherwise new software would fail to start up.
/// This adds the requirement for an intermediate step where the new config field
/// is accepted but ignored, prolonging the decommissioning process by an entire
/// release cycle.
/// By contrast with unknown fields silently ignored, decommissioning a feature
/// flag is a one-step process: we can skip the intermediate step and straight
/// remove the field from the [`ConfigToml`]. We leave the field in the
/// `pageserver.toml` files on prod servers until we reach certainty that we
/// will not roll back to old software whose behavior was dependent on config.
/// Then we can remove the field from the templates in the internal infra repo.
/// This process is [documented internally](
/// https://docs.neon.build/storage/pageserver_configuration.html).
///
/// Note that the above relaxed compatibility for the config format does NOT APPLY
/// TO THE STORAGE FORMAT. As general guidance, when introducing storage format
/// changes, ensure that the potential rollback target version will be compatible
/// with the new format. This must hold regardless of what flags are set in the `pageserver.toml`:
/// any format version that exists in an environment must be compatible with the software that runs there.
/// Use a pageserver.toml flag only to gate whether software _writes_ the new format.
/// For more compatibility considerations, refer to [internal docs](
/// https://docs.neon.build/storage/compat.html?highlight=compat#format-versions--compatibility)
// NOTE: `#[serde_as]` has to stay above `#[derive]` so that it can rewrite the
// `#[serde_as(as = ...)]` field attributes before serde's derive sees them.
#[serde_as]
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
#[serde(default)]
pub struct ConfigToml {
    // types mapped 1:1 into the runtime PageServerConfig type
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
    pub listen_https_addr: Option<String>,
    pub listen_grpc_addr: Option<String>,
    pub ssl_key_file: Utf8PathBuf,
    pub ssl_cert_file: Utf8PathBuf,
    #[serde(with = "humantime_serde")]
    pub ssl_cert_reload_period: Duration,
    pub ssl_ca_file: Option<Utf8PathBuf>,
    pub availability_zone: Option<String>,
    #[serde(with = "humantime_serde")]
    pub wait_lsn_timeout: Duration,
    #[serde(with = "humantime_serde")]
    pub wal_redo_timeout: Duration,
    pub superuser: String,
    pub locale: String,
    pub page_cache_size: usize,
    pub max_file_descriptors: usize,
    pub pg_distrib_dir: Option<Utf8PathBuf>,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub http_auth_type: AuthType,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub pg_auth_type: AuthType,
    // NOTE(review): unlike the two auth fields above, this one is not routed
    // through `DisplayFromStr` and so uses `AuthType`'s own serde
    // implementation — confirm that the difference is intentional.
    pub grpc_auth_type: AuthType,
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,
    pub remote_storage: Option<RemoteStorageConfig>,
    /// Tenant-level settings; see [`TenantConfigToml`].
    pub tenant_config: TenantConfigToml,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub broker_endpoint: storage_broker::Uri,
    #[serde(with = "humantime_serde")]
    pub broker_keepalive_interval: Duration,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub log_format: LogFormat,
    pub concurrent_tenant_warmup: NonZeroUsize,
    pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
    #[serde(with = "humantime_serde")]
    pub metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<reqwest::Url>,
    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    #[serde(with = "humantime_serde")]
    pub synthetic_size_calculation_interval: Duration,
    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
    pub test_remote_failures: u64,
    pub ondemand_download_behavior_treat_error_as_warn: bool,
    #[serde(with = "humantime_serde")]
    pub background_task_maximum_delay: Duration,
    pub control_plane_api: Option<reqwest::Url>,
    pub control_plane_api_token: Option<String>,
    pub control_plane_emergency_mode: bool,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_upcall_api: Option<reqwest::Url>,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_upcall_api_token: Option<String>,
    /// Unstable feature: subject to change or removal without notice.
    /// See <https://github.com/neondatabase/neon/pull/9218>.
    pub import_pgdata_aws_endpoint_url: Option<reqwest::Url>,
    pub heatmap_upload_concurrency: usize,
    pub secondary_download_concurrency: usize,
    pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
    pub ingest_batch_size: u64,
    pub max_vectored_read_bytes: MaxVectoredReadBytes,
    pub max_get_vectored_keys: MaxGetVectoredKeys,
    pub image_compression: ImageCompressionAlgorithm,
    pub timeline_offloading: bool,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
    // The `skip_serializing_if` options below are left out of the serialized
    // output entirely while they are `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub no_sync: Option<bool>,
    pub page_service_pipelining: PageServicePipeliningConfig,
    pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
    pub enable_read_path_debugging: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub validate_wal_contiguity: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub load_previous_heatmap: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
    pub enable_tls_page_service_api: bool,
    pub dev_mode: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub posthog_config: Option<PostHogConfig>,
    pub timeline_import_config: TimelineImportConfig,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub basebackup_cache_config: Option<BasebackupCacheConfig>,
}
209 :
/// Settings carried by the optional `disk_usage_based_eviction` field of
/// [`ConfigToml`].
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct DiskUsageEvictionTaskConfig {
    pub max_usage_pct: utils::serde_percent::Percent,
    pub min_avail_bytes: u64,
    /// (De)serialized through `humantime_serde`.
    #[serde(with = "humantime_serde")]
    pub period: Duration,
    /// This field only exists when the `testing` feature is enabled.
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<statvfs::mock::Behavior>,
    /// Select sorting for evicted layers.
    /// Falls back to [`EvictionOrder::default`] when omitted.
    #[serde(default)]
    pub eviction_order: EvictionOrder,
}
222 :
/// Value of `ConfigToml::page_service_pipelining`.
///
/// Internally tagged: the variant is selected by a `mode` key whose value is
/// the kebab-case variant name (`serial` or `pipelined`); for `pipelined` the
/// fields of [`PageServicePipeliningConfigPipelined`] sit next to `mode`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum PageServicePipeliningConfig {
    Serial,
    Pipelined(PageServicePipeliningConfigPipelined),
}
/// Payload of [`PageServicePipeliningConfig::Pipelined`].
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct PageServicePipeliningConfigPipelined {
    /// Config parsing and validation fails if this is larger than `max_get_vectored_keys`.
    pub max_batch_size: NonZeroUsize,
    pub execution: PageServiceProtocolPipelinedExecutionStrategy,
    // The default below is such that new versions of the software can start
    // with the old configuration.
    #[serde(default)]
    pub batching: PageServiceProtocolPipelinedBatchingStrategy,
}
239 :
/// Value of `PageServicePipeliningConfigPipelined::execution`.
///
/// Serialized as the kebab-case variant name: `concurrent-futures` or `tasks`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PageServiceProtocolPipelinedExecutionStrategy {
    ConcurrentFutures,
    Tasks,
}
246 :
/// Value of `PageServicePipeliningConfigPipelined::batching`.
///
/// Serialized as the kebab-case variant name: `uniform-lsn` or `scattered-lsn`.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PageServiceProtocolPipelinedBatchingStrategy {
    /// All get page requests in a batch will be at the same LSN
    ///
    /// This is the `Default`, i.e. what is used when the key is omitted.
    #[default]
    UniformLsn,
    /// Get page requests in a batch may be at different LSN
    ///
    /// One key cannot be present more than once at different LSNs in
    /// the same batch.
    ScatteredLsn,
}
259 :
/// Value of `ConfigToml::get_vectored_concurrent_io`.
///
/// Internally tagged by a `mode` key holding the kebab-case variant name
/// (`sequential` or `sidecar-task`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
    /// one after the other and IOs are issued and waited upon
    /// from the same task that traverses the layers.
    Sequential,
    /// The read path still traverses layers sequentially, and
    /// index blocks will be read into the PS PageCache from
    /// that task, with waiting.
    /// But data IOs are dispatched and waited upon from a sidecar
    /// task so that the traversing task can continue to traverse
    /// layers while the IOs are in flight.
    /// If the PS PageCache miss rate is low, this improves
    /// throughput dramatically.
    SidecarTask,
}
277 :
/// A ratio given as a numerator/denominator pair.
///
/// Used for `Tracing::sampling_ratio` and `TenantConfigToml::sampling_ratio`.
/// The type itself performs no validation (e.g. of a zero denominator).
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct Ratio {
    pub numerator: usize,
    pub denominator: usize,
}
283 :
/// Exporter settings stored in `Tracing::export_config`.
///
/// Convertible into `tracing_utils::ExportConfig` through the `From` impl in
/// this file.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct OtelExporterConfig {
    pub endpoint: String,
    pub protocol: OtelExporterProtocol,
    /// (De)serialized through `humantime_serde`.
    #[serde(with = "humantime_serde")]
    pub timeout: Duration,
}
291 :
/// Value of `OtelExporterConfig::protocol`.
///
/// Serialized as the kebab-case variant name: `grpc`, `http-binary` or
/// `http-json`. Each variant maps onto the `tracing_utils::Protocol` variant
/// of the same name.
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum OtelExporterProtocol {
    Grpc,
    HttpBinary,
    HttpJson,
}
299 :
/// Contents of the optional `tracing` field of [`ConfigToml`].
///
/// Both fields are required whenever the section is present (no serde
/// defaults are declared here).
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct Tracing {
    pub sampling_ratio: Ratio,
    pub export_config: OtelExporterConfig,
}
305 :
306 : impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
307 0 : fn from(val: &OtelExporterConfig) -> Self {
308 0 : tracing_utils::ExportConfig {
309 0 : endpoint: Some(val.endpoint.clone()),
310 0 : protocol: val.protocol.into(),
311 0 : timeout: val.timeout,
312 0 : }
313 0 : }
314 : }
315 :
316 : impl From<OtelExporterProtocol> for tracing_utils::Protocol {
317 0 : fn from(val: OtelExporterProtocol) -> Self {
318 0 : match val {
319 0 : OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc,
320 0 : OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson,
321 0 : OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary,
322 : }
323 0 : }
324 : }
325 :
/// Value of `ConfigToml::timeline_import_config`.
///
/// All limits are `NonZeroUsize`, so a value of `0` is rejected at
/// deserialization time.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct TimelineImportConfig {
    pub import_job_concurrency: NonZeroUsize,
    pub import_job_soft_size_limit: NonZeroUsize,
    pub import_job_checkpoint_threshold: NonZeroUsize,
    /// Max size of the remote storage partial read done by any job
    pub import_job_max_byte_range_size: NonZeroUsize,
}
334 :
/// Contents of the optional `basebackup_cache_config` field of [`ConfigToml`].
///
/// Because of the container-level `#[serde(default)]`, any field that is
/// omitted takes its value from this type's `Default` implementation.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub struct BasebackupCacheConfig {
    /// (De)serialized through `humantime_serde`.
    #[serde(with = "humantime_serde")]
    pub cleanup_period: Duration,
    // FIXME: Support max_size_bytes.
    // pub max_size_bytes: usize,
    pub max_size_entries: i64,
}
344 :
345 : impl Default for BasebackupCacheConfig {
346 0 : fn default() -> Self {
347 0 : Self {
348 0 : cleanup_period: Duration::from_secs(60),
349 0 : // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
350 0 : max_size_entries: 1000,
351 0 : }
352 0 : }
353 : }
354 :
/// Types behind `DiskUsageEvictionTaskConfig::mock_statvfs`.
pub mod statvfs {
    pub mod mock {
        /// Mock behavior, internally tagged by a `type` key.
        #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
        #[serde(tag = "type")]
        pub enum Behavior {
            Success {
                blocksize: u64,
                total_blocks: u64,
                name_filter: Option<utils::serde_regex::Regex>,
            },
            /// Only available when the `testing` feature is enabled.
            #[cfg(feature = "testing")]
            Failure { mocked_error: MockedError },
        }

        /// Errors that `Behavior::Failure` can carry (`testing` feature only).
        #[cfg(feature = "testing")]
        #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
        #[allow(clippy::upper_case_acronyms)]
        pub enum MockedError {
            EIO,
        }

        /// Maps each mocked error onto the `nix::Error` of the same name.
        #[cfg(feature = "testing")]
        impl From<MockedError> for nix::Error {
            fn from(e: MockedError) -> Self {
                match e {
                    MockedError::EIO => nix::Error::EIO,
                }
            }
        }
    }
}
386 :
/// Value of `DiskUsageEvictionTaskConfig::eviction_order`.
///
/// Adjacently tagged: the variant name goes under `type` and the variant's
/// fields under `args`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", content = "args")]
pub enum EvictionOrder {
    RelativeAccessed {
        highest_layer_count_loses_first: bool,
    },
}
394 :
395 : impl Default for EvictionOrder {
396 1 : fn default() -> Self {
397 1 : Self::RelativeAccessed {
398 1 : highest_layer_count_loses_first: true,
399 1 : }
400 1 : }
401 : }
402 :
/// Value of `ConfigToml::max_vectored_read_bytes`.
///
/// `#[serde(transparent)]`: (de)serialized exactly like the wrapped
/// `NonZeroUsize`.
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
406 :
/// Value of `ConfigToml::max_get_vectored_keys`.
///
/// `#[serde(transparent)]`: (de)serialized exactly like the wrapped
/// `NonZeroUsize`. The inner value is private; read it with `get()`.
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct MaxGetVectoredKeys(NonZeroUsize);
410 :
411 : impl MaxGetVectoredKeys {
412 71985 : pub fn get(&self) -> usize {
413 71985 : self.0.get()
414 71985 : }
415 : }
416 :
/// Tenant-level configuration values, used for various purposes.
///
/// Embedded in [`ConfigToml`] as `tenant_config`. Because of the
/// container-level `#[serde(default)]`, fields that are omitted are taken
/// from this type's `Default` implementation.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub struct TenantConfigToml {
    /// Flush out an inmemory layer, if it's holding WAL older than this
    /// This puts a backstop on how much WAL needs to be re-digested if the
    /// page server crashes.
    /// This parameter actually determines L0 layer file size.
    pub checkpoint_distance: u64,
    /// Inmemory layer is also flushed at least once in checkpoint_timeout to
    /// eventually upload WAL after activity is stopped.
    #[serde(with = "humantime_serde")]
    pub checkpoint_timeout: Duration,
    /// Target file size, when creating image and delta layers.
    /// This parameter determines L1 layer file size.
    pub compaction_target_size: u64,
    /// How often to check if there's compaction work to be done.
    /// Duration::ZERO means automatic compaction is disabled.
    #[serde(with = "humantime_serde")]
    pub compaction_period: Duration,
    /// Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
    /// Controls the amount of L0 included in a single compaction iteration.
    /// The unit is `checkpoint_distance`, i.e., a size.
    /// We add L0s to the set of layers to compact until their cumulative
    /// size exceeds `compaction_upper_limit * checkpoint_distance`.
    pub compaction_upper_limit: usize,
    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
    /// If true, enable shard ancestor compaction (enabled by default).
    pub compaction_shard_ancestor: bool,
    /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
    /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
    pub compaction_l0_first: bool,
    /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
    /// has an effect if `compaction_l0_first` is true. Defaults to true.
    pub compaction_l0_semaphore: bool,
    /// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long,
    /// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This
    /// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up.
    /// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold.
    pub l0_flush_delay_threshold: Option<usize>,
    /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
    /// to avoid deadlock. 0 to disable. Disabled by default.
    pub l0_flush_stall_threshold: Option<usize>,
    /// Determines how much history is retained, to allow
    /// branching and read replicas at an older point in time.
    /// The unit is #of bytes of WAL.
    /// Page versions older than this are garbage collected away.
    pub gc_horizon: u64,
    /// Interval at which garbage collection is triggered.
    /// Duration::ZERO means automatic GC is disabled
    #[serde(with = "humantime_serde")]
    pub gc_period: Duration,
    /// Delta layer churn threshold to create L1 image layers.
    pub image_creation_threshold: usize,
    /// Determines how much history is retained, to allow
    /// branching and read replicas at an older point in time.
    /// The unit is time.
    /// Page versions older than this are garbage collected away.
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Duration,
    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
    #[serde(with = "humantime_serde")]
    pub walreceiver_connect_timeout: Duration,
    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
    /// A stalled safekeeper will be changed to a newer one when it appears.
    #[serde(with = "humantime_serde")]
    pub lagging_wal_timeout: Duration,
    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
    pub eviction_policy: crate::models::EvictionPolicy,
    pub min_resident_size_override: Option<u64>,
    /// See the corresponding metric's help string.
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,

    /// If non-zero, the period between uploads of a heatmap from attached tenants. This
    /// may be disabled if a Tenant will not have secondary locations: only secondary
    /// locations will use the heatmap uploaded by attached locations.
    #[serde(with = "humantime_serde")]
    pub heatmap_period: Duration,

    /// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup
    pub lazy_slru_download: bool,

    pub timeline_get_throttle: crate::models::ThrottleConfig,

    /// How much WAL must be ingested before checking again whether a new image layer is required.
    /// Expressed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,

    /// How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction.
    /// Set to 0 to disable preemption.
    pub image_creation_preempt_threshold: usize,

    /// The length for an explicit LSN lease request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length: Duration,

    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length_for_ts: Duration,

    /// Enable auto-offloading of timelines.
    /// (either this flag or the pageserver-global one need to be set)
    pub timeline_offloading: bool,

    /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
    /// `index_part.json`, and it cannot be reversed.
    pub rel_size_v2_enabled: bool,

    // gc-compaction related configs
    /// Enable automatic gc-compaction trigger on this tenant.
    pub gc_compaction_enabled: bool,
    /// Enable verification of gc-compaction results.
    pub gc_compaction_verification: bool,
    /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
    /// gc-compaction will be triggered.
    pub gc_compaction_initial_threshold_kb: u64,
    /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
    /// is above this ratio, gc-compaction will be triggered.
    pub gc_compaction_ratio_percent: u64,
    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
    /// that will get perf sampling for the tenant.
    pub sampling_ratio: Option<Ratio>,

    /// Capacity of relsize snapshot cache (used by replicas).
    pub relsize_snapshot_cache_capacity: usize,

    /// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests.
    // FIXME: Remove skip_serializing_if when the feature is stable.
    // `std::ops::Not::not` as the predicate means: omit the key while the flag is `false`.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub basebackup_cache_enabled: bool,
}
555 :
/// Compile-time default values consumed by `ConfigToml::default()`.
///
/// Durations are kept as human-readable strings and parsed with `humantime`
/// when the default config is built.
pub mod defaults {
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

    use crate::models::ImageCompressionAlgorithm;

    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
    /// `"C"` on macOS, `"C.UTF-8"` on every other target.
    pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
        "C"
    } else {
        "C.UTF-8"
    };

    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;

    /// Parsed with `LogFormat::from_str` when the default config is built.
    pub const DEFAULT_LOG_FORMAT: &str = "plain";

    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;

    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

    /// Soft limit for the maximum size of a vectored read.
    ///
    /// This is determined by the largest NeonWalRecord that can exist (minus dbdir and reldir keys
    /// which are bounded by the blob io limits only). As of this writing, that is a `NeonWalRecord::ClogSetCommitted` record,
    /// with 32k xids. That's the max number of XIDS on a single CLOG page. The size of such a record
    /// is `sizeof(TransactionId) * 32768 + (some fixed overhead from `timestamp`, the Vec length and whatever extra serde serialization adds)`.
    /// That is, slightly above 128 kB.
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB

    /// Must be non-zero: it is wrapped into a `NonZeroUsize` when the default config is built.
    pub const DEFAULT_MAX_GET_VECTORED_KEYS: usize = 32;

    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Zstd { level: Some(1) };

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

    // NOTE(review): not referenced by the parts of this file visible here;
    // presumably consumed elsewhere — confirm before removing.
    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;

    pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
    pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
611 :
612 : impl Default for ConfigToml {
613 132 : fn default() -> Self {
614 : use defaults::*;
615 :
616 : Self {
617 132 : listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
618 132 : listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
619 132 : listen_https_addr: (None),
620 132 : listen_grpc_addr: None, // TODO: default to 127.0.0.1:51051
621 132 : ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
622 132 : ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
623 132 : ssl_cert_reload_period: Duration::from_secs(60),
624 132 : ssl_ca_file: None,
625 132 : availability_zone: (None),
626 132 : wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
627 132 : .expect("cannot parse default wait lsn timeout")),
628 132 : wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
629 132 : .expect("cannot parse default wal redo timeout")),
630 132 : superuser: (DEFAULT_SUPERUSER.to_string()),
631 132 : locale: DEFAULT_LOCALE.to_string(),
632 132 : page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
633 132 : max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
634 132 : pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
635 132 : http_auth_type: (AuthType::Trust),
636 132 : pg_auth_type: (AuthType::Trust),
637 132 : grpc_auth_type: (AuthType::Trust),
638 132 : auth_validation_public_key_path: (None),
639 132 : remote_storage: None,
640 132 : broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
641 132 : .parse()
642 132 : .expect("failed to parse default broker endpoint")),
643 132 : broker_keepalive_interval: (humantime::parse_duration(
644 132 : storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
645 132 : )
646 132 : .expect("cannot parse default keepalive interval")),
647 132 : log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
648 132 :
649 132 : concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
650 132 : .expect("Invalid default constant")),
651 132 : concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
652 132 : DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
653 132 : )
654 132 : .unwrap(),
655 132 : metric_collection_interval: (humantime::parse_duration(
656 132 : DEFAULT_METRIC_COLLECTION_INTERVAL,
657 132 : )
658 132 : .expect("cannot parse default metric collection interval")),
659 132 : synthetic_size_calculation_interval: (humantime::parse_duration(
660 132 : DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
661 132 : )
662 132 : .expect("cannot parse default synthetic size calculation interval")),
663 132 : metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
664 132 :
665 132 : metric_collection_bucket: (None),
666 132 :
667 132 : disk_usage_based_eviction: (None),
668 132 :
669 132 : test_remote_failures: (0),
670 132 :
671 132 : ondemand_download_behavior_treat_error_as_warn: (false),
672 132 :
673 132 : background_task_maximum_delay: (humantime::parse_duration(
674 132 : DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
675 132 : )
676 132 : .unwrap()),
677 132 :
678 132 : control_plane_api: (None),
679 132 : control_plane_api_token: (None),
680 132 : control_plane_emergency_mode: (false),
681 132 :
682 132 : import_pgdata_upcall_api: (None),
683 132 : import_pgdata_upcall_api_token: (None),
684 132 : import_pgdata_aws_endpoint_url: (None),
685 132 :
686 132 : heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
687 132 : secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
688 132 :
689 132 : ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE),
690 132 :
691 132 : virtual_file_io_engine: None,
692 132 :
693 132 : max_vectored_read_bytes: (MaxVectoredReadBytes(
694 132 : NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
695 132 : )),
696 132 : max_get_vectored_keys: (MaxGetVectoredKeys(
697 132 : NonZeroUsize::new(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap(),
698 132 : )),
699 132 : image_compression: (DEFAULT_IMAGE_COMPRESSION),
700 132 : timeline_offloading: true,
701 132 : ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
702 132 : l0_flush: None,
703 132 : virtual_file_io_mode: None,
704 132 : tenant_config: TenantConfigToml::default(),
705 132 : no_sync: None,
706 132 : page_service_pipelining: PageServicePipeliningConfig::Pipelined(
707 132 : PageServicePipeliningConfigPipelined {
708 132 : max_batch_size: NonZeroUsize::new(32).unwrap(),
709 132 : execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
710 132 : batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
711 132 : },
712 132 : ),
713 132 : get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask,
714 132 : enable_read_path_debugging: if cfg!(feature = "testing") {
715 132 : Some(true)
716 : } else {
717 0 : None
718 : },
719 132 : validate_wal_contiguity: None,
720 132 : load_previous_heatmap: None,
721 132 : generate_unarchival_heatmap: None,
722 132 : tracing: None,
723 132 : enable_tls_page_service_api: false,
724 132 : dev_mode: false,
725 132 : timeline_import_config: TimelineImportConfig {
726 132 : import_job_concurrency: NonZeroUsize::new(32).unwrap(),
727 132 : import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(),
728 132 : import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(),
729 132 : import_job_max_byte_range_size: NonZeroUsize::new(4 * 1024 * 1024).unwrap(),
730 132 : },
731 132 : basebackup_cache_config: None,
732 132 : posthog_config: None,
733 132 : }
734 132 : }
735 : }
736 :
737 : pub mod tenant_conf_defaults {
738 :
739 : // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
740 : // would be more appropriate. But a low value forces the code to be exercised more,
741 : // which is good for now to trigger bugs.
742 : // This parameter actually determines L0 layer file size.
743 : pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
744 : pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
745 :
746 : // FIXME the below configs are only used by legacy algorithm. The new algorithm
747 : // has different parameters.
748 :
749 : // Target file size, when creating image and delta layers.
750 : // This parameter determines L1 layer file size.
751 : pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
752 :
753 : pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
754 : pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
755 : pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;
756 :
757 : // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
758 : // 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of
759 : // DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak
760 : // compaction usage of 15360MB.
761 : pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10;
762 : // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
763 : // read amp.
764 : pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
765 : pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
766 :
767 : pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
768 : crate::models::CompactionAlgorithm::Legacy;
769 :
770 : pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
771 :
772 : // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
773 : // If there's a need to decrease this value, first make sure that GC
774 : // doesn't hold a layer map write lock for non-trivial operations.
775 : // Relevant: https://github.com/neondatabase/neon/issues/3394
776 : pub const DEFAULT_GC_PERIOD: &str = "1 hr";
777 : pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
778 : // Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure
779 : // without looking at the exact number of L0 layers.
780 : // It was expected to have the following behavior:
781 : // > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
782 : // > layer creation will end immediately. Set to 0 to disable.
783 : pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
784 : pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
785 : pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
786 : pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
787 : // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
788 : // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
789 : // throughputs up to 1GiB/s per timeline.
790 : pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
791 : pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
792 : // By default ingest enough WAL for two new L0 layers before checking if new image
793 : // image layers should be created.
794 : pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
795 : pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
796 : pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
797 : pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
798 : pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
799 : pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000;
800 : }
801 :
802 : impl Default for TenantConfigToml {
803 132 : fn default() -> Self {
804 : use tenant_conf_defaults::*;
805 132 : Self {
806 132 : checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
807 132 : checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
808 132 : .expect("cannot parse default checkpoint timeout"),
809 132 : compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
810 132 : compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
811 132 : .expect("cannot parse default compaction period"),
812 132 : compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
813 132 : compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT,
814 132 : compaction_algorithm: crate::models::CompactionAlgorithmSettings {
815 132 : kind: DEFAULT_COMPACTION_ALGORITHM,
816 132 : },
817 132 : compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR,
818 132 : compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
819 132 : compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
820 132 : l0_flush_delay_threshold: None,
821 132 : l0_flush_stall_threshold: None,
822 132 : gc_horizon: DEFAULT_GC_HORIZON,
823 132 : gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
824 132 : .expect("cannot parse default gc period"),
825 132 : image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
826 132 : pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
827 132 : .expect("cannot parse default PITR interval"),
828 132 : walreceiver_connect_timeout: humantime::parse_duration(
829 132 : DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
830 132 : )
831 132 : .expect("cannot parse default walreceiver connect timeout"),
832 132 : lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
833 132 : .expect("cannot parse default walreceiver lagging wal timeout"),
834 132 : max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
835 132 : .expect("cannot parse default max walreceiver Lsn wal lag"),
836 132 : eviction_policy: crate::models::EvictionPolicy::NoEviction,
837 132 : min_resident_size_override: None,
838 132 : evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
839 132 : DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
840 132 : )
841 132 : .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
842 132 : heatmap_period: Duration::ZERO,
843 132 : lazy_slru_download: false,
844 132 : timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
845 132 : image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
846 132 : image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
847 132 : lsn_lease_length: LsnLease::DEFAULT_LENGTH,
848 132 : lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
849 132 : timeline_offloading: true,
850 132 : rel_size_v2_enabled: false,
851 132 : gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
852 132 : gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
853 132 : gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
854 132 : gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
855 132 : sampling_ratio: None,
856 132 : relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
857 132 : basebackup_cache_enabled: false,
858 132 : }
859 132 : }
860 : }
|