Line data Source code
1 : pub mod detach_ancestor;
2 : pub mod partitioning;
3 : pub mod utilization;
4 :
5 : pub use utilization::PageserverUtilization;
6 :
7 : use std::{
8 : collections::HashMap,
9 : fmt::Display,
10 : io::{BufRead, Read},
11 : num::{NonZeroU32, NonZeroU64, NonZeroUsize},
12 : str::FromStr,
13 : sync::atomic::AtomicUsize,
14 : time::{Duration, SystemTime},
15 : };
16 :
17 : use byteorder::{BigEndian, ReadBytesExt};
18 : use postgres_ffi::BLCKSZ;
19 : use serde::{Deserialize, Serialize};
20 : use serde_with::serde_as;
21 : use utils::{
22 : completion,
23 : id::{NodeId, TenantId, TimelineId},
24 : lsn::Lsn,
25 : serde_system_time,
26 : };
27 :
28 : use crate::{
29 : reltag::RelTag,
30 : shard::{ShardCount, ShardStripeSize, TenantShardId},
31 : };
32 : use anyhow::bail;
33 : use bytes::{Buf, BufMut, Bytes, BytesMut};
34 :
/// The state of a tenant in this pageserver.
///
/// ```mermaid
/// stateDiagram-v2
///
///     [*] --> Loading: spawn_load()
///     [*] --> Attaching: spawn_attach()
///
///     Loading --> Activating: activate()
///     Attaching --> Activating: activate()
///     Activating --> Active: infallible
///
///     Loading --> Broken: load() failure
///     Attaching --> Broken: attach() failure
///
///     Active --> Stopping: set_stopping(), part of shutdown & detach
///     Stopping --> Broken: late error in remove_tenant_from_memory
///
///     Broken --> [*]: ignore / detach / shutdown
///     Stopping --> [*]: remove_from_memory complete
///
///     Active --> Broken: cfg(testing)-only tenant break point
/// ```
///
/// Serialized in serde's adjacently tagged form: the variant name is stored under
/// `slug` and the variant payload (if any) under `data`.
#[derive(
    Clone,
    PartialEq,
    Eq,
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
    strum_macros::VariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
)]
#[serde(tag = "slug", content = "data")]
pub enum TenantState {
    /// This tenant is being loaded from local disk.
    ///
    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Loading,
    /// This tenant is being attached to the pageserver.
    ///
    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Attaching,
    /// The tenant is transitioning from Loading/Attaching to Active.
    ///
    /// While in this state, the individual timelines are being activated.
    ///
    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Activating(ActivatingFrom),
    /// The tenant has finished activating and is open for business.
    ///
    /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
    Active,
    /// The tenant is recognized by pageserver, but it is being detached or the
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
    Stopping {
        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
        // otherwise it will not be skipped during deserialization
        //
        // The barrier is never written to or read from the wire (`#[serde(skip)]`).
        #[serde(skip)]
        progress: completion::Barrier,
    },
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
    /// If the tenant fails to load or attach, it will transition to this state
    /// and it is guaranteed that no background tasks are running in its name.
    ///
    /// The other way to transition into this state is from `Stopping` state
    /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
    /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
    ///
    /// `reason` is the failure description; `backtrace` is a rendered backtrace
    /// (see `TenantState::broken_from_reason`).
    Broken { reason: String, backtrace: String },
}
110 :
111 : impl TenantState {
112 0 : pub fn attachment_status(&self) -> TenantAttachmentStatus {
113 : use TenantAttachmentStatus::*;
114 :
115 : // Below TenantState::Activating is used as "transient" or "transparent" state for
116 : // attachment_status determining.
117 0 : match self {
118 : // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
119 : // So, technically, we can return Attached here.
120 : // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
121 : // But, our attach task might still be fetching the remote timelines, etc.
122 : // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
123 0 : Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
124 : // tenant mgr startup distinguishes attaching from loading via marker file.
125 0 : Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
126 : // We only reach Active after successful load / attach.
127 : // So, call atttachment status Attached.
128 0 : Self::Active => Attached,
129 : // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
130 : // However, it also becomes Broken if the regular load fails.
131 : // From Console's perspective there's no practical difference
132 : // because attachment_status is polled by console only during attach operation execution.
133 0 : Self::Broken { reason, .. } => Failed {
134 0 : reason: reason.to_owned(),
135 0 : },
136 : // Why is Stopping a Maybe case? Because, during pageserver shutdown,
137 : // we set the Stopping state irrespective of whether the tenant
138 : // has finished attaching or not.
139 0 : Self::Stopping { .. } => Maybe,
140 : }
141 0 : }
142 :
143 0 : pub fn broken_from_reason(reason: String) -> Self {
144 0 : let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
145 0 : Self::Broken {
146 0 : reason,
147 0 : backtrace: backtrace_str,
148 0 : }
149 0 : }
150 : }
151 :
152 : impl std::fmt::Debug for TenantState {
153 2 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
154 2 : match self {
155 2 : Self::Broken { reason, backtrace } if !reason.is_empty() => {
156 2 : write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
157 : }
158 0 : _ => write!(f, "{self}"),
159 : }
160 2 : }
161 : }
162 :
/// A temporary lease to a specific lsn inside a timeline.
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
#[serde_as]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LsnLease {
    /// Expiration time of the lease; (de)serialized through the
    /// `SystemTimeAsRfc3339Millis` adapter.
    #[serde_as(as = "SystemTimeAsRfc3339Millis")]
    pub valid_until: SystemTime,
}
171 :
// Defines the `SystemTimeAsRfc3339Millis` adapter usable with `#[serde_as]`:
// a `SystemTime` is written as the string produced by `humantime::format_rfc3339_millis`
// and read back with `humantime::parse_rfc3339`.
serde_with::serde_conv!(
    SystemTimeAsRfc3339Millis,
    SystemTime,
    |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
);
178 :
179 : impl LsnLease {
180 : /// The default length for an explicit LSN lease request (10 minutes).
181 : pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
182 :
183 : /// The default length for an implicit LSN lease granted during
184 : /// `get_lsn_by_timestamp` request (1 minutes).
185 : pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);
186 :
187 : /// Checks whether the lease is expired.
188 18 : pub fn is_expired(&self, now: &SystemTime) -> bool {
189 18 : now > &self.valid_until
190 18 : }
191 : }
192 :
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
///
/// Carried as the payload of `TenantState::Activating` so the originating state
/// stays visible while the tenant activates.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
    /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
    Loading,
    /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
    Attaching,
}
201 :
/// A state of a timeline in pageserver's memory.
///
/// Uses serde's default (externally tagged) enum representation.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
    /// It will eventually transition to state Active or Broken.
    Loading,
    /// The timeline is fully operational.
    /// It can be queried, and the walreceiver connection loop is running.
    Active,
    /// The timeline was previously Loading or Active but is shutting down.
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
    ///
    /// Carries a textual `reason` and a rendered `backtrace`.
    Broken { reason: String, backtrace: String },
}
218 :
/// Parameters of a timeline creation request.
///
/// Only `new_timeline_id` is mandatory; every other field is an `Option`.
// NOTE(review): the per-field semantics below are read off the field names; confirm
// against the request handler.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateRequest {
    /// Id of the timeline to create.
    pub new_timeline_id: TimelineId,
    /// Presumably the timeline to branch from, if any.
    #[serde(default)]
    pub ancestor_timeline_id: Option<TimelineId>,
    /// Presumably an existing timeline whose initdb data should be reused, if any.
    #[serde(default)]
    pub existing_initdb_timeline_id: Option<TimelineId>,
    /// Presumably the LSN on the ancestor at which the new timeline starts, if given.
    #[serde(default)]
    pub ancestor_start_lsn: Option<Lsn>,
    /// Presumably the Postgres major version for the new timeline, if given.
    pub pg_version: Option<u32>,
}
230 :
/// Request body carrying a single LSN.
// NOTE(review): presumably the LSN for which an `LsnLease` is requested; confirm
// against the handler.
#[derive(Serialize, Deserialize, Clone)]
pub struct LsnLeaseRequest {
    pub lsn: Lsn,
}
235 :
/// Parameters of a tenant shard split request.
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitRequest {
    /// Shard count the tenant should have after the split.
    pub new_shard_count: u8,

    // A tenant's stripe size is only meaningful the first time their shard count goes
    // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
    //
    // If this is set while the stripe count is being increased from an already >1 value,
    // then the request will fail with 400.
    pub new_stripe_size: Option<ShardStripeSize>,
}
247 :
/// Response to a tenant shard split, listing tenant shard ids.
// NOTE(review): presumably the ids of the shards that exist after the split; confirm
// against the handler.
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitResponse {
    pub new_shards: Vec<TenantShardId>,
}
252 :
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
///
/// Deserialization rejects unknown fields (`deny_unknown_fields`).
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ShardParameters {
    /// Number of shards of the tenant.
    pub count: ShardCount,
    /// Stripe size used when distributing data across shards.
    pub stripe_size: ShardStripeSize,
}
260 :
impl ShardParameters {
    /// Stripe size used by `ShardParameters::default()`: `256 * 1024 / 8` = 32768.
    // NOTE(review): the unit of `ShardStripeSize` is not visible here (presumably
    // pages, i.e. 256 MiB worth of 8 KiB pages); confirm in `crate::shard`.
    pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);

    /// Returns whatever `ShardCount::is_unsharded` reports for `self.count`.
    pub fn is_unsharded(&self) -> bool {
        self.count.is_unsharded()
    }
}
268 :
269 : impl Default for ShardParameters {
270 595 : fn default() -> Self {
271 595 : Self {
272 595 : count: ShardCount::new(0),
273 595 : stripe_size: Self::DEFAULT_STRIPE_SIZE,
274 595 : }
275 595 : }
276 : }
277 :
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
///
/// Every setting is optional. Settings typed as `Option<String>` are kept as
/// unparsed text in this struct.
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
    pub checkpoint_timeout: Option<String>,
    pub compaction_target_size: Option<u64>,
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
    // defer parsing compaction_algorithm, like eviction_policy
    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
    pub pitr_interval: Option<String>,
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
    pub switch_aux_file_policy: Option<AuxFilePolicy>,
    pub lsn_lease_length: Option<String>,
    pub lsn_lease_length_for_ts: Option<String>,
}
307 :
/// The policy for the aux file storage.
///
/// It can be switched through `switch_aux_file_policy` tenant config.
/// When the first aux file is written, the policy will be persisted in the
/// `index_part.json` file and has a limited migration path.
///
/// Currently, we only allow the following migration path:
///
/// ```text
/// Unset -> V1
///       -> V2
///       -> CrossValidation -> V2
/// ```
///
/// The textual form (used for `Display`, `FromStr` and serde) is the kebab-case
/// variant name; parsing ignores ASCII case.
#[derive(
    Eq,
    PartialEq,
    Debug,
    Copy,
    Clone,
    strum_macros::EnumString,
    strum_macros::Display,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum AuxFilePolicy {
    /// V1 aux file policy: store everything in AUX_FILE_KEY
    #[strum(ascii_case_insensitive)]
    V1,
    /// V2 aux file policy: store in the AUX_FILE keyspace
    #[strum(ascii_case_insensitive)]
    V2,
    /// Cross validation runs both formats on the write path and does validation
    /// on the read path.
    #[strum(ascii_case_insensitive)]
    CrossValidation,
}
343 :
344 : impl AuxFilePolicy {
345 102 : pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
346 62 : matches!(
347 102 : (from, to),
348 : (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
349 : )
350 102 : }
351 :
352 : /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
353 1230 : pub fn default_tenant_config() -> Self {
354 1230 : Self::V2
355 1230 : }
356 : }
357 :
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
///
/// Non-zero values are the codes produced by `AuxFilePolicy::to_usize`.
pub struct AtomicAuxFilePolicy(AtomicUsize);
360 :
361 : impl AtomicAuxFilePolicy {
362 1242 : pub fn new(policy: Option<AuxFilePolicy>) -> Self {
363 1242 : Self(AtomicUsize::new(
364 1242 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
365 1242 : ))
366 1242 : }
367 :
368 924 : pub fn load(&self) -> Option<AuxFilePolicy> {
369 924 : match self.0.load(std::sync::atomic::Ordering::Acquire) {
370 726 : 0 => None,
371 198 : other => Some(AuxFilePolicy::from_usize(other)),
372 : }
373 924 : }
374 :
375 66 : pub fn store(&self, policy: Option<AuxFilePolicy>) {
376 66 : self.0.store(
377 66 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
378 66 : std::sync::atomic::Ordering::Release,
379 66 : );
380 66 : }
381 : }
382 :
383 : impl AuxFilePolicy {
384 60 : pub fn to_usize(self) -> usize {
385 60 : match self {
386 12 : Self::V1 => 1,
387 6 : Self::CrossValidation => 2,
388 42 : Self::V2 => 3,
389 : }
390 60 : }
391 :
392 198 : pub fn try_from_usize(this: usize) -> Option<Self> {
393 198 : match this {
394 12 : 1 => Some(Self::V1),
395 18 : 2 => Some(Self::CrossValidation),
396 168 : 3 => Some(Self::V2),
397 0 : _ => None,
398 : }
399 198 : }
400 :
401 198 : pub fn from_usize(this: usize) -> Self {
402 198 : Self::try_from_usize(this).unwrap()
403 198 : }
404 : }
405 :
/// Eviction policy selector.
///
/// Serialized in serde's internally tagged form: the variant name is stored under
/// `kind`, next to the fields of the variant's payload.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
    NoEviction,
    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
    // NOTE(review): "Imitiate" looks like a misspelling of "Imitate", but the variant
    // name is part of the serialized format, so it cannot be renamed casually.
    OnlyImitiate(EvictionPolicyLayerAccessThreshold),
}
413 :
414 : impl EvictionPolicy {
415 0 : pub fn discriminant_str(&self) -> &'static str {
416 0 : match self {
417 0 : EvictionPolicy::NoEviction => "NoEviction",
418 0 : EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
419 0 : EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
420 : }
421 0 : }
422 : }
423 :
/// Selector for the compaction algorithm.
///
/// The textual form (used for `Display`, `FromStr` and serde) is the kebab-case
/// variant name.
#[derive(
    Eq,
    PartialEq,
    Debug,
    Copy,
    Clone,
    strum_macros::EnumString,
    strum_macros::Display,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum CompactionAlgorithm {
    Legacy,
    Tiered,
}
440 :
/// Compression setting for image data.
///
/// (De)serialized through its `FromStr` / `Display` implementations, i.e. as a string.
#[derive(
    Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
)]
pub enum ImageCompressionAlgorithm {
    // Disabled for writes, support decompressing during read path
    Disabled,
    /// Zstandard compression. Level 0 and `None` mean the same (default level). Levels can be negative as well.
    /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
    Zstd {
        level: Option<i8>,
    },
}
453 :
454 : impl FromStr for ImageCompressionAlgorithm {
455 : type Err = anyhow::Error;
456 8 : fn from_str(s: &str) -> Result<Self, Self::Err> {
457 8 : let mut components = s.split(['(', ')']);
458 8 : let first = components
459 8 : .next()
460 8 : .ok_or_else(|| anyhow::anyhow!("empty string"))?;
461 8 : match first {
462 8 : "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
463 6 : "zstd" => {
464 6 : let level = if let Some(v) = components.next() {
465 4 : let v: i8 = v.parse()?;
466 4 : Some(v)
467 : } else {
468 2 : None
469 : };
470 :
471 6 : Ok(ImageCompressionAlgorithm::Zstd { level })
472 : }
473 0 : _ => anyhow::bail!("invalid specifier '{first}'"),
474 : }
475 8 : }
476 : }
477 :
478 : impl Display for ImageCompressionAlgorithm {
479 12 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
480 12 : match self {
481 3 : ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
482 9 : ImageCompressionAlgorithm::Zstd { level } => {
483 9 : if let Some(level) = level {
484 6 : write!(f, "zstd({})", level)
485 : } else {
486 3 : write!(f, "zstd")
487 : }
488 : }
489 : }
490 12 : }
491 : }
492 :
/// Wrapper holding the selected [`CompactionAlgorithm`] under the `kind` key.
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
pub struct CompactionAlgorithmSettings {
    pub kind: CompactionAlgorithm,
}
497 :
/// Configuration of the L0 flush mode.
///
/// Serialized in serde's internally tagged form: the kebab-case variant name is
/// stored under `mode`, and unknown fields are rejected.
#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
    /// Field names of this variant are serialized in snake_case.
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
}
504 :
/// Payload of the threshold-based [`EvictionPolicy`] variants.
///
/// Both durations are (de)serialized in humantime notation via `humantime_serde`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
    pub period: Duration,
    #[serde(with = "humantime_serde")]
    pub threshold: Duration,
}
512 :
/// Throttle parameters.
///
/// An empty `task_kinds` list effectively disables the throttle
/// (see `ThrottleConfig::disabled`).
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ThrottleConfig {
    pub task_kinds: Vec<String>, // TaskKind
    pub initial: u32,
    /// (De)serialized in humantime notation via `humantime_serde`.
    #[serde(with = "humantime_serde")]
    pub refill_interval: Duration,
    pub refill_amount: NonZeroU32,
    pub max: u32,
}
522 :
523 : impl ThrottleConfig {
524 1170 : pub fn disabled() -> Self {
525 1170 : Self {
526 1170 : task_kinds: vec![], // effectively disables the throttle
527 1170 : // other values don't matter with emtpy `task_kinds`.
528 1170 : initial: 0,
529 1170 : refill_interval: Duration::from_millis(1),
530 1170 : refill_amount: NonZeroU32::new(1).unwrap(),
531 1170 : max: 1,
532 1170 : }
533 1170 : }
534 : /// The requests per second allowed by the given config.
535 0 : pub fn steady_rps(&self) -> f64 {
536 0 : (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
537 0 : }
538 : }
539 :
/// A flattened analog of a `pageserver::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
pub enum LocationConfigMode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
    Secondary,
    Detached,
}
551 :
/// Configuration specific to the `Secondary` location mode
/// (see `LocationConfig::secondary_conf`).
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfigSecondary {
    // NOTE(review): presumably whether the secondary location keeps its data warm;
    // confirm against the consumer.
    pub warm: bool,
}
556 :
/// An alternative representation of `pageserver::tenant::LocationConf`,
/// for use in external-facing APIs.
///
/// Fields marked `#[serde(default)]` may be omitted from the input and then take
/// their type's default value.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfig {
    pub mode: LocationConfigMode,
    /// If attaching, in what generation?
    #[serde(default)]
    pub generation: Option<u32>,

    // If requesting mode `Secondary`, configuration for that.
    #[serde(default)]
    pub secondary_conf: Option<LocationConfigSecondary>,

    // Shard parameters: if shard_count is nonzero, then other shard_* fields
    // must be set accurately.
    #[serde(default)]
    pub shard_number: u8,
    #[serde(default)]
    pub shard_count: u8,
    #[serde(default)]
    pub shard_stripe_size: u32,

    // This configuration only affects attached mode, but should be provided irrespective
    // of the mode, as a secondary location might transition on startup if the response
    // to the `/re-attach` control plane API requests it.
    pub tenant_conf: TenantConfig,
}
584 :
/// Response listing tenant shards, each paired with its optional [`LocationConfig`].
#[derive(Serialize, Deserialize)]
pub struct LocationConfigListResponse {
    pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
}
589 :
/// Status response carrying a node id. Serialize-only: no `Deserialize` is derived.
#[derive(Serialize)]
pub struct StatusResponse {
    pub id: NodeId,
}
594 :
/// Request body for configuring a tenant location: the [`LocationConfig`] fields,
/// flattened into the top level of the request.
// NOTE(review): serde's documentation states that `deny_unknown_fields` is not
// supported in combination with `flatten`; verify that unknown fields are really
// rejected here as the comment on `config` intends.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
601 :
/// Request body of a tenant time-travel operation. Unknown fields are rejected.
// NOTE(review): presumably `shard_counts` lists the shard counts to operate on;
// confirm against the handler.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantTimeTravelRequest {
    pub shard_counts: Vec<ShardCount>,
}
607 :
/// Pairs a tenant shard with a node id. Unknown fields are rejected on deserialization.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantShardLocation {
    pub shard_id: TenantShardId,
    pub node_id: NodeId,
}
614 :
/// Response to a tenant location configuration request: one [`TenantShardLocation`]
/// per shard. Unknown fields are rejected on deserialization.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigResponse {
    pub shards: Vec<TenantShardLocation>,
    // If the shards' ShardCount count is >1, stripe_size will be set.
    pub stripe_size: Option<ShardStripeSize>,
}
622 :
/// Request body for setting a tenant's configuration: the tenant id plus the
/// [`TenantConfig`] fields, flattened into the top level of the request.
// NOTE(review): serde's documentation states that `deny_unknown_fields` is not
// supported in combination with `flatten`; verify that unknown fields are really
// rejected here as the comment on `config` intends.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigRequest {
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
630 :
631 : impl std::ops::Deref for TenantConfigRequest {
632 : type Target = TenantConfig;
633 :
634 0 : fn deref(&self) -> &Self::Target {
635 0 : &self.config
636 0 : }
637 : }
638 :
639 : impl TenantConfigRequest {
640 0 : pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
641 0 : let config = TenantConfig::default();
642 0 : TenantConfigRequest { tenant_id, config }
643 0 : }
644 : }
645 :
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
///
/// Serialized in serde's adjacently tagged form: the snake_case variant name is
/// stored under `slug` and the variant payload (if any) under `data`.
#[derive(Serialize, Deserialize, Clone)]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
pub enum TenantAttachmentStatus {
    Maybe,
    Attached,
    Failed { reason: String },
}
654 :
/// Summary information about one tenant shard.
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
    pub id: TenantShardId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub attachment_status: TenantAttachmentStatus,
    pub generation: u32,

    /// Opaque explanation if gc is being blocked.
    ///
    /// Only looked up for the individual tenant detail, not the listing. This is purely for
    /// debugging, not included in openapi.
    ///
    /// Omitted from the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_blocking: Option<String>,
}
673 :
/// Detailed view of a tenant: the [`TenantInfo`] fields (flattened into the same
/// object), plus walredo status and the tenant's timeline ids.
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantDetails {
    #[serde(flatten)]
    pub tenant_info: TenantInfo,

    pub walredo: Option<WalRedoManagerStatus>,

    pub timelines: Vec<TimelineId>,
}
683 :
/// Archival state of a timeline, as used in [`TimelineArchivalConfigRequest`].
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
pub enum TimelineArchivalState {
    Archived,
    Unarchived,
}
689 :
/// Request body carrying a [`TimelineArchivalState`].
// NOTE(review): presumably the state the timeline should be switched to; confirm
// against the handler.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelineArchivalConfigRequest {
    pub state: TimelineArchivalState,
}
694 :
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,

    pub ancestor_timeline_id: Option<TimelineId>,
    pub ancestor_lsn: Option<Lsn>,
    pub last_record_lsn: Lsn,
    pub prev_record_lsn: Option<Lsn>,
    pub latest_gc_cutoff_lsn: Lsn,
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have successfully uploaded to remote storage
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertising to safekeepers
    pub remote_consistent_lsn_visible: Lsn,

    /// The LSN from the start of the root timeline (never changes)
    pub initdb_lsn: Lsn,

    pub current_logical_size: u64,
    pub current_logical_size_is_accurate: bool,

    pub directory_entries_counts: Vec<u64>,

    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
    pub current_logical_size_non_incremental: Option<u64>,

    /// How many bytes of WAL are within this branch's pitr_interval.  If the pitr_interval goes
    /// beyond the branch's branch point, we only count up to the branch point.
    pub pitr_history_size: u64,

    /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
    /// ancestor data used by this branch would have been retained anyway).  If this is false, then
    /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
    /// otherwise be able to GC.
    pub within_ancestor_pitr: bool,

    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
    pub pg_version: u32,

    pub state: TimelineState,

    pub walreceiver_status: String,

    // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
    // Backward compatibility: you will get a JSON not containing the newly-added field.
    // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
    // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
    // read.
    /// The last aux file policy being used on this timeline
    pub last_aux_file_policy: Option<AuxFilePolicy>,
    pub is_archived: Option<bool>,
}
758 :
/// Description of a layer map, split into in-memory and historic layers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerMapInfo {
    pub in_memory_layers: Vec<InMemoryLayerInfo>,
    pub historic_layers: Vec<HistoricLayerInfo>,
}
764 :
/// The residence status of a layer
///
/// Uses serde's default (externally tagged) representation, i.e. the variant name.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum LayerResidenceStatus {
    /// Residence status for a layer file that exists locally.
    /// It may also exist on the remote, we don't care here.
    Resident,
    /// Residence status for a layer file that only exists on the remote.
    Evicted,
}
774 :
/// Access statistics reported for a layer.
///
/// Both timestamps are (de)serialized through `serde_with::TimestampMilliSeconds`.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerAccessStats {
    // NOTE(review): presumably the time of the most recent access; confirm with the producer.
    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
    pub access_time: SystemTime,

    // NOTE(review): presumably the time of the last residence change; confirm with the producer.
    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
    pub residence_time: SystemTime,

    pub visible: bool,
}
786 :
/// Description of an in-memory layer.
///
/// Serialized in serde's internally tagged form: the variant name is stored under
/// `kind`, next to the variant's fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum InMemoryLayerInfo {
    /// Only a start LSN is recorded.
    Open { lsn_start: Lsn },
    /// Both a start and an end LSN are recorded.
    Frozen { lsn_start: Lsn, lsn_end: Lsn },
}
793 :
/// Description of a historic layer.
///
/// Serialized in serde's internally tagged form: the variant name is stored under
/// `kind`, next to the variant's fields. Both variants share `layer_file_name`,
/// `layer_file_size`, `lsn_start`, `remote` and `access_stats`; `Delta`
/// additionally has `lsn_end` and `l0`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum HistoricLayerInfo {
    Delta {
        layer_file_name: String,
        layer_file_size: u64,

        lsn_start: Lsn,
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,

        l0: bool,
    },
    Image {
        layer_file_name: String,
        layer_file_size: u64,

        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
    },
}
817 :
818 : impl HistoricLayerInfo {
819 0 : pub fn layer_file_name(&self) -> &str {
820 0 : match self {
821 : HistoricLayerInfo::Delta {
822 0 : layer_file_name, ..
823 0 : } => layer_file_name,
824 : HistoricLayerInfo::Image {
825 0 : layer_file_name, ..
826 0 : } => layer_file_name,
827 : }
828 0 : }
829 0 : pub fn is_remote(&self) -> bool {
830 0 : match self {
831 0 : HistoricLayerInfo::Delta { remote, .. } => *remote,
832 0 : HistoricLayerInfo::Image { remote, .. } => *remote,
833 : }
834 0 : }
835 0 : pub fn set_remote(&mut self, value: bool) {
836 0 : let field = match self {
837 0 : HistoricLayerInfo::Delta { remote, .. } => remote,
838 0 : HistoricLayerInfo::Image { remote, .. } => remote,
839 : };
840 0 : *field = value;
841 0 : }
842 0 : pub fn layer_file_size(&self) -> u64 {
843 0 : match self {
844 : HistoricLayerInfo::Delta {
845 0 : layer_file_size, ..
846 0 : } => *layer_file_size,
847 : HistoricLayerInfo::Image {
848 0 : layer_file_size, ..
849 0 : } => *layer_file_size,
850 : }
851 0 : }
852 : }
853 :
/// Request to spawn a remote-layer download task.
#[derive(Debug, Serialize, Deserialize)]
pub struct DownloadRemoteLayersTaskSpawnRequest {
    /// Concurrency limit for downloads; the type guarantees it is non-zero.
    pub max_concurrent_downloads: NonZeroUsize,
}
858 :
/// Request carrying aux files as a string-to-string map.
// NOTE(review): presumably keyed by file path with the file content as value;
// confirm against the handler.
#[derive(Debug, Serialize, Deserialize)]
pub struct IngestAuxFilesRequest {
    pub aux_files: HashMap<String, String>,
}
863 :
/// Request carrying a single LSN.
// NOTE(review): presumably the LSN at which aux files are listed; confirm against
// the handler.
#[derive(Debug, Serialize, Deserialize)]
pub struct ListAuxFilesRequest {
    pub lsn: Lsn,
}
868 :
/// Progress report of a remote-layer download task.
///
/// The three counters only become stable once the task is completed.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DownloadRemoteLayersTaskInfo {
    pub task_id: String,
    pub state: DownloadRemoteLayersTaskState,
    pub total_layer_count: u64,         // stable once `completed`
    pub successful_download_count: u64, // stable once `completed`
    pub failed_download_count: u64,     // stable once `completed`
}
877 :
/// State of a remote-layer download task, as reported in
/// [`DownloadRemoteLayersTaskInfo`].
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum DownloadRemoteLayersTaskState {
    Running,
    Completed,
    ShutDown,
}
884 :
/// Request body for a timeline GC run.
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineGcRequest {
    // NOTE(review): presumably overrides the configured `gc_horizon` when set;
    // confirm against the handler.
    pub gc_horizon: Option<u64>,
}
889 :
/// Status of a walredo process, identified by its pid.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
    pub pid: u32,
}
894 :
/// Status of a walredo manager; both pieces of information are optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
    /// UTC timestamp of the last redo, if any is recorded.
    pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
    /// Status of the walredo process, if any is reported.
    pub process: Option<WalRedoManagerProcessStatus>,
}
900 :
/// The progress of a secondary tenant.
///
/// It is mostly useful when doing a long running download: e.g. initiating
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
/// what's happening.
///
/// The derived `Default` is an all-zero progress with no heatmap mtime.
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
    /// The remote storage LastModified time of the heatmap object we last downloaded.
    pub heatmap_mtime: Option<serde_system_time::SystemTime>,

    /// The number of layers currently on-disk
    pub layers_downloaded: usize,
    /// The number of layers in the most recently seen heatmap
    pub layers_total: usize,

    /// The number of layer bytes currently on-disk
    pub bytes_downloaded: u64,
    /// The number of layer bytes in the most recently seen heatmap
    pub bytes_total: u64,
}
921 :
922 0 : #[derive(Serialize, Deserialize, Debug)]
923 : pub struct TenantScanRemoteStorageShard {
924 : pub tenant_shard_id: TenantShardId,
925 : pub generation: Option<u32>,
926 : }
927 :
928 0 : #[derive(Serialize, Deserialize, Debug, Default)]
929 : pub struct TenantScanRemoteStorageResponse {
930 : pub shards: Vec<TenantScanRemoteStorageShard>,
931 : }
932 :
933 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
934 : #[serde(rename_all = "snake_case")]
935 : pub enum TenantSorting {
936 : ResidentSize,
937 : MaxLogicalSize,
938 : }
939 :
940 : impl Default for TenantSorting {
941 0 : fn default() -> Self {
942 0 : Self::ResidentSize
943 0 : }
944 : }
945 :
946 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
947 : pub struct TopTenantShardsRequest {
948 : // How would you like to sort the tenants?
949 : pub order_by: TenantSorting,
950 :
951 : // How many results?
952 : pub limit: usize,
953 :
954 : // Omit tenants with more than this many shards (e.g. if this is the max number of shards
955 : // that the caller would ever split to)
956 : pub where_shards_lt: Option<ShardCount>,
957 :
958 : // Omit tenants where the ordering metric is less than this (this is an optimization to
959 : // let us quickly exclude numerous tiny shards)
960 : pub where_gt: Option<u64>,
961 : }
962 :
963 0 : #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
964 : pub struct TopTenantShardItem {
965 : pub id: TenantShardId,
966 :
967 : /// Total size of layers on local disk for all timelines in this tenant
968 : pub resident_size: u64,
969 :
970 : /// Total size of layers in remote storage for all timelines in this tenant
971 : pub physical_size: u64,
972 :
973 : /// The largest logical size of a timeline within this tenant
974 : pub max_logical_size: u64,
975 : }
976 :
977 0 : #[derive(Serialize, Deserialize, Debug, Default)]
978 : pub struct TopTenantShardsResponse {
979 : pub shards: Vec<TopTenantShardItem>,
980 : }
981 :
982 : pub mod virtual_file {
983 : use std::path::PathBuf;
984 :
985 : #[derive(
986 : Copy,
987 : Clone,
988 : PartialEq,
989 : Eq,
990 : Hash,
991 606 : strum_macros::EnumString,
992 0 : strum_macros::Display,
993 0 : serde_with::DeserializeFromStr,
994 : serde_with::SerializeDisplay,
995 : Debug,
996 : )]
997 : #[strum(serialize_all = "kebab-case")]
998 : pub enum IoEngineKind {
999 : StdFs,
1000 : #[cfg(target_os = "linux")]
1001 : TokioEpollUring,
1002 : }
1003 :
1004 : /// Direct IO modes for a pageserver.
1005 0 : #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
1006 : #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
1007 : pub enum DirectIoMode {
1008 : /// Direct IO disabled (uses usual buffered IO).
1009 : #[default]
1010 : Disabled,
1011 : /// Direct IO disabled (performs checks and perf simulations).
1012 : Evaluate {
1013 : /// Alignment check level
1014 : alignment_check: DirectIoAlignmentCheckLevel,
1015 : /// Latency padded for performance simulation.
1016 : latency_padding: DirectIoLatencyPadding,
1017 : },
1018 : /// Direct IO enabled.
1019 : Enabled {
1020 : /// Actions to perform on alignment error.
1021 : on_alignment_error: DirectIoOnAlignmentErrorAction,
1022 : },
1023 : }
1024 :
1025 0 : #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
1026 : #[serde(rename_all = "kebab-case")]
1027 : pub enum DirectIoAlignmentCheckLevel {
1028 : #[default]
1029 : Error,
1030 : Log,
1031 : None,
1032 : }
1033 :
1034 0 : #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
1035 : #[serde(rename_all = "kebab-case")]
1036 : pub enum DirectIoOnAlignmentErrorAction {
1037 : Error,
1038 : #[default]
1039 : FallbackToBuffered,
1040 : }
1041 :
1042 0 : #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
1043 : #[serde(tag = "type", rename_all = "kebab-case")]
1044 : pub enum DirectIoLatencyPadding {
1045 : /// Pad virtual file operations with IO to a fake file.
1046 : FakeFileRW { path: PathBuf },
1047 : #[default]
1048 : None,
1049 : }
1050 : }
1051 :
1052 : // Wrapped in libpq CopyData
1053 : #[derive(PartialEq, Eq, Debug)]
1054 : pub enum PagestreamFeMessage {
1055 : Exists(PagestreamExistsRequest),
1056 : Nblocks(PagestreamNblocksRequest),
1057 : GetPage(PagestreamGetPageRequest),
1058 : DbSize(PagestreamDbSizeRequest),
1059 : GetSlruSegment(PagestreamGetSlruSegmentRequest),
1060 : }
1061 :
1062 : // Wrapped in libpq CopyData
1063 0 : #[derive(strum_macros::EnumProperty)]
1064 : pub enum PagestreamBeMessage {
1065 : Exists(PagestreamExistsResponse),
1066 : Nblocks(PagestreamNblocksResponse),
1067 : GetPage(PagestreamGetPageResponse),
1068 : Error(PagestreamErrorResponse),
1069 : DbSize(PagestreamDbSizeResponse),
1070 : GetSlruSegment(PagestreamGetSlruSegmentResponse),
1071 : }
1072 :
/// Wire tag identifying each response message kind; the discriminant is the tag byte.
// Keep in sync with `pagestore_client.h`
#[repr(u8)]
enum PagestreamBeMessageTag {
    Exists = 100,
    Nblocks = 101,
    GetPage = 102,
    Error = 103,
    DbSize = 104,
    GetSlruSegment = 105,
}

/// Maps a raw tag byte back to its tag; an unrecognized byte is handed back as the error.
impl TryFrom<u8> for PagestreamBeMessageTag {
    type Error = u8;

    fn try_from(value: u8) -> Result<Self, u8> {
        // Variants are spelled with the full enum path: inside this impl `Self::Error`
        // would collide with the associated error type.
        let tag = match value {
            100 => PagestreamBeMessageTag::Exists,
            101 => PagestreamBeMessageTag::Nblocks,
            102 => PagestreamBeMessageTag::GetPage,
            103 => PagestreamBeMessageTag::Error,
            104 => PagestreamBeMessageTag::DbSize,
            105 => PagestreamBeMessageTag::GetSlruSegment,
            unrecognized => return Err(unrecognized),
        };
        Ok(tag)
    }
}
1097 :
// A GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that
// means "get the latest version present". It's used by the primary server, which knows that
// no one else is writing WAL. 'not_modified_since' must be set to a proper value even if
// request_lsn is Lsn::Max. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'),
// but passing an earlier LSN can speed up the request, by allowing the pageserver to process
// the request without waiting for 'request_lsn' to arrive.
//
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1
// interface was sufficient for the primary; the 'lsn' was equivalent to the
// 'not_modified_since' value, and 'latest' was set to true. The V2 interface was added because
// there was no correct way for a standby to request a page at a particular non-latest LSN, and
// also include the 'not_modified_since' hint. That led to an awkward choice of either using an
// old LSN in the request, if the standby knows that the page hasn't been modified since, and
// risk getting an error if that LSN has fallen behind the GC horizon, or requesting the current
// replay LSN, which could require the pageserver unnecessarily to wait for the WAL to arrive up
// to that point. The new V2 interface allows sending both LSNs, and lets the pageserver do the
// right thing. There was no difference in the responses between V1 and V2.
//
/// Version of the pagestream protocol; `V2` is the only variant.
#[derive(Copy, Clone)]
pub enum PagestreamProtocolVersion {
    V2,
}
1126 :
1127 : #[derive(Debug, PartialEq, Eq)]
1128 : pub struct PagestreamExistsRequest {
1129 : pub request_lsn: Lsn,
1130 : pub not_modified_since: Lsn,
1131 : pub rel: RelTag,
1132 : }
1133 :
1134 : #[derive(Debug, PartialEq, Eq)]
1135 : pub struct PagestreamNblocksRequest {
1136 : pub request_lsn: Lsn,
1137 : pub not_modified_since: Lsn,
1138 : pub rel: RelTag,
1139 : }
1140 :
1141 : #[derive(Debug, PartialEq, Eq)]
1142 : pub struct PagestreamGetPageRequest {
1143 : pub request_lsn: Lsn,
1144 : pub not_modified_since: Lsn,
1145 : pub rel: RelTag,
1146 : pub blkno: u32,
1147 : }
1148 :
1149 : #[derive(Debug, PartialEq, Eq)]
1150 : pub struct PagestreamDbSizeRequest {
1151 : pub request_lsn: Lsn,
1152 : pub not_modified_since: Lsn,
1153 : pub dbnode: u32,
1154 : }
1155 :
1156 : #[derive(Debug, PartialEq, Eq)]
1157 : pub struct PagestreamGetSlruSegmentRequest {
1158 : pub request_lsn: Lsn,
1159 : pub not_modified_since: Lsn,
1160 : pub kind: u8,
1161 : pub segno: u32,
1162 : }
1163 :
/// Payload of [`PagestreamBeMessage::Exists`].
#[derive(Debug)]
pub struct PagestreamExistsResponse {
    /// Answer to the existence query.
    pub exists: bool,
}
1168 :
/// Payload of [`PagestreamBeMessage::Nblocks`].
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
    /// Number of blocks reported.
    pub n_blocks: u32,
}
1173 :
1174 : #[derive(Debug)]
1175 : pub struct PagestreamGetPageResponse {
1176 : pub page: Bytes,
1177 : }
1178 :
1179 : #[derive(Debug)]
1180 : pub struct PagestreamGetSlruSegmentResponse {
1181 : pub segment: Bytes,
1182 : }
1183 :
/// Payload of [`PagestreamBeMessage::Error`].
#[derive(Debug)]
pub struct PagestreamErrorResponse {
    /// Human-readable error text.
    pub message: String,
}
1188 :
/// Payload of [`PagestreamBeMessage::DbSize`].
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
    /// Reported database size, as a signed 64-bit integer.
    pub db_size: i64,
}
1193 :
1194 : // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
1195 : // that require pageserver-internal types. It is sufficient to get the total size.
1196 0 : #[derive(Serialize, Deserialize, Debug)]
1197 : pub struct TenantHistorySize {
1198 : pub id: TenantId,
1199 : /// Size is a mixture of WAL and logical size, so the unit is bytes.
1200 : ///
1201 : /// Will be none if `?inputs_only=true` was given.
1202 : pub size: Option<u64>,
1203 : }
1204 :
1205 : impl PagestreamFeMessage {
1206 : /// Serialize a compute -> pageserver message. This is currently only used in testing
1207 : /// tools. Always uses protocol version 2.
1208 4 : pub fn serialize(&self) -> Bytes {
1209 4 : let mut bytes = BytesMut::new();
1210 4 :
1211 4 : match self {
1212 1 : Self::Exists(req) => {
1213 1 : bytes.put_u8(0);
1214 1 : bytes.put_u64(req.request_lsn.0);
1215 1 : bytes.put_u64(req.not_modified_since.0);
1216 1 : bytes.put_u32(req.rel.spcnode);
1217 1 : bytes.put_u32(req.rel.dbnode);
1218 1 : bytes.put_u32(req.rel.relnode);
1219 1 : bytes.put_u8(req.rel.forknum);
1220 1 : }
1221 :
1222 1 : Self::Nblocks(req) => {
1223 1 : bytes.put_u8(1);
1224 1 : bytes.put_u64(req.request_lsn.0);
1225 1 : bytes.put_u64(req.not_modified_since.0);
1226 1 : bytes.put_u32(req.rel.spcnode);
1227 1 : bytes.put_u32(req.rel.dbnode);
1228 1 : bytes.put_u32(req.rel.relnode);
1229 1 : bytes.put_u8(req.rel.forknum);
1230 1 : }
1231 :
1232 1 : Self::GetPage(req) => {
1233 1 : bytes.put_u8(2);
1234 1 : bytes.put_u64(req.request_lsn.0);
1235 1 : bytes.put_u64(req.not_modified_since.0);
1236 1 : bytes.put_u32(req.rel.spcnode);
1237 1 : bytes.put_u32(req.rel.dbnode);
1238 1 : bytes.put_u32(req.rel.relnode);
1239 1 : bytes.put_u8(req.rel.forknum);
1240 1 : bytes.put_u32(req.blkno);
1241 1 : }
1242 :
1243 1 : Self::DbSize(req) => {
1244 1 : bytes.put_u8(3);
1245 1 : bytes.put_u64(req.request_lsn.0);
1246 1 : bytes.put_u64(req.not_modified_since.0);
1247 1 : bytes.put_u32(req.dbnode);
1248 1 : }
1249 :
1250 0 : Self::GetSlruSegment(req) => {
1251 0 : bytes.put_u8(4);
1252 0 : bytes.put_u64(req.request_lsn.0);
1253 0 : bytes.put_u64(req.not_modified_since.0);
1254 0 : bytes.put_u8(req.kind);
1255 0 : bytes.put_u32(req.segno);
1256 0 : }
1257 : }
1258 :
1259 4 : bytes.into()
1260 4 : }
1261 :
1262 4 : pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
1263 : // these correspond to the NeonMessageTag enum in pagestore_client.h
1264 : //
1265 : // TODO: consider using protobuf or serde bincode for less error prone
1266 : // serialization.
1267 4 : let msg_tag = body.read_u8()?;
1268 :
1269 : // these two fields are the same for every request type
1270 4 : let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
1271 4 : let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
1272 :
1273 4 : match msg_tag {
1274 : 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
1275 1 : request_lsn,
1276 1 : not_modified_since,
1277 1 : rel: RelTag {
1278 1 : spcnode: body.read_u32::<BigEndian>()?,
1279 1 : dbnode: body.read_u32::<BigEndian>()?,
1280 1 : relnode: body.read_u32::<BigEndian>()?,
1281 1 : forknum: body.read_u8()?,
1282 : },
1283 : })),
1284 : 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1285 1 : request_lsn,
1286 1 : not_modified_since,
1287 1 : rel: RelTag {
1288 1 : spcnode: body.read_u32::<BigEndian>()?,
1289 1 : dbnode: body.read_u32::<BigEndian>()?,
1290 1 : relnode: body.read_u32::<BigEndian>()?,
1291 1 : forknum: body.read_u8()?,
1292 : },
1293 : })),
1294 : 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1295 1 : request_lsn,
1296 1 : not_modified_since,
1297 1 : rel: RelTag {
1298 1 : spcnode: body.read_u32::<BigEndian>()?,
1299 1 : dbnode: body.read_u32::<BigEndian>()?,
1300 1 : relnode: body.read_u32::<BigEndian>()?,
1301 1 : forknum: body.read_u8()?,
1302 : },
1303 1 : blkno: body.read_u32::<BigEndian>()?,
1304 : })),
1305 : 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1306 1 : request_lsn,
1307 1 : not_modified_since,
1308 1 : dbnode: body.read_u32::<BigEndian>()?,
1309 : })),
1310 : 4 => Ok(PagestreamFeMessage::GetSlruSegment(
1311 : PagestreamGetSlruSegmentRequest {
1312 0 : request_lsn,
1313 0 : not_modified_since,
1314 0 : kind: body.read_u8()?,
1315 0 : segno: body.read_u32::<BigEndian>()?,
1316 : },
1317 : )),
1318 0 : _ => bail!("unknown smgr message tag: {:?}", msg_tag),
1319 : }
1320 4 : }
1321 : }
1322 :
1323 : impl PagestreamBeMessage {
1324 0 : pub fn serialize(&self) -> Bytes {
1325 0 : let mut bytes = BytesMut::new();
1326 :
1327 : use PagestreamBeMessageTag as Tag;
1328 0 : match self {
1329 0 : Self::Exists(resp) => {
1330 0 : bytes.put_u8(Tag::Exists as u8);
1331 0 : bytes.put_u8(resp.exists as u8);
1332 0 : }
1333 :
1334 0 : Self::Nblocks(resp) => {
1335 0 : bytes.put_u8(Tag::Nblocks as u8);
1336 0 : bytes.put_u32(resp.n_blocks);
1337 0 : }
1338 :
1339 0 : Self::GetPage(resp) => {
1340 0 : bytes.put_u8(Tag::GetPage as u8);
1341 0 : bytes.put(&resp.page[..]);
1342 0 : }
1343 :
1344 0 : Self::Error(resp) => {
1345 0 : bytes.put_u8(Tag::Error as u8);
1346 0 : bytes.put(resp.message.as_bytes());
1347 0 : bytes.put_u8(0); // null terminator
1348 0 : }
1349 0 : Self::DbSize(resp) => {
1350 0 : bytes.put_u8(Tag::DbSize as u8);
1351 0 : bytes.put_i64(resp.db_size);
1352 0 : }
1353 :
1354 0 : Self::GetSlruSegment(resp) => {
1355 0 : bytes.put_u8(Tag::GetSlruSegment as u8);
1356 0 : bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
1357 0 : bytes.put(&resp.segment[..]);
1358 0 : }
1359 : }
1360 :
1361 0 : bytes.into()
1362 0 : }
1363 :
1364 0 : pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
1365 0 : let mut buf = buf.reader();
1366 0 : let msg_tag = buf.read_u8()?;
1367 :
1368 : use PagestreamBeMessageTag as Tag;
1369 0 : let ok =
1370 0 : match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
1371 : Tag::Exists => {
1372 0 : let exists = buf.read_u8()?;
1373 0 : Self::Exists(PagestreamExistsResponse {
1374 0 : exists: exists != 0,
1375 0 : })
1376 : }
1377 : Tag::Nblocks => {
1378 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1379 0 : Self::Nblocks(PagestreamNblocksResponse { n_blocks })
1380 : }
1381 : Tag::GetPage => {
1382 0 : let mut page = vec![0; 8192]; // TODO: use MaybeUninit
1383 0 : buf.read_exact(&mut page)?;
1384 0 : PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
1385 : }
1386 : Tag::Error => {
1387 0 : let mut msg = Vec::new();
1388 0 : buf.read_until(0, &mut msg)?;
1389 0 : let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
1390 0 : let rust_str = cstring.to_str()?;
1391 0 : PagestreamBeMessage::Error(PagestreamErrorResponse {
1392 0 : message: rust_str.to_owned(),
1393 0 : })
1394 : }
1395 : Tag::DbSize => {
1396 0 : let db_size = buf.read_i64::<BigEndian>()?;
1397 0 : Self::DbSize(PagestreamDbSizeResponse { db_size })
1398 : }
1399 : Tag::GetSlruSegment => {
1400 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1401 0 : let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
1402 0 : buf.read_exact(&mut segment)?;
1403 0 : Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
1404 0 : segment: segment.into(),
1405 0 : })
1406 : }
1407 : };
1408 0 : let remaining = buf.into_inner();
1409 0 : if !remaining.is_empty() {
1410 0 : anyhow::bail!(
1411 0 : "remaining bytes in msg with tag={msg_tag}: {}",
1412 0 : remaining.len()
1413 0 : );
1414 0 : }
1415 0 : Ok(ok)
1416 0 : }
1417 :
1418 0 : pub fn kind(&self) -> &'static str {
1419 0 : match self {
1420 0 : Self::Exists(_) => "Exists",
1421 0 : Self::Nblocks(_) => "Nblocks",
1422 0 : Self::GetPage(_) => "GetPage",
1423 0 : Self::Error(_) => "Error",
1424 0 : Self::DbSize(_) => "DbSize",
1425 0 : Self::GetSlruSegment(_) => "GetSlruSegment",
1426 : }
1427 0 : }
1428 : }
1429 :
#[cfg(test)]
mod tests {
    use serde_json::json;
    use std::str::FromStr;

    use super::*;

    /// Every `PagestreamFeMessage` variant must survive a serialize -> parse round trip.
    #[test]
    fn test_pagestream() {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                dbnode: 7,
            }),
            // Previously missing: the GetSlruSegment arms were never exercised.
            PagestreamFeMessage::GetSlruSegment(PagestreamGetSlruSegmentRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                kind: 1,
                segno: 5,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
            // assert_eq! prints both sides on failure (the type derives Debug).
            assert_eq!(msg, reconstructed);
        }
    }

    /// Every `PagestreamBeMessage` variant must survive serialize -> deserialize; the
    /// enum has no `PartialEq`, so equality is checked on the re-serialized bytes.
    #[test]
    fn test_pagestream_be_roundtrip() {
        let block = BLCKSZ as usize;
        let messages = vec![
            PagestreamBeMessage::Exists(PagestreamExistsResponse { exists: true }),
            PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks: 42 }),
            PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
                page: Bytes::from(vec![0xAB; block]),
            }),
            PagestreamBeMessage::Error(PagestreamErrorResponse {
                message: "something went wrong".to_string(),
            }),
            PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size: -1 }),
            PagestreamBeMessage::GetSlruSegment(PagestreamGetSlruSegmentResponse {
                segment: Bytes::from(vec![0xCD; 2 * block]),
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
            let reconstructed = PagestreamBeMessage::deserialize(bytes.clone()).unwrap();
            assert_eq!(reconstructed.kind(), msg.kind());
            assert_eq!(reconstructed.serialize(), bytes);
        }

        // Unknown tag.
        assert!(PagestreamBeMessage::deserialize(Bytes::from_static(&[0])).is_err());
        // Valid `Exists` message followed by a stray byte.
        assert!(PagestreamBeMessage::deserialize(Bytes::from_static(&[100, 1, 0])).is_err());
        // `GetSlruSegment` announcing one block but carrying no data.
        assert!(
            PagestreamBeMessage::deserialize(Bytes::from_static(&[105, 0, 0, 0, 1])).is_err()
        );
    }

    #[test]
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
            "state": {
                "slug": "Active",
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        let original_broken = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
            "state": {
                "slug": "Broken",
                "data": {
                    "backtrace": "backtrace info",
                    "reason": "reason",
                }
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        assert_eq!(
            serde_json::to_value(&original_active).unwrap(),
            expected_active
        );

        assert_eq!(
            serde_json::to_value(&original_broken).unwrap(),
            expected_broken
        );
        assert!(format!("{:?}", &original_broken.state).contains("reason"));
        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
    }

    #[test]
    fn test_reject_unknown_field() {
        let id = TenantId::generate();
        let config_request = json!({
            "tenant_id": id.to_string(),
            "unknown_field": "unknown_value".to_string(),
        });
        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
            "expect unknown field `unknown_field` error, got: {}",
            err
        );
    }

    #[test]
    fn tenantstatus_activating_serde() {
        let states = [
            TenantState::Activating(ActivatingFrom::Loading),
            TenantState::Activating(ActivatingFrom::Attaching),
        ];
        let expected = r#"[{"slug":"Activating","data":"Loading"},{"slug":"Activating","data":"Attaching"}]"#;

        let actual = serde_json::to_string(&states).unwrap();

        assert_eq!(actual, expected);

        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();

        assert_eq!(states.as_slice(), &parsed);
    }

    #[test]
    fn tenantstatus_activating_strum() {
        // tests added, because we use these for metrics
        let examples = [
            (line!(), TenantState::Loading, "Loading"),
            (line!(), TenantState::Attaching, "Attaching"),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Loading),
                "Activating",
            ),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Attaching),
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
            (
                line!(),
                TenantState::Stopping {
                    progress: utils::completion::Barrier::default(),
                },
                "Stopping",
            ),
            (
                line!(),
                TenantState::Broken {
                    reason: "Example".into(),
                    backtrace: "Looooong backtrace".into(),
                },
                "Broken",
            ),
        ];

        for (line, rendered, expected) in examples {
            let actual: &'static str = rendered.into();
            assert_eq!(actual, expected, "example on {line}");
        }
    }

    #[test]
    fn test_aux_file_migration_path() {
        use AuxFilePolicy::{CrossValidation, V1, V2};

        // (current policy, requested policy, whether the migration is valid)
        let cases = [
            // Starting from no policy at all, anything goes.
            (None, V1, true),
            (None, V2, true),
            (None, CrossValidation, true),
            // Self-migration is not a valid migration path, and the caller should handle
            // it by itself.
            (Some(V1), V1, false),
            (Some(V2), V2, false),
            (Some(CrossValidation), CrossValidation, false),
            // Migrations not allowed
            (Some(CrossValidation), V1, false),
            (Some(V1), V2, false),
            (Some(V2), V1, false),
            (Some(V2), CrossValidation, false),
            (Some(V1), CrossValidation, false),
            // Migrations allowed
            (Some(CrossValidation), V2, true),
        ];

        for (from, to, expected) in cases {
            // Rendered up front because the call below takes both values by value.
            let label = format!("{from:?} -> {to:?}");
            assert_eq!(
                AuxFilePolicy::is_valid_migration_path(from, to),
                expected,
                "migration {label}"
            );
        }
    }

    #[test]
    fn test_aux_parse() {
        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(
            AuxFilePolicy::from_str("cross-validation").unwrap(),
            AuxFilePolicy::CrossValidation
        );
    }

    #[test]
    fn test_image_compression_algorithm_parsing() {
        use ImageCompressionAlgorithm::*;
        let cases = [
            ("disabled", Disabled),
            ("zstd", Zstd { level: None }),
            ("zstd(18)", Zstd { level: Some(18) }),
            ("zstd(-3)", Zstd { level: Some(-3) }),
        ];

        for (display, expected) in cases {
            assert_eq!(
                ImageCompressionAlgorithm::from_str(display).unwrap(),
                expected,
                "parsing works"
            );
            assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");

            let ser = serde_json::to_string(&expected).expect("serialization");
            assert_eq!(
                serde_json::from_str::<ImageCompressionAlgorithm>(&ser).unwrap(),
                expected,
                "serde roundtrip"
            );

            assert_eq!(
                serde_json::Value::String(display.to_string()),
                serde_json::to_value(expected).unwrap(),
                "Display is the serde serialization"
            );
        }
    }
}
|