Line data Source code
1 : pub mod detach_ancestor;
2 : pub mod partitioning;
3 : pub mod utilization;
4 :
5 : pub use utilization::PageserverUtilization;
6 :
7 : use std::{
8 : collections::HashMap,
9 : fmt::Display,
10 : io::{BufRead, Read},
11 : num::{NonZeroU32, NonZeroU64, NonZeroUsize},
12 : str::FromStr,
13 : sync::atomic::AtomicUsize,
14 : time::{Duration, SystemTime},
15 : };
16 :
17 : use byteorder::{BigEndian, ReadBytesExt};
18 : use postgres_ffi::BLCKSZ;
19 : use serde::{Deserialize, Serialize};
20 : use serde_with::serde_as;
21 : use utils::{
22 : completion,
23 : id::{NodeId, TenantId, TimelineId},
24 : lsn::Lsn,
25 : serde_system_time,
26 : };
27 :
28 : use crate::{
29 : reltag::RelTag,
30 : shard::{ShardCount, ShardStripeSize, TenantShardId},
31 : };
32 : use anyhow::bail;
33 : use bytes::{Buf, BufMut, Bytes, BytesMut};
34 :
35 : /// The state of a tenant in this pageserver.
36 : ///
37 : /// ```mermaid
38 : /// stateDiagram-v2
39 : ///
40 : /// [*] --> Attaching: spawn_attach()
41 : ///
42 : /// Attaching --> Activating: activate()
43 : /// Activating --> Active: infallible
44 : ///
45 : /// Attaching --> Broken: attach() failure
46 : ///
47 : /// Active --> Stopping: set_stopping(), part of shutdown & detach
48 : /// Stopping --> Broken: late error in remove_tenant_from_memory
49 : ///
50 : /// Broken --> [*]: ignore / detach / shutdown
51 : /// Stopping --> [*]: remove_from_memory complete
52 : ///
53 : /// Active --> Broken: cfg(testing)-only tenant break point
54 : /// ```
55 : #[derive(
56 : Clone,
57 : PartialEq,
58 : Eq,
59 1 : serde::Serialize,
60 3 : serde::Deserialize,
61 0 : strum_macros::Display,
62 : strum_macros::VariantNames,
63 0 : strum_macros::AsRefStr,
64 374 : strum_macros::IntoStaticStr,
65 : )]
66 : #[serde(tag = "slug", content = "data")]
67 : pub enum TenantState {
68 : /// This tenant is being attached to the pageserver.
69 : ///
70 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
71 : Attaching,
72 : /// The tenant is transitioning from Loading/Attaching to Active.
73 : ///
74 : /// While in this state, the individual timelines are being activated.
75 : ///
76 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
77 : Activating(ActivatingFrom),
78 : /// The tenant has finished activating and is open for business.
79 : ///
80 : /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
81 : Active,
82 : /// The tenant is recognized by pageserver, but it is being detached or the
83 : /// system is being shut down.
84 : ///
85 : /// Transitions out of this state are possible through `set_broken()`.
86 : Stopping {
87 : // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
88 : // otherwise it will not be skipped during deserialization
89 : #[serde(skip)]
90 : progress: completion::Barrier,
91 : },
92 : /// The tenant is recognized by the pageserver, but can no longer be used for
93 : /// any operations.
94 : ///
95 : /// If the tenant fails to load or attach, it will transition to this state
96 : /// and it is guaranteed that no background tasks are running in its name.
97 : ///
98 : /// The other way to transition into this state is from `Stopping` state
99 : /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
100 : /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
101 : Broken { reason: String, backtrace: String },
102 : }
103 :
104 : impl TenantState {
105 0 : pub fn attachment_status(&self) -> TenantAttachmentStatus {
106 : use TenantAttachmentStatus::*;
107 :
108 : // Below TenantState::Activating is used as "transient" or "transparent" state for
109 : // attachment_status determining.
110 0 : match self {
111 : // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
112 : // So, technically, we can return Attached here.
113 : // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
114 : // But, our attach task might still be fetching the remote timelines, etc.
115 : // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
116 0 : Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
117 : // We only reach Active after successful load / attach.
118 : // So, call atttachment status Attached.
119 0 : Self::Active => Attached,
120 : // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
121 : // However, it also becomes Broken if the regular load fails.
122 : // From Console's perspective there's no practical difference
123 : // because attachment_status is polled by console only during attach operation execution.
124 0 : Self::Broken { reason, .. } => Failed {
125 0 : reason: reason.to_owned(),
126 0 : },
127 : // Why is Stopping a Maybe case? Because, during pageserver shutdown,
128 : // we set the Stopping state irrespective of whether the tenant
129 : // has finished attaching or not.
130 0 : Self::Stopping { .. } => Maybe,
131 : }
132 0 : }
133 :
134 0 : pub fn broken_from_reason(reason: String) -> Self {
135 0 : let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
136 0 : Self::Broken {
137 0 : reason,
138 0 : backtrace: backtrace_str,
139 0 : }
140 0 : }
141 : }
142 :
143 : impl std::fmt::Debug for TenantState {
144 2 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
145 2 : match self {
146 2 : Self::Broken { reason, backtrace } if !reason.is_empty() => {
147 2 : write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
148 : }
149 0 : _ => write!(f, "{self}"),
150 : }
151 2 : }
152 : }
153 :
154 : /// A temporary lease to a specific lsn inside a timeline.
155 : /// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
156 : #[serde_as]
157 0 : #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
158 : pub struct LsnLease {
159 : #[serde_as(as = "SystemTimeAsRfc3339Millis")]
160 : pub valid_until: SystemTime,
161 : }
162 :
163 : serde_with::serde_conv!(
164 : SystemTimeAsRfc3339Millis,
165 : SystemTime,
166 0 : |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
167 0 : |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
168 : );
169 :
170 : impl LsnLease {
171 : /// The default length for an explicit LSN lease request (10 minutes).
172 : pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
173 :
174 : /// The default length for an implicit LSN lease granted during
175 : /// `get_lsn_by_timestamp` request (1 minutes).
176 : pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);
177 :
178 : /// Checks whether the lease is expired.
179 6 : pub fn is_expired(&self, now: &SystemTime) -> bool {
180 6 : now > &self.valid_until
181 6 : }
182 : }
183 :
184 : /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
185 : ///
186 : /// XXX: We used to have more variants here, but now it's just one, which makes this rather
187 : /// useless. Remove, once we've checked that there's no client code left that looks at this.
188 2 : #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
189 : pub enum ActivatingFrom {
190 : /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
191 : Attaching,
192 : }
193 :
194 : /// A state of a timeline in pageserver's memory.
195 0 : #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
196 : pub enum TimelineState {
197 : /// The timeline is recognized by the pageserver but is not yet operational.
198 : /// In particular, the walreceiver connection loop is not running for this timeline.
199 : /// It will eventually transition to state Active or Broken.
200 : Loading,
201 : /// The timeline is fully operational.
202 : /// It can be queried, and the walreceiver connection loop is running.
203 : Active,
204 : /// The timeline was previously Loading or Active but is shutting down.
205 : /// It cannot transition back into any other state.
206 : Stopping,
207 : /// The timeline is broken and not operational (previous states: Loading or Active).
208 : Broken { reason: String, backtrace: String },
209 : }
210 :
211 0 : #[derive(Serialize, Deserialize, Clone)]
212 : pub struct TimelineCreateRequest {
213 : pub new_timeline_id: TimelineId,
214 : #[serde(flatten)]
215 : pub mode: TimelineCreateRequestMode,
216 : }
217 :
218 0 : #[derive(Serialize, Deserialize, Clone)]
219 : #[serde(untagged)]
220 : pub enum TimelineCreateRequestMode {
221 : Branch {
222 : ancestor_timeline_id: TimelineId,
223 : #[serde(default)]
224 : ancestor_start_lsn: Option<Lsn>,
225 : // TODO: cplane sets this, but, the branching code always
226 : // inherits the ancestor's pg_version. Earlier code wasn't
227 : // using a flattened enum, so, it was an accepted field, and
228 : // we continue to accept it by having it here.
229 : pg_version: Option<u32>,
230 : },
231 : // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
232 : // (serde picks the first matching enum variant, in declaration order).
233 : Bootstrap {
234 : #[serde(default)]
235 : existing_initdb_timeline_id: Option<TimelineId>,
236 : pg_version: Option<u32>,
237 : },
238 : }
239 :
240 0 : #[derive(Serialize, Deserialize, Clone)]
241 : pub struct LsnLeaseRequest {
242 : pub lsn: Lsn,
243 : }
244 :
245 0 : #[derive(Serialize, Deserialize)]
246 : pub struct TenantShardSplitRequest {
247 : pub new_shard_count: u8,
248 :
249 : // A tenant's stripe size is only meaningful the first time their shard count goes
250 : // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
251 : //
252 : // If this is set while the stripe count is being increased from an already >1 value,
253 : // then the request will fail with 400.
254 : pub new_stripe_size: Option<ShardStripeSize>,
255 : }
256 :
257 0 : #[derive(Serialize, Deserialize)]
258 : pub struct TenantShardSplitResponse {
259 : pub new_shards: Vec<TenantShardId>,
260 : }
261 :
262 : /// Parameters that apply to all shards in a tenant. Used during tenant creation.
263 0 : #[derive(Serialize, Deserialize, Debug)]
264 : #[serde(deny_unknown_fields)]
265 : pub struct ShardParameters {
266 : pub count: ShardCount,
267 : pub stripe_size: ShardStripeSize,
268 : }
269 :
270 : impl ShardParameters {
271 : pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
272 :
273 0 : pub fn is_unsharded(&self) -> bool {
274 0 : self.count.is_unsharded()
275 0 : }
276 : }
277 :
278 : impl Default for ShardParameters {
279 187 : fn default() -> Self {
280 187 : Self {
281 187 : count: ShardCount::new(0),
282 187 : stripe_size: Self::DEFAULT_STRIPE_SIZE,
283 187 : }
284 187 : }
285 : }
286 :
287 : /// An alternative representation of `pageserver::tenant::TenantConf` with
288 : /// simpler types.
289 2 : #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
290 : pub struct TenantConfig {
291 : pub checkpoint_distance: Option<u64>,
292 : pub checkpoint_timeout: Option<String>,
293 : pub compaction_target_size: Option<u64>,
294 : pub compaction_period: Option<String>,
295 : pub compaction_threshold: Option<usize>,
296 : // defer parsing compaction_algorithm, like eviction_policy
297 : pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
298 : pub gc_horizon: Option<u64>,
299 : pub gc_period: Option<String>,
300 : pub image_creation_threshold: Option<usize>,
301 : pub pitr_interval: Option<String>,
302 : pub walreceiver_connect_timeout: Option<String>,
303 : pub lagging_wal_timeout: Option<String>,
304 : pub max_lsn_wal_lag: Option<NonZeroU64>,
305 : pub eviction_policy: Option<EvictionPolicy>,
306 : pub min_resident_size_override: Option<u64>,
307 : pub evictions_low_residence_duration_metric_threshold: Option<String>,
308 : pub heatmap_period: Option<String>,
309 : pub lazy_slru_download: Option<bool>,
310 : pub timeline_get_throttle: Option<ThrottleConfig>,
311 : pub image_layer_creation_check_threshold: Option<u8>,
312 : pub switch_aux_file_policy: Option<AuxFilePolicy>,
313 : pub lsn_lease_length: Option<String>,
314 : pub lsn_lease_length_for_ts: Option<String>,
315 : }
316 :
317 : /// The policy for the aux file storage.
318 : ///
319 : /// It can be switched through `switch_aux_file_policy` tenant config.
320 : /// When the first aux file written, the policy will be persisted in the
321 : /// `index_part.json` file and has a limited migration path.
322 : ///
323 : /// Currently, we only allow the following migration path:
324 : ///
325 : /// Unset -> V1
326 : /// -> V2
327 : /// -> CrossValidation -> V2
328 : #[derive(
329 : Eq,
330 : PartialEq,
331 : Debug,
332 : Copy,
333 : Clone,
334 5 : strum_macros::EnumString,
335 0 : strum_macros::Display,
336 0 : serde_with::DeserializeFromStr,
337 : serde_with::SerializeDisplay,
338 : )]
339 : #[strum(serialize_all = "kebab-case")]
340 : pub enum AuxFilePolicy {
341 : /// V1 aux file policy: store everything in AUX_FILE_KEY
342 : #[strum(ascii_case_insensitive)]
343 : V1,
344 : /// V2 aux file policy: store in the AUX_FILE keyspace
345 : #[strum(ascii_case_insensitive)]
346 : V2,
347 : /// Cross validation runs both formats on the write path and does validation
348 : /// on the read path.
349 : #[strum(ascii_case_insensitive)]
350 : CrossValidation,
351 : }
352 :
353 : impl AuxFilePolicy {
354 12 : pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
355 8 : matches!(
356 12 : (from, to),
357 : (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
358 : )
359 12 : }
360 :
361 : /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
362 382 : pub fn default_tenant_config() -> Self {
363 382 : Self::V2
364 382 : }
365 : }
366 :
/// In-memory flag for the aux file policy: holds an `Option<AuxFilePolicy>` inside an
/// atomic integer, where 0 stands for "unspecified".
pub struct AtomicAuxFilePolicy(AtomicUsize);
369 :
370 : impl AtomicAuxFilePolicy {
371 0 : pub fn new(policy: Option<AuxFilePolicy>) -> Self {
372 0 : Self(AtomicUsize::new(
373 0 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
374 0 : ))
375 0 : }
376 :
377 0 : pub fn load(&self) -> Option<AuxFilePolicy> {
378 0 : match self.0.load(std::sync::atomic::Ordering::Acquire) {
379 0 : 0 => None,
380 0 : other => Some(AuxFilePolicy::from_usize(other)),
381 : }
382 0 : }
383 :
384 0 : pub fn store(&self, policy: Option<AuxFilePolicy>) {
385 0 : self.0.store(
386 0 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
387 0 : std::sync::atomic::Ordering::Release,
388 0 : );
389 0 : }
390 : }
391 :
392 : impl AuxFilePolicy {
393 0 : pub fn to_usize(self) -> usize {
394 0 : match self {
395 0 : Self::V1 => 1,
396 0 : Self::CrossValidation => 2,
397 0 : Self::V2 => 3,
398 : }
399 0 : }
400 :
401 0 : pub fn try_from_usize(this: usize) -> Option<Self> {
402 0 : match this {
403 0 : 1 => Some(Self::V1),
404 0 : 2 => Some(Self::CrossValidation),
405 0 : 3 => Some(Self::V2),
406 0 : _ => None,
407 : }
408 0 : }
409 :
410 0 : pub fn from_usize(this: usize) -> Self {
411 0 : Self::try_from_usize(this).unwrap()
412 0 : }
413 : }
414 :
415 0 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
416 : #[serde(tag = "kind")]
417 : pub enum EvictionPolicy {
418 : NoEviction,
419 : LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
420 : OnlyImitiate(EvictionPolicyLayerAccessThreshold),
421 : }
422 :
423 : impl EvictionPolicy {
424 0 : pub fn discriminant_str(&self) -> &'static str {
425 0 : match self {
426 0 : EvictionPolicy::NoEviction => "NoEviction",
427 0 : EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
428 0 : EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
429 : }
430 0 : }
431 : }
432 :
433 : #[derive(
434 : Eq,
435 : PartialEq,
436 : Debug,
437 : Copy,
438 : Clone,
439 0 : strum_macros::EnumString,
440 0 : strum_macros::Display,
441 0 : serde_with::DeserializeFromStr,
442 : serde_with::SerializeDisplay,
443 : )]
444 : #[strum(serialize_all = "kebab-case")]
445 : pub enum CompactionAlgorithm {
446 : Legacy,
447 : Tiered,
448 : }
449 :
450 : #[derive(
451 0 : Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
452 : )]
453 : pub enum ImageCompressionAlgorithm {
454 : // Disabled for writes, support decompressing during read path
455 : Disabled,
456 : /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
457 : /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
458 : Zstd {
459 : level: Option<i8>,
460 : },
461 : }
462 :
463 : impl FromStr for ImageCompressionAlgorithm {
464 : type Err = anyhow::Error;
465 8 : fn from_str(s: &str) -> Result<Self, Self::Err> {
466 8 : let mut components = s.split(['(', ')']);
467 8 : let first = components
468 8 : .next()
469 8 : .ok_or_else(|| anyhow::anyhow!("empty string"))?;
470 8 : match first {
471 8 : "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
472 6 : "zstd" => {
473 6 : let level = if let Some(v) = components.next() {
474 4 : let v: i8 = v.parse()?;
475 4 : Some(v)
476 : } else {
477 2 : None
478 : };
479 :
480 6 : Ok(ImageCompressionAlgorithm::Zstd { level })
481 : }
482 0 : _ => anyhow::bail!("invalid specifier '{first}'"),
483 : }
484 8 : }
485 : }
486 :
487 : impl Display for ImageCompressionAlgorithm {
488 12 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
489 12 : match self {
490 3 : ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
491 9 : ImageCompressionAlgorithm::Zstd { level } => {
492 9 : if let Some(level) = level {
493 6 : write!(f, "zstd({})", level)
494 : } else {
495 3 : write!(f, "zstd")
496 : }
497 : }
498 : }
499 12 : }
500 : }
501 :
502 0 : #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
503 : pub struct CompactionAlgorithmSettings {
504 : pub kind: CompactionAlgorithm,
505 : }
506 :
507 6 : #[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
508 : #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
509 : pub enum L0FlushConfig {
510 : #[serde(rename_all = "snake_case")]
511 : Direct { max_concurrency: NonZeroUsize },
512 : }
513 :
514 0 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
515 : pub struct EvictionPolicyLayerAccessThreshold {
516 : #[serde(with = "humantime_serde")]
517 : pub period: Duration,
518 : #[serde(with = "humantime_serde")]
519 : pub threshold: Duration,
520 : }
521 :
522 0 : #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
523 : pub struct ThrottleConfig {
524 : pub task_kinds: Vec<String>, // TaskKind
525 : pub initial: u32,
526 : #[serde(with = "humantime_serde")]
527 : pub refill_interval: Duration,
528 : pub refill_amount: NonZeroU32,
529 : pub max: u32,
530 : }
531 :
532 : impl ThrottleConfig {
533 382 : pub fn disabled() -> Self {
534 382 : Self {
535 382 : task_kinds: vec![], // effectively disables the throttle
536 382 : // other values don't matter with emtpy `task_kinds`.
537 382 : initial: 0,
538 382 : refill_interval: Duration::from_millis(1),
539 382 : refill_amount: NonZeroU32::new(1).unwrap(),
540 382 : max: 1,
541 382 : }
542 382 : }
543 : /// The requests per second allowed by the given config.
544 0 : pub fn steady_rps(&self) -> f64 {
545 0 : (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
546 0 : }
547 : }
548 :
549 : /// A flattened analog of a `pagesever::tenant::LocationMode`, which
550 : /// lists out all possible states (and the virtual "Detached" state)
551 : /// in a flat form rather than using rust-style enums.
552 0 : #[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
553 : pub enum LocationConfigMode {
554 : AttachedSingle,
555 : AttachedMulti,
556 : AttachedStale,
557 : Secondary,
558 : Detached,
559 : }
560 :
561 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
562 : pub struct LocationConfigSecondary {
563 : pub warm: bool,
564 : }
565 :
566 : /// An alternative representation of `pageserver::tenant::LocationConf`,
567 : /// for use in external-facing APIs.
568 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
569 : pub struct LocationConfig {
570 : pub mode: LocationConfigMode,
571 : /// If attaching, in what generation?
572 : #[serde(default)]
573 : pub generation: Option<u32>,
574 :
575 : // If requesting mode `Secondary`, configuration for that.
576 : #[serde(default)]
577 : pub secondary_conf: Option<LocationConfigSecondary>,
578 :
579 : // Shard parameters: if shard_count is nonzero, then other shard_* fields
580 : // must be set accurately.
581 : #[serde(default)]
582 : pub shard_number: u8,
583 : #[serde(default)]
584 : pub shard_count: u8,
585 : #[serde(default)]
586 : pub shard_stripe_size: u32,
587 :
588 : // This configuration only affects attached mode, but should be provided irrespective
589 : // of the mode, as a secondary location might transition on startup if the response
590 : // to the `/re-attach` control plane API requests it.
591 : pub tenant_conf: TenantConfig,
592 : }
593 :
594 0 : #[derive(Serialize, Deserialize)]
595 : pub struct LocationConfigListResponse {
596 : pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
597 : }
598 :
599 : #[derive(Serialize)]
600 : pub struct StatusResponse {
601 : pub id: NodeId,
602 : }
603 :
604 0 : #[derive(Serialize, Deserialize, Debug)]
605 : #[serde(deny_unknown_fields)]
606 : pub struct TenantLocationConfigRequest {
607 : #[serde(flatten)]
608 : pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
609 : }
610 :
611 0 : #[derive(Serialize, Deserialize, Debug)]
612 : #[serde(deny_unknown_fields)]
613 : pub struct TenantTimeTravelRequest {
614 : pub shard_counts: Vec<ShardCount>,
615 : }
616 :
617 0 : #[derive(Serialize, Deserialize, Debug)]
618 : #[serde(deny_unknown_fields)]
619 : pub struct TenantShardLocation {
620 : pub shard_id: TenantShardId,
621 : pub node_id: NodeId,
622 : }
623 :
624 0 : #[derive(Serialize, Deserialize, Debug)]
625 : #[serde(deny_unknown_fields)]
626 : pub struct TenantLocationConfigResponse {
627 : pub shards: Vec<TenantShardLocation>,
628 : // If the shards' ShardCount count is >1, stripe_size will be set.
629 : pub stripe_size: Option<ShardStripeSize>,
630 : }
631 :
632 3 : #[derive(Serialize, Deserialize, Debug)]
633 : #[serde(deny_unknown_fields)]
634 : pub struct TenantConfigRequest {
635 : pub tenant_id: TenantId,
636 : #[serde(flatten)]
637 : pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
638 : }
639 :
640 : impl std::ops::Deref for TenantConfigRequest {
641 : type Target = TenantConfig;
642 :
643 0 : fn deref(&self) -> &Self::Target {
644 0 : &self.config
645 0 : }
646 : }
647 :
648 : impl TenantConfigRequest {
649 0 : pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
650 0 : let config = TenantConfig::default();
651 0 : TenantConfigRequest { tenant_id, config }
652 0 : }
653 : }
654 :
655 : /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
656 0 : #[derive(Serialize, Deserialize, Clone)]
657 : #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
658 : pub enum TenantAttachmentStatus {
659 : Maybe,
660 : Attached,
661 : Failed { reason: String },
662 : }
663 :
664 0 : #[derive(Serialize, Deserialize, Clone)]
665 : pub struct TenantInfo {
666 : pub id: TenantShardId,
667 : // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
668 : pub state: TenantState,
669 : /// Sum of the size of all layer files.
670 : /// If a layer is present in both local FS and S3, it counts only once.
671 : pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
672 : pub attachment_status: TenantAttachmentStatus,
673 : pub generation: u32,
674 :
675 : /// Opaque explanation if gc is being blocked.
676 : ///
677 : /// Only looked up for the individual tenant detail, not the listing. This is purely for
678 : /// debugging, not included in openapi.
679 : #[serde(skip_serializing_if = "Option::is_none")]
680 : pub gc_blocking: Option<String>,
681 : }
682 :
683 0 : #[derive(Serialize, Deserialize, Clone)]
684 : pub struct TenantDetails {
685 : #[serde(flatten)]
686 : pub tenant_info: TenantInfo,
687 :
688 : pub walredo: Option<WalRedoManagerStatus>,
689 :
690 : pub timelines: Vec<TimelineId>,
691 : }
692 :
693 0 : #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
694 : pub enum TimelineArchivalState {
695 : Archived,
696 : Unarchived,
697 : }
698 :
699 0 : #[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
700 : pub struct TimelineArchivalConfigRequest {
701 : pub state: TimelineArchivalState,
702 : }
703 :
704 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
705 : pub struct TimelinesInfoAndOffloaded {
706 : pub timelines: Vec<TimelineInfo>,
707 : pub offloaded: Vec<OffloadedTimelineInfo>,
708 : }
709 :
710 : /// Analog of [`TimelineInfo`] for offloaded timelines.
711 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
712 : pub struct OffloadedTimelineInfo {
713 : pub tenant_id: TenantShardId,
714 : pub timeline_id: TimelineId,
715 : /// Whether the timeline has a parent it has been branched off from or not
716 : pub ancestor_timeline_id: Option<TimelineId>,
717 : /// Whether to retain the branch lsn at the ancestor or not
718 : pub ancestor_retain_lsn: Option<Lsn>,
719 : /// The time point when the timeline was archived
720 : pub archived_at: chrono::DateTime<chrono::Utc>,
721 : }
722 :
723 : /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
724 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
725 : pub struct TimelineInfo {
726 : pub tenant_id: TenantShardId,
727 : pub timeline_id: TimelineId,
728 :
729 : pub ancestor_timeline_id: Option<TimelineId>,
730 : pub ancestor_lsn: Option<Lsn>,
731 : pub last_record_lsn: Lsn,
732 : pub prev_record_lsn: Option<Lsn>,
733 : pub latest_gc_cutoff_lsn: Lsn,
734 : pub disk_consistent_lsn: Lsn,
735 :
736 : /// The LSN that we have succesfully uploaded to remote storage
737 : pub remote_consistent_lsn: Lsn,
738 :
739 : /// The LSN that we are advertizing to safekeepers
740 : pub remote_consistent_lsn_visible: Lsn,
741 :
742 : /// The LSN from the start of the root timeline (never changes)
743 : pub initdb_lsn: Lsn,
744 :
745 : pub current_logical_size: u64,
746 : pub current_logical_size_is_accurate: bool,
747 :
748 : pub directory_entries_counts: Vec<u64>,
749 :
750 : /// Sum of the size of all layer files.
751 : /// If a layer is present in both local FS and S3, it counts only once.
752 : pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
753 : pub current_logical_size_non_incremental: Option<u64>,
754 :
755 : /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
756 : /// beyond the branch's branch point, we only count up to the branch point.
757 : pub pitr_history_size: u64,
758 :
759 : /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
760 : /// ancestor data used by this branch would have been retained anyway). If this is false, then
761 : /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
762 : /// otherwise be able to GC.
763 : pub within_ancestor_pitr: bool,
764 :
765 : pub timeline_dir_layer_file_size_sum: Option<u64>,
766 :
767 : pub wal_source_connstr: Option<String>,
768 : pub last_received_msg_lsn: Option<Lsn>,
769 : /// the timestamp (in microseconds) of the last received message
770 : pub last_received_msg_ts: Option<u128>,
771 : pub pg_version: u32,
772 :
773 : pub state: TimelineState,
774 :
775 : pub walreceiver_status: String,
776 :
777 : // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
778 : // Backward compatibility: you will get a JSON not containing the newly-added field.
779 : // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
780 : // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
781 : // read.
782 : pub is_archived: Option<bool>,
783 : }
784 :
785 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
786 : pub struct LayerMapInfo {
787 : pub in_memory_layers: Vec<InMemoryLayerInfo>,
788 : pub historic_layers: Vec<HistoricLayerInfo>,
789 : }
790 :
791 : /// The residence status of a layer
792 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
793 : pub enum LayerResidenceStatus {
794 : /// Residence status for a layer file that exists locally.
795 : /// It may also exist on the remote, we don't care here.
796 : Resident,
797 : /// Residence status for a layer file that only exists on the remote.
798 : Evicted,
799 : }
800 :
801 : #[serde_as]
802 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
803 : pub struct LayerAccessStats {
804 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
805 : pub access_time: SystemTime,
806 :
807 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
808 : pub residence_time: SystemTime,
809 :
810 : pub visible: bool,
811 : }
812 :
813 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
814 : #[serde(tag = "kind")]
815 : pub enum InMemoryLayerInfo {
816 : Open { lsn_start: Lsn },
817 : Frozen { lsn_start: Lsn, lsn_end: Lsn },
818 : }
819 :
820 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
821 : #[serde(tag = "kind")]
822 : pub enum HistoricLayerInfo {
823 : Delta {
824 : layer_file_name: String,
825 : layer_file_size: u64,
826 :
827 : lsn_start: Lsn,
828 : lsn_end: Lsn,
829 : remote: bool,
830 : access_stats: LayerAccessStats,
831 :
832 : l0: bool,
833 : },
834 : Image {
835 : layer_file_name: String,
836 : layer_file_size: u64,
837 :
838 : lsn_start: Lsn,
839 : remote: bool,
840 : access_stats: LayerAccessStats,
841 : },
842 : }
843 :
844 : impl HistoricLayerInfo {
845 0 : pub fn layer_file_name(&self) -> &str {
846 0 : match self {
847 : HistoricLayerInfo::Delta {
848 0 : layer_file_name, ..
849 0 : } => layer_file_name,
850 : HistoricLayerInfo::Image {
851 0 : layer_file_name, ..
852 0 : } => layer_file_name,
853 : }
854 0 : }
855 0 : pub fn is_remote(&self) -> bool {
856 0 : match self {
857 0 : HistoricLayerInfo::Delta { remote, .. } => *remote,
858 0 : HistoricLayerInfo::Image { remote, .. } => *remote,
859 : }
860 0 : }
861 0 : pub fn set_remote(&mut self, value: bool) {
862 0 : let field = match self {
863 0 : HistoricLayerInfo::Delta { remote, .. } => remote,
864 0 : HistoricLayerInfo::Image { remote, .. } => remote,
865 : };
866 0 : *field = value;
867 0 : }
868 0 : pub fn layer_file_size(&self) -> u64 {
869 0 : match self {
870 : HistoricLayerInfo::Delta {
871 0 : layer_file_size, ..
872 0 : } => *layer_file_size,
873 : HistoricLayerInfo::Image {
874 0 : layer_file_size, ..
875 0 : } => *layer_file_size,
876 : }
877 0 : }
878 : }
879 :
880 0 : #[derive(Debug, Serialize, Deserialize)]
881 : pub struct DownloadRemoteLayersTaskSpawnRequest {
882 : pub max_concurrent_downloads: NonZeroUsize,
883 : }
884 :
885 0 : #[derive(Debug, Serialize, Deserialize)]
886 : pub struct IngestAuxFilesRequest {
887 : pub aux_files: HashMap<String, String>,
888 : }
889 :
890 0 : #[derive(Debug, Serialize, Deserialize)]
891 : pub struct ListAuxFilesRequest {
892 : pub lsn: Lsn,
893 : }
894 :
895 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
896 : pub struct DownloadRemoteLayersTaskInfo {
897 : pub task_id: String,
898 : pub state: DownloadRemoteLayersTaskState,
899 : pub total_layer_count: u64, // stable once `completed`
900 : pub successful_download_count: u64, // stable once `completed`
901 : pub failed_download_count: u64, // stable once `completed`
902 : }
903 :
904 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
905 : pub enum DownloadRemoteLayersTaskState {
906 : Running,
907 : Completed,
908 : ShutDown,
909 : }
910 :
911 0 : #[derive(Debug, Serialize, Deserialize)]
912 : pub struct TimelineGcRequest {
913 : pub gc_horizon: Option<u64>,
914 : }
915 :
916 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
917 : pub struct WalRedoManagerProcessStatus {
918 : pub pid: u32,
919 : }
920 :
921 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
922 : pub struct WalRedoManagerStatus {
923 : pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
924 : pub process: Option<WalRedoManagerProcessStatus>,
925 : }
926 :
927 : /// The progress of a secondary tenant.
928 : ///
929 : /// It is mostly useful when doing a long running download: e.g. initiating
930 : /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
931 : /// what's happening.
932 0 : #[derive(Default, Debug, Serialize, Deserialize, Clone)]
933 : pub struct SecondaryProgress {
934 : /// The remote storage LastModified time of the heatmap object we last downloaded.
935 : pub heatmap_mtime: Option<serde_system_time::SystemTime>,
936 :
937 : /// The number of layers currently on-disk
938 : pub layers_downloaded: usize,
939 : /// The number of layers in the most recently seen heatmap
940 : pub layers_total: usize,
941 :
942 : /// The number of layer bytes currently on-disk
943 : pub bytes_downloaded: u64,
944 : /// The number of layer bytes in the most recently seen heatmap
945 : pub bytes_total: u64,
946 : }
947 :
948 0 : #[derive(Serialize, Deserialize, Debug)]
949 : pub struct TenantScanRemoteStorageShard {
950 : pub tenant_shard_id: TenantShardId,
951 : pub generation: Option<u32>,
952 : }
953 :
954 0 : #[derive(Serialize, Deserialize, Debug, Default)]
955 : pub struct TenantScanRemoteStorageResponse {
956 : pub shards: Vec<TenantScanRemoteStorageShard>,
957 : }
958 :
959 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
960 : #[serde(rename_all = "snake_case")]
961 : pub enum TenantSorting {
962 : ResidentSize,
963 : MaxLogicalSize,
964 : }
965 :
966 : impl Default for TenantSorting {
967 0 : fn default() -> Self {
968 0 : Self::ResidentSize
969 0 : }
970 : }
971 :
972 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
973 : pub struct TopTenantShardsRequest {
974 : // How would you like to sort the tenants?
975 : pub order_by: TenantSorting,
976 :
977 : // How many results?
978 : pub limit: usize,
979 :
980 : // Omit tenants with more than this many shards (e.g. if this is the max number of shards
981 : // that the caller would ever split to)
982 : pub where_shards_lt: Option<ShardCount>,
983 :
984 : // Omit tenants where the ordering metric is less than this (this is an optimization to
985 : // let us quickly exclude numerous tiny shards)
986 : pub where_gt: Option<u64>,
987 : }
988 :
989 0 : #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
990 : pub struct TopTenantShardItem {
991 : pub id: TenantShardId,
992 :
993 : /// Total size of layers on local disk for all timelines in this tenant
994 : pub resident_size: u64,
995 :
996 : /// Total size of layers in remote storage for all timelines in this tenant
997 : pub physical_size: u64,
998 :
999 : /// The largest logical size of a timeline within this tenant
1000 : pub max_logical_size: u64,
1001 : }
1002 :
1003 0 : #[derive(Serialize, Deserialize, Debug, Default)]
1004 : pub struct TopTenantShardsResponse {
1005 : pub shards: Vec<TopTenantShardItem>,
1006 : }
1007 :
1008 : pub mod virtual_file {
1009 : #[derive(
1010 : Copy,
1011 : Clone,
1012 : PartialEq,
1013 : Eq,
1014 : Hash,
1015 198 : strum_macros::EnumString,
1016 0 : strum_macros::Display,
1017 0 : serde_with::DeserializeFromStr,
1018 : serde_with::SerializeDisplay,
1019 : Debug,
1020 : )]
1021 : #[strum(serialize_all = "kebab-case")]
1022 : pub enum IoEngineKind {
1023 : StdFs,
1024 : #[cfg(target_os = "linux")]
1025 : TokioEpollUring,
1026 : }
1027 :
1028 : /// Direct IO modes for a pageserver.
1029 : #[derive(
1030 : Copy,
1031 : Clone,
1032 : PartialEq,
1033 : Eq,
1034 : Hash,
1035 0 : strum_macros::EnumString,
1036 0 : strum_macros::Display,
1037 0 : serde_with::DeserializeFromStr,
1038 : serde_with::SerializeDisplay,
1039 : Debug,
1040 : )]
1041 : #[strum(serialize_all = "kebab-case")]
1042 : #[repr(u8)]
1043 : pub enum IoMode {
1044 : /// Uses buffered IO.
1045 : Buffered,
1046 : /// Uses direct IO, error out if the operation fails.
1047 : #[cfg(target_os = "linux")]
1048 : Direct,
1049 : }
1050 :
1051 : impl IoMode {
1052 204 : pub const fn preferred() -> Self {
1053 204 : Self::Buffered
1054 204 : }
1055 : }
1056 :
1057 : impl TryFrom<u8> for IoMode {
1058 : type Error = u8;
1059 :
1060 1148 : fn try_from(value: u8) -> Result<Self, Self::Error> {
1061 1148 : Ok(match value {
1062 1148 : v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
1063 : #[cfg(target_os = "linux")]
1064 0 : v if v == (IoMode::Direct as u8) => IoMode::Direct,
1065 0 : x => return Err(x),
1066 : })
1067 1148 : }
1068 : }
1069 : }
1070 :
1071 : // Wrapped in libpq CopyData
1072 : #[derive(PartialEq, Eq, Debug)]
1073 : pub enum PagestreamFeMessage {
1074 : Exists(PagestreamExistsRequest),
1075 : Nblocks(PagestreamNblocksRequest),
1076 : GetPage(PagestreamGetPageRequest),
1077 : DbSize(PagestreamDbSizeRequest),
1078 : GetSlruSegment(PagestreamGetSlruSegmentRequest),
1079 : }
1080 :
1081 : // Wrapped in libpq CopyData
1082 0 : #[derive(strum_macros::EnumProperty)]
1083 : pub enum PagestreamBeMessage {
1084 : Exists(PagestreamExistsResponse),
1085 : Nblocks(PagestreamNblocksResponse),
1086 : GetPage(PagestreamGetPageResponse),
1087 : Error(PagestreamErrorResponse),
1088 : DbSize(PagestreamDbSizeResponse),
1089 : GetSlruSegment(PagestreamGetSlruSegmentResponse),
1090 : }
1091 :
/// One-byte tag identifying the kind of a pagestream response.
///
/// Keep in sync with `pagestore_client.h`.
#[repr(u8)]
enum PagestreamBeMessageTag {
    Exists = 100,
    Nblocks = 101,
    GetPage = 102,
    Error = 103,
    DbSize = 104,
    GetSlruSegment = 105,
}

impl TryFrom<u8> for PagestreamBeMessageTag {
    type Error = u8;

    /// Decodes a raw tag byte; a byte outside `100..=105` is returned unchanged as the
    /// error value.
    // The return type spells out `u8` instead of `Self::Error` because the enum also
    // has a variant named `Error`.
    fn try_from(value: u8) -> Result<Self, u8> {
        use PagestreamBeMessageTag as Tag;

        let tag = match value {
            100 => Tag::Exists,
            101 => Tag::Nblocks,
            102 => Tag::GetPage,
            103 => Tag::Error,
            104 => Tag::DbSize,
            105 => Tag::GetSlruSegment,
            unknown => return Err(unknown),
        };
        Ok(tag)
    }
}
1116 :
1117 : // A GetPage request contains two LSN values:
1118 : //
1119 : // request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
1120 : // "get the latest version present". It's used by the primary server, which knows that no one else
1121 : // is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
1122 : // Lsn::Max. Standby servers use the current replay LSN as the request LSN.
1123 : //
1124 : // not_modified_since: Hint to the pageserver that the client knows that the page has not been
1125 : // modified between 'not_modified_since' and the request LSN. It's always correct to set
1126 : // 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
1127 : // passing an earlier LSN can speed up the request, by allowing the pageserver to process the
1128 : // request without waiting for 'request_lsn' to arrive.
1129 : //
1130 : // The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
1131 : // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
1132 : // 'latest' was set to true. The V2 interface was added because there was no correct way for a
1133 : // standby to request a page at a particular non-latest LSN, and also include the
1134 : // 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
1135 : // request, if the standby knows that the page hasn't been modified since, and risk getting an error
1136 : // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
1137 : // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
1138 : // interface allows sending both LSNs, and lets the pageserver do the right thing. There was no
1139 : // difference in the responses between V1 and V2.
1140 : //
/// Version of the pagestream protocol; `V2` is the only variant defined here.
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
    /// Requests carry both a request LSN and a `not_modified_since` LSN.
    V2,
}
1145 :
1146 : #[derive(Debug, PartialEq, Eq)]
1147 : pub struct PagestreamExistsRequest {
1148 : pub request_lsn: Lsn,
1149 : pub not_modified_since: Lsn,
1150 : pub rel: RelTag,
1151 : }
1152 :
1153 : #[derive(Debug, PartialEq, Eq)]
1154 : pub struct PagestreamNblocksRequest {
1155 : pub request_lsn: Lsn,
1156 : pub not_modified_since: Lsn,
1157 : pub rel: RelTag,
1158 : }
1159 :
1160 : #[derive(Debug, PartialEq, Eq)]
1161 : pub struct PagestreamGetPageRequest {
1162 : pub request_lsn: Lsn,
1163 : pub not_modified_since: Lsn,
1164 : pub rel: RelTag,
1165 : pub blkno: u32,
1166 : }
1167 :
1168 : #[derive(Debug, PartialEq, Eq)]
1169 : pub struct PagestreamDbSizeRequest {
1170 : pub request_lsn: Lsn,
1171 : pub not_modified_since: Lsn,
1172 : pub dbnode: u32,
1173 : }
1174 :
1175 : #[derive(Debug, PartialEq, Eq)]
1176 : pub struct PagestreamGetSlruSegmentRequest {
1177 : pub request_lsn: Lsn,
1178 : pub not_modified_since: Lsn,
1179 : pub kind: u8,
1180 : pub segno: u32,
1181 : }
1182 :
/// Payload of [`PagestreamBeMessage::Exists`].
#[derive(Debug)]
pub struct PagestreamExistsResponse {
    /// Answer to the exists request; travels on the wire as a single byte (0 = false).
    pub exists: bool,
}
1187 :
/// Payload of [`PagestreamBeMessage::Nblocks`].
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
    /// Block count answering the request; travels on the wire as a `u32`.
    pub n_blocks: u32,
}
1192 :
1193 : #[derive(Debug)]
1194 : pub struct PagestreamGetPageResponse {
1195 : pub page: Bytes,
1196 : }
1197 :
1198 : #[derive(Debug)]
1199 : pub struct PagestreamGetSlruSegmentResponse {
1200 : pub segment: Bytes,
1201 : }
1202 :
/// Payload of [`PagestreamBeMessage::Error`].
#[derive(Debug)]
pub struct PagestreamErrorResponse {
    /// Error text; written to the wire followed by a NUL terminator.
    pub message: String,
}
1207 :
/// Payload of [`PagestreamBeMessage::DbSize`].
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
    /// Database size as a signed 64-bit value; travels on the wire as an `i64`.
    pub db_size: i64,
}
1212 :
1213 : // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
1214 : // that require pageserver-internal types. It is sufficient to get the total size.
1215 0 : #[derive(Serialize, Deserialize, Debug)]
1216 : pub struct TenantHistorySize {
1217 : pub id: TenantId,
1218 : /// Size is a mixture of WAL and logical size, so the unit is bytes.
1219 : ///
1220 : /// Will be none if `?inputs_only=true` was given.
1221 : pub size: Option<u64>,
1222 : }
1223 :
1224 : impl PagestreamFeMessage {
1225 : /// Serialize a compute -> pageserver message. This is currently only used in testing
1226 : /// tools. Always uses protocol version 2.
1227 4 : pub fn serialize(&self) -> Bytes {
1228 4 : let mut bytes = BytesMut::new();
1229 4 :
1230 4 : match self {
1231 1 : Self::Exists(req) => {
1232 1 : bytes.put_u8(0);
1233 1 : bytes.put_u64(req.request_lsn.0);
1234 1 : bytes.put_u64(req.not_modified_since.0);
1235 1 : bytes.put_u32(req.rel.spcnode);
1236 1 : bytes.put_u32(req.rel.dbnode);
1237 1 : bytes.put_u32(req.rel.relnode);
1238 1 : bytes.put_u8(req.rel.forknum);
1239 1 : }
1240 :
1241 1 : Self::Nblocks(req) => {
1242 1 : bytes.put_u8(1);
1243 1 : bytes.put_u64(req.request_lsn.0);
1244 1 : bytes.put_u64(req.not_modified_since.0);
1245 1 : bytes.put_u32(req.rel.spcnode);
1246 1 : bytes.put_u32(req.rel.dbnode);
1247 1 : bytes.put_u32(req.rel.relnode);
1248 1 : bytes.put_u8(req.rel.forknum);
1249 1 : }
1250 :
1251 1 : Self::GetPage(req) => {
1252 1 : bytes.put_u8(2);
1253 1 : bytes.put_u64(req.request_lsn.0);
1254 1 : bytes.put_u64(req.not_modified_since.0);
1255 1 : bytes.put_u32(req.rel.spcnode);
1256 1 : bytes.put_u32(req.rel.dbnode);
1257 1 : bytes.put_u32(req.rel.relnode);
1258 1 : bytes.put_u8(req.rel.forknum);
1259 1 : bytes.put_u32(req.blkno);
1260 1 : }
1261 :
1262 1 : Self::DbSize(req) => {
1263 1 : bytes.put_u8(3);
1264 1 : bytes.put_u64(req.request_lsn.0);
1265 1 : bytes.put_u64(req.not_modified_since.0);
1266 1 : bytes.put_u32(req.dbnode);
1267 1 : }
1268 :
1269 0 : Self::GetSlruSegment(req) => {
1270 0 : bytes.put_u8(4);
1271 0 : bytes.put_u64(req.request_lsn.0);
1272 0 : bytes.put_u64(req.not_modified_since.0);
1273 0 : bytes.put_u8(req.kind);
1274 0 : bytes.put_u32(req.segno);
1275 0 : }
1276 : }
1277 :
1278 4 : bytes.into()
1279 4 : }
1280 :
1281 4 : pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
1282 : // these correspond to the NeonMessageTag enum in pagestore_client.h
1283 : //
1284 : // TODO: consider using protobuf or serde bincode for less error prone
1285 : // serialization.
1286 4 : let msg_tag = body.read_u8()?;
1287 :
1288 : // these two fields are the same for every request type
1289 4 : let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
1290 4 : let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
1291 :
1292 4 : match msg_tag {
1293 : 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
1294 1 : request_lsn,
1295 1 : not_modified_since,
1296 1 : rel: RelTag {
1297 1 : spcnode: body.read_u32::<BigEndian>()?,
1298 1 : dbnode: body.read_u32::<BigEndian>()?,
1299 1 : relnode: body.read_u32::<BigEndian>()?,
1300 1 : forknum: body.read_u8()?,
1301 : },
1302 : })),
1303 : 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1304 1 : request_lsn,
1305 1 : not_modified_since,
1306 1 : rel: RelTag {
1307 1 : spcnode: body.read_u32::<BigEndian>()?,
1308 1 : dbnode: body.read_u32::<BigEndian>()?,
1309 1 : relnode: body.read_u32::<BigEndian>()?,
1310 1 : forknum: body.read_u8()?,
1311 : },
1312 : })),
1313 : 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1314 1 : request_lsn,
1315 1 : not_modified_since,
1316 1 : rel: RelTag {
1317 1 : spcnode: body.read_u32::<BigEndian>()?,
1318 1 : dbnode: body.read_u32::<BigEndian>()?,
1319 1 : relnode: body.read_u32::<BigEndian>()?,
1320 1 : forknum: body.read_u8()?,
1321 : },
1322 1 : blkno: body.read_u32::<BigEndian>()?,
1323 : })),
1324 : 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1325 1 : request_lsn,
1326 1 : not_modified_since,
1327 1 : dbnode: body.read_u32::<BigEndian>()?,
1328 : })),
1329 : 4 => Ok(PagestreamFeMessage::GetSlruSegment(
1330 : PagestreamGetSlruSegmentRequest {
1331 0 : request_lsn,
1332 0 : not_modified_since,
1333 0 : kind: body.read_u8()?,
1334 0 : segno: body.read_u32::<BigEndian>()?,
1335 : },
1336 : )),
1337 0 : _ => bail!("unknown smgr message tag: {:?}", msg_tag),
1338 : }
1339 4 : }
1340 : }
1341 :
1342 : impl PagestreamBeMessage {
1343 0 : pub fn serialize(&self) -> Bytes {
1344 0 : let mut bytes = BytesMut::new();
1345 :
1346 : use PagestreamBeMessageTag as Tag;
1347 0 : match self {
1348 0 : Self::Exists(resp) => {
1349 0 : bytes.put_u8(Tag::Exists as u8);
1350 0 : bytes.put_u8(resp.exists as u8);
1351 0 : }
1352 :
1353 0 : Self::Nblocks(resp) => {
1354 0 : bytes.put_u8(Tag::Nblocks as u8);
1355 0 : bytes.put_u32(resp.n_blocks);
1356 0 : }
1357 :
1358 0 : Self::GetPage(resp) => {
1359 0 : bytes.put_u8(Tag::GetPage as u8);
1360 0 : bytes.put(&resp.page[..]);
1361 0 : }
1362 :
1363 0 : Self::Error(resp) => {
1364 0 : bytes.put_u8(Tag::Error as u8);
1365 0 : bytes.put(resp.message.as_bytes());
1366 0 : bytes.put_u8(0); // null terminator
1367 0 : }
1368 0 : Self::DbSize(resp) => {
1369 0 : bytes.put_u8(Tag::DbSize as u8);
1370 0 : bytes.put_i64(resp.db_size);
1371 0 : }
1372 :
1373 0 : Self::GetSlruSegment(resp) => {
1374 0 : bytes.put_u8(Tag::GetSlruSegment as u8);
1375 0 : bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
1376 0 : bytes.put(&resp.segment[..]);
1377 0 : }
1378 : }
1379 :
1380 0 : bytes.into()
1381 0 : }
1382 :
1383 0 : pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
1384 0 : let mut buf = buf.reader();
1385 0 : let msg_tag = buf.read_u8()?;
1386 :
1387 : use PagestreamBeMessageTag as Tag;
1388 0 : let ok =
1389 0 : match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
1390 : Tag::Exists => {
1391 0 : let exists = buf.read_u8()?;
1392 0 : Self::Exists(PagestreamExistsResponse {
1393 0 : exists: exists != 0,
1394 0 : })
1395 : }
1396 : Tag::Nblocks => {
1397 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1398 0 : Self::Nblocks(PagestreamNblocksResponse { n_blocks })
1399 : }
1400 : Tag::GetPage => {
1401 0 : let mut page = vec![0; 8192]; // TODO: use MaybeUninit
1402 0 : buf.read_exact(&mut page)?;
1403 0 : PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
1404 : }
1405 : Tag::Error => {
1406 0 : let mut msg = Vec::new();
1407 0 : buf.read_until(0, &mut msg)?;
1408 0 : let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
1409 0 : let rust_str = cstring.to_str()?;
1410 0 : PagestreamBeMessage::Error(PagestreamErrorResponse {
1411 0 : message: rust_str.to_owned(),
1412 0 : })
1413 : }
1414 : Tag::DbSize => {
1415 0 : let db_size = buf.read_i64::<BigEndian>()?;
1416 0 : Self::DbSize(PagestreamDbSizeResponse { db_size })
1417 : }
1418 : Tag::GetSlruSegment => {
1419 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1420 0 : let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
1421 0 : buf.read_exact(&mut segment)?;
1422 0 : Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
1423 0 : segment: segment.into(),
1424 0 : })
1425 : }
1426 : };
1427 0 : let remaining = buf.into_inner();
1428 0 : if !remaining.is_empty() {
1429 0 : anyhow::bail!(
1430 0 : "remaining bytes in msg with tag={msg_tag}: {}",
1431 0 : remaining.len()
1432 0 : );
1433 0 : }
1434 0 : Ok(ok)
1435 0 : }
1436 :
1437 0 : pub fn kind(&self) -> &'static str {
1438 0 : match self {
1439 0 : Self::Exists(_) => "Exists",
1440 0 : Self::Nblocks(_) => "Nblocks",
1441 0 : Self::GetPage(_) => "GetPage",
1442 0 : Self::Error(_) => "Error",
1443 0 : Self::DbSize(_) => "DbSize",
1444 0 : Self::GetSlruSegment(_) => "GetSlruSegment",
1445 : }
1446 0 : }
1447 : }
1448 :
#[cfg(test)]
mod tests {
    use serde_json::json;
    use std::str::FromStr;

    use super::*;

    /// Each request built here must come back unchanged from a serialize -> parse
    /// round trip.
    #[test]
    fn test_pagestream() {
        // All relation-based requests below use the same relation tag.
        let rel = || RelTag {
            forknum: 1,
            spcnode: 2,
            dbnode: 3,
            relnode: 4,
        };

        let requests = [
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: rel(),
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(4),
                rel: rel(),
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: rel(),
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                dbnode: 7,
            }),
        ];

        for original in requests {
            let wire = original.serialize();
            let reparsed = PagestreamFeMessage::parse(&mut wire.reader()).unwrap();
            assert!(original == reparsed);
        }
    }

    /// `TenantInfo` must serialize to the expected JSON for an active and a broken
    /// tenant, and the `Debug` output of a broken state must expose its details.
    #[test]
    fn test_tenantinfo_serde() {
        // Active tenant: the state has a slug only.
        let active = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let active_json = json!({
            "id": active.id.to_string(),
            "state": {
                "slug": "Active",
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });
        assert_eq!(serde_json::to_value(&active).unwrap(), active_json);

        // Broken tenant: the state additionally carries reason and backtrace.
        let broken = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let broken_json = json!({
            "id": broken.id.to_string(),
            "state": {
                "slug": "Broken",
                "data": {
                    "backtrace": "backtrace info",
                    "reason": "reason",
                }
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });
        assert_eq!(serde_json::to_value(&broken).unwrap(), broken_json);

        let state_debug = format!("{:?}", &broken.state);
        assert!(state_debug.contains("reason"));
        assert!(state_debug.contains("backtrace info"));
    }

    /// Deserializing a `TenantConfigRequest` with an unexpected key must fail and name
    /// the offending field.
    #[test]
    fn test_reject_unknown_field() {
        let tenant_id = TenantId::generate();
        let payload = json!({
            "tenant_id": tenant_id.to_string(),
            "unknown_field": "unknown_value".to_string(),
        });

        let err = serde_json::from_value::<TenantConfigRequest>(payload).unwrap_err();
        let rendered = err.to_string();
        assert!(
            rendered.contains("unknown field `unknown_field`"),
            "expect unknown field `unknown_field` error, got: {}",
            err
        );
    }

    /// `Activating` must serialize with slug and data, and parse back to the same value.
    #[test]
    fn tenantstatus_activating_serde() {
        let states = [TenantState::Activating(ActivatingFrom::Attaching)];
        let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";

        let serialized = serde_json::to_string(&states).unwrap();
        assert_eq!(serialized, expected);

        let roundtripped = serde_json::from_str::<Vec<TenantState>>(&serialized).unwrap();
        assert_eq!(states.as_slice(), &roundtripped);
    }

    #[test]
    fn tenantstatus_activating_strum() {
        // tests added, because we use these for metrics
        let examples = [
            (line!(), TenantState::Attaching, "Attaching"),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Attaching),
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
            (
                line!(),
                TenantState::Stopping {
                    progress: utils::completion::Barrier::default(),
                },
                "Stopping",
            ),
            (
                line!(),
                TenantState::Broken {
                    reason: "Example".into(),
                    backtrace: "Looooong backtrace".into(),
                },
                "Broken",
            ),
        ];

        for (line, state, expected) in examples {
            let label: &'static str = state.into();
            assert_eq!(label, expected, "example on {line}");
        }
    }

    /// Checks every (current policy, target policy) pair against the expected verdict.
    #[test]
    fn test_aux_file_migration_path() {
        use AuxFilePolicy::{CrossValidation, V1, V2};

        let cases = [
            // Starting from "no policy", every target is reachable.
            (None, V1, true),
            (None, V2, true),
            (None, CrossValidation, true),
            // Self-migration is not a valid migration path, and the caller should handle
            // it by itself.
            (Some(V1), V1, false),
            (Some(V2), V2, false),
            (Some(CrossValidation), CrossValidation, false),
            // Migrations not allowed
            (Some(CrossValidation), V1, false),
            (Some(V1), V2, false),
            (Some(V2), V1, false),
            (Some(V2), CrossValidation, false),
            (Some(V1), CrossValidation, false),
            // Migrations allowed
            (Some(CrossValidation), V2, true),
        ];

        for (idx, (from, to, allowed)) in cases.into_iter().enumerate() {
            assert_eq!(
                AuxFilePolicy::is_valid_migration_path(from, to),
                allowed,
                "migration case #{idx}"
            );
        }
    }

    /// The parser must accept both upper- and lower-case `v2` and the kebab-case
    /// `cross-validation`.
    #[test]
    fn test_aux_parse() {
        let parse = |text: &str| AuxFilePolicy::from_str(text).unwrap();

        assert_eq!(parse("V2"), AuxFilePolicy::V2);
        assert_eq!(parse("v2"), AuxFilePolicy::V2);
        assert_eq!(parse("cross-validation"), AuxFilePolicy::CrossValidation);
    }

    /// `FromStr`, `Display` and serde must all agree on the textual form of
    /// `ImageCompressionAlgorithm`.
    #[test]
    fn test_image_compression_algorithm_parsing() {
        use ImageCompressionAlgorithm::*;
        let cases = [
            ("disabled", Disabled),
            ("zstd", Zstd { level: None }),
            ("zstd(18)", Zstd { level: Some(18) }),
            ("zstd(-3)", Zstd { level: Some(-3) }),
        ];

        for (text, algorithm) in cases {
            let parsed = ImageCompressionAlgorithm::from_str(text).unwrap();
            assert_eq!(parsed, algorithm, "parsing works");
            assert_eq!(format!("{algorithm}"), text, "Display FromStr roundtrip");

            let json_text = serde_json::to_string(&algorithm).expect("serialization");
            assert_eq!(
                serde_json::from_str::<ImageCompressionAlgorithm>(&json_text).unwrap(),
                algorithm,
                "serde roundtrip"
            );

            assert_eq!(
                serde_json::Value::String(text.to_string()),
                serde_json::to_value(algorithm).unwrap(),
                "Display is the serde serialization"
            );
        }
    }
}
|