Line data Source code
1 : pub mod detach_ancestor;
2 : pub mod partitioning;
3 : pub mod utilization;
4 :
5 : pub use utilization::PageserverUtilization;
6 :
7 : use std::{
8 : collections::HashMap,
9 : io::{BufRead, Read},
10 : num::{NonZeroU64, NonZeroUsize},
11 : str::FromStr,
12 : sync::atomic::AtomicUsize,
13 : time::{Duration, SystemTime},
14 : };
15 :
16 : use byteorder::{BigEndian, ReadBytesExt};
17 : use postgres_ffi::BLCKSZ;
18 : use serde::{Deserialize, Serialize};
19 : use serde_with::serde_as;
20 : use utils::{
21 : completion,
22 : id::{NodeId, TenantId, TimelineId},
23 : lsn::Lsn,
24 : serde_system_time,
25 : };
26 :
27 : use crate::{
28 : reltag::RelTag,
29 : shard::{ShardCount, ShardStripeSize, TenantShardId},
30 : };
31 : use anyhow::bail;
32 : use bytes::{Buf, BufMut, Bytes, BytesMut};
33 :
34 : /// The state of a tenant in this pageserver.
35 : ///
36 : /// ```mermaid
37 : /// stateDiagram-v2
38 : ///
39 : /// [*] --> Loading: spawn_load()
40 : /// [*] --> Attaching: spawn_attach()
41 : ///
42 : /// Loading --> Activating: activate()
43 : /// Attaching --> Activating: activate()
44 : /// Activating --> Active: infallible
45 : ///
46 : /// Loading --> Broken: load() failure
47 : /// Attaching --> Broken: attach() failure
48 : ///
49 : /// Active --> Stopping: set_stopping(), part of shutdown & detach
50 : /// Stopping --> Broken: late error in remove_tenant_from_memory
51 : ///
52 : /// Broken --> [*]: ignore / detach / shutdown
53 : /// Stopping --> [*]: remove_from_memory complete
54 : ///
55 : /// Active --> Broken: cfg(testing)-only tenant break point
56 : /// ```
57 : #[derive(
58 : Clone,
59 : PartialEq,
60 : Eq,
61 2 : serde::Serialize,
62 12 : serde::Deserialize,
63 0 : strum_macros::Display,
64 : strum_macros::EnumVariantNames,
65 0 : strum_macros::AsRefStr,
66 376 : strum_macros::IntoStaticStr,
67 : )]
68 : #[serde(tag = "slug", content = "data")]
69 : pub enum TenantState {
70 : /// This tenant is being loaded from local disk.
71 : ///
72 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
73 : Loading,
74 : /// This tenant is being attached to the pageserver.
75 : ///
76 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
77 : Attaching,
78 : /// The tenant is transitioning from Loading/Attaching to Active.
79 : ///
80 : /// While in this state, the individual timelines are being activated.
81 : ///
82 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
83 : Activating(ActivatingFrom),
84 : /// The tenant has finished activating and is open for business.
85 : ///
86 : /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
87 : Active,
88 : /// The tenant is recognized by pageserver, but it is being detached or the
89 : /// system is being shut down.
90 : ///
91 : /// Transitions out of this state are possible through `set_broken()`.
92 : Stopping {
93 : // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
94 : // otherwise it will not be skipped during deserialization
95 : #[serde(skip)]
96 : progress: completion::Barrier,
97 : },
98 : /// The tenant is recognized by the pageserver, but can no longer be used for
99 : /// any operations.
100 : ///
101 : /// If the tenant fails to load or attach, it will transition to this state
102 : /// and it is guaranteed that no background tasks are running in its name.
103 : ///
104 : /// The other way to transition into this state is from `Stopping` state
105 : /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
106 : /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
107 : Broken { reason: String, backtrace: String },
108 : }
109 :
110 : impl TenantState {
111 0 : pub fn attachment_status(&self) -> TenantAttachmentStatus {
112 : use TenantAttachmentStatus::*;
113 :
114 : // Below TenantState::Activating is used as "transient" or "transparent" state for
115 : // attachment_status determining.
116 0 : match self {
117 : // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
118 : // So, technically, we can return Attached here.
119 : // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
120 : // But, our attach task might still be fetching the remote timelines, etc.
121 : // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
122 0 : Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
123 : // tenant mgr startup distinguishes attaching from loading via marker file.
124 0 : Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
125 : // We only reach Active after successful load / attach.
126 : // So, call atttachment status Attached.
127 0 : Self::Active => Attached,
128 : // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
129 : // However, it also becomes Broken if the regular load fails.
130 : // From Console's perspective there's no practical difference
131 : // because attachment_status is polled by console only during attach operation execution.
132 0 : Self::Broken { reason, .. } => Failed {
133 0 : reason: reason.to_owned(),
134 0 : },
135 : // Why is Stopping a Maybe case? Because, during pageserver shutdown,
136 : // we set the Stopping state irrespective of whether the tenant
137 : // has finished attaching or not.
138 0 : Self::Stopping { .. } => Maybe,
139 : }
140 0 : }
141 :
142 0 : pub fn broken_from_reason(reason: String) -> Self {
143 0 : let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
144 0 : Self::Broken {
145 0 : reason,
146 0 : backtrace: backtrace_str,
147 0 : }
148 0 : }
149 : }
150 :
151 : impl std::fmt::Debug for TenantState {
152 4 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
153 4 : match self {
154 4 : Self::Broken { reason, backtrace } if !reason.is_empty() => {
155 4 : write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
156 : }
157 0 : _ => write!(f, "{self}"),
158 : }
159 4 : }
160 : }
161 :
162 : /// A temporary lease to a specific lsn inside a timeline.
163 : /// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
164 : #[serde_as]
165 0 : #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
166 : pub struct LsnLease {
167 : #[serde_as(as = "SystemTimeAsRfc3339Millis")]
168 : pub valid_until: SystemTime,
169 : }
170 :
171 : serde_with::serde_conv!(
172 : SystemTimeAsRfc3339Millis,
173 : SystemTime,
174 0 : |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
175 0 : |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
176 : );
177 :
178 : impl LsnLease {
179 : /// The default length for an explicit LSN lease request (10 minutes).
180 : pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
181 :
182 : /// The default length for an implicit LSN lease granted during
183 : /// `get_lsn_by_timestamp` request (1 minutes).
184 : pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);
185 :
186 : /// Checks whether the lease is expired.
187 6 : pub fn is_expired(&self, now: &SystemTime) -> bool {
188 6 : now > &self.valid_until
189 6 : }
190 : }
191 :
192 : /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
193 8 : #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
194 : pub enum ActivatingFrom {
195 : /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
196 : Loading,
197 : /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
198 : Attaching,
199 : }
200 :
201 : /// A state of a timeline in pageserver's memory.
202 0 : #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
203 : pub enum TimelineState {
204 : /// The timeline is recognized by the pageserver but is not yet operational.
205 : /// In particular, the walreceiver connection loop is not running for this timeline.
206 : /// It will eventually transition to state Active or Broken.
207 : Loading,
208 : /// The timeline is fully operational.
209 : /// It can be queried, and the walreceiver connection loop is running.
210 : Active,
211 : /// The timeline was previously Loading or Active but is shutting down.
212 : /// It cannot transition back into any other state.
213 : Stopping,
214 : /// The timeline is broken and not operational (previous states: Loading or Active).
215 : Broken { reason: String, backtrace: String },
216 : }
217 :
218 0 : #[derive(Serialize, Deserialize, Clone)]
219 : pub struct TimelineCreateRequest {
220 : pub new_timeline_id: TimelineId,
221 : #[serde(default)]
222 : pub ancestor_timeline_id: Option<TimelineId>,
223 : #[serde(default)]
224 : pub existing_initdb_timeline_id: Option<TimelineId>,
225 : #[serde(default)]
226 : pub ancestor_start_lsn: Option<Lsn>,
227 : pub pg_version: Option<u32>,
228 : }
229 :
230 0 : #[derive(Serialize, Deserialize, Clone)]
231 : pub struct LsnLeaseRequest {
232 : pub lsn: Lsn,
233 : }
234 :
235 0 : #[derive(Serialize, Deserialize)]
236 : pub struct TenantShardSplitRequest {
237 : pub new_shard_count: u8,
238 :
239 : // A tenant's stripe size is only meaningful the first time their shard count goes
240 : // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
241 : //
242 : // If this is set while the stripe count is being increased from an already >1 value,
243 : // then the request will fail with 400.
244 : pub new_stripe_size: Option<ShardStripeSize>,
245 : }
246 :
247 0 : #[derive(Serialize, Deserialize)]
248 : pub struct TenantShardSplitResponse {
249 : pub new_shards: Vec<TenantShardId>,
250 : }
251 :
252 : /// Parameters that apply to all shards in a tenant. Used during tenant creation.
253 0 : #[derive(Serialize, Deserialize, Debug)]
254 : #[serde(deny_unknown_fields)]
255 : pub struct ShardParameters {
256 : pub count: ShardCount,
257 : pub stripe_size: ShardStripeSize,
258 : }
259 :
260 : impl ShardParameters {
261 : pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
262 :
263 0 : pub fn is_unsharded(&self) -> bool {
264 0 : self.count.is_unsharded()
265 0 : }
266 : }
267 :
268 : impl Default for ShardParameters {
269 192 : fn default() -> Self {
270 192 : Self {
271 192 : count: ShardCount::new(0),
272 192 : stripe_size: Self::DEFAULT_STRIPE_SIZE,
273 192 : }
274 192 : }
275 : }
276 :
277 : /// An alternative representation of `pageserver::tenant::TenantConf` with
278 : /// simpler types.
279 4 : #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
280 : pub struct TenantConfig {
281 : pub checkpoint_distance: Option<u64>,
282 : pub checkpoint_timeout: Option<String>,
283 : pub compaction_target_size: Option<u64>,
284 : pub compaction_period: Option<String>,
285 : pub compaction_threshold: Option<usize>,
286 : // defer parsing compaction_algorithm, like eviction_policy
287 : pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
288 : pub gc_horizon: Option<u64>,
289 : pub gc_period: Option<String>,
290 : pub image_creation_threshold: Option<usize>,
291 : pub pitr_interval: Option<String>,
292 : pub walreceiver_connect_timeout: Option<String>,
293 : pub lagging_wal_timeout: Option<String>,
294 : pub max_lsn_wal_lag: Option<NonZeroU64>,
295 : pub eviction_policy: Option<EvictionPolicy>,
296 : pub min_resident_size_override: Option<u64>,
297 : pub evictions_low_residence_duration_metric_threshold: Option<String>,
298 : pub heatmap_period: Option<String>,
299 : pub lazy_slru_download: Option<bool>,
300 : pub timeline_get_throttle: Option<ThrottleConfig>,
301 : pub image_layer_creation_check_threshold: Option<u8>,
302 : pub switch_aux_file_policy: Option<AuxFilePolicy>,
303 : pub lsn_lease_length: Option<String>,
304 : pub lsn_lease_length_for_ts: Option<String>,
305 : }
306 :
307 : /// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
308 : /// tenant config. When the first aux file written, the policy will be persisted in the
309 : /// `index_part.json` file and has a limited migration path.
310 : ///
311 : /// Currently, we only allow the following migration path:
312 : ///
313 : /// Unset -> V1
314 : /// -> V2
315 : /// -> CrossValidation -> V2
316 : #[derive(
317 : Eq,
318 : PartialEq,
319 : Debug,
320 : Copy,
321 : Clone,
322 8 : strum_macros::EnumString,
323 21 : strum_macros::Display,
324 0 : serde_with::DeserializeFromStr,
325 : serde_with::SerializeDisplay,
326 : )]
327 : #[strum(serialize_all = "kebab-case")]
328 : pub enum AuxFilePolicy {
329 : /// V1 aux file policy: store everything in AUX_FILE_KEY
330 : #[strum(ascii_case_insensitive)]
331 : V1,
332 : /// V2 aux file policy: store in the AUX_FILE keyspace
333 : #[strum(ascii_case_insensitive)]
334 : V2,
335 : /// Cross validation runs both formats on the write path and does validation
336 : /// on the read path.
337 : #[strum(ascii_case_insensitive)]
338 : CrossValidation,
339 : }
340 :
341 : impl AuxFilePolicy {
342 54 : pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
343 34 : matches!(
344 54 : (from, to),
345 : (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
346 : )
347 54 : }
348 :
349 : /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
350 420 : pub fn default_tenant_config() -> Self {
351 420 : Self::V1
352 420 : }
353 : }
354 :
355 : /// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
356 : pub struct AtomicAuxFilePolicy(AtomicUsize);
357 :
358 : impl AtomicAuxFilePolicy {
359 406 : pub fn new(policy: Option<AuxFilePolicy>) -> Self {
360 406 : Self(AtomicUsize::new(
361 406 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
362 406 : ))
363 406 : }
364 :
365 308 : pub fn load(&self) -> Option<AuxFilePolicy> {
366 308 : match self.0.load(std::sync::atomic::Ordering::Acquire) {
367 242 : 0 => None,
368 66 : other => Some(AuxFilePolicy::from_usize(other)),
369 : }
370 308 : }
371 :
372 22 : pub fn store(&self, policy: Option<AuxFilePolicy>) {
373 22 : self.0.store(
374 22 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
375 22 : std::sync::atomic::Ordering::Release,
376 22 : );
377 22 : }
378 : }
379 :
380 : impl AuxFilePolicy {
381 20 : pub fn to_usize(self) -> usize {
382 20 : match self {
383 14 : Self::V1 => 1,
384 2 : Self::CrossValidation => 2,
385 4 : Self::V2 => 3,
386 : }
387 20 : }
388 :
389 66 : pub fn try_from_usize(this: usize) -> Option<Self> {
390 66 : match this {
391 36 : 1 => Some(Self::V1),
392 6 : 2 => Some(Self::CrossValidation),
393 24 : 3 => Some(Self::V2),
394 0 : _ => None,
395 : }
396 66 : }
397 :
398 66 : pub fn from_usize(this: usize) -> Self {
399 66 : Self::try_from_usize(this).unwrap()
400 66 : }
401 : }
402 :
403 4 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
404 : #[serde(tag = "kind")]
405 : pub enum EvictionPolicy {
406 : NoEviction,
407 : LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
408 : OnlyImitiate(EvictionPolicyLayerAccessThreshold),
409 : }
410 :
411 : impl EvictionPolicy {
412 0 : pub fn discriminant_str(&self) -> &'static str {
413 0 : match self {
414 0 : EvictionPolicy::NoEviction => "NoEviction",
415 0 : EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
416 0 : EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
417 : }
418 0 : }
419 : }
420 :
421 : #[derive(
422 : Eq,
423 : PartialEq,
424 : Debug,
425 : Copy,
426 : Clone,
427 0 : strum_macros::EnumString,
428 0 : strum_macros::Display,
429 0 : serde_with::DeserializeFromStr,
430 : serde_with::SerializeDisplay,
431 : )]
432 : #[strum(serialize_all = "kebab-case")]
433 : pub enum CompactionAlgorithm {
434 : Legacy,
435 : Tiered,
436 : }
437 :
438 0 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
439 : pub enum ImageCompressionAlgorithm {
440 : // Disabled for writes, support decompressing during read path
441 : Disabled,
442 : /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
443 : /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
444 : Zstd {
445 : level: Option<i8>,
446 : },
447 : }
448 :
449 : impl FromStr for ImageCompressionAlgorithm {
450 : type Err = anyhow::Error;
451 220 : fn from_str(s: &str) -> Result<Self, Self::Err> {
452 220 : let mut components = s.split(['(', ')']);
453 220 : let first = components
454 220 : .next()
455 220 : .ok_or_else(|| anyhow::anyhow!("empty string"))?;
456 220 : match first {
457 220 : "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
458 218 : "zstd" => {
459 218 : let level = if let Some(v) = components.next() {
460 216 : let v: i8 = v.parse()?;
461 216 : Some(v)
462 : } else {
463 2 : None
464 : };
465 :
466 218 : Ok(ImageCompressionAlgorithm::Zstd { level })
467 : }
468 0 : _ => anyhow::bail!("invalid specifier '{first}'"),
469 : }
470 220 : }
471 : }
472 :
473 0 : #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
474 : pub struct CompactionAlgorithmSettings {
475 : pub kind: CompactionAlgorithm,
476 : }
477 :
478 20 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
479 : pub struct EvictionPolicyLayerAccessThreshold {
480 : #[serde(with = "humantime_serde")]
481 : pub period: Duration,
482 : #[serde(with = "humantime_serde")]
483 : pub threshold: Duration,
484 : }
485 :
486 0 : #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
487 : pub struct ThrottleConfig {
488 : pub task_kinds: Vec<String>, // TaskKind
489 : pub initial: usize,
490 : #[serde(with = "humantime_serde")]
491 : pub refill_interval: Duration,
492 : pub refill_amount: NonZeroUsize,
493 : pub max: usize,
494 : pub fair: bool,
495 : }
496 :
497 : impl ThrottleConfig {
498 402 : pub fn disabled() -> Self {
499 402 : Self {
500 402 : task_kinds: vec![], // effectively disables the throttle
501 402 : // other values don't matter with emtpy `task_kinds`.
502 402 : initial: 0,
503 402 : refill_interval: Duration::from_millis(1),
504 402 : refill_amount: NonZeroUsize::new(1).unwrap(),
505 402 : max: 1,
506 402 : fair: true,
507 402 : }
508 402 : }
509 : /// The requests per second allowed by the given config.
510 0 : pub fn steady_rps(&self) -> f64 {
511 0 : (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
512 0 : }
513 : }
514 :
515 : /// A flattened analog of a `pagesever::tenant::LocationMode`, which
516 : /// lists out all possible states (and the virtual "Detached" state)
517 : /// in a flat form rather than using rust-style enums.
518 0 : #[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
519 : pub enum LocationConfigMode {
520 : AttachedSingle,
521 : AttachedMulti,
522 : AttachedStale,
523 : Secondary,
524 : Detached,
525 : }
526 :
527 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
528 : pub struct LocationConfigSecondary {
529 : pub warm: bool,
530 : }
531 :
532 : /// An alternative representation of `pageserver::tenant::LocationConf`,
533 : /// for use in external-facing APIs.
534 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
535 : pub struct LocationConfig {
536 : pub mode: LocationConfigMode,
537 : /// If attaching, in what generation?
538 : #[serde(default)]
539 : pub generation: Option<u32>,
540 :
541 : // If requesting mode `Secondary`, configuration for that.
542 : #[serde(default)]
543 : pub secondary_conf: Option<LocationConfigSecondary>,
544 :
545 : // Shard parameters: if shard_count is nonzero, then other shard_* fields
546 : // must be set accurately.
547 : #[serde(default)]
548 : pub shard_number: u8,
549 : #[serde(default)]
550 : pub shard_count: u8,
551 : #[serde(default)]
552 : pub shard_stripe_size: u32,
553 :
554 : // This configuration only affects attached mode, but should be provided irrespective
555 : // of the mode, as a secondary location might transition on startup if the response
556 : // to the `/re-attach` control plane API requests it.
557 : pub tenant_conf: TenantConfig,
558 : }
559 :
560 0 : #[derive(Serialize, Deserialize)]
561 : pub struct LocationConfigListResponse {
562 : pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
563 : }
564 :
565 : #[derive(Serialize)]
566 : pub struct StatusResponse {
567 : pub id: NodeId,
568 : }
569 :
570 0 : #[derive(Serialize, Deserialize, Debug)]
571 : #[serde(deny_unknown_fields)]
572 : pub struct TenantLocationConfigRequest {
573 : #[serde(flatten)]
574 : pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
575 : }
576 :
577 0 : #[derive(Serialize, Deserialize, Debug)]
578 : #[serde(deny_unknown_fields)]
579 : pub struct TenantTimeTravelRequest {
580 : pub shard_counts: Vec<ShardCount>,
581 : }
582 :
583 0 : #[derive(Serialize, Deserialize, Debug)]
584 : #[serde(deny_unknown_fields)]
585 : pub struct TenantShardLocation {
586 : pub shard_id: TenantShardId,
587 : pub node_id: NodeId,
588 : }
589 :
590 0 : #[derive(Serialize, Deserialize, Debug)]
591 : #[serde(deny_unknown_fields)]
592 : pub struct TenantLocationConfigResponse {
593 : pub shards: Vec<TenantShardLocation>,
594 : // If the shards' ShardCount count is >1, stripe_size will be set.
595 : pub stripe_size: Option<ShardStripeSize>,
596 : }
597 :
598 6 : #[derive(Serialize, Deserialize, Debug)]
599 : #[serde(deny_unknown_fields)]
600 : pub struct TenantConfigRequest {
601 : pub tenant_id: TenantId,
602 : #[serde(flatten)]
603 : pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
604 : }
605 :
606 : impl std::ops::Deref for TenantConfigRequest {
607 : type Target = TenantConfig;
608 :
609 0 : fn deref(&self) -> &Self::Target {
610 0 : &self.config
611 0 : }
612 : }
613 :
614 : impl TenantConfigRequest {
615 0 : pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
616 0 : let config = TenantConfig::default();
617 0 : TenantConfigRequest { tenant_id, config }
618 0 : }
619 : }
620 :
621 : /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
622 0 : #[derive(Serialize, Deserialize, Clone)]
623 : #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
624 : pub enum TenantAttachmentStatus {
625 : Maybe,
626 : Attached,
627 : Failed { reason: String },
628 : }
629 :
630 0 : #[derive(Serialize, Deserialize, Clone)]
631 : pub struct TenantInfo {
632 : pub id: TenantShardId,
633 : // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
634 : pub state: TenantState,
635 : /// Sum of the size of all layer files.
636 : /// If a layer is present in both local FS and S3, it counts only once.
637 : pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
638 : pub attachment_status: TenantAttachmentStatus,
639 : pub generation: u32,
640 :
641 : /// Opaque explanation if gc is being blocked.
642 : ///
643 : /// Only looked up for the individual tenant detail, not the listing. This is purely for
644 : /// debugging, not included in openapi.
645 : #[serde(skip_serializing_if = "Option::is_none")]
646 : pub gc_blocking: Option<String>,
647 : }
648 :
649 0 : #[derive(Serialize, Deserialize, Clone)]
650 : pub struct TenantDetails {
651 : #[serde(flatten)]
652 : pub tenant_info: TenantInfo,
653 :
654 : pub walredo: Option<WalRedoManagerStatus>,
655 :
656 : pub timelines: Vec<TimelineId>,
657 : }
658 :
659 0 : #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
660 : pub enum TimelineArchivalState {
661 : Archived,
662 : Unarchived,
663 : }
664 :
665 0 : #[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
666 : pub struct TimelineArchivalConfigRequest {
667 : pub state: TimelineArchivalState,
668 : }
669 :
670 : /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
671 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
672 : pub struct TimelineInfo {
673 : pub tenant_id: TenantShardId,
674 : pub timeline_id: TimelineId,
675 :
676 : pub ancestor_timeline_id: Option<TimelineId>,
677 : pub ancestor_lsn: Option<Lsn>,
678 : pub last_record_lsn: Lsn,
679 : pub prev_record_lsn: Option<Lsn>,
680 : pub latest_gc_cutoff_lsn: Lsn,
681 : pub disk_consistent_lsn: Lsn,
682 :
683 : /// The LSN that we have succesfully uploaded to remote storage
684 : pub remote_consistent_lsn: Lsn,
685 :
686 : /// The LSN that we are advertizing to safekeepers
687 : pub remote_consistent_lsn_visible: Lsn,
688 :
689 : /// The LSN from the start of the root timeline (never changes)
690 : pub initdb_lsn: Lsn,
691 :
692 : pub current_logical_size: u64,
693 : pub current_logical_size_is_accurate: bool,
694 :
695 : pub directory_entries_counts: Vec<u64>,
696 :
697 : /// Sum of the size of all layer files.
698 : /// If a layer is present in both local FS and S3, it counts only once.
699 : pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
700 : pub current_logical_size_non_incremental: Option<u64>,
701 :
702 : /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
703 : /// beyond the branch's branch point, we only count up to the branch point.
704 : pub pitr_history_size: u64,
705 :
706 : /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
707 : /// ancestor data used by this branch would have been retained anyway). If this is false, then
708 : /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
709 : /// otherwise be able to GC.
710 : pub within_ancestor_pitr: bool,
711 :
712 : pub timeline_dir_layer_file_size_sum: Option<u64>,
713 :
714 : pub wal_source_connstr: Option<String>,
715 : pub last_received_msg_lsn: Option<Lsn>,
716 : /// the timestamp (in microseconds) of the last received message
717 : pub last_received_msg_ts: Option<u128>,
718 : pub pg_version: u32,
719 :
720 : pub state: TimelineState,
721 :
722 : pub walreceiver_status: String,
723 :
724 : /// The last aux file policy being used on this timeline
725 : pub last_aux_file_policy: Option<AuxFilePolicy>,
726 : }
727 :
728 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
729 : pub struct LayerMapInfo {
730 : pub in_memory_layers: Vec<InMemoryLayerInfo>,
731 : pub historic_layers: Vec<HistoricLayerInfo>,
732 : }
733 :
734 : /// The residence status of a layer
735 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
736 : pub enum LayerResidenceStatus {
737 : /// Residence status for a layer file that exists locally.
738 : /// It may also exist on the remote, we don't care here.
739 : Resident,
740 : /// Residence status for a layer file that only exists on the remote.
741 : Evicted,
742 : }
743 :
744 : #[serde_as]
745 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
746 : pub struct LayerAccessStats {
747 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
748 : pub access_time: SystemTime,
749 :
750 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
751 : pub residence_time: SystemTime,
752 :
753 : pub visible: bool,
754 : }
755 :
756 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
757 : #[serde(tag = "kind")]
758 : pub enum InMemoryLayerInfo {
759 : Open { lsn_start: Lsn },
760 : Frozen { lsn_start: Lsn, lsn_end: Lsn },
761 : }
762 :
763 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
764 : #[serde(tag = "kind")]
765 : pub enum HistoricLayerInfo {
766 : Delta {
767 : layer_file_name: String,
768 : layer_file_size: u64,
769 :
770 : lsn_start: Lsn,
771 : lsn_end: Lsn,
772 : remote: bool,
773 : access_stats: LayerAccessStats,
774 :
775 : l0: bool,
776 : },
777 : Image {
778 : layer_file_name: String,
779 : layer_file_size: u64,
780 :
781 : lsn_start: Lsn,
782 : remote: bool,
783 : access_stats: LayerAccessStats,
784 : },
785 : }
786 :
787 : impl HistoricLayerInfo {
788 0 : pub fn layer_file_name(&self) -> &str {
789 0 : match self {
790 : HistoricLayerInfo::Delta {
791 0 : layer_file_name, ..
792 0 : } => layer_file_name,
793 : HistoricLayerInfo::Image {
794 0 : layer_file_name, ..
795 0 : } => layer_file_name,
796 : }
797 0 : }
798 0 : pub fn is_remote(&self) -> bool {
799 0 : match self {
800 0 : HistoricLayerInfo::Delta { remote, .. } => *remote,
801 0 : HistoricLayerInfo::Image { remote, .. } => *remote,
802 : }
803 0 : }
804 0 : pub fn set_remote(&mut self, value: bool) {
805 0 : let field = match self {
806 0 : HistoricLayerInfo::Delta { remote, .. } => remote,
807 0 : HistoricLayerInfo::Image { remote, .. } => remote,
808 : };
809 0 : *field = value;
810 0 : }
811 0 : pub fn layer_file_size(&self) -> u64 {
812 0 : match self {
813 : HistoricLayerInfo::Delta {
814 0 : layer_file_size, ..
815 0 : } => *layer_file_size,
816 : HistoricLayerInfo::Image {
817 0 : layer_file_size, ..
818 0 : } => *layer_file_size,
819 : }
820 0 : }
821 : }
822 :
823 0 : #[derive(Debug, Serialize, Deserialize)]
824 : pub struct DownloadRemoteLayersTaskSpawnRequest {
825 : pub max_concurrent_downloads: NonZeroUsize,
826 : }
827 :
828 0 : #[derive(Debug, Serialize, Deserialize)]
829 : pub struct IngestAuxFilesRequest {
830 : pub aux_files: HashMap<String, String>,
831 : }
832 :
833 0 : #[derive(Debug, Serialize, Deserialize)]
834 : pub struct ListAuxFilesRequest {
835 : pub lsn: Lsn,
836 : }
837 :
838 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
839 : pub struct DownloadRemoteLayersTaskInfo {
840 : pub task_id: String,
841 : pub state: DownloadRemoteLayersTaskState,
842 : pub total_layer_count: u64, // stable once `completed`
843 : pub successful_download_count: u64, // stable once `completed`
844 : pub failed_download_count: u64, // stable once `completed`
845 : }
846 :
847 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
848 : pub enum DownloadRemoteLayersTaskState {
849 : Running,
850 : Completed,
851 : ShutDown,
852 : }
853 :
854 0 : #[derive(Debug, Serialize, Deserialize)]
855 : pub struct TimelineGcRequest {
856 : pub gc_horizon: Option<u64>,
857 : }
858 :
859 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
860 : pub struct WalRedoManagerProcessStatus {
861 : pub pid: u32,
862 : }
863 :
864 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
865 : pub struct WalRedoManagerStatus {
866 : pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
867 : pub process: Option<WalRedoManagerProcessStatus>,
868 : }
869 :
870 : /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
871 : /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
872 : /// what's happening.
873 0 : #[derive(Default, Debug, Serialize, Deserialize, Clone)]
874 : pub struct SecondaryProgress {
875 : /// The remote storage LastModified time of the heatmap object we last downloaded.
876 : pub heatmap_mtime: Option<serde_system_time::SystemTime>,
877 :
878 : /// The number of layers currently on-disk
879 : pub layers_downloaded: usize,
880 : /// The number of layers in the most recently seen heatmap
881 : pub layers_total: usize,
882 :
883 : /// The number of layer bytes currently on-disk
884 : pub bytes_downloaded: u64,
885 : /// The number of layer bytes in the most recently seen heatmap
886 : pub bytes_total: u64,
887 : }
888 :
889 0 : #[derive(Serialize, Deserialize, Debug)]
890 : pub struct TenantScanRemoteStorageShard {
891 : pub tenant_shard_id: TenantShardId,
892 : pub generation: Option<u32>,
893 : }
894 :
895 0 : #[derive(Serialize, Deserialize, Debug, Default)]
896 : pub struct TenantScanRemoteStorageResponse {
897 : pub shards: Vec<TenantScanRemoteStorageShard>,
898 : }
899 :
900 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
901 : #[serde(rename_all = "snake_case")]
902 : pub enum TenantSorting {
903 : ResidentSize,
904 : MaxLogicalSize,
905 : }
906 :
907 : impl Default for TenantSorting {
908 0 : fn default() -> Self {
909 0 : Self::ResidentSize
910 0 : }
911 : }
912 :
/// Request for the top tenant shards, ordered by a chosen metric and optionally filtered.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct TopTenantShardsRequest {
    /// How would you like to sort the tenants?
    pub order_by: TenantSorting,

    /// How many results?
    pub limit: usize,

    /// Omit tenants with more than this many shards (e.g. if this is the max number of shards
    /// that the caller would ever split to)
    // NOTE(review): the `_lt` suffix suggests a strict "less than" filter, which would also
    // omit tenants with exactly this many shards — confirm against the handler.
    pub where_shards_lt: Option<ShardCount>,

    /// Omit tenants where the ordering metric is less than this (this is an optimization to
    /// let us quickly exclude numerous tiny shards)
    pub where_gt: Option<u64>,
}
929 :
/// One entry of a [`TopTenantShardsResponse`]: a tenant shard together with its size metrics.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct TopTenantShardItem {
    /// The tenant shard these metrics belong to.
    pub id: TenantShardId,

    /// Total size of layers on local disk for all timelines in this tenant
    pub resident_size: u64,

    /// Total size of layers in remote storage for all timelines in this tenant
    pub physical_size: u64,

    /// The largest logical size of a timeline within this tenant
    pub max_logical_size: u64,
}
943 :
/// Response to a [`TopTenantShardsRequest`].
///
/// The derived `Default` is a response with an empty `shards` list.
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TopTenantShardsResponse {
    /// The selected tenant shards.
    // NOTE(review): presumably already sorted by the requested metric and truncated to
    // `limit`; the producer is not visible here — confirm.
    pub shards: Vec<TopTenantShardItem>,
}
948 :
/// API models describing virtual-file IO configuration: the IO engine kind and the
/// direct-IO mode with its associated knobs.
pub mod virtual_file {
    use std::path::PathBuf;

    /// The IO engine to use.
    ///
    /// String conversions come from strum (`EnumString` / `Display`) using kebab-case
    /// names, and serde (de)serialization is routed through those same string
    /// conversions (`DeserializeFromStr` / `SerializeDisplay`).
    #[derive(
        Copy,
        Clone,
        PartialEq,
        Eq,
        Hash,
        strum_macros::EnumString,
        strum_macros::Display,
        serde_with::DeserializeFromStr,
        serde_with::SerializeDisplay,
        Debug,
    )]
    #[strum(serialize_all = "kebab-case")]
    pub enum IoEngineKind {
        StdFs,
        /// Only compiled in on Linux targets.
        #[cfg(target_os = "linux")]
        TokioEpollUring,
    }

    /// Direct IO modes for a pageserver.
    ///
    /// Serialized as an internally tagged enum: the variant name goes into a `mode`
    /// field in kebab-case, and unknown fields are rejected on deserialization.
    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
    #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
    pub enum DirectIoMode {
        /// Direct IO disabled (uses usual buffered IO).
        #[default]
        Disabled,
        /// Direct IO disabled (performs checks and perf simulations).
        Evaluate {
            /// Alignment check level
            alignment_check: DirectIoAlignmentCheckLevel,
            /// Latency padded for performance simulation.
            latency_padding: DirectIoLatencyPadding,
        },
        /// Direct IO enabled.
        Enabled {
            /// Actions to perform on alignment error.
            on_alignment_error: DirectIoOnAlignmentErrorAction,
        },
    }

    /// How strictly alignment is checked in [`DirectIoMode::Evaluate`].
    ///
    /// Serialized in kebab-case; defaults to `Error`.
    // NOTE(review): presumably `Error` fails the operation, `Log` only logs and `None`
    // skips the check — confirm against the virtual-file implementation.
    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
    #[serde(rename_all = "kebab-case")]
    pub enum DirectIoAlignmentCheckLevel {
        #[default]
        Error,
        Log,
        None,
    }

    /// Action to take on an alignment error in [`DirectIoMode::Enabled`].
    ///
    /// Serialized in kebab-case; defaults to `FallbackToBuffered`.
    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
    #[serde(rename_all = "kebab-case")]
    pub enum DirectIoOnAlignmentErrorAction {
        Error,
        #[default]
        FallbackToBuffered,
    }

    /// Latency padding used for performance simulation in [`DirectIoMode::Evaluate`].
    ///
    /// Serialized as an internally tagged enum with the variant name in a `type`
    /// field, in kebab-case; defaults to `None`.
    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
    #[serde(tag = "type", rename_all = "kebab-case")]
    pub enum DirectIoLatencyPadding {
        /// Pad virtual file operations with IO to a fake file.
        FakeFileRW { path: PathBuf },
        #[default]
        None,
    }
}
1018 :
/// A pagestream request message, sent from compute to pageserver.
///
/// Each variant wraps the request struct carrying that request's fields. The wire
/// encoding is implemented by `PagestreamFeMessage::serialize` and
/// `PagestreamFeMessage::parse`.
// Wrapped in libpq CopyData
#[derive(PartialEq, Eq, Debug)]
pub enum PagestreamFeMessage {
    Exists(PagestreamExistsRequest),
    Nblocks(PagestreamNblocksRequest),
    GetPage(PagestreamGetPageRequest),
    DbSize(PagestreamDbSizeRequest),
    GetSlruSegment(PagestreamGetSlruSegmentRequest),
}
1028 :
/// A pagestream response message, sent from pageserver to compute.
///
/// Each variant wraps the response struct carrying that response's payload. The wire
/// encoding is implemented by `PagestreamBeMessage::serialize` and
/// `PagestreamBeMessage::deserialize`.
// Wrapped in libpq CopyData
#[derive(strum_macros::EnumProperty)]
pub enum PagestreamBeMessage {
    Exists(PagestreamExistsResponse),
    Nblocks(PagestreamNblocksResponse),
    GetPage(PagestreamGetPageResponse),
    Error(PagestreamErrorResponse),
    DbSize(PagestreamDbSizeResponse),
    GetSlruSegment(PagestreamGetSlruSegmentResponse),
}
1039 :
// Keep in sync with `pagestore_client.h`
/// The one-byte tag that starts every serialized pagestream response.
#[repr(u8)]
enum PagestreamBeMessageTag {
    Exists = 100,
    Nblocks = 101,
    GetPage = 102,
    Error = 103,
    DbSize = 104,
    GetSlruSegment = 105,
}

/// Decodes a raw tag byte; an unrecognised byte is handed back unchanged as the error.
impl TryFrom<u8> for PagestreamBeMessageTag {
    type Error = u8;

    fn try_from(value: u8) -> Result<Self, u8> {
        // Spelled with the full enum path: inside this impl `Error` is also the
        // name of the associated error type.
        let tag = match value {
            100 => PagestreamBeMessageTag::Exists,
            101 => PagestreamBeMessageTag::Nblocks,
            102 => PagestreamBeMessageTag::GetPage,
            103 => PagestreamBeMessageTag::Error,
            104 => PagestreamBeMessageTag::DbSize,
            105 => PagestreamBeMessageTag::GetSlruSegment,
            unknown => return Err(unknown),
        };
        Ok(tag)
    }
}
1064 :
// In the V2 protocol version, a GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::MAX is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
// Lsn::MAX. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since' equal to 'request_lsn' (unless Lsn::MAX is used as the 'request_lsn'), but
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
// standby to request a page at a particular non-latest LSN, and also include the
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
// interface allows sending both LSNs, and lets the pageserver do the right thing. There is no
// difference in the responses between V1 and V2.
//
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
// maps the old format requests to the new format.
//
/// Version of the pagestream request encoding; selects how the LSN header of a
/// request is laid out when parsing.
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
    /// Legacy format: a boolean 'latest' flag followed by a single LSN.
    V1,
    /// Current format: 'request_lsn' followed by 'not_modified_since'.
    V2,
}
1097 :
/// Payload of a `PagestreamFeMessage::Exists` request.
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest {
    /// LSN at which the request should be evaluated.
    pub request_lsn: Lsn,
    /// Hint that the client knows of no modification between this LSN and `request_lsn`.
    pub not_modified_since: Lsn,
    /// The relation the request refers to.
    pub rel: RelTag,
}
1104 :
/// Payload of a `PagestreamFeMessage::Nblocks` request.
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest {
    /// LSN at which the request should be evaluated.
    pub request_lsn: Lsn,
    /// Hint that the client knows of no modification between this LSN and `request_lsn`.
    pub not_modified_since: Lsn,
    /// The relation the request refers to.
    pub rel: RelTag,
}
1111 :
/// Payload of a `PagestreamFeMessage::GetPage` request.
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest {
    /// LSN of the page version to fetch.
    pub request_lsn: Lsn,
    /// Hint that the client knows the page was not modified between this LSN and `request_lsn`.
    pub not_modified_since: Lsn,
    /// The relation the page belongs to.
    pub rel: RelTag,
    /// Block number of the page within the relation.
    pub blkno: u32,
}
1119 :
/// Payload of a `PagestreamFeMessage::DbSize` request.
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest {
    /// LSN at which the request should be evaluated.
    pub request_lsn: Lsn,
    /// Hint that the client knows of no modification between this LSN and `request_lsn`.
    pub not_modified_since: Lsn,
    /// Database node identifier.
    pub dbnode: u32,
}
1126 :
/// Payload of a `PagestreamFeMessage::GetSlruSegment` request.
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
    /// LSN at which the request should be evaluated.
    pub request_lsn: Lsn,
    /// Hint that the client knows of no modification between this LSN and `request_lsn`.
    pub not_modified_since: Lsn,
    /// SLRU kind, as a raw byte.
    // NOTE(review): the mapping from this byte to a concrete SLRU is not visible here — confirm.
    pub kind: u8,
    /// Segment number within the SLRU.
    pub segno: u32,
}
1134 :
/// Payload of a `PagestreamBeMessage::Exists` response.
#[derive(Debug)]
pub struct PagestreamExistsResponse {
    /// Result of the existence check.
    pub exists: bool,
}
1139 :
/// Payload of a `PagestreamBeMessage::Nblocks` response.
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
    /// Number of blocks.
    pub n_blocks: u32,
}
1144 :
/// Payload of a `PagestreamBeMessage::GetPage` response.
#[derive(Debug)]
pub struct PagestreamGetPageResponse {
    /// Raw page contents; written to the wire as-is, without a length prefix.
    pub page: Bytes,
}
1149 :
/// Payload of a `PagestreamBeMessage::GetSlruSegment` response.
#[derive(Debug)]
pub struct PagestreamGetSlruSegmentResponse {
    /// Raw segment contents; on the wire they are preceded by a block count
    /// computed as `segment.len() / BLCKSZ`.
    pub segment: Bytes,
}
1154 :
/// Payload of a `PagestreamBeMessage::Error` response.
#[derive(Debug)]
pub struct PagestreamErrorResponse {
    /// Error text; written to the wire as its bytes followed by a NUL terminator.
    pub message: String,
}
1159 :
/// Payload of a `PagestreamBeMessage::DbSize` response.
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
    /// Database size, as a signed 64-bit value.
    // NOTE(review): presumably in bytes — confirm against the producer.
    pub db_size: i64,
}
1164 :
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types. It is sufficient to get the total size.
/// The total history size reported for a tenant.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantHistorySize {
    /// The tenant this size belongs to.
    pub id: TenantId,
    /// Size is a mixture of WAL and logical size, so the unit is bytes.
    ///
    /// Will be none if `?inputs_only=true` was given.
    pub size: Option<u64>,
}
1175 :
1176 : impl PagestreamFeMessage {
1177 : /// Serialize a compute -> pageserver message. This is currently only used in testing
1178 : /// tools. Always uses protocol version 2.
1179 8 : pub fn serialize(&self) -> Bytes {
1180 8 : let mut bytes = BytesMut::new();
1181 8 :
1182 8 : match self {
1183 2 : Self::Exists(req) => {
1184 2 : bytes.put_u8(0);
1185 2 : bytes.put_u64(req.request_lsn.0);
1186 2 : bytes.put_u64(req.not_modified_since.0);
1187 2 : bytes.put_u32(req.rel.spcnode);
1188 2 : bytes.put_u32(req.rel.dbnode);
1189 2 : bytes.put_u32(req.rel.relnode);
1190 2 : bytes.put_u8(req.rel.forknum);
1191 2 : }
1192 :
1193 2 : Self::Nblocks(req) => {
1194 2 : bytes.put_u8(1);
1195 2 : bytes.put_u64(req.request_lsn.0);
1196 2 : bytes.put_u64(req.not_modified_since.0);
1197 2 : bytes.put_u32(req.rel.spcnode);
1198 2 : bytes.put_u32(req.rel.dbnode);
1199 2 : bytes.put_u32(req.rel.relnode);
1200 2 : bytes.put_u8(req.rel.forknum);
1201 2 : }
1202 :
1203 2 : Self::GetPage(req) => {
1204 2 : bytes.put_u8(2);
1205 2 : bytes.put_u64(req.request_lsn.0);
1206 2 : bytes.put_u64(req.not_modified_since.0);
1207 2 : bytes.put_u32(req.rel.spcnode);
1208 2 : bytes.put_u32(req.rel.dbnode);
1209 2 : bytes.put_u32(req.rel.relnode);
1210 2 : bytes.put_u8(req.rel.forknum);
1211 2 : bytes.put_u32(req.blkno);
1212 2 : }
1213 :
1214 2 : Self::DbSize(req) => {
1215 2 : bytes.put_u8(3);
1216 2 : bytes.put_u64(req.request_lsn.0);
1217 2 : bytes.put_u64(req.not_modified_since.0);
1218 2 : bytes.put_u32(req.dbnode);
1219 2 : }
1220 :
1221 0 : Self::GetSlruSegment(req) => {
1222 0 : bytes.put_u8(4);
1223 0 : bytes.put_u64(req.request_lsn.0);
1224 0 : bytes.put_u64(req.not_modified_since.0);
1225 0 : bytes.put_u8(req.kind);
1226 0 : bytes.put_u32(req.segno);
1227 0 : }
1228 : }
1229 :
1230 8 : bytes.into()
1231 8 : }
1232 :
1233 8 : pub fn parse<R: std::io::Read>(
1234 8 : body: &mut R,
1235 8 : protocol_version: PagestreamProtocolVersion,
1236 8 : ) -> anyhow::Result<PagestreamFeMessage> {
1237 : // these correspond to the NeonMessageTag enum in pagestore_client.h
1238 : //
1239 : // TODO: consider using protobuf or serde bincode for less error prone
1240 : // serialization.
1241 8 : let msg_tag = body.read_u8()?;
1242 :
1243 8 : let (request_lsn, not_modified_since) = match protocol_version {
1244 : PagestreamProtocolVersion::V2 => (
1245 8 : Lsn::from(body.read_u64::<BigEndian>()?),
1246 8 : Lsn::from(body.read_u64::<BigEndian>()?),
1247 : ),
1248 : PagestreamProtocolVersion::V1 => {
1249 : // In the old protocol, each message starts with a boolean 'latest' flag,
1250 : // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
1251 : // 'not_modified_since', used in the new protocol version.
1252 0 : let latest = body.read_u8()? != 0;
1253 0 : let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
1254 0 : if latest {
1255 0 : (Lsn::MAX, request_lsn) // get latest version
1256 : } else {
1257 0 : (request_lsn, request_lsn) // get version at specified LSN
1258 : }
1259 : }
1260 : };
1261 :
1262 : // The rest of the messages are the same between V1 and V2
1263 8 : match msg_tag {
1264 : 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
1265 2 : request_lsn,
1266 2 : not_modified_since,
1267 2 : rel: RelTag {
1268 2 : spcnode: body.read_u32::<BigEndian>()?,
1269 2 : dbnode: body.read_u32::<BigEndian>()?,
1270 2 : relnode: body.read_u32::<BigEndian>()?,
1271 2 : forknum: body.read_u8()?,
1272 : },
1273 : })),
1274 : 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1275 2 : request_lsn,
1276 2 : not_modified_since,
1277 2 : rel: RelTag {
1278 2 : spcnode: body.read_u32::<BigEndian>()?,
1279 2 : dbnode: body.read_u32::<BigEndian>()?,
1280 2 : relnode: body.read_u32::<BigEndian>()?,
1281 2 : forknum: body.read_u8()?,
1282 : },
1283 : })),
1284 : 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1285 2 : request_lsn,
1286 2 : not_modified_since,
1287 2 : rel: RelTag {
1288 2 : spcnode: body.read_u32::<BigEndian>()?,
1289 2 : dbnode: body.read_u32::<BigEndian>()?,
1290 2 : relnode: body.read_u32::<BigEndian>()?,
1291 2 : forknum: body.read_u8()?,
1292 : },
1293 2 : blkno: body.read_u32::<BigEndian>()?,
1294 : })),
1295 : 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1296 2 : request_lsn,
1297 2 : not_modified_since,
1298 2 : dbnode: body.read_u32::<BigEndian>()?,
1299 : })),
1300 : 4 => Ok(PagestreamFeMessage::GetSlruSegment(
1301 : PagestreamGetSlruSegmentRequest {
1302 0 : request_lsn,
1303 0 : not_modified_since,
1304 0 : kind: body.read_u8()?,
1305 0 : segno: body.read_u32::<BigEndian>()?,
1306 : },
1307 : )),
1308 0 : _ => bail!("unknown smgr message tag: {:?}", msg_tag),
1309 : }
1310 8 : }
1311 : }
1312 :
1313 : impl PagestreamBeMessage {
1314 0 : pub fn serialize(&self) -> Bytes {
1315 0 : let mut bytes = BytesMut::new();
1316 0 :
1317 0 : use PagestreamBeMessageTag as Tag;
1318 0 : match self {
1319 0 : Self::Exists(resp) => {
1320 0 : bytes.put_u8(Tag::Exists as u8);
1321 0 : bytes.put_u8(resp.exists as u8);
1322 0 : }
1323 :
1324 0 : Self::Nblocks(resp) => {
1325 0 : bytes.put_u8(Tag::Nblocks as u8);
1326 0 : bytes.put_u32(resp.n_blocks);
1327 0 : }
1328 :
1329 0 : Self::GetPage(resp) => {
1330 0 : bytes.put_u8(Tag::GetPage as u8);
1331 0 : bytes.put(&resp.page[..]);
1332 0 : }
1333 :
1334 0 : Self::Error(resp) => {
1335 0 : bytes.put_u8(Tag::Error as u8);
1336 0 : bytes.put(resp.message.as_bytes());
1337 0 : bytes.put_u8(0); // null terminator
1338 0 : }
1339 0 : Self::DbSize(resp) => {
1340 0 : bytes.put_u8(Tag::DbSize as u8);
1341 0 : bytes.put_i64(resp.db_size);
1342 0 : }
1343 :
1344 0 : Self::GetSlruSegment(resp) => {
1345 0 : bytes.put_u8(Tag::GetSlruSegment as u8);
1346 0 : bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
1347 0 : bytes.put(&resp.segment[..]);
1348 0 : }
1349 : }
1350 :
1351 0 : bytes.into()
1352 0 : }
1353 :
1354 0 : pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
1355 0 : let mut buf = buf.reader();
1356 0 : let msg_tag = buf.read_u8()?;
1357 :
1358 : use PagestreamBeMessageTag as Tag;
1359 0 : let ok =
1360 0 : match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
1361 : Tag::Exists => {
1362 0 : let exists = buf.read_u8()?;
1363 0 : Self::Exists(PagestreamExistsResponse {
1364 0 : exists: exists != 0,
1365 0 : })
1366 : }
1367 : Tag::Nblocks => {
1368 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1369 0 : Self::Nblocks(PagestreamNblocksResponse { n_blocks })
1370 : }
1371 : Tag::GetPage => {
1372 0 : let mut page = vec![0; 8192]; // TODO: use MaybeUninit
1373 0 : buf.read_exact(&mut page)?;
1374 0 : PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
1375 : }
1376 : Tag::Error => {
1377 0 : let mut msg = Vec::new();
1378 0 : buf.read_until(0, &mut msg)?;
1379 0 : let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
1380 0 : let rust_str = cstring.to_str()?;
1381 0 : PagestreamBeMessage::Error(PagestreamErrorResponse {
1382 0 : message: rust_str.to_owned(),
1383 0 : })
1384 : }
1385 : Tag::DbSize => {
1386 0 : let db_size = buf.read_i64::<BigEndian>()?;
1387 0 : Self::DbSize(PagestreamDbSizeResponse { db_size })
1388 : }
1389 : Tag::GetSlruSegment => {
1390 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1391 0 : let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
1392 0 : buf.read_exact(&mut segment)?;
1393 0 : Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
1394 0 : segment: segment.into(),
1395 0 : })
1396 : }
1397 : };
1398 0 : let remaining = buf.into_inner();
1399 0 : if !remaining.is_empty() {
1400 0 : anyhow::bail!(
1401 0 : "remaining bytes in msg with tag={msg_tag}: {}",
1402 0 : remaining.len()
1403 0 : );
1404 0 : }
1405 0 : Ok(ok)
1406 0 : }
1407 :
1408 0 : pub fn kind(&self) -> &'static str {
1409 0 : match self {
1410 0 : Self::Exists(_) => "Exists",
1411 0 : Self::Nblocks(_) => "Nblocks",
1412 0 : Self::GetPage(_) => "GetPage",
1413 0 : Self::Error(_) => "Error",
1414 0 : Self::DbSize(_) => "DbSize",
1415 0 : Self::GetSlruSegment(_) => "GetSlruSegment",
1416 : }
1417 0 : }
1418 : }
1419 :
#[cfg(test)]
mod tests {
    use serde_json::json;
    use std::str::FromStr;

    use super::*;

    /// Every request variant must survive a V2 serialize/parse round trip.
    #[test]
    fn test_pagestream() {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                dbnode: 7,
            }),
            PagestreamFeMessage::GetSlruSegment(PagestreamGetSlruSegmentRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                kind: 1,
                segno: 7,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
            let reconstructed =
                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
                    .unwrap();
            // assert_eq! prints both sides on failure, unlike a bare `assert!(a == b)`.
            assert_eq!(msg, reconstructed);
        }
    }

    /// A V1 header ('latest' flag + one LSN) must map onto the V2 LSN pair.
    #[test]
    fn test_pagestream_v1_lsn_mapping() {
        let cases = [(1u8, Lsn::MAX), (0u8, Lsn(5))];
        for (latest, expected_request_lsn) in cases {
            let mut bytes = BytesMut::new();
            bytes.put_u8(3); // DbSize
            bytes.put_u8(latest);
            bytes.put_u64(5);
            bytes.put_u32(7);

            let parsed = PagestreamFeMessage::parse(
                &mut bytes.freeze().reader(),
                PagestreamProtocolVersion::V1,
            )
            .unwrap();

            assert_eq!(
                parsed,
                PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                    request_lsn: expected_request_lsn,
                    not_modified_since: Lsn(5),
                    dbnode: 7,
                })
            );
        }
    }

    #[test]
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
            "state": {
                "slug": "Active",
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        let original_broken = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
            "state": {
                "slug": "Broken",
                "data": {
                    "backtrace": "backtrace info",
                    "reason": "reason",
                }
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        assert_eq!(
            serde_json::to_value(&original_active).unwrap(),
            expected_active
        );

        assert_eq!(
            serde_json::to_value(&original_broken).unwrap(),
            expected_broken
        );
        assert!(format!("{:?}", &original_broken.state).contains("reason"));
        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
    }

    #[test]
    fn test_reject_unknown_field() {
        let id = TenantId::generate();
        let config_request = json!({
            "tenant_id": id.to_string(),
            "unknown_field": "unknown_value".to_string(),
        });
        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
            "expect unknown field `unknown_field` error, got: {}",
            err
        );
    }

    #[test]
    fn tenantstatus_activating_serde() {
        let states = [
            TenantState::Activating(ActivatingFrom::Loading),
            TenantState::Activating(ActivatingFrom::Attaching),
        ];
        let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";

        let actual = serde_json::to_string(&states).unwrap();

        assert_eq!(actual, expected);

        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();

        assert_eq!(states.as_slice(), &parsed);
    }

    #[test]
    fn tenantstatus_activating_strum() {
        // tests added, because we use these for metrics
        let examples = [
            (line!(), TenantState::Loading, "Loading"),
            (line!(), TenantState::Attaching, "Attaching"),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Loading),
                "Activating",
            ),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Attaching),
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
            (
                line!(),
                TenantState::Stopping {
                    progress: utils::completion::Barrier::default(),
                },
                "Stopping",
            ),
            (
                line!(),
                TenantState::Broken {
                    reason: "Example".into(),
                    backtrace: "Looooong backtrace".into(),
                },
                "Broken",
            ),
        ];

        for (line, rendered, expected) in examples {
            let actual: &'static str = rendered.into();
            assert_eq!(actual, expected, "example on {line}");
        }
    }

    #[test]
    fn test_aux_file_migration_path() {
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::V1
        ));
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::V2
        ));
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::CrossValidation
        ));
        // Self-migration is not a valid migration path, and the caller should handle it by itself.
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::V2
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::CrossValidation
        ));
        // Migrations not allowed
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::V2
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::CrossValidation
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::CrossValidation
        ));
        // Migrations allowed
        assert!(AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::V2
        ));
    }

    #[test]
    fn test_aux_parse() {
        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(
            AuxFilePolicy::from_str("cross-validation").unwrap(),
            AuxFilePolicy::CrossValidation
        );
    }

    #[test]
    fn test_image_compression_algorithm_parsing() {
        use ImageCompressionAlgorithm::*;
        assert_eq!(
            ImageCompressionAlgorithm::from_str("disabled").unwrap(),
            Disabled
        );
        assert_eq!(
            ImageCompressionAlgorithm::from_str("zstd").unwrap(),
            Zstd { level: None }
        );
        assert_eq!(
            ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
            Zstd { level: Some(18) }
        );
        assert_eq!(
            ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
            Zstd { level: Some(-3) }
        );
    }
}
|