Line data Source code
1 : pub mod detach_ancestor;
2 : pub mod partitioning;
3 : pub mod utilization;
4 :
5 : pub use utilization::PageserverUtilization;
6 :
7 : use std::{
8 : borrow::Cow,
9 : collections::HashMap,
10 : io::{BufRead, Read},
11 : num::{NonZeroU64, NonZeroUsize},
12 : str::FromStr,
13 : sync::atomic::AtomicUsize,
14 : time::{Duration, SystemTime},
15 : };
16 :
17 : use byteorder::{BigEndian, ReadBytesExt};
18 : use postgres_ffi::BLCKSZ;
19 : use serde::{Deserialize, Serialize};
20 : use serde_with::serde_as;
21 : use utils::{
22 : completion,
23 : history_buffer::HistoryBufferWithDropCounter,
24 : id::{NodeId, TenantId, TimelineId},
25 : lsn::Lsn,
26 : serde_system_time,
27 : };
28 :
29 : use crate::{
30 : reltag::RelTag,
31 : shard::{ShardCount, ShardStripeSize, TenantShardId},
32 : };
33 : use anyhow::bail;
34 : use bytes::{Buf, BufMut, Bytes, BytesMut};
35 :
36 : /// The state of a tenant in this pageserver.
37 : ///
38 : /// ```mermaid
39 : /// stateDiagram-v2
40 : ///
41 : /// [*] --> Loading: spawn_load()
42 : /// [*] --> Attaching: spawn_attach()
43 : ///
44 : /// Loading --> Activating: activate()
45 : /// Attaching --> Activating: activate()
46 : /// Activating --> Active: infallible
47 : ///
48 : /// Loading --> Broken: load() failure
49 : /// Attaching --> Broken: attach() failure
50 : ///
51 : /// Active --> Stopping: set_stopping(), part of shutdown & detach
52 : /// Stopping --> Broken: late error in remove_tenant_from_memory
53 : ///
54 : /// Broken --> [*]: ignore / detach / shutdown
55 : /// Stopping --> [*]: remove_from_memory complete
56 : ///
57 : /// Active --> Broken: cfg(testing)-only tenant break point
58 : /// ```
///
/// On the wire this enum uses serde's adjacently tagged representation: the variant
/// name is stored under the `slug` key and the variant payload (if any) under `data`.
59 : #[derive(
60 : Clone,
61 : PartialEq,
62 : Eq,
63 2 : serde::Serialize,
64 12 : serde::Deserialize,
65 0 : strum_macros::Display,
66 : strum_macros::EnumVariantNames,
67 0 : strum_macros::AsRefStr,
68 336 : strum_macros::IntoStaticStr,
69 : )]
70 : #[serde(tag = "slug", content = "data")]
71 : pub enum TenantState {
72 : /// This tenant is being loaded from local disk.
73 : ///
74 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
75 : Loading,
76 : /// This tenant is being attached to the pageserver.
77 : ///
78 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
79 : Attaching,
80 : /// The tenant is transitioning from Loading/Attaching to Active.
81 : ///
82 : /// While in this state, the individual timelines are being activated.
83 : ///
84 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
85 : Activating(ActivatingFrom),
86 : /// The tenant has finished activating and is open for business.
87 : ///
88 : /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
89 : Active,
90 : /// The tenant is recognized by pageserver, but it is being detached or the
91 : /// system is being shut down.
92 : ///
93 : /// Transitions out of this state are possible through `set_broken()`.
94 : Stopping {
95 : // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
96 : // otherwise it will not be skipped during deserialization
// `#[serde(skip)]` keeps `progress` out of the serialized form entirely.
97 : #[serde(skip)]
98 : progress: completion::Barrier,
99 : },
100 : /// The tenant is recognized by the pageserver, but can no longer be used for
101 : /// any operations.
102 : ///
103 : /// If the tenant fails to load or attach, it will transition to this state
104 : /// and it is guaranteed that no background tasks are running in its name.
105 : ///
106 : /// The other way to transition into this state is from `Stopping` state
107 : /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
108 : /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
///
/// `reason` is the failure description; `backtrace` is a pre-rendered backtrace string
/// (see `TenantState::broken_from_reason`, which captures one at construction time).
109 : Broken { reason: String, backtrace: String },
110 : }
111 :
112 : impl TenantState {
113 0 : pub fn attachment_status(&self) -> TenantAttachmentStatus {
114 : use TenantAttachmentStatus::*;
115 :
116 : // Below TenantState::Activating is used as "transient" or "transparent" state for
117 : // attachment_status determining.
118 0 : match self {
119 : // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
120 : // So, technically, we can return Attached here.
121 : // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
122 : // But, our attach task might still be fetching the remote timelines, etc.
123 : // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
124 0 : Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
125 : // tenant mgr startup distinguishes attaching from loading via marker file.
126 0 : Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
127 : // We only reach Active after successful load / attach.
128 : // So, call atttachment status Attached.
129 0 : Self::Active => Attached,
130 : // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
131 : // However, it also becomes Broken if the regular load fails.
132 : // From Console's perspective there's no practical difference
133 : // because attachment_status is polled by console only during attach operation execution.
134 0 : Self::Broken { reason, .. } => Failed {
135 0 : reason: reason.to_owned(),
136 0 : },
137 : // Why is Stopping a Maybe case? Because, during pageserver shutdown,
138 : // we set the Stopping state irrespective of whether the tenant
139 : // has finished attaching or not.
140 0 : Self::Stopping { .. } => Maybe,
141 : }
142 0 : }
143 :
144 0 : pub fn broken_from_reason(reason: String) -> Self {
145 0 : let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
146 0 : Self::Broken {
147 0 : reason,
148 0 : backtrace: backtrace_str,
149 0 : }
150 0 : }
151 : }
152 :
153 : impl std::fmt::Debug for TenantState {
154 4 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
155 4 : match self {
156 4 : Self::Broken { reason, backtrace } if !reason.is_empty() => {
157 4 : write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
158 : }
159 0 : _ => write!(f, "{self}"),
160 : }
161 4 : }
162 : }
163 :
164 : /// A temporary lease to a specific lsn inside a timeline.
165 : /// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
166 : #[serde_as]
167 0 : #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
168 : pub struct LsnLease {
/// Expiration instant of the lease; (de)serialized as an RFC 3339 string through
/// the `SystemTimeAsRfc3339Millis` adapter.
169 : #[serde_as(as = "SystemTimeAsRfc3339Millis")]
170 : pub valid_until: SystemTime,
171 : }
172 :
// Serde adapter for `SystemTime`, usable via `#[serde_as(as = "SystemTimeAsRfc3339Millis")]`.
// Serialization formats the time with `humantime::format_rfc3339_millis`;
// deserialization parses the string with `humantime::parse_rfc3339`.
173 : serde_with::serde_conv!(
174 : SystemTimeAsRfc3339Millis,
175 : SystemTime,
176 0 : |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
177 0 : |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
178 : );
179 :
180 : impl LsnLease {
181 : /// The default length for an explicit LSN lease request (10 minutes).
182 : pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
183 :
184 : /// The default length for an implicit LSN lease granted during
185 : /// `get_lsn_by_timestamp` request (1 minutes).
186 : pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);
187 :
188 : /// Checks whether the lease is expired.
189 6 : pub fn is_expired(&self, now: &SystemTime) -> bool {
190 6 : now > &self.valid_until
191 6 : }
192 : }
193 :
194 : /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
///
/// Carried as the payload of `TenantState::Activating` so the origin of the activation
/// stays known while the tenant is activating.
195 8 : #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
196 : pub enum ActivatingFrom {
197 : /// Arrived at [`TenantState::Activating`] from [`TenantState::Loading`]
198 : Loading,
199 : /// Arrived at [`TenantState::Activating`] from [`TenantState::Attaching`]
200 : Attaching,
201 : }
202 :
203 : /// A state of a timeline in pageserver's memory.
204 0 : #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
205 : pub enum TimelineState {
206 : /// The timeline is recognized by the pageserver but is not yet operational.
207 : /// In particular, the walreceiver connection loop is not running for this timeline.
208 : /// It will eventually transition to state Active or Broken.
209 : Loading,
210 : /// The timeline is fully operational.
211 : /// It can be queried, and the walreceiver connection loop is running.
212 : Active,
213 : /// The timeline was previously Loading or Active but is shutting down.
214 : /// It cannot transition back into any other state.
215 : Stopping,
216 : /// The timeline is broken and not operational (previous states: Loading or Active).
/// Carries a `reason` string and a `backtrace` string describing the failure.
217 : Broken { reason: String, backtrace: String },
218 : }
219 :
/// Serializable request describing a timeline to create, identified by `new_timeline_id`.
///
/// Fields marked `#[serde(default)]` may be omitted from the payload and then
/// deserialize as `None`.
220 0 : #[derive(Serialize, Deserialize, Clone)]
221 : pub struct TimelineCreateRequest {
222 : pub new_timeline_id: TimelineId,
223 : #[serde(default)]
224 : pub ancestor_timeline_id: Option<TimelineId>,
225 : #[serde(default)]
226 : pub existing_initdb_timeline_id: Option<TimelineId>,
227 : #[serde(default)]
228 : pub ancestor_start_lsn: Option<Lsn>,
229 : pub pg_version: Option<u32>,
230 : }
231 :
/// Request payload naming the `lsn` a lease is requested for.
// NOTE(review): presumably answered with an `LsnLease` — confirm against the handler.
232 0 : #[derive(Serialize, Deserialize, Clone)]
233 : pub struct LsnLeaseRequest {
234 : pub lsn: Lsn,
235 : }
236 :
/// Request to split a tenant's shards: carries the target shard count and an optional
/// new stripe size.
237 0 : #[derive(Serialize, Deserialize)]
238 : pub struct TenantShardSplitRequest {
/// The shard count requested by this split.
239 : pub new_shard_count: u8,
240 :
241 : // A tenant's stripe size is only meaningful the first time their shard count goes
242 : // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
243 : //
244 : // If this is set while the stripe count is being increased from an already >1 value,
245 : // then the request will fail with 400.
246 : pub new_stripe_size: Option<ShardStripeSize>,
247 : }
248 :
/// Response to a shard split request.
249 0 : #[derive(Serialize, Deserialize)]
250 : pub struct TenantShardSplitResponse {
// NOTE(review): presumably the shard ids that exist after the split — confirm with the producer.
251 : pub new_shards: Vec<TenantShardId>,
252 : }
253 :
254 : /// Parameters that apply to all shards in a tenant. Used during tenant creation.
///
/// Because of `deny_unknown_fields`, deserialization rejects payloads that contain any
/// key other than `count` and `stripe_size`.
255 0 : #[derive(Serialize, Deserialize, Debug)]
256 : #[serde(deny_unknown_fields)]
257 : pub struct ShardParameters {
258 : pub count: ShardCount,
259 : pub stripe_size: ShardStripeSize,
260 : }
261 :
262 : impl ShardParameters {
263 : pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
264 :
265 0 : pub fn is_unsharded(&self) -> bool {
266 0 : self.count.is_unsharded()
267 0 : }
268 : }
269 :
270 : impl Default for ShardParameters {
271 172 : fn default() -> Self {
272 172 : Self {
273 172 : count: ShardCount::new(0),
274 172 : stripe_size: Self::DEFAULT_STRIPE_SIZE,
275 172 : }
276 172 : }
277 : }
278 :
279 : /// An alternative representation of `pageserver::tenant::TenantConf` with
280 : /// simpler types.
///
/// Every field is optional; `TenantConfig::default()` (derived) leaves all of them `None`.
// NOTE(review): the `Option<String>` period/timeout/interval fields presumably hold
// human-readable duration strings that are parsed on the pageserver side — confirm.
281 4 : #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
282 : pub struct TenantConfig {
283 : pub checkpoint_distance: Option<u64>,
284 : pub checkpoint_timeout: Option<String>,
285 : pub compaction_target_size: Option<u64>,
286 : pub compaction_period: Option<String>,
287 : pub compaction_threshold: Option<usize>,
288 : // defer parsing compaction_algorithm, like eviction_policy
289 : pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
290 : pub gc_horizon: Option<u64>,
291 : pub gc_period: Option<String>,
292 : pub image_creation_threshold: Option<usize>,
293 : pub pitr_interval: Option<String>,
294 : pub walreceiver_connect_timeout: Option<String>,
295 : pub lagging_wal_timeout: Option<String>,
296 : pub max_lsn_wal_lag: Option<NonZeroU64>,
297 : pub trace_read_requests: Option<bool>,
298 : pub eviction_policy: Option<EvictionPolicy>,
299 : pub min_resident_size_override: Option<u64>,
300 : pub evictions_low_residence_duration_metric_threshold: Option<String>,
301 : pub heatmap_period: Option<String>,
302 : pub lazy_slru_download: Option<bool>,
303 : pub timeline_get_throttle: Option<ThrottleConfig>,
304 : pub image_layer_creation_check_threshold: Option<u8>,
305 : pub switch_aux_file_policy: Option<AuxFilePolicy>,
306 : pub lsn_lease_length: Option<String>,
307 : pub lsn_lease_length_for_ts: Option<String>,
308 : }
309 :
310 : /// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
311 : /// tenant config. When the first aux file is written, the policy will be persisted in the
312 : /// `index_part.json` file and has a limited migration path.
313 : ///
314 : /// Currently, we only allow the following migration path:
315 : ///
316 : /// Unset -> V1
317 : /// -> V2
318 : /// -> CrossValidation -> V2
///
/// The string form (used by `Display`/`FromStr` and, through them, by serde) is the
/// kebab-case variant name; parsing is ASCII-case-insensitive for every variant.
319 : #[derive(
320 : Eq,
321 : PartialEq,
322 : Debug,
323 : Copy,
324 : Clone,
325 8 : strum_macros::EnumString,
326 21 : strum_macros::Display,
327 0 : serde_with::DeserializeFromStr,
328 : serde_with::SerializeDisplay,
329 : )]
330 : #[strum(serialize_all = "kebab-case")]
331 : pub enum AuxFilePolicy {
332 : /// V1 aux file policy: store everything in AUX_FILE_KEY
333 : #[strum(ascii_case_insensitive)]
334 : V1,
335 : /// V2 aux file policy: store in the AUX_FILE keyspace
336 : #[strum(ascii_case_insensitive)]
337 : V2,
338 : /// Cross validation runs both formats on the write path and does validation
339 : /// on the read path.
340 : #[strum(ascii_case_insensitive)]
341 : CrossValidation,
342 : }
343 :
344 : impl AuxFilePolicy {
345 54 : pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
346 34 : matches!(
347 54 : (from, to),
348 : (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
349 : )
350 54 : }
351 :
352 : /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
353 382 : pub fn default_tenant_config() -> Self {
354 382 : Self::V1
355 382 : }
356 : }
357 :
358 : /// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
///
/// Non-zero values are the codes produced by `AuxFilePolicy::to_usize`.
359 : pub struct AtomicAuxFilePolicy(AtomicUsize);
360 :
361 : impl AtomicAuxFilePolicy {
362 384 : pub fn new(policy: Option<AuxFilePolicy>) -> Self {
363 384 : Self(AtomicUsize::new(
364 384 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
365 384 : ))
366 384 : }
367 :
368 306 : pub fn load(&self) -> Option<AuxFilePolicy> {
369 306 : match self.0.load(std::sync::atomic::Ordering::Acquire) {
370 240 : 0 => None,
371 66 : other => Some(AuxFilePolicy::from_usize(other)),
372 : }
373 306 : }
374 :
375 22 : pub fn store(&self, policy: Option<AuxFilePolicy>) {
376 22 : self.0.store(
377 22 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
378 22 : std::sync::atomic::Ordering::Release,
379 22 : );
380 22 : }
381 : }
382 :
383 : impl AuxFilePolicy {
384 20 : pub fn to_usize(self) -> usize {
385 20 : match self {
386 14 : Self::V1 => 1,
387 2 : Self::CrossValidation => 2,
388 4 : Self::V2 => 3,
389 : }
390 20 : }
391 :
392 66 : pub fn try_from_usize(this: usize) -> Option<Self> {
393 66 : match this {
394 36 : 1 => Some(Self::V1),
395 6 : 2 => Some(Self::CrossValidation),
396 24 : 3 => Some(Self::V2),
397 0 : _ => None,
398 : }
399 66 : }
400 :
401 66 : pub fn from_usize(this: usize) -> Self {
402 66 : Self::try_from_usize(this).unwrap()
403 66 : }
404 : }
405 :
/// Eviction policy selection with its parameters.
///
/// Serialized in serde's internally tagged form: the variant name is stored under the
/// `kind` key next to the fields of the variant's payload.
406 4 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
407 : #[serde(tag = "kind")]
408 : pub enum EvictionPolicy {
409 : NoEviction,
410 : LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
// NB: the spelling `OnlyImitiate` is part of the serialized form (variant name under `kind`).
411 : OnlyImitiate(EvictionPolicyLayerAccessThreshold),
412 : }
413 :
414 : impl EvictionPolicy {
415 0 : pub fn discriminant_str(&self) -> &'static str {
416 0 : match self {
417 0 : EvictionPolicy::NoEviction => "NoEviction",
418 0 : EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
419 0 : EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
420 : }
421 0 : }
422 : }
423 :
/// Selects a compaction algorithm by name.
///
/// Parsed from / formatted as the kebab-case variant name; serde goes through those
/// string conversions (`DeserializeFromStr` / `SerializeDisplay`).
424 : #[derive(
425 : Eq,
426 : PartialEq,
427 : Debug,
428 : Copy,
429 : Clone,
430 0 : strum_macros::EnumString,
431 0 : strum_macros::Display,
432 0 : serde_with::DeserializeFromStr,
433 : serde_with::SerializeDisplay,
434 : )]
435 : #[strum(serialize_all = "kebab-case")]
436 : pub enum CompactionAlgorithm {
437 : Legacy,
438 : Tiered,
439 : }
440 :
/// Compression setting for image data, also parseable from a string specifier via `FromStr`.
441 0 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
442 : pub enum ImageCompressionAlgorithm {
443 : /// Disabled for writes, and never decompress during reading.
444 : /// Never set this after you've enabled compression once!
445 : DisabledNoDecompress,
446 : /// Disabled for writes, support decompressing during read path
447 : Disabled,
448 : /// Zstandard compression. Level 0 and `None` mean the same (default level). Levels can be negative as well.
449 : /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
450 : Zstd {
451 : level: Option<i8>,
452 : },
453 : }
454 :
455 : impl ImageCompressionAlgorithm {
456 90 : pub fn allow_decompression(&self) -> bool {
457 90 : !matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress)
458 90 : }
459 : }
460 :
461 : impl FromStr for ImageCompressionAlgorithm {
462 : type Err = anyhow::Error;
463 10 : fn from_str(s: &str) -> Result<Self, Self::Err> {
464 10 : let mut components = s.split(['(', ')']);
465 10 : let first = components
466 10 : .next()
467 10 : .ok_or_else(|| anyhow::anyhow!("empty string"))?;
468 10 : match first {
469 10 : "disabled-no-decompress" => Ok(ImageCompressionAlgorithm::DisabledNoDecompress),
470 8 : "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
471 6 : "zstd" => {
472 6 : let level = if let Some(v) = components.next() {
473 4 : let v: i8 = v.parse()?;
474 4 : Some(v)
475 : } else {
476 2 : None
477 : };
478 :
479 6 : Ok(ImageCompressionAlgorithm::Zstd { level })
480 : }
481 0 : _ => anyhow::bail!("invalid specifier '{first}'"),
482 : }
483 10 : }
484 : }
485 :
/// Wrapper around the selected [`CompactionAlgorithm`]; serializes as a struct with a
/// single `kind` field.
486 0 : #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
487 : pub struct CompactionAlgorithmSettings {
488 : pub kind: CompactionAlgorithm,
489 : }
490 :
/// Parameters shared by the `LayerAccessThreshold` and `OnlyImitiate` eviction policies.
///
/// Both durations are (de)serialized through `humantime_serde`.
// NOTE(review): presumably `period` is how often the eviction task runs and `threshold`
// is the layer access age it evicts at — confirm against the eviction task.
491 20 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
492 : pub struct EvictionPolicyLayerAccessThreshold {
493 : #[serde(with = "humantime_serde")]
494 : pub period: Duration,
495 : #[serde(with = "humantime_serde")]
496 : pub threshold: Duration,
497 : }
498 :
/// Throttle settings. An empty `task_kinds` effectively disables the throttle
/// (see `ThrottleConfig::disabled`).
499 0 : #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
500 : pub struct ThrottleConfig {
501 : pub task_kinds: Vec<String>, // TaskKind
502 : pub initial: usize,
// `refill_amount` per `refill_interval` defines the steady rate (see `steady_rps`).
503 : #[serde(with = "humantime_serde")]
504 : pub refill_interval: Duration,
505 : pub refill_amount: NonZeroUsize,
506 : pub max: usize,
507 : pub fair: bool,
508 : }
509 :
510 : impl ThrottleConfig {
511 364 : pub fn disabled() -> Self {
512 364 : Self {
513 364 : task_kinds: vec![], // effectively disables the throttle
514 364 : // other values don't matter with emtpy `task_kinds`.
515 364 : initial: 0,
516 364 : refill_interval: Duration::from_millis(1),
517 364 : refill_amount: NonZeroUsize::new(1).unwrap(),
518 364 : max: 1,
519 364 : fair: true,
520 364 : }
521 364 : }
522 : /// The requests per second allowed by the given config.
523 0 : pub fn steady_rps(&self) -> f64 {
524 0 : (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
525 0 : }
526 : }
527 :
528 : /// A flattened analog of a `pageserver::tenant::LocationMode`, which
529 : /// lists out all possible states (and the virtual "Detached" state)
530 : /// in a flat form rather than using rust-style enums.
531 0 : #[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
532 : pub enum LocationConfigMode {
533 : AttachedSingle,
534 : AttachedMulti,
535 : AttachedStale,
536 : Secondary,
537 : Detached,
538 : }
539 :
/// Configuration specific to the `Secondary` location mode (see `LocationConfig::secondary_conf`).
540 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
541 : pub struct LocationConfigSecondary {
// NOTE(review): presumably whether the secondary keeps layers downloaded — confirm.
542 : pub warm: bool,
543 : }
544 :
545 : /// An alternative representation of `pageserver::tenant::LocationConf`,
546 : /// for use in external-facing APIs.
///
/// Fields marked `#[serde(default)]` may be omitted from the payload; they then take
/// their type's default (`None` for the options, `0` for the shard numbers).
547 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
548 : pub struct LocationConfig {
549 : pub mode: LocationConfigMode,
550 : /// If attaching, in what generation?
551 : #[serde(default)]
552 : pub generation: Option<u32>,
553 :
554 : // If requesting mode `Secondary`, configuration for that.
555 : #[serde(default)]
556 : pub secondary_conf: Option<LocationConfigSecondary>,
557 :
558 : // Shard parameters: if shard_count is nonzero, then other shard_* fields
559 : // must be set accurately.
560 : #[serde(default)]
561 : pub shard_number: u8,
562 : #[serde(default)]
563 : pub shard_count: u8,
564 : #[serde(default)]
565 : pub shard_stripe_size: u32,
566 :
567 : // This configuration only affects attached mode, but should be provided irrespective
568 : // of the mode, as a secondary location might transition on startup if the response
569 : // to the `/re-attach` control plane API requests it.
570 : pub tenant_conf: TenantConfig,
571 : }
572 :
/// Listing of tenant shards, each paired with its optional [`LocationConfig`].
573 0 : #[derive(Serialize, Deserialize)]
574 : pub struct LocationConfigListResponse {
// NOTE(review): the meaning of a `None` config is not visible here — confirm with the producer.
575 : pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
576 : }
577 :
/// Serialize-only status payload exposing a node id.
578 : #[derive(Serialize)]
579 : pub struct StatusResponse {
580 : pub id: NodeId,
581 : }
582 :
/// Request wrapper whose payload is exactly a [`LocationConfig`], flattened into the top level.
// NOTE(review): serde's documentation states that `deny_unknown_fields` is not supported in
// combination with `flatten`; confirm that unknown fields are actually rejected as the
// comment below intends.
583 0 : #[derive(Serialize, Deserialize, Debug)]
584 : #[serde(deny_unknown_fields)]
585 : pub struct TenantLocationConfigRequest {
586 : #[serde(flatten)]
587 : pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
588 : }
589 :
/// Request payload listing shard counts; unknown fields are rejected on deserialization.
// NOTE(review): presumably the shard counts the time-travel operation should cover — confirm.
590 0 : #[derive(Serialize, Deserialize, Debug)]
591 : #[serde(deny_unknown_fields)]
592 : pub struct TenantTimeTravelRequest {
593 : pub shard_counts: Vec<ShardCount>,
594 : }
595 :
/// Pairs a tenant shard with a node id; unknown fields are rejected on deserialization.
596 0 : #[derive(Serialize, Deserialize, Debug)]
597 : #[serde(deny_unknown_fields)]
598 : pub struct TenantShardLocation {
599 : pub shard_id: TenantShardId,
600 : pub node_id: NodeId,
601 : }
602 :
/// Response listing the shard locations, plus the stripe size when applicable.
/// Unknown fields are rejected on deserialization.
603 0 : #[derive(Serialize, Deserialize, Debug)]
604 : #[serde(deny_unknown_fields)]
605 : pub struct TenantLocationConfigResponse {
606 : pub shards: Vec<TenantShardLocation>,
607 : // If the shards' ShardCount count is >1, stripe_size will be set.
608 : pub stripe_size: Option<ShardStripeSize>,
609 : }
610 :
/// A [`TenantConfig`] addressed to a specific tenant: `tenant_id` plus the config fields
/// flattened into the same level. Dereferences to the inner `TenantConfig`.
// NOTE(review): serde's documentation states that `deny_unknown_fields` is not supported in
// combination with `flatten`; confirm that unknown fields are actually rejected as the
// comment below intends.
611 6 : #[derive(Serialize, Deserialize, Debug)]
612 : #[serde(deny_unknown_fields)]
613 : pub struct TenantConfigRequest {
614 : pub tenant_id: TenantId,
615 : #[serde(flatten)]
616 : pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
617 : }
618 :
619 : impl std::ops::Deref for TenantConfigRequest {
620 : type Target = TenantConfig;
621 :
622 0 : fn deref(&self) -> &Self::Target {
623 0 : &self.config
624 0 : }
625 : }
626 :
627 : impl TenantConfigRequest {
628 0 : pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
629 0 : let config = TenantConfig::default();
630 0 : TenantConfigRequest { tenant_id, config }
631 0 : }
632 : }
633 :
634 : /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
///
/// Serialized in adjacently tagged form: the snake_case variant name goes under `slug`
/// and the payload (only `Failed` has one) under `data`.
635 0 : #[derive(Serialize, Deserialize, Clone)]
636 : #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
637 : pub enum TenantAttachmentStatus {
638 : Maybe,
639 : Attached,
640 : Failed { reason: String },
641 : }
642 :
/// Summary of one tenant shard: its id, state, attachment status and generation.
643 0 : #[derive(Serialize, Deserialize, Clone)]
644 : pub struct TenantInfo {
645 : pub id: TenantShardId,
646 : // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
647 : pub state: TenantState,
648 : /// Sum of the size of all layer files.
649 : /// If a layer is present in both local FS and S3, it counts only once.
650 : pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
651 : pub attachment_status: TenantAttachmentStatus,
652 : pub generation: u32,
653 : }
654 :
/// [`TenantInfo`] (flattened into the same level) extended with the walredo status and
/// the ids of the tenant's timelines.
655 0 : #[derive(Serialize, Deserialize, Clone)]
656 : pub struct TenantDetails {
657 : #[serde(flatten)]
658 : pub tenant_info: TenantInfo,
659 :
660 : pub walredo: Option<WalRedoManagerStatus>,
661 :
662 : pub timelines: Vec<TimelineId>,
663 : }
664 :
665 : /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
666 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
667 : pub struct TimelineInfo {
668 : pub tenant_id: TenantShardId,
669 : pub timeline_id: TimelineId,
670 :
671 : pub ancestor_timeline_id: Option<TimelineId>,
672 : pub ancestor_lsn: Option<Lsn>,
673 : pub last_record_lsn: Lsn,
674 : pub prev_record_lsn: Option<Lsn>,
675 : pub latest_gc_cutoff_lsn: Lsn,
676 : pub disk_consistent_lsn: Lsn,
677 :
678 : /// The LSN that we have successfully uploaded to remote storage
679 : pub remote_consistent_lsn: Lsn,
680 :
681 : /// The LSN that we are advertising to safekeepers
682 : pub remote_consistent_lsn_visible: Lsn,
683 :
684 : /// The LSN from the start of the root timeline (never changes)
685 : pub initdb_lsn: Lsn,
686 :
687 : pub current_logical_size: u64,
688 : pub current_logical_size_is_accurate: bool,
689 :
690 : pub directory_entries_counts: Vec<u64>,
691 :
692 : /// Sum of the size of all layer files.
693 : /// If a layer is present in both local FS and S3, it counts only once.
694 : pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
695 : pub current_logical_size_non_incremental: Option<u64>,
696 :
697 : /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
698 : /// beyond the branch's branch point, we only count up to the branch point.
699 : pub pitr_history_size: u64,
700 :
701 : /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
702 : /// ancestor data used by this branch would have been retained anyway). If this is false, then
703 : /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
704 : /// otherwise be able to GC.
705 : pub within_ancestor_pitr: bool,
706 :
707 : pub timeline_dir_layer_file_size_sum: Option<u64>,
708 :
709 : pub wal_source_connstr: Option<String>,
710 : pub last_received_msg_lsn: Option<Lsn>,
711 : /// The timestamp (in microseconds) of the last received message
712 : pub last_received_msg_ts: Option<u128>,
713 : pub pg_version: u32,
714 :
715 : pub state: TimelineState,
716 :
717 : pub walreceiver_status: String,
718 :
719 : /// The last aux file policy being used on this timeline
720 : pub last_aux_file_policy: Option<AuxFilePolicy>,
721 : }
722 :
/// Serializable description of a layer map, split into in-memory and historic layers.
723 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
724 : pub struct LayerMapInfo {
725 : pub in_memory_layers: Vec<InMemoryLayerInfo>,
726 : pub historic_layers: Vec<HistoricLayerInfo>,
727 : }
728 :
/// The kind of access performed on a layer.
///
/// Derives `enum_map::Enum` and is `#[repr(usize)]`; it is also used as the key of
/// `LayerAccessStats::access_count_by_access_kind`.
729 0 : #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)]
730 : #[repr(usize)]
731 : pub enum LayerAccessKind {
732 : GetValueReconstructData,
733 : Iter,
734 : KeyIter,
735 : Dump,
736 : }
737 :
/// Full record of a single layer access: when it happened, which task kind performed it
/// and what kind of access it was.
738 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
739 : pub struct LayerAccessStatFullDetails {
/// Access time in milliseconds since the epoch.
740 : pub when_millis_since_epoch: u64,
741 : pub task_kind: Cow<'static, str>,
742 : pub access_kind: LayerAccessKind,
743 : }
744 :
745 : /// An event that impacts the layer's residence status.
746 : #[serde_as]
747 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
748 : pub struct LayerResidenceEvent {
749 : /// The time when the event occurred.
750 : /// NB: this timestamp is captured while the residence status changes.
751 : /// So, it might be behind/ahead of the actual residence change by a short amount of time.
752 : ///
/// Serialized under the name `timestamp_millis_since_epoch` using
/// `serde_with::TimestampMilliSeconds`.
753 : #[serde(rename = "timestamp_millis_since_epoch")]
754 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
755 : pub timestamp: SystemTime,
756 : /// The new residence status of the layer.
757 : pub status: LayerResidenceStatus,
758 : /// The reason why we had to record this event.
759 : pub reason: LayerResidenceEventReason,
760 : }
761 :
762 : /// The reason for recording a given [`LayerResidenceEvent`].
763 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
764 : pub enum LayerResidenceEventReason {
765 : /// The layer map is being populated, e.g. during timeline load or attach.
766 : /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
767 : /// We need to record such events because there is no persistent storage for the events.
768 : ///
// The link targets below are spelled out as relative URLs rather than intra-doc paths;
// see https://github.com/rust-lang/rust/issues/74481
769 : // https://github.com/rust-lang/rust/issues/74481
770 : /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
771 : /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
772 : LayerLoad,
773 : /// We just created the layer (e.g., freeze_and_flush or compaction).
774 : /// Such layers are always [`LayerResidenceStatus::Resident`].
775 : LayerCreate,
776 : /// We on-demand downloaded or evicted the given layer.
777 : ResidenceChange,
778 : }
779 :
780 : /// The residence status of the layer, after the given [`LayerResidenceEvent`].
781 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
782 : pub enum LayerResidenceStatus {
783 : /// Residence status for a layer file that exists locally.
784 : /// It may also exist on the remote; that does not matter for this status.
785 : Resident,
786 : /// Residence status for a layer file that only exists on the remote.
787 : Evicted,
788 : }
789 :
790 : impl LayerResidenceEvent {
791 3228 : pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self {
792 3228 : Self {
793 3228 : status,
794 3228 : reason,
795 3228 : timestamp: SystemTime::now(),
796 3228 : }
797 3228 : }
798 : }
799 :
/// Serializable access statistics of a layer: per-kind access counts, the first access,
/// and bounded histories (16 entries each) of accesses and residence events.
800 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
801 : pub struct LayerAccessStats {
802 : pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
803 : pub task_kind_access_flag: Vec<Cow<'static, str>>,
804 : pub first: Option<LayerAccessStatFullDetails>,
805 : pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
806 : pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
807 : }
808 :
/// Description of an in-memory layer. Internally tagged: the variant name is stored
/// under `kind` next to the variant's fields.
809 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
810 : #[serde(tag = "kind")]
811 : pub enum InMemoryLayerInfo {
/// An open layer: only its start LSN is recorded.
812 : Open { lsn_start: Lsn },
/// A frozen layer: both its start and end LSN are recorded.
813 : Frozen { lsn_start: Lsn, lsn_end: Lsn },
814 : }
815 :
/// Description of a historic layer, either a delta or an image layer. Internally tagged:
/// the variant name is stored under `kind` next to the variant's fields.
///
/// Both variants carry the layer file name and size, a start LSN, a `remote` flag and
/// access statistics; `Delta` additionally has an end LSN and an `l0` flag.
816 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
817 : #[serde(tag = "kind")]
818 : pub enum HistoricLayerInfo {
819 : Delta {
820 : layer_file_name: String,
821 : layer_file_size: u64,
822 :
823 : lsn_start: Lsn,
824 : lsn_end: Lsn,
825 : remote: bool,
826 : access_stats: LayerAccessStats,
827 :
828 : l0: bool,
829 : },
830 : Image {
831 : layer_file_name: String,
832 : layer_file_size: u64,
833 :
834 : lsn_start: Lsn,
835 : remote: bool,
836 : access_stats: LayerAccessStats,
837 : },
838 : }
839 :
840 : impl HistoricLayerInfo {
841 0 : pub fn layer_file_name(&self) -> &str {
842 0 : match self {
843 : HistoricLayerInfo::Delta {
844 0 : layer_file_name, ..
845 0 : } => layer_file_name,
846 : HistoricLayerInfo::Image {
847 0 : layer_file_name, ..
848 0 : } => layer_file_name,
849 : }
850 0 : }
851 0 : pub fn is_remote(&self) -> bool {
852 0 : match self {
853 0 : HistoricLayerInfo::Delta { remote, .. } => *remote,
854 0 : HistoricLayerInfo::Image { remote, .. } => *remote,
855 : }
856 0 : }
857 0 : pub fn set_remote(&mut self, value: bool) {
858 0 : let field = match self {
859 0 : HistoricLayerInfo::Delta { remote, .. } => remote,
860 0 : HistoricLayerInfo::Image { remote, .. } => remote,
861 : };
862 0 : *field = value;
863 0 : }
864 0 : pub fn layer_file_size(&self) -> u64 {
865 0 : match self {
866 : HistoricLayerInfo::Delta {
867 0 : layer_file_size, ..
868 0 : } => *layer_file_size,
869 : HistoricLayerInfo::Image {
870 0 : layer_file_size, ..
871 0 : } => *layer_file_size,
872 : }
873 0 : }
874 : }
875 :
/// Request to spawn a remote-layer download task with the given download concurrency
/// (at least 1, enforced by `NonZeroUsize`).
876 0 : #[derive(Debug, Serialize, Deserialize)]
877 : pub struct DownloadRemoteLayersTaskSpawnRequest {
878 : pub max_concurrent_downloads: NonZeroUsize,
879 : }
880 :
/// Request carrying aux files as a string-to-string map.
// NOTE(review): presumably file path -> file content; confirm against the handler.
881 0 : #[derive(Debug, Serialize, Deserialize)]
882 : pub struct IngestAuxFilesRequest {
883 : pub aux_files: HashMap<String, String>,
884 : }
885 :
/// Request to list aux files, parameterized by an `lsn`.
886 0 : #[derive(Debug, Serialize, Deserialize)]
887 : pub struct ListAuxFilesRequest {
888 : pub lsn: Lsn,
889 : }
890 :
891 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
892 : pub struct DownloadRemoteLayersTaskInfo {
893 : pub task_id: String,
894 : pub state: DownloadRemoteLayersTaskState,
895 : pub total_layer_count: u64, // stable once `completed`
896 : pub successful_download_count: u64, // stable once `completed`
897 : pub failed_download_count: u64, // stable once `completed`
898 : }
899 :
900 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
901 : pub enum DownloadRemoteLayersTaskState {
902 : Running,
903 : Completed,
904 : ShutDown,
905 : }
906 :
907 0 : #[derive(Debug, Serialize, Deserialize)]
908 : pub struct TimelineGcRequest {
909 : pub gc_horizon: Option<u64>,
910 : }
911 :
912 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
913 : pub struct WalRedoManagerProcessStatus {
914 : pub pid: u32,
915 : }
916 :
917 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
918 : pub struct WalRedoManagerStatus {
919 : pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
920 : pub process: Option<WalRedoManagerProcessStatus>,
921 : }
922 :
923 : /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
924 : /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
925 : /// what's happening.
926 0 : #[derive(Default, Debug, Serialize, Deserialize, Clone)]
927 : pub struct SecondaryProgress {
928 : /// The remote storage LastModified time of the heatmap object we last downloaded.
929 : pub heatmap_mtime: Option<serde_system_time::SystemTime>,
930 :
931 : /// The number of layers currently on-disk
932 : pub layers_downloaded: usize,
933 : /// The number of layers in the most recently seen heatmap
934 : pub layers_total: usize,
935 :
936 : /// The number of layer bytes currently on-disk
937 : pub bytes_downloaded: u64,
938 : /// The number of layer bytes in the most recently seen heatmap
939 : pub bytes_total: u64,
940 : }
941 :
942 0 : #[derive(Serialize, Deserialize, Debug)]
943 : pub struct TenantScanRemoteStorageShard {
944 : pub tenant_shard_id: TenantShardId,
945 : pub generation: Option<u32>,
946 : }
947 :
948 0 : #[derive(Serialize, Deserialize, Debug, Default)]
949 : pub struct TenantScanRemoteStorageResponse {
950 : pub shards: Vec<TenantScanRemoteStorageShard>,
951 : }
952 :
953 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
954 : #[serde(rename_all = "snake_case")]
955 : pub enum TenantSorting {
956 : ResidentSize,
957 : MaxLogicalSize,
958 : }
959 :
960 : impl Default for TenantSorting {
961 0 : fn default() -> Self {
962 0 : Self::ResidentSize
963 0 : }
964 : }
965 :
966 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
967 : pub struct TopTenantShardsRequest {
968 : // How would you like to sort the tenants?
969 : pub order_by: TenantSorting,
970 :
971 : // How many results?
972 : pub limit: usize,
973 :
974 : // Omit tenants with more than this many shards (e.g. if this is the max number of shards
975 : // that the caller would ever split to)
976 : pub where_shards_lt: Option<ShardCount>,
977 :
978 : // Omit tenants where the ordering metric is less than this (this is an optimization to
979 : // let us quickly exclude numerous tiny shards)
980 : pub where_gt: Option<u64>,
981 : }
982 :
983 0 : #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
984 : pub struct TopTenantShardItem {
985 : pub id: TenantShardId,
986 :
987 : /// Total size of layers on local disk for all timelines in this tenant
988 : pub resident_size: u64,
989 :
990 : /// Total size of layers in remote storage for all timelines in this tenant
991 : pub physical_size: u64,
992 :
993 : /// The largest logical size of a timeline within this tenant
994 : pub max_logical_size: u64,
995 : }
996 :
997 0 : #[derive(Serialize, Deserialize, Debug, Default)]
998 : pub struct TopTenantShardsResponse {
999 : pub shards: Vec<TopTenantShardItem>,
1000 : }
1001 :
1002 : pub mod virtual_file {
1003 : #[derive(
1004 : Copy,
1005 : Clone,
1006 : PartialEq,
1007 : Eq,
1008 : Hash,
1009 356 : strum_macros::EnumString,
1010 0 : strum_macros::Display,
1011 0 : serde_with::DeserializeFromStr,
1012 : serde_with::SerializeDisplay,
1013 : Debug,
1014 : )]
1015 : #[strum(serialize_all = "kebab-case")]
1016 : pub enum IoEngineKind {
1017 : StdFs,
1018 : #[cfg(target_os = "linux")]
1019 : TokioEpollUring,
1020 : }
1021 : }
1022 :
1023 : // Wrapped in libpq CopyData
1024 : #[derive(PartialEq, Eq, Debug)]
1025 : pub enum PagestreamFeMessage {
1026 : Exists(PagestreamExistsRequest),
1027 : Nblocks(PagestreamNblocksRequest),
1028 : GetPage(PagestreamGetPageRequest),
1029 : DbSize(PagestreamDbSizeRequest),
1030 : GetSlruSegment(PagestreamGetSlruSegmentRequest),
1031 : }
1032 :
1033 : // Wrapped in libpq CopyData
1034 0 : #[derive(strum_macros::EnumProperty)]
1035 : pub enum PagestreamBeMessage {
1036 : Exists(PagestreamExistsResponse),
1037 : Nblocks(PagestreamNblocksResponse),
1038 : GetPage(PagestreamGetPageResponse),
1039 : Error(PagestreamErrorResponse),
1040 : DbSize(PagestreamDbSizeResponse),
1041 : GetSlruSegment(PagestreamGetSlruSegmentResponse),
1042 : }
1043 :
/// Wire tag (first byte) of each `PagestreamBeMessage` variant.
///
/// Keep in sync with `pagestore_client.h`.
#[repr(u8)]
enum PagestreamBeMessageTag {
    Exists = 100,
    Nblocks = 101,
    GetPage = 102,
    Error = 103,
    DbSize = 104,
    GetSlruSegment = 105,
}

impl TryFrom<u8> for PagestreamBeMessageTag {
    type Error = u8;

    /// Maps a raw tag byte to its enum value; an unrecognized byte is handed
    /// back unchanged as the error.
    fn try_from(value: u8) -> Result<Self, u8> {
        // The alias also sidesteps the `Self::Error` clash between the
        // associated type and the enum variant of the same name.
        use PagestreamBeMessageTag as Tag;

        // Compare against the discriminants so the numeric values are only
        // spelled out once, in the enum definition.
        let tag = match value {
            raw if raw == Tag::Exists as u8 => Tag::Exists,
            raw if raw == Tag::Nblocks as u8 => Tag::Nblocks,
            raw if raw == Tag::GetPage as u8 => Tag::GetPage,
            raw if raw == Tag::Error as u8 => Tag::Error,
            raw if raw == Tag::DbSize as u8 => Tag::DbSize,
            raw if raw == Tag::GetSlruSegment as u8 => Tag::GetSlruSegment,
            unknown => return Err(unknown),
        };
        Ok(tag)
    }
}
1068 :
1069 : // In the V2 protocol version, a GetPage request contains two LSN values:
1070 : //
1071 : // request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
1072 : // "get the latest version present". It's used by the primary server, which knows that no one else
1073 : // is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
1074 : // Lsn::Max. Standby servers use the current replay LSN as the request LSN.
1075 : //
1076 : // not_modified_since: Hint to the pageserver that the client knows that the page has not been
1077 : // modified between 'not_modified_since' and the request LSN. It's always correct to set
1078 : // 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
1079 : // passing an earlier LSN can speed up the request, by allowing the pageserver to process the
1080 : // request without waiting for 'request_lsn' to arrive.
1081 : //
1082 : // The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
1083 : // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
1084 : // 'latest' was set to true. The V2 interface was added because there was no correct way for a
1085 : // standby to request a page at a particular non-latest LSN, and also include the
1086 : // 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
1087 : // request, if the standby knows that the page hasn't been modified since, and risk getting an error
1088 : // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
1089 : // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
1090 : // interface allows sending both LSNs, and let the pageserver do the right thing. There is no
1091 : // difference in the responses between V1 and V2.
1092 : //
1093 : // The Request structs below reflect the V2 interface. If V1 is used, the parse function
1094 : // maps the old format requests to the new format.
1095 : //
/// Version of the pagestream request encoding, passed to
/// `PagestreamFeMessage::parse` to select how the LSN header is decoded.
#[derive(Copy, Clone)]
pub enum PagestreamProtocolVersion {
    /// Legacy format: a `latest` flag byte followed by a single LSN.
    V1,
    /// Current format: `request_lsn` followed by `not_modified_since`.
    V2,
}
1101 :
1102 : #[derive(Debug, PartialEq, Eq)]
1103 : pub struct PagestreamExistsRequest {
1104 : pub request_lsn: Lsn,
1105 : pub not_modified_since: Lsn,
1106 : pub rel: RelTag,
1107 : }
1108 :
1109 : #[derive(Debug, PartialEq, Eq)]
1110 : pub struct PagestreamNblocksRequest {
1111 : pub request_lsn: Lsn,
1112 : pub not_modified_since: Lsn,
1113 : pub rel: RelTag,
1114 : }
1115 :
1116 : #[derive(Debug, PartialEq, Eq)]
1117 : pub struct PagestreamGetPageRequest {
1118 : pub request_lsn: Lsn,
1119 : pub not_modified_since: Lsn,
1120 : pub rel: RelTag,
1121 : pub blkno: u32,
1122 : }
1123 :
1124 : #[derive(Debug, PartialEq, Eq)]
1125 : pub struct PagestreamDbSizeRequest {
1126 : pub request_lsn: Lsn,
1127 : pub not_modified_since: Lsn,
1128 : pub dbnode: u32,
1129 : }
1130 :
1131 : #[derive(Debug, PartialEq, Eq)]
1132 : pub struct PagestreamGetSlruSegmentRequest {
1133 : pub request_lsn: Lsn,
1134 : pub not_modified_since: Lsn,
1135 : pub kind: u8,
1136 : pub segno: u32,
1137 : }
1138 :
/// Payload of a `PagestreamBeMessage::Exists` response.
#[derive(Debug)]
pub struct PagestreamExistsResponse {
    /// Result flag; encoded on the wire as a single byte.
    pub exists: bool,
}
1143 :
/// Payload of a `PagestreamBeMessage::Nblocks` response.
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
    /// Number of blocks; encoded on the wire as a big-endian `u32`.
    pub n_blocks: u32,
}
1148 :
1149 : #[derive(Debug)]
1150 : pub struct PagestreamGetPageResponse {
1151 : pub page: Bytes,
1152 : }
1153 :
1154 : #[derive(Debug)]
1155 : pub struct PagestreamGetSlruSegmentResponse {
1156 : pub segment: Bytes,
1157 : }
1158 :
/// Payload of a `PagestreamBeMessage::Error` response.
#[derive(Debug)]
pub struct PagestreamErrorResponse {
    /// Error text; encoded on the wire as a NUL-terminated string.
    pub message: String,
}
1163 :
/// Payload of a `PagestreamBeMessage::DbSize` response.
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
    /// Database size; encoded on the wire as a big-endian `i64`.
    pub db_size: i64,
}
1168 :
1169 : // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
1170 : // that require pageserver-internal types. It is sufficient to get the total size.
1171 0 : #[derive(Serialize, Deserialize, Debug)]
1172 : pub struct TenantHistorySize {
1173 : pub id: TenantId,
1174 : /// Size is a mixture of WAL and logical size, so the unit is bytes.
1175 : ///
1176 : /// Will be none if `?inputs_only=true` was given.
1177 : pub size: Option<u64>,
1178 : }
1179 :
1180 : impl PagestreamFeMessage {
1181 : /// Serialize a compute -> pageserver message. This is currently only used in testing
1182 : /// tools. Always uses protocol version 2.
1183 8 : pub fn serialize(&self) -> Bytes {
1184 8 : let mut bytes = BytesMut::new();
1185 8 :
1186 8 : match self {
1187 2 : Self::Exists(req) => {
1188 2 : bytes.put_u8(0);
1189 2 : bytes.put_u64(req.request_lsn.0);
1190 2 : bytes.put_u64(req.not_modified_since.0);
1191 2 : bytes.put_u32(req.rel.spcnode);
1192 2 : bytes.put_u32(req.rel.dbnode);
1193 2 : bytes.put_u32(req.rel.relnode);
1194 2 : bytes.put_u8(req.rel.forknum);
1195 2 : }
1196 :
1197 2 : Self::Nblocks(req) => {
1198 2 : bytes.put_u8(1);
1199 2 : bytes.put_u64(req.request_lsn.0);
1200 2 : bytes.put_u64(req.not_modified_since.0);
1201 2 : bytes.put_u32(req.rel.spcnode);
1202 2 : bytes.put_u32(req.rel.dbnode);
1203 2 : bytes.put_u32(req.rel.relnode);
1204 2 : bytes.put_u8(req.rel.forknum);
1205 2 : }
1206 :
1207 2 : Self::GetPage(req) => {
1208 2 : bytes.put_u8(2);
1209 2 : bytes.put_u64(req.request_lsn.0);
1210 2 : bytes.put_u64(req.not_modified_since.0);
1211 2 : bytes.put_u32(req.rel.spcnode);
1212 2 : bytes.put_u32(req.rel.dbnode);
1213 2 : bytes.put_u32(req.rel.relnode);
1214 2 : bytes.put_u8(req.rel.forknum);
1215 2 : bytes.put_u32(req.blkno);
1216 2 : }
1217 :
1218 2 : Self::DbSize(req) => {
1219 2 : bytes.put_u8(3);
1220 2 : bytes.put_u64(req.request_lsn.0);
1221 2 : bytes.put_u64(req.not_modified_since.0);
1222 2 : bytes.put_u32(req.dbnode);
1223 2 : }
1224 :
1225 0 : Self::GetSlruSegment(req) => {
1226 0 : bytes.put_u8(4);
1227 0 : bytes.put_u64(req.request_lsn.0);
1228 0 : bytes.put_u64(req.not_modified_since.0);
1229 0 : bytes.put_u8(req.kind);
1230 0 : bytes.put_u32(req.segno);
1231 0 : }
1232 : }
1233 :
1234 8 : bytes.into()
1235 8 : }
1236 :
1237 8 : pub fn parse<R: std::io::Read>(
1238 8 : body: &mut R,
1239 8 : protocol_version: PagestreamProtocolVersion,
1240 8 : ) -> anyhow::Result<PagestreamFeMessage> {
1241 : // these correspond to the NeonMessageTag enum in pagestore_client.h
1242 : //
1243 : // TODO: consider using protobuf or serde bincode for less error prone
1244 : // serialization.
1245 8 : let msg_tag = body.read_u8()?;
1246 :
1247 8 : let (request_lsn, not_modified_since) = match protocol_version {
1248 : PagestreamProtocolVersion::V2 => (
1249 8 : Lsn::from(body.read_u64::<BigEndian>()?),
1250 8 : Lsn::from(body.read_u64::<BigEndian>()?),
1251 : ),
1252 : PagestreamProtocolVersion::V1 => {
1253 : // In the old protocol, each message starts with a boolean 'latest' flag,
1254 : // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
1255 : // 'not_modified_since', used in the new protocol version.
1256 0 : let latest = body.read_u8()? != 0;
1257 0 : let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
1258 0 : if latest {
1259 0 : (Lsn::MAX, request_lsn) // get latest version
1260 : } else {
1261 0 : (request_lsn, request_lsn) // get version at specified LSN
1262 : }
1263 : }
1264 : };
1265 :
1266 : // The rest of the messages are the same between V1 and V2
1267 8 : match msg_tag {
1268 : 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
1269 2 : request_lsn,
1270 2 : not_modified_since,
1271 2 : rel: RelTag {
1272 2 : spcnode: body.read_u32::<BigEndian>()?,
1273 2 : dbnode: body.read_u32::<BigEndian>()?,
1274 2 : relnode: body.read_u32::<BigEndian>()?,
1275 2 : forknum: body.read_u8()?,
1276 : },
1277 : })),
1278 : 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1279 2 : request_lsn,
1280 2 : not_modified_since,
1281 2 : rel: RelTag {
1282 2 : spcnode: body.read_u32::<BigEndian>()?,
1283 2 : dbnode: body.read_u32::<BigEndian>()?,
1284 2 : relnode: body.read_u32::<BigEndian>()?,
1285 2 : forknum: body.read_u8()?,
1286 : },
1287 : })),
1288 : 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1289 2 : request_lsn,
1290 2 : not_modified_since,
1291 2 : rel: RelTag {
1292 2 : spcnode: body.read_u32::<BigEndian>()?,
1293 2 : dbnode: body.read_u32::<BigEndian>()?,
1294 2 : relnode: body.read_u32::<BigEndian>()?,
1295 2 : forknum: body.read_u8()?,
1296 : },
1297 2 : blkno: body.read_u32::<BigEndian>()?,
1298 : })),
1299 : 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1300 2 : request_lsn,
1301 2 : not_modified_since,
1302 2 : dbnode: body.read_u32::<BigEndian>()?,
1303 : })),
1304 : 4 => Ok(PagestreamFeMessage::GetSlruSegment(
1305 : PagestreamGetSlruSegmentRequest {
1306 0 : request_lsn,
1307 0 : not_modified_since,
1308 0 : kind: body.read_u8()?,
1309 0 : segno: body.read_u32::<BigEndian>()?,
1310 : },
1311 : )),
1312 0 : _ => bail!("unknown smgr message tag: {:?}", msg_tag),
1313 : }
1314 8 : }
1315 : }
1316 :
1317 : impl PagestreamBeMessage {
1318 0 : pub fn serialize(&self) -> Bytes {
1319 0 : let mut bytes = BytesMut::new();
1320 0 :
1321 0 : use PagestreamBeMessageTag as Tag;
1322 0 : match self {
1323 0 : Self::Exists(resp) => {
1324 0 : bytes.put_u8(Tag::Exists as u8);
1325 0 : bytes.put_u8(resp.exists as u8);
1326 0 : }
1327 :
1328 0 : Self::Nblocks(resp) => {
1329 0 : bytes.put_u8(Tag::Nblocks as u8);
1330 0 : bytes.put_u32(resp.n_blocks);
1331 0 : }
1332 :
1333 0 : Self::GetPage(resp) => {
1334 0 : bytes.put_u8(Tag::GetPage as u8);
1335 0 : bytes.put(&resp.page[..]);
1336 0 : }
1337 :
1338 0 : Self::Error(resp) => {
1339 0 : bytes.put_u8(Tag::Error as u8);
1340 0 : bytes.put(resp.message.as_bytes());
1341 0 : bytes.put_u8(0); // null terminator
1342 0 : }
1343 0 : Self::DbSize(resp) => {
1344 0 : bytes.put_u8(Tag::DbSize as u8);
1345 0 : bytes.put_i64(resp.db_size);
1346 0 : }
1347 :
1348 0 : Self::GetSlruSegment(resp) => {
1349 0 : bytes.put_u8(Tag::GetSlruSegment as u8);
1350 0 : bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
1351 0 : bytes.put(&resp.segment[..]);
1352 0 : }
1353 : }
1354 :
1355 0 : bytes.into()
1356 0 : }
1357 :
1358 0 : pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
1359 0 : let mut buf = buf.reader();
1360 0 : let msg_tag = buf.read_u8()?;
1361 :
1362 : use PagestreamBeMessageTag as Tag;
1363 0 : let ok =
1364 0 : match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
1365 : Tag::Exists => {
1366 0 : let exists = buf.read_u8()?;
1367 0 : Self::Exists(PagestreamExistsResponse {
1368 0 : exists: exists != 0,
1369 0 : })
1370 : }
1371 : Tag::Nblocks => {
1372 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1373 0 : Self::Nblocks(PagestreamNblocksResponse { n_blocks })
1374 : }
1375 : Tag::GetPage => {
1376 0 : let mut page = vec![0; 8192]; // TODO: use MaybeUninit
1377 0 : buf.read_exact(&mut page)?;
1378 0 : PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
1379 : }
1380 : Tag::Error => {
1381 0 : let mut msg = Vec::new();
1382 0 : buf.read_until(0, &mut msg)?;
1383 0 : let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
1384 0 : let rust_str = cstring.to_str()?;
1385 0 : PagestreamBeMessage::Error(PagestreamErrorResponse {
1386 0 : message: rust_str.to_owned(),
1387 0 : })
1388 : }
1389 : Tag::DbSize => {
1390 0 : let db_size = buf.read_i64::<BigEndian>()?;
1391 0 : Self::DbSize(PagestreamDbSizeResponse { db_size })
1392 : }
1393 : Tag::GetSlruSegment => {
1394 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1395 0 : let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
1396 0 : buf.read_exact(&mut segment)?;
1397 0 : Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
1398 0 : segment: segment.into(),
1399 0 : })
1400 : }
1401 : };
1402 0 : let remaining = buf.into_inner();
1403 0 : if !remaining.is_empty() {
1404 0 : anyhow::bail!(
1405 0 : "remaining bytes in msg with tag={msg_tag}: {}",
1406 0 : remaining.len()
1407 0 : );
1408 0 : }
1409 0 : Ok(ok)
1410 0 : }
1411 :
1412 0 : pub fn kind(&self) -> &'static str {
1413 0 : match self {
1414 0 : Self::Exists(_) => "Exists",
1415 0 : Self::Nblocks(_) => "Nblocks",
1416 0 : Self::GetPage(_) => "GetPage",
1417 0 : Self::Error(_) => "Error",
1418 0 : Self::DbSize(_) => "DbSize",
1419 0 : Self::GetSlruSegment(_) => "GetSlruSegment",
1420 : }
1421 0 : }
1422 : }
1423 :
#[cfg(test)]
mod tests {
    use serde_json::json;
    use std::str::FromStr;

    use super::*;

    /// Relation tag shared by the pagestream request cases below.
    fn sample_rel() -> RelTag {
        RelTag {
            forknum: 1,
            spcnode: 2,
            dbnode: 3,
            relnode: 4,
        }
    }

    #[test]
    fn test_pagestream() {
        // Round-trip every PagestreamFeMessage variant through serialize/parse (V2).
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: sample_rel(),
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(4),
                rel: sample_rel(),
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: sample_rel(),
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                dbnode: 7,
            }),
            PagestreamFeMessage::GetSlruSegment(PagestreamGetSlruSegmentRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                kind: 0,
                segno: 5,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
            let reconstructed =
                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
                    .unwrap();
            assert_eq!(msg, reconstructed);
        }
    }

    #[test]
    fn test_pagestream_parse_v1() {
        // V1 layout: tag, 'latest' flag byte, single LSN, then the variant fields.
        let encode = |latest: bool| {
            let mut raw = BytesMut::new();
            raw.put_u8(3); // DbSize
            raw.put_u8(latest as u8);
            raw.put_u64(0x10);
            raw.put_u32(7);
            raw.freeze()
        };

        let latest =
            PagestreamFeMessage::parse(&mut encode(true).reader(), PagestreamProtocolVersion::V1)
                .unwrap();
        assert_eq!(
            latest,
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                request_lsn: Lsn::MAX,
                not_modified_since: Lsn(0x10),
                dbnode: 7,
            })
        );

        let at_lsn =
            PagestreamFeMessage::parse(&mut encode(false).reader(), PagestreamProtocolVersion::V1)
                .unwrap();
        assert_eq!(
            at_lsn,
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                request_lsn: Lsn(0x10),
                not_modified_since: Lsn(0x10),
                dbnode: 7,
            })
        );
    }

    #[test]
    fn test_pagestream_parse_unknown_tag() {
        let mut raw = BytesMut::new();
        raw.put_u8(200);
        raw.put_u64(4);
        raw.put_u64(3);
        let err =
            PagestreamFeMessage::parse(&mut raw.freeze().reader(), PagestreamProtocolVersion::V2)
                .unwrap_err();
        assert!(
            err.to_string().contains("unknown smgr message tag"),
            "unexpected error: {err}"
        );
    }

    #[test]
    fn test_pagestream_be_roundtrip() {
        // PagestreamBeMessage has no PartialEq/Debug, so compare payloads per variant.
        let msg = PagestreamBeMessage::Exists(PagestreamExistsResponse { exists: true });
        match PagestreamBeMessage::deserialize(msg.serialize()).unwrap() {
            PagestreamBeMessage::Exists(resp) => assert!(resp.exists),
            other => panic!("unexpected message kind: {}", other.kind()),
        }

        let msg = PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks: 42 });
        match PagestreamBeMessage::deserialize(msg.serialize()).unwrap() {
            PagestreamBeMessage::Nblocks(resp) => assert_eq!(resp.n_blocks, 42),
            other => panic!("unexpected message kind: {}", other.kind()),
        }

        let page = Bytes::from(vec![0xAB; BLCKSZ as usize]);
        let msg = PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.clone() });
        match PagestreamBeMessage::deserialize(msg.serialize()).unwrap() {
            PagestreamBeMessage::GetPage(resp) => assert_eq!(resp.page, page),
            other => panic!("unexpected message kind: {}", other.kind()),
        }

        let msg = PagestreamBeMessage::Error(PagestreamErrorResponse {
            message: "boom".to_string(),
        });
        match PagestreamBeMessage::deserialize(msg.serialize()).unwrap() {
            PagestreamBeMessage::Error(resp) => assert_eq!(resp.message, "boom"),
            other => panic!("unexpected message kind: {}", other.kind()),
        }

        let msg = PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size: -1 });
        match PagestreamBeMessage::deserialize(msg.serialize()).unwrap() {
            PagestreamBeMessage::DbSize(resp) => assert_eq!(resp.db_size, -1),
            other => panic!("unexpected message kind: {}", other.kind()),
        }

        let segment = Bytes::from(vec![0xCD; 2 * BLCKSZ as usize]);
        let msg = PagestreamBeMessage::GetSlruSegment(PagestreamGetSlruSegmentResponse {
            segment: segment.clone(),
        });
        match PagestreamBeMessage::deserialize(msg.serialize()).unwrap() {
            PagestreamBeMessage::GetSlruSegment(resp) => assert_eq!(resp.segment, segment),
            other => panic!("unexpected message kind: {}", other.kind()),
        }
    }

    #[test]
    fn test_pagestream_be_rejects_malformed() {
        // Unknown tag byte.
        assert!(PagestreamBeMessage::deserialize(Bytes::from_static(&[0])).is_err());

        // Trailing garbage after a complete message.
        let msg = PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks: 1 });
        let mut raw = BytesMut::from(&msg.serialize()[..]);
        raw.put_u8(0);
        assert!(PagestreamBeMessage::deserialize(raw.freeze()).is_err());

        // SLRU segment that advertises more blocks than it carries.
        let mut raw = BytesMut::new();
        raw.put_u8(105);
        raw.put_u32(4);
        assert!(PagestreamBeMessage::deserialize(raw.freeze()).is_err());
    }

    #[test]
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
            "state": {
                "slug": "Active",
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        let original_broken = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
            "state": {
                "slug": "Broken",
                "data": {
                    "backtrace": "backtrace info",
                    "reason": "reason",
                }
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        assert_eq!(
            serde_json::to_value(&original_active).unwrap(),
            expected_active
        );

        assert_eq!(
            serde_json::to_value(&original_broken).unwrap(),
            expected_broken
        );
        assert!(format!("{:?}", &original_broken.state).contains("reason"));
        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
    }

    #[test]
    fn test_reject_unknown_field() {
        let id = TenantId::generate();
        let config_request = json!({
            "tenant_id": id.to_string(),
            "unknown_field": "unknown_value".to_string(),
        });
        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
            "expect unknown field `unknown_field` error, got: {}",
            err
        );
    }

    #[test]
    fn tenantstatus_activating_serde() {
        let states = [
            TenantState::Activating(ActivatingFrom::Loading),
            TenantState::Activating(ActivatingFrom::Attaching),
        ];
        let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";

        let actual = serde_json::to_string(&states).unwrap();

        assert_eq!(actual, expected);

        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();

        assert_eq!(states.as_slice(), &parsed);
    }

    #[test]
    fn tenantstatus_activating_strum() {
        // tests added, because we use these for metrics
        let examples = [
            (line!(), TenantState::Loading, "Loading"),
            (line!(), TenantState::Attaching, "Attaching"),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Loading),
                "Activating",
            ),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Attaching),
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
            (
                line!(),
                TenantState::Stopping {
                    progress: utils::completion::Barrier::default(),
                },
                "Stopping",
            ),
            (
                line!(),
                TenantState::Broken {
                    reason: "Example".into(),
                    backtrace: "Looooong backtrace".into(),
                },
                "Broken",
            ),
        ];

        for (line, rendered, expected) in examples {
            let actual: &'static str = rendered.into();
            assert_eq!(actual, expected, "example on {line}");
        }
    }

    #[test]
    fn test_aux_file_migration_path() {
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::V1
        ));
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::V2
        ));
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::CrossValidation
        ));
        // Self-migration is not a valid migration path, and the caller should handle it by itself.
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::V2
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::CrossValidation
        ));
        // Migrations not allowed
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::V2
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::CrossValidation
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::CrossValidation
        ));
        // Migrations allowed
        assert!(AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::V2
        ));
    }

    #[test]
    fn test_aux_parse() {
        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(
            AuxFilePolicy::from_str("cross-validation").unwrap(),
            AuxFilePolicy::CrossValidation
        );
    }

    #[test]
    fn test_image_compression_algorithm_parsing() {
        use ImageCompressionAlgorithm::*;
        assert_eq!(
            ImageCompressionAlgorithm::from_str("disabled").unwrap(),
            Disabled
        );
        assert_eq!(
            ImageCompressionAlgorithm::from_str("disabled-no-decompress").unwrap(),
            DisabledNoDecompress
        );
        assert_eq!(
            ImageCompressionAlgorithm::from_str("zstd").unwrap(),
            Zstd { level: None }
        );
        assert_eq!(
            ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
            Zstd { level: Some(18) }
        );
        assert_eq!(
            ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
            Zstd { level: Some(-3) }
        );
    }
}
|