Line data Source code
1 : pub mod detach_ancestor;
2 : pub mod partitioning;
3 : pub mod utilization;
4 :
5 : #[cfg(feature = "testing")]
6 : use camino::Utf8PathBuf;
7 : pub use utilization::PageserverUtilization;
8 :
9 : use std::{
10 : collections::HashMap,
11 : fmt::Display,
12 : io::{BufRead, Read},
13 : num::{NonZeroU32, NonZeroU64, NonZeroUsize},
14 : str::FromStr,
15 : time::{Duration, SystemTime},
16 : };
17 :
18 : use byteorder::{BigEndian, ReadBytesExt};
19 : use postgres_ffi::BLCKSZ;
20 : use serde::{Deserialize, Serialize};
21 : use serde_with::serde_as;
22 : use utils::{
23 : completion,
24 : id::{NodeId, TenantId, TimelineId},
25 : lsn::Lsn,
26 : serde_system_time,
27 : };
28 :
29 : use crate::{
30 : reltag::RelTag,
31 : shard::{ShardCount, ShardStripeSize, TenantShardId},
32 : };
33 : use anyhow::bail;
34 : use bytes::{Buf, BufMut, Bytes, BytesMut};
35 :
/// The state of a tenant in this pageserver.
///
/// Serialized with serde's adjacent tagging: the variant name is stored under the
/// `slug` key and the variant's payload (if any) under the `data` key.
///
/// ```mermaid
/// stateDiagram-v2
///
///     [*] --> Attaching: spawn_attach()
///
///     Attaching --> Activating: activate()
///     Activating --> Active: infallible
///
///     Attaching --> Broken: attach() failure
///
///     Active --> Stopping: set_stopping(), part of shutdown & detach
///     Stopping --> Broken: late error in remove_tenant_from_memory
///
///     Broken --> [*]: ignore / detach / shutdown
///     Stopping --> [*]: remove_from_memory complete
///
///     Active --> Broken: cfg(testing)-only tenant break point
/// ```
#[derive(
    Clone,
    PartialEq,
    Eq,
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
    strum_macros::VariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
)]
#[serde(tag = "slug", content = "data")]
pub enum TenantState {
    /// This tenant is being attached to the pageserver.
    ///
    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Attaching,
    /// The tenant is transitioning from Attaching to Active.
    ///
    /// The payload records the state we came from; see [`ActivatingFrom`].
    /// While in this state, the individual timelines are being activated.
    ///
    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Activating(ActivatingFrom),
    /// The tenant has finished activating and is open for business.
    ///
    /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
    Active,
    /// The tenant is recognized by pageserver, but it is being detached or the
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
    Stopping {
        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
        // otherwise it will not be skipped during deserialization.
        // `#[serde(skip)]` keeps the barrier out of the serialized form entirely.
        #[serde(skip)]
        progress: completion::Barrier,
    },
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
    /// If the tenant fails to load or attach, it will transition to this state
    /// and it is guaranteed that no background tasks are running in its name.
    ///
    /// The other way to transition into this state is from `Stopping` state
    /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
    /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
    ///
    /// `reason` is the failure description that `attachment_status()` reports back;
    /// `backtrace` is a stringified backtrace (captured by `broken_from_reason()` when
    /// the state is built through that constructor).
    Broken { reason: String, backtrace: String },
}
104 :
105 : impl TenantState {
106 0 : pub fn attachment_status(&self) -> TenantAttachmentStatus {
107 : use TenantAttachmentStatus::*;
108 :
109 : // Below TenantState::Activating is used as "transient" or "transparent" state for
110 : // attachment_status determining.
111 0 : match self {
112 : // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
113 : // So, technically, we can return Attached here.
114 : // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
115 : // But, our attach task might still be fetching the remote timelines, etc.
116 : // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
117 0 : Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
118 : // We only reach Active after successful load / attach.
119 : // So, call atttachment status Attached.
120 0 : Self::Active => Attached,
121 : // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
122 : // However, it also becomes Broken if the regular load fails.
123 : // From Console's perspective there's no practical difference
124 : // because attachment_status is polled by console only during attach operation execution.
125 0 : Self::Broken { reason, .. } => Failed {
126 0 : reason: reason.to_owned(),
127 0 : },
128 : // Why is Stopping a Maybe case? Because, during pageserver shutdown,
129 : // we set the Stopping state irrespective of whether the tenant
130 : // has finished attaching or not.
131 0 : Self::Stopping { .. } => Maybe,
132 : }
133 0 : }
134 :
135 0 : pub fn broken_from_reason(reason: String) -> Self {
136 0 : let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
137 0 : Self::Broken {
138 0 : reason,
139 0 : backtrace: backtrace_str,
140 0 : }
141 0 : }
142 : }
143 :
144 : impl std::fmt::Debug for TenantState {
145 2 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
146 2 : match self {
147 2 : Self::Broken { reason, backtrace } if !reason.is_empty() => {
148 2 : write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
149 : }
150 0 : _ => write!(f, "{self}"),
151 : }
152 2 : }
153 : }
154 :
/// A temporary lease to a specific lsn inside a timeline.
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
#[serde_as]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LsnLease {
    /// Point in time after which the lease counts as expired (see `is_expired`).
    /// (De)serialized through the `SystemTimeAsRfc3339Millis` conversion, i.e. as an
    /// RFC 3339 timestamp string.
    #[serde_as(as = "SystemTimeAsRfc3339Millis")]
    pub valid_until: SystemTime,
}
163 :
// Defines the `SystemTimeAsRfc3339Millis` conversion for use with `#[serde_as]`:
// - serializing formats the `SystemTime` as an RFC 3339 string with millisecond precision;
// - deserializing parses an RFC 3339 string back into a `SystemTime`, surfacing
//   `humantime::TimestampError` on malformed input.
serde_with::serde_conv!(
    SystemTimeAsRfc3339Millis,
    SystemTime,
    |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
);
170 :
171 : impl LsnLease {
172 : /// The default length for an explicit LSN lease request (10 minutes).
173 : pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
174 :
175 : /// The default length for an implicit LSN lease granted during
176 : /// `get_lsn_by_timestamp` request (1 minutes).
177 : pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);
178 :
179 : /// Checks whether the lease is expired.
180 6 : pub fn is_expired(&self, now: &SystemTime) -> bool {
181 6 : now > &self.valid_until
182 6 : }
183 : }
184 :
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
///
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
/// useless. Remove, once we've checked that there's no client code left that looks at this.
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ActivatingFrom {
    /// Arrived at [`TenantState::Activating`] from [`TenantState::Attaching`]
    Attaching,
}
194 :
/// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
    /// It will eventually transition to state Active or Broken.
    Loading,
    /// The timeline is fully operational.
    /// It can be queried, and the walreceiver connection loop is running.
    Active,
    /// The timeline was previously Loading or Active but is shutting down.
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
    ///
    /// NOTE(review): `reason` / `backtrace` presumably mirror the fields of
    /// `TenantState::Broken` (failure description plus stringified backtrace) — confirm
    /// against the code that constructs this variant.
    Broken { reason: String, backtrace: String },
}
211 :
/// Request body for creating a timeline.
///
/// `mode` is `#[serde(flatten)]`ed, so the fields of the chosen
/// [`TimelineCreateRequestMode`] variant sit at the same level as `new_timeline_id`
/// in the serialized form.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateRequest {
    /// Id to assign to the timeline being created.
    pub new_timeline_id: TimelineId,
    /// How the timeline is to be created; see [`TimelineCreateRequestMode`].
    #[serde(flatten)]
    pub mode: TimelineCreateRequestMode,
}
218 :
/// The different ways a timeline can be created.
///
/// The enum is `#[serde(untagged)]`: there is no discriminator in the serialized form.
/// On deserialization serde tries the variants in declaration order and picks the
/// first one whose fields match, so the order of the variants below is significant.
#[derive(Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum TimelineCreateRequestMode {
    /// Selected when `ancestor_timeline_id` is present.
    Branch {
        ancestor_timeline_id: TimelineId,
        // Optional; absent means `None`.
        #[serde(default)]
        ancestor_start_lsn: Option<Lsn>,
        // TODO: cplane sets this, but, the branching code always
        // inherits the ancestor's pg_version. Earlier code wasn't
        // using a flattened enum, so, it was an accepted field, and
        // we continue to accept it by having it here.
        pg_version: Option<u32>,
    },
    /// Selected when an `import_pgdata` object is present.
    ImportPgdata {
        import_pgdata: TimelineCreateRequestModeImportPgdata,
    },
    // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
    // (serde picks the first matching enum variant, in declaration order).
    // It must therefore stay the LAST variant, acting as the fallback.
    Bootstrap {
        #[serde(default)]
        existing_initdb_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
    },
}
243 :
/// Payload of [`TimelineCreateRequestMode::ImportPgdata`].
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateRequestModeImportPgdata {
    /// Where the data to import is located; see [`ImportPgdataLocation`].
    pub location: ImportPgdataLocation,
    /// Caller-supplied key; presumably used to recognise retries of the same
    /// import request — TODO confirm against the import implementation.
    pub idempotency_key: ImportPgdataIdempotencyKey,
}
249 :
/// Location of the data for a pgdata import.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub enum ImportPgdataLocation {
    /// A path on the local filesystem. Only compiled in with the `testing` feature.
    #[cfg(feature = "testing")]
    LocalFs { path: Utf8PathBuf },
    /// An S3 location identified by region, bucket and key.
    AwsS3 {
        region: String,
        bucket: String,
        /// A better name for this would be `prefix`; changing requires coordination with cplane.
        /// See <https://github.com/neondatabase/cloud/issues/20646>.
        key: String,
    },
}
262 :
/// Idempotency key for a pgdata import.
///
/// `#[serde(transparent)]`: serialized as the bare inner string, without a wrapper.
#[derive(Serialize, Deserialize, Clone)]
#[serde(transparent)]
pub struct ImportPgdataIdempotencyKey(pub String);
266 :
267 : impl ImportPgdataIdempotencyKey {
268 0 : pub fn random() -> Self {
269 : use rand::{distributions::Alphanumeric, Rng};
270 0 : Self(
271 0 : rand::thread_rng()
272 0 : .sample_iter(&Alphanumeric)
273 0 : .take(20)
274 0 : .map(char::from)
275 0 : .collect(),
276 0 : )
277 0 : }
278 : }
279 :
/// Request body naming a single LSN.
///
/// NOTE(review): judging by the name this is the LSN a lease is requested for
/// (see [`LsnLease`]) — confirm against the HTTP handler.
#[derive(Serialize, Deserialize, Clone)]
pub struct LsnLeaseRequest {
    pub lsn: Lsn,
}
284 :
/// Request to split a tenant into a larger number of shards.
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitRequest {
    /// The shard count the tenant should have after the split.
    pub new_shard_count: u8,

    /// A tenant's stripe size is only meaningful the first time their shard count goes
    /// above 1: therefore during a split from 1->N shards, we may modify the stripe size.
    ///
    /// If this is set while the shard count is being increased from an already >1 value,
    /// then the request will fail with 400.
    pub new_stripe_size: Option<ShardStripeSize>,
}
296 :
/// Response to a [`TenantShardSplitRequest`].
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitResponse {
    /// Ids of the shards reported back by the split (presumably the newly created
    /// child shards — TODO confirm).
    pub new_shards: Vec<TenantShardId>,
}
301 :
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
///
/// Deserialization rejects unknown fields (`deny_unknown_fields`).
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ShardParameters {
    /// Number of shards; the `Default` impl uses `ShardCount::new(0)`.
    pub count: ShardCount,
    /// Stripe size; the `Default` impl uses [`ShardParameters::DEFAULT_STRIPE_SIZE`].
    pub stripe_size: ShardStripeSize,
}
309 :
310 : impl ShardParameters {
311 : pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
312 :
313 0 : pub fn is_unsharded(&self) -> bool {
314 0 : self.count.is_unsharded()
315 0 : }
316 : }
317 :
318 : impl Default for ShardParameters {
319 193 : fn default() -> Self {
320 193 : Self {
321 193 : count: ShardCount::new(0),
322 193 : stripe_size: Self::DEFAULT_STRIPE_SIZE,
323 193 : }
324 193 : }
325 : }
326 :
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
///
/// Every field is optional; `Default` yields a config with all fields `None`.
/// Duration-like settings (`*_period`, `*_timeout`, `*_interval`, `*_length`, ...) are
/// carried as plain `String`s here — presumably human-readable durations that are
/// parsed by the consumer; TODO confirm the accepted format.
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
    pub checkpoint_timeout: Option<String>,
    pub compaction_target_size: Option<u64>,
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
    // defer parsing compaction_algorithm, like eviction_policy
    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
    pub pitr_interval: Option<String>,
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
    pub lsn_lease_length: Option<String>,
    pub lsn_lease_length_for_ts: Option<String>,
    pub timeline_offloading: Option<bool>,
}
356 :
/// The policy for the aux file storage.
///
/// It can be switched through `switch_aux_file_policy` tenant config.
/// When the first aux file is written, the policy will be persisted in the
/// `index_part.json` file and has a limited migration path.
///
/// Currently, we only allow the following migration path:
///
/// ```text
/// Unset -> V1
///       -> V2
///       -> CrossValidation -> V2
/// ```
///
/// The string form (used by `FromStr`, `Display` and therefore serde) is the
/// kebab-case variant name; parsing is ASCII case-insensitive for every variant.
#[derive(
    Eq,
    PartialEq,
    Debug,
    Copy,
    Clone,
    strum_macros::EnumString,
    strum_macros::Display,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum AuxFilePolicy {
    /// V1 aux file policy: store everything in AUX_FILE_KEY
    #[strum(ascii_case_insensitive)]
    V1,
    /// V2 aux file policy: store in the AUX_FILE keyspace
    #[strum(ascii_case_insensitive)]
    V2,
    /// Cross validation runs both formats on the write path and does validation
    /// on the read path.
    #[strum(ascii_case_insensitive)]
    CrossValidation,
}
392 :
/// Layer eviction policy of a tenant.
///
/// Serialized internally tagged: the variant name is stored under the `kind` key
/// next to the variant's own fields.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
    /// No parameters.
    NoEviction,
    /// Parameterised by [`EvictionPolicyLayerAccessThreshold`].
    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
    /// Takes the same parameters as `LayerAccessThreshold`.
    ///
    /// NB: the spelling "Imitiate" is part of the serialized `kind` value and of
    /// `discriminant_str()`; renaming the variant would change the wire format.
    OnlyImitiate(EvictionPolicyLayerAccessThreshold),
}
400 :
401 : impl EvictionPolicy {
402 0 : pub fn discriminant_str(&self) -> &'static str {
403 0 : match self {
404 0 : EvictionPolicy::NoEviction => "NoEviction",
405 0 : EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
406 0 : EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
407 : }
408 0 : }
409 : }
410 :
/// Selects the compaction algorithm.
///
/// (De)serialized through its string form: strum generates `FromStr` / `Display`
/// using kebab-case variant names (`legacy`, `tiered`), and serde goes through those.
#[derive(
    Eq,
    PartialEq,
    Debug,
    Copy,
    Clone,
    strum_macros::EnumString,
    strum_macros::Display,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum CompactionAlgorithm {
    Legacy,
    Tiered,
}
427 :
/// Compression algorithm setting for image layers.
///
/// (De)serialized through its string form (`FromStr` / `Display`): `disabled`,
/// `zstd`, or `zstd(<level>)`.
#[derive(
    Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
)]
pub enum ImageCompressionAlgorithm {
    /// Disabled for writes, support decompressing during read path
    Disabled,
    /// Zstandard compression. Level 0 and `None` mean the same (default level). Levels can be negative as well.
    /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
    Zstd {
        level: Option<i8>,
    },
}
440 :
441 : impl FromStr for ImageCompressionAlgorithm {
442 : type Err = anyhow::Error;
443 8 : fn from_str(s: &str) -> Result<Self, Self::Err> {
444 8 : let mut components = s.split(['(', ')']);
445 8 : let first = components
446 8 : .next()
447 8 : .ok_or_else(|| anyhow::anyhow!("empty string"))?;
448 8 : match first {
449 8 : "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
450 6 : "zstd" => {
451 6 : let level = if let Some(v) = components.next() {
452 4 : let v: i8 = v.parse()?;
453 4 : Some(v)
454 : } else {
455 2 : None
456 : };
457 :
458 6 : Ok(ImageCompressionAlgorithm::Zstd { level })
459 : }
460 0 : _ => anyhow::bail!("invalid specifier '{first}'"),
461 : }
462 8 : }
463 : }
464 :
465 : impl Display for ImageCompressionAlgorithm {
466 12 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
467 12 : match self {
468 3 : ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
469 9 : ImageCompressionAlgorithm::Zstd { level } => {
470 9 : if let Some(level) = level {
471 6 : write!(f, "zstd({})", level)
472 : } else {
473 3 : write!(f, "zstd")
474 : }
475 : }
476 : }
477 12 : }
478 : }
479 :
/// Settings wrapper around the [`CompactionAlgorithm`] choice, serialized as an
/// object with a single `kind` field.
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
pub struct CompactionAlgorithmSettings {
    pub kind: CompactionAlgorithm,
}
484 :
/// Configuration of L0 flushing.
///
/// Serialized internally tagged under the `mode` key with kebab-case variant names
/// (so `Direct` is written as `direct`); unknown fields are rejected.
#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
    /// Field names of this variant use snake_case (`max_concurrency`).
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
}
491 :
/// Parameters shared by the threshold-based [`EvictionPolicy`] variants.
///
/// Both durations are (de)serialized via `humantime_serde`, i.e. as human-readable
/// duration strings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
    // NOTE(review): presumably how often the eviction task runs — confirm.
    #[serde(with = "humantime_serde")]
    pub period: Duration,
    // NOTE(review): presumably how long a layer must go unaccessed before it
    // becomes an eviction candidate — confirm.
    #[serde(with = "humantime_serde")]
    pub threshold: Duration,
}
499 :
/// Throttle configuration.
///
/// The field names suggest a token-bucket style limiter (`initial` / `max` tokens,
/// `refill_amount` tokens added every `refill_interval`) — NOTE(review): confirm
/// against the throttle implementation. `steady_rps()` computes
/// `refill_amount / refill_interval` as the sustained rate.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ThrottleConfig {
    /// Names of the task kinds the throttle applies to; an empty list effectively
    /// disables the throttle (see `ThrottleConfig::disabled`).
    pub task_kinds: Vec<String>, // TaskKind
    pub initial: u32,
    /// (De)serialized as a human-readable duration via `humantime_serde`.
    #[serde(with = "humantime_serde")]
    pub refill_interval: Duration,
    pub refill_amount: NonZeroU32,
    pub max: u32,
}
509 :
510 : impl ThrottleConfig {
511 394 : pub fn disabled() -> Self {
512 394 : Self {
513 394 : task_kinds: vec![], // effectively disables the throttle
514 394 : // other values don't matter with emtpy `task_kinds`.
515 394 : initial: 0,
516 394 : refill_interval: Duration::from_millis(1),
517 394 : refill_amount: NonZeroU32::new(1).unwrap(),
518 394 : max: 1,
519 394 : }
520 394 : }
521 : /// The requests per second allowed by the given config.
522 0 : pub fn steady_rps(&self) -> f64 {
523 0 : (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
524 0 : }
525 : }
526 :
/// A flattened analog of a `pageserver::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
pub enum LocationConfigMode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
    Secondary,
    Detached,
}
538 :
/// Configuration specific to a location in `Secondary` mode
/// (see [`LocationConfig::secondary_conf`]).
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfigSecondary {
    // NOTE(review): presumably whether the secondary keeps itself "warm" by
    // downloading layers ahead of time — confirm.
    pub warm: bool,
}
543 :
/// An alternative representation of `pageserver::tenant::LocationConf`,
/// for use in external-facing APIs.
///
/// All fields marked `#[serde(default)]` may be omitted by the sender; `mode` and
/// `tenant_conf` carry no default.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfig {
    /// The mode this location should be in.
    pub mode: LocationConfigMode,
    /// If attaching, in what generation?
    #[serde(default)]
    pub generation: Option<u32>,

    /// If requesting mode `Secondary`, configuration for that.
    #[serde(default)]
    pub secondary_conf: Option<LocationConfigSecondary>,

    // Shard parameters: if shard_count is nonzero, then other shard_* fields
    // must be set accurately. All three default to 0 when omitted.
    #[serde(default)]
    pub shard_number: u8,
    #[serde(default)]
    pub shard_count: u8,
    #[serde(default)]
    pub shard_stripe_size: u32,

    /// This configuration only affects attached mode, but should be provided irrespective
    /// of the mode, as a secondary location might transition on startup if the response
    /// to the `/re-attach` control plane API requests it.
    pub tenant_conf: TenantConfig,
}
571 :
/// Response listing tenant shards together with their location configuration.
#[derive(Serialize, Deserialize)]
pub struct LocationConfigListResponse {
    /// One entry per tenant shard; the config is optional (NOTE(review): the meaning
    /// of `None` is not visible here — confirm with the producer).
    pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
}
576 :
/// Status response carrying the id of a node. Serialize-only: there is no
/// `Deserialize` impl.
#[derive(Serialize)]
pub struct StatusResponse {
    pub id: NodeId,
}
581 :
/// Request body for configuring a tenant location; the [`LocationConfig`] fields
/// are flattened into the top level of the request.
///
/// NOTE(review): serde documents `deny_unknown_fields` as unsupported in combination
/// with `flatten` — verify that unknown fields are actually rejected as intended.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
588 :
/// Request body for a tenant time-travel operation. Unknown fields are rejected.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantTimeTravelRequest {
    // NOTE(review): presumably the shard counts whose data should be covered by
    // the operation — confirm against the handler.
    pub shard_counts: Vec<ShardCount>,
}
594 :
/// Pairs a tenant shard with a node. Unknown fields are rejected on deserialization.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantShardLocation {
    pub shard_id: TenantShardId,
    // NOTE(review): presumably the node the shard is located on — confirm.
    pub node_id: NodeId,
}
601 :
/// Response listing the location of each shard of a tenant.
/// Unknown fields are rejected on deserialization.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigResponse {
    pub shards: Vec<TenantShardLocation>,
    /// If the shards' ShardCount count is >1, stripe_size will be set.
    pub stripe_size: Option<ShardStripeSize>,
}
609 :
/// Request to set a tenant's configuration: the tenant id plus the
/// [`TenantConfig`] fields flattened into the top level.
///
/// Derefs to the contained [`TenantConfig`].
///
/// NOTE(review): serde documents `deny_unknown_fields` as unsupported in combination
/// with `flatten` — verify that unknown fields are actually rejected as intended.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigRequest {
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
617 :
618 : impl std::ops::Deref for TenantConfigRequest {
619 : type Target = TenantConfig;
620 :
621 0 : fn deref(&self) -> &Self::Target {
622 0 : &self.config
623 0 : }
624 : }
625 :
626 : impl TenantConfigRequest {
627 0 : pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
628 0 : let config = TenantConfig::default();
629 0 : TenantConfigRequest { tenant_id, config }
630 0 : }
631 : }
632 :
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
///
/// Serialized adjacently tagged (`slug` / `data`) with snake_case variant names.
#[derive(Serialize, Deserialize, Clone)]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
pub enum TenantAttachmentStatus {
    /// Reported while the tenant is attaching, activating, or stopping.
    Maybe,
    /// Reported once the tenant is `Active`.
    Attached,
    /// Reported for a `Broken` tenant; `reason` is copied from the tenant state.
    Failed { reason: String },
}
641 :
/// Summary information about a tenant shard.
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
    pub id: TenantShardId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    /// Coarse attachment status; see [`TenantState::attachment_status`].
    pub attachment_status: TenantAttachmentStatus,
    pub generation: u32,

    /// Opaque explanation if gc is being blocked.
    ///
    /// Only looked up for the individual tenant detail, not the listing. This is purely for
    /// debugging, not included in openapi. Omitted from the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_blocking: Option<String>,
}
660 :
/// Detailed view of a tenant: everything in [`TenantInfo`] (flattened into the
/// top level of the serialized object) plus walredo status and timeline ids.
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantDetails {
    #[serde(flatten)]
    pub tenant_info: TenantInfo,

    /// Status of the WAL redo manager, if available.
    pub walredo: Option<WalRedoManagerStatus>,

    /// Ids of the tenant's timelines.
    pub timelines: Vec<TimelineId>,
}
670 :
/// Archival state of a timeline, as used in [`TimelineArchivalConfigRequest`].
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
pub enum TimelineArchivalState {
    Archived,
    Unarchived,
}
676 :
/// Request body carrying an archival state for a timeline
/// (presumably the state to switch the timeline to — TODO confirm).
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelineArchivalConfigRequest {
    pub state: TimelineArchivalState,
}
681 :
/// Timeline listing that reports regular and offloaded timelines separately.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelinesInfoAndOffloaded {
    /// Regular timelines, described by [`TimelineInfo`].
    pub timelines: Vec<TimelineInfo>,
    /// Offloaded timelines, described by [`OffloadedTimelineInfo`].
    pub offloaded: Vec<OffloadedTimelineInfo>,
}
687 :
/// Analog of [`TimelineInfo`] for offloaded timelines.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OffloadedTimelineInfo {
    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,
    /// The parent this timeline has been branched off from, or `None` if it has no parent
    pub ancestor_timeline_id: Option<TimelineId>,
    /// The branch lsn to retain at the ancestor, or `None` if nothing is to be retained
    pub ancestor_retain_lsn: Option<Lsn>,
    /// The time point when the timeline was archived
    pub archived_at: chrono::DateTime<chrono::Utc>,
}
700 :
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,

    pub ancestor_timeline_id: Option<TimelineId>,
    pub ancestor_lsn: Option<Lsn>,
    pub last_record_lsn: Lsn,
    pub prev_record_lsn: Option<Lsn>,
    pub latest_gc_cutoff_lsn: Lsn,
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have successfully uploaded to remote storage
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertising to safekeepers
    pub remote_consistent_lsn_visible: Lsn,

    /// The LSN from the start of the root timeline (never changes)
    pub initdb_lsn: Lsn,

    pub current_logical_size: u64,
    pub current_logical_size_is_accurate: bool,

    pub directory_entries_counts: Vec<u64>,

    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
    pub current_logical_size_non_incremental: Option<u64>,

    /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
    /// beyond the branch's branch point, we only count up to the branch point.
    pub pitr_history_size: u64,

    /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
    /// ancestor data used by this branch would have been retained anyway). If this is false, then
    /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
    /// otherwise be able to GC.
    pub within_ancestor_pitr: bool,

    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
    pub pg_version: u32,

    pub state: TimelineState,

    pub walreceiver_status: String,

    // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
    // Backward compatibility: you will get a JSON not containing the newly-added field.
    // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
    // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
    // read.
    pub is_archived: Option<bool>,
}
762 :
/// Description of a layer map, split into in-memory and historic layers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerMapInfo {
    /// In-memory layers (open or frozen); see [`InMemoryLayerInfo`].
    pub in_memory_layers: Vec<InMemoryLayerInfo>,
    /// Historic layers (delta or image); see [`HistoricLayerInfo`].
    pub historic_layers: Vec<HistoricLayerInfo>,
}
768 :
/// The residence status of a layer, i.e. whether its file is present locally.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum LayerResidenceStatus {
    /// Residence status for a layer file that exists locally.
    /// It may also exist on the remote, we don't care here.
    Resident,
    /// Residence status for a layer file that only exists on the remote.
    Evicted,
}
778 :
/// Access statistics of a layer.
///
/// Both timestamps are (de)serialized via `serde_with::TimestampMilliSeconds`,
/// i.e. as a number of milliseconds.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerAccessStats {
    // NOTE(review): presumably the time of the most recent access — confirm.
    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
    pub access_time: SystemTime,

    // NOTE(review): presumably the time of the last residence change — confirm.
    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
    pub residence_time: SystemTime,

    pub visible: bool,
}
790 :
/// Description of an in-memory layer.
///
/// Serialized internally tagged: the variant name is stored under the `kind` key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum InMemoryLayerInfo {
    /// An open layer: only the start LSN is reported.
    Open { lsn_start: Lsn },
    /// A frozen layer: both the start and the end LSN are reported.
    Frozen { lsn_start: Lsn, lsn_end: Lsn },
}
797 :
/// Description of a historic layer, either a delta or an image layer.
///
/// Serialized internally tagged: the variant name is stored under the `kind` key.
/// The fields shared by both variants are exposed through the accessor methods on
/// this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum HistoricLayerInfo {
    /// A delta layer; has an LSN range (`lsn_start`, `lsn_end`) and an `l0` flag.
    Delta {
        layer_file_name: String,
        layer_file_size: u64,

        lsn_start: Lsn,
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,

        l0: bool,
    },
    /// An image layer; carries a single LSN (`lsn_start`).
    Image {
        layer_file_name: String,
        layer_file_size: u64,

        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
    },
}
821 :
822 : impl HistoricLayerInfo {
823 0 : pub fn layer_file_name(&self) -> &str {
824 0 : match self {
825 : HistoricLayerInfo::Delta {
826 0 : layer_file_name, ..
827 0 : } => layer_file_name,
828 : HistoricLayerInfo::Image {
829 0 : layer_file_name, ..
830 0 : } => layer_file_name,
831 : }
832 0 : }
833 0 : pub fn is_remote(&self) -> bool {
834 0 : match self {
835 0 : HistoricLayerInfo::Delta { remote, .. } => *remote,
836 0 : HistoricLayerInfo::Image { remote, .. } => *remote,
837 : }
838 0 : }
839 0 : pub fn set_remote(&mut self, value: bool) {
840 0 : let field = match self {
841 0 : HistoricLayerInfo::Delta { remote, .. } => remote,
842 0 : HistoricLayerInfo::Image { remote, .. } => remote,
843 : };
844 0 : *field = value;
845 0 : }
846 0 : pub fn layer_file_size(&self) -> u64 {
847 0 : match self {
848 : HistoricLayerInfo::Delta {
849 0 : layer_file_size, ..
850 0 : } => *layer_file_size,
851 : HistoricLayerInfo::Image {
852 0 : layer_file_size, ..
853 0 : } => *layer_file_size,
854 : }
855 0 : }
856 : }
857 :
/// Request to spawn a task that downloads remote layers.
#[derive(Debug, Serialize, Deserialize)]
pub struct DownloadRemoteLayersTaskSpawnRequest {
    /// Upper bound on concurrent downloads; the type guarantees it is non-zero.
    pub max_concurrent_downloads: NonZeroUsize,
}
862 :
/// Request to ingest a set of aux files.
#[derive(Debug, Serialize, Deserialize)]
pub struct IngestAuxFilesRequest {
    // NOTE(review): presumably maps an aux file path to its content — confirm
    // the key/value meaning and encoding against the handler.
    pub aux_files: HashMap<String, String>,
}
867 :
/// Request to list aux files.
#[derive(Debug, Serialize, Deserialize)]
pub struct ListAuxFilesRequest {
    // NOTE(review): presumably the LSN at which the listing is taken — confirm.
    pub lsn: Lsn,
}
872 :
/// Progress report of a remote-layer download task.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DownloadRemoteLayersTaskInfo {
    /// Identifier of the task.
    pub task_id: String,
    /// Current state of the task; see [`DownloadRemoteLayersTaskState`].
    pub state: DownloadRemoteLayersTaskState,
    pub total_layer_count: u64,         // stable once `completed`
    pub successful_download_count: u64, // stable once `completed`
    pub failed_download_count: u64,     // stable once `completed`
}
881 :
/// Lifecycle state of a remote-layer download task
/// (see [`DownloadRemoteLayersTaskInfo::state`]).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum DownloadRemoteLayersTaskState {
    Running,
    Completed,
    ShutDown,
}
888 :
/// Request to run GC on a timeline.
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineGcRequest {
    // NOTE(review): presumably overrides the configured `gc_horizon` for this run
    // when set — confirm against the handler.
    pub gc_horizon: Option<u64>,
}
893 :
/// Status of a WAL redo process (see [`WalRedoManagerStatus::process`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
    /// Process id.
    pub pid: u32,
}
898 :
/// Status of a tenant's WAL redo manager (see [`TenantDetails::walredo`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
    /// UTC timestamp of the last redo, if any has been recorded.
    pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
    /// Status of the redo process, if one is reported.
    pub process: Option<WalRedoManagerProcessStatus>,
}
904 :
/// The progress of a secondary tenant.
///
/// It is mostly useful when doing a long-running download: e.g. initiating
/// a download job, timing out while waiting for it to run, and then inspecting this status to
/// understand what's happening.
///
/// The derived `Default` yields no heatmap mtime and all counters at zero.
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
    /// The remote storage LastModified time of the heatmap object we last downloaded.
    pub heatmap_mtime: Option<serde_system_time::SystemTime>,

    /// The number of layers currently on disk
    pub layers_downloaded: usize,
    /// The number of layers in the most recently seen heatmap
    pub layers_total: usize,

    /// The number of layer bytes currently on disk
    pub bytes_downloaded: u64,
    /// The number of layer bytes in the most recently seen heatmap
    pub bytes_total: u64,
}
925 :
/// One shard entry in a [`TenantScanRemoteStorageResponse`].
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantScanRemoteStorageShard {
    /// Identity of the shard.
    pub tenant_shard_id: TenantShardId,
    /// Generation number for the shard, if any.
    ///
    /// NOTE(review): presumably the latest generation found in remote storage — confirm.
    pub generation: Option<u32>,
}
931 :
/// Response of a tenant remote-storage scan: the list of shards reported.
///
/// The derived `Default` is an empty shard list.
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantScanRemoteStorageResponse {
    pub shards: Vec<TenantScanRemoteStorageShard>,
}
936 :
937 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
938 : #[serde(rename_all = "snake_case")]
939 : pub enum TenantSorting {
940 : ResidentSize,
941 : MaxLogicalSize,
942 : }
943 :
944 : impl Default for TenantSorting {
945 0 : fn default() -> Self {
946 0 : Self::ResidentSize
947 0 : }
948 : }
949 :
/// Query for the "top" tenant shards according to some size metric.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct TopTenantShardsRequest {
    /// How would you like to sort the tenants?
    pub order_by: TenantSorting,

    /// How many results?
    pub limit: usize,

    /// Omit tenants with more than this many shards (e.g. if this is the max number of shards
    /// that the caller would ever split to)
    ///
    /// NOTE(review): the `_lt` suffix suggests a strict "shard count < value" filter, which would
    /// also omit tenants with exactly this many shards — confirm against the implementation.
    pub where_shards_lt: Option<ShardCount>,

    /// Omit tenants where the ordering metric is less than this (this is an optimization to
    /// let us quickly exclude numerous tiny shards)
    ///
    /// NOTE(review): the `_gt` suffix suggests a strict "metric > value" filter — confirm how a
    /// metric equal to this value is treated.
    pub where_gt: Option<u64>,
}
966 :
/// One entry of a [`TopTenantShardsResponse`]: a tenant shard and its size metrics.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct TopTenantShardItem {
    /// Identity of the tenant shard these numbers belong to.
    pub id: TenantShardId,

    /// Total size of layers on local disk for all timelines in this tenant
    pub resident_size: u64,

    /// Total size of layers in remote storage for all timelines in this tenant
    pub physical_size: u64,

    /// The largest logical size of a timeline within this tenant
    pub max_logical_size: u64,
}
980 :
/// Response to a [`TopTenantShardsRequest`]; the derived `Default` is an empty list.
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TopTenantShardsResponse {
    pub shards: Vec<TopTenantShardItem>,
}
985 :
986 : pub mod virtual_file {
987 : #[derive(
988 : Copy,
989 : Clone,
990 : PartialEq,
991 : Eq,
992 : Hash,
993 204 : strum_macros::EnumString,
994 0 : strum_macros::Display,
995 0 : serde_with::DeserializeFromStr,
996 : serde_with::SerializeDisplay,
997 : Debug,
998 : )]
999 : #[strum(serialize_all = "kebab-case")]
1000 : pub enum IoEngineKind {
1001 : StdFs,
1002 : #[cfg(target_os = "linux")]
1003 : TokioEpollUring,
1004 : }
1005 :
1006 : /// Direct IO modes for a pageserver.
1007 : #[derive(
1008 : Copy,
1009 : Clone,
1010 : PartialEq,
1011 : Eq,
1012 : Hash,
1013 0 : strum_macros::EnumString,
1014 0 : strum_macros::Display,
1015 0 : serde_with::DeserializeFromStr,
1016 : serde_with::SerializeDisplay,
1017 : Debug,
1018 : )]
1019 : #[strum(serialize_all = "kebab-case")]
1020 : #[repr(u8)]
1021 : pub enum IoMode {
1022 : /// Uses buffered IO.
1023 : Buffered,
1024 : /// Uses direct IO, error out if the operation fails.
1025 : #[cfg(target_os = "linux")]
1026 : Direct,
1027 : }
1028 :
1029 : impl IoMode {
1030 210 : pub const fn preferred() -> Self {
1031 210 : Self::Buffered
1032 210 : }
1033 : }
1034 :
1035 : impl TryFrom<u8> for IoMode {
1036 : type Error = u8;
1037 :
1038 1190 : fn try_from(value: u8) -> Result<Self, Self::Error> {
1039 1190 : Ok(match value {
1040 1190 : v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
1041 : #[cfg(target_os = "linux")]
1042 0 : v if v == (IoMode::Direct as u8) => IoMode::Direct,
1043 0 : x => return Err(x),
1044 : })
1045 1190 : }
1046 : }
1047 : }
1048 :
/// Result of a disposable-keys scan: two counters.
///
/// NOTE(review): what makes a key "disposable" is defined by the scanning code, which is not
/// visible from this type — see the producer for the exact meaning.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDisposableKeysResponse {
    /// Number of keys classified as disposable.
    pub disposable_count: usize,
    /// Number of keys classified as not disposable.
    pub not_disposable_count: usize,
}
1054 :
/// A compute -> pageserver pagestream request.
///
/// On the wire each variant is identified by a one-byte tag (0 through 4, in declaration
/// order); see `PagestreamFeMessage::serialize` / `parse`.
// Wrapped in libpq CopyData
#[derive(PartialEq, Eq, Debug)]
pub enum PagestreamFeMessage {
    Exists(PagestreamExistsRequest),
    Nblocks(PagestreamNblocksRequest),
    GetPage(PagestreamGetPageRequest),
    DbSize(PagestreamDbSizeRequest),
    GetSlruSegment(PagestreamGetSlruSegmentRequest),
}
1064 :
/// A pagestream response message.
///
/// On the wire each variant is identified by the one-byte tag defined in
/// `PagestreamBeMessageTag`; see `PagestreamBeMessage::serialize` / `deserialize`.
// Wrapped in libpq CopyData
#[derive(strum_macros::EnumProperty)]
pub enum PagestreamBeMessage {
    Exists(PagestreamExistsResponse),
    Nblocks(PagestreamNblocksResponse),
    GetPage(PagestreamGetPageResponse),
    Error(PagestreamErrorResponse),
    DbSize(PagestreamDbSizeResponse),
    GetSlruSegment(PagestreamGetSlruSegmentResponse),
}
1075 :
// Keep in sync with `pagestore_client.h`
/// One-byte wire tag that opens every serialized `PagestreamBeMessage`.
#[repr(u8)]
enum PagestreamBeMessageTag {
    Exists = 100,
    Nblocks = 101,
    GetPage = 102,
    Error = 103,
    DbSize = 104,
    GetSlruSegment = 105,
}

impl TryFrom<u8> for PagestreamBeMessageTag {
    type Error = u8;

    /// Decodes a wire tag; an unknown byte is returned unchanged as the error.
    // The return type spells out `u8` because `Self::Error` would be ambiguous with the
    // `Error` variant of the enum.
    fn try_from(value: u8) -> Result<Self, u8> {
        use PagestreamBeMessageTag as Tag;

        let tag = match value {
            100 => Tag::Exists,
            101 => Tag::Nblocks,
            102 => Tag::GetPage,
            103 => Tag::Error,
            104 => Tag::DbSize,
            105 => Tag::GetSlruSegment,
            unknown => return Err(unknown),
        };
        Ok(tag)
    }
}
1100 :
// A GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1
// interface was sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since'
// value, and 'latest' was set to true. The V2 interface was added because there was no correct way
// for a standby to request a page at a particular non-latest LSN, and also include the
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
// request, if the standby knows that the page hasn't been modified since, and risk getting an
// error if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which
// could require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The
// new V2 interface allows sending both LSNs, and lets the pageserver do the right thing. There was
// no difference in the responses between V1 and V2.
//
/// Version of the pagestream protocol; V2 is the only variant (see the comment above for what
/// distinguishes it from the defunct V1).
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
    V2,
}
1129 :
/// Payload of [`PagestreamFeMessage::Exists`] (wire tag 0).
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest {
    /// LSN at which to answer; see the comment preceding `PagestreamProtocolVersion`.
    pub request_lsn: Lsn,
    /// Hint LSN; see the comment preceding `PagestreamProtocolVersion`.
    pub not_modified_since: Lsn,
    /// The relation the request is about.
    pub rel: RelTag,
}
1136 :
/// Payload of [`PagestreamFeMessage::Nblocks`] (wire tag 1).
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest {
    /// LSN at which to answer; see the comment preceding `PagestreamProtocolVersion`.
    pub request_lsn: Lsn,
    /// Hint LSN; see the comment preceding `PagestreamProtocolVersion`.
    pub not_modified_since: Lsn,
    /// The relation the request is about.
    pub rel: RelTag,
}
1143 :
/// Payload of [`PagestreamFeMessage::GetPage`] (wire tag 2).
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest {
    /// LSN at which to read the page; see the comment preceding `PagestreamProtocolVersion`.
    pub request_lsn: Lsn,
    /// Hint LSN; see the comment preceding `PagestreamProtocolVersion`.
    pub not_modified_since: Lsn,
    /// The relation the page belongs to.
    pub rel: RelTag,
    /// Block number of the page within `rel`.
    pub blkno: u32,
}
1151 :
/// Payload of [`PagestreamFeMessage::DbSize`] (wire tag 3).
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest {
    /// LSN at which to answer; see the comment preceding `PagestreamProtocolVersion`.
    pub request_lsn: Lsn,
    /// Hint LSN; see the comment preceding `PagestreamProtocolVersion`.
    pub not_modified_since: Lsn,
    /// Identifies the database whose size is requested.
    pub dbnode: u32,
}
1158 :
/// Payload of [`PagestreamFeMessage::GetSlruSegment`] (wire tag 4).
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
    /// LSN at which to answer; see the comment preceding `PagestreamProtocolVersion`.
    pub request_lsn: Lsn,
    /// Hint LSN; see the comment preceding `PagestreamProtocolVersion`.
    pub not_modified_since: Lsn,
    /// SLRU kind as a raw byte.
    ///
    /// NOTE(review): presumably the numeric value of an SLRU-kind enum — confirm with the consumer.
    pub kind: u8,
    /// Segment number within the SLRU.
    pub segno: u32,
}
1166 :
/// Payload of [`PagestreamBeMessage::Exists`].
#[derive(Debug)]
pub struct PagestreamExistsResponse {
    /// Sent as one byte on the wire; any non-zero byte deserializes to `true`.
    pub exists: bool,
}
1171 :
/// Payload of [`PagestreamBeMessage::Nblocks`].
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
    /// Number of blocks; sent as a big-endian `u32`.
    pub n_blocks: u32,
}
1176 :
/// Payload of [`PagestreamBeMessage::GetPage`].
#[derive(Debug)]
pub struct PagestreamGetPageResponse {
    /// Raw page bytes, written verbatim after the tag. `PagestreamBeMessage::deserialize`
    /// reads exactly 8192 bytes for this field.
    pub page: Bytes,
}
1181 :
/// Payload of [`PagestreamBeMessage::GetSlruSegment`].
#[derive(Debug)]
pub struct PagestreamGetSlruSegmentResponse {
    /// Raw segment bytes. On the wire they are preceded by a big-endian `u32` block count
    /// computed as `segment.len() / BLCKSZ`.
    pub segment: Bytes,
}
1186 :
/// Payload of [`PagestreamBeMessage::Error`].
#[derive(Debug)]
pub struct PagestreamErrorResponse {
    /// Error text; sent as its bytes followed by a NUL terminator.
    pub message: String,
}
1191 :
/// Payload of [`PagestreamBeMessage::DbSize`].
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
    /// Database size; sent as a big-endian `i64`. Presumably in bytes — TODO confirm.
    pub db_size: i64,
}
1196 :
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types. It is sufficient to get the total size.
/// Total history size of a tenant.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantHistorySize {
    /// The tenant this size belongs to.
    pub id: TenantId,
    /// Size is a mixture of WAL and logical size, so the unit is bytes.
    ///
    /// Will be none if `?inputs_only=true` was given.
    pub size: Option<u64>,
}
1207 :
1208 : impl PagestreamFeMessage {
1209 : /// Serialize a compute -> pageserver message. This is currently only used in testing
1210 : /// tools. Always uses protocol version 2.
1211 4 : pub fn serialize(&self) -> Bytes {
1212 4 : let mut bytes = BytesMut::new();
1213 4 :
1214 4 : match self {
1215 1 : Self::Exists(req) => {
1216 1 : bytes.put_u8(0);
1217 1 : bytes.put_u64(req.request_lsn.0);
1218 1 : bytes.put_u64(req.not_modified_since.0);
1219 1 : bytes.put_u32(req.rel.spcnode);
1220 1 : bytes.put_u32(req.rel.dbnode);
1221 1 : bytes.put_u32(req.rel.relnode);
1222 1 : bytes.put_u8(req.rel.forknum);
1223 1 : }
1224 :
1225 1 : Self::Nblocks(req) => {
1226 1 : bytes.put_u8(1);
1227 1 : bytes.put_u64(req.request_lsn.0);
1228 1 : bytes.put_u64(req.not_modified_since.0);
1229 1 : bytes.put_u32(req.rel.spcnode);
1230 1 : bytes.put_u32(req.rel.dbnode);
1231 1 : bytes.put_u32(req.rel.relnode);
1232 1 : bytes.put_u8(req.rel.forknum);
1233 1 : }
1234 :
1235 1 : Self::GetPage(req) => {
1236 1 : bytes.put_u8(2);
1237 1 : bytes.put_u64(req.request_lsn.0);
1238 1 : bytes.put_u64(req.not_modified_since.0);
1239 1 : bytes.put_u32(req.rel.spcnode);
1240 1 : bytes.put_u32(req.rel.dbnode);
1241 1 : bytes.put_u32(req.rel.relnode);
1242 1 : bytes.put_u8(req.rel.forknum);
1243 1 : bytes.put_u32(req.blkno);
1244 1 : }
1245 :
1246 1 : Self::DbSize(req) => {
1247 1 : bytes.put_u8(3);
1248 1 : bytes.put_u64(req.request_lsn.0);
1249 1 : bytes.put_u64(req.not_modified_since.0);
1250 1 : bytes.put_u32(req.dbnode);
1251 1 : }
1252 :
1253 0 : Self::GetSlruSegment(req) => {
1254 0 : bytes.put_u8(4);
1255 0 : bytes.put_u64(req.request_lsn.0);
1256 0 : bytes.put_u64(req.not_modified_since.0);
1257 0 : bytes.put_u8(req.kind);
1258 0 : bytes.put_u32(req.segno);
1259 0 : }
1260 : }
1261 :
1262 4 : bytes.into()
1263 4 : }
1264 :
1265 4 : pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
1266 : // these correspond to the NeonMessageTag enum in pagestore_client.h
1267 : //
1268 : // TODO: consider using protobuf or serde bincode for less error prone
1269 : // serialization.
1270 4 : let msg_tag = body.read_u8()?;
1271 :
1272 : // these two fields are the same for every request type
1273 4 : let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
1274 4 : let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
1275 :
1276 4 : match msg_tag {
1277 : 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
1278 1 : request_lsn,
1279 1 : not_modified_since,
1280 1 : rel: RelTag {
1281 1 : spcnode: body.read_u32::<BigEndian>()?,
1282 1 : dbnode: body.read_u32::<BigEndian>()?,
1283 1 : relnode: body.read_u32::<BigEndian>()?,
1284 1 : forknum: body.read_u8()?,
1285 : },
1286 : })),
1287 : 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1288 1 : request_lsn,
1289 1 : not_modified_since,
1290 1 : rel: RelTag {
1291 1 : spcnode: body.read_u32::<BigEndian>()?,
1292 1 : dbnode: body.read_u32::<BigEndian>()?,
1293 1 : relnode: body.read_u32::<BigEndian>()?,
1294 1 : forknum: body.read_u8()?,
1295 : },
1296 : })),
1297 : 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1298 1 : request_lsn,
1299 1 : not_modified_since,
1300 1 : rel: RelTag {
1301 1 : spcnode: body.read_u32::<BigEndian>()?,
1302 1 : dbnode: body.read_u32::<BigEndian>()?,
1303 1 : relnode: body.read_u32::<BigEndian>()?,
1304 1 : forknum: body.read_u8()?,
1305 : },
1306 1 : blkno: body.read_u32::<BigEndian>()?,
1307 : })),
1308 : 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1309 1 : request_lsn,
1310 1 : not_modified_since,
1311 1 : dbnode: body.read_u32::<BigEndian>()?,
1312 : })),
1313 : 4 => Ok(PagestreamFeMessage::GetSlruSegment(
1314 : PagestreamGetSlruSegmentRequest {
1315 0 : request_lsn,
1316 0 : not_modified_since,
1317 0 : kind: body.read_u8()?,
1318 0 : segno: body.read_u32::<BigEndian>()?,
1319 : },
1320 : )),
1321 0 : _ => bail!("unknown smgr message tag: {:?}", msg_tag),
1322 : }
1323 4 : }
1324 : }
1325 :
1326 : impl PagestreamBeMessage {
1327 0 : pub fn serialize(&self) -> Bytes {
1328 0 : let mut bytes = BytesMut::new();
1329 :
1330 : use PagestreamBeMessageTag as Tag;
1331 0 : match self {
1332 0 : Self::Exists(resp) => {
1333 0 : bytes.put_u8(Tag::Exists as u8);
1334 0 : bytes.put_u8(resp.exists as u8);
1335 0 : }
1336 :
1337 0 : Self::Nblocks(resp) => {
1338 0 : bytes.put_u8(Tag::Nblocks as u8);
1339 0 : bytes.put_u32(resp.n_blocks);
1340 0 : }
1341 :
1342 0 : Self::GetPage(resp) => {
1343 0 : bytes.put_u8(Tag::GetPage as u8);
1344 0 : bytes.put(&resp.page[..]);
1345 0 : }
1346 :
1347 0 : Self::Error(resp) => {
1348 0 : bytes.put_u8(Tag::Error as u8);
1349 0 : bytes.put(resp.message.as_bytes());
1350 0 : bytes.put_u8(0); // null terminator
1351 0 : }
1352 0 : Self::DbSize(resp) => {
1353 0 : bytes.put_u8(Tag::DbSize as u8);
1354 0 : bytes.put_i64(resp.db_size);
1355 0 : }
1356 :
1357 0 : Self::GetSlruSegment(resp) => {
1358 0 : bytes.put_u8(Tag::GetSlruSegment as u8);
1359 0 : bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
1360 0 : bytes.put(&resp.segment[..]);
1361 0 : }
1362 : }
1363 :
1364 0 : bytes.into()
1365 0 : }
1366 :
1367 0 : pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
1368 0 : let mut buf = buf.reader();
1369 0 : let msg_tag = buf.read_u8()?;
1370 :
1371 : use PagestreamBeMessageTag as Tag;
1372 0 : let ok =
1373 0 : match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
1374 : Tag::Exists => {
1375 0 : let exists = buf.read_u8()?;
1376 0 : Self::Exists(PagestreamExistsResponse {
1377 0 : exists: exists != 0,
1378 0 : })
1379 : }
1380 : Tag::Nblocks => {
1381 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1382 0 : Self::Nblocks(PagestreamNblocksResponse { n_blocks })
1383 : }
1384 : Tag::GetPage => {
1385 0 : let mut page = vec![0; 8192]; // TODO: use MaybeUninit
1386 0 : buf.read_exact(&mut page)?;
1387 0 : PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
1388 : }
1389 : Tag::Error => {
1390 0 : let mut msg = Vec::new();
1391 0 : buf.read_until(0, &mut msg)?;
1392 0 : let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
1393 0 : let rust_str = cstring.to_str()?;
1394 0 : PagestreamBeMessage::Error(PagestreamErrorResponse {
1395 0 : message: rust_str.to_owned(),
1396 0 : })
1397 : }
1398 : Tag::DbSize => {
1399 0 : let db_size = buf.read_i64::<BigEndian>()?;
1400 0 : Self::DbSize(PagestreamDbSizeResponse { db_size })
1401 : }
1402 : Tag::GetSlruSegment => {
1403 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1404 0 : let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
1405 0 : buf.read_exact(&mut segment)?;
1406 0 : Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
1407 0 : segment: segment.into(),
1408 0 : })
1409 : }
1410 : };
1411 0 : let remaining = buf.into_inner();
1412 0 : if !remaining.is_empty() {
1413 0 : anyhow::bail!(
1414 0 : "remaining bytes in msg with tag={msg_tag}: {}",
1415 0 : remaining.len()
1416 0 : );
1417 0 : }
1418 0 : Ok(ok)
1419 0 : }
1420 :
1421 0 : pub fn kind(&self) -> &'static str {
1422 0 : match self {
1423 0 : Self::Exists(_) => "Exists",
1424 0 : Self::Nblocks(_) => "Nblocks",
1425 0 : Self::GetPage(_) => "GetPage",
1426 0 : Self::Error(_) => "Error",
1427 0 : Self::DbSize(_) => "DbSize",
1428 0 : Self::GetSlruSegment(_) => "GetSlruSegment",
1429 : }
1430 0 : }
1431 : }
1432 :
#[cfg(test)]
mod tests {
    use serde_json::json;
    use std::str::FromStr;

    use super::*;

    /// Every request variant must survive a serialize -> parse round trip.
    #[test]
    fn test_pagestream() {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                dbnode: 7,
            }),
            // Previously untested request type.
            PagestreamFeMessage::GetSlruSegment(PagestreamGetSlruSegmentRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                kind: 1,
                segno: 7,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
            assert_eq!(msg, reconstructed);
        }
    }

    /// Every response variant must survive a serialize -> deserialize round trip.
    ///
    /// The response types do not implement `PartialEq`, so equality is checked on the variant
    /// name and on the re-serialized bytes.
    #[test]
    fn test_pagestream_be_roundtrip() {
        let messages = vec![
            PagestreamBeMessage::Exists(PagestreamExistsResponse { exists: true }),
            PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks: 42 }),
            PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
                page: Bytes::from(vec![7u8; BLCKSZ as usize]),
            }),
            PagestreamBeMessage::Error(PagestreamErrorResponse {
                message: "boom".to_string(),
            }),
            PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size: -1 }),
            PagestreamBeMessage::GetSlruSegment(PagestreamGetSlruSegmentResponse {
                segment: Bytes::from(vec![1u8; 2 * BLCKSZ as usize]),
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
            let reconstructed = PagestreamBeMessage::deserialize(bytes.clone()).unwrap();
            assert_eq!(reconstructed.kind(), msg.kind());
            assert_eq!(reconstructed.serialize(), bytes);
        }
    }

    /// Malformed responses must be rejected rather than accepted or panicking.
    #[test]
    fn test_pagestream_be_rejects_malformed() {
        // Unknown tag.
        assert!(PagestreamBeMessage::deserialize(Bytes::from_static(&[0])).is_err());
        // Trailing garbage after a complete Exists message.
        assert!(PagestreamBeMessage::deserialize(Bytes::from_static(&[100, 1, 0])).is_err());
        // SLRU segment announcing one block but carrying no data.
        assert!(PagestreamBeMessage::deserialize(Bytes::from_static(&[105, 0, 0, 0, 1])).is_err());
    }

    #[test]
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
            "state": {
                "slug": "Active",
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        let original_broken = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
            gc_blocking: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
            "state": {
                "slug": "Broken",
                "data": {
                    "backtrace": "backtrace info",
                    "reason": "reason",
                }
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            },
            "generation" : 1
        });

        assert_eq!(
            serde_json::to_value(&original_active).unwrap(),
            expected_active
        );

        assert_eq!(
            serde_json::to_value(&original_broken).unwrap(),
            expected_broken
        );
        assert!(format!("{:?}", &original_broken.state).contains("reason"));
        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
    }

    #[test]
    fn test_reject_unknown_field() {
        let id = TenantId::generate();
        let config_request = json!({
            "tenant_id": id.to_string(),
            "unknown_field": "unknown_value".to_string(),
        });
        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
            "expect unknown field `unknown_field` error, got: {}",
            err
        );
    }

    #[test]
    fn tenantstatus_activating_serde() {
        let states = [TenantState::Activating(ActivatingFrom::Attaching)];
        let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";

        let actual = serde_json::to_string(&states).unwrap();

        assert_eq!(actual, expected);

        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();

        assert_eq!(states.as_slice(), &parsed);
    }

    #[test]
    fn tenantstatus_activating_strum() {
        // tests added, because we use these for metrics
        let examples = [
            (line!(), TenantState::Attaching, "Attaching"),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Attaching),
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
            (
                line!(),
                TenantState::Stopping {
                    progress: utils::completion::Barrier::default(),
                },
                "Stopping",
            ),
            (
                line!(),
                TenantState::Broken {
                    reason: "Example".into(),
                    backtrace: "Looooong backtrace".into(),
                },
                "Broken",
            ),
        ];

        for (line, rendered, expected) in examples {
            let actual: &'static str = rendered.into();
            assert_eq!(actual, expected, "example on {line}");
        }
    }

    #[test]
    fn test_image_compression_algorithm_parsing() {
        use ImageCompressionAlgorithm::*;
        let cases = [
            ("disabled", Disabled),
            ("zstd", Zstd { level: None }),
            ("zstd(18)", Zstd { level: Some(18) }),
            ("zstd(-3)", Zstd { level: Some(-3) }),
        ];

        for (display, expected) in cases {
            assert_eq!(
                ImageCompressionAlgorithm::from_str(display).unwrap(),
                expected,
                "parsing works"
            );
            assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");

            let ser = serde_json::to_string(&expected).expect("serialization");
            assert_eq!(
                serde_json::from_str::<ImageCompressionAlgorithm>(&ser).unwrap(),
                expected,
                "serde roundtrip"
            );

            assert_eq!(
                serde_json::Value::String(display.to_string()),
                serde_json::to_value(expected).unwrap(),
                "Display is the serde serialization"
            );
        }
    }
}
|