Line data Source code
1 : pub mod detach_ancestor;
2 : pub mod partitioning;
3 : pub mod utilization;
4 :
5 : pub use utilization::PageserverUtilization;
6 :
7 : use std::{
8 : collections::HashMap,
9 : fmt::Display,
10 : io::{BufRead, Read},
11 : num::{NonZeroU32, NonZeroU64, NonZeroUsize},
12 : str::FromStr,
13 : time::{Duration, SystemTime},
14 : };
15 :
16 : use byteorder::{BigEndian, ReadBytesExt};
17 : use postgres_ffi::BLCKSZ;
18 : use serde::{Deserialize, Serialize};
19 : use serde_with::serde_as;
20 : use utils::{
21 : completion,
22 : id::{NodeId, TenantId, TimelineId},
23 : lsn::Lsn,
24 : serde_system_time,
25 : };
26 :
27 : use crate::{
28 : reltag::RelTag,
29 : shard::{ShardCount, ShardStripeSize, TenantShardId},
30 : };
31 : use anyhow::bail;
32 : use bytes::{Buf, BufMut, Bytes, BytesMut};
33 :
34 : /// The state of a tenant in this pageserver.
35 : ///
36 : /// ```mermaid
37 : /// stateDiagram-v2
38 : ///
39 : /// [*] --> Attaching: spawn_attach()
40 : ///
41 : /// Attaching --> Activating: activate()
42 : /// Activating --> Active: infallible
43 : ///
44 : /// Attaching --> Broken: attach() failure
45 : ///
46 : /// Active --> Stopping: set_stopping(), part of shutdown & detach
47 : /// Stopping --> Broken: late error in remove_tenant_from_memory
48 : ///
49 : /// Broken --> [*]: ignore / detach / shutdown
50 : /// Stopping --> [*]: remove_from_memory complete
51 : ///
52 : /// Active --> Broken: cfg(testing)-only tenant break point
53 : /// ```
54 : #[derive(
55 : Clone,
56 : PartialEq,
57 : Eq,
58 1 : serde::Serialize,
59 3 : serde::Deserialize,
60 0 : strum_macros::Display,
61 : strum_macros::VariantNames,
62 0 : strum_macros::AsRefStr,
63 387 : strum_macros::IntoStaticStr,
64 : )]
65 : #[serde(tag = "slug", content = "data")]
66 : pub enum TenantState {
67 : /// This tenant is being attached to the pageserver.
68 : ///
69 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
70 : Attaching,
71 : /// The tenant is transitioning from Loading/Attaching to Active.
72 : ///
73 : /// While in this state, the individual timelines are being activated.
74 : ///
75 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
76 : Activating(ActivatingFrom),
77 : /// The tenant has finished activating and is open for business.
78 : ///
79 : /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
80 : Active,
81 : /// The tenant is recognized by the pageserver, but it is being detached or the
82 : /// system is being shut down.
83 : ///
84 : /// Transitions out of this state are possible through `set_broken()`.
85 : Stopping {
86 : // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
87 : // otherwise it will not be skipped during deserialization
88 : #[serde(skip)]
89 : progress: completion::Barrier,
90 : },
91 : /// The tenant is recognized by the pageserver, but can no longer be used for
92 : /// any operations.
93 : ///
94 : /// If the tenant fails to load or attach, it will transition to this state
95 : /// and it is guaranteed that no background tasks are running in its name.
96 : ///
97 : /// The other way to transition into this state is from `Stopping` state
98 : /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
99 : /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
100 : Broken { reason: String, backtrace: String },
101 : }
102 :
103 : impl TenantState {
104 0 : pub fn attachment_status(&self) -> TenantAttachmentStatus {
105 : use TenantAttachmentStatus::*;
106 :
107 : // Below, TenantState::Activating is used as a "transient" or "transparent" state when
108 : // determining the attachment_status.
109 0 : match self {
110 : // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
111 : // So, technically, we can return Attached here.
112 : // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
113 : // But, our attach task might still be fetching the remote timelines, etc.
114 : // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
115 0 : Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
116 : // We only reach Active after successful load / attach.
117 : // So, call attachment status Attached.
118 0 : Self::Active => Attached,
119 : // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
120 : // However, it also becomes Broken if the regular load fails.
121 : // From Console's perspective there's no practical difference
122 : // because attachment_status is polled by Console only during attach operation execution.
123 0 : Self::Broken { reason, .. } => Failed {
124 0 : reason: reason.to_owned(),
125 0 : },
126 : // Why is Stopping a Maybe case? Because, during pageserver shutdown,
127 : // we set the Stopping state irrespective of whether the tenant
128 : // has finished attaching or not.
129 0 : Self::Stopping { .. } => Maybe,
130 : }
131 0 : }
132 :
133 0 : pub fn broken_from_reason(reason: String) -> Self {
134 0 : let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
135 0 : Self::Broken {
136 0 : reason,
137 0 : backtrace: backtrace_str,
138 0 : }
139 0 : }
140 : }
141 :
142 : impl std::fmt::Debug for TenantState {
143 2 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
144 2 : match self {
145 2 : Self::Broken { reason, backtrace } if !reason.is_empty() => {
146 2 : write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
147 : }
148 0 : _ => write!(f, "{self}"),
149 : }
150 2 : }
151 : }
152 :
153 : /// A temporary lease to a specific lsn inside a timeline.
154 : /// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
155 : #[serde_as]
156 0 : #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
157 : pub struct LsnLease {
158 : #[serde_as(as = "SystemTimeAsRfc3339Millis")]
159 : pub valid_until: SystemTime,
160 : }
161 :
162 : serde_with::serde_conv!(
163 : SystemTimeAsRfc3339Millis,
164 : SystemTime,
165 0 : |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
166 0 : |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
167 : );
168 :
169 : impl LsnLease {
170 : /// The default length for an explicit LSN lease request (10 minutes).
171 : pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
172 :
173 : /// The default length for an implicit LSN lease granted during
174 : /// `get_lsn_by_timestamp` request (1 minute).
175 : pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);
176 :
177 : /// Checks whether the lease is expired.
178 6 : pub fn is_expired(&self, now: &SystemTime) -> bool {
179 6 : now > &self.valid_until
180 6 : }
181 : }
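// A minimal sketch (illustrative only, not part of the original tests): grant a lease
// lasting `DEFAULT_LENGTH` from "now" and check expiry against two points in time.
#[cfg(test)]
mod lsn_lease_example {
    use super::*;

    #[test]
    fn lease_expiry() {
        let now = SystemTime::now();
        let lease = LsnLease {
            valid_until: now + LsnLease::DEFAULT_LENGTH,
        };

        // Not expired while we are still before `valid_until`...
        assert!(!lease.is_expired(&now));

        // ...but expired once we are past it.
        let later = now + LsnLease::DEFAULT_LENGTH + Duration::from_secs(1);
        assert!(lease.is_expired(&later));
    }
}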
182 :
183 : /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
184 : ///
185 : /// XXX: We used to have more variants here, but now it's just one, which makes this rather
186 : /// useless. Remove, once we've checked that there's no client code left that looks at this.
187 2 : #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
188 : pub enum ActivatingFrom {
189 : /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
190 : Attaching,
191 : }
192 :
193 : /// A state of a timeline in pageserver's memory.
194 0 : #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
195 : pub enum TimelineState {
196 : /// The timeline is recognized by the pageserver but is not yet operational.
197 : /// In particular, the walreceiver connection loop is not running for this timeline.
198 : /// It will eventually transition to state Active or Broken.
199 : Loading,
200 : /// The timeline is fully operational.
201 : /// It can be queried, and the walreceiver connection loop is running.
202 : Active,
203 : /// The timeline was previously Loading or Active but is shutting down.
204 : /// It cannot transition back into any other state.
205 : Stopping,
206 : /// The timeline is broken and not operational (previous states: Loading or Active).
207 : Broken { reason: String, backtrace: String },
208 : }
209 :
210 0 : #[derive(Serialize, Deserialize, Clone)]
211 : pub struct TimelineCreateRequest {
212 : pub new_timeline_id: TimelineId,
213 : #[serde(flatten)]
214 : pub mode: TimelineCreateRequestMode,
215 : }
216 :
217 0 : #[derive(Serialize, Deserialize, Clone)]
218 : #[serde(untagged)]
219 : pub enum TimelineCreateRequestMode {
220 : Branch {
221 : ancestor_timeline_id: TimelineId,
222 : #[serde(default)]
223 : ancestor_start_lsn: Option<Lsn>,
224 : // TODO: cplane sets this, but the branching code always
225 : // inherits the ancestor's pg_version. Earlier code wasn't
226 : // using a flattened enum, so it was an accepted field, and
227 : // we continue to accept it by having it here.
228 : pg_version: Option<u32>,
229 : },
230 : // NB: Bootstrap is all-optional, so with serde(untagged) a request that does not match Branch
231 : // deserializes as Bootstrap (serde picks the first matching enum variant, in declaration order).
232 : Bootstrap {
233 : #[serde(default)]
234 : existing_initdb_timeline_id: Option<TimelineId>,
235 : pg_version: Option<u32>,
236 : },
237 : }
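// Illustrative sketch of the variant selection described in the NB above (relies on
// serde_json, which the tests at the bottom of this file already use): a body carrying
// `ancestor_timeline_id` matches `Branch`, while an empty body falls through to `Bootstrap`.
// The timeline id below is an arbitrary placeholder.
#[cfg(test)]
mod timeline_create_mode_example {
    use super::*;

    #[test]
    fn untagged_variant_selection() {
        let branch: TimelineCreateRequestMode = serde_json::from_str(
            r#"{"ancestor_timeline_id": "11223344556677881122334455667788"}"#,
        )
        .unwrap();
        assert!(matches!(branch, TimelineCreateRequestMode::Branch { .. }));

        let bootstrap: TimelineCreateRequestMode = serde_json::from_str("{}").unwrap();
        assert!(matches!(bootstrap, TimelineCreateRequestMode::Bootstrap { .. }));
    }
}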
238 :
239 0 : #[derive(Serialize, Deserialize, Clone)]
240 : pub struct LsnLeaseRequest {
241 : pub lsn: Lsn,
242 : }
243 :
244 0 : #[derive(Serialize, Deserialize)]
245 : pub struct TenantShardSplitRequest {
246 : pub new_shard_count: u8,
247 :
248 : // A tenant's stripe size is only meaningful the first time their shard count goes
249 : // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
250 : //
251 : // If this is set while the stripe count is being increased from an already >1 value,
252 : // then the request will fail with 400.
253 : pub new_stripe_size: Option<ShardStripeSize>,
254 : }
255 :
256 0 : #[derive(Serialize, Deserialize)]
257 : pub struct TenantShardSplitResponse {
258 : pub new_shards: Vec<TenantShardId>,
259 : }
260 :
261 : /// Parameters that apply to all shards in a tenant. Used during tenant creation.
262 0 : #[derive(Serialize, Deserialize, Debug)]
263 : #[serde(deny_unknown_fields)]
264 : pub struct ShardParameters {
265 : pub count: ShardCount,
266 : pub stripe_size: ShardStripeSize,
267 : }
268 :
269 : impl ShardParameters {
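// 256 * 1024 / 8 = 32768: assuming the stripe unit is an 8 KiB page (BLCKSZ), this
// corresponds to a 256 MiB stripe.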
270 : pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
271 :
272 0 : pub fn is_unsharded(&self) -> bool {
273 0 : self.count.is_unsharded()
274 0 : }
275 : }
276 :
277 : impl Default for ShardParameters {
278 193 : fn default() -> Self {
279 193 : Self {
280 193 : count: ShardCount::new(0),
281 193 : stripe_size: Self::DEFAULT_STRIPE_SIZE,
282 193 : }
283 193 : }
284 : }
285 :
286 : /// An alternative representation of `pageserver::tenant::TenantConf` with
287 : /// simpler types.
288 2 : #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
289 : pub struct TenantConfig {
290 : pub checkpoint_distance: Option<u64>,
291 : pub checkpoint_timeout: Option<String>,
292 : pub compaction_target_size: Option<u64>,
293 : pub compaction_period: Option<String>,
294 : pub compaction_threshold: Option<usize>,
295 : // defer parsing compaction_algorithm, like eviction_policy
296 : pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
297 : pub gc_horizon: Option<u64>,
298 : pub gc_period: Option<String>,
299 : pub image_creation_threshold: Option<usize>,
300 : pub pitr_interval: Option<String>,
301 : pub walreceiver_connect_timeout: Option<String>,
302 : pub lagging_wal_timeout: Option<String>,
303 : pub max_lsn_wal_lag: Option<NonZeroU64>,
304 : pub eviction_policy: Option<EvictionPolicy>,
305 : pub min_resident_size_override: Option<u64>,
306 : pub evictions_low_residence_duration_metric_threshold: Option<String>,
307 : pub heatmap_period: Option<String>,
308 : pub lazy_slru_download: Option<bool>,
309 : pub timeline_get_throttle: Option<ThrottleConfig>,
310 : pub image_layer_creation_check_threshold: Option<u8>,
311 : pub lsn_lease_length: Option<String>,
312 : pub lsn_lease_length_for_ts: Option<String>,
313 : pub timeline_offloading: Option<bool>,
314 : }
315 :
316 : /// The policy for the aux file storage.
317 : ///
318 : /// It can be switched through the `switch_aux_file_policy` tenant config.
319 : /// When the first aux file is written, the policy will be persisted in the
320 : /// `index_part.json` file and has a limited migration path.
321 : ///
322 : /// Currently, we only allow the following migration paths:
323 : ///
324 : /// * Unset -> V1
325 : /// * Unset -> V2
326 : /// * Unset -> CrossValidation -> V2
327 : #[derive(
328 : Eq,
329 : PartialEq,
330 : Debug,
331 : Copy,
332 : Clone,
333 2 : strum_macros::EnumString,
334 0 : strum_macros::Display,
335 0 : serde_with::DeserializeFromStr,
336 : serde_with::SerializeDisplay,
337 : )]
338 : #[strum(serialize_all = "kebab-case")]
339 : pub enum AuxFilePolicy {
340 : /// V1 aux file policy: store everything in AUX_FILE_KEY
341 : #[strum(ascii_case_insensitive)]
342 : V1,
343 : /// V2 aux file policy: store in the AUX_FILE keyspace
344 : #[strum(ascii_case_insensitive)]
345 : V2,
346 : /// Cross validation runs both formats on the write path and does validation
347 : /// on the read path.
348 : #[strum(ascii_case_insensitive)]
349 : CrossValidation,
350 : }
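// A small sketch of how the strum-derived `FromStr` above behaves: variant names are
// spelled in kebab-case and matching is ASCII-case-insensitive.
#[cfg(test)]
mod aux_file_policy_example {
    use super::*;
    use std::str::FromStr;

    #[test]
    fn parse_is_case_insensitive() {
        assert_eq!(AuxFilePolicy::from_str("v1").unwrap(), AuxFilePolicy::V1);
        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(
            AuxFilePolicy::from_str("cross-validation").unwrap(),
            AuxFilePolicy::CrossValidation
        );
    }
}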
351 :
352 0 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
353 : #[serde(tag = "kind")]
354 : pub enum EvictionPolicy {
355 : NoEviction,
356 : LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
357 : OnlyImitiate(EvictionPolicyLayerAccessThreshold),
358 : }
359 :
360 : impl EvictionPolicy {
361 0 : pub fn discriminant_str(&self) -> &'static str {
362 0 : match self {
363 0 : EvictionPolicy::NoEviction => "NoEviction",
364 0 : EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
365 0 : EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
366 : }
367 0 : }
368 : }
369 :
370 : #[derive(
371 : Eq,
372 : PartialEq,
373 : Debug,
374 : Copy,
375 : Clone,
376 0 : strum_macros::EnumString,
377 0 : strum_macros::Display,
378 0 : serde_with::DeserializeFromStr,
379 : serde_with::SerializeDisplay,
380 : )]
381 : #[strum(serialize_all = "kebab-case")]
382 : pub enum CompactionAlgorithm {
383 : Legacy,
384 : Tiered,
385 : }
386 :
387 : #[derive(
388 0 : Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
389 : )]
390 : pub enum ImageCompressionAlgorithm {
391 : // Disabled for writes, support decompressing during read path
392 : Disabled,
393 : /// Zstandard compression. Level 0 and None both mean the default level. Levels can be negative as well.
394 : /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
395 : Zstd {
396 : level: Option<i8>,
397 : },
398 : }
399 :
400 : impl FromStr for ImageCompressionAlgorithm {
401 : type Err = anyhow::Error;
402 8 : fn from_str(s: &str) -> Result<Self, Self::Err> {
403 8 : let mut components = s.split(['(', ')']);
404 8 : let first = components
405 8 : .next()
406 8 : .ok_or_else(|| anyhow::anyhow!("empty string"))?;
407 8 : match first {
408 8 : "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
409 6 : "zstd" => {
410 6 : let level = if let Some(v) = components.next() {
411 4 : let v: i8 = v.parse()?;
412 4 : Some(v)
413 : } else {
414 2 : None
415 : };
416 :
417 6 : Ok(ImageCompressionAlgorithm::Zstd { level })
418 : }
419 0 : _ => anyhow::bail!("invalid specifier '{first}'"),
420 : }
421 8 : }
422 : }
423 :
424 : impl Display for ImageCompressionAlgorithm {
425 12 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
426 12 : match self {
427 3 : ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
428 9 : ImageCompressionAlgorithm::Zstd { level } => {
429 9 : if let Some(level) = level {
430 6 : write!(f, "zstd({})", level)
431 : } else {
432 3 : write!(f, "zstd")
433 : }
434 : }
435 : }
436 12 : }
437 : }
438 :
439 0 : #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
440 : pub struct CompactionAlgorithmSettings {
441 : pub kind: CompactionAlgorithm,
442 : }
443 :
444 6 : #[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
445 : #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
446 : pub enum L0FlushConfig {
447 : #[serde(rename_all = "snake_case")]
448 : Direct { max_concurrency: NonZeroUsize },
449 : }
450 :
451 0 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
452 : pub struct EvictionPolicyLayerAccessThreshold {
453 : #[serde(with = "humantime_serde")]
454 : pub period: Duration,
455 : #[serde(with = "humantime_serde")]
456 : pub threshold: Duration,
457 : }
458 :
459 0 : #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
460 : pub struct ThrottleConfig {
461 : pub task_kinds: Vec<String>, // TaskKind
462 : pub initial: u32,
463 : #[serde(with = "humantime_serde")]
464 : pub refill_interval: Duration,
465 : pub refill_amount: NonZeroU32,
466 : pub max: u32,
467 : }
468 :
469 : impl ThrottleConfig {
470 394 : pub fn disabled() -> Self {
471 394 : Self {
472 394 : task_kinds: vec![], // effectively disables the throttle
473 394 : // other values don't matter with empty `task_kinds`.
474 394 : initial: 0,
475 394 : refill_interval: Duration::from_millis(1),
476 394 : refill_amount: NonZeroU32::new(1).unwrap(),
477 394 : max: 1,
478 394 : }
479 394 : }
480 : /// The requests per second allowed by the given config.
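/// For example, `refill_amount = 100` with `refill_interval = 200ms` yields a steady rate of
/// 100 / 0.2 = 500 requests per second.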
481 0 : pub fn steady_rps(&self) -> f64 {
482 0 : (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
483 0 : }
484 : }
485 :
486 : /// A flattened analog of a `pageserver::tenant::LocationMode`, which
487 : /// lists out all possible states (and the virtual "Detached" state)
488 : /// in a flat form rather than using rust-style enums.
489 0 : #[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
490 : pub enum LocationConfigMode {
491 : AttachedSingle,
492 : AttachedMulti,
493 : AttachedStale,
494 : Secondary,
495 : Detached,
496 : }
497 :
498 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
499 : pub struct LocationConfigSecondary {
500 : pub warm: bool,
501 : }
502 :
503 : /// An alternative representation of `pageserver::tenant::LocationConf`,
504 : /// for use in external-facing APIs.
505 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
506 : pub struct LocationConfig {
507 : pub mode: LocationConfigMode,
508 : /// If attaching, in what generation?
509 : #[serde(default)]
510 : pub generation: Option<u32>,
511 :
512 : // If requesting mode `Secondary`, configuration for that.
513 : #[serde(default)]
514 : pub secondary_conf: Option<LocationConfigSecondary>,
515 :
516 : // Shard parameters: if shard_count is nonzero, then other shard_* fields
517 : // must be set accurately.
518 : #[serde(default)]
519 : pub shard_number: u8,
520 : #[serde(default)]
521 : pub shard_count: u8,
522 : #[serde(default)]
523 : pub shard_stripe_size: u32,
524 :
525 : // This configuration only affects attached mode, but should be provided irrespective
526 : // of the mode, as a secondary location might transition on startup if the response
527 : // to the `/re-attach` control plane API requests it.
528 : pub tenant_conf: TenantConfig,
529 : }
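// Illustrative sketch (using serde_json, as the tests at the bottom of this file do): only
// `mode` and `tenant_conf` need to be present in a request body; the `#[serde(default)]`
// fields fall back to their defaults.
#[cfg(test)]
mod location_config_example {
    use super::*;

    #[test]
    fn minimal_attached_single() {
        let conf: LocationConfig = serde_json::from_value(serde_json::json!({
            "mode": "AttachedSingle",
            "tenant_conf": {}
        }))
        .unwrap();

        assert_eq!(conf.mode, LocationConfigMode::AttachedSingle);
        assert_eq!(conf.generation, None);
        assert_eq!(conf.shard_count, 0);
        assert!(conf.secondary_conf.is_none());
    }
}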
530 :
531 0 : #[derive(Serialize, Deserialize)]
532 : pub struct LocationConfigListResponse {
533 : pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
534 : }
535 :
536 : #[derive(Serialize)]
537 : pub struct StatusResponse {
538 : pub id: NodeId,
539 : }
540 :
541 0 : #[derive(Serialize, Deserialize, Debug)]
542 : #[serde(deny_unknown_fields)]
543 : pub struct TenantLocationConfigRequest {
544 : #[serde(flatten)]
545 : pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
546 : }
547 :
548 0 : #[derive(Serialize, Deserialize, Debug)]
549 : #[serde(deny_unknown_fields)]
550 : pub struct TenantTimeTravelRequest {
551 : pub shard_counts: Vec<ShardCount>,
552 : }
553 :
554 0 : #[derive(Serialize, Deserialize, Debug)]
555 : #[serde(deny_unknown_fields)]
556 : pub struct TenantShardLocation {
557 : pub shard_id: TenantShardId,
558 : pub node_id: NodeId,
559 : }
560 :
561 0 : #[derive(Serialize, Deserialize, Debug)]
562 : #[serde(deny_unknown_fields)]
563 : pub struct TenantLocationConfigResponse {
564 : pub shards: Vec<TenantShardLocation>,
565 : // If the shard count is >1, stripe_size will be set.
566 : pub stripe_size: Option<ShardStripeSize>,
567 : }
568 :
569 3 : #[derive(Serialize, Deserialize, Debug)]
570 : #[serde(deny_unknown_fields)]
571 : pub struct TenantConfigRequest {
572 : pub tenant_id: TenantId,
573 : #[serde(flatten)]
574 : pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
575 : }
576 :
577 : impl std::ops::Deref for TenantConfigRequest {
578 : type Target = TenantConfig;
579 :
580 0 : fn deref(&self) -> &Self::Target {
581 0 : &self.config
582 0 : }
583 : }
584 :
585 : impl TenantConfigRequest {
586 0 : pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
587 0 : let config = TenantConfig::default();
588 0 : TenantConfigRequest { tenant_id, config }
589 0 : }
590 : }
591 :
592 : /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
593 0 : #[derive(Serialize, Deserialize, Clone)]
594 : #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
595 : pub enum TenantAttachmentStatus {
596 : Maybe,
597 : Attached,
598 : Failed { reason: String },
599 : }
600 :
601 0 : #[derive(Serialize, Deserialize, Clone)]
602 : pub struct TenantInfo {
603 : pub id: TenantShardId,
604 : // NB: intentionally not part of OpenAPI; we don't want to commit to a specific set of TenantState variants
605 : pub state: TenantState,
606 : /// Sum of the size of all layer files.
607 : /// If a layer is present in both local FS and S3, it counts only once.
608 : pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
609 : pub attachment_status: TenantAttachmentStatus,
610 : pub generation: u32,
611 :
612 : /// Opaque explanation if gc is being blocked.
613 : ///
614 : /// Only looked up for the individual tenant detail, not the listing. This is purely for
615 : /// debugging, not included in openapi.
616 : #[serde(skip_serializing_if = "Option::is_none")]
617 : pub gc_blocking: Option<String>,
618 : }
619 :
620 0 : #[derive(Serialize, Deserialize, Clone)]
621 : pub struct TenantDetails {
622 : #[serde(flatten)]
623 : pub tenant_info: TenantInfo,
624 :
625 : pub walredo: Option<WalRedoManagerStatus>,
626 :
627 : pub timelines: Vec<TimelineId>,
628 : }
629 :
630 0 : #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
631 : pub enum TimelineArchivalState {
632 : Archived,
633 : Unarchived,
634 : }
635 :
636 0 : #[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
637 : pub struct TimelineArchivalConfigRequest {
638 : pub state: TimelineArchivalState,
639 : }
640 :
641 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
642 : pub struct TimelinesInfoAndOffloaded {
643 : pub timelines: Vec<TimelineInfo>,
644 : pub offloaded: Vec<OffloadedTimelineInfo>,
645 : }
646 :
647 : /// Analog of [`TimelineInfo`] for offloaded timelines.
648 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
649 : pub struct OffloadedTimelineInfo {
650 : pub tenant_id: TenantShardId,
651 : pub timeline_id: TimelineId,
652 : /// Whether the timeline has a parent it has been branched off from or not
653 : pub ancestor_timeline_id: Option<TimelineId>,
654 : /// Whether to retain the branch lsn at the ancestor or not
655 : pub ancestor_retain_lsn: Option<Lsn>,
656 : /// The time point when the timeline was archived
657 : pub archived_at: chrono::DateTime<chrono::Utc>,
658 : }
659 :
660 : /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
661 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
662 : pub struct TimelineInfo {
663 : pub tenant_id: TenantShardId,
664 : pub timeline_id: TimelineId,
665 :
666 : pub ancestor_timeline_id: Option<TimelineId>,
667 : pub ancestor_lsn: Option<Lsn>,
668 : pub last_record_lsn: Lsn,
669 : pub prev_record_lsn: Option<Lsn>,
670 : pub latest_gc_cutoff_lsn: Lsn,
671 : pub disk_consistent_lsn: Lsn,
672 :
673 : /// The LSN that we have successfully uploaded to remote storage
674 : pub remote_consistent_lsn: Lsn,
675 :
676 : /// The LSN that we are advertising to safekeepers
677 : pub remote_consistent_lsn_visible: Lsn,
678 :
679 : /// The LSN from the start of the root timeline (never changes)
680 : pub initdb_lsn: Lsn,
681 :
682 : pub current_logical_size: u64,
683 : pub current_logical_size_is_accurate: bool,
684 :
685 : pub directory_entries_counts: Vec<u64>,
686 :
687 : /// Sum of the size of all layer files.
688 : /// If a layer is present in both local FS and S3, it counts only once.
689 : pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
690 : pub current_logical_size_non_incremental: Option<u64>,
691 :
692 : /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
693 : /// beyond the branch's branch point, we only count up to the branch point.
694 : pub pitr_history_size: u64,
695 :
696 : /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
697 : /// ancestor data used by this branch would have been retained anyway). If this is false, then
698 : /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
699 : /// otherwise be able to GC.
700 : pub within_ancestor_pitr: bool,
701 :
702 : pub timeline_dir_layer_file_size_sum: Option<u64>,
703 :
704 : pub wal_source_connstr: Option<String>,
705 : pub last_received_msg_lsn: Option<Lsn>,
706 : /// the timestamp (in microseconds) of the last received message
707 : pub last_received_msg_ts: Option<u128>,
708 : pub pg_version: u32,
709 :
710 : pub state: TimelineState,
711 :
712 : pub walreceiver_status: String,
713 :
714 : // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
715 : // Backward compatibility: you will get a JSON not containing the newly-added field.
716 : // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
717 : // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
718 : // read.
719 : pub is_archived: Option<bool>,
720 : }
721 :
722 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
723 : pub struct LayerMapInfo {
724 : pub in_memory_layers: Vec<InMemoryLayerInfo>,
725 : pub historic_layers: Vec<HistoricLayerInfo>,
726 : }
727 :
728 : /// The residence status of a layer
729 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
730 : pub enum LayerResidenceStatus {
731 : /// Residence status for a layer file that exists locally.
732 : /// It may also exist on the remote, we don't care here.
733 : Resident,
734 : /// Residence status for a layer file that only exists on the remote.
735 : Evicted,
736 : }
737 :
738 : #[serde_as]
739 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
740 : pub struct LayerAccessStats {
741 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
742 : pub access_time: SystemTime,
743 :
744 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
745 : pub residence_time: SystemTime,
746 :
747 : pub visible: bool,
748 : }
749 :
750 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
751 : #[serde(tag = "kind")]
752 : pub enum InMemoryLayerInfo {
753 : Open { lsn_start: Lsn },
754 : Frozen { lsn_start: Lsn, lsn_end: Lsn },
755 : }
756 :
757 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
758 : #[serde(tag = "kind")]
759 : pub enum HistoricLayerInfo {
760 : Delta {
761 : layer_file_name: String,
762 : layer_file_size: u64,
763 :
764 : lsn_start: Lsn,
765 : lsn_end: Lsn,
766 : remote: bool,
767 : access_stats: LayerAccessStats,
768 :
769 : l0: bool,
770 : },
771 : Image {
772 : layer_file_name: String,
773 : layer_file_size: u64,
774 :
775 : lsn_start: Lsn,
776 : remote: bool,
777 : access_stats: LayerAccessStats,
778 : },
779 : }
780 :
781 : impl HistoricLayerInfo {
782 0 : pub fn layer_file_name(&self) -> &str {
783 0 : match self {
784 : HistoricLayerInfo::Delta {
785 0 : layer_file_name, ..
786 0 : } => layer_file_name,
787 : HistoricLayerInfo::Image {
788 0 : layer_file_name, ..
789 0 : } => layer_file_name,
790 : }
791 0 : }
792 0 : pub fn is_remote(&self) -> bool {
793 0 : match self {
794 0 : HistoricLayerInfo::Delta { remote, .. } => *remote,
795 0 : HistoricLayerInfo::Image { remote, .. } => *remote,
796 : }
797 0 : }
798 0 : pub fn set_remote(&mut self, value: bool) {
799 0 : let field = match self {
800 0 : HistoricLayerInfo::Delta { remote, .. } => remote,
801 0 : HistoricLayerInfo::Image { remote, .. } => remote,
802 : };
803 0 : *field = value;
804 0 : }
805 0 : pub fn layer_file_size(&self) -> u64 {
806 0 : match self {
807 : HistoricLayerInfo::Delta {
808 0 : layer_file_size, ..
809 0 : } => *layer_file_size,
810 : HistoricLayerInfo::Image {
811 0 : layer_file_size, ..
812 0 : } => *layer_file_size,
813 : }
814 0 : }
815 : }
816 :
817 0 : #[derive(Debug, Serialize, Deserialize)]
818 : pub struct DownloadRemoteLayersTaskSpawnRequest {
819 : pub max_concurrent_downloads: NonZeroUsize,
820 : }
821 :
822 0 : #[derive(Debug, Serialize, Deserialize)]
823 : pub struct IngestAuxFilesRequest {
824 : pub aux_files: HashMap<String, String>,
825 : }
826 :
827 0 : #[derive(Debug, Serialize, Deserialize)]
828 : pub struct ListAuxFilesRequest {
829 : pub lsn: Lsn,
830 : }
831 :
832 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
833 : pub struct DownloadRemoteLayersTaskInfo {
834 : pub task_id: String,
835 : pub state: DownloadRemoteLayersTaskState,
836 : pub total_layer_count: u64, // stable once `completed`
837 : pub successful_download_count: u64, // stable once `completed`
838 : pub failed_download_count: u64, // stable once `completed`
839 : }
840 :
841 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
842 : pub enum DownloadRemoteLayersTaskState {
843 : Running,
844 : Completed,
845 : ShutDown,
846 : }
847 :
848 0 : #[derive(Debug, Serialize, Deserialize)]
849 : pub struct TimelineGcRequest {
850 : pub gc_horizon: Option<u64>,
851 : }
852 :
853 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
854 : pub struct WalRedoManagerProcessStatus {
855 : pub pid: u32,
856 : }
857 :
858 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
859 : pub struct WalRedoManagerStatus {
860 : pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
861 : pub process: Option<WalRedoManagerProcessStatus>,
862 : }
863 :
864 : /// The progress of a secondary tenant.
865 : ///
866 : /// It is mostly useful when doing a long-running download: e.g. initiating
867 : /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
868 : /// what's happening.
869 0 : #[derive(Default, Debug, Serialize, Deserialize, Clone)]
870 : pub struct SecondaryProgress {
871 : /// The remote storage LastModified time of the heatmap object we last downloaded.
872 : pub heatmap_mtime: Option<serde_system_time::SystemTime>,
873 :
874 : /// The number of layers currently on-disk
875 : pub layers_downloaded: usize,
876 : /// The number of layers in the most recently seen heatmap
877 : pub layers_total: usize,
878 :
879 : /// The number of layer bytes currently on-disk
880 : pub bytes_downloaded: u64,
881 : /// The number of layer bytes in the most recently seen heatmap
882 : pub bytes_total: u64,
883 : }
884 :
885 0 : #[derive(Serialize, Deserialize, Debug)]
886 : pub struct TenantScanRemoteStorageShard {
887 : pub tenant_shard_id: TenantShardId,
888 : pub generation: Option<u32>,
889 : }
890 :
891 0 : #[derive(Serialize, Deserialize, Debug, Default)]
892 : pub struct TenantScanRemoteStorageResponse {
893 : pub shards: Vec<TenantScanRemoteStorageShard>,
894 : }
895 :
896 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
897 : #[serde(rename_all = "snake_case")]
898 : pub enum TenantSorting {
899 : ResidentSize,
900 : MaxLogicalSize,
901 : }
902 :
903 : impl Default for TenantSorting {
904 0 : fn default() -> Self {
905 0 : Self::ResidentSize
906 0 : }
907 : }
908 :
909 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
910 : pub struct TopTenantShardsRequest {
911 : // How would you like to sort the tenants?
912 : pub order_by: TenantSorting,
913 :
914 : // How many results?
915 : pub limit: usize,
916 :
917 : // Omit tenants with more than this many shards (e.g. if this is the max number of shards
918 : // that the caller would ever split to)
919 : pub where_shards_lt: Option<ShardCount>,
920 :
921 : // Omit tenants where the ordering metric is less than this (this is an optimization to
922 : // let us quickly exclude numerous tiny shards)
923 : pub where_gt: Option<u64>,
924 : }
925 :
926 0 : #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
927 : pub struct TopTenantShardItem {
928 : pub id: TenantShardId,
929 :
930 : /// Total size of layers on local disk for all timelines in this tenant
931 : pub resident_size: u64,
932 :
933 : /// Total size of layers in remote storage for all timelines in this tenant
934 : pub physical_size: u64,
935 :
936 : /// The largest logical size of a timeline within this tenant
937 : pub max_logical_size: u64,
938 : }
939 :
940 0 : #[derive(Serialize, Deserialize, Debug, Default)]
941 : pub struct TopTenantShardsResponse {
942 : pub shards: Vec<TopTenantShardItem>,
943 : }
944 :
945 : pub mod virtual_file {
946 : #[derive(
947 : Copy,
948 : Clone,
949 : PartialEq,
950 : Eq,
951 : Hash,
952 204 : strum_macros::EnumString,
953 0 : strum_macros::Display,
954 0 : serde_with::DeserializeFromStr,
955 : serde_with::SerializeDisplay,
956 : Debug,
957 : )]
958 : #[strum(serialize_all = "kebab-case")]
959 : pub enum IoEngineKind {
960 : StdFs,
961 : #[cfg(target_os = "linux")]
962 : TokioEpollUring,
963 : }
964 :
965 : /// Direct IO modes for a pageserver.
966 : #[derive(
967 : Copy,
968 : Clone,
969 : PartialEq,
970 : Eq,
971 : Hash,
972 0 : strum_macros::EnumString,
973 0 : strum_macros::Display,
974 0 : serde_with::DeserializeFromStr,
975 : serde_with::SerializeDisplay,
976 : Debug,
977 : )]
978 : #[strum(serialize_all = "kebab-case")]
979 : #[repr(u8)]
980 : pub enum IoMode {
981 : /// Uses buffered IO.
982 : Buffered,
983 : /// Uses direct IO; errors out if the operation fails.
984 : #[cfg(target_os = "linux")]
985 : Direct,
986 : }
987 :
988 : impl IoMode {
989 210 : pub const fn preferred() -> Self {
990 210 : Self::Buffered
991 210 : }
992 : }
993 :
994 : impl TryFrom<u8> for IoMode {
995 : type Error = u8;
996 :
997 1190 : fn try_from(value: u8) -> Result<Self, Self::Error> {
998 1190 : Ok(match value {
999 1190 : v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
1000 : #[cfg(target_os = "linux")]
1001 0 : v if v == (IoMode::Direct as u8) => IoMode::Direct,
1002 0 : x => return Err(x),
1003 : })
1004 1190 : }
1005 : }
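    // A small sketch: `IoMode` is `#[repr(u8)]`, so a mode can be round-tripped through its
    // numeric value via the `TryFrom<u8>` impl above; unknown values are returned as errors.
    #[cfg(test)]
    mod io_mode_example {
        use super::IoMode;
        use std::convert::TryFrom;

        #[test]
        fn u8_roundtrip() {
            assert_eq!(
                IoMode::try_from(IoMode::Buffered as u8),
                Ok(IoMode::Buffered)
            );
            assert!(IoMode::try_from(255u8).is_err());
        }
    }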
1006 : }
1007 :
1008 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
1009 : pub struct ScanDisposableKeysResponse {
1010 : pub disposable_count: usize,
1011 : pub not_disposable_count: usize,
1012 : }
1013 :
1014 : // Wrapped in libpq CopyData
1015 : #[derive(PartialEq, Eq, Debug)]
1016 : pub enum PagestreamFeMessage {
1017 : Exists(PagestreamExistsRequest),
1018 : Nblocks(PagestreamNblocksRequest),
1019 : GetPage(PagestreamGetPageRequest),
1020 : DbSize(PagestreamDbSizeRequest),
1021 : GetSlruSegment(PagestreamGetSlruSegmentRequest),
1022 : }
1023 :
1024 : // Wrapped in libpq CopyData
1025 0 : #[derive(strum_macros::EnumProperty)]
1026 : pub enum PagestreamBeMessage {
1027 : Exists(PagestreamExistsResponse),
1028 : Nblocks(PagestreamNblocksResponse),
1029 : GetPage(PagestreamGetPageResponse),
1030 : Error(PagestreamErrorResponse),
1031 : DbSize(PagestreamDbSizeResponse),
1032 : GetSlruSegment(PagestreamGetSlruSegmentResponse),
1033 : }
1034 :
1035 : // Keep in sync with `pagestore_client.h`
1036 : #[repr(u8)]
1037 : enum PagestreamBeMessageTag {
1038 : Exists = 100,
1039 : Nblocks = 101,
1040 : GetPage = 102,
1041 : Error = 103,
1042 : DbSize = 104,
1043 : GetSlruSegment = 105,
1044 : }
1045 : impl TryFrom<u8> for PagestreamBeMessageTag {
1046 : type Error = u8;
1047 0 : fn try_from(value: u8) -> Result<Self, u8> {
1048 0 : match value {
1049 0 : 100 => Ok(PagestreamBeMessageTag::Exists),
1050 0 : 101 => Ok(PagestreamBeMessageTag::Nblocks),
1051 0 : 102 => Ok(PagestreamBeMessageTag::GetPage),
1052 0 : 103 => Ok(PagestreamBeMessageTag::Error),
1053 0 : 104 => Ok(PagestreamBeMessageTag::DbSize),
1054 0 : 105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
1055 0 : _ => Err(value),
1056 : }
1057 0 : }
1058 : }
1059 :
1060 : // A GetPage request contains two LSN values:
1061 : //
1062 : // request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
1063 : // "get the latest version present". It's used by the primary server, which knows that no one else
1064 : // is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
1065 : // Lsn::Max. Standby servers use the current replay LSN as the request LSN.
1066 : //
1067 : // not_modified_since: Hint to the pageserver that the client knows that the page has not been
1068 : // modified between 'not_modified_since' and the request LSN. It's always correct to set
1069 : // 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
1070 : // passing an earlier LSN can speed up the request, by allowing the pageserver to process the
1071 : // request without waiting for 'request_lsn' to arrive.
1072 : //
1073 : // The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
1074 : // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
1075 : // 'latest' was set to true. The V2 interface was added because there was no correct way for a
1076 : // standby to request a page at a particular non-latest LSN, and also include the
1077 : // 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
1078 : // request, if the standby knows that the page hasn't been modified since, and risk getting an error
1079 : // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
1080 : // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
1081 : // interface allows sending both LSNs, and let the pageserver do the right thing. There was no
1082 : // difference in the responses between V1 and V2.
1083 : //
1084 : #[derive(Clone, Copy)]
1085 : pub enum PagestreamProtocolVersion {
1086 : V2,
1087 : }
1088 :
1089 : #[derive(Debug, PartialEq, Eq)]
1090 : pub struct PagestreamExistsRequest {
1091 : pub request_lsn: Lsn,
1092 : pub not_modified_since: Lsn,
1093 : pub rel: RelTag,
1094 : }
1095 :
1096 : #[derive(Debug, PartialEq, Eq)]
1097 : pub struct PagestreamNblocksRequest {
1098 : pub request_lsn: Lsn,
1099 : pub not_modified_since: Lsn,
1100 : pub rel: RelTag,
1101 : }
1102 :
1103 : #[derive(Debug, PartialEq, Eq)]
1104 : pub struct PagestreamGetPageRequest {
1105 : pub request_lsn: Lsn,
1106 : pub not_modified_since: Lsn,
1107 : pub rel: RelTag,
1108 : pub blkno: u32,
1109 : }
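// A sketch of the two-LSN scheme described in the long comment above: a primary asks for
// the latest page version with `request_lsn = Lsn(u64::MAX)`, a standby pins `request_lsn`
// to its current replay LSN; both pass the same `not_modified_since` hint. All LSN and
// RelTag values here are made-up placeholders.
#[cfg(test)]
mod getpage_lsn_example {
    use super::*;

    #[test]
    fn primary_vs_standby_request() {
        // Primary: "give me the latest version"; the page is known unchanged since 0/42.
        let primary = PagestreamGetPageRequest {
            request_lsn: Lsn(u64::MAX),
            not_modified_since: Lsn(0x42),
            rel: RelTag {
                spcnode: 1663,
                dbnode: 5,
                relnode: 16384,
                forknum: 0,
            },
            blkno: 0,
        };

        // Standby replaying at 0/1000: request exactly that LSN, with the same hint.
        let standby = PagestreamGetPageRequest {
            request_lsn: Lsn(0x1000),
            not_modified_since: Lsn(0x42),
            rel: RelTag {
                spcnode: 1663,
                dbnode: 5,
                relnode: 16384,
                forknum: 0,
            },
            blkno: 0,
        };

        assert_eq!(primary.not_modified_since, standby.not_modified_since);
        assert_ne!(primary.request_lsn, standby.request_lsn);
    }
}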
1110 :
1111 : #[derive(Debug, PartialEq, Eq)]
1112 : pub struct PagestreamDbSizeRequest {
1113 : pub request_lsn: Lsn,
1114 : pub not_modified_since: Lsn,
1115 : pub dbnode: u32,
1116 : }
1117 :
1118 : #[derive(Debug, PartialEq, Eq)]
1119 : pub struct PagestreamGetSlruSegmentRequest {
1120 : pub request_lsn: Lsn,
1121 : pub not_modified_since: Lsn,
1122 : pub kind: u8,
1123 : pub segno: u32,
1124 : }
1125 :
1126 : #[derive(Debug)]
1127 : pub struct PagestreamExistsResponse {
1128 : pub exists: bool,
1129 : }
1130 :
1131 : #[derive(Debug)]
1132 : pub struct PagestreamNblocksResponse {
1133 : pub n_blocks: u32,
1134 : }
1135 :
1136 : #[derive(Debug)]
1137 : pub struct PagestreamGetPageResponse {
1138 : pub page: Bytes,
1139 : }
1140 :
1141 : #[derive(Debug)]
1142 : pub struct PagestreamGetSlruSegmentResponse {
1143 : pub segment: Bytes,
1144 : }
1145 :
1146 : #[derive(Debug)]
1147 : pub struct PagestreamErrorResponse {
1148 : pub message: String,
1149 : }
1150 :
1151 : #[derive(Debug)]
1152 : pub struct PagestreamDbSizeResponse {
1153 : pub db_size: i64,
1154 : }
1155 :
1156 : // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
1157 : // that require pageserver-internal types. It is sufficient to get the total size.
1158 0 : #[derive(Serialize, Deserialize, Debug)]
1159 : pub struct TenantHistorySize {
1160 : pub id: TenantId,
1161 : /// Size is a mixture of WAL and logical size, so the unit is bytes.
1162 : ///
1163 : /// Will be none if `?inputs_only=true` was given.
1164 : pub size: Option<u64>,
1165 : }
1166 :
1167 : impl PagestreamFeMessage {
1168 : /// Serialize a compute -> pageserver message. This is currently only used in testing
1169 : /// tools. Always uses protocol version 2.
1170 4 : pub fn serialize(&self) -> Bytes {
1171 4 : let mut bytes = BytesMut::new();
1172 4 :
1173 4 : match self {
1174 1 : Self::Exists(req) => {
1175 1 : bytes.put_u8(0);
1176 1 : bytes.put_u64(req.request_lsn.0);
1177 1 : bytes.put_u64(req.not_modified_since.0);
1178 1 : bytes.put_u32(req.rel.spcnode);
1179 1 : bytes.put_u32(req.rel.dbnode);
1180 1 : bytes.put_u32(req.rel.relnode);
1181 1 : bytes.put_u8(req.rel.forknum);
1182 1 : }
1183 :
1184 1 : Self::Nblocks(req) => {
1185 1 : bytes.put_u8(1);
1186 1 : bytes.put_u64(req.request_lsn.0);
1187 1 : bytes.put_u64(req.not_modified_since.0);
1188 1 : bytes.put_u32(req.rel.spcnode);
1189 1 : bytes.put_u32(req.rel.dbnode);
1190 1 : bytes.put_u32(req.rel.relnode);
1191 1 : bytes.put_u8(req.rel.forknum);
1192 1 : }
1193 :
1194 1 : Self::GetPage(req) => {
1195 1 : bytes.put_u8(2);
1196 1 : bytes.put_u64(req.request_lsn.0);
1197 1 : bytes.put_u64(req.not_modified_since.0);
1198 1 : bytes.put_u32(req.rel.spcnode);
1199 1 : bytes.put_u32(req.rel.dbnode);
1200 1 : bytes.put_u32(req.rel.relnode);
1201 1 : bytes.put_u8(req.rel.forknum);
1202 1 : bytes.put_u32(req.blkno);
1203 1 : }
1204 :
1205 1 : Self::DbSize(req) => {
1206 1 : bytes.put_u8(3);
1207 1 : bytes.put_u64(req.request_lsn.0);
1208 1 : bytes.put_u64(req.not_modified_since.0);
1209 1 : bytes.put_u32(req.dbnode);
1210 1 : }
1211 :
1212 0 : Self::GetSlruSegment(req) => {
1213 0 : bytes.put_u8(4);
1214 0 : bytes.put_u64(req.request_lsn.0);
1215 0 : bytes.put_u64(req.not_modified_since.0);
1216 0 : bytes.put_u8(req.kind);
1217 0 : bytes.put_u32(req.segno);
1218 0 : }
1219 : }
1220 :
1221 4 : bytes.into()
1222 4 : }
1223 :
1224 4 : pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
1225 : // these correspond to the NeonMessageTag enum in pagestore_client.h
1226 : //
1227 : // TODO: consider using protobuf or serde bincode for less error prone
1228 : // serialization.
1229 4 : let msg_tag = body.read_u8()?;
1230 :
1231 : // these two fields are the same for every request type
1232 4 : let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
1233 4 : let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
1234 :
1235 4 : match msg_tag {
1236 : 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
1237 1 : request_lsn,
1238 1 : not_modified_since,
1239 1 : rel: RelTag {
1240 1 : spcnode: body.read_u32::<BigEndian>()?,
1241 1 : dbnode: body.read_u32::<BigEndian>()?,
1242 1 : relnode: body.read_u32::<BigEndian>()?,
1243 1 : forknum: body.read_u8()?,
1244 : },
1245 : })),
1246 : 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1247 1 : request_lsn,
1248 1 : not_modified_since,
1249 1 : rel: RelTag {
1250 1 : spcnode: body.read_u32::<BigEndian>()?,
1251 1 : dbnode: body.read_u32::<BigEndian>()?,
1252 1 : relnode: body.read_u32::<BigEndian>()?,
1253 1 : forknum: body.read_u8()?,
1254 : },
1255 : })),
1256 : 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1257 1 : request_lsn,
1258 1 : not_modified_since,
1259 1 : rel: RelTag {
1260 1 : spcnode: body.read_u32::<BigEndian>()?,
1261 1 : dbnode: body.read_u32::<BigEndian>()?,
1262 1 : relnode: body.read_u32::<BigEndian>()?,
1263 1 : forknum: body.read_u8()?,
1264 : },
1265 1 : blkno: body.read_u32::<BigEndian>()?,
1266 : })),
1267 : 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1268 1 : request_lsn,
1269 1 : not_modified_since,
1270 1 : dbnode: body.read_u32::<BigEndian>()?,
1271 : })),
1272 : 4 => Ok(PagestreamFeMessage::GetSlruSegment(
1273 : PagestreamGetSlruSegmentRequest {
1274 0 : request_lsn,
1275 0 : not_modified_since,
1276 0 : kind: body.read_u8()?,
1277 0 : segno: body.read_u32::<BigEndian>()?,
1278 : },
1279 : )),
1280 0 : _ => bail!("unknown smgr message tag: {:?}", msg_tag),
1281 : }
1282 4 : }
1283 : }
1284 :
1285 : impl PagestreamBeMessage {
1286 0 : pub fn serialize(&self) -> Bytes {
1287 0 : let mut bytes = BytesMut::new();
1288 :
1289 : use PagestreamBeMessageTag as Tag;
1290 0 : match self {
1291 0 : Self::Exists(resp) => {
1292 0 : bytes.put_u8(Tag::Exists as u8);
1293 0 : bytes.put_u8(resp.exists as u8);
1294 0 : }
1295 :
1296 0 : Self::Nblocks(resp) => {
1297 0 : bytes.put_u8(Tag::Nblocks as u8);
1298 0 : bytes.put_u32(resp.n_blocks);
1299 0 : }
1300 :
1301 0 : Self::GetPage(resp) => {
1302 0 : bytes.put_u8(Tag::GetPage as u8);
1303 0 : bytes.put(&resp.page[..]);
1304 0 : }
1305 :
1306 0 : Self::Error(resp) => {
1307 0 : bytes.put_u8(Tag::Error as u8);
1308 0 : bytes.put(resp.message.as_bytes());
1309 0 : bytes.put_u8(0); // null terminator
1310 0 : }
1311 0 : Self::DbSize(resp) => {
1312 0 : bytes.put_u8(Tag::DbSize as u8);
1313 0 : bytes.put_i64(resp.db_size);
1314 0 : }
1315 :
1316 0 : Self::GetSlruSegment(resp) => {
1317 0 : bytes.put_u8(Tag::GetSlruSegment as u8);
1318 0 : bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
1319 0 : bytes.put(&resp.segment[..]);
1320 0 : }
1321 : }
1322 :
1323 0 : bytes.into()
1324 0 : }
1325 :
1326 0 : pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
1327 0 : let mut buf = buf.reader();
1328 0 : let msg_tag = buf.read_u8()?;
1329 :
1330 : use PagestreamBeMessageTag as Tag;
1331 0 : let ok =
1332 0 : match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
1333 : Tag::Exists => {
1334 0 : let exists = buf.read_u8()?;
1335 0 : Self::Exists(PagestreamExistsResponse {
1336 0 : exists: exists != 0,
1337 0 : })
1338 : }
1339 : Tag::Nblocks => {
1340 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1341 0 : Self::Nblocks(PagestreamNblocksResponse { n_blocks })
1342 : }
1343 : Tag::GetPage => {
1344 0 : let mut page = vec![0; 8192]; // TODO: use MaybeUninit
1345 0 : buf.read_exact(&mut page)?;
1346 0 : PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
1347 : }
1348 : Tag::Error => {
1349 0 : let mut msg = Vec::new();
1350 0 : buf.read_until(0, &mut msg)?;
1351 0 : let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
1352 0 : let rust_str = cstring.to_str()?;
1353 0 : PagestreamBeMessage::Error(PagestreamErrorResponse {
1354 0 : message: rust_str.to_owned(),
1355 0 : })
1356 : }
1357 : Tag::DbSize => {
1358 0 : let db_size = buf.read_i64::<BigEndian>()?;
1359 0 : Self::DbSize(PagestreamDbSizeResponse { db_size })
1360 : }
1361 : Tag::GetSlruSegment => {
1362 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1363 0 : let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
1364 0 : buf.read_exact(&mut segment)?;
1365 0 : Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
1366 0 : segment: segment.into(),
1367 0 : })
1368 : }
1369 : };
1370 0 : let remaining = buf.into_inner();
1371 0 : if !remaining.is_empty() {
1372 0 : anyhow::bail!(
1373 0 : "remaining bytes in msg with tag={msg_tag}: {}",
1374 0 : remaining.len()
1375 0 : );
1376 0 : }
1377 0 : Ok(ok)
1378 0 : }
1379 :
1380 0 : pub fn kind(&self) -> &'static str {
1381 0 : match self {
1382 0 : Self::Exists(_) => "Exists",
1383 0 : Self::Nblocks(_) => "Nblocks",
1384 0 : Self::GetPage(_) => "GetPage",
1385 0 : Self::Error(_) => "Error",
1386 0 : Self::DbSize(_) => "DbSize",
1387 0 : Self::GetSlruSegment(_) => "GetSlruSegment",
1388 : }
1389 0 : }
1390 : }
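// Sketch of a pageserver -> compute message round trip through the wire format implemented
// above; the value is arbitrary. The existing tests below only exercise the frontend
// (compute -> pageserver) messages.
#[cfg(test)]
mod be_message_example {
    use super::*;

    #[test]
    fn dbsize_roundtrip() {
        let msg = PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size: 8192 });
        let bytes = msg.serialize();
        let parsed = PagestreamBeMessage::deserialize(bytes).unwrap();
        match parsed {
            PagestreamBeMessage::DbSize(resp) => assert_eq!(resp.db_size, 8192),
            other => panic!("unexpected message kind: {}", other.kind()),
        }
    }
}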
1391 :
1392 : #[cfg(test)]
1393 : mod tests {
1394 : use serde_json::json;
1395 : use std::str::FromStr;
1396 :
1397 : use super::*;
1398 :
1399 : #[test]
1400 1 : fn test_pagestream() {
1401 1 : // Test serialization/deserialization of PagestreamFeMessage
1402 1 : let messages = vec![
1403 1 : PagestreamFeMessage::Exists(PagestreamExistsRequest {
1404 1 : request_lsn: Lsn(4),
1405 1 : not_modified_since: Lsn(3),
1406 1 : rel: RelTag {
1407 1 : forknum: 1,
1408 1 : spcnode: 2,
1409 1 : dbnode: 3,
1410 1 : relnode: 4,
1411 1 : },
1412 1 : }),
1413 1 : PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1414 1 : request_lsn: Lsn(4),
1415 1 : not_modified_since: Lsn(4),
1416 1 : rel: RelTag {
1417 1 : forknum: 1,
1418 1 : spcnode: 2,
1419 1 : dbnode: 3,
1420 1 : relnode: 4,
1421 1 : },
1422 1 : }),
1423 1 : PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1424 1 : request_lsn: Lsn(4),
1425 1 : not_modified_since: Lsn(3),
1426 1 : rel: RelTag {
1427 1 : forknum: 1,
1428 1 : spcnode: 2,
1429 1 : dbnode: 3,
1430 1 : relnode: 4,
1431 1 : },
1432 1 : blkno: 7,
1433 1 : }),
1434 1 : PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1435 1 : request_lsn: Lsn(4),
1436 1 : not_modified_since: Lsn(3),
1437 1 : dbnode: 7,
1438 1 : }),
1439 1 : ];
1440 5 : for msg in messages {
1441 4 : let bytes = msg.serialize();
1442 4 : let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
1443 4 : assert!(msg == reconstructed);
1444 : }
1445 1 : }
1446 :
1447 : #[test]
1448 1 : fn test_tenantinfo_serde() {
1449 1 : // Test serialization/deserialization of TenantInfo
1450 1 : let original_active = TenantInfo {
1451 1 : id: TenantShardId::unsharded(TenantId::generate()),
1452 1 : state: TenantState::Active,
1453 1 : current_physical_size: Some(42),
1454 1 : attachment_status: TenantAttachmentStatus::Attached,
1455 1 : generation: 1,
1456 1 : gc_blocking: None,
1457 1 : };
1458 1 : let expected_active = json!({
1459 1 : "id": original_active.id.to_string(),
1460 1 : "state": {
1461 1 : "slug": "Active",
1462 1 : },
1463 1 : "current_physical_size": 42,
1464 1 : "attachment_status": {
1465 1 : "slug":"attached",
1466 1 : },
1467 1 : "generation" : 1
1468 1 : });
1469 1 :
1470 1 : let original_broken = TenantInfo {
1471 1 : id: TenantShardId::unsharded(TenantId::generate()),
1472 1 : state: TenantState::Broken {
1473 1 : reason: "reason".into(),
1474 1 : backtrace: "backtrace info".into(),
1475 1 : },
1476 1 : current_physical_size: Some(42),
1477 1 : attachment_status: TenantAttachmentStatus::Attached,
1478 1 : generation: 1,
1479 1 : gc_blocking: None,
1480 1 : };
1481 1 : let expected_broken = json!({
1482 1 : "id": original_broken.id.to_string(),
1483 1 : "state": {
1484 1 : "slug": "Broken",
1485 1 : "data": {
1486 1 : "backtrace": "backtrace info",
1487 1 : "reason": "reason",
1488 1 : }
1489 1 : },
1490 1 : "current_physical_size": 42,
1491 1 : "attachment_status": {
1492 1 : "slug":"attached",
1493 1 : },
1494 1 : "generation" : 1
1495 1 : });
1496 1 :
1497 1 : assert_eq!(
1498 1 : serde_json::to_value(&original_active).unwrap(),
1499 1 : expected_active
1500 1 : );
1501 :
1502 1 : assert_eq!(
1503 1 : serde_json::to_value(&original_broken).unwrap(),
1504 1 : expected_broken
1505 1 : );
1506 1 : assert!(format!("{:?}", &original_broken.state).contains("reason"));
1507 1 : assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
1508 1 : }
1509 :
1510 : #[test]
1511 1 : fn test_reject_unknown_field() {
1512 1 : let id = TenantId::generate();
1513 1 : let config_request = json!({
1514 1 : "tenant_id": id.to_string(),
1515 1 : "unknown_field": "unknown_value".to_string(),
1516 1 : });
1517 1 : let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
1518 1 : assert!(
1519 1 : err.to_string().contains("unknown field `unknown_field`"),
1520 0 : "expect unknown field `unknown_field` error, got: {}",
1521 : err
1522 : );
1523 1 : }
1524 :
1525 : #[test]
1526 1 : fn tenantstatus_activating_serde() {
1527 1 : let states = [TenantState::Activating(ActivatingFrom::Attaching)];
1528 1 : let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
1529 1 :
1530 1 : let actual = serde_json::to_string(&states).unwrap();
1531 1 :
1532 1 : assert_eq!(actual, expected);
1533 :
1534 1 : let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
1535 1 :
1536 1 : assert_eq!(states.as_slice(), &parsed);
1537 1 : }
1538 :
1539 : #[test]
1540 1 : fn tenantstatus_activating_strum() {
1541 1 : // tests added, because we use these for metrics
1542 1 : let examples = [
1543 1 : (line!(), TenantState::Attaching, "Attaching"),
1544 1 : (
1545 1 : line!(),
1546 1 : TenantState::Activating(ActivatingFrom::Attaching),
1547 1 : "Activating",
1548 1 : ),
1549 1 : (line!(), TenantState::Active, "Active"),
1550 1 : (
1551 1 : line!(),
1552 1 : TenantState::Stopping {
1553 1 : progress: utils::completion::Barrier::default(),
1554 1 : },
1555 1 : "Stopping",
1556 1 : ),
1557 1 : (
1558 1 : line!(),
1559 1 : TenantState::Broken {
1560 1 : reason: "Example".into(),
1561 1 : backtrace: "Looooong backtrace".into(),
1562 1 : },
1563 1 : "Broken",
1564 1 : ),
1565 1 : ];
1566 :
1567 6 : for (line, rendered, expected) in examples {
1568 5 : let actual: &'static str = rendered.into();
1569 5 : assert_eq!(actual, expected, "example on {line}");
1570 : }
1571 1 : }
1572 :
1573 : #[test]
1574 1 : fn test_image_compression_algorithm_parsing() {
1575 : use ImageCompressionAlgorithm::*;
1576 1 : let cases = [
1577 1 : ("disabled", Disabled),
1578 1 : ("zstd", Zstd { level: None }),
1579 1 : ("zstd(18)", Zstd { level: Some(18) }),
1580 1 : ("zstd(-3)", Zstd { level: Some(-3) }),
1581 1 : ];
1582 :
1583 5 : for (display, expected) in cases {
1584 4 : assert_eq!(
1585 4 : ImageCompressionAlgorithm::from_str(display).unwrap(),
1586 : expected,
1587 0 : "parsing works"
1588 : );
1589 4 : assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");
1590 :
1591 4 : let ser = serde_json::to_string(&expected).expect("serialization");
1592 4 : assert_eq!(
1593 4 : serde_json::from_str::<ImageCompressionAlgorithm>(&ser).unwrap(),
1594 : expected,
1595 0 : "serde roundtrip"
1596 : );
1597 :
1598 4 : assert_eq!(
1599 4 : serde_json::Value::String(display.to_string()),
1600 4 : serde_json::to_value(expected).unwrap(),
1601 0 : "Display is the serde serialization"
1602 : );
1603 : }
1604 1 : }
1605 : }