Line data Source code
1 : pub mod detach_ancestor;
2 : pub mod partitioning;
3 : pub mod utilization;
4 :
5 : pub use utilization::PageserverUtilization;
6 :
7 : use std::{
8 : borrow::Cow,
9 : collections::HashMap,
10 : io::{BufRead, Read},
11 : num::{NonZeroU64, NonZeroUsize},
12 : str::FromStr,
13 : sync::atomic::AtomicUsize,
14 : time::{Duration, SystemTime},
15 : };
16 :
17 : use byteorder::{BigEndian, ReadBytesExt};
18 : use postgres_ffi::BLCKSZ;
19 : use serde::{Deserialize, Serialize};
20 : use serde_with::serde_as;
21 : use utils::{
22 : completion,
23 : history_buffer::HistoryBufferWithDropCounter,
24 : id::{NodeId, TenantId, TimelineId},
25 : lsn::Lsn,
26 : serde_system_time,
27 : };
28 :
29 : use crate::controller_api::PlacementPolicy;
30 : use crate::{
31 : reltag::RelTag,
32 : shard::{ShardCount, ShardStripeSize, TenantShardId},
33 : };
34 : use anyhow::bail;
35 : use bytes::{Buf, BufMut, Bytes, BytesMut};
36 :
37 : /// The state of a tenant in this pageserver.
38 : ///
39 : /// ```mermaid
40 : /// stateDiagram-v2
41 : ///
42 : /// [*] --> Loading: spawn_load()
43 : /// [*] --> Attaching: spawn_attach()
44 : ///
45 : /// Loading --> Activating: activate()
46 : /// Attaching --> Activating: activate()
47 : /// Activating --> Active: infallible
48 : ///
49 : /// Loading --> Broken: load() failure
50 : /// Attaching --> Broken: attach() failure
51 : ///
52 : /// Active --> Stopping: set_stopping(), part of shutdown & detach
53 : /// Stopping --> Broken: late error in remove_tenant_from_memory
54 : ///
55 : /// Broken --> [*]: ignore / detach / shutdown
56 : /// Stopping --> [*]: remove_from_memory complete
57 : ///
58 : /// Active --> Broken: cfg(testing)-only tenant break point
59 : /// ```
60 : #[derive(
61 : Clone,
62 : PartialEq,
63 : Eq,
64 2 : serde::Serialize,
65 12 : serde::Deserialize,
66 0 : strum_macros::Display,
67 : strum_macros::EnumVariantNames,
68 0 : strum_macros::AsRefStr,
69 272 : strum_macros::IntoStaticStr,
70 : )]
71 : #[serde(tag = "slug", content = "data")]
72 : pub enum TenantState {
73 : /// This tenant is being loaded from local disk.
74 : ///
75 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
76 : Loading,
77 : /// This tenant is being attached to the pageserver.
78 : ///
79 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
80 : Attaching,
81 : /// The tenant is transitioning from Loading/Attaching to Active.
82 : ///
83 : /// While in this state, the individual timelines are being activated.
84 : ///
85 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
86 : Activating(ActivatingFrom),
87 : /// The tenant has finished activating and is open for business.
88 : ///
89 : /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
90 : Active,
91 : /// The tenant is recognized by pageserver, but it is being detached or the
92 : /// system is being shut down.
93 : ///
94 : /// Transitions out of this state are possible through `set_broken()`.
95 : Stopping {
96 : // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
97 : // otherwise it will not be skipped during deserialization
98 : #[serde(skip)]
99 : progress: completion::Barrier,
100 : },
101 : /// The tenant is recognized by the pageserver, but can no longer be used for
102 : /// any operations.
103 : ///
104 : /// If the tenant fails to load or attach, it will transition to this state
105 : /// and it is guaranteed that no background tasks are running in its name.
106 : ///
107 : /// The other way to transition into this state is from `Stopping` state
108 : /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
109 : /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
110 : Broken { reason: String, backtrace: String },
111 : }
112 :
113 : impl TenantState {
114 0 : pub fn attachment_status(&self) -> TenantAttachmentStatus {
115 : use TenantAttachmentStatus::*;
116 :
117 : // Below, TenantState::Activating is treated as a "transient" or "transparent" state
118 : // when determining attachment_status.
119 0 : match self {
120 : // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
121 : // So, technically, we can return Attached here.
122 : // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
123 : // But, our attach task might still be fetching the remote timelines, etc.
124 : // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
125 0 : Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
126 : // tenant mgr startup distinguishes attaching from loading via marker file.
127 0 : Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
128 : // We only reach Active after successful load / attach.
129 : // So, call attachment status Attached.
130 0 : Self::Active => Attached,
131 : // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
132 : // However, it also becomes Broken if the regular load fails.
133 : // From Console's perspective there's no practical difference
134 : // because attachment_status is polled by console only during attach operation execution.
135 0 : Self::Broken { reason, .. } => Failed {
136 0 : reason: reason.to_owned(),
137 0 : },
138 : // Why is Stopping a Maybe case? Because, during pageserver shutdown,
139 : // we set the Stopping state irrespective of whether the tenant
140 : // has finished attaching or not.
141 0 : Self::Stopping { .. } => Maybe,
142 : }
143 0 : }
144 :
145 0 : pub fn broken_from_reason(reason: String) -> Self {
146 0 : let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
147 0 : Self::Broken {
148 0 : reason,
149 0 : backtrace: backtrace_str,
150 0 : }
151 0 : }
152 : }
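To make the mapping above concrete, here is a minimal sketch (not part of this file) of how a caller might observe `attachment_status()`, assuming this module is available as `pageserver_api::models`:

```rust
use pageserver_api::models::{ActivatingFrom, TenantAttachmentStatus, TenantState};

fn demo_attachment_status() {
    // Attaching (and Activating-from-Attaching) report `Maybe`: the attach task
    // may still be fetching remote timelines, so Console should keep polling.
    assert!(matches!(
        TenantState::Attaching.attachment_status(),
        TenantAttachmentStatus::Maybe
    ));
    assert!(matches!(
        TenantState::Activating(ActivatingFrom::Attaching).attachment_status(),
        TenantAttachmentStatus::Maybe
    ));
    // A fully activated tenant reports `Attached`.
    assert!(matches!(
        TenantState::Active.attachment_status(),
        TenantAttachmentStatus::Attached
    ));
    // A broken tenant surfaces its reason via `Failed`.
    let broken = TenantState::broken_from_reason("disk full".to_string());
    assert!(matches!(
        broken.attachment_status(),
        TenantAttachmentStatus::Failed { .. }
    ));
}
```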
153 :
154 : impl std::fmt::Debug for TenantState {
155 4 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
156 4 : match self {
157 4 : Self::Broken { reason, backtrace } if !reason.is_empty() => {
158 4 : write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
159 : }
160 0 : _ => write!(f, "{self}"),
161 : }
162 4 : }
163 : }
164 :
165 : /// The only [`TenantState`] variants from which we can transition into `TenantState::Activating`.
166 8 : #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
167 : pub enum ActivatingFrom {
168 : /// Arrived at [`TenantState::Activating`] from [`TenantState::Loading`]
169 : Loading,
170 : /// Arrived at [`TenantState::Activating`] from [`TenantState::Attaching`]
171 : Attaching,
172 : }
173 :
174 : /// A state of a timeline in pageserver's memory.
175 0 : #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
176 : pub enum TimelineState {
177 : /// The timeline is recognized by the pageserver but is not yet operational.
178 : /// In particular, the walreceiver connection loop is not running for this timeline.
179 : /// It will eventually transition to state Active or Broken.
180 : Loading,
181 : /// The timeline is fully operational.
182 : /// It can be queried, and the walreceiver connection loop is running.
183 : Active,
184 : /// The timeline was previously Loading or Active but is shutting down.
185 : /// It cannot transition back into any other state.
186 : Stopping,
187 : /// The timeline is broken and not operational (previous states: Loading or Active).
188 : Broken { reason: String, backtrace: String },
189 : }
190 :
191 0 : #[derive(Serialize, Deserialize, Clone)]
192 : pub struct TimelineCreateRequest {
193 : pub new_timeline_id: TimelineId,
194 : #[serde(default)]
195 : pub ancestor_timeline_id: Option<TimelineId>,
196 : #[serde(default)]
197 : pub existing_initdb_timeline_id: Option<TimelineId>,
198 : #[serde(default)]
199 : pub ancestor_start_lsn: Option<Lsn>,
200 : pub pg_version: Option<u32>,
201 : }
202 :
203 0 : #[derive(Serialize, Deserialize)]
204 : pub struct TenantShardSplitRequest {
205 : pub new_shard_count: u8,
206 :
207 : // A tenant's stripe size is only meaningful the first time its shard count goes
208 : // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
209 : //
210 : // If this is set while the shard count is being increased from an already >1 value,
211 : // then the request will fail with 400.
212 : pub new_stripe_size: Option<ShardStripeSize>,
213 : }
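As a concrete illustration of the rule above, a split from one shard may pick a stripe size at the same time. The sketch below constructs such a request; the values are hypothetical, and `ShardStripeSize` is assumed to be constructible as a tuple struct, as it is elsewhere in this file:

```rust
// Hypothetical split request: 1 shard -> 4 shards, choosing 256 MiB stripes
// (32768 pages of 8 KiB). Setting new_stripe_size is only accepted while the
// tenant still has a single shard; otherwise the server answers 400.
fn example_split_request() -> TenantShardSplitRequest {
    TenantShardSplitRequest {
        new_shard_count: 4,
        new_stripe_size: Some(ShardStripeSize(32768)),
    }
}
```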
214 :
215 0 : #[derive(Serialize, Deserialize)]
216 : pub struct TenantShardSplitResponse {
217 : pub new_shards: Vec<TenantShardId>,
218 : }
219 :
220 : /// Parameters that apply to all shards in a tenant. Used during tenant creation.
221 0 : #[derive(Serialize, Deserialize, Debug)]
222 : #[serde(deny_unknown_fields)]
223 : pub struct ShardParameters {
224 : pub count: ShardCount,
225 : pub stripe_size: ShardStripeSize,
226 : }
227 :
228 : impl ShardParameters {
229 : pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
230 :
231 0 : pub fn is_unsharded(&self) -> bool {
232 0 : self.count.is_unsharded()
233 0 : }
234 : }
235 :
236 : impl Default for ShardParameters {
237 140 : fn default() -> Self {
238 140 : Self {
239 140 : count: ShardCount::new(0),
240 140 : stripe_size: Self::DEFAULT_STRIPE_SIZE,
241 140 : }
242 140 : }
243 : }
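For context, `DEFAULT_STRIPE_SIZE` is expressed in pages: 256 * 1024 / 8 = 32768 pages, which at the 8 KiB `BLCKSZ` used by Postgres works out to 256 MiB of key space per stripe. A small sketch of that arithmetic:

```rust
// DEFAULT_STRIPE_SIZE counts 8 KiB pages: 256 * 1024 / 8 = 32768 pages.
// 32768 pages * 8192 bytes/page = 268_435_456 bytes = 256 MiB per stripe.
fn default_stripe_size_bytes() -> u64 {
    let pages: u64 = 256 * 1024 / 8; // matches ShardParameters::DEFAULT_STRIPE_SIZE
    let blcksz: u64 = 8192; // postgres_ffi::BLCKSZ
    pages * blcksz
}
```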
244 :
245 6 : #[derive(Serialize, Deserialize, Debug)]
246 : #[serde(deny_unknown_fields)]
247 : pub struct TenantCreateRequest {
248 : pub new_tenant_id: TenantShardId,
249 : #[serde(default)]
250 : #[serde(skip_serializing_if = "Option::is_none")]
251 : pub generation: Option<u32>,
252 :
253 : // If omitted, create a single shard with TenantShardId::unsharded()
254 : #[serde(default)]
255 : #[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
256 : pub shard_parameters: ShardParameters,
257 :
258 : // This parameter is only meaningful in requests sent to the storage controller
259 : #[serde(default)]
260 : #[serde(skip_serializing_if = "Option::is_none")]
261 : pub placement_policy: Option<PlacementPolicy>,
262 :
263 : #[serde(flatten)]
264 : pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
265 : }
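Because `config` is `#[serde(flatten)]`ed and unknown fields are rejected (see `test_reject_unknown_field` below), a create request body is a flat JSON object. The sketch below builds one with `serde_json`; the tenant id and the config values are hypothetical:

```rust
use serde_json::{json, Value};

fn example_tenant_create_body() -> Value {
    json!({
        "new_tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", // hypothetical unsharded tenant id
        "generation": 1,
        // TenantConfig fields appear at the top level because of #[serde(flatten)];
        // any key that is not a known config field is rejected.
        "checkpoint_distance": 268435456,
        "pitr_interval": "7 days"
    })
}
```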
266 :
267 0 : #[derive(Deserialize, Debug)]
268 : #[serde(deny_unknown_fields)]
269 : pub struct TenantLoadRequest {
270 : #[serde(default)]
271 : #[serde(skip_serializing_if = "Option::is_none")]
272 : pub generation: Option<u32>,
273 : }
274 :
275 : impl std::ops::Deref for TenantCreateRequest {
276 : type Target = TenantConfig;
277 :
278 0 : fn deref(&self) -> &Self::Target {
279 0 : &self.config
280 0 : }
281 : }
282 :
283 : /// An alternative representation of `pageserver::tenant::TenantConf` with
284 : /// simpler types.
285 6 : #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
286 : pub struct TenantConfig {
287 : pub checkpoint_distance: Option<u64>,
288 : pub checkpoint_timeout: Option<String>,
289 : pub compaction_target_size: Option<u64>,
290 : pub compaction_period: Option<String>,
291 : pub compaction_threshold: Option<usize>,
292 : // defer parsing compaction_algorithm, like eviction_policy
293 : pub compaction_algorithm: Option<CompactionAlgorithm>,
294 : pub gc_horizon: Option<u64>,
295 : pub gc_period: Option<String>,
296 : pub image_creation_threshold: Option<usize>,
297 : pub pitr_interval: Option<String>,
298 : pub walreceiver_connect_timeout: Option<String>,
299 : pub lagging_wal_timeout: Option<String>,
300 : pub max_lsn_wal_lag: Option<NonZeroU64>,
301 : pub trace_read_requests: Option<bool>,
302 : pub eviction_policy: Option<EvictionPolicy>,
303 : pub min_resident_size_override: Option<u64>,
304 : pub evictions_low_residence_duration_metric_threshold: Option<String>,
305 : pub heatmap_period: Option<String>,
306 : pub lazy_slru_download: Option<bool>,
307 : pub timeline_get_throttle: Option<ThrottleConfig>,
308 : pub image_layer_creation_check_threshold: Option<u8>,
309 : pub switch_aux_file_policy: Option<AuxFilePolicy>,
310 : }
311 :
312 : /// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
313 : /// tenant config. When the first aux file is written, the policy will be persisted in the
314 : /// `index_part.json` file and has a limited migration path.
315 : ///
316 : /// Currently, we only allow the following migration path:
317 : ///
318 : /// Unset -> V1
319 : /// -> V2
320 : /// -> CrossValidation -> V2
321 24 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
322 : pub enum AuxFilePolicy {
323 : /// V1 aux file policy: store everything in AUX_FILE_KEY
324 : V1,
325 : /// V2 aux file policy: store in the AUX_FILE keyspace
326 : V2,
327 : /// Cross validation runs both formats on the write path and does validation
328 : /// on the read path.
329 : CrossValidation,
330 : }
331 :
332 : impl AuxFilePolicy {
333 48 : pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
334 30 : matches!(
335 48 : (from, to),
336 : (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
337 : )
338 48 : }
339 :
340 : /// If a tenant writes aux files without setting `switch_aux_file_policy`, this value will be used.
341 322 : pub fn default_tenant_config() -> Self {
342 322 : Self::V1
343 322 : }
344 : }
345 :
346 : /// The in-memory aux file policy flag. Users can store an `Option<AuxFilePolicy>` in this atomic flag; 0 == unspecified.
347 : pub struct AtomicAuxFilePolicy(AtomicUsize);
348 :
349 : impl AtomicAuxFilePolicy {
350 352 : pub fn new(policy: Option<AuxFilePolicy>) -> Self {
351 352 : Self(AtomicUsize::new(
352 352 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
353 352 : ))
354 352 : }
355 :
356 518 : pub fn load(&self) -> Option<AuxFilePolicy> {
357 518 : match self.0.load(std::sync::atomic::Ordering::Acquire) {
358 462 : 0 => None,
359 56 : other => Some(AuxFilePolicy::from_usize(other)),
360 : }
361 518 : }
362 :
363 16 : pub fn store(&self, policy: Option<AuxFilePolicy>) {
364 16 : self.0.store(
365 16 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
366 16 : std::sync::atomic::Ordering::Release,
367 16 : );
368 16 : }
369 : }
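A minimal round-trip with the atomic flag, assuming it is used alongside the `AuxFilePolicy` helpers in this module:

```rust
fn demo_atomic_aux_file_policy() {
    let flag = AtomicAuxFilePolicy::new(None);
    assert_eq!(flag.load(), None); // 0 == unspecified
    flag.store(Some(AuxFilePolicy::V2));
    assert_eq!(flag.load(), Some(AuxFilePolicy::V2));
}
```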
370 :
371 : impl AuxFilePolicy {
372 14 : pub fn to_usize(self) -> usize {
373 14 : match self {
374 10 : Self::V1 => 1,
375 2 : Self::CrossValidation => 2,
376 2 : Self::V2 => 3,
377 : }
378 14 : }
379 :
380 56 : pub fn try_from_usize(this: usize) -> Option<Self> {
381 56 : match this {
382 32 : 1 => Some(Self::V1),
383 6 : 2 => Some(Self::CrossValidation),
384 18 : 3 => Some(Self::V2),
385 0 : _ => None,
386 : }
387 56 : }
388 :
389 56 : pub fn from_usize(this: usize) -> Self {
390 56 : Self::try_from_usize(this).unwrap()
391 56 : }
392 : }
393 :
394 : impl FromStr for AuxFilePolicy {
395 : type Err = anyhow::Error;
396 :
397 0 : fn from_str(s: &str) -> Result<Self, Self::Err> {
398 0 : let s = s.to_lowercase();
399 0 : if s == "v1" {
400 0 : Ok(Self::V1)
401 0 : } else if s == "v2" {
402 0 : Ok(Self::V2)
403 0 : } else if s == "crossvalidation" || s == "cross_validation" {
404 0 : Ok(Self::CrossValidation)
405 : } else {
406 0 : anyhow::bail!("cannot parse {} to aux file policy", s)
407 : }
408 0 : }
409 : }
410 :
411 4 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
412 : #[serde(tag = "kind")]
413 : pub enum EvictionPolicy {
414 : NoEviction,
415 : LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
416 : OnlyImitiate(EvictionPolicyLayerAccessThreshold),
417 : }
418 :
419 : impl EvictionPolicy {
420 0 : pub fn discriminant_str(&self) -> &'static str {
421 0 : match self {
422 0 : EvictionPolicy::NoEviction => "NoEviction",
423 0 : EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
424 0 : EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
425 : }
426 0 : }
427 : }
428 :
429 0 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
430 : #[serde(tag = "kind")]
431 : pub enum CompactionAlgorithm {
432 : Legacy,
433 : Tiered,
434 : }
435 :
436 20 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
437 : pub struct EvictionPolicyLayerAccessThreshold {
438 : #[serde(with = "humantime_serde")]
439 : pub period: Duration,
440 : #[serde(with = "humantime_serde")]
441 : pub threshold: Duration,
442 : }
443 :
444 0 : #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
445 : pub struct ThrottleConfig {
446 : pub task_kinds: Vec<String>, // TaskKind
447 : pub initial: usize,
448 : #[serde(with = "humantime_serde")]
449 : pub refill_interval: Duration,
450 : pub refill_amount: NonZeroUsize,
451 : pub max: usize,
452 : pub fair: bool,
453 : }
454 :
455 : impl ThrottleConfig {
456 308 : pub fn disabled() -> Self {
457 308 : Self {
458 308 : task_kinds: vec![], // effectively disables the throttle
459 308 : // other values don't matter with empty `task_kinds`.
460 308 : initial: 0,
461 308 : refill_interval: Duration::from_millis(1),
462 308 : refill_amount: NonZeroUsize::new(1).unwrap(),
463 308 : max: 1,
464 308 : fair: true,
465 308 : }
466 308 : }
467 : /// The requests per second allowed by the given config.
468 0 : pub fn steady_rps(&self) -> f64 {
469 0 : (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
470 0 : }
471 : }
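A worked example of `steady_rps()`: with the hypothetical config below (the task kind name is made up), 100 tokens refilled every 500 ms sustains 200 requests per second. It relies on the `Duration` and `NonZeroUsize` imports at the top of this file.

```rust
fn demo_steady_rps() {
    let cfg = ThrottleConfig {
        task_kinds: vec!["PageRequestHandler".to_string()], // hypothetical task kind name
        initial: 100,
        refill_interval: Duration::from_millis(500),
        refill_amount: NonZeroUsize::new(100).unwrap(),
        max: 100,
        fair: true,
    };
    // 100 tokens / 0.5 s == 200 requests per second.
    assert_eq!(cfg.steady_rps(), 200.0);
}
```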
472 :
473 : /// A flattened analog of a `pageserver::tenant::LocationMode`, which
474 : /// lists out all possible states (and the virtual "Detached" state)
475 : /// in a flat form rather than using rust-style enums.
476 0 : #[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
477 : pub enum LocationConfigMode {
478 : AttachedSingle,
479 : AttachedMulti,
480 : AttachedStale,
481 : Secondary,
482 : Detached,
483 : }
484 :
485 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
486 : pub struct LocationConfigSecondary {
487 : pub warm: bool,
488 : }
489 :
490 : /// An alternative representation of `pageserver::tenant::LocationConf`,
491 : /// for use in external-facing APIs.
492 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
493 : pub struct LocationConfig {
494 : pub mode: LocationConfigMode,
495 : /// If attaching, in what generation?
496 : #[serde(default)]
497 : pub generation: Option<u32>,
498 :
499 : // If requesting mode `Secondary`, configuration for that.
500 : #[serde(default)]
501 : pub secondary_conf: Option<LocationConfigSecondary>,
502 :
503 : // Shard parameters: if shard_count is nonzero, then other shard_* fields
504 : // must be set accurately.
505 : #[serde(default)]
506 : pub shard_number: u8,
507 : #[serde(default)]
508 : pub shard_count: u8,
509 : #[serde(default)]
510 : pub shard_stripe_size: u32,
511 :
512 : // This configuration only affects attached mode, but should be provided irrespective
513 : // of the mode, as a secondary location might transition on startup if the response
514 : // to the `/re-attach` control plane API requests it.
515 : pub tenant_conf: TenantConfig,
516 : }
517 :
518 0 : #[derive(Serialize, Deserialize)]
519 : pub struct LocationConfigListResponse {
520 : pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
521 : }
522 :
523 0 : #[derive(Serialize, Deserialize)]
524 : #[serde(transparent)]
525 : pub struct TenantCreateResponse(pub TenantId);
526 :
527 : #[derive(Serialize)]
528 : pub struct StatusResponse {
529 : pub id: NodeId,
530 : }
531 :
532 0 : #[derive(Serialize, Deserialize, Debug)]
533 : #[serde(deny_unknown_fields)]
534 : pub struct TenantLocationConfigRequest {
535 : #[serde(flatten)]
536 : pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
537 : }
538 :
539 0 : #[derive(Serialize, Deserialize, Debug)]
540 : #[serde(deny_unknown_fields)]
541 : pub struct TenantTimeTravelRequest {
542 : pub shard_counts: Vec<ShardCount>,
543 : }
544 :
545 0 : #[derive(Serialize, Deserialize, Debug)]
546 : #[serde(deny_unknown_fields)]
547 : pub struct TenantShardLocation {
548 : pub shard_id: TenantShardId,
549 : pub node_id: NodeId,
550 : }
551 :
552 0 : #[derive(Serialize, Deserialize, Debug)]
553 : #[serde(deny_unknown_fields)]
554 : pub struct TenantLocationConfigResponse {
555 : pub shards: Vec<TenantShardLocation>,
556 : // If the shard count is >1, stripe_size will be set.
557 : pub stripe_size: Option<ShardStripeSize>,
558 : }
559 :
560 6 : #[derive(Serialize, Deserialize, Debug)]
561 : #[serde(deny_unknown_fields)]
562 : pub struct TenantConfigRequest {
563 : pub tenant_id: TenantId,
564 : #[serde(flatten)]
565 : pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
566 : }
567 :
568 : impl std::ops::Deref for TenantConfigRequest {
569 : type Target = TenantConfig;
570 :
571 0 : fn deref(&self) -> &Self::Target {
572 0 : &self.config
573 0 : }
574 : }
575 :
576 : impl TenantConfigRequest {
577 0 : pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
578 0 : let config = TenantConfig::default();
579 0 : TenantConfigRequest { tenant_id, config }
580 0 : }
581 : }
582 :
583 6 : #[derive(Debug, Deserialize)]
584 : pub struct TenantAttachRequest {
585 : #[serde(default)]
586 : pub config: TenantAttachConfig,
587 : #[serde(default)]
588 : pub generation: Option<u32>,
589 : }
590 :
591 : /// Newtype to enforce deny_unknown_fields on TenantConfig for
592 : /// its usage inside `TenantAttachRequest`.
593 2 : #[derive(Debug, Serialize, Deserialize, Default)]
594 : #[serde(deny_unknown_fields)]
595 : pub struct TenantAttachConfig {
596 : #[serde(flatten)]
597 : allowing_unknown_fields: TenantConfig,
598 : }
599 :
600 : impl std::ops::Deref for TenantAttachConfig {
601 : type Target = TenantConfig;
602 :
603 0 : fn deref(&self) -> &Self::Target {
604 0 : &self.allowing_unknown_fields
605 0 : }
606 : }
607 :
608 : /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
609 0 : #[derive(Serialize, Deserialize, Clone)]
610 : #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
611 : pub enum TenantAttachmentStatus {
612 : Maybe,
613 : Attached,
614 : Failed { reason: String },
615 : }
616 :
617 0 : #[derive(Serialize, Deserialize, Clone)]
618 : pub struct TenantInfo {
619 : pub id: TenantShardId,
620 : // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
621 : pub state: TenantState,
622 : /// Sum of the size of all layer files.
623 : /// If a layer is present in both local FS and S3, it counts only once.
624 : pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
625 : pub attachment_status: TenantAttachmentStatus,
626 : #[serde(skip_serializing_if = "Option::is_none")]
627 : pub generation: Option<u32>,
628 : }
629 :
630 0 : #[derive(Serialize, Deserialize, Clone)]
631 : pub struct TenantDetails {
632 : #[serde(flatten)]
633 : pub tenant_info: TenantInfo,
634 :
635 : pub walredo: Option<WalRedoManagerStatus>,
636 :
637 : pub timelines: Vec<TimelineId>,
638 : }
639 :
640 : /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
641 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
642 : pub struct TimelineInfo {
643 : pub tenant_id: TenantShardId,
644 : pub timeline_id: TimelineId,
645 :
646 : pub ancestor_timeline_id: Option<TimelineId>,
647 : pub ancestor_lsn: Option<Lsn>,
648 : pub last_record_lsn: Lsn,
649 : pub prev_record_lsn: Option<Lsn>,
650 : pub latest_gc_cutoff_lsn: Lsn,
651 : pub disk_consistent_lsn: Lsn,
652 :
653 : /// The LSN that we have successfully uploaded to remote storage
654 : pub remote_consistent_lsn: Lsn,
655 :
656 : /// The LSN that we are advertising to safekeepers
657 : pub remote_consistent_lsn_visible: Lsn,
658 :
659 : /// The LSN from the start of the root timeline (never changes)
660 : pub initdb_lsn: Lsn,
661 :
662 : pub current_logical_size: u64,
663 : pub current_logical_size_is_accurate: bool,
664 :
665 : pub directory_entries_counts: Vec<u64>,
666 :
667 : /// Sum of the size of all layer files.
668 : /// If a layer is present in both local FS and S3, it counts only once.
669 : pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
670 : pub current_logical_size_non_incremental: Option<u64>,
671 :
672 : pub timeline_dir_layer_file_size_sum: Option<u64>,
673 :
674 : pub wal_source_connstr: Option<String>,
675 : pub last_received_msg_lsn: Option<Lsn>,
676 : /// the timestamp (in microseconds) of the last received message
677 : pub last_received_msg_ts: Option<u128>,
678 : pub pg_version: u32,
679 :
680 : pub state: TimelineState,
681 :
682 : pub walreceiver_status: String,
683 :
684 : /// The last aux file policy being used on this timeline
685 : pub last_aux_file_policy: Option<AuxFilePolicy>,
686 : }
687 :
688 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
689 : pub struct LayerMapInfo {
690 : pub in_memory_layers: Vec<InMemoryLayerInfo>,
691 : pub historic_layers: Vec<HistoricLayerInfo>,
692 : }
693 :
694 0 : #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)]
695 : #[repr(usize)]
696 : pub enum LayerAccessKind {
697 : GetValueReconstructData,
698 : Iter,
699 : KeyIter,
700 : Dump,
701 : }
702 :
703 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
704 : pub struct LayerAccessStatFullDetails {
705 : pub when_millis_since_epoch: u64,
706 : pub task_kind: Cow<'static, str>,
707 : pub access_kind: LayerAccessKind,
708 : }
709 :
710 : /// An event that impacts the layer's residence status.
711 : #[serde_as]
712 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
713 : pub struct LayerResidenceEvent {
714 : /// The time when the event occurred.
715 : /// NB: this timestamp is captured while the residence status changes.
716 : /// So, it might be behind/ahead of the actual residence change by a short amount of time.
717 : ///
718 : #[serde(rename = "timestamp_millis_since_epoch")]
719 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
720 : pub timestamp: SystemTime,
721 : /// The new residence status of the layer.
722 : pub status: LayerResidenceStatus,
723 : /// The reason why we had to record this event.
724 : pub reason: LayerResidenceEventReason,
725 : }
726 :
727 : /// The reason for recording a given [`LayerResidenceEvent`].
728 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
729 : pub enum LayerResidenceEventReason {
730 : /// The layer map is being populated, e.g. during timeline load or attach.
731 : /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
732 : /// We need to record such events because there is no persistent storage for the events.
733 : ///
734 : // https://github.com/rust-lang/rust/issues/74481
735 : /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
736 : /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
737 : LayerLoad,
738 : /// We just created the layer (e.g., freeze_and_flush or compaction).
739 : /// Such layers are always [`LayerResidenceStatus::Resident`].
740 : LayerCreate,
741 : /// We on-demand downloaded or evicted the given layer.
742 : ResidenceChange,
743 : }
744 :
745 : /// The residence status of the layer, after the given [`LayerResidenceEvent`].
746 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
747 : pub enum LayerResidenceStatus {
748 : /// Residence status for a layer file that exists locally.
749 : /// It may also exist on the remote, we don't care here.
750 : Resident,
751 : /// Residence status for a layer file that only exists on the remote.
752 : Evicted,
753 : }
754 :
755 : impl LayerResidenceEvent {
756 2684 : pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self {
757 2684 : Self {
758 2684 : status,
759 2684 : reason,
760 2684 : timestamp: SystemTime::now(),
761 2684 : }
762 2684 : }
763 : }
764 :
765 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
766 : pub struct LayerAccessStats {
767 : pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
768 : pub task_kind_access_flag: Vec<Cow<'static, str>>,
769 : pub first: Option<LayerAccessStatFullDetails>,
770 : pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
771 : pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
772 : }
773 :
774 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
775 : #[serde(tag = "kind")]
776 : pub enum InMemoryLayerInfo {
777 : Open { lsn_start: Lsn },
778 : Frozen { lsn_start: Lsn, lsn_end: Lsn },
779 : }
780 :
781 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
782 : #[serde(tag = "kind")]
783 : pub enum HistoricLayerInfo {
784 : Delta {
785 : layer_file_name: String,
786 : layer_file_size: u64,
787 :
788 : lsn_start: Lsn,
789 : lsn_end: Lsn,
790 : remote: bool,
791 : access_stats: LayerAccessStats,
792 :
793 : l0: bool,
794 : },
795 : Image {
796 : layer_file_name: String,
797 : layer_file_size: u64,
798 :
799 : lsn_start: Lsn,
800 : remote: bool,
801 : access_stats: LayerAccessStats,
802 : },
803 : }
804 :
805 : impl HistoricLayerInfo {
806 0 : pub fn layer_file_name(&self) -> &str {
807 0 : match self {
808 : HistoricLayerInfo::Delta {
809 0 : layer_file_name, ..
810 0 : } => layer_file_name,
811 : HistoricLayerInfo::Image {
812 0 : layer_file_name, ..
813 0 : } => layer_file_name,
814 : }
815 0 : }
816 0 : pub fn is_remote(&self) -> bool {
817 0 : match self {
818 0 : HistoricLayerInfo::Delta { remote, .. } => *remote,
819 0 : HistoricLayerInfo::Image { remote, .. } => *remote,
820 : }
821 0 : }
822 0 : pub fn set_remote(&mut self, value: bool) {
823 0 : let field = match self {
824 0 : HistoricLayerInfo::Delta { remote, .. } => remote,
825 0 : HistoricLayerInfo::Image { remote, .. } => remote,
826 : };
827 0 : *field = value;
828 0 : }
829 0 : pub fn layer_file_size(&self) -> u64 {
830 0 : match self {
831 : HistoricLayerInfo::Delta {
832 0 : layer_file_size, ..
833 0 : } => *layer_file_size,
834 : HistoricLayerInfo::Image {
835 0 : layer_file_size, ..
836 0 : } => *layer_file_size,
837 : }
838 0 : }
839 : }
840 :
841 0 : #[derive(Debug, Serialize, Deserialize)]
842 : pub struct DownloadRemoteLayersTaskSpawnRequest {
843 : pub max_concurrent_downloads: NonZeroUsize,
844 : }
845 :
846 0 : #[derive(Debug, Serialize, Deserialize)]
847 : pub struct IngestAuxFilesRequest {
848 : pub aux_files: HashMap<String, String>,
849 : }
850 :
851 0 : #[derive(Debug, Serialize, Deserialize)]
852 : pub struct ListAuxFilesRequest {
853 : pub lsn: Lsn,
854 : }
855 :
856 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
857 : pub struct DownloadRemoteLayersTaskInfo {
858 : pub task_id: String,
859 : pub state: DownloadRemoteLayersTaskState,
860 : pub total_layer_count: u64, // stable once `completed`
861 : pub successful_download_count: u64, // stable once `completed`
862 : pub failed_download_count: u64, // stable once `completed`
863 : }
864 :
865 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
866 : pub enum DownloadRemoteLayersTaskState {
867 : Running,
868 : Completed,
869 : ShutDown,
870 : }
871 :
872 0 : #[derive(Debug, Serialize, Deserialize)]
873 : pub struct TimelineGcRequest {
874 : pub gc_horizon: Option<u64>,
875 : }
876 :
877 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
878 : pub struct WalRedoManagerProcessStatus {
879 : pub pid: u32,
880 : }
881 :
882 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
883 : pub struct WalRedoManagerStatus {
884 : pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
885 : pub process: Option<WalRedoManagerProcessStatus>,
886 : }
887 :
888 : /// The progress of a secondary tenant is mostly useful when doing a long-running download: e.g. initiating
889 : /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
890 : /// what's happening.
891 0 : #[derive(Default, Debug, Serialize, Deserialize, Clone)]
892 : pub struct SecondaryProgress {
893 : /// The remote storage LastModified time of the heatmap object we last downloaded.
894 : pub heatmap_mtime: Option<serde_system_time::SystemTime>,
895 :
896 : /// The number of layers currently on-disk
897 : pub layers_downloaded: usize,
898 : /// The number of layers in the most recently seen heatmap
899 : pub layers_total: usize,
900 :
901 : /// The number of layer bytes currently on-disk
902 : pub bytes_downloaded: u64,
903 : /// The number of layer bytes in the most recently seen heatmap
904 : pub bytes_total: u64,
905 : }
906 :
907 0 : #[derive(Serialize, Deserialize, Debug)]
908 : pub struct TenantScanRemoteStorageShard {
909 : pub tenant_shard_id: TenantShardId,
910 : pub generation: Option<u32>,
911 : }
912 :
913 0 : #[derive(Serialize, Deserialize, Debug, Default)]
914 : pub struct TenantScanRemoteStorageResponse {
915 : pub shards: Vec<TenantScanRemoteStorageShard>,
916 : }
917 :
918 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
919 : #[serde(rename_all = "snake_case")]
920 : pub enum TenantSorting {
921 : ResidentSize,
922 : MaxLogicalSize,
923 : }
924 :
925 : impl Default for TenantSorting {
926 0 : fn default() -> Self {
927 0 : Self::ResidentSize
928 0 : }
929 : }
930 :
931 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
932 : pub struct TopTenantShardsRequest {
933 : // How would you like to sort the tenants?
934 : pub order_by: TenantSorting,
935 :
936 : // How many results?
937 : pub limit: usize,
938 :
939 : // Omit tenants with more than this many shards (e.g. if this is the max number of shards
940 : // that the caller would ever split to)
941 : pub where_shards_lt: Option<ShardCount>,
942 :
943 : // Omit tenants where the ordering metric is less than this (this is an optimization to
944 : // let us quickly exclude numerous tiny shards)
945 : pub where_gt: Option<u64>,
946 : }
947 :
948 0 : #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
949 : pub struct TopTenantShardItem {
950 : pub id: TenantShardId,
951 :
952 : /// Total size of layers on local disk for all timelines in this tenant
953 : pub resident_size: u64,
954 :
955 : /// Total size of layers in remote storage for all timelines in this tenant
956 : pub physical_size: u64,
957 :
958 : /// The largest logical size of a timeline within this tenant
959 : pub max_logical_size: u64,
960 : }
961 :
962 0 : #[derive(Serialize, Deserialize, Debug, Default)]
963 : pub struct TopTenantShardsResponse {
964 : pub shards: Vec<TopTenantShardItem>,
965 : }
966 :
967 : pub mod virtual_file {
968 : #[derive(
969 : Copy,
970 : Clone,
971 : PartialEq,
972 : Eq,
973 : Hash,
974 300 : strum_macros::EnumString,
975 0 : strum_macros::Display,
976 0 : serde_with::DeserializeFromStr,
977 : serde_with::SerializeDisplay,
978 : Debug,
979 : )]
980 : #[strum(serialize_all = "kebab-case")]
981 : pub enum IoEngineKind {
982 : StdFs,
983 : #[cfg(target_os = "linux")]
984 : TokioEpollUring,
985 : }
986 : }
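Since the enum uses strum's kebab-case serialization, variant names round-trip as lowercase hyphenated strings. A small sketch, assuming strum's default kebab-casing of the variant names:

```rust
use std::str::FromStr;

use pageserver_api::models::virtual_file::IoEngineKind;

fn demo_io_engine_kind() {
    // strum's kebab-case renders StdFs as "std-fs".
    assert_eq!(IoEngineKind::StdFs.to_string(), "std-fs");
    assert_eq!(IoEngineKind::from_str("std-fs").unwrap(), IoEngineKind::StdFs);
    #[cfg(target_os = "linux")]
    assert_eq!(
        IoEngineKind::from_str("tokio-epoll-uring").unwrap(),
        IoEngineKind::TokioEpollUring
    );
}
```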
987 :
988 : // Wrapped in libpq CopyData
989 : #[derive(PartialEq, Eq, Debug)]
990 : pub enum PagestreamFeMessage {
991 : Exists(PagestreamExistsRequest),
992 : Nblocks(PagestreamNblocksRequest),
993 : GetPage(PagestreamGetPageRequest),
994 : DbSize(PagestreamDbSizeRequest),
995 : GetSlruSegment(PagestreamGetSlruSegmentRequest),
996 : }
997 :
998 : // Wrapped in libpq CopyData
999 0 : #[derive(strum_macros::EnumProperty)]
1000 : pub enum PagestreamBeMessage {
1001 : Exists(PagestreamExistsResponse),
1002 : Nblocks(PagestreamNblocksResponse),
1003 : GetPage(PagestreamGetPageResponse),
1004 : Error(PagestreamErrorResponse),
1005 : DbSize(PagestreamDbSizeResponse),
1006 : GetSlruSegment(PagestreamGetSlruSegmentResponse),
1007 : }
1008 :
1009 : // Keep in sync with `pagestore_client.h`
1010 : #[repr(u8)]
1011 : enum PagestreamBeMessageTag {
1012 : Exists = 100,
1013 : Nblocks = 101,
1014 : GetPage = 102,
1015 : Error = 103,
1016 : DbSize = 104,
1017 : GetSlruSegment = 105,
1018 : }
1019 : impl TryFrom<u8> for PagestreamBeMessageTag {
1020 : type Error = u8;
1021 0 : fn try_from(value: u8) -> Result<Self, u8> {
1022 0 : match value {
1023 0 : 100 => Ok(PagestreamBeMessageTag::Exists),
1024 0 : 101 => Ok(PagestreamBeMessageTag::Nblocks),
1025 0 : 102 => Ok(PagestreamBeMessageTag::GetPage),
1026 0 : 103 => Ok(PagestreamBeMessageTag::Error),
1027 0 : 104 => Ok(PagestreamBeMessageTag::DbSize),
1028 0 : 105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
1029 0 : _ => Err(value),
1030 : }
1031 0 : }
1032 : }
1033 :
1034 : // In the V2 protocol version, a GetPage request contains two LSN values:
1035 : //
1036 : // request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
1037 : // "get the latest version present". It's used by the primary server, which knows that no one else
1038 : // is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
1039 : // Lsn::Max. Standby servers use the current replay LSN as the request LSN.
1040 : //
1041 : // not_modified_since: Hint to the pageserver that the client knows that the page has not been
1042 : // modified between 'not_modified_since' and the request LSN. It's always correct to set
1043 : // 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
1044 : // passing an earlier LSN can speed up the request, by allowing the pageserver to process the
1045 : // request without waiting for 'request_lsn' to arrive.
1046 : //
1047 : // The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
1048 : // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
1049 : // 'latest' was set to true. The V2 interface was added because there was no correct way for a
1050 : // standby to request a page at a particular non-latest LSN, and also include the
1051 : // 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
1052 : // request, if the standby knows that the page hasn't been modified since, and risk getting an error
1053 : // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
1054 : // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
1055 : // interface allows sending both LSNs, and let the pageserver do the right thing. There is no
1056 : // difference in the responses between V1 and V2.
1057 : //
1058 : // The Request structs below reflect the V2 interface. If V1 is used, the parse function
1059 : // maps the old format requests to the new format.
1060 : //
1061 : #[derive(Clone, Copy)]
1062 : pub enum PagestreamProtocolVersion {
1063 : V1,
1064 : V2,
1065 : }
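To illustrate the V2 rules above using the request structs defined below: a primary uses `Lsn::MAX` as the request LSN with a real `not_modified_since` hint, while a standby pins the request to its replay LSN. A sketch with hypothetical relation identifiers and block number:

```rust
fn sample_rel() -> RelTag {
    // Hypothetical relation identifiers.
    RelTag {
        spcnode: 1663,
        dbnode: 16384,
        relnode: 16385,
        forknum: 0,
    }
}

fn demo_getpage_requests(last_modified: Lsn, replay_lsn: Lsn) -> Vec<PagestreamFeMessage> {
    vec![
        // Primary: "give me the latest version", but still pass a real
        // not_modified_since so the pageserver does not have to wait for WAL.
        PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
            request_lsn: Lsn::MAX,
            not_modified_since: last_modified,
            rel: sample_rel(),
            blkno: 0,
        }),
        // Standby: pin the read to the current replay LSN; the hint may be older.
        PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
            request_lsn: replay_lsn,
            not_modified_since: last_modified,
            rel: sample_rel(),
            blkno: 0,
        }),
    ]
}
```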
1066 :
1067 : #[derive(Debug, PartialEq, Eq)]
1068 : pub struct PagestreamExistsRequest {
1069 : pub request_lsn: Lsn,
1070 : pub not_modified_since: Lsn,
1071 : pub rel: RelTag,
1072 : }
1073 :
1074 : #[derive(Debug, PartialEq, Eq)]
1075 : pub struct PagestreamNblocksRequest {
1076 : pub request_lsn: Lsn,
1077 : pub not_modified_since: Lsn,
1078 : pub rel: RelTag,
1079 : }
1080 :
1081 : #[derive(Debug, PartialEq, Eq)]
1082 : pub struct PagestreamGetPageRequest {
1083 : pub request_lsn: Lsn,
1084 : pub not_modified_since: Lsn,
1085 : pub rel: RelTag,
1086 : pub blkno: u32,
1087 : }
1088 :
1089 : #[derive(Debug, PartialEq, Eq)]
1090 : pub struct PagestreamDbSizeRequest {
1091 : pub request_lsn: Lsn,
1092 : pub not_modified_since: Lsn,
1093 : pub dbnode: u32,
1094 : }
1095 :
1096 : #[derive(Debug, PartialEq, Eq)]
1097 : pub struct PagestreamGetSlruSegmentRequest {
1098 : pub request_lsn: Lsn,
1099 : pub not_modified_since: Lsn,
1100 : pub kind: u8,
1101 : pub segno: u32,
1102 : }
1103 :
1104 : #[derive(Debug)]
1105 : pub struct PagestreamExistsResponse {
1106 : pub exists: bool,
1107 : }
1108 :
1109 : #[derive(Debug)]
1110 : pub struct PagestreamNblocksResponse {
1111 : pub n_blocks: u32,
1112 : }
1113 :
1114 : #[derive(Debug)]
1115 : pub struct PagestreamGetPageResponse {
1116 : pub page: Bytes,
1117 : }
1118 :
1119 : #[derive(Debug)]
1120 : pub struct PagestreamGetSlruSegmentResponse {
1121 : pub segment: Bytes,
1122 : }
1123 :
1124 : #[derive(Debug)]
1125 : pub struct PagestreamErrorResponse {
1126 : pub message: String,
1127 : }
1128 :
1129 : #[derive(Debug)]
1130 : pub struct PagestreamDbSizeResponse {
1131 : pub db_size: i64,
1132 : }
1133 :
1134 : // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
1135 : // that require pageserver-internal types. It is sufficient to get the total size.
1136 0 : #[derive(Serialize, Deserialize, Debug)]
1137 : pub struct TenantHistorySize {
1138 : pub id: TenantId,
1139 : /// Size is a mixture of WAL and logical size, so the unit is bytes.
1140 : ///
1141 : /// Will be none if `?inputs_only=true` was given.
1142 : pub size: Option<u64>,
1143 : }
1144 :
1145 : impl PagestreamFeMessage {
1146 : /// Serialize a compute -> pageserver message. This is currently only used in testing
1147 : /// tools. Always uses protocol version 2.
1148 8 : pub fn serialize(&self) -> Bytes {
1149 8 : let mut bytes = BytesMut::new();
1150 8 :
1151 8 : match self {
1152 2 : Self::Exists(req) => {
1153 2 : bytes.put_u8(0);
1154 2 : bytes.put_u64(req.request_lsn.0);
1155 2 : bytes.put_u64(req.not_modified_since.0);
1156 2 : bytes.put_u32(req.rel.spcnode);
1157 2 : bytes.put_u32(req.rel.dbnode);
1158 2 : bytes.put_u32(req.rel.relnode);
1159 2 : bytes.put_u8(req.rel.forknum);
1160 2 : }
1161 :
1162 2 : Self::Nblocks(req) => {
1163 2 : bytes.put_u8(1);
1164 2 : bytes.put_u64(req.request_lsn.0);
1165 2 : bytes.put_u64(req.not_modified_since.0);
1166 2 : bytes.put_u32(req.rel.spcnode);
1167 2 : bytes.put_u32(req.rel.dbnode);
1168 2 : bytes.put_u32(req.rel.relnode);
1169 2 : bytes.put_u8(req.rel.forknum);
1170 2 : }
1171 :
1172 2 : Self::GetPage(req) => {
1173 2 : bytes.put_u8(2);
1174 2 : bytes.put_u64(req.request_lsn.0);
1175 2 : bytes.put_u64(req.not_modified_since.0);
1176 2 : bytes.put_u32(req.rel.spcnode);
1177 2 : bytes.put_u32(req.rel.dbnode);
1178 2 : bytes.put_u32(req.rel.relnode);
1179 2 : bytes.put_u8(req.rel.forknum);
1180 2 : bytes.put_u32(req.blkno);
1181 2 : }
1182 :
1183 2 : Self::DbSize(req) => {
1184 2 : bytes.put_u8(3);
1185 2 : bytes.put_u64(req.request_lsn.0);
1186 2 : bytes.put_u64(req.not_modified_since.0);
1187 2 : bytes.put_u32(req.dbnode);
1188 2 : }
1189 :
1190 0 : Self::GetSlruSegment(req) => {
1191 0 : bytes.put_u8(4);
1192 0 : bytes.put_u64(req.request_lsn.0);
1193 0 : bytes.put_u64(req.not_modified_since.0);
1194 0 : bytes.put_u8(req.kind);
1195 0 : bytes.put_u32(req.segno);
1196 0 : }
1197 : }
1198 :
1199 8 : bytes.into()
1200 8 : }
1201 :
1202 8 : pub fn parse<R: std::io::Read>(
1203 8 : body: &mut R,
1204 8 : protocol_version: PagestreamProtocolVersion,
1205 8 : ) -> anyhow::Result<PagestreamFeMessage> {
1206 : // these correspond to the NeonMessageTag enum in pagestore_client.h
1207 : //
1208 : // TODO: consider using protobuf or serde bincode for less error prone
1209 : // serialization.
1210 8 : let msg_tag = body.read_u8()?;
1211 :
1212 8 : let (request_lsn, not_modified_since) = match protocol_version {
1213 : PagestreamProtocolVersion::V2 => (
1214 8 : Lsn::from(body.read_u64::<BigEndian>()?),
1215 8 : Lsn::from(body.read_u64::<BigEndian>()?),
1216 : ),
1217 : PagestreamProtocolVersion::V1 => {
1218 : // In the old protocol, each message starts with a boolean 'latest' flag,
1219 : // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
1220 : // 'not_modified_since', used in the new protocol version.
1221 0 : let latest = body.read_u8()? != 0;
1222 0 : let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
1223 0 : if latest {
1224 0 : (Lsn::MAX, request_lsn) // get latest version
1225 : } else {
1226 0 : (request_lsn, request_lsn) // get version at specified LSN
1227 : }
1228 : }
1229 : };
1230 :
1231 : // The rest of the messages are the same between V1 and V2
1232 8 : match msg_tag {
1233 : 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
1234 2 : request_lsn,
1235 2 : not_modified_since,
1236 2 : rel: RelTag {
1237 2 : spcnode: body.read_u32::<BigEndian>()?,
1238 2 : dbnode: body.read_u32::<BigEndian>()?,
1239 2 : relnode: body.read_u32::<BigEndian>()?,
1240 2 : forknum: body.read_u8()?,
1241 : },
1242 : })),
1243 : 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1244 2 : request_lsn,
1245 2 : not_modified_since,
1246 2 : rel: RelTag {
1247 2 : spcnode: body.read_u32::<BigEndian>()?,
1248 2 : dbnode: body.read_u32::<BigEndian>()?,
1249 2 : relnode: body.read_u32::<BigEndian>()?,
1250 2 : forknum: body.read_u8()?,
1251 : },
1252 : })),
1253 : 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1254 2 : request_lsn,
1255 2 : not_modified_since,
1256 2 : rel: RelTag {
1257 2 : spcnode: body.read_u32::<BigEndian>()?,
1258 2 : dbnode: body.read_u32::<BigEndian>()?,
1259 2 : relnode: body.read_u32::<BigEndian>()?,
1260 2 : forknum: body.read_u8()?,
1261 : },
1262 2 : blkno: body.read_u32::<BigEndian>()?,
1263 : })),
1264 : 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1265 2 : request_lsn,
1266 2 : not_modified_since,
1267 2 : dbnode: body.read_u32::<BigEndian>()?,
1268 : })),
1269 : 4 => Ok(PagestreamFeMessage::GetSlruSegment(
1270 : PagestreamGetSlruSegmentRequest {
1271 0 : request_lsn,
1272 0 : not_modified_since,
1273 0 : kind: body.read_u8()?,
1274 0 : segno: body.read_u32::<BigEndian>()?,
1275 : },
1276 : )),
1277 0 : _ => bail!("unknown smgr message tag: {:?}", msg_tag),
1278 : }
1279 8 : }
1280 : }
1281 :
1282 : impl PagestreamBeMessage {
1283 0 : pub fn serialize(&self) -> Bytes {
1284 0 : let mut bytes = BytesMut::new();
1285 0 :
1286 0 : use PagestreamBeMessageTag as Tag;
1287 0 : match self {
1288 0 : Self::Exists(resp) => {
1289 0 : bytes.put_u8(Tag::Exists as u8);
1290 0 : bytes.put_u8(resp.exists as u8);
1291 0 : }
1292 :
1293 0 : Self::Nblocks(resp) => {
1294 0 : bytes.put_u8(Tag::Nblocks as u8);
1295 0 : bytes.put_u32(resp.n_blocks);
1296 0 : }
1297 :
1298 0 : Self::GetPage(resp) => {
1299 0 : bytes.put_u8(Tag::GetPage as u8);
1300 0 : bytes.put(&resp.page[..]);
1301 0 : }
1302 :
1303 0 : Self::Error(resp) => {
1304 0 : bytes.put_u8(Tag::Error as u8);
1305 0 : bytes.put(resp.message.as_bytes());
1306 0 : bytes.put_u8(0); // null terminator
1307 0 : }
1308 0 : Self::DbSize(resp) => {
1309 0 : bytes.put_u8(Tag::DbSize as u8);
1310 0 : bytes.put_i64(resp.db_size);
1311 0 : }
1312 :
1313 0 : Self::GetSlruSegment(resp) => {
1314 0 : bytes.put_u8(Tag::GetSlruSegment as u8);
1315 0 : bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
1316 0 : bytes.put(&resp.segment[..]);
1317 0 : }
1318 : }
1319 :
1320 0 : bytes.into()
1321 0 : }
1322 :
1323 0 : pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
1324 0 : let mut buf = buf.reader();
1325 0 : let msg_tag = buf.read_u8()?;
1326 :
1327 : use PagestreamBeMessageTag as Tag;
1328 0 : let ok =
1329 0 : match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
1330 : Tag::Exists => {
1331 0 : let exists = buf.read_u8()?;
1332 0 : Self::Exists(PagestreamExistsResponse {
1333 0 : exists: exists != 0,
1334 0 : })
1335 : }
1336 : Tag::Nblocks => {
1337 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1338 0 : Self::Nblocks(PagestreamNblocksResponse { n_blocks })
1339 : }
1340 : Tag::GetPage => {
1341 0 : let mut page = vec![0; 8192]; // TODO: use MaybeUninit
1342 0 : buf.read_exact(&mut page)?;
1343 0 : PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
1344 : }
1345 : Tag::Error => {
1346 0 : let mut msg = Vec::new();
1347 0 : buf.read_until(0, &mut msg)?;
1348 0 : let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
1349 0 : let rust_str = cstring.to_str()?;
1350 0 : PagestreamBeMessage::Error(PagestreamErrorResponse {
1351 0 : message: rust_str.to_owned(),
1352 0 : })
1353 : }
1354 : Tag::DbSize => {
1355 0 : let db_size = buf.read_i64::<BigEndian>()?;
1356 0 : Self::DbSize(PagestreamDbSizeResponse { db_size })
1357 : }
1358 : Tag::GetSlruSegment => {
1359 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1360 0 : let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
1361 0 : buf.read_exact(&mut segment)?;
1362 0 : Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
1363 0 : segment: segment.into(),
1364 0 : })
1365 : }
1366 : };
1367 0 : let remaining = buf.into_inner();
1368 0 : if !remaining.is_empty() {
1369 0 : anyhow::bail!(
1370 0 : "remaining bytes in msg with tag={msg_tag}: {}",
1371 0 : remaining.len()
1372 0 : );
1373 0 : }
1374 0 : Ok(ok)
1375 0 : }
1376 :
1377 0 : pub fn kind(&self) -> &'static str {
1378 0 : match self {
1379 0 : Self::Exists(_) => "Exists",
1380 0 : Self::Nblocks(_) => "Nblocks",
1381 0 : Self::GetPage(_) => "GetPage",
1382 0 : Self::Error(_) => "Error",
1383 0 : Self::DbSize(_) => "DbSize",
1384 0 : Self::GetSlruSegment(_) => "GetSlruSegment",
1385 : }
1386 0 : }
1387 : }
1388 :
1389 : #[cfg(test)]
1390 : mod tests {
1391 : use serde_json::json;
1392 :
1393 : use super::*;
1394 :
1395 : #[test]
1396 2 : fn test_pagestream() {
1397 2 : // Test serialization/deserialization of PagestreamFeMessage
1398 2 : let messages = vec![
1399 2 : PagestreamFeMessage::Exists(PagestreamExistsRequest {
1400 2 : request_lsn: Lsn(4),
1401 2 : not_modified_since: Lsn(3),
1402 2 : rel: RelTag {
1403 2 : forknum: 1,
1404 2 : spcnode: 2,
1405 2 : dbnode: 3,
1406 2 : relnode: 4,
1407 2 : },
1408 2 : }),
1409 2 : PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1410 2 : request_lsn: Lsn(4),
1411 2 : not_modified_since: Lsn(4),
1412 2 : rel: RelTag {
1413 2 : forknum: 1,
1414 2 : spcnode: 2,
1415 2 : dbnode: 3,
1416 2 : relnode: 4,
1417 2 : },
1418 2 : }),
1419 2 : PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1420 2 : request_lsn: Lsn(4),
1421 2 : not_modified_since: Lsn(3),
1422 2 : rel: RelTag {
1423 2 : forknum: 1,
1424 2 : spcnode: 2,
1425 2 : dbnode: 3,
1426 2 : relnode: 4,
1427 2 : },
1428 2 : blkno: 7,
1429 2 : }),
1430 2 : PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1431 2 : request_lsn: Lsn(4),
1432 2 : not_modified_since: Lsn(3),
1433 2 : dbnode: 7,
1434 2 : }),
1435 2 : ];
1436 10 : for msg in messages {
1437 8 : let bytes = msg.serialize();
1438 8 : let reconstructed =
1439 8 : PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
1440 8 : .unwrap();
1441 8 : assert!(msg == reconstructed);
1442 : }
1443 2 : }
1444 :
1445 : #[test]
1446 2 : fn test_tenantinfo_serde() {
1447 2 : // Test serialization/deserialization of TenantInfo
1448 2 : let original_active = TenantInfo {
1449 2 : id: TenantShardId::unsharded(TenantId::generate()),
1450 2 : state: TenantState::Active,
1451 2 : current_physical_size: Some(42),
1452 2 : attachment_status: TenantAttachmentStatus::Attached,
1453 2 : generation: None,
1454 2 : };
1455 2 : let expected_active = json!({
1456 2 : "id": original_active.id.to_string(),
1457 2 : "state": {
1458 2 : "slug": "Active",
1459 2 : },
1460 2 : "current_physical_size": 42,
1461 2 : "attachment_status": {
1462 2 : "slug":"attached",
1463 2 : }
1464 2 : });
1465 2 :
1466 2 : let original_broken = TenantInfo {
1467 2 : id: TenantShardId::unsharded(TenantId::generate()),
1468 2 : state: TenantState::Broken {
1469 2 : reason: "reason".into(),
1470 2 : backtrace: "backtrace info".into(),
1471 2 : },
1472 2 : current_physical_size: Some(42),
1473 2 : attachment_status: TenantAttachmentStatus::Attached,
1474 2 : generation: None,
1475 2 : };
1476 2 : let expected_broken = json!({
1477 2 : "id": original_broken.id.to_string(),
1478 2 : "state": {
1479 2 : "slug": "Broken",
1480 2 : "data": {
1481 2 : "backtrace": "backtrace info",
1482 2 : "reason": "reason",
1483 2 : }
1484 2 : },
1485 2 : "current_physical_size": 42,
1486 2 : "attachment_status": {
1487 2 : "slug":"attached",
1488 2 : }
1489 2 : });
1490 2 :
1491 2 : assert_eq!(
1492 2 : serde_json::to_value(&original_active).unwrap(),
1493 2 : expected_active
1494 2 : );
1495 :
1496 2 : assert_eq!(
1497 2 : serde_json::to_value(&original_broken).unwrap(),
1498 2 : expected_broken
1499 2 : );
1500 2 : assert!(format!("{:?}", &original_broken.state).contains("reason"));
1501 2 : assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
1502 2 : }
1503 :
1504 : #[test]
1505 2 : fn test_reject_unknown_field() {
1506 2 : let id = TenantId::generate();
1507 2 : let create_request = json!({
1508 2 : "new_tenant_id": id.to_string(),
1509 2 : "unknown_field": "unknown_value".to_string(),
1510 2 : });
1511 2 : let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
1512 2 : assert!(
1513 2 : err.to_string().contains("unknown field `unknown_field`"),
1514 0 : "expect unknown field `unknown_field` error, got: {}",
1515 : err
1516 : );
1517 :
1518 2 : let id = TenantId::generate();
1519 2 : let config_request = json!({
1520 2 : "tenant_id": id.to_string(),
1521 2 : "unknown_field": "unknown_value".to_string(),
1522 2 : });
1523 2 : let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
1524 2 : assert!(
1525 2 : err.to_string().contains("unknown field `unknown_field`"),
1526 0 : "expect unknown field `unknown_field` error, got: {}",
1527 : err
1528 : );
1529 :
1530 2 : let attach_request = json!({
1531 2 : "config": {
1532 2 : "unknown_field": "unknown_value".to_string(),
1533 2 : },
1534 2 : });
1535 2 : let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
1536 2 : assert!(
1537 2 : err.to_string().contains("unknown field `unknown_field`"),
1538 0 : "expect unknown field `unknown_field` error, got: {}",
1539 : err
1540 : );
1541 2 : }
1542 :
1543 : #[test]
1544 2 : fn tenantstatus_activating_serde() {
1545 2 : let states = [
1546 2 : TenantState::Activating(ActivatingFrom::Loading),
1547 2 : TenantState::Activating(ActivatingFrom::Attaching),
1548 2 : ];
1549 2 : let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
1550 2 :
1551 2 : let actual = serde_json::to_string(&states).unwrap();
1552 2 :
1553 2 : assert_eq!(actual, expected);
1554 :
1555 2 : let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
1556 2 :
1557 2 : assert_eq!(states.as_slice(), &parsed);
1558 2 : }
1559 :
1560 : #[test]
1561 2 : fn tenantstatus_activating_strum() {
1562 2 : // tests added, because we use these for metrics
1563 2 : let examples = [
1564 2 : (line!(), TenantState::Loading, "Loading"),
1565 2 : (line!(), TenantState::Attaching, "Attaching"),
1566 2 : (
1567 2 : line!(),
1568 2 : TenantState::Activating(ActivatingFrom::Loading),
1569 2 : "Activating",
1570 2 : ),
1571 2 : (
1572 2 : line!(),
1573 2 : TenantState::Activating(ActivatingFrom::Attaching),
1574 2 : "Activating",
1575 2 : ),
1576 2 : (line!(), TenantState::Active, "Active"),
1577 2 : (
1578 2 : line!(),
1579 2 : TenantState::Stopping {
1580 2 : progress: utils::completion::Barrier::default(),
1581 2 : },
1582 2 : "Stopping",
1583 2 : ),
1584 2 : (
1585 2 : line!(),
1586 2 : TenantState::Broken {
1587 2 : reason: "Example".into(),
1588 2 : backtrace: "Looooong backtrace".into(),
1589 2 : },
1590 2 : "Broken",
1591 2 : ),
1592 2 : ];
1593 :
1594 16 : for (line, rendered, expected) in examples {
1595 14 : let actual: &'static str = rendered.into();
1596 14 : assert_eq!(actual, expected, "example on {line}");
1597 : }
1598 2 : }
1599 :
1600 : #[test]
1601 2 : fn test_aux_file_migration_path() {
1602 2 : assert!(AuxFilePolicy::is_valid_migration_path(
1603 2 : None,
1604 2 : AuxFilePolicy::V1
1605 2 : ));
1606 2 : assert!(AuxFilePolicy::is_valid_migration_path(
1607 2 : None,
1608 2 : AuxFilePolicy::V2
1609 2 : ));
1610 2 : assert!(AuxFilePolicy::is_valid_migration_path(
1611 2 : None,
1612 2 : AuxFilePolicy::CrossValidation
1613 2 : ));
1614 : // Self-migration is not a valid migration path, and the caller should handle it by itself.
1615 2 : assert!(!AuxFilePolicy::is_valid_migration_path(
1616 2 : Some(AuxFilePolicy::V1),
1617 2 : AuxFilePolicy::V1
1618 2 : ));
1619 2 : assert!(!AuxFilePolicy::is_valid_migration_path(
1620 2 : Some(AuxFilePolicy::V2),
1621 2 : AuxFilePolicy::V2
1622 2 : ));
1623 2 : assert!(!AuxFilePolicy::is_valid_migration_path(
1624 2 : Some(AuxFilePolicy::CrossValidation),
1625 2 : AuxFilePolicy::CrossValidation
1626 2 : ));
1627 : // Migrations not allowed
1628 2 : assert!(!AuxFilePolicy::is_valid_migration_path(
1629 2 : Some(AuxFilePolicy::CrossValidation),
1630 2 : AuxFilePolicy::V1
1631 2 : ));
1632 2 : assert!(!AuxFilePolicy::is_valid_migration_path(
1633 2 : Some(AuxFilePolicy::V1),
1634 2 : AuxFilePolicy::V2
1635 2 : ));
1636 2 : assert!(!AuxFilePolicy::is_valid_migration_path(
1637 2 : Some(AuxFilePolicy::V2),
1638 2 : AuxFilePolicy::V1
1639 2 : ));
1640 2 : assert!(!AuxFilePolicy::is_valid_migration_path(
1641 2 : Some(AuxFilePolicy::V2),
1642 2 : AuxFilePolicy::CrossValidation
1643 2 : ));
1644 2 : assert!(!AuxFilePolicy::is_valid_migration_path(
1645 2 : Some(AuxFilePolicy::V1),
1646 2 : AuxFilePolicy::CrossValidation
1647 2 : ));
1648 : // Migrations allowed
1649 2 : assert!(AuxFilePolicy::is_valid_migration_path(
1650 2 : Some(AuxFilePolicy::CrossValidation),
1651 2 : AuxFilePolicy::V2
1652 2 : ));
1653 2 : }
1654 : }
|