Line data Source code
1 : pub mod detach_ancestor;
2 : pub mod partitioning;
3 : pub mod utilization;
4 :
5 : pub use utilization::PageserverUtilization;
6 :
7 : use std::{
8 : borrow::Cow,
9 : collections::HashMap,
10 : io::{BufRead, Read},
11 : num::{NonZeroU64, NonZeroUsize},
12 : sync::atomic::AtomicUsize,
13 : time::{Duration, SystemTime},
14 : };
15 :
16 : use byteorder::{BigEndian, ReadBytesExt};
17 : use postgres_ffi::BLCKSZ;
18 : use serde::{Deserialize, Serialize};
19 : use serde_with::serde_as;
20 : use utils::{
21 : completion,
22 : history_buffer::HistoryBufferWithDropCounter,
23 : id::{NodeId, TenantId, TimelineId},
24 : lsn::Lsn,
25 : serde_system_time,
26 : };
27 :
28 : use crate::{
29 : reltag::RelTag,
30 : shard::{ShardCount, ShardStripeSize, TenantShardId},
31 : };
32 : use anyhow::bail;
33 : use bytes::{Buf, BufMut, Bytes, BytesMut};
34 :
35 : /// The state of a tenant in this pageserver.
36 : ///
37 : /// ```mermaid
38 : /// stateDiagram-v2
39 : ///
40 : /// [*] --> Loading: spawn_load()
41 : /// [*] --> Attaching: spawn_attach()
42 : ///
43 : /// Loading --> Activating: activate()
44 : /// Attaching --> Activating: activate()
45 : /// Activating --> Active: infallible
46 : ///
47 : /// Loading --> Broken: load() failure
48 : /// Attaching --> Broken: attach() failure
49 : ///
50 : /// Active --> Stopping: set_stopping(), part of shutdown & detach
51 : /// Stopping --> Broken: late error in remove_tenant_from_memory
52 : ///
53 : /// Broken --> [*]: ignore / detach / shutdown
54 : /// Stopping --> [*]: remove_from_memory complete
55 : ///
56 : /// Active --> Broken: cfg(testing)-only tenant break point
57 : /// ```
58 : #[derive(
59 : Clone,
60 : PartialEq,
61 : Eq,
62 2 : serde::Serialize,
63 12 : serde::Deserialize,
64 0 : strum_macros::Display,
65 : strum_macros::EnumVariantNames,
66 0 : strum_macros::AsRefStr,
67 332 : strum_macros::IntoStaticStr,
68 : )]
69 : #[serde(tag = "slug", content = "data")]
70 : pub enum TenantState {
71 : /// This tenant is being loaded from local disk.
72 : ///
73 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
74 : Loading,
75 : /// This tenant is being attached to the pageserver.
76 : ///
77 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
78 : Attaching,
79 : /// The tenant is transitioning from Loading/Attaching to Active.
80 : ///
81 : /// While in this state, the individual timelines are being activated.
82 : ///
83 : /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
84 : Activating(ActivatingFrom),
85 : /// The tenant has finished activating and is open for business.
86 : ///
87 : /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
88 : Active,
89 : /// The tenant is recognized by pageserver, but it is being detached or the
90 : /// system is being shut down.
91 : ///
92 : /// Transitions out of this state are possible through `set_broken()`.
93 : Stopping {
94 : // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
95 : // otherwise it will not be skipped during deserialization
96 : #[serde(skip)]
97 : progress: completion::Barrier,
98 : },
99 : /// The tenant is recognized by the pageserver, but can no longer be used for
100 : /// any operations.
101 : ///
102 : /// If the tenant fails to load or attach, it will transition to this state
103 : /// and it is guaranteed that no background tasks are running in its name.
104 : ///
105 : /// The other way to transition into this state is from `Stopping` state
106 : /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
107 : /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
108 : Broken { reason: String, backtrace: String },
109 : }
110 :
111 : impl TenantState {
112 0 : pub fn attachment_status(&self) -> TenantAttachmentStatus {
113 : use TenantAttachmentStatus::*;
114 :
115 : // Below TenantState::Activating is used as "transient" or "transparent" state for
116 : // attachment_status determining.
117 0 : match self {
118 : // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
119 : // So, technically, we can return Attached here.
120 : // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
121 : // But, our attach task might still be fetching the remote timelines, etc.
122 : // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
123 0 : Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
124 : // tenant mgr startup distinguishes attaching from loading via marker file.
125 0 : Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
126 : // We only reach Active after successful load / attach.
127 : // So, call atttachment status Attached.
128 0 : Self::Active => Attached,
129 : // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
130 : // However, it also becomes Broken if the regular load fails.
131 : // From Console's perspective there's no practical difference
132 : // because attachment_status is polled by console only during attach operation execution.
133 0 : Self::Broken { reason, .. } => Failed {
134 0 : reason: reason.to_owned(),
135 0 : },
136 : // Why is Stopping a Maybe case? Because, during pageserver shutdown,
137 : // we set the Stopping state irrespective of whether the tenant
138 : // has finished attaching or not.
139 0 : Self::Stopping { .. } => Maybe,
140 : }
141 0 : }
142 :
143 0 : pub fn broken_from_reason(reason: String) -> Self {
144 0 : let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
145 0 : Self::Broken {
146 0 : reason,
147 0 : backtrace: backtrace_str,
148 0 : }
149 0 : }
150 : }
151 :
152 : impl std::fmt::Debug for TenantState {
153 4 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
154 4 : match self {
155 4 : Self::Broken { reason, backtrace } if !reason.is_empty() => {
156 4 : write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
157 : }
158 0 : _ => write!(f, "{self}"),
159 : }
160 4 : }
161 : }
162 :
163 : /// A temporary lease to a specific lsn inside a timeline.
164 : /// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
165 : #[serde_as]
166 0 : #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
167 : pub struct LsnLease {
168 : #[serde_as(as = "SystemTimeAsRfc3339Millis")]
169 : pub valid_until: SystemTime,
170 : }
171 :
172 : serde_with::serde_conv!(
173 : SystemTimeAsRfc3339Millis,
174 : SystemTime,
175 0 : |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
176 0 : |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
177 : );
178 :
179 : impl LsnLease {
180 : /// The default length for an explicit LSN lease request (10 minutes).
181 : pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
182 :
183 : /// The default length for an implicit LSN lease granted during
184 : /// `get_lsn_by_timestamp` request (1 minutes).
185 : pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);
186 :
187 : /// Checks whether the lease is expired.
188 6 : pub fn is_expired(&self, now: &SystemTime) -> bool {
189 6 : now > &self.valid_until
190 6 : }
191 : }
192 :
193 : /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
194 8 : #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
195 : pub enum ActivatingFrom {
196 : /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
197 : Loading,
198 : /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
199 : Attaching,
200 : }
201 :
202 : /// A state of a timeline in pageserver's memory.
203 0 : #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
204 : pub enum TimelineState {
205 : /// The timeline is recognized by the pageserver but is not yet operational.
206 : /// In particular, the walreceiver connection loop is not running for this timeline.
207 : /// It will eventually transition to state Active or Broken.
208 : Loading,
209 : /// The timeline is fully operational.
210 : /// It can be queried, and the walreceiver connection loop is running.
211 : Active,
212 : /// The timeline was previously Loading or Active but is shutting down.
213 : /// It cannot transition back into any other state.
214 : Stopping,
215 : /// The timeline is broken and not operational (previous states: Loading or Active).
216 : Broken { reason: String, backtrace: String },
217 : }
218 :
219 0 : #[derive(Serialize, Deserialize, Clone)]
220 : pub struct TimelineCreateRequest {
221 : pub new_timeline_id: TimelineId,
222 : #[serde(default)]
223 : pub ancestor_timeline_id: Option<TimelineId>,
224 : #[serde(default)]
225 : pub existing_initdb_timeline_id: Option<TimelineId>,
226 : #[serde(default)]
227 : pub ancestor_start_lsn: Option<Lsn>,
228 : pub pg_version: Option<u32>,
229 : }
230 :
231 0 : #[derive(Serialize, Deserialize)]
232 : pub struct TenantShardSplitRequest {
233 : pub new_shard_count: u8,
234 :
235 : // A tenant's stripe size is only meaningful the first time their shard count goes
236 : // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
237 : //
238 : // If this is set while the stripe count is being increased from an already >1 value,
239 : // then the request will fail with 400.
240 : pub new_stripe_size: Option<ShardStripeSize>,
241 : }
242 :
243 0 : #[derive(Serialize, Deserialize)]
244 : pub struct TenantShardSplitResponse {
245 : pub new_shards: Vec<TenantShardId>,
246 : }
247 :
248 : /// Parameters that apply to all shards in a tenant. Used during tenant creation.
249 0 : #[derive(Serialize, Deserialize, Debug)]
250 : #[serde(deny_unknown_fields)]
251 : pub struct ShardParameters {
252 : pub count: ShardCount,
253 : pub stripe_size: ShardStripeSize,
254 : }
255 :
256 : impl ShardParameters {
257 : pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
258 :
259 0 : pub fn is_unsharded(&self) -> bool {
260 0 : self.count.is_unsharded()
261 0 : }
262 : }
263 :
264 : impl Default for ShardParameters {
265 171 : fn default() -> Self {
266 171 : Self {
267 171 : count: ShardCount::new(0),
268 171 : stripe_size: Self::DEFAULT_STRIPE_SIZE,
269 171 : }
270 171 : }
271 : }
272 :
273 : /// An alternative representation of `pageserver::tenant::TenantConf` with
274 : /// simpler types.
275 4 : #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
276 : pub struct TenantConfig {
277 : pub checkpoint_distance: Option<u64>,
278 : pub checkpoint_timeout: Option<String>,
279 : pub compaction_target_size: Option<u64>,
280 : pub compaction_period: Option<String>,
281 : pub compaction_threshold: Option<usize>,
282 : // defer parsing compaction_algorithm, like eviction_policy
283 : pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
284 : pub gc_horizon: Option<u64>,
285 : pub gc_period: Option<String>,
286 : pub image_creation_threshold: Option<usize>,
287 : pub pitr_interval: Option<String>,
288 : pub walreceiver_connect_timeout: Option<String>,
289 : pub lagging_wal_timeout: Option<String>,
290 : pub max_lsn_wal_lag: Option<NonZeroU64>,
291 : pub trace_read_requests: Option<bool>,
292 : pub eviction_policy: Option<EvictionPolicy>,
293 : pub min_resident_size_override: Option<u64>,
294 : pub evictions_low_residence_duration_metric_threshold: Option<String>,
295 : pub heatmap_period: Option<String>,
296 : pub lazy_slru_download: Option<bool>,
297 : pub timeline_get_throttle: Option<ThrottleConfig>,
298 : pub image_layer_creation_check_threshold: Option<u8>,
299 : pub switch_aux_file_policy: Option<AuxFilePolicy>,
300 : pub lsn_lease_length: Option<String>,
301 : pub lsn_lease_length_for_ts: Option<String>,
302 : }
303 :
304 : /// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
305 : /// tenant config. When the first aux file written, the policy will be persisted in the
306 : /// `index_part.json` file and has a limited migration path.
307 : ///
308 : /// Currently, we only allow the following migration path:
309 : ///
310 : /// Unset -> V1
311 : /// -> V2
312 : /// -> CrossValidation -> V2
313 : #[derive(
314 : Eq,
315 : PartialEq,
316 : Debug,
317 : Copy,
318 : Clone,
319 8 : strum_macros::EnumString,
320 20 : strum_macros::Display,
321 0 : serde_with::DeserializeFromStr,
322 : serde_with::SerializeDisplay,
323 : )]
324 : #[strum(serialize_all = "kebab-case")]
325 : pub enum AuxFilePolicy {
326 : /// V1 aux file policy: store everything in AUX_FILE_KEY
327 : #[strum(ascii_case_insensitive)]
328 : V1,
329 : /// V2 aux file policy: store in the AUX_FILE keyspace
330 : #[strum(ascii_case_insensitive)]
331 : V2,
332 : /// Cross validation runs both formats on the write path and does validation
333 : /// on the read path.
334 : #[strum(ascii_case_insensitive)]
335 : CrossValidation,
336 : }
337 :
338 : impl AuxFilePolicy {
339 54 : pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
340 34 : matches!(
341 54 : (from, to),
342 : (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
343 : )
344 54 : }
345 :
346 : /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
347 380 : pub fn default_tenant_config() -> Self {
348 380 : Self::V1
349 380 : }
350 : }
351 :
352 : /// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
353 : pub struct AtomicAuxFilePolicy(AtomicUsize);
354 :
355 : impl AtomicAuxFilePolicy {
356 383 : pub fn new(policy: Option<AuxFilePolicy>) -> Self {
357 383 : Self(AtomicUsize::new(
358 383 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
359 383 : ))
360 383 : }
361 :
362 306 : pub fn load(&self) -> Option<AuxFilePolicy> {
363 306 : match self.0.load(std::sync::atomic::Ordering::Acquire) {
364 240 : 0 => None,
365 66 : other => Some(AuxFilePolicy::from_usize(other)),
366 : }
367 306 : }
368 :
369 22 : pub fn store(&self, policy: Option<AuxFilePolicy>) {
370 22 : self.0.store(
371 22 : policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
372 22 : std::sync::atomic::Ordering::Release,
373 22 : );
374 22 : }
375 : }
376 :
377 : impl AuxFilePolicy {
378 20 : pub fn to_usize(self) -> usize {
379 20 : match self {
380 14 : Self::V1 => 1,
381 2 : Self::CrossValidation => 2,
382 4 : Self::V2 => 3,
383 : }
384 20 : }
385 :
386 66 : pub fn try_from_usize(this: usize) -> Option<Self> {
387 66 : match this {
388 36 : 1 => Some(Self::V1),
389 6 : 2 => Some(Self::CrossValidation),
390 24 : 3 => Some(Self::V2),
391 0 : _ => None,
392 : }
393 66 : }
394 :
395 66 : pub fn from_usize(this: usize) -> Self {
396 66 : Self::try_from_usize(this).unwrap()
397 66 : }
398 : }
399 :
400 4 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
401 : #[serde(tag = "kind")]
402 : pub enum EvictionPolicy {
403 : NoEviction,
404 : LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
405 : OnlyImitiate(EvictionPolicyLayerAccessThreshold),
406 : }
407 :
408 : impl EvictionPolicy {
409 0 : pub fn discriminant_str(&self) -> &'static str {
410 0 : match self {
411 0 : EvictionPolicy::NoEviction => "NoEviction",
412 0 : EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
413 0 : EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
414 : }
415 0 : }
416 : }
417 :
418 : #[derive(
419 : Eq,
420 : PartialEq,
421 : Debug,
422 : Copy,
423 : Clone,
424 0 : strum_macros::EnumString,
425 0 : strum_macros::Display,
426 0 : serde_with::DeserializeFromStr,
427 : serde_with::SerializeDisplay,
428 : )]
429 : #[strum(serialize_all = "kebab-case")]
430 : pub enum CompactionAlgorithm {
431 : Legacy,
432 : Tiered,
433 : }
434 :
435 0 : #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
436 : pub struct CompactionAlgorithmSettings {
437 : pub kind: CompactionAlgorithm,
438 : }
439 :
440 20 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
441 : pub struct EvictionPolicyLayerAccessThreshold {
442 : #[serde(with = "humantime_serde")]
443 : pub period: Duration,
444 : #[serde(with = "humantime_serde")]
445 : pub threshold: Duration,
446 : }
447 :
448 0 : #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
449 : pub struct ThrottleConfig {
450 : pub task_kinds: Vec<String>, // TaskKind
451 : pub initial: usize,
452 : #[serde(with = "humantime_serde")]
453 : pub refill_interval: Duration,
454 : pub refill_amount: NonZeroUsize,
455 : pub max: usize,
456 : pub fair: bool,
457 : }
458 :
459 : impl ThrottleConfig {
460 362 : pub fn disabled() -> Self {
461 362 : Self {
462 362 : task_kinds: vec![], // effectively disables the throttle
463 362 : // other values don't matter with emtpy `task_kinds`.
464 362 : initial: 0,
465 362 : refill_interval: Duration::from_millis(1),
466 362 : refill_amount: NonZeroUsize::new(1).unwrap(),
467 362 : max: 1,
468 362 : fair: true,
469 362 : }
470 362 : }
471 : /// The requests per second allowed by the given config.
472 0 : pub fn steady_rps(&self) -> f64 {
473 0 : (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
474 0 : }
475 : }
476 :
477 : /// A flattened analog of a `pagesever::tenant::LocationMode`, which
478 : /// lists out all possible states (and the virtual "Detached" state)
479 : /// in a flat form rather than using rust-style enums.
480 0 : #[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
481 : pub enum LocationConfigMode {
482 : AttachedSingle,
483 : AttachedMulti,
484 : AttachedStale,
485 : Secondary,
486 : Detached,
487 : }
488 :
489 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
490 : pub struct LocationConfigSecondary {
491 : pub warm: bool,
492 : }
493 :
494 : /// An alternative representation of `pageserver::tenant::LocationConf`,
495 : /// for use in external-facing APIs.
496 0 : #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
497 : pub struct LocationConfig {
498 : pub mode: LocationConfigMode,
499 : /// If attaching, in what generation?
500 : #[serde(default)]
501 : pub generation: Option<u32>,
502 :
503 : // If requesting mode `Secondary`, configuration for that.
504 : #[serde(default)]
505 : pub secondary_conf: Option<LocationConfigSecondary>,
506 :
507 : // Shard parameters: if shard_count is nonzero, then other shard_* fields
508 : // must be set accurately.
509 : #[serde(default)]
510 : pub shard_number: u8,
511 : #[serde(default)]
512 : pub shard_count: u8,
513 : #[serde(default)]
514 : pub shard_stripe_size: u32,
515 :
516 : // This configuration only affects attached mode, but should be provided irrespective
517 : // of the mode, as a secondary location might transition on startup if the response
518 : // to the `/re-attach` control plane API requests it.
519 : pub tenant_conf: TenantConfig,
520 : }
521 :
522 0 : #[derive(Serialize, Deserialize)]
523 : pub struct LocationConfigListResponse {
524 : pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
525 : }
526 :
527 : #[derive(Serialize)]
528 : pub struct StatusResponse {
529 : pub id: NodeId,
530 : }
531 :
532 0 : #[derive(Serialize, Deserialize, Debug)]
533 : #[serde(deny_unknown_fields)]
534 : pub struct TenantLocationConfigRequest {
535 : #[serde(flatten)]
536 : pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
537 : }
538 :
539 0 : #[derive(Serialize, Deserialize, Debug)]
540 : #[serde(deny_unknown_fields)]
541 : pub struct TenantTimeTravelRequest {
542 : pub shard_counts: Vec<ShardCount>,
543 : }
544 :
545 0 : #[derive(Serialize, Deserialize, Debug)]
546 : #[serde(deny_unknown_fields)]
547 : pub struct TenantShardLocation {
548 : pub shard_id: TenantShardId,
549 : pub node_id: NodeId,
550 : }
551 :
552 0 : #[derive(Serialize, Deserialize, Debug)]
553 : #[serde(deny_unknown_fields)]
554 : pub struct TenantLocationConfigResponse {
555 : pub shards: Vec<TenantShardLocation>,
556 : // If the shards' ShardCount count is >1, stripe_size will be set.
557 : pub stripe_size: Option<ShardStripeSize>,
558 : }
559 :
560 6 : #[derive(Serialize, Deserialize, Debug)]
561 : #[serde(deny_unknown_fields)]
562 : pub struct TenantConfigRequest {
563 : pub tenant_id: TenantId,
564 : #[serde(flatten)]
565 : pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
566 : }
567 :
568 : impl std::ops::Deref for TenantConfigRequest {
569 : type Target = TenantConfig;
570 :
571 0 : fn deref(&self) -> &Self::Target {
572 0 : &self.config
573 0 : }
574 : }
575 :
576 : impl TenantConfigRequest {
577 0 : pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
578 0 : let config = TenantConfig::default();
579 0 : TenantConfigRequest { tenant_id, config }
580 0 : }
581 : }
582 :
583 : /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
584 0 : #[derive(Serialize, Deserialize, Clone)]
585 : #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
586 : pub enum TenantAttachmentStatus {
587 : Maybe,
588 : Attached,
589 : Failed { reason: String },
590 : }
591 :
592 0 : #[derive(Serialize, Deserialize, Clone)]
593 : pub struct TenantInfo {
594 : pub id: TenantShardId,
595 : // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
596 : pub state: TenantState,
597 : /// Sum of the size of all layer files.
598 : /// If a layer is present in both local FS and S3, it counts only once.
599 : pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
600 : pub attachment_status: TenantAttachmentStatus,
601 : #[serde(skip_serializing_if = "Option::is_none")]
602 : pub generation: Option<u32>,
603 : }
604 :
605 0 : #[derive(Serialize, Deserialize, Clone)]
606 : pub struct TenantDetails {
607 : #[serde(flatten)]
608 : pub tenant_info: TenantInfo,
609 :
610 : pub walredo: Option<WalRedoManagerStatus>,
611 :
612 : pub timelines: Vec<TimelineId>,
613 : }
614 :
615 : /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
616 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
617 : pub struct TimelineInfo {
618 : pub tenant_id: TenantShardId,
619 : pub timeline_id: TimelineId,
620 :
621 : pub ancestor_timeline_id: Option<TimelineId>,
622 : pub ancestor_lsn: Option<Lsn>,
623 : pub last_record_lsn: Lsn,
624 : pub prev_record_lsn: Option<Lsn>,
625 : pub latest_gc_cutoff_lsn: Lsn,
626 : pub disk_consistent_lsn: Lsn,
627 :
628 : /// The LSN that we have succesfully uploaded to remote storage
629 : pub remote_consistent_lsn: Lsn,
630 :
631 : /// The LSN that we are advertizing to safekeepers
632 : pub remote_consistent_lsn_visible: Lsn,
633 :
634 : /// The LSN from the start of the root timeline (never changes)
635 : pub initdb_lsn: Lsn,
636 :
637 : pub current_logical_size: u64,
638 : pub current_logical_size_is_accurate: bool,
639 :
640 : pub directory_entries_counts: Vec<u64>,
641 :
642 : /// Sum of the size of all layer files.
643 : /// If a layer is present in both local FS and S3, it counts only once.
644 : pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
645 : pub current_logical_size_non_incremental: Option<u64>,
646 :
647 : pub timeline_dir_layer_file_size_sum: Option<u64>,
648 :
649 : pub wal_source_connstr: Option<String>,
650 : pub last_received_msg_lsn: Option<Lsn>,
651 : /// the timestamp (in microseconds) of the last received message
652 : pub last_received_msg_ts: Option<u128>,
653 : pub pg_version: u32,
654 :
655 : pub state: TimelineState,
656 :
657 : pub walreceiver_status: String,
658 :
659 : /// The last aux file policy being used on this timeline
660 : pub last_aux_file_policy: Option<AuxFilePolicy>,
661 : }
662 :
663 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
664 : pub struct LayerMapInfo {
665 : pub in_memory_layers: Vec<InMemoryLayerInfo>,
666 : pub historic_layers: Vec<HistoricLayerInfo>,
667 : }
668 :
669 0 : #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)]
670 : #[repr(usize)]
671 : pub enum LayerAccessKind {
672 : GetValueReconstructData,
673 : Iter,
674 : KeyIter,
675 : Dump,
676 : }
677 :
678 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
679 : pub struct LayerAccessStatFullDetails {
680 : pub when_millis_since_epoch: u64,
681 : pub task_kind: Cow<'static, str>,
682 : pub access_kind: LayerAccessKind,
683 : }
684 :
685 : /// An event that impacts the layer's residence status.
686 : #[serde_as]
687 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
688 : pub struct LayerResidenceEvent {
689 : /// The time when the event occurred.
690 : /// NB: this timestamp is captured while the residence status changes.
691 : /// So, it might be behind/ahead of the actual residence change by a short amount of time.
692 : ///
693 : #[serde(rename = "timestamp_millis_since_epoch")]
694 : #[serde_as(as = "serde_with::TimestampMilliSeconds")]
695 : pub timestamp: SystemTime,
696 : /// The new residence status of the layer.
697 : pub status: LayerResidenceStatus,
698 : /// The reason why we had to record this event.
699 : pub reason: LayerResidenceEventReason,
700 : }
701 :
702 : /// The reason for recording a given [`LayerResidenceEvent`].
703 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
704 : pub enum LayerResidenceEventReason {
705 : /// The layer map is being populated, e.g. during timeline load or attach.
706 : /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
707 : /// We need to record such events because there is no persistent storage for the events.
708 : ///
709 : // https://github.com/rust-lang/rust/issues/74481
710 : /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
711 : /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
712 : LayerLoad,
713 : /// We just created the layer (e.g., freeze_and_flush or compaction).
714 : /// Such layers are always [`LayerResidenceStatus::Resident`].
715 : LayerCreate,
716 : /// We on-demand downloaded or evicted the given layer.
717 : ResidenceChange,
718 : }
719 :
720 : /// The residence status of the layer, after the given [`LayerResidenceEvent`].
721 0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
722 : pub enum LayerResidenceStatus {
723 : /// Residence status for a layer file that exists locally.
724 : /// It may also exist on the remote, we don't care here.
725 : Resident,
726 : /// Residence status for a layer file that only exists on the remote.
727 : Evicted,
728 : }
729 :
730 : impl LayerResidenceEvent {
731 3220 : pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self {
732 3220 : Self {
733 3220 : status,
734 3220 : reason,
735 3220 : timestamp: SystemTime::now(),
736 3220 : }
737 3220 : }
738 : }
739 :
740 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
741 : pub struct LayerAccessStats {
742 : pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
743 : pub task_kind_access_flag: Vec<Cow<'static, str>>,
744 : pub first: Option<LayerAccessStatFullDetails>,
745 : pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
746 : pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
747 : }
748 :
749 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
750 : #[serde(tag = "kind")]
751 : pub enum InMemoryLayerInfo {
752 : Open { lsn_start: Lsn },
753 : Frozen { lsn_start: Lsn, lsn_end: Lsn },
754 : }
755 :
756 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
757 : #[serde(tag = "kind")]
758 : pub enum HistoricLayerInfo {
759 : Delta {
760 : layer_file_name: String,
761 : layer_file_size: u64,
762 :
763 : lsn_start: Lsn,
764 : lsn_end: Lsn,
765 : remote: bool,
766 : access_stats: LayerAccessStats,
767 :
768 : l0: bool,
769 : },
770 : Image {
771 : layer_file_name: String,
772 : layer_file_size: u64,
773 :
774 : lsn_start: Lsn,
775 : remote: bool,
776 : access_stats: LayerAccessStats,
777 : },
778 : }
779 :
780 : impl HistoricLayerInfo {
781 0 : pub fn layer_file_name(&self) -> &str {
782 0 : match self {
783 : HistoricLayerInfo::Delta {
784 0 : layer_file_name, ..
785 0 : } => layer_file_name,
786 : HistoricLayerInfo::Image {
787 0 : layer_file_name, ..
788 0 : } => layer_file_name,
789 : }
790 0 : }
791 0 : pub fn is_remote(&self) -> bool {
792 0 : match self {
793 0 : HistoricLayerInfo::Delta { remote, .. } => *remote,
794 0 : HistoricLayerInfo::Image { remote, .. } => *remote,
795 : }
796 0 : }
797 0 : pub fn set_remote(&mut self, value: bool) {
798 0 : let field = match self {
799 0 : HistoricLayerInfo::Delta { remote, .. } => remote,
800 0 : HistoricLayerInfo::Image { remote, .. } => remote,
801 : };
802 0 : *field = value;
803 0 : }
804 0 : pub fn layer_file_size(&self) -> u64 {
805 0 : match self {
806 : HistoricLayerInfo::Delta {
807 0 : layer_file_size, ..
808 0 : } => *layer_file_size,
809 : HistoricLayerInfo::Image {
810 0 : layer_file_size, ..
811 0 : } => *layer_file_size,
812 : }
813 0 : }
814 : }
815 :
816 0 : #[derive(Debug, Serialize, Deserialize)]
817 : pub struct DownloadRemoteLayersTaskSpawnRequest {
818 : pub max_concurrent_downloads: NonZeroUsize,
819 : }
820 :
821 0 : #[derive(Debug, Serialize, Deserialize)]
822 : pub struct IngestAuxFilesRequest {
823 : pub aux_files: HashMap<String, String>,
824 : }
825 :
826 0 : #[derive(Debug, Serialize, Deserialize)]
827 : pub struct ListAuxFilesRequest {
828 : pub lsn: Lsn,
829 : }
830 :
831 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
832 : pub struct DownloadRemoteLayersTaskInfo {
833 : pub task_id: String,
834 : pub state: DownloadRemoteLayersTaskState,
835 : pub total_layer_count: u64, // stable once `completed`
836 : pub successful_download_count: u64, // stable once `completed`
837 : pub failed_download_count: u64, // stable once `completed`
838 : }
839 :
840 0 : #[derive(Debug, Serialize, Deserialize, Clone)]
841 : pub enum DownloadRemoteLayersTaskState {
842 : Running,
843 : Completed,
844 : ShutDown,
845 : }
846 :
847 0 : #[derive(Debug, Serialize, Deserialize)]
848 : pub struct TimelineGcRequest {
849 : pub gc_horizon: Option<u64>,
850 : }
851 :
852 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
853 : pub struct WalRedoManagerProcessStatus {
854 : pub pid: u32,
855 : }
856 :
857 0 : #[derive(Debug, Clone, Serialize, Deserialize)]
858 : pub struct WalRedoManagerStatus {
859 : pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
860 : pub process: Option<WalRedoManagerProcessStatus>,
861 : }
862 :
863 : /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
864 : /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
865 : /// what's happening.
866 0 : #[derive(Default, Debug, Serialize, Deserialize, Clone)]
867 : pub struct SecondaryProgress {
868 : /// The remote storage LastModified time of the heatmap object we last downloaded.
869 : pub heatmap_mtime: Option<serde_system_time::SystemTime>,
870 :
871 : /// The number of layers currently on-disk
872 : pub layers_downloaded: usize,
873 : /// The number of layers in the most recently seen heatmap
874 : pub layers_total: usize,
875 :
876 : /// The number of layer bytes currently on-disk
877 : pub bytes_downloaded: u64,
878 : /// The number of layer bytes in the most recently seen heatmap
879 : pub bytes_total: u64,
880 : }
881 :
882 0 : #[derive(Serialize, Deserialize, Debug)]
883 : pub struct TenantScanRemoteStorageShard {
884 : pub tenant_shard_id: TenantShardId,
885 : pub generation: Option<u32>,
886 : }
887 :
888 0 : #[derive(Serialize, Deserialize, Debug, Default)]
889 : pub struct TenantScanRemoteStorageResponse {
890 : pub shards: Vec<TenantScanRemoteStorageShard>,
891 : }
892 :
893 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
894 : #[serde(rename_all = "snake_case")]
895 : pub enum TenantSorting {
896 : ResidentSize,
897 : MaxLogicalSize,
898 : }
899 :
900 : impl Default for TenantSorting {
901 0 : fn default() -> Self {
902 0 : Self::ResidentSize
903 0 : }
904 : }
905 :
906 0 : #[derive(Serialize, Deserialize, Debug, Clone)]
907 : pub struct TopTenantShardsRequest {
908 : // How would you like to sort the tenants?
909 : pub order_by: TenantSorting,
910 :
911 : // How many results?
912 : pub limit: usize,
913 :
914 : // Omit tenants with more than this many shards (e.g. if this is the max number of shards
915 : // that the caller would ever split to)
916 : pub where_shards_lt: Option<ShardCount>,
917 :
918 : // Omit tenants where the ordering metric is less than this (this is an optimization to
919 : // let us quickly exclude numerous tiny shards)
920 : pub where_gt: Option<u64>,
921 : }
922 :
923 0 : #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
924 : pub struct TopTenantShardItem {
925 : pub id: TenantShardId,
926 :
927 : /// Total size of layers on local disk for all timelines in this tenant
928 : pub resident_size: u64,
929 :
930 : /// Total size of layers in remote storage for all timelines in this tenant
931 : pub physical_size: u64,
932 :
933 : /// The largest logical size of a timeline within this tenant
934 : pub max_logical_size: u64,
935 : }
936 :
937 0 : #[derive(Serialize, Deserialize, Debug, Default)]
938 : pub struct TopTenantShardsResponse {
939 : pub shards: Vec<TopTenantShardItem>,
940 : }
941 :
942 : pub mod virtual_file {
943 : #[derive(
944 : Copy,
945 : Clone,
946 : PartialEq,
947 : Eq,
948 : Hash,
949 353 : strum_macros::EnumString,
950 0 : strum_macros::Display,
951 0 : serde_with::DeserializeFromStr,
952 : serde_with::SerializeDisplay,
953 : Debug,
954 : )]
955 : #[strum(serialize_all = "kebab-case")]
956 : pub enum IoEngineKind {
957 : StdFs,
958 : #[cfg(target_os = "linux")]
959 : TokioEpollUring,
960 : }
961 : }
962 :
963 : // Wrapped in libpq CopyData
964 : #[derive(PartialEq, Eq, Debug)]
965 : pub enum PagestreamFeMessage {
966 : Exists(PagestreamExistsRequest),
967 : Nblocks(PagestreamNblocksRequest),
968 : GetPage(PagestreamGetPageRequest),
969 : DbSize(PagestreamDbSizeRequest),
970 : GetSlruSegment(PagestreamGetSlruSegmentRequest),
971 : }
972 :
973 : // Wrapped in libpq CopyData
974 0 : #[derive(strum_macros::EnumProperty)]
975 : pub enum PagestreamBeMessage {
976 : Exists(PagestreamExistsResponse),
977 : Nblocks(PagestreamNblocksResponse),
978 : GetPage(PagestreamGetPageResponse),
979 : Error(PagestreamErrorResponse),
980 : DbSize(PagestreamDbSizeResponse),
981 : GetSlruSegment(PagestreamGetSlruSegmentResponse),
982 : }
983 :
/// Tag byte that starts every serialized `PagestreamBeMessage`.
///
/// Keep in sync with `pagestore_client.h`.
#[repr(u8)]
enum PagestreamBeMessageTag {
    Exists = 100,
    Nblocks = 101,
    GetPage = 102,
    Error = 103,
    DbSize = 104,
    GetSlruSegment = 105,
}

/// Maps a raw tag byte back to its variant; an unknown byte is handed back as the error.
impl TryFrom<u8> for PagestreamBeMessageTag {
    type Error = u8;

    fn try_from(value: u8) -> Result<Self, u8> {
        // The alias keeps the `Error` variant visually apart from the `Error` associated type.
        use PagestreamBeMessageTag as Tag;

        let tag = match value {
            100 => Tag::Exists,
            101 => Tag::Nblocks,
            102 => Tag::GetPage,
            103 => Tag::Error,
            104 => Tag::DbSize,
            105 => Tag::GetSlruSegment,
            unknown => return Err(unknown),
        };
        Ok(tag)
    }
}
1008 :
// In the V2 protocol version, a GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::MAX is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
// Lsn::MAX. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since' equal to 'request_lsn' (unless Lsn::MAX is used as the 'request_lsn'), but
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
// standby to request a page at a particular non-latest LSN, and also include the
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
// interface allows sending both LSNs, and lets the pageserver do the right thing. There is no
// difference in the responses between V1 and V2.
//
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
// maps the old format requests to the new format.
//
/// Version of the pagestream request encoding.
///
/// Derives `Debug`, `PartialEq` and `Eq` so that a version can be logged and compared.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PagestreamProtocolVersion {
    /// Legacy encoding: a boolean 'latest' flag followed by a single LSN.
    V1,
    /// Current encoding: `request_lsn` followed by `not_modified_since`.
    V2,
}
1041 :
1042 : #[derive(Debug, PartialEq, Eq)]
1043 : pub struct PagestreamExistsRequest {
1044 : pub request_lsn: Lsn,
1045 : pub not_modified_since: Lsn,
1046 : pub rel: RelTag,
1047 : }
1048 :
1049 : #[derive(Debug, PartialEq, Eq)]
1050 : pub struct PagestreamNblocksRequest {
1051 : pub request_lsn: Lsn,
1052 : pub not_modified_since: Lsn,
1053 : pub rel: RelTag,
1054 : }
1055 :
1056 : #[derive(Debug, PartialEq, Eq)]
1057 : pub struct PagestreamGetPageRequest {
1058 : pub request_lsn: Lsn,
1059 : pub not_modified_since: Lsn,
1060 : pub rel: RelTag,
1061 : pub blkno: u32,
1062 : }
1063 :
1064 : #[derive(Debug, PartialEq, Eq)]
1065 : pub struct PagestreamDbSizeRequest {
1066 : pub request_lsn: Lsn,
1067 : pub not_modified_since: Lsn,
1068 : pub dbnode: u32,
1069 : }
1070 :
1071 : #[derive(Debug, PartialEq, Eq)]
1072 : pub struct PagestreamGetSlruSegmentRequest {
1073 : pub request_lsn: Lsn,
1074 : pub not_modified_since: Lsn,
1075 : pub kind: u8,
1076 : pub segno: u32,
1077 : }
1078 :
/// Payload of a [`PagestreamBeMessage::Exists`] response.
///
/// Derives `Clone`, `PartialEq` and `Eq` like the request structs do, so responses can be
/// compared directly (e.g. in round-trip tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PagestreamExistsResponse {
    /// Encoded on the wire as a single byte; any non-zero byte decodes to `true`.
    pub exists: bool,
}
1083 :
/// Payload of a [`PagestreamBeMessage::Nblocks`] response.
///
/// Derives `Clone`, `PartialEq` and `Eq` like the request structs do, so responses can be
/// compared directly (e.g. in round-trip tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PagestreamNblocksResponse {
    /// Number of blocks.
    pub n_blocks: u32,
}
1088 :
1089 : #[derive(Debug)]
1090 : pub struct PagestreamGetPageResponse {
1091 : pub page: Bytes,
1092 : }
1093 :
1094 : #[derive(Debug)]
1095 : pub struct PagestreamGetSlruSegmentResponse {
1096 : pub segment: Bytes,
1097 : }
1098 :
/// Payload of a [`PagestreamBeMessage::Error`] response.
///
/// Derives `Clone`, `PartialEq` and `Eq` like the request structs do, so responses can be
/// compared directly (e.g. in round-trip tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PagestreamErrorResponse {
    /// Error text; sent on the wire as a NUL-terminated string.
    pub message: String,
}
1103 :
/// Payload of a [`PagestreamBeMessage::DbSize`] response.
///
/// Derives `Clone`, `PartialEq` and `Eq` like the request structs do, so responses can be
/// compared directly (e.g. in round-trip tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PagestreamDbSizeResponse {
    /// Database size, carried as a signed 64-bit integer.
    pub db_size: i64,
}
1108 :
1109 : // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
1110 : // that require pageserver-internal types. It is sufficient to get the total size.
1111 0 : #[derive(Serialize, Deserialize, Debug)]
1112 : pub struct TenantHistorySize {
1113 : pub id: TenantId,
1114 : /// Size is a mixture of WAL and logical size, so the unit is bytes.
1115 : ///
1116 : /// Will be none if `?inputs_only=true` was given.
1117 : pub size: Option<u64>,
1118 : }
1119 :
1120 : impl PagestreamFeMessage {
1121 : /// Serialize a compute -> pageserver message. This is currently only used in testing
1122 : /// tools. Always uses protocol version 2.
1123 8 : pub fn serialize(&self) -> Bytes {
1124 8 : let mut bytes = BytesMut::new();
1125 8 :
1126 8 : match self {
1127 2 : Self::Exists(req) => {
1128 2 : bytes.put_u8(0);
1129 2 : bytes.put_u64(req.request_lsn.0);
1130 2 : bytes.put_u64(req.not_modified_since.0);
1131 2 : bytes.put_u32(req.rel.spcnode);
1132 2 : bytes.put_u32(req.rel.dbnode);
1133 2 : bytes.put_u32(req.rel.relnode);
1134 2 : bytes.put_u8(req.rel.forknum);
1135 2 : }
1136 :
1137 2 : Self::Nblocks(req) => {
1138 2 : bytes.put_u8(1);
1139 2 : bytes.put_u64(req.request_lsn.0);
1140 2 : bytes.put_u64(req.not_modified_since.0);
1141 2 : bytes.put_u32(req.rel.spcnode);
1142 2 : bytes.put_u32(req.rel.dbnode);
1143 2 : bytes.put_u32(req.rel.relnode);
1144 2 : bytes.put_u8(req.rel.forknum);
1145 2 : }
1146 :
1147 2 : Self::GetPage(req) => {
1148 2 : bytes.put_u8(2);
1149 2 : bytes.put_u64(req.request_lsn.0);
1150 2 : bytes.put_u64(req.not_modified_since.0);
1151 2 : bytes.put_u32(req.rel.spcnode);
1152 2 : bytes.put_u32(req.rel.dbnode);
1153 2 : bytes.put_u32(req.rel.relnode);
1154 2 : bytes.put_u8(req.rel.forknum);
1155 2 : bytes.put_u32(req.blkno);
1156 2 : }
1157 :
1158 2 : Self::DbSize(req) => {
1159 2 : bytes.put_u8(3);
1160 2 : bytes.put_u64(req.request_lsn.0);
1161 2 : bytes.put_u64(req.not_modified_since.0);
1162 2 : bytes.put_u32(req.dbnode);
1163 2 : }
1164 :
1165 0 : Self::GetSlruSegment(req) => {
1166 0 : bytes.put_u8(4);
1167 0 : bytes.put_u64(req.request_lsn.0);
1168 0 : bytes.put_u64(req.not_modified_since.0);
1169 0 : bytes.put_u8(req.kind);
1170 0 : bytes.put_u32(req.segno);
1171 0 : }
1172 : }
1173 :
1174 8 : bytes.into()
1175 8 : }
1176 :
1177 8 : pub fn parse<R: std::io::Read>(
1178 8 : body: &mut R,
1179 8 : protocol_version: PagestreamProtocolVersion,
1180 8 : ) -> anyhow::Result<PagestreamFeMessage> {
1181 : // these correspond to the NeonMessageTag enum in pagestore_client.h
1182 : //
1183 : // TODO: consider using protobuf or serde bincode for less error prone
1184 : // serialization.
1185 8 : let msg_tag = body.read_u8()?;
1186 :
1187 8 : let (request_lsn, not_modified_since) = match protocol_version {
1188 : PagestreamProtocolVersion::V2 => (
1189 8 : Lsn::from(body.read_u64::<BigEndian>()?),
1190 8 : Lsn::from(body.read_u64::<BigEndian>()?),
1191 : ),
1192 : PagestreamProtocolVersion::V1 => {
1193 : // In the old protocol, each message starts with a boolean 'latest' flag,
1194 : // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
1195 : // 'not_modified_since', used in the new protocol version.
1196 0 : let latest = body.read_u8()? != 0;
1197 0 : let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
1198 0 : if latest {
1199 0 : (Lsn::MAX, request_lsn) // get latest version
1200 : } else {
1201 0 : (request_lsn, request_lsn) // get version at specified LSN
1202 : }
1203 : }
1204 : };
1205 :
1206 : // The rest of the messages are the same between V1 and V2
1207 8 : match msg_tag {
1208 : 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
1209 2 : request_lsn,
1210 2 : not_modified_since,
1211 2 : rel: RelTag {
1212 2 : spcnode: body.read_u32::<BigEndian>()?,
1213 2 : dbnode: body.read_u32::<BigEndian>()?,
1214 2 : relnode: body.read_u32::<BigEndian>()?,
1215 2 : forknum: body.read_u8()?,
1216 : },
1217 : })),
1218 : 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
1219 2 : request_lsn,
1220 2 : not_modified_since,
1221 2 : rel: RelTag {
1222 2 : spcnode: body.read_u32::<BigEndian>()?,
1223 2 : dbnode: body.read_u32::<BigEndian>()?,
1224 2 : relnode: body.read_u32::<BigEndian>()?,
1225 2 : forknum: body.read_u8()?,
1226 : },
1227 : })),
1228 : 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
1229 2 : request_lsn,
1230 2 : not_modified_since,
1231 2 : rel: RelTag {
1232 2 : spcnode: body.read_u32::<BigEndian>()?,
1233 2 : dbnode: body.read_u32::<BigEndian>()?,
1234 2 : relnode: body.read_u32::<BigEndian>()?,
1235 2 : forknum: body.read_u8()?,
1236 : },
1237 2 : blkno: body.read_u32::<BigEndian>()?,
1238 : })),
1239 : 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
1240 2 : request_lsn,
1241 2 : not_modified_since,
1242 2 : dbnode: body.read_u32::<BigEndian>()?,
1243 : })),
1244 : 4 => Ok(PagestreamFeMessage::GetSlruSegment(
1245 : PagestreamGetSlruSegmentRequest {
1246 0 : request_lsn,
1247 0 : not_modified_since,
1248 0 : kind: body.read_u8()?,
1249 0 : segno: body.read_u32::<BigEndian>()?,
1250 : },
1251 : )),
1252 0 : _ => bail!("unknown smgr message tag: {:?}", msg_tag),
1253 : }
1254 8 : }
1255 : }
1256 :
1257 : impl PagestreamBeMessage {
1258 0 : pub fn serialize(&self) -> Bytes {
1259 0 : let mut bytes = BytesMut::new();
1260 0 :
1261 0 : use PagestreamBeMessageTag as Tag;
1262 0 : match self {
1263 0 : Self::Exists(resp) => {
1264 0 : bytes.put_u8(Tag::Exists as u8);
1265 0 : bytes.put_u8(resp.exists as u8);
1266 0 : }
1267 :
1268 0 : Self::Nblocks(resp) => {
1269 0 : bytes.put_u8(Tag::Nblocks as u8);
1270 0 : bytes.put_u32(resp.n_blocks);
1271 0 : }
1272 :
1273 0 : Self::GetPage(resp) => {
1274 0 : bytes.put_u8(Tag::GetPage as u8);
1275 0 : bytes.put(&resp.page[..]);
1276 0 : }
1277 :
1278 0 : Self::Error(resp) => {
1279 0 : bytes.put_u8(Tag::Error as u8);
1280 0 : bytes.put(resp.message.as_bytes());
1281 0 : bytes.put_u8(0); // null terminator
1282 0 : }
1283 0 : Self::DbSize(resp) => {
1284 0 : bytes.put_u8(Tag::DbSize as u8);
1285 0 : bytes.put_i64(resp.db_size);
1286 0 : }
1287 :
1288 0 : Self::GetSlruSegment(resp) => {
1289 0 : bytes.put_u8(Tag::GetSlruSegment as u8);
1290 0 : bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
1291 0 : bytes.put(&resp.segment[..]);
1292 0 : }
1293 : }
1294 :
1295 0 : bytes.into()
1296 0 : }
1297 :
1298 0 : pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
1299 0 : let mut buf = buf.reader();
1300 0 : let msg_tag = buf.read_u8()?;
1301 :
1302 : use PagestreamBeMessageTag as Tag;
1303 0 : let ok =
1304 0 : match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
1305 : Tag::Exists => {
1306 0 : let exists = buf.read_u8()?;
1307 0 : Self::Exists(PagestreamExistsResponse {
1308 0 : exists: exists != 0,
1309 0 : })
1310 : }
1311 : Tag::Nblocks => {
1312 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1313 0 : Self::Nblocks(PagestreamNblocksResponse { n_blocks })
1314 : }
1315 : Tag::GetPage => {
1316 0 : let mut page = vec![0; 8192]; // TODO: use MaybeUninit
1317 0 : buf.read_exact(&mut page)?;
1318 0 : PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
1319 : }
1320 : Tag::Error => {
1321 0 : let mut msg = Vec::new();
1322 0 : buf.read_until(0, &mut msg)?;
1323 0 : let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
1324 0 : let rust_str = cstring.to_str()?;
1325 0 : PagestreamBeMessage::Error(PagestreamErrorResponse {
1326 0 : message: rust_str.to_owned(),
1327 0 : })
1328 : }
1329 : Tag::DbSize => {
1330 0 : let db_size = buf.read_i64::<BigEndian>()?;
1331 0 : Self::DbSize(PagestreamDbSizeResponse { db_size })
1332 : }
1333 : Tag::GetSlruSegment => {
1334 0 : let n_blocks = buf.read_u32::<BigEndian>()?;
1335 0 : let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
1336 0 : buf.read_exact(&mut segment)?;
1337 0 : Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
1338 0 : segment: segment.into(),
1339 0 : })
1340 : }
1341 : };
1342 0 : let remaining = buf.into_inner();
1343 0 : if !remaining.is_empty() {
1344 0 : anyhow::bail!(
1345 0 : "remaining bytes in msg with tag={msg_tag}: {}",
1346 0 : remaining.len()
1347 0 : );
1348 0 : }
1349 0 : Ok(ok)
1350 0 : }
1351 :
1352 0 : pub fn kind(&self) -> &'static str {
1353 0 : match self {
1354 0 : Self::Exists(_) => "Exists",
1355 0 : Self::Nblocks(_) => "Nblocks",
1356 0 : Self::GetPage(_) => "GetPage",
1357 0 : Self::Error(_) => "Error",
1358 0 : Self::DbSize(_) => "DbSize",
1359 0 : Self::GetSlruSegment(_) => "GetSlruSegment",
1360 : }
1361 0 : }
1362 : }
1363 :
#[cfg(test)]
mod tests {
    use serde_json::json;
    use std::str::FromStr;

    use super::*;

    /// Every request variant must survive a serialize -> parse round trip in protocol V2.
    #[test]
    fn test_pagestream() {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
                    dbnode: 3,
                    relnode: 4,
                },
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                dbnode: 7,
            }),
            PagestreamFeMessage::GetSlruSegment(PagestreamGetSlruSegmentRequest {
                request_lsn: Lsn(4),
                not_modified_since: Lsn(3),
                kind: 1,
                segno: 5,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
            let reconstructed =
                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
                    .unwrap();
            // assert_eq! prints both values on failure, unlike assert!(a == b).
            assert_eq!(msg, reconstructed);
        }
    }

    /// V1 requests carry a 'latest' flag and a single LSN; check the mapping to two LSNs.
    #[test]
    fn test_pagestream_v1_parse() {
        for (latest, expected_request_lsn) in [(1u8, Lsn::MAX), (0u8, Lsn(0x10))] {
            let mut raw = BytesMut::new();
            raw.put_u8(3); // DbSize tag
            raw.put_u8(latest);
            raw.put_u64(0x10);
            raw.put_u32(7);

            let parsed = PagestreamFeMessage::parse(
                &mut raw.freeze().reader(),
                PagestreamProtocolVersion::V1,
            )
            .unwrap();
            assert_eq!(
                parsed,
                PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
                    request_lsn: expected_request_lsn,
                    not_modified_since: Lsn(0x10),
                    dbnode: 7,
                })
            );
        }
    }

    /// Responses must survive serialize -> deserialize; malformed input must be rejected.
    #[test]
    fn test_pagestream_be_roundtrip() {
        let bytes =
            PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks: 42 }).serialize();
        match PagestreamBeMessage::deserialize(bytes).unwrap() {
            PagestreamBeMessage::Nblocks(resp) => assert_eq!(resp.n_blocks, 42),
            other => panic!("unexpected message kind {}", other.kind()),
        }

        let bytes = PagestreamBeMessage::Error(PagestreamErrorResponse {
            message: "boom".to_string(),
        })
        .serialize();
        match PagestreamBeMessage::deserialize(bytes).unwrap() {
            PagestreamBeMessage::Error(resp) => assert_eq!(resp.message, "boom"),
            other => panic!("unexpected message kind {}", other.kind()),
        }

        // Unknown tag byte.
        assert!(PagestreamBeMessage::deserialize(Bytes::from_static(&[0u8])).is_err());
        // Trailing bytes after a complete Exists response (tag 100).
        assert!(PagestreamBeMessage::deserialize(Bytes::from_static(&[100u8, 1, 0])).is_err());
    }

    #[test]
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
            "state": {
                "slug": "Active",
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            }
        });

        let original_broken = TenantInfo {
            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
            "state": {
                "slug": "Broken",
                "data": {
                    "backtrace": "backtrace info",
                    "reason": "reason",
                }
            },
            "current_physical_size": 42,
            "attachment_status": {
                "slug":"attached",
            }
        });

        assert_eq!(
            serde_json::to_value(&original_active).unwrap(),
            expected_active
        );

        assert_eq!(
            serde_json::to_value(&original_broken).unwrap(),
            expected_broken
        );
        assert!(format!("{:?}", &original_broken.state).contains("reason"));
        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
    }

    #[test]
    fn test_reject_unknown_field() {
        let id = TenantId::generate();
        let config_request = json!({
            "tenant_id": id.to_string(),
            "unknown_field": "unknown_value".to_string(),
        });
        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
            "expect unknown field `unknown_field` error, got: {}",
            err
        );
    }

    #[test]
    fn tenantstatus_activating_serde() {
        let states = [
            TenantState::Activating(ActivatingFrom::Loading),
            TenantState::Activating(ActivatingFrom::Attaching),
        ];
        let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";

        let actual = serde_json::to_string(&states).unwrap();

        assert_eq!(actual, expected);

        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();

        assert_eq!(states.as_slice(), &parsed);
    }

    #[test]
    fn tenantstatus_activating_strum() {
        // tests added, because we use these for metrics
        let examples = [
            (line!(), TenantState::Loading, "Loading"),
            (line!(), TenantState::Attaching, "Attaching"),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Loading),
                "Activating",
            ),
            (
                line!(),
                TenantState::Activating(ActivatingFrom::Attaching),
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
            (
                line!(),
                TenantState::Stopping {
                    progress: utils::completion::Barrier::default(),
                },
                "Stopping",
            ),
            (
                line!(),
                TenantState::Broken {
                    reason: "Example".into(),
                    backtrace: "Looooong backtrace".into(),
                },
                "Broken",
            ),
        ];

        for (line, rendered, expected) in examples {
            let actual: &'static str = rendered.into();
            assert_eq!(actual, expected, "example on {line}");
        }
    }

    #[test]
    fn test_aux_file_migration_path() {
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::V1
        ));
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::V2
        ));
        assert!(AuxFilePolicy::is_valid_migration_path(
            None,
            AuxFilePolicy::CrossValidation
        ));
        // Self-migration is not a valid migration path, and the caller should handle it by itself.
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::V2
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::CrossValidation
        ));
        // Migrations not allowed
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::V2
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::V1
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V2),
            AuxFilePolicy::CrossValidation
        ));
        assert!(!AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::V1),
            AuxFilePolicy::CrossValidation
        ));
        // Migrations allowed
        assert!(AuxFilePolicy::is_valid_migration_path(
            Some(AuxFilePolicy::CrossValidation),
            AuxFilePolicy::V2
        ));
    }

    #[test]
    fn test_aux_parse() {
        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
        assert_eq!(
            AuxFilePolicy::from_str("cross-validation").unwrap(),
            AuxFilePolicy::CrossValidation
        );
    }
}
|