Line data Source code
1 : //! Timeline repository implementation that keeps old data in layer files, and
2 : //! the recent changes in ephemeral files.
3 : //!
4 : //! See tenant/*_layer.rs files. The functions here are responsible for locating
5 : //! the correct layer for the get/put call, walking back the timeline branching
6 : //! history as needed.
7 : //!
8 : //! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
9 : //! directory. See docs/pageserver-storage.md for how the files are managed.
10 : //! In addition to the layer files, there is a metadata file in the same
11 : //! directory that contains information about the timeline, in particular its
12 : //! parent timeline, and the last LSN that has been written to disk.
13 : //!
14 :
15 : use anyhow::{bail, Context};
16 : use arc_swap::ArcSwap;
17 : use camino::Utf8Path;
18 : use camino::Utf8PathBuf;
19 : use enumset::EnumSet;
20 : use futures::stream::FuturesUnordered;
21 : use futures::StreamExt;
22 : use pageserver_api::models;
23 : use pageserver_api::models::AuxFilePolicy;
24 : use pageserver_api::models::TimelineArchivalState;
25 : use pageserver_api::models::TimelineState;
26 : use pageserver_api::models::TopTenantShardItem;
27 : use pageserver_api::models::WalRedoManagerStatus;
28 : use pageserver_api::shard::ShardIdentity;
29 : use pageserver_api::shard::ShardStripeSize;
30 : use pageserver_api::shard::TenantShardId;
31 : use remote_storage::DownloadError;
32 : use remote_storage::GenericRemoteStorage;
33 : use remote_storage::TimeoutOrCancel;
34 : use std::collections::BTreeMap;
35 : use std::fmt;
36 : use std::future::Future;
37 : use std::sync::Weak;
38 : use std::time::SystemTime;
39 : use storage_broker::BrokerClientChannel;
40 : use tokio::io::BufReader;
41 : use tokio::sync::watch;
42 : use tokio::task::JoinSet;
43 : use tokio_util::sync::CancellationToken;
44 : use tracing::*;
45 : use upload_queue::NotInitialized;
46 : use utils::backoff;
47 : use utils::circuit_breaker::CircuitBreaker;
48 : use utils::completion;
49 : use utils::crashsafe::path_with_suffix_extension;
50 : use utils::failpoint_support;
51 : use utils::fs_ext;
52 : use utils::pausable_failpoint;
53 : use utils::sync::gate::Gate;
54 : use utils::sync::gate::GateGuard;
55 : use utils::timeout::timeout_cancellable;
56 : use utils::timeout::TimeoutCancellableError;
57 : use utils::zstd::create_zst_tarball;
58 : use utils::zstd::extract_zst_tarball;
59 :
60 : use self::config::AttachedLocationConfig;
61 : use self::config::AttachmentMode;
62 : use self::config::LocationConf;
63 : use self::config::TenantConf;
64 : use self::metadata::TimelineMetadata;
65 : use self::mgr::GetActiveTenantError;
66 : use self::mgr::GetTenantError;
67 : use self::remote_timeline_client::upload::upload_index_part;
68 : use self::remote_timeline_client::RemoteTimelineClient;
69 : use self::timeline::uninit::TimelineCreateGuard;
70 : use self::timeline::uninit::TimelineExclusionError;
71 : use self::timeline::uninit::UninitializedTimeline;
72 : use self::timeline::EvictionTaskTenantState;
73 : use self::timeline::GcCutoffs;
74 : use self::timeline::TimelineResources;
75 : use self::timeline::WaitLsnError;
76 : use crate::config::PageServerConf;
77 : use crate::context::{DownloadBehavior, RequestContext};
78 : use crate::deletion_queue::DeletionQueueClient;
79 : use crate::deletion_queue::DeletionQueueError;
80 : use crate::import_datadir;
81 : use crate::is_uninit_mark;
82 : use crate::l0_flush::L0FlushGlobalState;
83 : use crate::metrics::TENANT;
84 : use crate::metrics::{
85 : remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
86 : TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
87 : };
88 : use crate::repository::GcResult;
89 : use crate::task_mgr;
90 : use crate::task_mgr::TaskKind;
91 : use crate::tenant::config::LocationMode;
92 : use crate::tenant::config::TenantConfOpt;
93 : pub use crate::tenant::remote_timeline_client::index::IndexPart;
94 : use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
95 : use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
96 : use crate::tenant::remote_timeline_client::INITDB_PATH;
97 : use crate::tenant::storage_layer::DeltaLayer;
98 : use crate::tenant::storage_layer::ImageLayer;
99 : use crate::walredo;
100 : use crate::InitializationOrder;
101 : use std::collections::hash_map::Entry;
102 : use std::collections::HashMap;
103 : use std::collections::HashSet;
104 : use std::fmt::Debug;
105 : use std::fmt::Display;
106 : use std::fs;
107 : use std::fs::File;
108 : use std::sync::atomic::{AtomicU64, Ordering};
109 : use std::sync::Arc;
110 : use std::sync::Mutex;
111 : use std::time::{Duration, Instant};
112 :
113 : use crate::span;
114 : use crate::tenant::timeline::delete::DeleteTimelineFlow;
115 : use crate::tenant::timeline::uninit::cleanup_timeline_directory;
116 : use crate::virtual_file::VirtualFile;
117 : use crate::walredo::PostgresRedoManager;
118 : use crate::TEMP_FILE_SUFFIX;
119 : use once_cell::sync::Lazy;
120 : pub use pageserver_api::models::TenantState;
121 : use tokio::sync::Semaphore;
122 :
// Process-wide semaphore, lazily created with 8 permits.
// NOTE(review): judging by the name this bounds concurrent initdb runs — confirm at the acquire sites.
static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
124 : use utils::{
125 : crashsafe,
126 : generation::Generation,
127 : id::TimelineId,
128 : lsn::{Lsn, RecordLsn},
129 : };
130 :
131 : pub mod blob_io;
132 : pub mod block_io;
133 : pub mod vectored_blob_io;
134 :
135 : pub mod disk_btree;
136 : pub(crate) mod ephemeral_file;
137 : pub mod layer_map;
138 :
139 : pub mod metadata;
140 : pub mod remote_timeline_client;
141 : pub mod storage_layer;
142 :
143 : pub mod checks;
144 : pub mod config;
145 : pub mod mgr;
146 : pub mod secondary;
147 : pub mod tasks;
148 : pub mod upload_queue;
149 :
150 : pub(crate) mod timeline;
151 :
152 : pub mod size;
153 :
154 : mod gc_block;
155 : pub(crate) mod throttle;
156 :
157 : pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
158 : pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
159 :
160 : // re-export for use in walreceiver
161 : pub use crate::tenant::timeline::WalReceiverInfo;
162 :
/// The "tenants" part of `tenants/<tenant>/timelines...`
pub const TENANTS_SEGMENT_NAME: &str = "tenants";

/// The "timelines" part of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
168 :
/// References to shared objects that are passed into each tenant, such
/// as the shared remote storage client and process initialization state.
///
/// The struct is `Clone`, so each tenant can receive its own copy of the handles.
#[derive(Clone)]
pub struct TenantSharedResources {
    /// Channel to the storage broker.
    pub broker_client: storage_broker::BrokerClientChannel,
    /// Shared remote storage client.
    pub remote_storage: GenericRemoteStorage,
    /// Client handle for the deletion queue.
    pub deletion_queue_client: DeletionQueueClient,
    /// Global L0 flush state.
    pub l0_flush_global_state: L0FlushGlobalState,
}
178 :
/// A [`Tenant`] is really an _attached_ tenant. The configuration
/// for an attached tenant is a subset of the [`LocationConf`], represented
/// in this struct.
pub(super) struct AttachedTenantConf {
    // Tenant-level configuration overrides, taken from `LocationConf::tenant_conf`.
    tenant_conf: TenantConfOpt,
    // The attached-mode payload of the `LocationConf` (see `AttachedTenantConf::try_from`).
    location: AttachedLocationConfig,
}
186 :
187 : impl AttachedTenantConf {
188 0 : fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
189 0 : Self {
190 0 : tenant_conf,
191 0 : location,
192 0 : }
193 0 : }
194 :
195 594 : fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
196 594 : match &location_conf.mode {
197 594 : LocationMode::Attached(attach_conf) => Ok(Self {
198 594 : tenant_conf: location_conf.tenant_conf,
199 594 : location: *attach_conf,
200 594 : }),
201 : LocationMode::Secondary(_) => {
202 0 : anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
203 : }
204 : }
205 594 : }
206 : }
// Per-timeline preload result: the timeline's remote client together with the
// outcome of fetching its index part (which may have failed with a `DownloadError`).
struct TimelinePreload {
    timeline_id: TimelineId,
    client: RemoteTimelineClient,
    index_part: Result<MaybeDeletedIndexPart, DownloadError>,
}
212 :
/// Preload results for a tenant: one [`TimelinePreload`] per timeline, keyed by timeline id.
pub(crate) struct TenantPreload {
    timelines: HashMap<TimelineId, TimelinePreload>,
}
216 :
/// Selects how promptly a spawned tenant is activated: eagerly, or lazily in the
/// background.
pub(crate) enum SpawnMode {
    /// Activate as soon as possible
    Eager,
    /// Lazy activation in the background, with the option to skip the queue if the need comes up
    Lazy,
}
225 :
///
/// Tenant consists of multiple timelines. Keep them in a hash table.
///
pub struct Tenant {
    // Global pageserver config parameters
    pub conf: &'static PageServerConf,

    /// The value creation timestamp, used to measure activation delay, see:
    /// <https://github.com/neondatabase/neon/issues/4025>
    constructed_at: Instant,

    state: watch::Sender<TenantState>,

    // Overridden tenant-specific config parameters.
    // We keep the TenantConfOpt struct here to preserve the information
    // about parameters that are not set.
    // This is necessary to allow global config updates.
    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,

    tenant_shard_id: TenantShardId,

    // The detailed sharding information, beyond the number/count in tenant_shard_id
    shard_identity: ShardIdentity,

    /// The remote storage generation, used to protect S3 objects from split-brain.
    /// Does not change over the lifetime of the [`Tenant`] object.
    ///
    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
    /// this copy enforces the invariant that generation doesn't change during a Tenant's lifetime.
    generation: Generation,

    timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,

    /// During timeline creation, we first insert the TimelineId to the
    /// creating map, then `timelines`, then remove it from the creating map.
    /// **Lock order**: if acquiring both, acquire `timelines` before `timelines_creating`
    timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,

    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
    // `timelines` mutex during all GC iteration
    // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations
    // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
    // timeout...
    gc_cs: tokio::sync::Mutex<()>,
    walredo_mgr: Option<Arc<WalRedoManager>>,

    // provides access to timeline data sitting in the remote storage
    pub(crate) remote_storage: GenericRemoteStorage,

    // Access to global deletion queue for when this tenant wants to schedule a deletion
    deletion_queue_client: DeletionQueueClient,

    /// Cached logical sizes updated on each [`Tenant::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
    cached_synthetic_tenant_size: Arc<AtomicU64>,

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

    /// Track repeated failures to compact, so that we can back off.
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
    /// background warmup.
    pub(crate) activate_now_sem: tokio::sync::Semaphore,

    // Cancellation token fires when we have entered shutdown(). This is a parent of
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,

    // Users of the Tenant such as the page service must take this Gate to avoid
    // trying to use a Tenant which is shutting down.
    pub(crate) gate: Gate,

    /// Throttle applied at the top of [`Timeline::get`].
    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
    pub(crate) timeline_get_throttle:
        Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,

    /// An ongoing timeline detach concurrency limiter.
    ///
    /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense
    /// to have two running at the same time. A different one can be started if an earlier one
    /// has failed for whatever reason.
    ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,

    /// `index_part.json` based gc blocking reason tracking.
    ///
    /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
    /// proceeding.
    pub(crate) gc_block: gc_block::GcBlock,

    l0_flush_global_state: L0FlushGlobalState,
}
322 :
323 : impl std::fmt::Debug for Tenant {
324 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
325 0 : write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
326 0 : }
327 : }
328 :
/// WAL redo manager handle held by a tenant.
///
/// `Prod` wraps the real [`PostgresRedoManager`] together with the id under which it is
/// registered in [`WALREDO_MANAGERS`]; `Test` only exists in test builds.
pub(crate) enum WalRedoManager {
    Prod(WalredoManagerId, PostgresRedoManager),
    #[cfg(test)]
    Test(harness::TestRedoManager),
}
334 :
/// Error returned by [`WalRedoManager::new`] when the global [`WALREDO_MANAGERS`] registry
/// holds `None` and therefore no longer accepts registrations.
#[derive(thiserror::Error, Debug)]
#[error("pageserver is shutting down")]
pub(crate) struct GlobalShutDown;
338 :
339 : impl WalRedoManager {
340 0 : pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
341 0 : let id = WalredoManagerId::next();
342 0 : let arc = Arc::new(Self::Prod(id, mgr));
343 0 : let mut guard = WALREDO_MANAGERS.lock().unwrap();
344 0 : match &mut *guard {
345 0 : Some(map) => {
346 0 : map.insert(id, Arc::downgrade(&arc));
347 0 : Ok(arc)
348 : }
349 0 : None => Err(GlobalShutDown),
350 : }
351 0 : }
352 : }
353 :
354 : impl Drop for WalRedoManager {
355 30 : fn drop(&mut self) {
356 30 : match self {
357 0 : Self::Prod(id, _) => {
358 0 : let mut guard = WALREDO_MANAGERS.lock().unwrap();
359 0 : if let Some(map) = &mut *guard {
360 0 : map.remove(id).expect("new() registers, drop() unregisters");
361 0 : }
362 : }
363 : #[cfg(test)]
364 30 : Self::Test(_) => {
365 30 : // Not applicable to test redo manager
366 30 : }
367 : }
368 30 : }
369 : }
370 :
/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
/// the walredo processes outside of the regular order.
///
/// This is necessary to work around a systemd bug where it freezes if there are
/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
///
/// The map starts out as `Some(empty)`. Once it is `None`, [`WalRedoManager::new`] fails
/// with [`GlobalShutDown`] and dropping a manager no longer touches the registry.
#[allow(clippy::type_complexity)]
pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
    Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
/// Process-unique identifier of a walredo manager, used as the key in [`WALREDO_MANAGERS`].
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
pub(crate) struct WalredoManagerId(u64);

impl WalredoManagerId {
    /// Hands out the next identifier from a process-wide counter that starts at 1.
    ///
    /// # Panics
    ///
    /// Panics if the counter yields 0, which can only happen after it wrapped around;
    /// from then on identifiers would no longer be unique.
    pub fn next() -> Self {
        static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
        // `fetch_add` returns the value *before* the increment, so the first id is 1.
        let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        if id == 0 {
            panic!("WalredoManagerId::next() returned 0, indicating wraparound, risking it's no longer unique");
        }
        Self(id)
    }
}
392 :
#[cfg(test)]
impl From<harness::TestRedoManager> for WalRedoManager {
    /// Wraps a test redo manager in the `Test` variant.
    fn from(test_mgr: harness::TestRedoManager) -> Self {
        WalRedoManager::Test(test_mgr)
    }
}
399 :
400 : impl WalRedoManager {
401 18 : pub(crate) async fn shutdown(&self) -> bool {
402 18 : match self {
403 0 : Self::Prod(_, mgr) => mgr.shutdown().await,
404 : #[cfg(test)]
405 : Self::Test(_) => {
406 : // Not applicable to test redo manager
407 18 : true
408 : }
409 : }
410 18 : }
411 :
412 0 : pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
413 0 : match self {
414 0 : Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
415 0 : #[cfg(test)]
416 0 : Self::Test(_) => {
417 0 : // Not applicable to test redo manager
418 0 : }
419 0 : }
420 0 : }
421 :
422 : /// # Cancel-Safety
423 : ///
424 : /// This method is cancellation-safe.
425 1038 : pub async fn request_redo(
426 1038 : &self,
427 1038 : key: crate::repository::Key,
428 1038 : lsn: Lsn,
429 1038 : base_img: Option<(Lsn, bytes::Bytes)>,
430 1038 : records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
431 1038 : pg_version: u32,
432 1038 : ) -> Result<bytes::Bytes, walredo::Error> {
433 1038 : match self {
434 0 : Self::Prod(_, mgr) => {
435 0 : mgr.request_redo(key, lsn, base_img, records, pg_version)
436 0 : .await
437 : }
438 : #[cfg(test)]
439 1038 : Self::Test(mgr) => {
440 1038 : mgr.request_redo(key, lsn, base_img, records, pg_version)
441 0 : .await
442 : }
443 : }
444 1038 : }
445 :
446 0 : pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
447 0 : match self {
448 0 : WalRedoManager::Prod(_, m) => Some(m.status()),
449 0 : #[cfg(test)]
450 0 : WalRedoManager::Test(_) => None,
451 0 : }
452 0 : }
453 : }
454 :
/// Reasons a timeline could not be returned to the caller.
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum GetTimelineError {
    #[error("Timeline is shutting down")]
    ShuttingDown,
    /// The timeline exists but is not active; `state` carries the state it was found in.
    #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
    NotActive {
        tenant_id: TenantShardId,
        timeline_id: TimelineId,
        state: TimelineState,
    },
    #[error("Timeline {tenant_id}/{timeline_id} was not found")]
    NotFound {
        tenant_id: TenantShardId,
        timeline_id: TimelineId,
    },
}
471 :
/// Failure while loading a local timeline; both variants keep the underlying error as
/// their `source`.
#[derive(Debug, thiserror::Error)]
pub enum LoadLocalTimelineError {
    #[error("FailedToLoad")]
    Load(#[source] anyhow::Error),
    #[error("FailedToResumeDeletion")]
    ResumeDeletion(#[source] anyhow::Error),
}
479 :
/// Errors from timeline deletion.
///
/// `Debug` is implemented by hand rather than derived; the manual impl omits the
/// payload of `AlreadyInProgress`.
#[derive(thiserror::Error)]
pub enum DeleteTimelineError {
    #[error("NotFound")]
    NotFound,

    // Carries a list of timeline ids — presumably the children that block the deletion.
    #[error("HasChildren")]
    HasChildren(Vec<TimelineId>),

    // Carries a handle to the deletion flow that is already running.
    #[error("Timeline deletion is already in progress")]
    AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),

    #[error(transparent)]
    Other(#[from] anyhow::Error),
}
494 :
495 : impl Debug for DeleteTimelineError {
496 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
497 0 : match self {
498 0 : Self::NotFound => write!(f, "NotFound"),
499 0 : Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
500 0 : Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
501 0 : Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
502 : }
503 0 : }
504 : }
505 :
/// Errors from changing a timeline's archival state.
///
/// `Debug` is implemented by hand rather than derived.
#[derive(thiserror::Error)]
pub enum TimelineArchivalError {
    #[error("NotFound")]
    NotFound,

    #[error("Timeout")]
    Timeout,

    // Carries the id of the archived ancestor.
    #[error("ancestor is archived: {}", .0)]
    HasArchivedParent(TimelineId),

    // Carries a list of timeline ids — presumably the children that are not archived.
    #[error("HasUnarchivedChildren")]
    HasUnarchivedChildren(Vec<TimelineId>),

    #[error("Timeline archival is already in progress")]
    AlreadyInProgress,

    #[error(transparent)]
    Other(#[from] anyhow::Error),
}
526 :
527 : impl Debug for TimelineArchivalError {
528 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
529 0 : match self {
530 0 : Self::NotFound => write!(f, "NotFound"),
531 0 : Self::Timeout => write!(f, "Timeout"),
532 0 : Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
533 0 : Self::HasUnarchivedChildren(c) => {
534 0 : f.debug_tuple("HasUnarchivedChildren").field(c).finish()
535 : }
536 0 : Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
537 0 : Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
538 : }
539 0 : }
540 : }
541 :
/// Reasons a tenant could not be moved to the stopping state.
/// NOTE(review): variant meanings below are inferred from their names — confirm at the use sites.
pub enum SetStoppingError {
    // Carries a completion barrier — presumably one the caller can wait on for the
    // stop that is already in progress.
    AlreadyStopping(completion::Barrier),
    Broken,
}
546 :
547 : impl Debug for SetStoppingError {
548 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
549 0 : match self {
550 0 : Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(),
551 0 : Self::Broken => write!(f, "Broken"),
552 : }
553 0 : }
554 : }
555 :
/// Errors from timeline creation.
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
    #[error("creation of timeline with the given ID is in progress")]
    AlreadyCreating,
    #[error("timeline already exists with different parameters")]
    Conflict,
    // Displayed transparently as the wrapped error. There is no `#[from]` here, so `?` on an
    // `anyhow::Error` converts into `Other`, never into this variant.
    #[error(transparent)]
    AncestorLsn(anyhow::Error),
    #[error("ancestor timeline is not active")]
    AncestorNotActive,
    #[error("tenant shutting down")]
    ShuttingDown,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}
571 :
// Failure modes of running an external command (initdb, going by the type name).
// `Display` is implemented by hand below instead of via `#[error(...)]` attributes.
#[derive(thiserror::Error, Debug)]
enum InitdbError {
    Other(anyhow::Error),
    Cancelled,
    // Built from a `std::io::Error` by the `From` impl, always as `Err(..)`.
    Spawn(std::io::Result<()>),
    // Exit status plus raw bytes that `Display` renders lossily as UTF-8
    // (the `Display` impl names them `stderr`).
    Failed(std::process::ExitStatus, Vec<u8>),
}
579 :
580 : impl fmt::Display for InitdbError {
581 0 : fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
582 0 : match self {
583 0 : InitdbError::Cancelled => write!(f, "Operation was cancelled"),
584 0 : InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e),
585 0 : InitdbError::Failed(status, stderr) => write!(
586 0 : f,
587 0 : "Command failed with status {:?}: {}",
588 0 : status,
589 0 : String::from_utf8_lossy(stderr)
590 0 : ),
591 0 : InitdbError::Other(e) => write!(f, "Error: {:?}", e),
592 : }
593 0 : }
594 : }
595 :
596 : impl From<std::io::Error> for InitdbError {
597 0 : fn from(error: std::io::Error) -> Self {
598 0 : InitdbError::Spawn(Err(error))
599 0 : }
600 : }
601 :
// Why a `Timeline` struct is being constructed. `timeline_init_and_sync` passes `Load`.
// NOTE(review): `Delete` is presumably used when resuming a timeline deletion — confirm.
enum CreateTimelineCause {
    Load,
    Delete,
}
606 :
607 0 : #[derive(thiserror::Error, Debug)]
608 : pub(crate) enum GcError {
609 : // The tenant is shutting down
610 : #[error("tenant shutting down")]
611 : TenantCancelled,
612 :
613 : // The tenant is shutting down
614 : #[error("timeline shutting down")]
615 : TimelineCancelled,
616 :
617 : // The tenant is in a state inelegible to run GC
618 : #[error("not active")]
619 : NotActive,
620 :
621 : // A requested GC cutoff LSN was invalid, for example it tried to move backwards
622 : #[error("not active")]
623 : BadLsn { why: String },
624 :
625 : // A remote storage error while scheduling updates after compaction
626 : #[error(transparent)]
627 : Remote(anyhow::Error),
628 :
629 : // An error reading while calculating GC cutoffs
630 : #[error(transparent)]
631 : GcCutoffs(PageReconstructError),
632 :
633 : // If GC was invoked for a particular timeline, this error means it didn't exist
634 : #[error("timeline not found")]
635 : TimelineNotFound,
636 : }
637 :
638 : impl From<PageReconstructError> for GcError {
639 0 : fn from(value: PageReconstructError) -> Self {
640 0 : match value {
641 0 : PageReconstructError::Cancelled => Self::TimelineCancelled,
642 0 : other => Self::GcCutoffs(other),
643 : }
644 0 : }
645 : }
646 :
647 : impl From<NotInitialized> for GcError {
648 0 : fn from(value: NotInitialized) -> Self {
649 0 : match value {
650 0 : NotInitialized::Uninitialized => GcError::Remote(value.into()),
651 0 : NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled,
652 : }
653 0 : }
654 : }
655 :
656 : impl From<timeline::layer_manager::Shutdown> for GcError {
657 0 : fn from(_: timeline::layer_manager::Shutdown) -> Self {
658 0 : GcError::TimelineCancelled
659 0 : }
660 : }
661 :
/// Errors from loading a configuration file.
#[derive(thiserror::Error, Debug)]
pub(crate) enum LoadConfigError {
    // Convertible from `toml_edit::de::Error` via `?` thanks to `#[from]`.
    #[error("TOML deserialization error: '{0}'")]
    DeserializeToml(#[from] toml_edit::de::Error),

    // Carries the path at which the config was expected.
    #[error("Config not found at {0}")]
    NotFound(Utf8PathBuf),
}
670 :
671 : impl Tenant {
    /// Yet another helper for timeline initialization.
    ///
    /// - Initializes the Timeline struct and inserts it into the tenant's hash map
    /// - Scans the local timeline directory for layer files and builds the layer map
    /// - Downloads remote index file and adds remote files to the layer map
    /// - Schedules remote upload tasks for any files that are present locally but missing from remote storage.
    ///
    /// If the operation fails, the timeline is left in the tenant's hash map in Broken state. On success,
    /// it is marked as Active.
    ///
    /// NOTE(review): no state transition (Broken/Active) happens in this function body itself.
    /// A failure before the map insertion returns without inserting the timeline; only the
    /// final sanity check can fail after insertion. Confirm where the states are set.
    ///
    /// # Errors
    ///
    /// Returns an error if the timeline struct cannot be created, `disk_consistent_lsn` is
    /// invalid, the upload queue cannot be initialized, the layer map fails to load, or the
    /// timeline has neither an ancestor nor any historic layer.
    ///
    /// # Panics
    ///
    /// Panics if the timeline's `disk_consistent_lsn` differs from the one in `metadata`, or
    /// if `timeline_id` is already present in the tenant's timeline map.
    #[allow(clippy::too_many_arguments)]
    async fn timeline_init_and_sync(
        &self,
        timeline_id: TimelineId,
        resources: TimelineResources,
        index_part: Option<IndexPart>,
        metadata: TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        last_aux_file_policy: Option<AuxFilePolicy>,
        _ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let tenant_id = self.tenant_shard_id;

        let timeline = self.create_timeline_struct(
            timeline_id,
            &metadata,
            ancestor.clone(),
            resources,
            CreateTimelineCause::Load,
            // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`,
            // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence.
            // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2.
            last_aux_file_policy,
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
            disk_consistent_lsn.is_valid(),
            "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
        );
        assert_eq!(
            disk_consistent_lsn,
            metadata.disk_consistent_lsn(),
            "these are used interchangeably"
        );

        if let Some(index_part) = index_part.as_ref() {
            timeline.remote_client.init_upload_queue(index_part)?;

            // When an index part is present, its aux file policy replaces the value that was
            // passed to `create_timeline_struct` above.
            timeline
                .last_aux_file_policy
                .store(index_part.last_aux_file_policy());
        } else {
            // No data on the remote storage, but we have local metadata file. We can end up
            // here with timeline_create being interrupted before finishing index part upload.
            // By doing what we do here, the index part upload is retried.
            // If control plane retries timeline creation in the meantime, the mgmt API handler
            // for timeline creation will coalesce on the upload we queue here.

            // FIXME: this branch should be dead code as we no longer write local metadata.

            timeline
                .remote_client
                .init_upload_queue_for_empty_remote(&metadata)?;
            timeline
                .remote_client
                .schedule_index_upload_for_full_metadata_update(&metadata)?;
        }

        // `index_part` is consumed here; the upload queue above only borrowed it.
        timeline
            .load_layer_map(disk_consistent_lsn, index_part)
            .await
            .with_context(|| {
                format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
            })?;

        {
            // avoiding holding it across awaits
            let mut timelines_accessor = self.timelines.lock().unwrap();
            match timelines_accessor.entry(timeline_id) {
                // We should never try and load the same timeline twice during startup
                Entry::Occupied(_) => {
                    unreachable!(
                        "Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
                    );
                }
                Entry::Vacant(v) => {
                    v.insert(Arc::clone(&timeline));
                    timeline.maybe_spawn_flush_loop();
                }
            }
        };

        // Sanity check: a timeline should have some content.
        // This runs after the timeline was inserted into the map, so a failure here
        // returns an error while the timeline stays in `self.timelines`.
        anyhow::ensure!(
            ancestor.is_some()
                || timeline
                    .layers
                    .read()
                    .await
                    .layer_map()
                    .expect("currently loading, layer manager cannot be shutdown already")
                    .iter_historic_layers()
                    .next()
                    .is_some(),
            "Timeline has no ancestor and no layer files"
        );

        Ok(())
    }
780 :
781 : /// Attach a tenant that's available in cloud storage.
782 : ///
783 : /// This returns quickly, after just creating the in-memory object
784 : /// Tenant struct and launching a background task to download
785 : /// the remote index files. On return, the tenant is most likely still in
786 : /// Attaching state, and it will become Active once the background task
787 : /// finishes. You can use wait_until_active() to wait for the task to
788 : /// complete.
789 : ///
790 : #[allow(clippy::too_many_arguments)]
791 0 : pub(crate) fn spawn(
792 0 : conf: &'static PageServerConf,
793 0 : tenant_shard_id: TenantShardId,
794 0 : resources: TenantSharedResources,
795 0 : attached_conf: AttachedTenantConf,
796 0 : shard_identity: ShardIdentity,
797 0 : init_order: Option<InitializationOrder>,
798 0 : mode: SpawnMode,
799 0 : ctx: &RequestContext,
800 0 : ) -> Result<Arc<Tenant>, GlobalShutDown> {
801 0 : let wal_redo_manager =
802 0 : WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
803 :
804 : let TenantSharedResources {
805 0 : broker_client,
806 0 : remote_storage,
807 0 : deletion_queue_client,
808 0 : l0_flush_global_state,
809 0 : } = resources;
810 0 :
811 0 : let attach_mode = attached_conf.location.attach_mode;
812 0 : let generation = attached_conf.location.generation;
813 0 :
814 0 : let tenant = Arc::new(Tenant::new(
815 0 : TenantState::Attaching,
816 0 : conf,
817 0 : attached_conf,
818 0 : shard_identity,
819 0 : Some(wal_redo_manager),
820 0 : tenant_shard_id,
821 0 : remote_storage.clone(),
822 0 : deletion_queue_client,
823 0 : l0_flush_global_state,
824 0 : ));
825 0 :
826 0 : // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
827 0 : // we shut down while attaching.
828 0 : let attach_gate_guard = tenant
829 0 : .gate
830 0 : .enter()
831 0 : .expect("We just created the Tenant: nothing else can have shut it down yet");
832 0 :
833 0 : // Do all the hard work in the background
834 0 : let tenant_clone = Arc::clone(&tenant);
835 0 : let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
836 0 : task_mgr::spawn(
837 0 : &tokio::runtime::Handle::current(),
838 0 : TaskKind::Attach,
839 0 : tenant_shard_id,
840 0 : None,
841 0 : "attach tenant",
842 0 : async move {
843 0 :
844 0 : info!(
845 : ?attach_mode,
846 0 : "Attaching tenant"
847 : );
848 :
849 0 : let _gate_guard = attach_gate_guard;
850 0 :
851 0 : // Is this tenant being spawned as part of process startup?
852 0 : let starting_up = init_order.is_some();
853 0 : scopeguard::defer! {
854 0 : if starting_up {
855 0 : TENANT.startup_complete.inc();
856 0 : }
857 0 : }
858 :
859 : // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
860 : enum BrokenVerbosity {
861 : Error,
862 : Info
863 : }
864 0 : let make_broken =
865 0 : |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
866 0 : match verbosity {
867 : BrokenVerbosity::Info => {
868 0 : info!("attach cancelled, setting tenant state to Broken: {err}");
869 : },
870 : BrokenVerbosity::Error => {
871 0 : error!("attach failed, setting tenant state to Broken: {err:?}");
872 : }
873 : }
874 0 : t.state.send_modify(|state| {
875 0 : // The Stopping case is for when we have passed control on to DeleteTenantFlow:
876 0 : // if it errors, we will call make_broken when tenant is already in Stopping.
877 0 : assert!(
878 0 : matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
879 0 : "the attach task owns the tenant state until activation is complete"
880 : );
881 :
882 0 : *state = TenantState::broken_from_reason(err.to_string());
883 0 : });
884 0 : };
885 :
886 : // TODO: should also be rejecting tenant conf changes that violate this check.
887 0 : if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
888 0 : make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
889 0 : return Ok(());
890 0 : }
891 0 :
892 0 : let mut init_order = init_order;
893 0 : // take the completion because initial tenant loading will complete when all of
894 0 : // these tasks complete.
895 0 : let _completion = init_order
896 0 : .as_mut()
897 0 : .and_then(|x| x.initial_tenant_load.take());
898 0 : let remote_load_completion = init_order
899 0 : .as_mut()
900 0 : .and_then(|x| x.initial_tenant_load_remote.take());
901 :
902 : enum AttachType<'a> {
903 : /// We are attaching this tenant lazily in the background.
904 : Warmup {
905 : _permit: tokio::sync::SemaphorePermit<'a>,
906 : during_startup: bool
907 : },
908 : /// We are attaching this tenant as soon as we can, because for example an
909 : /// endpoint tried to access it.
910 : OnDemand,
911 : /// During normal operations after startup, we are attaching a tenant, and
912 : /// eager attach was requested.
913 : Normal,
914 : }
915 :
916 0 : let attach_type = if matches!(mode, SpawnMode::Lazy) {
917 : // Before doing any I/O, wait for at least one of:
918 : // - A client attempting to access this tenant (on-demand loading)
919 : // - A permit becoming available in the warmup semaphore (background warmup)
920 :
921 0 : tokio::select!(
922 0 : permit = tenant_clone.activate_now_sem.acquire() => {
923 0 : let _ = permit.expect("activate_now_sem is never closed");
924 0 : tracing::info!("Activating tenant (on-demand)");
925 0 : AttachType::OnDemand
926 : },
927 0 : permit = conf.concurrent_tenant_warmup.inner().acquire() => {
928 0 : let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
929 0 : tracing::info!("Activating tenant (warmup)");
930 0 : AttachType::Warmup {
931 0 : _permit,
932 0 : during_startup: init_order.is_some()
933 0 : }
934 : }
935 0 : _ = tenant_clone.cancel.cancelled() => {
936 : // This is safe, but should be pretty rare: it is interesting if a tenant
937 : // stayed in Activating for such a long time that shutdown found it in
938 : // that state.
939 0 : tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
940 : // Make the tenant broken so that set_stopping will not hang waiting for it to leave
941 : // the Attaching state. This is an over-reaction (nothing really broke, the tenant is
942 : // just shutting down), but ensures progress.
943 0 : make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
944 0 : return Ok(());
945 : },
946 : )
947 : } else {
948 : // SpawnMode::{Create,Eager} always cause jumping ahead of the
949 : // concurrent_tenant_warmup queue
950 0 : AttachType::Normal
951 : };
952 :
953 0 : let preload = match &mode {
954 : SpawnMode::Eager | SpawnMode::Lazy => {
955 0 : let _preload_timer = TENANT.preload.start_timer();
956 0 : let res = tenant_clone
957 0 : .preload(&remote_storage, task_mgr::shutdown_token())
958 0 : .await;
959 0 : match res {
960 0 : Ok(p) => Some(p),
961 0 : Err(e) => {
962 0 : make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
963 0 : return Ok(());
964 : }
965 : }
966 : }
967 :
968 : };
969 :
970 : // Remote preload is complete.
971 0 : drop(remote_load_completion);
972 :
973 : // We will time the duration of the attach phase unless this is a creation (attach will do no work)
974 0 : let attached = {
975 0 : let _attach_timer = Some(TENANT.attach.start_timer());
976 0 : tenant_clone.attach(preload, &ctx).await
977 : };
978 :
979 0 : match attached {
980 : Ok(()) => {
981 0 : info!("attach finished, activating");
982 0 : tenant_clone.activate(broker_client, None, &ctx);
983 : }
984 0 : Err(e) => {
985 0 : make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
986 0 : }
987 : }
988 :
989 : // If we are doing an opportunistic warmup attachment at startup, initialize
990 : // logical size at the same time. This is better than starting a bunch of idle tenants
991 : // with cold caches and then coming back later to initialize their logical sizes.
992 : //
993 : // It also prevents the warmup process competing with the concurrency limit on
994 : // logical size calculations: if logical size calculation semaphore is saturated,
995 : // then warmup will wait for that before proceeding to the next tenant.
996 0 : if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
997 0 : let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
998 0 : tracing::info!("Waiting for initial logical sizes while warming up...");
999 0 : while futs.next().await.is_some() {}
1000 0 : tracing::info!("Warm-up complete");
1001 0 : }
1002 :
1003 0 : Ok(())
1004 0 : }
1005 0 : .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
1006 : );
1007 0 : Ok(tenant)
1008 0 : }
1009 :
1010 570 : #[instrument(skip_all)]
1011 : pub(crate) async fn preload(
1012 : self: &Arc<Self>,
1013 : remote_storage: &GenericRemoteStorage,
1014 : cancel: CancellationToken,
1015 : ) -> anyhow::Result<TenantPreload> {
1016 : span::debug_assert_current_span_has_tenant_id();
1017 : // Get list of remote timelines
1018 : // download index files for every tenant timeline
1019 : info!("listing remote timelines");
1020 : let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines(
1021 : remote_storage,
1022 : self.tenant_shard_id,
1023 : cancel.clone(),
1024 : )
1025 : .await?;
1026 :
1027 : info!("found {} timelines", remote_timeline_ids.len(),);
1028 :
1029 : for k in other_keys {
1030 : warn!("Unexpected non timeline key {k}");
1031 : }
1032 :
1033 : Ok(TenantPreload {
1034 : timelines: self
1035 : .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
1036 : .await?,
1037 : })
1038 : }
1039 :
1040 : ///
1041 : /// Attach step: builds this tenant's in-memory timelines from the remote state in `preload`.
/// This function does not itself change the tenant state.
1042 : ///
1043 : /// No background tasks are started as part of this routine.
1044 : ///
/// Steps, in order:
/// 1. Classify each preloaded timeline: index_part present (live), index_part marked
///    deleted (deletion will be resumed), index_part not found (skipped), or download
///    error (skipped, but its local directory is kept).
/// 2. Load the live timelines in the order produced by `tree_sort_timelines`, recording
///    any gc blocking reasons found in their index parts.
/// 3. Resume the pending deletions.
/// 4. Purge local timeline directories via `clean_up_timelines` and publish the gc blocks.
///
/// # Errors
///
/// Fails if `preload` is `None`, if an index_part has a gc blocking entry with no
/// reasons, if any live timeline fails to load, if resuming a deletion fails, or if
/// local cleanup fails.
1045 570 : async fn attach(
1046 570 : self: &Arc<Tenant>,
1047 570 : preload: Option<TenantPreload>,
1048 570 : ctx: &RequestContext,
1049 570 : ) -> anyhow::Result<()> {
1050 570 : span::debug_assert_current_span_has_tenant_id();
1051 570 :
1052 570 : failpoint_support::sleep_millis_async!("before-attaching-tenant");
1053 :
// Without preloaded remote state there is nothing to attach from.
1054 570 : let Some(preload) = preload else {
1055 0 : anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
1056 : };
1057 :
// Timelines whose index_part is marked deleted; handled after the live ones are loaded.
1058 570 : let mut timelines_to_resume_deletions = vec![];
1059 570 :
// Live timelines: (index_part, remote client) keyed by timeline id.
1060 570 : let mut remote_index_and_client = HashMap::new();
// Metadata of each live timeline, used below to sort timelines by ancestry.
1061 570 : let mut timeline_ancestors = HashMap::new();
// Ids whose local directories must be kept by clean_up_timelines.
1062 570 : let mut existent_timelines = HashSet::new();
1063 588 : for (timeline_id, preload) in preload.timelines {
1064 18 : let index_part = match preload.index_part {
1065 18 : Ok(i) => {
1066 18 : debug!("remote index part exists for timeline {timeline_id}");
1067 : // We found index_part on the remote, this is the standard case.
1068 18 : existent_timelines.insert(timeline_id);
1069 18 : i
1070 : }
1071 : Err(DownloadError::NotFound) => {
1072 : // There is no index_part on the remote. We only get here
1073 : // if there is some prefix for the timeline in the remote storage.
1074 : // This can e.g. be the initdb.tar.zst archive, maybe a
1075 : // remnant from a prior incomplete creation or deletion attempt.
1076 : // Delete the local directory as the deciding criterion for a
1077 : // timeline's existence is presence of index_part.
// The deletion itself happens later: the id is not added to `existent_timelines`,
// so clean_up_timelines purges its directory.
1078 0 : info!(%timeline_id, "index_part not found on remote");
1079 0 : continue;
1080 : }
1081 0 : Err(e) => {
1082 0 : // Some (possibly ephemeral) error happened during index_part download.
1083 0 : // Pretend the timeline exists to not delete the timeline directory,
1084 0 : // as it might be a temporary issue and we don't want to re-download
1085 0 : // everything after it resolves.
1086 0 : warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
1087 :
1088 0 : existent_timelines.insert(timeline_id);
1089 0 : continue;
1090 : }
1091 : };
1092 18 : match index_part {
1093 18 : MaybeDeletedIndexPart::IndexPart(index_part) => {
1094 18 : timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
1095 18 : remote_index_and_client.insert(timeline_id, (index_part, preload.client));
1096 18 : }
1097 0 : MaybeDeletedIndexPart::Deleted(index_part) => {
1098 0 : info!(
1099 0 : "timeline {} is deleted, picking to resume deletion",
1100 : timeline_id
1101 : );
1102 0 : timelines_to_resume_deletions.push((timeline_id, index_part, preload.client));
1103 : }
1104 : }
1105 : }
1106 :
// gc blocking reasons per timeline, collected from index parts and published at the end.
1107 570 : let mut gc_blocks = HashMap::new();
1108 :
1109 : // For every timeline, download the metadata file, scan the local directory,
1110 : // and build a layer map that contains an entry for each remote and local
1111 : // layer file.
// NOTE(review): tree_sort_timelines presumably yields ancestors before their descendants;
// load_remote_timeline looks the ancestor up in self.timelines, so that order is required.
1112 570 : let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
1113 588 : for (timeline_id, remote_metadata) in sorted_timelines {
1114 18 : let (index_part, remote_client) = remote_index_and_client
1115 18 : .remove(&timeline_id)
1116 18 : .expect("just put it in above");
1117 :
1118 18 : if let Some(blocking) = index_part.gc_blocking.as_ref() {
1119 : // could just filter these away, but it helps while testing
1120 0 : anyhow::ensure!(
1121 0 : !blocking.reasons.is_empty(),
1122 0 : "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
1123 : );
1124 0 : let prev = gc_blocks.insert(timeline_id, blocking.reasons);
1125 0 : assert!(prev.is_none());
1126 18 : }
1127 :
1128 : // TODO again handle early failure
1129 18 : self.load_remote_timeline(
1130 18 : timeline_id,
1131 18 : index_part,
1132 18 : remote_metadata,
1133 18 : TimelineResources {
1134 18 : remote_client,
1135 18 : timeline_get_throttle: self.timeline_get_throttle.clone(),
1136 18 : l0_flush_global_state: self.l0_flush_global_state.clone(),
1137 18 : },
1138 18 : ctx,
1139 18 : )
1140 32 : .await
1141 18 : .with_context(|| {
1142 0 : format!(
1143 0 : "failed to load remote timeline {} for tenant {}",
1144 0 : timeline_id, self.tenant_shard_id
1145 0 : )
1146 18 : })?;
1147 : }
1148 :
1149 : // Walk through deleted timelines, resume deletion
1150 570 : for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
1151 0 : remote_timeline_client
1152 0 : .init_upload_queue_stopped_to_continue_deletion(&index_part)
1153 0 : .context("init queue stopped")
1154 0 : .map_err(LoadLocalTimelineError::ResumeDeletion)?;
1155 :
1156 0 : DeleteTimelineFlow::resume_deletion(
1157 0 : Arc::clone(self),
1158 0 : timeline_id,
1159 0 : &index_part.metadata,
1160 0 : remote_timeline_client,
1161 0 : )
1162 0 : .instrument(tracing::info_span!("timeline_delete", %timeline_id))
1163 0 : .await
1164 0 : .context("resume_deletion")
1165 0 : .map_err(LoadLocalTimelineError::ResumeDeletion)?;
1166 : }
1167 :
1168 : // The local filesystem contents are a cache of what's in the remote IndexPart;
1169 : // IndexPart is the source of truth.
1170 570 : self.clean_up_timelines(&existent_timelines)?;
1171 :
1172 570 : self.gc_block.set_scanned(gc_blocks);
1173 570 :
// Test hooks: an injectable failure and an injectable (cancellable) sleep just before returning.
1174 570 : fail::fail_point!("attach-before-activate", |_| {
1175 0 : anyhow::bail!("attach-before-activate");
1176 570 : });
1177 570 : failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
1178 :
1179 570 : info!("Done");
1180 :
1181 570 : Ok(())
1182 570 : }
1183 :
1184 : /// Check for any local timeline directories that are temporary, or do not correspond to a
1185 : /// timeline that still exists: this can happen if we crashed during a deletion/creation, or
1186 : /// if a timeline was deleted while the tenant was attached to a different pageserver.
1187 570 : fn clean_up_timelines(&self, existent_timelines: &HashSet<TimelineId>) -> anyhow::Result<()> {
1188 570 : let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id);
1189 :
1190 570 : let entries = match timelines_dir.read_dir_utf8() {
1191 570 : Ok(d) => d,
1192 0 : Err(e) => {
1193 0 : if e.kind() == std::io::ErrorKind::NotFound {
1194 0 : return Ok(());
1195 : } else {
1196 0 : return Err(e).context("list timelines directory for tenant");
1197 : }
1198 : }
1199 : };
1200 :
1201 594 : for entry in entries {
1202 24 : let entry = entry.context("read timeline dir entry")?;
1203 24 : let entry_path = entry.path();
1204 :
1205 24 : let purge = if crate::is_temporary(entry_path)
1206 : // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
1207 24 : || is_uninit_mark(entry_path)
1208 24 : || crate::is_delete_mark(entry_path)
1209 : {
1210 0 : true
1211 : } else {
1212 24 : match TimelineId::try_from(entry_path.file_name()) {
1213 24 : Ok(i) => {
1214 24 : // Purge if the timeline ID does not exist in remote storage: remote storage is the authority.
1215 24 : !existent_timelines.contains(&i)
1216 : }
1217 0 : Err(e) => {
1218 0 : tracing::warn!(
1219 0 : "Unparseable directory in timelines directory: {entry_path}, ignoring ({e})"
1220 : );
1221 : // Do not purge junk: if we don't recognize it, be cautious and leave it for a human.
1222 0 : false
1223 : }
1224 : }
1225 : };
1226 :
1227 24 : if purge {
1228 6 : tracing::info!("Purging stale timeline dentry {entry_path}");
1229 6 : if let Err(e) = match entry.file_type() {
1230 6 : Ok(t) => if t.is_dir() {
1231 6 : std::fs::remove_dir_all(entry_path)
1232 : } else {
1233 0 : std::fs::remove_file(entry_path)
1234 : }
1235 6 : .or_else(fs_ext::ignore_not_found),
1236 0 : Err(e) => Err(e),
1237 : } {
1238 0 : tracing::warn!("Failed to purge stale timeline dentry {entry_path}: {e}");
1239 6 : }
1240 18 : }
1241 : }
1242 :
1243 570 : Ok(())
1244 570 : }
1245 :
1246 : /// Get sum of all remote timelines sizes
1247 : ///
1248 : /// This function relies on the index_part instead of listing the remote storage
1249 0 : pub fn remote_size(&self) -> u64 {
1250 0 : let mut size = 0;
1251 :
1252 0 : for timeline in self.list_timelines() {
1253 0 : size += timeline.remote_client.get_remote_physical_size();
1254 0 : }
1255 :
1256 0 : size
1257 0 : }
1258 :
1259 18 : #[instrument(skip_all, fields(timeline_id=%timeline_id))]
1260 : async fn load_remote_timeline(
1261 : &self,
1262 : timeline_id: TimelineId,
1263 : index_part: IndexPart,
1264 : remote_metadata: TimelineMetadata,
1265 : resources: TimelineResources,
1266 : ctx: &RequestContext,
1267 : ) -> anyhow::Result<()> {
1268 : span::debug_assert_current_span_has_tenant_id();
1269 :
1270 : info!("downloading index file for timeline {}", timeline_id);
1271 : tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_shard_id, &timeline_id))
1272 : .await
1273 : .context("Failed to create new timeline directory")?;
1274 :
1275 : let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
1276 : let timelines = self.timelines.lock().unwrap();
1277 : Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
1278 0 : || {
1279 0 : anyhow::anyhow!(
1280 0 : "cannot find ancestor timeline {ancestor_id} for timeline {timeline_id}"
1281 0 : )
1282 0 : },
1283 : )?))
1284 : } else {
1285 : None
1286 : };
1287 :
1288 : let last_aux_file_policy = index_part.last_aux_file_policy();
1289 :
1290 : self.timeline_init_and_sync(
1291 : timeline_id,
1292 : resources,
1293 : Some(index_part),
1294 : remote_metadata,
1295 : ancestor,
1296 : last_aux_file_policy,
1297 : ctx,
1298 : )
1299 : .await
1300 : }
1301 :
1302 570 : async fn load_timelines_metadata(
1303 570 : self: &Arc<Tenant>,
1304 570 : timeline_ids: HashSet<TimelineId>,
1305 570 : remote_storage: &GenericRemoteStorage,
1306 570 : cancel: CancellationToken,
1307 570 : ) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
1308 570 : let mut part_downloads = JoinSet::new();
1309 588 : for timeline_id in timeline_ids {
1310 18 : let cancel_clone = cancel.clone();
1311 18 : part_downloads.spawn(
1312 18 : self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
1313 18 : .instrument(info_span!("download_index_part", %timeline_id)),
1314 : );
1315 : }
1316 :
1317 570 : let mut timeline_preloads: HashMap<TimelineId, TimelinePreload> = HashMap::new();
1318 :
1319 : loop {
1320 588 : tokio::select!(
1321 588 : next = part_downloads.join_next() => {
1322 588 : match next {
1323 18 : Some(result) => {
1324 18 : let preload = result.context("join preload task")?;
1325 18 : timeline_preloads.insert(preload.timeline_id, preload);
1326 : },
1327 : None => {
1328 570 : break;
1329 : }
1330 : }
1331 : },
1332 588 : _ = cancel.cancelled() => {
1333 0 : anyhow::bail!("Cancelled while waiting for remote index download")
1334 : }
1335 : )
1336 : }
1337 :
1338 570 : Ok(timeline_preloads)
1339 570 : }
1340 :
1341 18 : fn load_timeline_metadata(
1342 18 : self: &Arc<Tenant>,
1343 18 : timeline_id: TimelineId,
1344 18 : remote_storage: GenericRemoteStorage,
1345 18 : cancel: CancellationToken,
1346 18 : ) -> impl Future<Output = TimelinePreload> {
1347 18 : let client = RemoteTimelineClient::new(
1348 18 : remote_storage.clone(),
1349 18 : self.deletion_queue_client.clone(),
1350 18 : self.conf,
1351 18 : self.tenant_shard_id,
1352 18 : timeline_id,
1353 18 : self.generation,
1354 18 : );
1355 18 : async move {
1356 18 : debug_assert_current_span_has_tenant_and_timeline_id();
1357 18 : debug!("starting index part download");
1358 :
1359 67 : let index_part = client.download_index_file(&cancel).await;
1360 :
1361 18 : debug!("finished index part download");
1362 :
1363 18 : TimelinePreload {
1364 18 : client,
1365 18 : timeline_id,
1366 18 : index_part,
1367 18 : }
1368 18 : }
1369 18 : }
1370 :
1371 0 : pub(crate) async fn apply_timeline_archival_config(
1372 0 : &self,
1373 0 : timeline_id: TimelineId,
1374 0 : state: TimelineArchivalState,
1375 0 : ) -> Result<(), TimelineArchivalError> {
1376 0 : info!("setting timeline archival config");
1377 0 : let timeline = {
1378 0 : let timelines = self.timelines.lock().unwrap();
1379 :
1380 0 : let Some(timeline) = timelines.get(&timeline_id) else {
1381 0 : return Err(TimelineArchivalError::NotFound);
1382 : };
1383 :
1384 0 : if state == TimelineArchivalState::Unarchived {
1385 0 : if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
1386 0 : if ancestor_timeline.is_archived() == Some(true) {
1387 0 : return Err(TimelineArchivalError::HasArchivedParent(
1388 0 : ancestor_timeline.timeline_id,
1389 0 : ));
1390 0 : }
1391 0 : }
1392 0 : }
1393 :
1394 : // Ensure that there are no non-archived child timelines
1395 0 : let children: Vec<TimelineId> = timelines
1396 0 : .iter()
1397 0 : .filter_map(|(id, entry)| {
1398 0 : if entry.get_ancestor_timeline_id() != Some(timeline_id) {
1399 0 : return None;
1400 0 : }
1401 0 : if entry.is_archived() == Some(true) {
1402 0 : return None;
1403 0 : }
1404 0 : Some(*id)
1405 0 : })
1406 0 : .collect();
1407 0 :
1408 0 : if !children.is_empty() && state == TimelineArchivalState::Archived {
1409 0 : return Err(TimelineArchivalError::HasUnarchivedChildren(children));
1410 0 : }
1411 0 : Arc::clone(timeline)
1412 : };
1413 :
1414 0 : let upload_needed = timeline
1415 0 : .remote_client
1416 0 : .schedule_index_upload_for_timeline_archival_state(state)?;
1417 :
1418 0 : if upload_needed {
1419 0 : info!("Uploading new state");
1420 : const MAX_WAIT: Duration = Duration::from_secs(10);
1421 0 : let Ok(v) =
1422 0 : tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
1423 : else {
1424 0 : tracing::warn!("reached timeout for waiting on upload queue");
1425 0 : return Err(TimelineArchivalError::Timeout);
1426 : };
1427 0 : v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
1428 0 : }
1429 0 : Ok(())
1430 0 : }
1431 :
/// Returns this tenant's `tenant_shard_id` field by value.
1432 12 : pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
1433 12 : self.tenant_shard_id
1434 12 : }
1435 :
1436 : /// Get Timeline handle for given Neon timeline ID.
1437 : /// This function is idempotent. It doesn't change internal state in any way.
1438 666 : pub fn get_timeline(
1439 666 : &self,
1440 666 : timeline_id: TimelineId,
1441 666 : active_only: bool,
1442 666 : ) -> Result<Arc<Timeline>, GetTimelineError> {
1443 666 : let timelines_accessor = self.timelines.lock().unwrap();
1444 666 : let timeline = timelines_accessor
1445 666 : .get(&timeline_id)
1446 666 : .ok_or(GetTimelineError::NotFound {
1447 666 : tenant_id: self.tenant_shard_id,
1448 666 : timeline_id,
1449 666 : })?;
1450 :
1451 660 : if active_only && !timeline.is_active() {
1452 0 : Err(GetTimelineError::NotActive {
1453 0 : tenant_id: self.tenant_shard_id,
1454 0 : timeline_id,
1455 0 : state: timeline.current_state(),
1456 0 : })
1457 : } else {
1458 660 : Ok(Arc::clone(timeline))
1459 : }
1460 666 : }
1461 :
1462 : /// Lists timelines the tenant contains.
1463 : /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use.
1464 24 : pub fn list_timelines(&self) -> Vec<Arc<Timeline>> {
1465 24 : self.timelines
1466 24 : .lock()
1467 24 : .unwrap()
1468 24 : .values()
1469 24 : .map(Arc::clone)
1470 24 : .collect()
1471 24 : }
1472 :
1473 0 : pub fn list_timeline_ids(&self) -> Vec<TimelineId> {
1474 0 : self.timelines.lock().unwrap().keys().cloned().collect()
1475 0 : }
1476 :
1477 : /// This is used to create the initial 'main' timeline during bootstrapping,
1478 : /// or when importing a new base backup. The caller is expected to load an
1479 : /// initial image of the datadir to the new timeline after this.
1480 : ///
1481 : /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
1482 : /// and the timeline will fail to load at a restart.
1483 : ///
1484 : /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
1485 : /// minimum amount of keys required to get a writable timeline.
1486 : /// (Without it, `put` might fail due to `repartition` failing.)
1487 546 : pub(crate) async fn create_empty_timeline(
1488 546 : &self,
1489 546 : new_timeline_id: TimelineId,
1490 546 : initdb_lsn: Lsn,
1491 546 : pg_version: u32,
1492 546 : _ctx: &RequestContext,
1493 546 : ) -> anyhow::Result<UninitializedTimeline> {
1494 546 : anyhow::ensure!(
1495 546 : self.is_active(),
1496 0 : "Cannot create empty timelines on inactive tenant"
1497 : );
1498 :
1499 : // Protect against concurrent attempts to use this TimelineId
1500 546 : let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
1501 :
1502 540 : let new_metadata = TimelineMetadata::new(
1503 540 : // Initialize disk_consistent LSN to 0, The caller must import some data to
1504 540 : // make it valid, before calling finish_creation()
1505 540 : Lsn(0),
1506 540 : None,
1507 540 : None,
1508 540 : Lsn(0),
1509 540 : initdb_lsn,
1510 540 : initdb_lsn,
1511 540 : pg_version,
1512 540 : );
1513 540 : self.prepare_new_timeline(
1514 540 : new_timeline_id,
1515 540 : &new_metadata,
1516 540 : create_guard,
1517 540 : initdb_lsn,
1518 540 : None,
1519 540 : None,
1520 540 : )
1521 0 : .await
1522 546 : }
1523 :
/// Unit-test helper: create an empty timeline that is usable right away.
///
/// The timeline has state value `Active`, but its background loops are not running.
// Setting the state makes the various functions which anyhow::ensure! for Active state
// work in tests. Our current tests don't need the background loops.
#[cfg(test)]
pub async fn create_test_timeline(
    &self,
    new_timeline_id: TimelineId,
    initdb_lsn: Lsn,
    pg_version: u32,
    ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
    let uninit = self
        .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
        .await?;
    let raw = uninit.raw_timeline().expect("we just created it");
    assert_eq!(raw.get_last_record_lsn(), Lsn(0));

    // Write the minimum set of keys the timeline needs in order to be usable.
    let mut init_keys = raw.begin_modification(initdb_lsn);
    init_keys
        .init_empty_test_timeline()
        .context("init_empty_test_timeline")?;
    init_keys
        .commit(ctx)
        .await
        .context("commit init_empty_test_timeline modification")?;

    // Flush to disk so that the uninitialized timeline's check for a valid
    // disk_consistent_lsn passes.
    raw.maybe_spawn_flush_loop();
    raw.freeze_and_flush().await.context("freeze_and_flush")?;

    // Make sure the freeze_and_flush reaches remote storage.
    raw.remote_client.wait_completion().await.unwrap();

    let timeline = uninit.finish_creation()?;
    // The non-test code would call activate() on the timeline here.
    timeline.set_state(TimelineState::Active);
    Ok(timeline)
}
1565 :
/// Unit-test helper: create a timeline pre-populated with the given layers.
///
/// Creates a test timeline, force-advances its LSN to `end_lsn`, force-creates one delta
/// layer per entry of `delta_layer_desc` and one image layer per entry of
/// `image_layer_desc`, and finally validates the resulting layer map.
///
/// # Errors
///
/// Fails if timeline or layer creation fails, or if `check_valid_layermap` reports a
/// problem with the resulting set of layers.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn create_test_timeline_with_layers(
    &self,
    new_timeline_id: TimelineId,
    initdb_lsn: Lsn,
    pg_version: u32,
    ctx: &RequestContext,
    delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
    image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
    end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
    use checks::check_valid_layermap;
    use itertools::Itertools;

    let tline = self
        .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
        .await?;
    tline.force_advance_lsn(end_lsn);

    for delta_desc in delta_layer_desc {
        tline
            .force_create_delta_layer(delta_desc, Some(initdb_lsn), ctx)
            .await?;
    }

    for (image_lsn, image_keys) in image_layer_desc {
        tline
            .force_create_image_layer(image_lsn, image_keys, Some(initdb_lsn), ctx)
            .await?;
    }

    // The read guard on the layers is a temporary of this statement only, so it is
    // released before the validation below.
    let layer_names = tline
        .layers
        .read()
        .await
        .layer_map()
        .unwrap()
        .iter_historic_layers()
        .map(|historic| historic.layer_name())
        .collect_vec();

    match check_valid_layermap(&layer_names) {
        Some(err) => bail!("invalid layermap: {err}"),
        None => Ok(tline),
    }
}
1610 :
1611 : /// Create a new timeline, either branched from an ancestor or bootstrapped.
1612 : ///
1613 : /// Returns a reference to the Timeline object; a newly created timeline is activated first.
1614 : ///
1615 : /// If a timeline with `new_timeline_id` already exists, the call is idempotent: the existing
1616 : /// timeline is returned if the parameters match, otherwise CreateTimelineError::Conflict.
///
/// In both the "created" and the "already exists" case, `Ok` is only returned after the
/// timeline's pending uploads have completed.
///
/// # Errors
///
/// - `ShuttingDown` if the tenant is stopping, its gate is closed, or waiting for the
///   ancestor LSN was interrupted by shutdown.
/// - `AlreadyCreating` if another creation of the same timeline ID is in progress.
/// - `Conflict` if the timeline exists with a different ancestor, pg_version or ancestor LSN.
/// - `AncestorNotActive` if the ancestor timeline is not active.
/// - `AncestorLsn` if `ancestor_start_lsn` is below the ancestor's own ancestor LSN, or
///   waiting for that LSN on the ancestor timed out / hit a bad state.
/// - `Other` if the tenant is otherwise inactive; further errors are propagated via `?`.
1617 : #[allow(clippy::too_many_arguments)]
1618 0 : pub(crate) async fn create_timeline(
1619 0 : self: &Arc<Tenant>,
1620 0 : new_timeline_id: TimelineId,
1621 0 : ancestor_timeline_id: Option<TimelineId>,
1622 0 : mut ancestor_start_lsn: Option<Lsn>,
1623 0 : pg_version: u32,
1624 0 : load_existing_initdb: Option<TimelineId>,
1625 0 : broker_client: storage_broker::BrokerClientChannel,
1626 0 : ctx: &RequestContext,
1627 0 : ) -> Result<Arc<Timeline>, CreateTimelineError> {
// Only an active tenant may create timelines; a stopping tenant gets a dedicated error.
1628 0 : if !self.is_active() {
1629 0 : if matches!(self.current_state(), TenantState::Stopping { .. }) {
1630 0 : return Err(CreateTimelineError::ShuttingDown);
1631 : } else {
1632 0 : return Err(CreateTimelineError::Other(anyhow::anyhow!(
1633 0 : "Cannot create timelines on inactive tenant"
1634 0 : )));
1635 : }
1636 0 : }
1637 :
// Hold the tenant gate for the duration of the creation.
1638 0 : let _gate = self
1639 0 : .gate
1640 0 : .enter()
1641 0 : .map_err(|_| CreateTimelineError::ShuttingDown)?;
1642 :
1643 : // Get exclusive access to the timeline ID: this ensures that it does not already exist,
1644 : // and that no other creation attempts will be allowed in while we are working.
1645 0 : let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
1646 0 : Ok(m) => m,
1647 : Err(TimelineExclusionError::AlreadyCreating) => {
1648 : // Creation is in progress, we cannot create it again, and we cannot
1649 : // check if this request matches the existing one, so caller must try
1650 : // again later.
1651 0 : return Err(CreateTimelineError::AlreadyCreating);
1652 : }
1653 0 : Err(TimelineExclusionError::Other(e)) => {
1654 0 : return Err(CreateTimelineError::Other(e));
1655 : }
1656 0 : Err(TimelineExclusionError::AlreadyExists(existing)) => {
1657 0 : debug!("timeline {new_timeline_id} already exists");
1658 :
1659 : // Idempotency: creating the same timeline twice is not an error, unless
1660 : // the second creation has different parameters.
// The ancestor LSN is only compared when the caller supplied one.
1661 0 : if existing.get_ancestor_timeline_id() != ancestor_timeline_id
1662 0 : || existing.pg_version != pg_version
1663 0 : || (ancestor_start_lsn.is_some()
1664 0 : && ancestor_start_lsn != Some(existing.get_ancestor_lsn()))
1665 : {
1666 0 : return Err(CreateTimelineError::Conflict);
1667 0 : }
1668 0 :
1669 0 : // Wait for uploads to complete, so that when we return Ok, the timeline
1670 0 : // is known to be durable on remote storage. Just like we do at the end of
1671 0 : // this function, after we have created the timeline ourselves.
1672 0 : //
1673 0 : // We only really care that the initial version of `index_part.json` has
1674 0 : // been uploaded. That's enough to remember that the timeline
1675 0 : // exists. However, there is no function to wait specifically for that so
1676 0 : // we just wait for all in-progress uploads to finish.
1677 0 : existing
1678 0 : .remote_client
1679 0 : .wait_completion()
1680 0 : .await
1681 0 : .context("wait for timeline uploads to complete")?;
1682 :
1683 0 : return Ok(existing);
1684 : }
1685 : };
1686 :
1687 0 : pausable_failpoint!("timeline-creation-after-uninit");
1688 :
// With an ancestor id the timeline is branched from it; without one it is bootstrapped.
1689 0 : let loaded_timeline = match ancestor_timeline_id {
1690 0 : Some(ancestor_timeline_id) => {
1691 0 : let ancestor_timeline = self
1692 0 : .get_timeline(ancestor_timeline_id, false)
1693 0 : .context("Cannot branch off the timeline that's not present in pageserver")?;
1694 :
1695 : // instead of waiting around, just deny the request because ancestor is not yet
1696 : // ready for other purposes either.
1697 0 : if !ancestor_timeline.is_active() {
1698 0 : return Err(CreateTimelineError::AncestorNotActive);
1699 0 : }
1700 :
1701 0 : if let Some(lsn) = ancestor_start_lsn.as_mut() {
// The requested branch point is aligned in place, so the aligned value is what
// gets validated, waited for, and passed to branch_timeline below.
1702 0 : *lsn = lsn.align();
1703 0 :
1704 0 : let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
1705 0 : if ancestor_ancestor_lsn > *lsn {
1706 : // can we safely just branch from the ancestor instead?
1707 0 : return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
1708 0 : "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
1709 0 : lsn,
1710 0 : ancestor_timeline_id,
1711 0 : ancestor_ancestor_lsn,
1712 0 : )));
1713 0 : }
1714 0 :
1715 0 : // Wait for the WAL to arrive and be processed on the parent branch up
1716 0 : // to the requested branch point. The repository code itself doesn't
1717 0 : // require it, but if we start to receive WAL on the new timeline,
1718 0 : // decoding the new WAL might need to look up previous pages, relation
1719 0 : // sizes etc. and that would get confused if the previous page versions
1720 0 : // are not in the repository yet.
1721 0 : ancestor_timeline
1722 0 : .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
1723 0 : .await
1724 0 : .map_err(|e| match e {
1725 0 : e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
1726 0 : CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
1727 : }
1728 0 : WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
1729 0 : })?;
1730 0 : }
1731 :
1732 0 : self.branch_timeline(
1733 0 : &ancestor_timeline,
1734 0 : new_timeline_id,
1735 0 : ancestor_start_lsn,
1736 0 : create_guard,
1737 0 : ctx,
1738 0 : )
1739 0 : .await?
1740 : }
1741 : None => {
1742 0 : self.bootstrap_timeline(
1743 0 : new_timeline_id,
1744 0 : pg_version,
1745 0 : load_existing_initdb,
1746 0 : create_guard,
1747 0 : ctx,
1748 0 : )
1749 0 : .await?
1750 : }
1751 : };
1752 :
1753 : // At this point we have dropped our guard on [`Self::timelines_creating`], and
1754 : // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
1755 : // not send a success to the caller until it is. The same applies to handling retries,
1756 : // see the handling of [`TimelineExclusionError::AlreadyExists`] above.
// `kind` is only used to word the error context of the wait below.
1757 0 : let kind = ancestor_timeline_id
1758 0 : .map(|_| "branched")
1759 0 : .unwrap_or("bootstrapped");
1760 0 : loaded_timeline
1761 0 : .remote_client
1762 0 : .wait_completion()
1763 0 : .await
1764 0 : .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?;
1765 :
1766 0 : loaded_timeline.activate(self.clone(), broker_client, None, ctx);
1767 0 :
1768 0 : Ok(loaded_timeline)
1769 0 : }
1770 :
1771 0 : pub(crate) async fn delete_timeline( // Deletes `timeline_id` by delegating to `DeleteTimelineFlow::run`.
1772 0 : self: Arc<Self>,
1773 0 : timeline_id: TimelineId,
1774 0 : ) -> Result<(), DeleteTimelineError> {
1775 0 : DeleteTimelineFlow::run(&self, timeline_id).await?; // any error from the flow is returned to the caller via `?`
1776 :
1777 0 : Ok(())
1778 0 : }
1779 :
1780 : /// Perform one garbage collection iteration, removing old data files from disk.
1781 : /// This function is periodically called by the gc task.
1782 : /// It can also be explicitly requested through the page server API 'do_gc' command.
1783 : ///
1784 : /// `target_timeline_id` specifies the timeline to GC, or None for all.
1785 : ///
1786 : /// The `horizon` and `pitr` parameters determine how much WAL history needs to be retained.
1787 : /// Also known as the retention period, or the GC cutoff point. `horizon` specifies
1788 : /// the amount of history, as LSN difference from current latest LSN on each timeline.
1789 : /// `pitr` specifies the same as a time difference from the current time. The effective
1790 : /// GC cutoff point is determined conservatively by either `horizon` or `pitr`, whichever
1791 : /// requires more history to be retained.
1792 : // Early exits: returns a default `GcResult` when the tenant is Stopping, when the location config does not permit layer deletion, or when `gc_block.start()` refuses; returns `GcError::NotActive` when the tenant is not Active.
1793 2262 : pub(crate) async fn gc_iteration(
1794 2262 : &self,
1795 2262 : target_timeline_id: Option<TimelineId>,
1796 2262 : horizon: u64,
1797 2262 : pitr: Duration,
1798 2262 : cancel: &CancellationToken,
1799 2262 : ctx: &RequestContext,
1800 2262 : ) -> Result<GcResult, GcError> {
1801 2262 : // Don't start doing work during shutdown: a Stopping tenant is reported as an empty (successful) result
1802 2262 : if let TenantState::Stopping { .. } = self.current_state() {
1803 0 : return Ok(GcResult::default());
1804 2262 : }
1805 2262 :
1806 2262 : // Any other non-Active state is an error; there is a global allowed_error for this
1807 2262 : if !self.is_active() {
1808 0 : return Err(GcError::NotActive);
1809 2262 : }
1810 2262 :
1811 2262 : {
1812 2262 : let conf = self.tenant_conf.load();
1813 2262 :
1814 2262 : if !conf.location.may_delete_layers_hint() {
1815 0 : info!("Skipping GC in location state {:?}", conf.location);
1816 0 : return Ok(GcResult::default());
1817 2262 : }
1818 : }
1819 :
1820 2262 : let _guard = match self.gc_block.start().await { // the guard stays bound until this function returns
1821 2262 : Ok(guard) => guard,
1822 0 : Err(reasons) => {
1823 0 : info!("Skipping GC: {reasons}");
1824 0 : return Ok(GcResult::default());
1825 : }
1826 : };
1827 :
1828 2262 : self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
1829 2232 : .await
1830 2262 : }
1831 :
1832 : /// Perform one compaction iteration over all active timelines of this tenant.
1833 : /// This function is periodically called by the compactor task.
1834 : /// It can also be explicitly requested per timeline through the page server
1835 : /// API's 'compact' command.
1836 : ///
1837 : /// Returns whether any compacted timeline reported a pending compaction task.
1838 0 : async fn compaction_iteration(
1839 0 : &self,
1840 0 : cancel: &CancellationToken,
1841 0 : ctx: &RequestContext,
1842 0 : ) -> Result<bool, timeline::CompactionError> {
1843 0 : // Don't start doing work during shutdown, or when broken, we do not need those in the logs
1844 0 : if !self.is_active() {
1845 0 : return Ok(false);
1846 0 : }
1847 0 :
1848 0 : {
1849 0 : let conf = self.tenant_conf.load();
1850 0 : if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
1851 0 : info!("Skipping compaction in location state {:?}", conf.location);
1852 0 : return Ok(false);
1853 0 : }
1854 0 : }
1855 0 :
1856 0 : // Scan through the timelines and collect a list of all the active ones,
1857 0 : // while holding the lock. Then drop the lock and actually perform the
1858 0 : // compactions. We don't want to block everything else while the
1859 0 : // compaction runs.
1860 0 : let timelines_to_compact = {
1861 0 : let timelines = self.timelines.lock().unwrap();
1862 0 : let timelines_to_compact = timelines
1863 0 : .iter()
1864 0 : .filter_map(|(timeline_id, timeline)| {
1865 0 : if timeline.is_active() {
1866 0 : Some((*timeline_id, timeline.clone()))
1867 : } else {
1868 0 : None
1869 : }
1870 0 : })
1871 0 : .collect::<Vec<_>>();
1872 0 : drop(timelines);
1873 0 : timelines_to_compact
1874 0 : };
1875 0 :
1876 0 : // Before doing any I/O work, check our circuit breaker
1877 0 : if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
1878 0 : info!("Skipping compaction due to previous failures");
1879 0 : return Ok(false);
1880 0 : }
1881 0 :
1882 0 : let mut has_pending_task = false;
1883 :
1884 0 : for (timeline_id, timeline) in &timelines_to_compact {
1885 0 : has_pending_task |= timeline
1886 0 : .compact(cancel, EnumSet::empty(), ctx)
1887 0 : .instrument(info_span!("compact_timeline", %timeline_id))
1888 0 : .await
1889 0 : .inspect_err(|e| match e {
1890 0 : timeline::CompactionError::ShuttingDown => (), // shutdown is not counted as a circuit-breaker failure
1891 0 : timeline::CompactionError::Other(e) => {
1892 0 : self.compaction_circuit_breaker
1893 0 : .lock()
1894 0 : .unwrap()
1895 0 : .fail(&CIRCUIT_BREAKERS_BROKEN, e);
1896 0 : }
1897 0 : })?; // the first error aborts the loop: remaining timelines are skipped and `success` below is not recorded
1898 : }
1899 :
1900 0 : self.compaction_circuit_breaker
1901 0 : .lock()
1902 0 : .unwrap()
1903 0 : .success(&CIRCUIT_BREAKERS_UNBROKEN);
1904 0 :
1905 0 : Ok(has_pending_task)
1906 0 : }
1907 :
1908 : // Call through to all active timelines to freeze ephemeral layers if needed. Usually
1909 : // this happens during ingest: this background housekeeping is for freezing layers
1910 : // that are open but haven't been written to for some time.
1911 0 : async fn ingest_housekeeping(&self) {
1912 0 : // Scan through the timelines and collect a list of all the active ones
1913 0 : // while holding the lock. Then release the lock and actually perform the
1914 0 : // freeze checks. We don't want to hold the lock while awaiting
1915 0 : // `maybe_freeze_ephemeral_layer` on each timeline.
1916 0 : let timelines = {
1917 0 : self.timelines
1918 0 : .lock()
1919 0 : .unwrap()
1920 0 : .values()
1921 0 : .filter_map(|timeline| {
1922 0 : if timeline.is_active() {
1923 0 : Some(timeline.clone())
1924 : } else {
1925 0 : None
1926 : }
1927 0 : })
1928 0 : .collect::<Vec<_>>()
1929 : };
1930 :
1931 0 : for timeline in &timelines { // timelines are processed one at a time, in collection order
1932 0 : timeline.maybe_freeze_ephemeral_layer().await;
1933 : }
1934 0 : }
1935 :
1936 7350 : pub fn current_state(&self) -> TenantState { // Returns a clone of the value currently held by `self.state`.
1937 7350 : self.state.borrow().clone()
1938 7350 : }
1939 :
1940 5070 : pub fn is_active(&self) -> bool { // True only when the current state equals `TenantState::Active`.
1941 5070 : self.current_state() == TenantState::Active
1942 5070 : }
1943 :
1944 0 : pub fn generation(&self) -> Generation { // Returns the `generation` field of this tenant.
1945 0 : self.generation
1946 0 : }
1947 :
1948 0 : pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> { // `None` if there is no walredo manager, or if the manager's `status()` returns `None`.
1949 0 : self.walredo_mgr.as_ref().and_then(|mgr| mgr.status())
1950 0 : }
1951 :
1952 : /// Changes tenant status from Loading/Attaching via Activating to Active, activating its non-broken, non-stopping timelines and starting the background loops. Panics if called while the tenant is in any other state (Activating, Active, Broken or Stopping).
1953 : ///
1954 : /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
1955 : /// to delay background jobs. Background jobs can be started right away when None is given.
1956 0 : fn activate(
1957 0 : self: &Arc<Self>,
1958 0 : broker_client: BrokerClientChannel,
1959 0 : background_jobs_can_start: Option<&completion::Barrier>,
1960 0 : ctx: &RequestContext,
1961 0 : ) {
1962 0 : span::debug_assert_current_span_has_tenant_id();
1963 0 :
1964 0 : let mut activating = false; // set to true by the closure below whenever it does not panic
1965 0 : self.state.send_modify(|current_state| {
1966 : use pageserver_api::models::ActivatingFrom;
1967 0 : match &*current_state {
1968 : TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
1969 0 : panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
1970 : }
1971 0 : TenantState::Loading => {
1972 0 : *current_state = TenantState::Activating(ActivatingFrom::Loading);
1973 0 : }
1974 0 : TenantState::Attaching => {
1975 0 : *current_state = TenantState::Activating(ActivatingFrom::Attaching);
1976 0 : }
1977 : }
1978 0 : debug!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), "Activating tenant");
1979 0 : activating = true;
1980 0 : // Continue outside the closure. We need to grab timelines.lock()
1981 0 : // and we plan to turn it into a tokio::sync::Mutex in a future patch.
1982 0 : });
1983 0 :
1984 0 : if activating {
1985 0 : let timelines_accessor = self.timelines.lock().unwrap();
1986 0 : let timelines_to_activate = timelines_accessor
1987 0 : .values()
1988 0 : .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); // lazy iterator: the filter runs when the loop below consumes it
1989 0 :
1990 0 : // Before activation, populate each Timeline's GcInfo with information about its children
1991 0 : self.initialize_gc_info(&timelines_accessor);
1992 0 :
1993 0 : // Spawn gc and compaction loops. The loops will shut themselves
1994 0 : // down when they notice that the tenant is inactive.
1995 0 : tasks::start_background_loops(self, background_jobs_can_start);
1996 0 :
1997 0 : let mut activated_timelines = 0;
1998 :
1999 0 : for timeline in timelines_to_activate {
2000 0 : timeline.activate(
2001 0 : self.clone(),
2002 0 : broker_client.clone(),
2003 0 : background_jobs_can_start,
2004 0 : ctx,
2005 0 : );
2006 0 : activated_timelines += 1;
2007 0 : }
2008 :
2009 0 : self.state.send_modify(move |current_state| {
2010 0 : assert!(
2011 0 : matches!(current_state, TenantState::Activating(_)),
2012 0 : "set_stopping and set_broken wait for us to leave Activating state",
2013 : );
2014 0 : *current_state = TenantState::Active;
2015 0 :
2016 0 : let elapsed = self.constructed_at.elapsed();
2017 0 : let total_timelines = timelines_accessor.len(); // counts every timeline, including those skipped as broken/stopping
2018 0 :
2019 0 : // log a lot of stuff, because some tenants sometimes suffer from user-visible
2020 0 : // times to activate. see https://github.com/neondatabase/neon/issues/4025
2021 0 : info!(
2022 0 : since_creation_millis = elapsed.as_millis(),
2023 0 : tenant_id = %self.tenant_shard_id.tenant_id,
2024 0 : shard_id = %self.tenant_shard_id.shard_slug(),
2025 0 : activated_timelines,
2026 0 : total_timelines,
2027 0 : post_state = <&'static str>::from(&*current_state),
2028 0 : "activation attempt finished"
2029 : );
2030 :
2031 0 : TENANT.activation.observe(elapsed.as_secs_f64());
2032 0 : });
2033 0 : }
2034 0 : }
2035 :
2036 : /// Shut down the tenant and join all of the spawned tasks.
2037 : ///
2038 : /// The method caters for all use-cases (NOTE(review): `freeze_and_flush` below is not a parameter of this function; the mode is passed as `shutdown_mode` — confirm the mapping and update this list):
2039 : /// - pageserver shutdown (freeze_and_flush == true)
2040 : /// - detach + ignore (freeze_and_flush == false)
2041 : ///
2042 : /// This will attempt to shutdown even if tenant is broken.
2043 : ///
2044 : /// `shutdown_progress` is a [`completion::Barrier`] for the shutdown initiated by this call.
2045 : /// If the tenant is already shutting down, we return a clone of the first shutdown call's
2046 : /// `Barrier` as an `Err`. This not-first caller can use the returned barrier to join with
2047 : /// the ongoing shutdown.
2048 18 : async fn shutdown(
2049 18 : &self,
2050 18 : shutdown_progress: completion::Barrier,
2051 18 : shutdown_mode: timeline::ShutdownMode,
2052 18 : ) -> Result<(), completion::Barrier> {
2053 18 : span::debug_assert_current_span_has_tenant_id();
2054 :
2055 : // Set tenant (and its timelines) to Stopping state.
2056 : //
2057 : // Since we can only transition into Stopping state after activation is complete,
2058 : // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
2059 : //
2060 : // Transitioning tenants to Stopping state has a couple of non-obvious side effects:
2061 : // 1. Lock out any new requests to the tenants.
2062 : // 2. Signal cancellation to WAL receivers (we wait on it below).
2063 : // 3. Signal cancellation for other tenant background loops.
2064 : // 4. ???
2065 : //
2066 : // The waiting for the cancellation is not done uniformly.
2067 : // We certainly wait for WAL receivers to shut down.
2068 : // That is necessary so that no new data comes in before the freeze_and_flush.
2069 : // But the tenant background loops are joined-on in our caller.
2070 : // It's messed up.
2071 : // we just ignore the failure to stop
2072 :
2073 : // If we're still attaching, fire the cancellation token early to drop out: this
2074 : // will prevent us flushing, but ensures timely shutdown if some I/O during attach
2075 : // is very slow.
2076 18 : let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) {
2077 0 : self.cancel.cancel();
2078 0 :
2079 0 : // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens
2080 0 : // are children of ours, so their flush loops will have shut down already
2081 0 : timeline::ShutdownMode::Hard
2082 : } else {
2083 18 : shutdown_mode
2084 : };
2085 :
2086 18 : match self.set_stopping(shutdown_progress, false, false).await {
2087 18 : Ok(()) => {}
2088 0 : Err(SetStoppingError::Broken) => {
2089 0 : // assume that this is acceptable: a Broken tenant still goes through the rest of the shutdown below
2090 0 : }
2091 0 : Err(SetStoppingError::AlreadyStopping(other)) => {
2092 0 : // give caller the option to wait for this shutdown
2093 0 : info!("Tenant::shutdown: AlreadyStopping");
2094 0 : return Err(other);
2095 : }
2096 : };
2097 :
2098 18 : let mut js = tokio::task::JoinSet::new();
2099 18 : {
2100 18 : let timelines = self.timelines.lock().unwrap();
2101 18 : timelines.values().for_each(|timeline| {
2102 18 : let timeline = Arc::clone(timeline);
2103 18 : let timeline_id = timeline.timeline_id;
2104 18 : let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
2105 28 : js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
2106 18 : })
2107 18 : };
2108 18 : // test_long_timeline_create_then_tenant_delete is leaning on this message
2109 18 : tracing::info!("Waiting for timelines...");
2110 36 : while let Some(res) = js.join_next().await {
2111 0 : match res {
2112 18 : Ok(()) => {}
2113 0 : Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
2114 0 : Err(je) if je.is_panic() => { /* logged already */ }
2115 0 : Err(je) => warn!("unexpected JoinError: {je:?}"),
2116 : }
2117 : }
2118 :
2119 : // We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
2120 : // them to continue to do work during their shutdown methods, e.g. flushing data.
2121 18 : tracing::debug!("Cancelling CancellationToken");
2122 18 : self.cancel.cancel();
2123 18 :
2124 18 : // shutdown all tenant and timeline tasks: gc, compaction, page service
2125 18 : // No new tasks will be started for this tenant because it's in `Stopping` state.
2126 18 : //
2127 18 : // this will additionally shutdown and await all timeline tasks.
2128 18 : tracing::debug!("Waiting for tasks...");
2129 18 : task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;
2130 :
2131 18 : if let Some(walredo_mgr) = self.walredo_mgr.as_ref() {
2132 18 : walredo_mgr.shutdown().await;
2133 0 : }
2134 :
2135 : // Wait for any in-flight operations to complete
2136 18 : self.gate.close().await;
2137 :
2138 18 : remove_tenant_metrics(&self.tenant_shard_id);
2139 18 :
2140 18 : Ok(())
2141 18 : }
2142 :
2143 : /// Change tenant status to Stopping, to mark that it is being shut down.
2144 : ///
2145 : /// This function first waits for activation to finish (the state must leave Activating/Attaching, and Loading unless the corresponding `allow_transition_from_*` flag is set) before transitioning into Stopping state.
2146 : ///
2147 : /// This function is not cancel-safe!
2148 : ///
2149 : /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
2150 : /// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
2151 18 : async fn set_stopping(
2152 18 : &self,
2153 18 : progress: completion::Barrier,
2154 18 : allow_transition_from_loading: bool,
2155 18 : allow_transition_from_attaching: bool,
2156 18 : ) -> Result<(), SetStoppingError> {
2157 18 : let mut rx = self.state.subscribe();
2158 18 :
2159 18 : // cannot stop before we're done activating, so wait out until we're done activating
2160 18 : rx.wait_for(|state| match state {
2161 0 : TenantState::Attaching if allow_transition_from_attaching => true,
2162 : TenantState::Activating(_) | TenantState::Attaching => {
2163 0 : info!(
2164 0 : "waiting for {} to turn Active|Broken|Stopping",
2165 0 : <&'static str>::from(state)
2166 : );
2167 0 : false
2168 : }
2169 0 : TenantState::Loading => allow_transition_from_loading,
2170 18 : TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
2171 18 : })
2172 0 : .await
2173 18 : .expect("cannot drop self.state while on a &self method");
2174 18 :
2175 18 : // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
2176 18 : let mut err = None; // filled in by the closure below when the transition is refused
2177 18 : let stopping = self.state.send_if_modified(|current_state| match current_state {
2178 : TenantState::Activating(_) => {
2179 0 : unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
2180 : }
2181 : TenantState::Attaching => {
2182 0 : if !allow_transition_from_attaching {
2183 0 : unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
2184 0 : };
2185 0 : *current_state = TenantState::Stopping { progress };
2186 0 : true
2187 : }
2188 : TenantState::Loading => {
2189 0 : if !allow_transition_from_loading {
2190 0 : unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
2191 0 : };
2192 0 : *current_state = TenantState::Stopping { progress };
2193 0 : true
2194 : }
2195 : TenantState::Active => {
2196 : // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
2197 : // are created after the transition to Stopping. That's harmless, as the Timelines
2198 : // won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
2199 18 : *current_state = TenantState::Stopping { progress };
2200 18 : // Continue stopping outside the closure. We need to grab timelines.lock()
2201 18 : // and we plan to turn it into a tokio::sync::Mutex in a future patch.
2202 18 : true
2203 : }
2204 0 : TenantState::Broken { reason, .. } => {
2205 0 : info!(
2206 0 : "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
2207 : );
2208 0 : err = Some(SetStoppingError::Broken);
2209 0 : false
2210 : }
2211 0 : TenantState::Stopping { progress } => {
2212 0 : info!("Tenant is already in Stopping state");
2213 0 : err = Some(SetStoppingError::AlreadyStopping(progress.clone())); // hand back the barrier of the shutdown already in progress
2214 0 : false
2215 : }
2216 18 : });
2217 18 : match (stopping, err) {
2218 18 : (true, None) => {} // continue
2219 0 : (false, Some(err)) => return Err(err),
2220 0 : (true, Some(_)) => unreachable!(
2221 0 : "send_if_modified closure must error out if not transitioning to Stopping"
2222 0 : ),
2223 0 : (false, None) => unreachable!(
2224 0 : "send_if_modified closure must return true if transitioning to Stopping"
2225 0 : ),
2226 : }
2227 :
2228 18 : let timelines_accessor = self.timelines.lock().unwrap();
2229 18 : let not_broken_timelines = timelines_accessor
2230 18 : .values()
2231 18 : .filter(|timeline| !timeline.is_broken());
2232 36 : for timeline in not_broken_timelines { // broken timelines keep their state; all others are moved to Stopping
2233 18 : timeline.set_state(TimelineState::Stopping);
2234 18 : }
2235 18 : Ok(())
2236 18 : }
2237 :
2238 : /// Method for tenant::mgr to transition us into Broken state in case of a late failure in
2239 : /// `remove_tenant_from_memory`
2240 : ///
2241 : /// This function waits for the tenant to leave the Loading/Attaching/Activating states if it is in one of them, before attempting the transition into Broken state via `set_broken_no_wait`.
2242 : ///
2243 : /// In tests, we also use this to set tenants to Broken state on purpose.
2244 0 : pub(crate) async fn set_broken(&self, reason: String) {
2245 0 : let mut rx = self.state.subscribe();
2246 0 :
2247 0 : // The load & attach routines own the tenant state until it has reached `Active`.
2248 0 : // So, wait until it's done.
2249 0 : rx.wait_for(|state| match state {
2250 : TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
2251 0 : info!(
2252 0 : "waiting for {} to turn Active|Broken|Stopping",
2253 0 : <&'static str>::from(state)
2254 : );
2255 0 : false
2256 : }
2257 0 : TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
2258 0 : })
2259 0 : .await
2260 0 : .expect("cannot drop self.state while on a &self method");
2261 0 :
2262 0 : // we now know we're done activating, let's see whether this task is the winner to transition into Broken
2263 0 : self.set_broken_no_wait(reason)
2264 0 : }
2265 :
2266 0 : pub(crate) fn set_broken_no_wait(&self, reason: impl Display) { // Moves the tenant to Broken with `reason` without waiting for activation to finish; see the per-state arms below.
2267 0 : let reason = reason.to_string();
2268 0 : self.state.send_modify(|current_state| {
2269 0 : match *current_state {
2270 : TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { // panics: a caller of this no-wait variant must itself guarantee that activation has finished
2271 0 : unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
2272 : }
2273 : TenantState::Active => { // allowed only with the "testing" feature; panics otherwise
2274 0 : if cfg!(feature = "testing") {
2275 0 : warn!("Changing Active tenant to Broken state, reason: {}", reason);
2276 0 : *current_state = TenantState::broken_from_reason(reason);
2277 : } else {
2278 0 : unreachable!("not allowed to call set_broken on Active tenants in non-testing builds")
2279 : }
2280 : }
2281 : TenantState::Broken { .. } => { // already Broken: the existing state (and its reason) is left untouched
2282 0 : warn!("Tenant is already in Broken state");
2283 : }
2284 : // This is the only "expected" path, any other path is a bug.
2285 : TenantState::Stopping { .. } => {
2286 0 : warn!(
2287 0 : "Marking Stopping tenant as Broken state, reason: {}",
2288 : reason
2289 : );
2290 0 : *current_state = TenantState::broken_from_reason(reason);
2291 : }
2292 : }
2293 0 : });
2294 0 : }
2295 :
2296 0 : pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TenantState> { // Returns a new watch receiver subscribed to `self.state`.
2297 0 : self.state.subscribe()
2298 0 : }
2299 :
2300 : /// The activate_now semaphore is initialized with zero units. As soon as
2301 : /// we add a unit, waiters will be able to acquire a unit and proceed.
2302 0 : pub(crate) fn activate_now(&self) {
2303 0 : self.activate_now_sem.add_permits(1); // every call adds one more permit
2304 0 : }
2305 :
2306 0 : pub(crate) async fn wait_to_become_active( // Waits until the tenant state is Active; fails on Broken/Stopping, on cancellation, or when no state change arrives within `timeout`.
2307 0 : &self,
2308 0 : timeout: Duration,
2309 0 : ) -> Result<(), GetActiveTenantError> {
2310 0 : let mut receiver = self.state.subscribe();
2311 : loop {
2312 0 : let current_state = receiver.borrow_and_update().clone();
2313 0 : match current_state {
2314 : TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
2315 : // in these states, there's a chance that we can reach ::Active
2316 0 : self.activate_now();
2317 0 : match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await { // note: the full `timeout` is applied to each wait for a state change, not to the loop as a whole
2318 0 : Ok(r) => {
2319 0 : r.map_err(
2320 0 : |_e: tokio::sync::watch::error::RecvError|
2321 : // Tenant existed but was dropped: report it as non-existent
2322 0 : GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
2323 0 : )?
2324 : }
2325 : Err(TimeoutCancellableError::Cancelled) => {
2326 0 : return Err(GetActiveTenantError::Cancelled);
2327 : }
2328 : Err(TimeoutCancellableError::Timeout) => {
2329 0 : return Err(GetActiveTenantError::WaitForActiveTimeout {
2330 0 : latest_state: Some(self.current_state()),
2331 0 : wait_time: timeout,
2332 0 : });
2333 : }
2334 : }
2335 : }
2336 : TenantState::Active { .. } => {
2337 0 : return Ok(());
2338 : }
2339 0 : TenantState::Broken { reason, .. } => {
2340 0 : // This is fatal, and reported distinctly from the general case of "will never be active" because
2341 0 : // it's logically a 500 to external API users (broken is always a bug).
2342 0 : return Err(GetActiveTenantError::Broken(reason));
2343 : }
2344 : TenantState::Stopping { .. } => {
2345 : // There's no chance the tenant can transition back into ::Active
2346 0 : return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
2347 : }
2348 : }
2349 : }
2350 0 : }
2351 :
2352 0 : pub(crate) fn get_attach_mode(&self) -> AttachmentMode { // Reads `location.attach_mode` from the currently loaded tenant config.
2353 0 : self.tenant_conf.load().location.attach_mode
2354 0 : }
2355 :
2356 : /// For API access: generate a LocationConfig equivalent to the one that would be used to
2357 : /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
2358 : /// rare external API calls, like a reconciliation at startup.
2359 0 : pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
2360 0 : let conf = self.tenant_conf.load();
2361 :
2362 0 : let location_config_mode = match conf.location.attach_mode { // one-to-one mapping of the attachment mode onto the API's `Attached*` modes
2363 0 : AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
2364 0 : AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti,
2365 0 : AttachmentMode::Stale => models::LocationConfigMode::AttachedStale,
2366 : };
2367 :
2368 : // We have a pageserver TenantConf, we need the API-facing TenantConfig.
2369 0 : let tenant_config: models::TenantConfig = conf.tenant_conf.clone().into();
2370 0 :
2371 0 : models::LocationConfig {
2372 0 : mode: location_config_mode,
2373 0 : generation: self.generation.into(),
2374 0 : secondary_conf: None, // always None here: only attached modes are produced above
2375 0 : shard_number: self.shard_identity.number.0,
2376 0 : shard_count: self.shard_identity.count.literal(),
2377 0 : shard_stripe_size: self.shard_identity.stripe_size.0,
2378 0 : tenant_conf: tenant_config,
2379 0 : }
2380 0 : }
2381 :
2382 0 : pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId { // Borrows this tenant's `tenant_shard_id` field.
2383 0 : &self.tenant_shard_id
2384 0 : }
2385 :
2386 0 : pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { // Returns `shard_identity.stripe_size`.
2387 0 : self.shard_identity.stripe_size
2388 0 : }
2389 :
2390 0 : pub(crate) fn get_generation(&self) -> Generation { // Returns the `generation` field; same value as the public `generation()` accessor.
2391 0 : self.generation
2392 0 : }
2393 :
2394 : /// This function partially shuts down the tenant (it shuts down the Timelines' remote clients) and is fallible,
2395 : /// and can leave the tenant in a bad state if it fails. The caller is responsible for
2396 : /// resetting this tenant to a valid state if we fail.
2397 0 : pub(crate) async fn split_prepare(
2398 0 : &self,
2399 0 : child_shards: &Vec<TenantShardId>, // NOTE(review): only iterated below; `&[TenantShardId]` would be the more idiomatic parameter type
2400 0 : ) -> anyhow::Result<()> {
2401 0 : let timelines = self.timelines.lock().unwrap().clone(); // work on a clone so the lock is not held across the awaits below
2402 0 : for timeline in timelines.values() {
2403 : // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
2404 : // to ensure that they do not start a split if currently in the process of doing these.
2405 :
2406 : // Upload an index from the parent: this is partly to provide freshness for the
2407 : // child tenants that will copy it, and partly for general ease-of-debugging: there will
2408 : // always be a parent shard index in the same generation as we wrote the child shard index.
2409 0 : tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index");
2410 0 : timeline
2411 0 : .remote_client
2412 0 : .schedule_index_upload_for_file_changes()?;
2413 0 : timeline.remote_client.wait_completion().await?;
2414 :
2415 : // Shut down the timeline's remote client: this means that the indices we write
2416 : // for child shards will not be invalidated by the parent shard deleting layers.
2417 0 : tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client");
2418 0 : timeline.remote_client.shutdown().await;
2419 :
2420 : // Download methods can still be used after shutdown, as they don't flow through the remote client's
2421 : // queue. In principle the RemoteTimelineClient could provide this without downloading it, but this
2422 : // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
2423 : // we use here really is the remotely persistent one).
2424 0 : tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent");
2425 0 : let result = timeline.remote_client
2426 0 : .download_index_file(&self.cancel)
2427 0 : .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
2428 0 : .await?;
2429 0 : let index_part = match result {
2430 : MaybeDeletedIndexPart::Deleted(_) => {
2431 0 : anyhow::bail!("Timeline deletion happened concurrently with split")
2432 : }
2433 0 : MaybeDeletedIndexPart::IndexPart(p) => p,
2434 : };
2435 :
2436 0 : for child_shard in child_shards { // the parent's index is written once per child shard, under the parent's generation
2437 0 : tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index());
2438 0 : upload_index_part(
2439 0 : &self.remote_storage,
2440 0 : child_shard,
2441 0 : &timeline.timeline_id,
2442 0 : self.generation,
2443 0 : &index_part,
2444 0 : &self.cancel,
2445 0 : )
2446 0 : .await?;
2447 : }
2448 : }
2449 :
2450 0 : Ok(())
2451 0 : }
2452 :
2453 0 : pub(crate) fn get_sizes(&self) -> TopTenantShardItem { // Aggregates size gauges over all timelines: resident and physical sizes are summed, logical size is the maximum.
2454 0 : let mut result = TopTenantShardItem {
2455 0 : id: self.tenant_shard_id,
2456 0 : resident_size: 0,
2457 0 : physical_size: 0,
2458 0 : max_logical_size: 0,
2459 0 : };
2460 :
2461 0 : for timeline in self.timelines.lock().unwrap().values() { // the timelines lock is held for the duration of this loop
2462 0 : result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
2463 0 :
2464 0 : result.physical_size += timeline
2465 0 : .remote_client
2466 0 : .metrics
2467 0 : .remote_physical_size_gauge
2468 0 : .get();
2469 0 : result.max_logical_size = std::cmp::max(
2470 0 : result.max_logical_size,
2471 0 : timeline.metrics.current_logical_size_gauge.get(),
2472 0 : );
2473 0 : }
2474 :
2475 0 : result
2476 0 : }
2477 : }
2478 :
2479 : /// Given a map of timelines keyed by timeline id, whose ancestor (if any) is obtained via `extractor`,
2480 : /// perform a topological sort, so that the parent of each timeline comes
2481 : /// before the children. Fails if some timeline's ancestor is not reachable from a root in the map.
2482 : /// E extracts the ancestor from T
2483 : /// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc.
2484 570 : fn tree_sort_timelines<T, E>(
2485 570 : timelines: HashMap<TimelineId, T>,
2486 570 : extractor: E,
2487 570 : ) -> anyhow::Result<Vec<(TimelineId, T)>>
2488 570 : where
2489 570 : E: Fn(&T) -> Option<TimelineId>,
2490 570 : {
2491 570 : let mut result = Vec::with_capacity(timelines.len());
2492 570 :
2493 570 : let mut now = Vec::with_capacity(timelines.len()); // work stack of timelines whose ancestor (if any) is already in `result`
2494 570 : // (ancestor, children)
2495 570 : let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
2496 570 : HashMap::with_capacity(timelines.len());
2497 :
2498 588 : for (timeline_id, value) in timelines {
2499 18 : if let Some(ancestor_id) = extractor(&value) {
2500 6 : let children = later.entry(ancestor_id).or_default();
2501 6 : children.push((timeline_id, value));
2502 12 : } else {
2503 12 : now.push((timeline_id, value)); // no ancestor: a root, can be emitted immediately
2504 12 : }
2505 : }
2506 :
2507 588 : while let Some((timeline_id, metadata)) = now.pop() {
2508 18 : result.push((timeline_id, metadata));
2509 : // All children of this can be loaded now
2510 18 : if let Some(mut children) = later.remove(&timeline_id) {
2511 6 : now.append(&mut children);
2512 12 : }
2513 : }
2514 :
2515 : // All timelines should be visited now. Unless there were timelines with missing ancestors.
2516 570 : if !later.is_empty() {
2517 0 : for (missing_id, orphan_ids) in later {
2518 0 : for (orphan_id, _) in orphan_ids {
2519 0 : error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded");
2520 : }
2521 : }
2522 0 : bail!("could not load tenant because some timelines are missing ancestors");
2523 570 : }
2524 570 :
2525 570 : Ok(result)
2526 570 : }
2527 :
2528 : impl Tenant {
2529 0 : pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
2530 0 : self.tenant_conf.load().tenant_conf.clone()
2531 0 : }
2532 :
2533 0 : pub fn effective_config(&self) -> TenantConf {
2534 0 : self.tenant_specific_overrides()
2535 0 : .merge(self.conf.default_tenant_conf.clone())
2536 0 : }
2537 :
2538 0 : pub fn get_checkpoint_distance(&self) -> u64 {
2539 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2540 0 : tenant_conf
2541 0 : .checkpoint_distance
2542 0 : .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
2543 0 : }
2544 :
2545 0 : pub fn get_checkpoint_timeout(&self) -> Duration {
2546 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2547 0 : tenant_conf
2548 0 : .checkpoint_timeout
2549 0 : .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
2550 0 : }
2551 :
2552 0 : pub fn get_compaction_target_size(&self) -> u64 {
2553 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2554 0 : tenant_conf
2555 0 : .compaction_target_size
2556 0 : .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
2557 0 : }
2558 :
2559 0 : pub fn get_compaction_period(&self) -> Duration {
2560 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2561 0 : tenant_conf
2562 0 : .compaction_period
2563 0 : .unwrap_or(self.conf.default_tenant_conf.compaction_period)
2564 0 : }
2565 :
2566 0 : pub fn get_compaction_threshold(&self) -> usize {
2567 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2568 0 : tenant_conf
2569 0 : .compaction_threshold
2570 0 : .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
2571 0 : }
2572 :
2573 0 : pub fn get_gc_horizon(&self) -> u64 {
2574 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2575 0 : tenant_conf
2576 0 : .gc_horizon
2577 0 : .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
2578 0 : }
2579 :
2580 0 : pub fn get_gc_period(&self) -> Duration {
2581 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2582 0 : tenant_conf
2583 0 : .gc_period
2584 0 : .unwrap_or(self.conf.default_tenant_conf.gc_period)
2585 0 : }
2586 :
2587 0 : pub fn get_image_creation_threshold(&self) -> usize {
2588 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2589 0 : tenant_conf
2590 0 : .image_creation_threshold
2591 0 : .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
2592 0 : }
2593 :
2594 0 : pub fn get_pitr_interval(&self) -> Duration {
2595 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2596 0 : tenant_conf
2597 0 : .pitr_interval
2598 0 : .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
2599 0 : }
2600 :
2601 0 : pub fn get_min_resident_size_override(&self) -> Option<u64> {
2602 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2603 0 : tenant_conf
2604 0 : .min_resident_size_override
2605 0 : .or(self.conf.default_tenant_conf.min_resident_size_override)
2606 0 : }
2607 :
2608 0 : pub fn get_heatmap_period(&self) -> Option<Duration> {
2609 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2610 0 : let heatmap_period = tenant_conf
2611 0 : .heatmap_period
2612 0 : .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
2613 0 : if heatmap_period.is_zero() {
2614 0 : None
2615 : } else {
2616 0 : Some(heatmap_period)
2617 : }
2618 0 : }
2619 :
2620 0 : pub fn get_lsn_lease_length(&self) -> Duration {
2621 0 : let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
2622 0 : tenant_conf
2623 0 : .lsn_lease_length
2624 0 : .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
2625 0 : }
2626 :
2627 0 : pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
2628 0 : // Use read-copy-update in order to avoid overwriting the location config
2629 0 : // state if this races with [`Tenant::set_new_location_config`]. Note that
2630 0 : // this race is not possible if both request types come from the storage
2631 0 : // controller (as they should!) because an exclusive op lock is required
2632 0 : // on the storage controller side.
2633 0 : self.tenant_conf.rcu(|inner| {
2634 0 : Arc::new(AttachedTenantConf {
2635 0 : tenant_conf: new_tenant_conf.clone(),
2636 0 : location: inner.location,
2637 0 : })
2638 0 : });
2639 0 :
2640 0 : self.tenant_conf_updated(&new_tenant_conf);
2641 0 : // Don't hold self.timelines.lock() during the notifies.
2642 0 : // There's no risk of deadlock right now, but there could be if we consolidate
2643 0 : // mutexes in struct Timeline in the future.
2644 0 : let timelines = self.list_timelines();
2645 0 : for timeline in timelines {
2646 0 : timeline.tenant_conf_updated(&new_tenant_conf);
2647 0 : }
2648 0 : }
2649 :
2650 24 : pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
2651 24 : let new_tenant_conf = new_conf.tenant_conf.clone();
2652 24 :
2653 24 : self.tenant_conf.store(Arc::new(new_conf));
2654 24 :
2655 24 : self.tenant_conf_updated(&new_tenant_conf);
2656 24 : // Don't hold self.timelines.lock() during the notifies.
2657 24 : // There's no risk of deadlock right now, but there could be if we consolidate
2658 24 : // mutexes in struct Timeline in the future.
2659 24 : let timelines = self.list_timelines();
2660 48 : for timeline in timelines {
2661 24 : timeline.tenant_conf_updated(&new_tenant_conf);
2662 24 : }
2663 24 : }
2664 :
2665 594 : fn get_timeline_get_throttle_config(
2666 594 : psconf: &'static PageServerConf,
2667 594 : overrides: &TenantConfOpt,
2668 594 : ) -> throttle::Config {
2669 594 : overrides
2670 594 : .timeline_get_throttle
2671 594 : .clone()
2672 594 : .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
2673 594 : }
2674 :
2675 24 : pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
2676 24 : let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
2677 24 : self.timeline_get_throttle.reconfigure(conf)
2678 24 : }
2679 :
2680 : /// Helper function to create a new Timeline struct.
2681 : ///
2682 : /// The returned Timeline is in Loading state. The caller is responsible for
2683 : /// initializing any on-disk state, and for inserting the Timeline to the 'timelines'
2684 : /// map.
2685 : ///
2686 : /// `validate_ancestor == false` is used when a timeline is created for deletion
2687 : /// and we might not have the ancestor present anymore which is fine for to be
2688 : /// deleted timelines.
2689 1242 : fn create_timeline_struct(
2690 1242 : &self,
2691 1242 : new_timeline_id: TimelineId,
2692 1242 : new_metadata: &TimelineMetadata,
2693 1242 : ancestor: Option<Arc<Timeline>>,
2694 1242 : resources: TimelineResources,
2695 1242 : cause: CreateTimelineCause,
2696 1242 : last_aux_file_policy: Option<AuxFilePolicy>,
2697 1242 : ) -> anyhow::Result<Arc<Timeline>> {
2698 1242 : let state = match cause {
2699 : CreateTimelineCause::Load => {
2700 1242 : let ancestor_id = new_metadata.ancestor_timeline();
2701 1242 : anyhow::ensure!(
2702 1242 : ancestor_id == ancestor.as_ref().map(|t| t.timeline_id),
2703 0 : "Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found"
2704 : );
2705 1242 : TimelineState::Loading
2706 : }
2707 0 : CreateTimelineCause::Delete => TimelineState::Stopping,
2708 : };
2709 :
2710 1242 : let pg_version = new_metadata.pg_version();
2711 1242 :
2712 1242 : let timeline = Timeline::new(
2713 1242 : self.conf,
2714 1242 : Arc::clone(&self.tenant_conf),
2715 1242 : new_metadata,
2716 1242 : ancestor,
2717 1242 : new_timeline_id,
2718 1242 : self.tenant_shard_id,
2719 1242 : self.generation,
2720 1242 : self.shard_identity,
2721 1242 : self.walredo_mgr.clone(),
2722 1242 : resources,
2723 1242 : pg_version,
2724 1242 : state,
2725 1242 : last_aux_file_policy,
2726 1242 : self.cancel.child_token(),
2727 1242 : );
2728 1242 :
2729 1242 : Ok(timeline)
2730 1242 : }
2731 :
2732 : // Allow too_many_arguments because a constructor's argument list naturally grows with the
2733 : // number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
2734 : #[allow(clippy::too_many_arguments)]
2735 570 : fn new(
2736 570 : state: TenantState,
2737 570 : conf: &'static PageServerConf,
2738 570 : attached_conf: AttachedTenantConf,
2739 570 : shard_identity: ShardIdentity,
2740 570 : walredo_mgr: Option<Arc<WalRedoManager>>,
2741 570 : tenant_shard_id: TenantShardId,
2742 570 : remote_storage: GenericRemoteStorage,
2743 570 : deletion_queue_client: DeletionQueueClient,
2744 570 : l0_flush_global_state: L0FlushGlobalState,
2745 570 : ) -> Tenant {
2746 570 : debug_assert!(
2747 570 : !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
2748 : );
2749 :
2750 570 : let (state, mut rx) = watch::channel(state);
2751 570 :
2752 570 : tokio::spawn(async move {
2753 570 : // reflect tenant state in metrics:
2754 570 : // - global per tenant state: TENANT_STATE_METRIC
2755 570 : // - "set" of broken tenants: BROKEN_TENANTS_SET
2756 570 : //
2757 570 : // set of broken tenants should not have zero counts so that it remains accessible for
2758 570 : // alerting.
2759 570 :
2760 570 : let tid = tenant_shard_id.to_string();
2761 570 : let shard_id = tenant_shard_id.shard_slug().to_string();
2762 570 : let set_key = &[tid.as_str(), shard_id.as_str()][..];
2763 :
2764 1134 : fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
2765 1134 : ([state.into()], matches!(state, TenantState::Broken { .. }))
2766 1134 : }
2767 :
2768 570 : let mut tuple = inspect_state(&rx.borrow_and_update());
2769 570 :
2770 570 : let is_broken = tuple.1;
2771 570 : let mut counted_broken = if is_broken {
2772 : // add the id to the set right away, there should not be any updates on the channel
2773 : // after before tenant is removed, if ever
2774 0 : BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
2775 0 : true
2776 : } else {
2777 570 : false
2778 : };
2779 :
2780 : loop {
2781 1134 : let labels = &tuple.0;
2782 1134 : let current = TENANT_STATE_METRIC.with_label_values(labels);
2783 1134 : current.inc();
2784 1134 :
2785 1134 : if rx.changed().await.is_err() {
2786 : // tenant has been dropped
2787 48 : current.dec();
2788 48 : drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
2789 48 : break;
2790 564 : }
2791 564 :
2792 564 : current.dec();
2793 564 : tuple = inspect_state(&rx.borrow_and_update());
2794 564 :
2795 564 : let is_broken = tuple.1;
2796 564 : if is_broken && !counted_broken {
2797 0 : counted_broken = true;
2798 0 : // insert the tenant_id (back) into the set while avoiding needless counter
2799 0 : // access
2800 0 : BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
2801 564 : }
2802 : }
2803 570 : });
2804 570 :
2805 570 : Tenant {
2806 570 : tenant_shard_id,
2807 570 : shard_identity,
2808 570 : generation: attached_conf.location.generation,
2809 570 : conf,
2810 570 : // using now here is good enough approximation to catch tenants with really long
2811 570 : // activation times.
2812 570 : constructed_at: Instant::now(),
2813 570 : timelines: Mutex::new(HashMap::new()),
2814 570 : timelines_creating: Mutex::new(HashSet::new()),
2815 570 : gc_cs: tokio::sync::Mutex::new(()),
2816 570 : walredo_mgr,
2817 570 : remote_storage,
2818 570 : deletion_queue_client,
2819 570 : state,
2820 570 : cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
2821 570 : cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
2822 570 : eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
2823 570 : compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
2824 570 : format!("compaction-{tenant_shard_id}"),
2825 570 : 5,
2826 570 : // Compaction can be a very expensive operation, and might leak disk space. It also ought
2827 570 : // to be infallible, as long as remote storage is available. So if it repeatedly fails,
2828 570 : // use an extremely long backoff.
2829 570 : Some(Duration::from_secs(3600 * 24)),
2830 570 : )),
2831 570 : activate_now_sem: tokio::sync::Semaphore::new(0),
2832 570 : cancel: CancellationToken::default(),
2833 570 : gate: Gate::default(),
2834 570 : timeline_get_throttle: Arc::new(throttle::Throttle::new(
2835 570 : Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
2836 570 : crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
2837 570 : )),
2838 570 : tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
2839 570 : ongoing_timeline_detach: std::sync::Mutex::default(),
2840 570 : gc_block: Default::default(),
2841 570 : l0_flush_global_state,
2842 570 : }
2843 570 : }
2844 :
2845 : /// Locate and load config
2846 0 : pub(super) fn load_tenant_config(
2847 0 : conf: &'static PageServerConf,
2848 0 : tenant_shard_id: &TenantShardId,
2849 0 : ) -> Result<LocationConf, LoadConfigError> {
2850 0 : let config_path = conf.tenant_location_config_path(tenant_shard_id);
2851 0 :
2852 0 : info!("loading tenant configuration from {config_path}");
2853 :
2854 : // load and parse file
2855 0 : let config = fs::read_to_string(&config_path).map_err(|e| {
2856 0 : match e.kind() {
2857 : std::io::ErrorKind::NotFound => {
2858 : // The config should almost always exist for a tenant directory:
2859 : // - When attaching a tenant, the config is the first thing we write
2860 : // - When detaching a tenant, we atomically move the directory to a tmp location
2861 : // before deleting contents.
2862 : //
2863 : // The very rare edge case that can result in a missing config is if we crash during attach
2864 : // between creating directory and writing config. Callers should handle that as if the
2865 : // directory didn't exist.
2866 :
2867 0 : LoadConfigError::NotFound(config_path)
2868 : }
2869 : _ => {
2870 : // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues
2871 : // that we cannot cleanly recover
2872 0 : crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file")
2873 : }
2874 : }
2875 0 : })?;
2876 :
2877 0 : Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
2878 0 : }
2879 :
2880 0 : #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
2881 : pub(super) async fn persist_tenant_config(
2882 : conf: &'static PageServerConf,
2883 : tenant_shard_id: &TenantShardId,
2884 : location_conf: &LocationConf,
2885 : ) -> std::io::Result<()> {
2886 : let config_path = conf.tenant_location_config_path(tenant_shard_id);
2887 :
2888 : Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
2889 : }
2890 :
2891 0 : #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
2892 : pub(super) async fn persist_tenant_config_at(
2893 : tenant_shard_id: &TenantShardId,
2894 : config_path: &Utf8Path,
2895 : location_conf: &LocationConf,
2896 : ) -> std::io::Result<()> {
2897 : debug!("persisting tenantconf to {config_path}");
2898 :
2899 : let mut conf_content = r#"# This file contains a specific per-tenant's config.
2900 : # It is read in case of pageserver restart.
2901 : "#
2902 : .to_string();
2903 :
2904 0 : fail::fail_point!("tenant-config-before-write", |_| {
2905 0 : Err(std::io::Error::new(
2906 0 : std::io::ErrorKind::Other,
2907 0 : "tenant-config-before-write",
2908 0 : ))
2909 0 : });
2910 :
2911 : // Convert the config to a toml file.
2912 : conf_content +=
2913 : &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed");
2914 :
2915 : let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);
2916 :
2917 : let conf_content = conf_content.into_bytes();
2918 : VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
2919 : }
2920 :
2921 : //
2922 : // How garbage collection works:
2923 : //
2924 : // +--bar------------->
2925 : // /
2926 : // +----+-----foo---------------->
2927 : // /
2928 : // ----main--+-------------------------->
2929 : // \
2930 : // +-----baz-------->
2931 : //
2932 : //
2933 : // 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's
2934 : // `gc_infos` are being refreshed
2935 : // 2. Scan collected timelines, and on each timeline, make note of the
2936 : // all the points where other timelines have been branched off.
2937 : // We will refrain from removing page versions at those LSNs.
2938 : // 3. For each timeline, scan all layer files on the timeline.
2939 : // Remove all files for which a newer file exists and which
2940 : // don't cover any branch point LSNs.
2941 : //
2942 : // TODO:
2943 : // - if a relation has a non-incremental persistent layer on a child branch, then we
2944 : // don't need to keep that in the parent anymore. But currently
2945 : // we do.
2946 2262 : async fn gc_iteration_internal(
2947 2262 : &self,
2948 2262 : target_timeline_id: Option<TimelineId>,
2949 2262 : horizon: u64,
2950 2262 : pitr: Duration,
2951 2262 : cancel: &CancellationToken,
2952 2262 : ctx: &RequestContext,
2953 2262 : ) -> Result<GcResult, GcError> {
2954 2262 : let mut totals: GcResult = Default::default();
2955 2262 : let now = Instant::now();
2956 :
2957 2262 : let gc_timelines = self
2958 2262 : .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
2959 2232 : .await?;
2960 :
2961 2262 : failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
2962 :
2963 : // If there is nothing to GC, we don't want any messages in the INFO log.
2964 2262 : if !gc_timelines.is_empty() {
2965 2262 : info!("{} timelines need GC", gc_timelines.len());
2966 : } else {
2967 0 : debug!("{} timelines need GC", gc_timelines.len());
2968 : }
2969 :
2970 : // Perform GC for each timeline.
2971 : //
2972 : // Note that we don't hold the `Tenant::gc_cs` lock here because we don't want to delay the
2973 : // branch creation task, which requires the GC lock. A GC iteration can run concurrently
2974 : // with branch creation.
2975 : //
2976 : // See comments in [`Tenant::branch_timeline`] for more information about why branch
2977 : // creation task can run concurrently with timeline's GC iteration.
2978 4524 : for timeline in gc_timelines {
2979 2262 : if cancel.is_cancelled() {
2980 : // We were requested to shut down. Stop and return with the progress we
2981 : // made.
2982 0 : break;
2983 2262 : }
2984 2262 : let result = match timeline.gc().await {
2985 : Err(GcError::TimelineCancelled) => {
2986 0 : if target_timeline_id.is_some() {
2987 : // If we were targetting this specific timeline, surface cancellation to caller
2988 0 : return Err(GcError::TimelineCancelled);
2989 : } else {
2990 : // A timeline may be shutting down independently of the tenant's lifecycle: we should
2991 : // skip past this and proceed to try GC on other timelines.
2992 0 : continue;
2993 : }
2994 : }
2995 2262 : r => r?,
2996 : };
2997 2262 : totals += result;
2998 : }
2999 :
3000 2262 : totals.elapsed = now.elapsed();
3001 2262 : Ok(totals)
3002 2262 : }
3003 :
3004 : /// Refreshes the Timeline::gc_info for all timelines, returning the
3005 : /// vector of timelines which have [`Timeline::get_last_record_lsn`] past
3006 : /// [`Tenant::get_gc_horizon`].
3007 : ///
3008 : /// This is usually executed as part of periodic gc, but can now be triggered more often.
3009 0 : pub(crate) async fn refresh_gc_info(
3010 0 : &self,
3011 0 : cancel: &CancellationToken,
3012 0 : ctx: &RequestContext,
3013 0 : ) -> Result<Vec<Arc<Timeline>>, GcError> {
3014 0 : // since this method can now be called at different rates than the configured gc loop, it
3015 0 : // might be that these configuration values get applied faster than what it was previously,
3016 0 : // since these were only read from the gc task.
3017 0 : let horizon = self.get_gc_horizon();
3018 0 : let pitr = self.get_pitr_interval();
3019 0 :
3020 0 : // refresh all timelines
3021 0 : let target_timeline_id = None;
3022 0 :
3023 0 : self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
3024 0 : .await
3025 0 : }
3026 :
3027 : /// Populate all Timelines' `GcInfo` with information about their children. We do not set the
3028 : /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`]
3029 : ///
3030 : /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion.
3031 0 : fn initialize_gc_info(
3032 0 : &self,
3033 0 : timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
3034 0 : ) {
3035 0 : // This function must be called before activation: after activation timeline create/delete operations
3036 0 : // might happen, and this function is not safe to run concurrently with those.
3037 0 : assert!(!self.is_active());
3038 :
3039 : // Scan all timelines. For each timeline, remember the timeline ID and
3040 : // the branch point where it was created.
3041 0 : let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> = BTreeMap::new();
3042 0 : timelines.iter().for_each(|(timeline_id, timeline_entry)| {
3043 0 : if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
3044 0 : let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
3045 0 : ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id));
3046 0 : }
3047 0 : });
3048 0 :
3049 0 : // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines
3050 0 : let horizon = self.get_gc_horizon();
3051 :
3052 : // Populate each timeline's GcInfo with information about its child branches
3053 0 : for timeline in timelines.values() {
3054 0 : let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
3055 0 : .remove(&timeline.timeline_id)
3056 0 : .unwrap_or_default();
3057 0 :
3058 0 : branchpoints.sort_by_key(|b| b.0);
3059 0 :
3060 0 : let mut target = timeline.gc_info.write().unwrap();
3061 0 :
3062 0 : target.retain_lsns = branchpoints;
3063 0 :
3064 0 : let space_cutoff = timeline
3065 0 : .get_last_record_lsn()
3066 0 : .checked_sub(horizon)
3067 0 : .unwrap_or(Lsn(0));
3068 0 :
3069 0 : target.cutoffs = GcCutoffs {
3070 0 : space: space_cutoff,
3071 0 : time: Lsn::INVALID,
3072 0 : };
3073 0 : }
3074 0 : }
3075 :
3076 2262 : async fn refresh_gc_info_internal(
3077 2262 : &self,
3078 2262 : target_timeline_id: Option<TimelineId>,
3079 2262 : horizon: u64,
3080 2262 : pitr: Duration,
3081 2262 : cancel: &CancellationToken,
3082 2262 : ctx: &RequestContext,
3083 2262 : ) -> Result<Vec<Arc<Timeline>>, GcError> {
3084 2262 : // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
3085 2262 : // currently visible timelines.
3086 2262 : let timelines = self
3087 2262 : .timelines
3088 2262 : .lock()
3089 2262 : .unwrap()
3090 2262 : .values()
3091 9930 : .filter(|tl| match target_timeline_id.as_ref() {
3092 9930 : Some(target) => &tl.timeline_id == target,
3093 0 : None => true,
3094 9930 : })
3095 2262 : .cloned()
3096 2262 : .collect::<Vec<_>>();
3097 2262 :
3098 2262 : if target_timeline_id.is_some() && timelines.is_empty() {
3099 : // We were to act on a particular timeline and it wasn't found
3100 0 : return Err(GcError::TimelineNotFound);
3101 2262 : }
3102 2262 :
3103 2262 : let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
3104 2262 : HashMap::with_capacity(timelines.len());
3105 :
3106 2262 : for timeline in timelines.iter() {
3107 2262 : let cutoff = timeline
3108 2262 : .get_last_record_lsn()
3109 2262 : .checked_sub(horizon)
3110 2262 : .unwrap_or(Lsn(0));
3111 :
3112 2262 : let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
3113 2262 : let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
3114 2262 : assert!(old.is_none());
3115 : }
3116 :
3117 2262 : if !self.is_active() || self.cancel.is_cancelled() {
3118 0 : return Err(GcError::TenantCancelled);
3119 2262 : }
3120 :
3121 : // grab mutex to prevent new timelines from being created here; avoid doing long operations
3122 : // because that will stall branch creation.
3123 2262 : let gc_cs = self.gc_cs.lock().await;
3124 :
3125 : // Ok, we now know all the branch points.
3126 : // Update the GC information for each timeline.
3127 2262 : let mut gc_timelines = Vec::with_capacity(timelines.len());
3128 4524 : for timeline in timelines {
3129 : // We filtered the timeline list above
3130 2262 : if let Some(target_timeline_id) = target_timeline_id {
3131 2262 : assert_eq!(target_timeline_id, timeline.timeline_id);
3132 0 : }
3133 :
3134 : {
3135 2262 : let mut target = timeline.gc_info.write().unwrap();
3136 2262 :
3137 2262 : // Cull any expired leases
3138 2262 : let now = SystemTime::now();
3139 2262 : target.leases.retain(|_, lease| !lease.is_expired(&now));
3140 2262 :
3141 2262 : timeline
3142 2262 : .metrics
3143 2262 : .valid_lsn_lease_count_gauge
3144 2262 : .set(target.leases.len() as u64);
3145 :
3146 : // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
3147 2262 : if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
3148 300 : if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
3149 0 : target.within_ancestor_pitr =
3150 0 : timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
3151 300 : }
3152 1962 : }
3153 :
3154 : // Update metrics that depend on GC state
3155 2262 : timeline
3156 2262 : .metrics
3157 2262 : .archival_size
3158 2262 : .set(if target.within_ancestor_pitr {
3159 0 : timeline.metrics.current_logical_size_gauge.get()
3160 : } else {
3161 2262 : 0
3162 : });
3163 2262 : timeline.metrics.pitr_history_size.set(
3164 2262 : timeline
3165 2262 : .get_last_record_lsn()
3166 2262 : .checked_sub(target.cutoffs.time)
3167 2262 : .unwrap_or(Lsn(0))
3168 2262 : .0,
3169 2262 : );
3170 :
3171 : // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
3172 : // - this timeline was created while we were finding cutoffs
3173 : // - lsn for timestamp search fails for this timeline repeatedly
3174 2262 : if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
3175 2262 : target.cutoffs = cutoffs.clone();
3176 2262 : }
3177 : }
3178 :
3179 2262 : gc_timelines.push(timeline);
3180 : }
3181 2262 : drop(gc_cs);
3182 2262 : Ok(gc_timelines)
3183 2262 : }
3184 :
/// Unit-test stand-in for `branch_timeline`.
///
/// The returned timeline has its state set to `Active` so that various
/// `anyhow::ensure!()` calls pass, but `.activate()` is not actually called under the
/// hood. So none of the timeline background tasks are launched, except the flush loop.
#[cfg(test)]
async fn branch_timeline_test(
    &self,
    src_timeline: &Arc<Timeline>,
    dst_id: TimelineId,
    ancestor_lsn: Option<Lsn>,
    ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
    let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
    let branching =
        self.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx);

    let branched = branching.await?;
    branched.set_state(TimelineState::Active);
    Ok(branched)
}
3204 :
/// Unit-test helper: branches `src_timeline` and pre-loads the new timeline with the
/// given delta and image layers.
///
/// The new timeline's LSN is force-advanced to `end_lsn`, which must not be below the
/// branch point (`ancestor_lsn`, or the new timeline's last record LSN when `None`).
/// Fails if the resulting layer map does not pass `check_valid_layermap`.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn branch_timeline_test_with_layers(
    &self,
    src_timeline: &Arc<Timeline>,
    dst_id: TimelineId,
    ancestor_lsn: Option<Lsn>,
    ctx: &RequestContext,
    delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
    image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
    end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
    use checks::check_valid_layermap;

    let tline = self
        .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
        .await?;

    // Without an explicit branch point, use the new timeline's last record LSN.
    let ancestor_lsn = ancestor_lsn.unwrap_or_else(|| tline.get_last_record_lsn());
    assert!(end_lsn >= ancestor_lsn);
    tline.force_advance_lsn(end_lsn);

    for deltas in delta_layer_desc {
        tline
            .force_create_delta_layer(deltas, Some(ancestor_lsn), ctx)
            .await?;
    }
    for (lsn, images) in image_layer_desc {
        tline
            .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
            .await?;
    }

    // Sanity-check the layer map that the forced layers produced.
    let layer_names: Vec<_> = tline
        .layers
        .read()
        .await
        .layer_map()
        .unwrap()
        .iter_historic_layers()
        .map(|layer| layer.layer_name())
        .collect();
    match check_valid_layermap(&layer_names) {
        Some(err) => bail!("invalid layermap: {err}"),
        None => Ok(tline),
    }
}
3255 :
3256 : /// Branch an existing timeline.
3257 : ///
3258 : /// The caller is responsible for activating the returned timeline.
3259 0 : async fn branch_timeline(
3260 0 : &self,
3261 0 : src_timeline: &Arc<Timeline>,
3262 0 : dst_id: TimelineId,
3263 0 : start_lsn: Option<Lsn>,
3264 0 : timeline_create_guard: TimelineCreateGuard<'_>,
3265 0 : ctx: &RequestContext,
3266 0 : ) -> Result<Arc<Timeline>, CreateTimelineError> {
3267 0 : self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
3268 0 : .await
3269 0 : }
3270 :
3271 690 : async fn branch_timeline_impl(
3272 690 : &self,
3273 690 : src_timeline: &Arc<Timeline>,
3274 690 : dst_id: TimelineId,
3275 690 : start_lsn: Option<Lsn>,
3276 690 : timeline_create_guard: TimelineCreateGuard<'_>,
3277 690 : _ctx: &RequestContext,
3278 690 : ) -> Result<Arc<Timeline>, CreateTimelineError> {
3279 690 : let src_id = src_timeline.timeline_id;
3280 :
3281 : // We will validate our ancestor LSN in this function. Acquire the GC lock so that
3282 : // this check cannot race with GC, and the ancestor LSN is guaranteed to remain
3283 : // valid while we are creating the branch.
3284 690 : let _gc_cs = self.gc_cs.lock().await;
3285 :
3286 : // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
3287 690 : let start_lsn = start_lsn.unwrap_or_else(|| {
3288 6 : let lsn = src_timeline.get_last_record_lsn();
3289 6 : info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}");
3290 6 : lsn
3291 690 : });
3292 690 :
3293 690 : // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
3294 690 : // horizon on the source timeline
3295 690 : //
3296 690 : // We check it against both the planned GC cutoff stored in 'gc_info',
3297 690 : // and the 'latest_gc_cutoff' of the last GC that was performed. The
3298 690 : // planned GC cutoff in 'gc_info' is normally larger than
3299 690 : // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
3300 690 : // changed the GC settings for the tenant to make the PITR window
3301 690 : // larger, but some of the data was already removed by an earlier GC
3302 690 : // iteration.
3303 690 :
3304 690 : // check against last actual 'latest_gc_cutoff' first
3305 690 : let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
3306 690 : src_timeline
3307 690 : .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
3308 690 : .context(format!(
3309 690 : "invalid branch start lsn: less than latest GC cutoff {}",
3310 690 : *latest_gc_cutoff_lsn,
3311 690 : ))
3312 690 : .map_err(CreateTimelineError::AncestorLsn)?;
3313 :
3314 : // and then the planned GC cutoff
3315 : {
3316 678 : let gc_info = src_timeline.gc_info.read().unwrap();
3317 678 : let cutoff = gc_info.min_cutoff();
3318 678 : if start_lsn < cutoff {
3319 0 : return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
3320 0 : "invalid branch start lsn: less than planned GC cutoff {cutoff}"
3321 0 : )));
3322 678 : }
3323 678 : }
3324 678 :
3325 678 : //
3326 678 : // The branch point is valid, and we are still holding the 'gc_cs' lock
3327 678 : // so that GC cannot advance the GC cutoff until we are finished.
3328 678 : // Proceed with the branch creation.
3329 678 : //
3330 678 :
3331 678 : // Determine prev-LSN for the new timeline. We can only determine it if
3332 678 : // the timeline was branched at the current end of the source timeline.
3333 678 : let RecordLsn {
3334 678 : last: src_last,
3335 678 : prev: src_prev,
3336 678 : } = src_timeline.get_last_record_rlsn();
3337 678 : let dst_prev = if src_last == start_lsn {
3338 648 : Some(src_prev)
3339 : } else {
3340 30 : None
3341 : };
3342 :
3343 : // Create the metadata file, noting the ancestor of the new timeline.
3344 : // There is initially no data in it, but all the read-calls know to look
3345 : // into the ancestor.
3346 678 : let metadata = TimelineMetadata::new(
3347 678 : start_lsn,
3348 678 : dst_prev,
3349 678 : Some(src_id),
3350 678 : start_lsn,
3351 678 : *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
3352 678 : src_timeline.initdb_lsn,
3353 678 : src_timeline.pg_version,
3354 678 : );
3355 :
3356 678 : let uninitialized_timeline = self
3357 678 : .prepare_new_timeline(
3358 678 : dst_id,
3359 678 : &metadata,
3360 678 : timeline_create_guard,
3361 678 : start_lsn + 1,
3362 678 : Some(Arc::clone(src_timeline)),
3363 678 : src_timeline.last_aux_file_policy.load(),
3364 678 : )
3365 0 : .await?;
3366 :
3367 678 : let new_timeline = uninitialized_timeline.finish_creation()?;
3368 :
3369 : // Root timeline gets its layers during creation and uploads them along with the metadata.
3370 : // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
3371 : // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
3372 : // could get incorrect information and remove more layers, than needed.
3373 : // See also https://github.com/neondatabase/neon/issues/3865
3374 678 : new_timeline
3375 678 : .remote_client
3376 678 : .schedule_index_upload_for_full_metadata_update(&metadata)
3377 678 : .context("branch initial metadata upload")?;
3378 :
3379 678 : Ok(new_timeline)
3380 690 : }
3381 :
/// Test-only wrapper around `bootstrap_timeline`, exposed so other modules can
/// create timelines directly from unit tests.
///
/// Obtains the creation guard for `timeline_id` itself (panicking if that
/// fails) and then delegates to `bootstrap_timeline`.
#[cfg(test)]
#[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn bootstrap_timeline_test(
    &self,
    timeline_id: TimelineId,
    pg_version: u32,
    load_existing_initdb: Option<TimelineId>,
    ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
    let guard = self.create_timeline_create_guard(timeline_id).unwrap();
    let bootstrapping =
        self.bootstrap_timeline(timeline_id, pg_version, load_existing_initdb, guard, ctx);
    bootstrapping.await
}
3402 :
3403 0 : async fn upload_initdb(
3404 0 : &self,
3405 0 : timelines_path: &Utf8PathBuf,
3406 0 : pgdata_path: &Utf8PathBuf,
3407 0 : timeline_id: &TimelineId,
3408 0 : ) -> anyhow::Result<()> {
3409 0 : let temp_path = timelines_path.join(format!(
3410 0 : "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
3411 0 : ));
3412 0 :
3413 0 : scopeguard::defer! {
3414 0 : if let Err(e) = fs::remove_file(&temp_path) {
3415 0 : error!("Failed to remove temporary initdb archive '{temp_path}': {e}");
3416 0 : }
3417 0 : }
3418 :
3419 0 : let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
3420 : const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
3421 0 : if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
3422 0 : warn!(
3423 0 : "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
3424 : );
3425 0 : }
3426 :
3427 0 : pausable_failpoint!("before-initdb-upload");
3428 :
3429 0 : backoff::retry(
3430 0 : || async {
3431 0 : self::remote_timeline_client::upload_initdb_dir(
3432 0 : &self.remote_storage,
3433 0 : &self.tenant_shard_id.tenant_id,
3434 0 : timeline_id,
3435 0 : pgdata_zstd.try_clone().await?,
3436 0 : tar_zst_size,
3437 0 : &self.cancel,
3438 : )
3439 0 : .await
3440 0 : },
3441 0 : |_| false,
3442 0 : 3,
3443 0 : u32::MAX,
3444 0 : "persist_initdb_tar_zst",
3445 0 : &self.cancel,
3446 0 : )
3447 0 : .await
3448 0 : .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
3449 0 : .and_then(|x| x)
3450 0 : }
3451 :
3452 : /// - run initdb to init temporary instance and get bootstrap data
3453 : /// - after initialization completes, tar up the temp dir and upload it to S3.
3454 : ///
3455 : /// The caller is responsible for activating the returned timeline.
3456 6 : async fn bootstrap_timeline(
3457 6 : &self,
3458 6 : timeline_id: TimelineId,
3459 6 : pg_version: u32,
3460 6 : load_existing_initdb: Option<TimelineId>,
3461 6 : timeline_create_guard: TimelineCreateGuard<'_>,
3462 6 : ctx: &RequestContext,
3463 6 : ) -> anyhow::Result<Arc<Timeline>> {
3464 6 : // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
3465 6 : // temporary directory for basebackup files for the given timeline.
3466 6 :
3467 6 : let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
3468 6 : let pgdata_path = path_with_suffix_extension(
3469 6 : timelines_path.join(format!("basebackup-{timeline_id}")),
3470 6 : TEMP_FILE_SUFFIX,
3471 6 : );
3472 6 :
3473 6 : // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
3474 6 : // we won't race with other creations or existent timelines with the same path.
3475 6 : if pgdata_path.exists() {
3476 0 : fs::remove_dir_all(&pgdata_path).with_context(|| {
3477 0 : format!("Failed to remove already existing initdb directory: {pgdata_path}")
3478 0 : })?;
3479 6 : }
3480 :
3481 : // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
3482 6 : scopeguard::defer! {
3483 6 : if let Err(e) = fs::remove_dir_all(&pgdata_path) {
3484 6 : // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
3485 6 : error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
3486 6 : }
3487 6 : }
3488 6 : if let Some(existing_initdb_timeline_id) = load_existing_initdb {
3489 6 : if existing_initdb_timeline_id != timeline_id {
3490 0 : let source_path = &remote_initdb_archive_path(
3491 0 : &self.tenant_shard_id.tenant_id,
3492 0 : &existing_initdb_timeline_id,
3493 0 : );
3494 0 : let dest_path =
3495 0 : &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
3496 0 :
3497 0 : // if this fails, it will get retried by retried control plane requests
3498 0 : self.remote_storage
3499 0 : .copy_object(source_path, dest_path, &self.cancel)
3500 0 : .await
3501 0 : .context("copy initdb tar")?;
3502 6 : }
3503 6 : let (initdb_tar_zst_path, initdb_tar_zst) =
3504 6 : self::remote_timeline_client::download_initdb_tar_zst(
3505 6 : self.conf,
3506 6 : &self.remote_storage,
3507 6 : &self.tenant_shard_id,
3508 6 : &existing_initdb_timeline_id,
3509 6 : &self.cancel,
3510 6 : )
3511 1972 : .await
3512 6 : .context("download initdb tar")?;
3513 :
3514 6 : scopeguard::defer! {
3515 6 : if let Err(e) = fs::remove_file(&initdb_tar_zst_path) {
3516 6 : error!("Failed to remove temporary initdb archive '{initdb_tar_zst_path}': {e}");
3517 6 : }
3518 6 : }
3519 6 :
3520 6 : let buf_read =
3521 6 : BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
3522 6 : extract_zst_tarball(&pgdata_path, buf_read)
3523 27946 : .await
3524 6 : .context("extract initdb tar")?;
3525 : } else {
3526 : // Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path
3527 0 : run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
3528 :
3529 : // Upload the created data dir to S3
3530 0 : if self.tenant_shard_id().is_shard_zero() {
3531 0 : self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
3532 0 : .await?;
3533 0 : }
3534 : }
3535 6 : let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
3536 6 :
3537 6 : // Import the contents of the data directory at the initial checkpoint
3538 6 : // LSN, and any WAL after that.
3539 6 : // Initdb lsn will be equal to last_record_lsn which will be set after import.
3540 6 : // Because we know it upfront avoid having an option or dummy zero value by passing it to the metadata.
3541 6 : let new_metadata = TimelineMetadata::new(
3542 6 : Lsn(0),
3543 6 : None,
3544 6 : None,
3545 6 : Lsn(0),
3546 6 : pgdata_lsn,
3547 6 : pgdata_lsn,
3548 6 : pg_version,
3549 6 : );
3550 6 : let raw_timeline = self
3551 6 : .prepare_new_timeline(
3552 6 : timeline_id,
3553 6 : &new_metadata,
3554 6 : timeline_create_guard,
3555 6 : pgdata_lsn,
3556 6 : None,
3557 6 : None,
3558 6 : )
3559 0 : .await?;
3560 :
3561 6 : let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
3562 6 : let unfinished_timeline = raw_timeline.raw_timeline()?;
3563 :
3564 : // Flush the new layer files to disk, before we make the timeline as available to
3565 : // the outside world.
3566 : //
3567 : // Flush loop needs to be spawned in order to be able to flush.
3568 6 : unfinished_timeline.maybe_spawn_flush_loop();
3569 6 :
3570 6 : import_datadir::import_timeline_from_postgres_datadir(
3571 6 : unfinished_timeline,
3572 6 : &pgdata_path,
3573 6 : pgdata_lsn,
3574 6 : ctx,
3575 6 : )
3576 27697 : .await
3577 6 : .with_context(|| {
3578 0 : format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
3579 6 : })?;
3580 :
3581 6 : fail::fail_point!("before-checkpoint-new-timeline", |_| {
3582 0 : anyhow::bail!("failpoint before-checkpoint-new-timeline");
3583 6 : });
3584 :
3585 6 : unfinished_timeline
3586 6 : .freeze_and_flush()
3587 6 : .await
3588 6 : .with_context(|| {
3589 0 : format!(
3590 0 : "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}"
3591 0 : )
3592 6 : })?;
3593 :
3594 : // All done!
3595 6 : let timeline = raw_timeline.finish_creation()?;
3596 :
3597 6 : Ok(timeline)
3598 6 : }
3599 :
3600 : /// Call this before constructing a timeline, to build its required structures
3601 1224 : fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
3602 1224 : let remote_client = RemoteTimelineClient::new(
3603 1224 : self.remote_storage.clone(),
3604 1224 : self.deletion_queue_client.clone(),
3605 1224 : self.conf,
3606 1224 : self.tenant_shard_id,
3607 1224 : timeline_id,
3608 1224 : self.generation,
3609 1224 : );
3610 1224 : TimelineResources {
3611 1224 : remote_client,
3612 1224 : timeline_get_throttle: self.timeline_get_throttle.clone(),
3613 1224 : l0_flush_global_state: self.l0_flush_global_state.clone(),
3614 1224 : }
3615 1224 : }
3616 :
3617 : /// Creates intermediate timeline structure and its files.
3618 : ///
3619 : /// An empty layer map is initialized, and new data and WAL can be imported starting
3620 : /// at 'disk_consistent_lsn'. After any initial data has been imported, call
3621 : /// `finish_creation` to insert the Timeline into the timelines map.
3622 1224 : async fn prepare_new_timeline<'a>(
3623 1224 : &'a self,
3624 1224 : new_timeline_id: TimelineId,
3625 1224 : new_metadata: &TimelineMetadata,
3626 1224 : create_guard: TimelineCreateGuard<'a>,
3627 1224 : start_lsn: Lsn,
3628 1224 : ancestor: Option<Arc<Timeline>>,
3629 1224 : last_aux_file_policy: Option<AuxFilePolicy>,
3630 1224 : ) -> anyhow::Result<UninitializedTimeline> {
3631 1224 : let tenant_shard_id = self.tenant_shard_id;
3632 1224 :
3633 1224 : let resources = self.build_timeline_resources(new_timeline_id);
3634 1224 : resources
3635 1224 : .remote_client
3636 1224 : .init_upload_queue_for_empty_remote(new_metadata)?;
3637 :
3638 1224 : let timeline_struct = self
3639 1224 : .create_timeline_struct(
3640 1224 : new_timeline_id,
3641 1224 : new_metadata,
3642 1224 : ancestor,
3643 1224 : resources,
3644 1224 : CreateTimelineCause::Load,
3645 1224 : last_aux_file_policy,
3646 1224 : )
3647 1224 : .context("Failed to create timeline data structure")?;
3648 :
3649 1224 : timeline_struct.init_empty_layer_map(start_lsn);
3650 :
3651 1224 : if let Err(e) = self
3652 1224 : .create_timeline_files(&create_guard.timeline_path)
3653 0 : .await
3654 : {
3655 0 : error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
3656 0 : cleanup_timeline_directory(create_guard);
3657 0 : return Err(e);
3658 1224 : }
3659 1224 :
3660 1224 : debug!(
3661 0 : "Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}"
3662 : );
3663 :
3664 1224 : Ok(UninitializedTimeline::new(
3665 1224 : self,
3666 1224 : new_timeline_id,
3667 1224 : Some((timeline_struct, create_guard)),
3668 1224 : ))
3669 1224 : }
3670 :
3671 1224 : async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
3672 1224 : crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
3673 :
3674 1224 : fail::fail_point!("after-timeline-dir-creation", |_| {
3675 0 : anyhow::bail!("failpoint after-timeline-dir-creation");
3676 1224 : });
3677 :
3678 1224 : Ok(())
3679 1224 : }
3680 :
3681 : /// Get a guard that provides exclusive access to the timeline directory, preventing
3682 : /// concurrent attempts to create the same timeline.
3683 1242 : fn create_timeline_create_guard(
3684 1242 : &self,
3685 1242 : timeline_id: TimelineId,
3686 1242 : ) -> Result<TimelineCreateGuard, TimelineExclusionError> {
3687 1242 : let tenant_shard_id = self.tenant_shard_id;
3688 1242 :
3689 1242 : let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
3690 :
3691 1242 : let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
3692 :
3693 : // At this stage, we have got exclusive access to in-memory state for this timeline ID
3694 : // for creation.
3695 : // A timeline directory should never exist on disk already:
3696 : // - a previous failed creation would have cleaned up after itself
3697 : // - a pageserver restart would clean up timeline directories that don't have valid remote state
3698 : //
3699 : // Therefore it is an unexpected internal error to encounter a timeline directory already existing here,
3700 : // this error may indicate a bug in cleanup on failed creations.
3701 1236 : if timeline_path.exists() {
3702 0 : return Err(TimelineExclusionError::Other(anyhow::anyhow!(
3703 0 : "Timeline directory already exists! This is a bug."
3704 0 : )));
3705 1236 : }
3706 1236 :
3707 1236 : Ok(create_guard)
3708 1242 : }
3709 :
3710 : /// Gathers inputs from all of the timelines to produce a sizing model input.
3711 : ///
3712 : /// Future is cancellation safe. Only one calculation can be running at once per tenant.
3713 0 : #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
3714 : pub async fn gather_size_inputs(
3715 : &self,
3716 : // `max_retention_period` overrides the cutoff that is used to calculate the size
3717 : // (only if it is shorter than the real cutoff).
3718 : max_retention_period: Option<u64>,
3719 : cause: LogicalSizeCalculationCause,
3720 : cancel: &CancellationToken,
3721 : ctx: &RequestContext,
3722 : ) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
3723 : let logical_sizes_at_once = self
3724 : .conf
3725 : .concurrent_tenant_size_logical_size_queries
3726 : .inner();
3727 :
3728 : // TODO: Having a single mutex block concurrent reads is not great for performance.
3729 : //
3730 : // But the only case where we need to run multiple of these at once is when we
3731 : // request a size for a tenant manually via API, while another background calculation
3732 : // is in progress (which is not a common case).
3733 : //
3734 : // See more for on the issue #2748 condenced out of the initial PR review.
3735 : let mut shared_cache = tokio::select! {
3736 : locked = self.cached_logical_sizes.lock() => locked,
3737 : _ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
3738 : _ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
3739 : };
3740 :
3741 : size::gather_inputs(
3742 : self,
3743 : logical_sizes_at_once,
3744 : max_retention_period,
3745 : &mut shared_cache,
3746 : cause,
3747 : cancel,
3748 : ctx,
3749 : )
3750 : .await
3751 : }
3752 :
3753 : /// Calculate synthetic tenant size and cache the result.
3754 : /// This is periodically called by background worker.
3755 : /// result is cached in tenant struct
3756 0 : #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
3757 : pub async fn calculate_synthetic_size(
3758 : &self,
3759 : cause: LogicalSizeCalculationCause,
3760 : cancel: &CancellationToken,
3761 : ctx: &RequestContext,
3762 : ) -> Result<u64, size::CalculateSyntheticSizeError> {
3763 : let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;
3764 :
3765 : let size = inputs.calculate();
3766 :
3767 : self.set_cached_synthetic_size(size);
3768 :
3769 : Ok(size)
3770 : }
3771 :
3772 : /// Cache given synthetic size and update the metric value
3773 0 : pub fn set_cached_synthetic_size(&self, size: u64) {
3774 0 : self.cached_synthetic_tenant_size
3775 0 : .store(size, Ordering::Relaxed);
3776 0 :
3777 0 : // Only shard zero should be calculating synthetic sizes
3778 0 : debug_assert!(self.shard_identity.is_shard_zero());
3779 :
3780 0 : TENANT_SYNTHETIC_SIZE_METRIC
3781 0 : .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
3782 0 : .unwrap()
3783 0 : .set(size);
3784 0 : }
3785 :
3786 0 : pub fn cached_synthetic_size(&self) -> u64 {
3787 0 : self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
3788 0 : }
3789 :
3790 : /// Flush any in-progress layers, schedule uploads, and wait for uploads to complete.
3791 : ///
3792 : /// This function can take a long time: callers should wrap it in a timeout if calling
3793 : /// from an external API handler.
3794 : ///
3795 : /// Cancel-safety: cancelling this function may leave I/O running, but such I/O is
3796 : /// still bounded by tenant/timeline shutdown.
3797 0 : #[tracing::instrument(skip_all)]
3798 : pub(crate) async fn flush_remote(&self) -> anyhow::Result<()> {
3799 : let timelines = self.timelines.lock().unwrap().clone();
3800 :
3801 0 : async fn flush_timeline(_gate: GateGuard, timeline: Arc<Timeline>) -> anyhow::Result<()> {
3802 0 : tracing::info!(timeline_id=%timeline.timeline_id, "Flushing...");
3803 0 : timeline.freeze_and_flush().await?;
3804 0 : tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads...");
3805 0 : timeline.remote_client.wait_completion().await?;
3806 :
3807 0 : Ok(())
3808 0 : }
3809 :
3810 : // We do not use a JoinSet for these tasks, because we don't want them to be
3811 : // aborted when this function's future is cancelled: they should stay alive
3812 : // holding their GateGuard until they complete, to ensure their I/Os complete
3813 : // before Timeline shutdown completes.
3814 : let mut results = FuturesUnordered::new();
3815 :
3816 : for (_timeline_id, timeline) in timelines {
3817 : // Run each timeline's flush in a task holding the timeline's gate: this
3818 : // means that if this function's future is cancelled, the Timeline shutdown
3819 : // will still wait for any I/O in here to complete.
3820 : let Ok(gate) = timeline.gate.enter() else {
3821 : continue;
3822 : };
3823 0 : let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
3824 : results.push(jh);
3825 : }
3826 :
3827 : while let Some(r) = results.next().await {
3828 : if let Err(e) = r {
3829 : if !e.is_cancelled() && !e.is_panic() {
3830 : tracing::error!("unexpected join error: {e:?}");
3831 : }
3832 : }
3833 : }
3834 :
3835 : // The flushes we did above were just writes, but the Tenant might have had
3836 : // pending deletions as well from recent compaction/gc: we want to flush those
3837 : // as well. This requires flushing the global delete queue. This is cheap
3838 : // because it's typically a no-op.
3839 : match self.deletion_queue_client.flush_execute().await {
3840 : Ok(_) => {}
3841 : Err(DeletionQueueError::ShuttingDown) => {}
3842 : }
3843 :
3844 : Ok(())
3845 : }
3846 :
3847 0 : pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
3848 0 : self.tenant_conf.load().tenant_conf.clone()
3849 0 : }
3850 :
3851 : /// How much local storage would this tenant like to have? It can cope with
3852 : /// less than this (via eviction and on-demand downloads), but this function enables
3853 : /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
3854 : /// by keeping important things on local disk.
3855 : ///
3856 : /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
3857 : /// than they report here, due to layer eviction. Tenants with many active branches may
3858 : /// actually use more than they report here.
3859 0 : pub(crate) fn local_storage_wanted(&self) -> u64 {
3860 0 : let timelines = self.timelines.lock().unwrap();
3861 0 :
3862 0 : // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
3863 0 : // reflects the observation that on tenants with multiple large branches, typically only one
3864 0 : // of them is used actively enough to occupy space on disk.
3865 0 : timelines
3866 0 : .values()
3867 0 : .map(|t| t.metrics.visible_physical_size_gauge.get())
3868 0 : .max()
3869 0 : .unwrap_or(0)
3870 0 : }
3871 : }
3872 :
3873 : /// Create the cluster temporarily in 'initdbpath' directory inside the repository
3874 : /// to get bootstrap data for timeline initialization.
3875 0 : async fn run_initdb(
3876 0 : conf: &'static PageServerConf,
3877 0 : initdb_target_dir: &Utf8Path,
3878 0 : pg_version: u32,
3879 0 : cancel: &CancellationToken,
3880 0 : ) -> Result<(), InitdbError> {
3881 0 : let initdb_bin_path = conf
3882 0 : .pg_bin_dir(pg_version)
3883 0 : .map_err(InitdbError::Other)?
3884 0 : .join("initdb");
3885 0 : let initdb_lib_dir = conf.pg_lib_dir(pg_version).map_err(InitdbError::Other)?;
3886 0 : info!(
3887 0 : "running {} in {}, libdir: {}",
3888 : initdb_bin_path, initdb_target_dir, initdb_lib_dir,
3889 : );
3890 :
3891 0 : let _permit = INIT_DB_SEMAPHORE.acquire().await;
3892 :
3893 0 : let initdb_command = tokio::process::Command::new(&initdb_bin_path)
3894 0 : .args(["-D", initdb_target_dir.as_ref()])
3895 0 : .args(["-U", &conf.superuser])
3896 0 : .args(["-E", "utf8"])
3897 0 : .arg("--no-instructions")
3898 0 : .arg("--no-sync")
3899 0 : .env_clear()
3900 0 : .env("LD_LIBRARY_PATH", &initdb_lib_dir)
3901 0 : .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
3902 0 : .stdin(std::process::Stdio::null())
3903 0 : // stdout invocation produces the same output every time, we don't need it
3904 0 : .stdout(std::process::Stdio::null())
3905 0 : // we would be interested in the stderr output, if there was any
3906 0 : .stderr(std::process::Stdio::piped())
3907 0 : .spawn()?;
3908 :
3909 : // Ideally we'd select here with the cancellation token, but the problem is that
3910 : // we can't safely terminate initdb: it launches processes of its own, and killing
3911 : // initdb doesn't kill them. After we return from this function, we want the target
3912 : // directory to be able to be cleaned up.
3913 : // See https://github.com/neondatabase/neon/issues/6385
3914 0 : let initdb_output = initdb_command.wait_with_output().await?;
3915 0 : if !initdb_output.status.success() {
3916 0 : return Err(InitdbError::Failed(
3917 0 : initdb_output.status,
3918 0 : initdb_output.stderr,
3919 0 : ));
3920 0 : }
3921 0 :
3922 0 : // This isn't true cancellation support, see above. Still return an error to
3923 0 : // excercise the cancellation code path.
3924 0 : if cancel.is_cancelled() {
3925 0 : return Err(InitdbError::Cancelled);
3926 0 : }
3927 0 :
3928 0 : Ok(())
3929 0 : }
3930 :
3931 : /// Dump contents of a layer file to stdout.
3932 0 : pub async fn dump_layerfile_from_path(
3933 0 : path: &Utf8Path,
3934 0 : verbose: bool,
3935 0 : ctx: &RequestContext,
3936 0 : ) -> anyhow::Result<()> {
3937 : use std::os::unix::fs::FileExt;
3938 :
3939 : // All layer files start with a two-byte "magic" value, to identify the kind of
3940 : // file.
3941 0 : let file = File::open(path)?;
3942 0 : let mut header_buf = [0u8; 2];
3943 0 : file.read_exact_at(&mut header_buf, 0)?;
3944 :
3945 0 : match u16::from_be_bytes(header_buf) {
3946 : crate::IMAGE_FILE_MAGIC => {
3947 0 : ImageLayer::new_for_path(path, file)?
3948 0 : .dump(verbose, ctx)
3949 0 : .await?
3950 : }
3951 : crate::DELTA_FILE_MAGIC => {
3952 0 : DeltaLayer::new_for_path(path, file)?
3953 0 : .dump(verbose, ctx)
3954 0 : .await?
3955 : }
3956 0 : magic => bail!("unrecognized magic identifier: {:?}", magic),
3957 : }
3958 :
3959 0 : Ok(())
3960 0 : }
3961 :
3962 : #[cfg(test)]
3963 : pub(crate) mod harness {
3964 : use bytes::{Bytes, BytesMut};
3965 : use once_cell::sync::OnceCell;
3966 : use pageserver_api::models::ShardParameters;
3967 : use pageserver_api::shard::ShardIndex;
3968 : use utils::logging;
3969 :
3970 : use crate::deletion_queue::mock::MockDeletionQueue;
3971 : use crate::l0_flush::L0FlushConfig;
3972 : use crate::walredo::apply_neon;
3973 : use crate::{repository::Key, walrecord::NeonWalRecord};
3974 :
3975 : use super::*;
3976 : use hex_literal::hex;
3977 : use utils::id::TenantId;
3978 :
3979 : pub const TIMELINE_ID: TimelineId =
3980 : TimelineId::from_array(hex!("11223344556677881122334455667788"));
3981 : pub const NEW_TIMELINE_ID: TimelineId =
3982 : TimelineId::from_array(hex!("AA223344556677881122334455667788"));
3983 :
3984 : /// Convenience function to create a page image with given string as the only content
3985 15086163 : pub fn test_img(s: &str) -> Bytes {
3986 15086163 : let mut buf = BytesMut::new();
3987 15086163 : buf.extend_from_slice(s.as_bytes());
3988 15086163 : buf.resize(64, 0);
3989 15086163 :
3990 15086163 : buf.freeze()
3991 15086163 : }
3992 :
3993 : impl From<TenantConf> for TenantConfOpt {
3994 570 : fn from(tenant_conf: TenantConf) -> Self {
3995 570 : Self {
3996 570 : checkpoint_distance: Some(tenant_conf.checkpoint_distance),
3997 570 : checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
3998 570 : compaction_target_size: Some(tenant_conf.compaction_target_size),
3999 570 : compaction_period: Some(tenant_conf.compaction_period),
4000 570 : compaction_threshold: Some(tenant_conf.compaction_threshold),
4001 570 : compaction_algorithm: Some(tenant_conf.compaction_algorithm),
4002 570 : gc_horizon: Some(tenant_conf.gc_horizon),
4003 570 : gc_period: Some(tenant_conf.gc_period),
4004 570 : image_creation_threshold: Some(tenant_conf.image_creation_threshold),
4005 570 : pitr_interval: Some(tenant_conf.pitr_interval),
4006 570 : walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
4007 570 : lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
4008 570 : max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
4009 570 : eviction_policy: Some(tenant_conf.eviction_policy),
4010 570 : min_resident_size_override: tenant_conf.min_resident_size_override,
4011 570 : evictions_low_residence_duration_metric_threshold: Some(
4012 570 : tenant_conf.evictions_low_residence_duration_metric_threshold,
4013 570 : ),
4014 570 : heatmap_period: Some(tenant_conf.heatmap_period),
4015 570 : lazy_slru_download: Some(tenant_conf.lazy_slru_download),
4016 570 : timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
4017 570 : image_layer_creation_check_threshold: Some(
4018 570 : tenant_conf.image_layer_creation_check_threshold,
4019 570 : ),
4020 570 : switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
4021 570 : lsn_lease_length: Some(tenant_conf.lsn_lease_length),
4022 570 : lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
4023 570 : }
4024 570 : }
4025 : }
4026 :
4027 : pub struct TenantHarness {
4028 : pub conf: &'static PageServerConf,
4029 : pub tenant_conf: TenantConf,
4030 : pub tenant_shard_id: TenantShardId,
4031 : pub generation: Generation,
4032 : pub shard: ShardIndex,
4033 : pub remote_storage: GenericRemoteStorage,
4034 : pub remote_fs_dir: Utf8PathBuf,
4035 : pub deletion_queue: MockDeletionQueue,
4036 : }
4037 :
4038 : static LOG_HANDLE: OnceCell<()> = OnceCell::new();
4039 :
4040 618 : pub(crate) fn setup_logging() {
4041 618 : LOG_HANDLE.get_or_init(|| {
4042 582 : logging::init(
4043 582 : logging::LogFormat::Test,
4044 582 : // enable it in case the tests exercise code paths that use
4045 582 : // debug_assert_current_span_has_tenant_and_timeline_id
4046 582 : logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
4047 582 : logging::Output::Stdout,
4048 582 : )
4049 582 : .expect("Failed to init test logging")
4050 618 : });
4051 618 : }
4052 :
4053 : impl TenantHarness {
4054 570 : pub async fn create_custom(
4055 570 : test_name: &'static str,
4056 570 : tenant_conf: TenantConf,
4057 570 : tenant_id: TenantId,
4058 570 : shard_identity: ShardIdentity,
4059 570 : generation: Generation,
4060 570 : ) -> anyhow::Result<Self> {
4061 570 : setup_logging();
4062 570 :
4063 570 : let repo_dir = PageServerConf::test_repo_dir(test_name);
4064 570 : let _ = fs::remove_dir_all(&repo_dir);
4065 570 : fs::create_dir_all(&repo_dir)?;
4066 :
4067 570 : let conf = PageServerConf::dummy_conf(repo_dir);
4068 570 : // Make a static copy of the config. This can never be free'd, but that's
4069 570 : // OK in a test.
4070 570 : let conf: &'static PageServerConf = Box::leak(Box::new(conf));
4071 570 :
4072 570 : let shard = shard_identity.shard_index();
4073 570 : let tenant_shard_id = TenantShardId {
4074 570 : tenant_id,
4075 570 : shard_number: shard.shard_number,
4076 570 : shard_count: shard.shard_count,
4077 570 : };
4078 570 : fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
4079 570 : fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?;
4080 :
4081 : use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
4082 570 : let remote_fs_dir = conf.workdir.join("localfs");
4083 570 : std::fs::create_dir_all(&remote_fs_dir).unwrap();
4084 570 : let config = RemoteStorageConfig {
4085 570 : storage: RemoteStorageKind::LocalFs {
4086 570 : local_path: remote_fs_dir.clone(),
4087 570 : },
4088 570 : timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
4089 570 : };
4090 570 : let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
4091 570 : let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
4092 570 :
4093 570 : Ok(Self {
4094 570 : conf,
4095 570 : tenant_conf,
4096 570 : tenant_shard_id,
4097 570 : generation,
4098 570 : shard,
4099 570 : remote_storage,
4100 570 : remote_fs_dir,
4101 570 : deletion_queue,
4102 570 : })
4103 570 : }
4104 :
4105 534 : pub async fn create(test_name: &'static str) -> anyhow::Result<Self> {
4106 534 : // Disable automatic GC and compaction to make the unit tests more deterministic.
4107 534 : // The tests perform them manually if needed.
4108 534 : let tenant_conf = TenantConf {
4109 534 : gc_period: Duration::ZERO,
4110 534 : compaction_period: Duration::ZERO,
4111 534 : ..TenantConf::default()
4112 534 : };
4113 534 : let tenant_id = TenantId::generate();
4114 534 : let shard = ShardIdentity::unsharded();
4115 534 : Self::create_custom(
4116 534 : test_name,
4117 534 : tenant_conf,
4118 534 : tenant_id,
4119 534 : shard,
4120 534 : Generation::new(0xdeadbeef),
4121 534 : )
4122 0 : .await
4123 534 : }
4124 :
4125 60 : pub fn span(&self) -> tracing::Span {
4126 60 : info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
4127 60 : }
4128 :
4129 570 : pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
4130 570 : let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
4131 570 : (
4132 570 : self.do_try_load(&ctx)
4133 2288 : .await
4134 570 : .expect("failed to load test tenant"),
4135 570 : ctx,
4136 570 : )
4137 570 : }
4138 :
4139 570 : #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
4140 : pub(crate) async fn do_try_load(
4141 : &self,
4142 : ctx: &RequestContext,
4143 : ) -> anyhow::Result<Arc<Tenant>> {
4144 : let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
4145 :
4146 : let tenant = Arc::new(Tenant::new(
4147 : TenantState::Loading,
4148 : self.conf,
4149 : AttachedTenantConf::try_from(LocationConf::attached_single(
4150 : TenantConfOpt::from(self.tenant_conf.clone()),
4151 : self.generation,
4152 : &ShardParameters::default(),
4153 : ))
4154 : .unwrap(),
4155 : // This is a legacy/test code path: sharding isn't supported here.
4156 : ShardIdentity::unsharded(),
4157 : Some(walredo_mgr),
4158 : self.tenant_shard_id,
4159 : self.remote_storage.clone(),
4160 : self.deletion_queue.new_client(),
4161 : // TODO: ideally we should run all unit tests with both configs
4162 : L0FlushGlobalState::new(L0FlushConfig::default()),
4163 : ));
4164 :
4165 : let preload = tenant
4166 : .preload(&self.remote_storage, CancellationToken::new())
4167 : .await?;
4168 : tenant.attach(Some(preload), ctx).await?;
4169 :
4170 : tenant.state.send_replace(TenantState::Active);
4171 : for timeline in tenant.timelines.lock().unwrap().values() {
4172 : timeline.set_state(TimelineState::Active);
4173 : }
4174 : Ok(tenant)
4175 : }
4176 :
4177 6 : pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
4178 6 : self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
4179 6 : }
4180 : }
4181 :
4182 : // Mock WAL redo manager that doesn't do much
4183 : pub(crate) struct TestRedoManager;
4184 :
4185 : impl TestRedoManager {
4186 : /// # Cancel-Safety
4187 : ///
4188 : /// This method is cancellation-safe.
4189 1038 : pub async fn request_redo(
4190 1038 : &self,
4191 1038 : key: Key,
4192 1038 : lsn: Lsn,
4193 1038 : base_img: Option<(Lsn, Bytes)>,
4194 1038 : records: Vec<(Lsn, NeonWalRecord)>,
4195 1038 : _pg_version: u32,
4196 1038 : ) -> Result<Bytes, walredo::Error> {
4197 1386 : let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
4198 1038 : if records_neon {
4199 : // For Neon wal records, we can decode without spawning postgres, so do so.
4200 1038 : let mut page = match (base_img, records.first()) {
4201 1026 : (Some((_lsn, img)), _) => {
4202 1026 : let mut page = BytesMut::new();
4203 1026 : page.extend_from_slice(&img);
4204 1026 : page
4205 : }
4206 12 : (_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
4207 : _ => {
4208 0 : panic!("Neon WAL redo requires base image or will init record");
4209 : }
4210 : };
4211 :
4212 2424 : for (record_lsn, record) in records {
4213 1386 : apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
4214 : }
4215 1038 : Ok(page.freeze())
4216 : } else {
4217 : // We never spawn a postgres walredo process in unit tests: just log what we might have done.
4218 0 : let s = format!(
4219 0 : "redo for {} to get to {}, with {} and {} records",
4220 0 : key,
4221 0 : lsn,
4222 0 : if base_img.is_some() {
4223 0 : "base image"
4224 : } else {
4225 0 : "no base image"
4226 : },
4227 0 : records.len()
4228 0 : );
4229 0 : println!("{s}");
4230 0 :
4231 0 : Ok(test_img(&s))
4232 : }
4233 1038 : }
4234 : }
4235 : }
4236 :
4237 : #[cfg(test)]
4238 : mod tests {
4239 : use std::collections::{BTreeMap, BTreeSet};
4240 :
4241 : use super::*;
4242 : use crate::keyspace::KeySpaceAccum;
4243 : use crate::pgdatadir_mapping::AuxFilesDirectory;
4244 : use crate::repository::{Key, Value};
4245 : use crate::tenant::harness::*;
4246 : use crate::tenant::timeline::CompactFlags;
4247 : use crate::walrecord::NeonWalRecord;
4248 : use crate::DEFAULT_PG_VERSION;
4249 : use bytes::{Bytes, BytesMut};
4250 : use hex_literal::hex;
4251 : use itertools::Itertools;
4252 : use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
4253 : use pageserver_api::keyspace::KeySpace;
4254 : use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
4255 : use rand::{thread_rng, Rng};
4256 : use storage_layer::PersistentLayerKey;
4257 : use tests::storage_layer::ValuesReconstructState;
4258 : use tests::timeline::{GetVectoredError, ShutdownMode};
4259 : use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
4260 : use timeline::{DeltaLayerTestDesc, GcInfo};
4261 : use utils::bin_ser::BeSer;
4262 : use utils::id::TenantId;
4263 :
4264 : static TEST_KEY: Lazy<Key> =
4265 54 : Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
4266 :
4267 : #[tokio::test]
4268 6 : async fn test_basic() -> anyhow::Result<()> {
4269 24 : let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
4270 6 : let tline = tenant
4271 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
4272 12 : .await?;
4273 6 :
4274 6 : let mut writer = tline.writer().await;
4275 6 : writer
4276 6 : .put(
4277 6 : *TEST_KEY,
4278 6 : Lsn(0x10),
4279 6 : &Value::Image(test_img("foo at 0x10")),
4280 6 : &ctx,
4281 6 : )
4282 6 : .await?;
4283 6 : writer.finish_write(Lsn(0x10));
4284 6 : drop(writer);
4285 6 :
4286 6 : let mut writer = tline.writer().await;
4287 6 : writer
4288 6 : .put(
4289 6 : *TEST_KEY,
4290 6 : Lsn(0x20),
4291 6 : &Value::Image(test_img("foo at 0x20")),
4292 6 : &ctx,
4293 6 : )
4294 6 : .await?;
4295 6 : writer.finish_write(Lsn(0x20));
4296 6 : drop(writer);
4297 6 :
4298 6 : assert_eq!(
4299 6 : tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
4300 6 : test_img("foo at 0x10")
4301 6 : );
4302 6 : assert_eq!(
4303 6 : tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
4304 6 : test_img("foo at 0x10")
4305 6 : );
4306 6 : assert_eq!(
4307 6 : tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
4308 6 : test_img("foo at 0x20")
4309 6 : );
4310 6 :
4311 6 : Ok(())
4312 6 : }
4313 :
4314 : #[tokio::test]
4315 6 : async fn no_duplicate_timelines() -> anyhow::Result<()> {
4316 6 : let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
4317 6 : .await?
4318 6 : .load()
4319 24 : .await;
4320 6 : let _ = tenant
4321 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4322 12 : .await?;
4323 6 :
4324 6 : match tenant
4325 6 : .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4326 6 : .await
4327 6 : {
4328 6 : Ok(_) => panic!("duplicate timeline creation should fail"),
4329 6 : Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()),
4330 6 : }
4331 6 :
4332 6 : Ok(())
4333 6 : }
4334 :
4335 : /// Convenience function to create a page image with given string as the only content
4336 30 : pub fn test_value(s: &str) -> Value {
4337 30 : let mut buf = BytesMut::new();
4338 30 : buf.extend_from_slice(s.as_bytes());
4339 30 : Value::Image(buf.freeze())
4340 30 : }
4341 :
4342 : ///
4343 : /// Test branch creation
4344 : ///
4345 : #[tokio::test]
4346 6 : async fn test_branch() -> anyhow::Result<()> {
4347 6 : use std::str::from_utf8;
4348 6 :
4349 24 : let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await;
4350 6 : let tline = tenant
4351 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4352 12 : .await?;
4353 6 : let mut writer = tline.writer().await;
4354 6 :
4355 6 : #[allow(non_snake_case)]
4356 6 : let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
4357 6 : #[allow(non_snake_case)]
4358 6 : let TEST_KEY_B: Key = Key::from_hex("110000000033333333444444445500000002").unwrap();
4359 6 :
4360 6 : // Insert a value on the timeline
4361 6 : writer
4362 6 : .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"), &ctx)
4363 6 : .await?;
4364 6 : writer
4365 6 : .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"), &ctx)
4366 6 : .await?;
4367 6 : writer.finish_write(Lsn(0x20));
4368 6 :
4369 6 : writer
4370 6 : .put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"), &ctx)
4371 6 : .await?;
4372 6 : writer.finish_write(Lsn(0x30));
4373 6 : writer
4374 6 : .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"), &ctx)
4375 6 : .await?;
4376 6 : writer.finish_write(Lsn(0x40));
4377 6 :
4378 6 : //assert_current_logical_size(&tline, Lsn(0x40));
4379 6 :
4380 6 : // Branch the history, modify relation differently on the new timeline
4381 6 : tenant
4382 6 : .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx)
4383 6 : .await?;
4384 6 : let newtline = tenant
4385 6 : .get_timeline(NEW_TIMELINE_ID, true)
4386 6 : .expect("Should have a local timeline");
4387 6 : let mut new_writer = newtline.writer().await;
4388 6 : new_writer
4389 6 : .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
4390 6 : .await?;
4391 6 : new_writer.finish_write(Lsn(0x40));
4392 6 :
4393 6 : // Check page contents on both branches
4394 6 : assert_eq!(
4395 6 : from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?,
4396 6 : "foo at 0x40"
4397 6 : );
4398 6 : assert_eq!(
4399 6 : from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?,
4400 6 : "bar at 0x40"
4401 6 : );
4402 6 : assert_eq!(
4403 6 : from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?,
4404 6 : "foobar at 0x20"
4405 6 : );
4406 6 :
4407 6 : //assert_current_logical_size(&tline, Lsn(0x40));
4408 6 :
4409 6 : Ok(())
4410 6 : }
4411 :
4412 60 : async fn make_some_layers(
4413 60 : tline: &Timeline,
4414 60 : start_lsn: Lsn,
4415 60 : ctx: &RequestContext,
4416 60 : ) -> anyhow::Result<()> {
4417 60 : let mut lsn = start_lsn;
4418 : {
4419 60 : let mut writer = tline.writer().await;
4420 : // Create a relation on the timeline
4421 60 : writer
4422 60 : .put(
4423 60 : *TEST_KEY,
4424 60 : lsn,
4425 60 : &Value::Image(test_img(&format!("foo at {}", lsn))),
4426 60 : ctx,
4427 60 : )
4428 30 : .await?;
4429 60 : writer.finish_write(lsn);
4430 60 : lsn += 0x10;
4431 60 : writer
4432 60 : .put(
4433 60 : *TEST_KEY,
4434 60 : lsn,
4435 60 : &Value::Image(test_img(&format!("foo at {}", lsn))),
4436 60 : ctx,
4437 60 : )
4438 0 : .await?;
4439 60 : writer.finish_write(lsn);
4440 60 : lsn += 0x10;
4441 60 : }
4442 60 : tline.freeze_and_flush().await?;
4443 : {
4444 60 : let mut writer = tline.writer().await;
4445 60 : writer
4446 60 : .put(
4447 60 : *TEST_KEY,
4448 60 : lsn,
4449 60 : &Value::Image(test_img(&format!("foo at {}", lsn))),
4450 60 : ctx,
4451 60 : )
4452 30 : .await?;
4453 60 : writer.finish_write(lsn);
4454 60 : lsn += 0x10;
4455 60 : writer
4456 60 : .put(
4457 60 : *TEST_KEY,
4458 60 : lsn,
4459 60 : &Value::Image(test_img(&format!("foo at {}", lsn))),
4460 60 : ctx,
4461 60 : )
4462 0 : .await?;
4463 60 : writer.finish_write(lsn);
4464 60 : }
4465 60 : tline.freeze_and_flush().await.map_err(|e| e.into())
4466 60 : }
4467 :
4468 : #[tokio::test]
4469 6 : async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
4470 6 : let (tenant, ctx) =
4471 6 : TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
4472 6 : .await?
4473 6 : .load()
4474 24 : .await;
4475 6 : let tline = tenant
4476 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4477 12 : .await?;
4478 18 : make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
4479 6 :
4480 6 : // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
4481 6 : // FIXME: this doesn't actually remove any layer currently, given how the flushing
4482 6 : // and compaction works. But it does set the 'cutoff' point so that the cross check
4483 6 : // below should fail.
4484 6 : tenant
4485 6 : .gc_iteration(
4486 6 : Some(TIMELINE_ID),
4487 6 : 0x10,
4488 6 : Duration::ZERO,
4489 6 : &CancellationToken::new(),
4490 6 : &ctx,
4491 6 : )
4492 6 : .await?;
4493 6 :
4494 6 : // try to branch at lsn 25, should fail because we already garbage collected the data
4495 6 : match tenant
4496 6 : .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
4497 6 : .await
4498 6 : {
4499 6 : Ok(_) => panic!("branching should have failed"),
4500 6 : Err(err) => {
4501 6 : let CreateTimelineError::AncestorLsn(err) = err else {
4502 6 : panic!("wrong error type")
4503 6 : };
4504 6 : assert!(err.to_string().contains("invalid branch start lsn"));
4505 6 : assert!(err
4506 6 : .source()
4507 6 : .unwrap()
4508 6 : .to_string()
4509 6 : .contains("we might've already garbage collected needed data"))
4510 6 : }
4511 6 : }
4512 6 :
4513 6 : Ok(())
4514 6 : }
4515 :
4516 : #[tokio::test]
4517 6 : async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
4518 6 : let (tenant, ctx) =
4519 6 : TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")
4520 6 : .await?
4521 6 : .load()
4522 24 : .await;
4523 6 :
4524 6 : let tline = tenant
4525 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)
4526 12 : .await?;
4527 6 : // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
4528 6 : match tenant
4529 6 : .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
4530 6 : .await
4531 6 : {
4532 6 : Ok(_) => panic!("branching should have failed"),
4533 6 : Err(err) => {
4534 6 : let CreateTimelineError::AncestorLsn(err) = err else {
4535 6 : panic!("wrong error type");
4536 6 : };
4537 6 : assert!(&err.to_string().contains("invalid branch start lsn"));
4538 6 : assert!(&err
4539 6 : .source()
4540 6 : .unwrap()
4541 6 : .to_string()
4542 6 : .contains("is earlier than latest GC cutoff"));
4543 6 : }
4544 6 : }
4545 6 :
4546 6 : Ok(())
4547 6 : }
4548 :
4549 : /*
4550 : // FIXME: This currently fails to error out. Calling GC doesn't currently
4551 : // remove the old value, we'd need to work a little harder
4552 : #[tokio::test]
4553 : async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
4554 : let repo =
4555 : RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
4556 : .load();
4557 :
4558 : let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
4559 : make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
4560 :
4561 : repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
4562 : let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
4563 : assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
4564 : match tline.get(*TEST_KEY, Lsn(0x25)) {
4565 : Ok(_) => panic!("request for page should have failed"),
4566 : Err(err) => assert!(err.to_string().contains("not found at")),
4567 : }
4568 : Ok(())
4569 : }
4570 : */
4571 :
4572 : #[tokio::test]
4573 6 : async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
4574 6 : let (tenant, ctx) =
4575 6 : TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")
4576 6 : .await?
4577 6 : .load()
4578 24 : .await;
4579 6 : let tline = tenant
4580 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4581 12 : .await?;
4582 18 : make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
4583 6 :
4584 6 : tenant
4585 6 : .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
4586 6 : .await?;
4587 6 : let newtline = tenant
4588 6 : .get_timeline(NEW_TIMELINE_ID, true)
4589 6 : .expect("Should have a local timeline");
4590 6 :
4591 18 : make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
4592 6 :
4593 6 : tline.set_broken("test".to_owned());
4594 6 :
4595 6 : tenant
4596 6 : .gc_iteration(
4597 6 : Some(TIMELINE_ID),
4598 6 : 0x10,
4599 6 : Duration::ZERO,
4600 6 : &CancellationToken::new(),
4601 6 : &ctx,
4602 6 : )
4603 6 : .await?;
4604 6 :
4605 6 : // The branchpoints should contain all timelines, even ones marked
4606 6 : // as Broken.
4607 6 : {
4608 6 : let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
4609 6 : assert_eq!(branchpoints.len(), 1);
4610 6 : assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID));
4611 6 : }
4612 6 :
4613 6 : // You can read the key from the child branch even though the parent is
4614 6 : // Broken, as long as you don't need to access data from the parent.
4615 6 : assert_eq!(
4616 12 : newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
4617 6 : test_img(&format!("foo at {}", Lsn(0x70)))
4618 6 : );
4619 6 :
4620 6 : // This needs to traverse to the parent, and fails.
4621 6 : let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
4622 6 : assert!(
4623 6 : err.to_string().starts_with(&format!(
4624 6 : "bad state on timeline {}: Broken",
4625 6 : tline.timeline_id
4626 6 : )),
4627 6 : "{err}"
4628 6 : );
4629 6 :
4630 6 : Ok(())
4631 6 : }
4632 :
4633 : #[tokio::test]
4634 6 : async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
4635 6 : let (tenant, ctx) =
4636 6 : TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")
4637 6 : .await?
4638 6 : .load()
4639 24 : .await;
4640 6 : let tline = tenant
4641 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4642 12 : .await?;
4643 18 : make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
4644 6 :
4645 6 : tenant
4646 6 : .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
4647 6 : .await?;
4648 6 : let newtline = tenant
4649 6 : .get_timeline(NEW_TIMELINE_ID, true)
4650 6 : .expect("Should have a local timeline");
4651 6 : // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
4652 6 : tenant
4653 6 : .gc_iteration(
4654 6 : Some(TIMELINE_ID),
4655 6 : 0x10,
4656 6 : Duration::ZERO,
4657 6 : &CancellationToken::new(),
4658 6 : &ctx,
4659 6 : )
4660 6 : .await?;
4661 12 : assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok());
4662 6 :
4663 6 : Ok(())
4664 6 : }
4665 : #[tokio::test]
4666 6 : async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
4667 6 : let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching")
4668 6 : .await?
4669 6 : .load()
4670 24 : .await;
4671 6 : let tline = tenant
4672 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4673 12 : .await?;
4674 18 : make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
4675 6 :
4676 6 : tenant
4677 6 : .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
4678 6 : .await?;
4679 6 : let newtline = tenant
4680 6 : .get_timeline(NEW_TIMELINE_ID, true)
4681 6 : .expect("Should have a local timeline");
4682 6 :
4683 18 : make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
4684 6 :
4685 6 : // run gc on parent
4686 6 : tenant
4687 6 : .gc_iteration(
4688 6 : Some(TIMELINE_ID),
4689 6 : 0x10,
4690 6 : Duration::ZERO,
4691 6 : &CancellationToken::new(),
4692 6 : &ctx,
4693 6 : )
4694 6 : .await?;
4695 6 :
4696 6 : // Check that the data is still accessible on the branch.
4697 6 : assert_eq!(
4698 21 : newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
4699 6 : test_img(&format!("foo at {}", Lsn(0x40)))
4700 6 : );
4701 6 :
4702 6 : Ok(())
4703 6 : }
4704 :
4705 : #[tokio::test]
4706 6 : async fn timeline_load() -> anyhow::Result<()> {
4707 6 : const TEST_NAME: &str = "timeline_load";
4708 6 : let harness = TenantHarness::create(TEST_NAME).await?;
4709 6 : {
4710 24 : let (tenant, ctx) = harness.load().await;
4711 6 : let tline = tenant
4712 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
4713 12 : .await?;
4714 18 : make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
4715 6 : // so that all uploads finish & we can call harness.load() below again
4716 6 : tenant
4717 6 : .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
4718 6 : .instrument(harness.span())
4719 6 : .await
4720 6 : .ok()
4721 6 : .unwrap();
4722 6 : }
4723 6 :
4724 23 : let (tenant, _ctx) = harness.load().await;
4725 6 : tenant
4726 6 : .get_timeline(TIMELINE_ID, true)
4727 6 : .expect("cannot load timeline");
4728 6 :
4729 6 : Ok(())
4730 6 : }
4731 :
4732 : #[tokio::test]
4733 6 : async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
4734 6 : const TEST_NAME: &str = "timeline_load_with_ancestor";
4735 6 : let harness = TenantHarness::create(TEST_NAME).await?;
4736 6 : // create two timelines
4737 6 : {
4738 24 : let (tenant, ctx) = harness.load().await;
4739 6 : let tline = tenant
4740 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4741 12 : .await?;
4742 6 :
4743 18 : make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
4744 6 :
4745 6 : let child_tline = tenant
4746 6 : .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
4747 6 : .await?;
4748 6 : child_tline.set_state(TimelineState::Active);
4749 6 :
4750 6 : let newtline = tenant
4751 6 : .get_timeline(NEW_TIMELINE_ID, true)
4752 6 : .expect("Should have a local timeline");
4753 6 :
4754 18 : make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
4755 6 :
4756 6 : // so that all uploads finish & we can call harness.load() below again
4757 6 : tenant
4758 6 : .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
4759 6 : .instrument(harness.span())
4760 6 : .await
4761 6 : .ok()
4762 6 : .unwrap();
4763 6 : }
4764 6 :
4765 6 : // check that both of them are initially unloaded
4766 35 : let (tenant, _ctx) = harness.load().await;
4767 6 :
4768 6 : // check that both, child and ancestor are loaded
4769 6 : let _child_tline = tenant
4770 6 : .get_timeline(NEW_TIMELINE_ID, true)
4771 6 : .expect("cannot get child timeline loaded");
4772 6 :
4773 6 : let _ancestor_tline = tenant
4774 6 : .get_timeline(TIMELINE_ID, true)
4775 6 : .expect("cannot get ancestor timeline loaded");
4776 6 :
4777 6 : Ok(())
4778 6 : }
4779 :
4780 : #[tokio::test]
4781 6 : async fn delta_layer_dumping() -> anyhow::Result<()> {
4782 6 : use storage_layer::AsLayerDesc;
4783 6 : let (tenant, ctx) = TenantHarness::create("test_layer_dumping")
4784 6 : .await?
4785 6 : .load()
4786 24 : .await;
4787 6 : let tline = tenant
4788 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
4789 12 : .await?;
4790 18 : make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
4791 6 :
4792 6 : let layer_map = tline.layers.read().await;
4793 6 : let level0_deltas = layer_map
4794 6 : .layer_map()?
4795 6 : .level0_deltas()
4796 6 : .iter()
4797 12 : .map(|desc| layer_map.get_from_desc(desc))
4798 6 : .collect::<Vec<_>>();
4799 6 :
4800 6 : assert!(!level0_deltas.is_empty());
4801 6 :
4802 18 : for delta in level0_deltas {
4803 6 : // Ensure we are dumping a delta layer here
4804 12 : assert!(delta.layer_desc().is_delta);
4805 24 : delta.dump(true, &ctx).await.unwrap();
4806 6 : }
4807 6 :
4808 6 : Ok(())
4809 6 : }
4810 :
4811 : #[tokio::test]
4812 6 : async fn test_images() -> anyhow::Result<()> {
4813 24 : let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await;
4814 6 : let tline = tenant
4815 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
4816 12 : .await?;
4817 6 :
4818 6 : let mut writer = tline.writer().await;
4819 6 : writer
4820 6 : .put(
4821 6 : *TEST_KEY,
4822 6 : Lsn(0x10),
4823 6 : &Value::Image(test_img("foo at 0x10")),
4824 6 : &ctx,
4825 6 : )
4826 6 : .await?;
4827 6 : writer.finish_write(Lsn(0x10));
4828 6 : drop(writer);
4829 6 :
4830 6 : tline.freeze_and_flush().await?;
4831 6 : tline
4832 6 : .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
4833 6 : .await?;
4834 6 :
4835 6 : let mut writer = tline.writer().await;
4836 6 : writer
4837 6 : .put(
4838 6 : *TEST_KEY,
4839 6 : Lsn(0x20),
4840 6 : &Value::Image(test_img("foo at 0x20")),
4841 6 : &ctx,
4842 6 : )
4843 6 : .await?;
4844 6 : writer.finish_write(Lsn(0x20));
4845 6 : drop(writer);
4846 6 :
4847 6 : tline.freeze_and_flush().await?;
4848 6 : tline
4849 6 : .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
4850 6 : .await?;
4851 6 :
4852 6 : let mut writer = tline.writer().await;
4853 6 : writer
4854 6 : .put(
4855 6 : *TEST_KEY,
4856 6 : Lsn(0x30),
4857 6 : &Value::Image(test_img("foo at 0x30")),
4858 6 : &ctx,
4859 6 : )
4860 6 : .await?;
4861 6 : writer.finish_write(Lsn(0x30));
4862 6 : drop(writer);
4863 6 :
4864 6 : tline.freeze_and_flush().await?;
4865 6 : tline
4866 6 : .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
4867 6 : .await?;
4868 6 :
4869 6 : let mut writer = tline.writer().await;
4870 6 : writer
4871 6 : .put(
4872 6 : *TEST_KEY,
4873 6 : Lsn(0x40),
4874 6 : &Value::Image(test_img("foo at 0x40")),
4875 6 : &ctx,
4876 6 : )
4877 6 : .await?;
4878 6 : writer.finish_write(Lsn(0x40));
4879 6 : drop(writer);
4880 6 :
4881 6 : tline.freeze_and_flush().await?;
4882 6 : tline
4883 6 : .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
4884 6 : .await?;
4885 6 :
4886 6 : assert_eq!(
4887 12 : tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
4888 6 : test_img("foo at 0x10")
4889 6 : );
4890 6 : assert_eq!(
4891 12 : tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
4892 6 : test_img("foo at 0x10")
4893 6 : );
4894 6 : assert_eq!(
4895 6 : tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
4896 6 : test_img("foo at 0x20")
4897 6 : );
4898 6 : assert_eq!(
4899 12 : tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?,
4900 6 : test_img("foo at 0x30")
4901 6 : );
4902 6 : assert_eq!(
4903 12 : tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?,
4904 6 : test_img("foo at 0x40")
4905 6 : );
4906 6 :
4907 6 : Ok(())
4908 6 : }
4909 :
4910 12 : async fn bulk_insert_compact_gc(
4911 12 : tenant: &Tenant,
4912 12 : timeline: &Arc<Timeline>,
4913 12 : ctx: &RequestContext,
4914 12 : lsn: Lsn,
4915 12 : repeat: usize,
4916 12 : key_count: usize,
4917 12 : ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
4918 12 : let compact = true;
4919 122751 : bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
4920 12 : }
4921 :
4922 24 : async fn bulk_insert_maybe_compact_gc(
4923 24 : tenant: &Tenant,
4924 24 : timeline: &Arc<Timeline>,
4925 24 : ctx: &RequestContext,
4926 24 : mut lsn: Lsn,
4927 24 : repeat: usize,
4928 24 : key_count: usize,
4929 24 : compact: bool,
4930 24 : ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
4931 24 : let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
4932 24 :
4933 24 : let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
4934 24 : let mut blknum = 0;
4935 24 :
4936 24 : // Enforce that key range is monotonously increasing
4937 24 : let mut keyspace = KeySpaceAccum::new();
4938 24 :
4939 24 : let cancel = CancellationToken::new();
4940 24 :
4941 24 : for _ in 0..repeat {
4942 1200 : for _ in 0..key_count {
4943 12000000 : test_key.field6 = blknum;
4944 12000000 : let mut writer = timeline.writer().await;
4945 12000000 : writer
4946 12000000 : .put(
4947 12000000 : test_key,
4948 12000000 : lsn,
4949 12000000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
4950 12000000 : ctx,
4951 12000000 : )
4952 7788 : .await?;
4953 12000000 : inserted.entry(test_key).or_default().insert(lsn);
4954 12000000 : writer.finish_write(lsn);
4955 12000000 : drop(writer);
4956 12000000 :
4957 12000000 : keyspace.add_key(test_key);
4958 12000000 :
4959 12000000 : lsn = Lsn(lsn.0 + 0x10);
4960 12000000 : blknum += 1;
4961 : }
4962 :
4963 1200 : timeline.freeze_and_flush().await?;
4964 1200 : if compact {
4965 : // this requires timeline to be &Arc<Timeline>
4966 25854 : timeline.compact(&cancel, EnumSet::empty(), ctx).await?;
4967 600 : }
4968 :
4969 : // this doesn't really need to use the timeline_id target, but it is closer to what it
4970 : // originally was.
4971 1200 : let res = tenant
4972 1200 : .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx)
4973 1196 : .await?;
4974 :
4975 1200 : assert_eq!(res.layers_removed, 0, "this never removes anything");
4976 : }
4977 :
4978 24 : Ok(inserted)
4979 24 : }
4980 :
4981 : //
4982 : // Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
4983 : // Repeat 50 times.
4984 : //
4985 : #[tokio::test]
4986 6 : async fn test_bulk_insert() -> anyhow::Result<()> {
4987 6 : let harness = TenantHarness::create("test_bulk_insert").await?;
4988 24 : let (tenant, ctx) = harness.load().await;
4989 6 : let tline = tenant
4990 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
4991 12 : .await?;
4992 6 :
4993 6 : let lsn = Lsn(0x10);
4994 61375 : bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
4995 6 :
4996 6 : Ok(())
4997 6 : }
4998 :
4999 : // Test the real vectored get implementation against the per-key LSNs recorded at insert time.
5000 : //
5001 : // The test generates a keyspace by repeatedly flushing the in-memory layer and compacting.
5002 : // Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys
5003 : // grow to the right on the X axis.
5004 : //                       [Delta]
5005 : //                [Delta]
5006 : //         [Delta]
5007 : //  [Delta]
5008 : // ------------ Image ---------------
5009 : //
5010 : // After layer generation we pick the ranges to query as follows:
5011 : // 1. The beginning of each delta layer
5012 : // 2. At the seam between two adjacent delta layers
5013 : //
5014 : // There's one major downside to this test: the delta layers only contain images,
5015 : // so the search can stop at the first delta layer and doesn't traverse any deeper.
5016 : #[tokio::test]
5017 6 : async fn test_get_vectored() -> anyhow::Result<()> {
5018 6 : let harness = TenantHarness::create("test_get_vectored").await?;
5019 24 : let (tenant, ctx) = harness.load().await;
5020 6 : let tline = tenant
5021 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
5022 12 : .await?;
5023 6 : // `inserted` is consulted below as a map from key to the LSNs written for that key.
5024 6 : let lsn = Lsn(0x10);
5025 61376 : let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
5026 6 : // Inspect the resulting layer map to derive the key ranges to read.
5027 6 : let guard = tline.layers.read().await;
5028 6 : let lm = guard.layer_map()?;
5029 6 :
5030 6 : lm.dump(true, &ctx).await?;
5031 6 :
5032 6 : let mut reads = Vec::new();
5033 6 : let mut prev = None;
5034 36 : lm.iter_historic_layers().for_each(|desc| {
5035 36 : if !desc.is_delta() {
5036 6 : prev = Some(desc.clone());
5037 6 : return;
5038 30 : }
5039 30 : // Read 1: `MAX_GET_VECTORED_KEYS` keys starting at the first key of this delta layer.
5040 30 : let start = desc.key_range.start;
5041 30 : let end = desc
5042 30 : .key_range
5043 30 : .start
5044 30 : .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
5045 30 : reads.push(KeySpace {
5046 30 : ranges: vec![start..end],
5047 30 : });
5048 6 : // Read 2: the seam between the previously visited delta layer and this one.
5049 30 : if let Some(prev) = &prev {
5050 30 : if !prev.is_delta() {
5051 30 : return;
5052 6 : }
5053 0 : // NOTE(review): this branch shows zero hits in this listing. The early `return` above skips the `prev = Some(desc.clone())` update at the end of the closure, so once `prev` holds a non-delta layer it stays that way. TODO confirm seam reads are exercised.
5054 0 : let first_range = Key {
5055 0 : field6: prev.key_range.end.field6 - 4,
5056 0 : ..prev.key_range.end
5057 0 : }..prev.key_range.end;
5058 0 :
5059 0 : let second_range = desc.key_range.start..Key {
5060 0 : field6: desc.key_range.start.field6 + 4,
5061 0 : ..desc.key_range.start
5062 0 : };
5063 0 :
5064 0 : reads.push(KeySpace {
5065 0 : ranges: vec![first_range, second_range],
5066 0 : });
5067 6 : };
5068 6 :
5069 6 : prev = Some(desc.clone());
5070 36 : });
5071 6 : // Release the layer map guard before issuing the reads.
5072 6 : drop(guard);
5073 6 :
5074 6 : // Pick a big LSN such that we query over all the changes.
5075 6 : let reads_lsn = Lsn(u64::MAX - 1);
5076 6 :
5077 36 : for read in reads {
5078 30 : info!("Doing vectored read on {:?}", read);
5079 6 :
5080 30 : let vectored_res = tline
5081 30 : .get_vectored_impl(
5082 30 : read.clone(),
5083 30 : reads_lsn,
5084 30 : &mut ValuesReconstructState::new(),
5085 30 : &ctx,
5086 30 : )
5087 75 : .await;
5088 6 : // Walk the keys from `read.start()` to `read.end()` and record, per key, the last inserted LSN that is <= `reads_lsn`.
5089 30 : let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
5090 30 : let mut expect_missing = false;
5091 30 : let mut key = read.start().unwrap();
5092 990 : while key != read.end().unwrap() {
5093 960 : if let Some(lsns) = inserted.get(&key) {
5094 960 : let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
5095 960 : match expected_lsn {
5096 960 : Some(lsn) => {
5097 960 : expected_lsns.insert(key, *lsn);
5098 960 : }
5099 6 : None => {
5100 6 : expect_missing = true;
5101 0 : break;
5102 6 : }
5103 6 : }
5104 6 : } else {
5105 6 : expect_missing = true;
5106 0 : break;
5107 6 : }
5108 6 :
5109 960 : key = key.next();
5110 6 : }
5111 6 : // A key without a usable version means the whole read must fail with `MissingKey`; otherwise every returned image is compared.
5112 30 : if expect_missing {
5113 6 : assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
5114 6 : } else {
5115 960 : for (key, image) in vectored_res? {
5116 960 : let expected_lsn = expected_lsns.get(&key).expect("determined above");
5117 960 : let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
5118 960 : assert_eq!(image?, expected_image);
5119 6 : }
5120 6 : }
5121 6 : }
5122 6 :
5123 6 : Ok(())
5124 6 : }
5125 : // Test that a vectored read over `NON_INHERITED_RANGE` on a freshly branched child timeline returns no images, although the parent had files written via `put_file`.
5126 : #[tokio::test]
5127 6 : async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
5128 6 : let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
5129 6 :
5130 24 : let (tenant, ctx) = harness.load().await;
5131 6 : let tline = tenant
5132 6 : .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
5133 6 : .await?;
5134 6 : let tline = tline.raw_timeline().unwrap();
5135 6 : // Write two files at two different LSNs on the parent timeline and commit them.
5136 6 : let mut modification = tline.begin_modification(Lsn(0x1000));
5137 6 : modification.put_file("foo/bar1", b"content1", &ctx).await?;
5138 6 : modification.set_lsn(Lsn(0x1008))?;
5139 6 : modification.put_file("foo/bar2", b"content2", &ctx).await?;
5140 6 : modification.commit(&ctx).await?;
5141 6 : // Branch a child timeline off the parent's last record LSN.
5142 6 : let child_timeline_id = TimelineId::generate();
5143 6 : tenant
5144 6 : .branch_timeline_test(
5145 6 : tline,
5146 6 : child_timeline_id,
5147 6 : Some(tline.get_last_record_lsn()),
5148 6 : &ctx,
5149 6 : )
5150 6 : .await?;
5151 6 :
5152 6 : let child_timeline = tenant
5153 6 : .get_timeline(child_timeline_id, true)
5154 6 : .expect("Should have the branched timeline");
5155 6 : // Read the whole non-inherited range on the child at the child's last record LSN.
5156 6 : let aux_keyspace = KeySpace {
5157 6 : ranges: vec![NON_INHERITED_RANGE],
5158 6 : };
5159 6 : let read_lsn = child_timeline.get_last_record_lsn();
5160 6 :
5161 6 : let vectored_res = child_timeline
5162 6 : .get_vectored_impl(
5163 6 : aux_keyspace.clone(),
5164 6 : read_lsn,
5165 6 : &mut ValuesReconstructState::new(),
5166 6 : &ctx,
5167 6 : )
5168 6 : .await;
5169 6 : // The read must succeed and return nothing.
5170 6 : let images = vectored_res?;
5171 6 : assert!(images.is_empty());
5172 6 : Ok(())
5173 6 : }
5174 :
5175 : // Test that vectored get handles layer gaps correctly
5176 : // by advancing into the next ancestor timeline if required.
5177 : //
5178 : // The test generates timelines that look like the diagram below.
5179 : // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram).
5180 : // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram).
5181 : //
5182 : // ```
5183 : //-------------------------------+
5184 : // ... |
5185 : // [ L1 ] |
5186 : // [ / L1 ] | Child Timeline
5187 : // ... |
5188 : // ------------------------------+
5189 : // [ X L1 ] | Parent Timeline
5190 : // ------------------------------+
5191 : // ```
5192 : #[tokio::test]
5193 6 : async fn test_get_vectored_key_gap() -> anyhow::Result<()> {
5194 6 : let tenant_conf = TenantConf {
5195 6 : // Make compaction deterministic
5196 6 : gc_period: Duration::ZERO,
5197 6 : compaction_period: Duration::ZERO,
5198 6 : // Encourage creation of L1 layers
5199 6 : checkpoint_distance: 16 * 1024,
5200 6 : compaction_target_size: 8 * 1024,
5201 6 : ..TenantConf::default()
5202 6 : };
5203 6 :
5204 6 : let harness = TenantHarness::create_custom(
5205 6 : "test_get_vectored_key_gap",
5206 6 : tenant_conf,
5207 6 : TenantId::generate(),
5208 6 : ShardIdentity::unsharded(),
5209 6 : Generation::new(0xdeadbeef),
5210 6 : )
5211 6 : .await?;
5212 24 : let (tenant, ctx) = harness.load().await;
5213 6 :
5214 6 : let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
5215 6 : let gap_at_key = current_key.add(100);
5216 6 : let mut current_lsn = Lsn(0x10);
5217 6 :
5218 6 : const KEY_COUNT: usize = 10_000;
5219 6 :
5220 6 : let timeline_id = TimelineId::generate();
5221 6 : let current_timeline = tenant
5222 6 : .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
5223 12 : .await?;
5224 6 :
5225 6 : current_lsn += 0x100;
5226 6 :
5227 6 : let mut writer = current_timeline.writer().await;
5228 6 : writer
5229 6 : .put(
5230 6 : gap_at_key,
5231 6 : current_lsn,
5232 6 : &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))),
5233 6 : &ctx,
5234 6 : )
5235 6 : .await?;
5236 6 : writer.finish_write(current_lsn);
5237 6 : drop(writer);
5238 6 :
5239 6 : let mut latest_lsns = HashMap::new();
5240 6 : latest_lsns.insert(gap_at_key, current_lsn);
5241 6 :
5242 6 : current_timeline.freeze_and_flush().await?;
5243 6 :
5244 6 : let child_timeline_id = TimelineId::generate();
5245 6 :
5246 6 : tenant
5247 6 : .branch_timeline_test(
5248 6 : ¤t_timeline,
5249 6 : child_timeline_id,
5250 6 : Some(current_lsn),
5251 6 : &ctx,
5252 6 : )
5253 6 : .await?;
5254 6 : let child_timeline = tenant
5255 6 : .get_timeline(child_timeline_id, true)
5256 6 : .expect("Should have the branched timeline");
5257 6 :
5258 60006 : for i in 0..KEY_COUNT {
5259 60000 : if current_key == gap_at_key {
5260 6 : current_key = current_key.next();
5261 6 : continue;
5262 59994 : }
5263 59994 :
5264 59994 : current_lsn += 0x10;
5265 6 :
5266 59994 : let mut writer = child_timeline.writer().await;
5267 59994 : writer
5268 59994 : .put(
5269 59994 : current_key,
5270 59994 : current_lsn,
5271 59994 : &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))),
5272 59994 : &ctx,
5273 59994 : )
5274 224 : .await?;
5275 59994 : writer.finish_write(current_lsn);
5276 59994 : drop(writer);
5277 59994 :
5278 59994 : latest_lsns.insert(current_key, current_lsn);
5279 59994 : current_key = current_key.next();
5280 59994 :
5281 59994 : // Flush every now and then to encourage layer file creation.
5282 59994 : if i % 500 == 0 {
5283 124 : child_timeline.freeze_and_flush().await?;
5284 59874 : }
5285 6 : }
5286 6 :
5287 7 : child_timeline.freeze_and_flush().await?;
5288 6 : let mut flags = EnumSet::new();
5289 6 : flags.insert(CompactFlags::ForceRepartition);
5290 6 : child_timeline
5291 6 : .compact(&CancellationToken::new(), flags, &ctx)
5292 5598 : .await?;
5293 6 :
5294 6 : let key_near_end = {
5295 6 : let mut tmp = current_key;
5296 6 : tmp.field6 -= 10;
5297 6 : tmp
5298 6 : };
5299 6 :
5300 6 : let key_near_gap = {
5301 6 : let mut tmp = gap_at_key;
5302 6 : tmp.field6 -= 10;
5303 6 : tmp
5304 6 : };
5305 6 :
5306 6 : let read = KeySpace {
5307 6 : ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
5308 6 : };
5309 6 : let results = child_timeline
5310 6 : .get_vectored_impl(
5311 6 : read.clone(),
5312 6 : current_lsn,
5313 6 : &mut ValuesReconstructState::new(),
5314 6 : &ctx,
5315 6 : )
5316 45 : .await?;
5317 6 :
5318 132 : for (key, img_res) in results {
5319 126 : let expected = test_img(&format!("{} at {}", key, latest_lsns[&key]));
5320 126 : assert_eq!(img_res?, expected);
5321 6 : }
5322 6 :
5323 6 : Ok(())
5324 6 : }
5325 :
5326 : // Test that vectored get descends into ancestor timelines correctly and
5327 : // does not return an image that's newer than requested.
5328 : //
5329 : // The diagram below illustrates an interesting case. We have a parent timeline
5330 : // (below the branch point in the diagram) and a child timeline branched off it. The request key cannot be reconstructed
5331 : // from the child timeline, so the parent timeline must be visited. When descending into
5332 : // the parent timeline, the read path needs to remember what the requested Lsn was in
5333 : // order to avoid returning an image that's too new. The test below constructs such
5334 : // a timeline setup and does a few queries around the Lsn of each page image.
5335 : // ```
5336 : //             LSN
5337 : //              ^
5338 : //              |
5339 : //              |
5340 : // 500 | --------------------------------------> branch point
5341 : // 400 |        X
5342 : // 300 |        X
5343 : // 200 | --------------------------------------> requested lsn
5344 : // 100 |        X
5345 : //     |---------------------------------------> Key
5346 : //              |
5347 : //              ------> requested key
5348 : //
5349 : // Legend:
5350 : // * X - page images
5351 : // ```
5352 : #[tokio::test]
5353 6 : async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
5354 6 : let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
5355 24 : let (tenant, ctx) = harness.load().await;
5356 6 :
5357 6 : let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
5358 6 : let end_key = start_key.add(1000);
5359 6 : let child_gap_at_key = start_key.add(500);
5360 6 : let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
5361 6 : // `parent_gap_lsns` maps LSN -> image text for each version of `child_gap_at_key` written to the parent.
5362 6 : let mut current_lsn = Lsn(0x10);
5363 6 :
5364 6 : let timeline_id = TimelineId::generate();
5365 6 : let parent_timeline = tenant
5366 6 : .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
5367 12 : .await?;
5368 6 :
5369 6 : current_lsn += 0x100;
5370 6 : // Write three full passes over [start_key, end_key) to the parent, flushing after each pass.
5371 24 : for _ in 0..3 {
5372 18 : let mut key = start_key;
5373 18018 : while key < end_key {
5374 18000 : current_lsn += 0x10;
5375 18000 : // NOTE(review): the image text names `child_gap_at_key` for every key, not `key`; only the gap key's images are verified below.
5376 18000 : let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
5377 6 :
5378 18000 : let mut writer = parent_timeline.writer().await;
5379 18000 : writer
5380 18000 : .put(
5381 18000 : key,
5382 18000 : current_lsn,
5383 18000 : &Value::Image(test_img(&image_value)),
5384 18000 : &ctx,
5385 18000 : )
5386 18 : .await?;
5387 18000 : writer.finish_write(current_lsn);
5388 18000 :
5389 18000 : if key == child_gap_at_key {
5390 18 : parent_gap_lsns.insert(current_lsn, image_value);
5391 17982 : }
5392 6 :
5393 18000 : key = key.next();
5394 6 : }
5395 6 :
5396 18 : parent_timeline.freeze_and_flush().await?;
5397 6 : }
5398 6 : // Branch the child at the parent's latest written LSN.
5399 6 : let child_timeline_id = TimelineId::generate();
5400 6 :
5401 6 : let child_timeline = tenant
5402 6 : .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
5403 6 : .await?;
5404 6 : // Write every key except `child_gap_at_key` on the child, so that key is only present in the parent.
5405 6 : let mut key = start_key;
5406 6006 : while key < end_key {
5407 6000 : if key == child_gap_at_key {
5408 6 : key = key.next();
5409 6 : continue;
5410 5994 : }
5411 5994 :
5412 5994 : current_lsn += 0x10;
5413 6 :
5414 5994 : let mut writer = child_timeline.writer().await;
5415 5994 : writer
5416 5994 : .put(
5417 5994 : key,
5418 5994 : current_lsn,
5419 5994 : &Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
5420 5994 : &ctx,
5421 5994 : )
5422 51 : .await?;
5423 5994 : writer.finish_write(current_lsn);
5424 5994 :
5425 5994 : key = key.next();
5426 6 : }
5427 6 :
5428 6 : child_timeline.freeze_and_flush().await?;
5429 6 : // Query at and around (offsets -10, -1, 0, 1, 10) the LSN of each parent image of the gap key.
5430 6 : let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
5431 6 : let mut query_lsns = Vec::new();
5432 18 : for image_lsn in parent_gap_lsns.keys().rev() {
5433 108 : for offset in lsn_offsets {
5434 90 : query_lsns.push(Lsn(image_lsn
5435 90 : .0
5436 90 : .checked_add_signed(offset)
5437 90 : .expect("Shouldn't overflow")));
5438 90 : }
5439 6 : }
5440 6 :
5441 96 : for query_lsn in query_lsns {
5442 90 : let results = child_timeline
5443 90 : .get_vectored_impl(
5444 90 : KeySpace {
5445 90 : ranges: vec![child_gap_at_key..child_gap_at_key.next()],
5446 90 : },
5447 90 : query_lsn,
5448 90 : &mut ValuesReconstructState::new(),
5449 90 : &ctx,
5450 90 : )
5451 87 : .await;
5452 6 : // The expected image is the parent version with the greatest LSN at or below the query LSN, if any.
5453 90 : let expected_item = parent_gap_lsns
5454 90 : .iter()
5455 90 : .rev()
5456 204 : .find(|(lsn, _)| **lsn <= query_lsn);
5457 90 :
5458 90 : info!(
5459 6 : "Doing vectored read at LSN {}. Expecting image to be: {:?}",
5460 6 : query_lsn, expected_item
5461 6 : );
5462 6 : // With no such version the read must fail with `MissingKey`.
5463 90 : match expected_item {
5464 78 : Some((_, img_value)) => {
5465 78 : let key_results = results.expect("No vectored get error expected");
5466 78 : let key_result = &key_results[&child_gap_at_key];
5467 78 : let returned_img = key_result
5468 78 : .as_ref()
5469 78 : .expect("No page reconstruct error expected");
5470 78 :
5471 78 : info!(
5472 6 : "Vectored read at LSN {} returned image {}",
5473 0 : query_lsn,
5474 0 : std::str::from_utf8(returned_img)?
5475 6 : );
5476 78 : assert_eq!(*returned_img, test_img(img_value));
5477 6 : }
5478 6 : None => {
5479 12 : assert!(matches!(results, Err(GetVectoredError::MissingKey(_))));
5480 6 : }
5481 6 : }
5482 6 : }
5483 6 :
5484 6 : Ok(())
5485 6 : }
5486 : // Runs `test_random_updates_algorithm` once per compaction algorithm (legacy and tiered), each with its own harness name.
5487 : #[tokio::test]
5488 6 : async fn test_random_updates() -> anyhow::Result<()> {
5489 6 : let names_algorithms = [
5490 6 : ("test_random_updates_legacy", CompactionAlgorithm::Legacy),
5491 6 : ("test_random_updates_tiered", CompactionAlgorithm::Tiered),
5492 6 : ];
5493 18 : for (name, algorithm) in names_algorithms {
5494 289132 : test_random_updates_algorithm(name, algorithm).await?;
5495 6 : }
5496 6 : Ok(())
5497 6 : }
5498 : // Writes NUM_KEYS pages, then for 50 rounds overwrites random pages, reads every page back at the latest LSN, flushes and runs GC, with the tenant configured for the given compaction algorithm.
5499 12 : async fn test_random_updates_algorithm(
5500 12 : name: &'static str,
5501 12 : compaction_algorithm: CompactionAlgorithm,
5502 12 : ) -> anyhow::Result<()> {
5503 12 : let mut harness = TenantHarness::create(name).await?;
5504 12 : harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
5505 12 : kind: compaction_algorithm,
5506 12 : };
5507 48 : let (tenant, ctx) = harness.load().await;
5508 12 : let tline = tenant
5509 12 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
5510 21 : .await?;
5511 :
5512 : const NUM_KEYS: usize = 1000;
5513 12 : let cancel = CancellationToken::new();
5514 12 : // Register the key range [field6 = 0, field6 = NUM_KEYS) with the timeline as an extra dense keyspace.
5515 12 : let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
5516 12 : let mut test_key_end = test_key;
5517 12 : test_key_end.field6 = NUM_KEYS as u32;
5518 12 : tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end));
5519 12 : // NOTE(review): `keyspace` is filled via `add_key` below but is not read anywhere in this function.
5520 12 : let mut keyspace = KeySpaceAccum::new();
5521 12 :
5522 12 : // Track when each page was last modified. Used to assert that
5523 12 : // a read sees the latest page version.
5524 12 : let mut updated = [Lsn(0); NUM_KEYS];
5525 12 :
5526 12 : let mut lsn = Lsn(0x10);
5527 : #[allow(clippy::needless_range_loop)]
5528 12012 : for blknum in 0..NUM_KEYS {
5529 12000 : lsn = Lsn(lsn.0 + 0x10);
5530 12000 : test_key.field6 = blknum as u32;
5531 12000 : let mut writer = tline.writer().await;
5532 12000 : writer
5533 12000 : .put(
5534 12000 : test_key,
5535 12000 : lsn,
5536 12000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
5537 12000 : &ctx,
5538 12000 : )
5539 12 : .await?;
5540 12000 : writer.finish_write(lsn);
5541 12000 : updated[blknum] = lsn;
5542 12000 : drop(writer);
5543 12000 :
5544 12000 : keyspace.add_key(test_key);
5545 : }
5546 : // 50 rounds of: NUM_KEYS random overwrites, a full read-back, flush and GC.
5547 612 : for _ in 0..50 {
5548 600600 : for _ in 0..NUM_KEYS {
5549 600000 : lsn = Lsn(lsn.0 + 0x10);
5550 600000 : let blknum = thread_rng().gen_range(0..NUM_KEYS);
5551 600000 : test_key.field6 = blknum as u32;
5552 600000 : let mut writer = tline.writer().await;
5553 600000 : writer
5554 600000 : .put(
5555 600000 : test_key,
5556 600000 : lsn,
5557 600000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
5558 600000 : &ctx,
5559 600000 : )
5560 624 : .await?;
5561 600000 : writer.finish_write(lsn);
5562 600000 : drop(writer);
5563 600000 : updated[blknum] = lsn;
5564 : }
5565 :
5566 : // Read all the blocks
5567 600000 : for (blknum, last_lsn) in updated.iter().enumerate() {
5568 600000 : test_key.field6 = blknum as u32;
5569 600000 : assert_eq!(
5570 600000 : tline.get(test_key, lsn, &ctx).await?,
5571 600000 : test_img(&format!("{} at {}", blknum, last_lsn))
5572 : );
5573 : }
5574 :
5575 : // Perform a cycle of flush, and GC
5576 603 : tline.freeze_and_flush().await?;
5577 600 : tenant
5578 600 : .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
5579 589 : .await?;
5580 : }
5581 :
5582 12 : Ok(())
5583 12 : }
5584 : // Branches a new timeline off the previous one 50 times; after each branch it overwrites random pages, reads every page back, then flushes, compacts and runs GC on the new branch.
5585 : #[tokio::test]
5586 6 : async fn test_traverse_branches() -> anyhow::Result<()> {
5587 6 : let (tenant, ctx) = TenantHarness::create("test_traverse_branches")
5588 6 : .await?
5589 6 : .load()
5590 24 : .await;
5591 6 : let mut tline = tenant
5592 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
5593 12 : .await?;
5594 6 :
5595 6 : const NUM_KEYS: usize = 1000;
5596 6 :
5597 6 : let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
5598 6 : // NOTE(review): `keyspace` is filled via `add_key` below but is not read anywhere in this function.
5599 6 : let mut keyspace = KeySpaceAccum::new();
5600 6 :
5601 6 : let cancel = CancellationToken::new();
5602 6 :
5603 6 : // Track when each page was last modified. Used to assert that
5604 6 : // a read sees the latest page version.
5605 6 : let mut updated = [Lsn(0); NUM_KEYS];
5606 6 :
5607 6 : let mut lsn = Lsn(0x10);
5608 6 : #[allow(clippy::needless_range_loop)]
5609 6006 : for blknum in 0..NUM_KEYS {
5610 6000 : lsn = Lsn(lsn.0 + 0x10);
5611 6000 : test_key.field6 = blknum as u32;
5612 6000 : let mut writer = tline.writer().await;
5613 6000 : writer
5614 6000 : .put(
5615 6000 : test_key,
5616 6000 : lsn,
5617 6000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
5618 6000 : &ctx,
5619 6000 : )
5620 6 : .await?;
5621 6000 : writer.finish_write(lsn);
5622 6000 : updated[blknum] = lsn;
5623 6000 : drop(writer);
5624 6000 :
5625 6000 : keyspace.add_key(test_key);
5626 6 : }
5627 6 : // Each round continues on a fresh branch created at the current LSN.
5628 306 : for _ in 0..50 {
5629 300 : let new_tline_id = TimelineId::generate();
5630 300 : tenant
5631 300 : .branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx)
5632 6 : .await?;
5633 300 : tline = tenant
5634 300 : .get_timeline(new_tline_id, true)
5635 300 : .expect("Should have the branched timeline");
5636 6 :
5637 300300 : for _ in 0..NUM_KEYS {
5638 300000 : lsn = Lsn(lsn.0 + 0x10);
5639 300000 : let blknum = thread_rng().gen_range(0..NUM_KEYS);
5640 300000 : test_key.field6 = blknum as u32;
5641 300000 : let mut writer = tline.writer().await;
5642 300000 : writer
5643 300000 : .put(
5644 300000 : test_key,
5645 300000 : lsn,
5646 300000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
5647 300000 : &ctx,
5648 300000 : )
5649 2553 : .await?;
5650 300000 : println!("updating {} at {}", blknum, lsn);
5651 300000 : writer.finish_write(lsn);
5652 300000 : drop(writer);
5653 300000 : updated[blknum] = lsn;
5654 6 : }
5655 6 :
5656 6 : // Read all the blocks
5657 300000 : for (blknum, last_lsn) in updated.iter().enumerate() {
5658 300000 : test_key.field6 = blknum as u32;
5659 300000 : assert_eq!(
5660 300000 : tline.get(test_key, lsn, &ctx).await?,
5661 300000 : test_img(&format!("{} at {}", blknum, last_lsn))
5662 6 : );
5663 6 : }
5664 6 :
5665 6 : // Perform a cycle of flush, compact, and GC
5666 305 : tline.freeze_and_flush().await?;
5667 45271 : tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
5668 300 : tenant
5669 300 : .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
5670 298 : .await?;
5671 6 : }
5672 6 :
5673 6 : Ok(())
5674 6 : }
5675 : // Builds a chain of NUM_TLINES branches with NUM_KEYS random page updates on each, then reads every recorded update back through the leaf timeline at the LSN it was written.
5676 : #[tokio::test]
5677 6 : async fn test_traverse_ancestors() -> anyhow::Result<()> {
5678 6 : let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")
5679 6 : .await?
5680 6 : .load()
5681 24 : .await;
5682 6 : let mut tline = tenant
5683 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
5684 12 : .await?;
5685 6 :
5686 6 : const NUM_KEYS: usize = 100;
5687 6 : const NUM_TLINES: usize = 50;
5688 6 :
5689 6 : let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
5690 6 : // Track page mutation lsns across different timelines.
5691 6 : let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];
5692 6 :
5693 6 : let mut lsn = Lsn(0x10);
5694 6 : // Each iteration branches off the previous timeline at the current LSN and writes to the new branch.
5695 6 : #[allow(clippy::needless_range_loop)]
5696 306 : for idx in 0..NUM_TLINES {
5697 300 : let new_tline_id = TimelineId::generate();
5698 300 : tenant
5699 300 : .branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx)
5700 6 : .await?;
5701 300 : tline = tenant
5702 300 : .get_timeline(new_tline_id, true)
5703 300 : .expect("Should have the branched timeline");
5704 6 : // Random block numbers may repeat; `updated[idx][blknum]` keeps the last LSN written for that block on this branch.
5705 30300 : for _ in 0..NUM_KEYS {
5706 30000 : lsn = Lsn(lsn.0 + 0x10);
5707 30000 : let blknum = thread_rng().gen_range(0..NUM_KEYS);
5708 30000 : test_key.field6 = blknum as u32;
5709 30000 : let mut writer = tline.writer().await;
5710 30000 : writer
5711 30000 : .put(
5712 30000 : test_key,
5713 30000 : lsn,
5714 30000 : &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))),
5715 30000 : &ctx,
5716 30000 : )
5717 264 : .await?;
5718 30000 : println!("updating [{}][{}] at {}", idx, blknum, lsn);
5719 30000 : writer.finish_write(lsn);
5720 30000 : drop(writer);
5721 30000 : updated[idx][blknum] = lsn;
5722 6 : }
5723 6 : }
5724 6 :
5725 6 : // Read pages from leaf timeline across all ancestors.
5726 300 : for (idx, lsns) in updated.iter().enumerate() {
5727 30000 : for (blknum, lsn) in lsns.iter().enumerate() {
5728 6 : // Skip empty mutations.
5729 30000 : if lsn.0 == 0 {
5730 11019 : continue;
5731 18981 : }
5732 18981 : println!("checking [{idx}][{blknum}] at {lsn}");
5733 18981 : test_key.field6 = blknum as u32;
5734 18981 : assert_eq!(
5735 18981 : tline.get(test_key, *lsn, &ctx).await?,
5736 18981 : test_img(&format!("{idx} {blknum} at {lsn}"))
5737 6 : );
5738 6 : }
5739 6 : }
5740 6 : Ok(())
5741 6 : }
5742 : // Checks that flushing writes made at `initdb_lsn` goes through the initdb optimization: it sets `expect_initdb_optimization` before the flush and asserts `initdb_optimization_count > 0` afterwards.
5743 : #[tokio::test]
5744 6 : async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
5745 6 : let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")
5746 6 : .await?
5747 6 : .load()
5748 24 : .await;
5749 6 : // NOTE(review): the harness name above differs from this test's function name, unlike most sibling tests; confirm this is intended.
5750 6 : let initdb_lsn = Lsn(0x20);
5751 6 : let utline = tenant
5752 6 : .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
5753 6 : .await?;
5754 6 : let tline = utline.raw_timeline().unwrap();
5755 6 :
5756 6 : // Spawn flush loop now so that we can set the `expect_initdb_optimization`
5757 6 : tline.maybe_spawn_flush_loop();
5758 6 :
5759 6 : // Make sure the timeline has the minimum set of required keys for operation.
5760 6 : // The only operation you can always do on an empty timeline is to `put` new data.
5761 6 : // Except if you `put` at `initdb_lsn`.
5762 6 : // In that case, there's an optimization to directly create image layers instead of delta layers.
5763 6 : // It uses `repartition()`, which assumes some keys to be present.
5764 6 : // Let's make sure the test timeline can handle that case.
5765 6 : {
5766 6 : let mut state = tline.flush_loop_state.lock().unwrap();
5767 6 : assert_eq!(
5768 6 : timeline::FlushLoopState::Running {
5769 6 : expect_initdb_optimization: false,
5770 6 : initdb_optimization_count: 0,
5771 6 : },
5772 6 : *state
5773 6 : );
5774 6 : *state = timeline::FlushLoopState::Running {
5775 6 : expect_initdb_optimization: true,
5776 6 : initdb_optimization_count: 0,
5777 6 : };
5778 6 : }
5779 6 :
5780 6 : // Make writes at the initdb_lsn. When we flush it below, it should be handled by the optimization.
5781 6 : // As explained above, the optimization requires some keys to be present.
5782 6 : // As per `create_empty_timeline` documentation, use init_empty to set them.
5783 6 : // This is what `create_test_timeline` does, by the way.
5784 6 : let mut modification = tline.begin_modification(initdb_lsn);
5785 6 : modification
5786 6 : .init_empty_test_timeline()
5787 6 : .context("init_empty_test_timeline")?;
5788 6 : modification
5789 6 : .commit(&ctx)
5790 6 : .await
5791 6 : .context("commit init_empty_test_timeline modification")?;
5792 6 :
5793 6 : // Do the flush. The flush code will check the expectations that we set above.
5794 6 : tline.freeze_and_flush().await?;
5795 6 :
5796 6 : // assert freeze_and_flush exercised the initdb optimization
5797 6 : {
5798 6 : let state = tline.flush_loop_state.lock().unwrap();
5799 6 : let timeline::FlushLoopState::Running {
5800 6 : expect_initdb_optimization,
5801 6 : initdb_optimization_count,
5802 6 : } = *state
5803 6 : else {
5804 6 : panic!("unexpected state: {:?}", *state);
5805 6 : };
5806 6 : assert!(expect_initdb_optimization);
5807 6 : assert!(initdb_optimization_count > 0);
5808 6 : }
5809 6 : Ok(())
5810 6 : }
5811 : // Hard-shuts-down a timeline that is still being created and leaks its creation guard with `mem::forget` (emulating a crash, per the test name); a later `harness.load()` must neither expose the timeline nor leave its directory behind.
5812 : #[tokio::test]
5813 6 : async fn test_create_guard_crash() -> anyhow::Result<()> {
5814 6 : let name = "test_create_guard_crash";
5815 6 : let harness = TenantHarness::create(name).await?;
5816 6 : {
5817 24 : let (tenant, ctx) = harness.load().await;
5818 6 : let tline = tenant
5819 6 : .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
5820 6 : .await?;
5821 6 : // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
5822 6 : let raw_tline = tline.raw_timeline().unwrap();
5823 6 : raw_tline
5824 6 : .shutdown(super::timeline::ShutdownMode::Hard)
5825 6 : .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
5826 6 : .await;
5827 6 : std::mem::forget(tline);
5828 6 : }
5829 6 : // Load the tenant again from the same harness; the half-created timeline must not be found.
5830 24 : let (tenant, _) = harness.load().await;
5831 6 : match tenant.get_timeline(TIMELINE_ID, false) {
5832 6 : Ok(_) => panic!("timeline should've been removed during load"),
5833 6 : Err(e) => {
5834 6 : assert_eq!(
5835 6 : e,
5836 6 : GetTimelineError::NotFound {
5837 6 : tenant_id: tenant.tenant_shard_id,
5838 6 : timeline_id: TIMELINE_ID,
5839 6 : }
5840 6 : )
5841 6 : }
5842 6 : }
5843 6 : // The timeline directory must not exist either.
5844 6 : assert!(!harness
5845 6 : .conf
5846 6 : .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
5847 6 : .exists());
5848 6 :
5849 6 : Ok(())
5850 6 : }
5851 : // Runs `test_read_at_max_lsn_algorithm` once per compaction algorithm (legacy and tiered), each with its own harness name.
5852 : #[tokio::test]
5853 6 : async fn test_read_at_max_lsn() -> anyhow::Result<()> {
5854 6 : let names_algorithms = [
5855 6 : ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
5856 6 : ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
5857 6 : ];
5858 18 : for (name, algorithm) in names_algorithms {
5859 98198 : test_read_at_max_lsn_algorithm(name, algorithm).await?;
5860 6 : }
5861 6 : Ok(())
5862 6 : }
5863 : // After a bulk insert with `compact` = false, reads one key at LSN `u64::MAX - 1` and requires the read to succeed, with the tenant configured for the given compaction algorithm.
5864 12 : async fn test_read_at_max_lsn_algorithm(
5865 12 : name: &'static str,
5866 12 : compaction_algorithm: CompactionAlgorithm,
5867 12 : ) -> anyhow::Result<()> {
5868 12 : let mut harness = TenantHarness::create(name).await?;
5869 12 : harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
5870 12 : kind: compaction_algorithm,
5871 12 : };
5872 48 : let (tenant, ctx) = harness.load().await;
5873 12 : let tline = tenant
5874 12 : .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
5875 21 : .await?;
5876 : // The last argument of `bulk_insert_maybe_compact_gc` is passed as `compact` = false here.
5877 12 : let lsn = Lsn(0x10);
5878 12 : let compact = false;
5879 96899 : bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?;
5880 :
5881 12 : let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
5882 12 : let read_lsn = Lsn(u64::MAX - 1);
5883 : // Only success is checked, not the value. `unwrap_err()` below is a message argument and is evaluated only if the assert fails.
5884 1230 : let result = tline.get(test_key, read_lsn, &ctx).await;
5885 12 : assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
5886 :
5887 12 : Ok(())
5888 12 : }
5889 : // Writes NUM_KEYS sparse keys (STEP apart) under `AUX_KEY_PREFIX`, then repeatedly checks point reads and a full-range vectored read against the latest writes, interleaved with random updates, flush, compaction and GC.
5890 : #[tokio::test]
5891 6 : async fn test_metadata_scan() -> anyhow::Result<()> {
5892 6 : let harness = TenantHarness::create("test_metadata_scan").await?;
5893 23 : let (tenant, ctx) = harness.load().await;
5894 6 : let tline = tenant
5895 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
5896 12 : .await?;
5897 6 :
5898 6 : const NUM_KEYS: usize = 1000;
5899 6 : const STEP: usize = 10000; // random update + scan base_key + idx * STEP
5900 6 :
5901 6 : let cancel = CancellationToken::new();
5902 6 : // Keys are `base_key` with field1 set to AUX_KEY_PREFIX and field6 set to idx * STEP.
5903 6 : let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
5904 6 : base_key.field1 = AUX_KEY_PREFIX;
5905 6 : let mut test_key = base_key;
5906 6 :
5907 6 : // Track when each page was last modified. Used to assert that
5908 6 : // a read sees the latest page version.
5909 6 : let mut updated = [Lsn(0); NUM_KEYS];
5910 6 :
5911 6 : let mut lsn = Lsn(0x10);
5912 6 : #[allow(clippy::needless_range_loop)]
5913 6006 : for blknum in 0..NUM_KEYS {
5914 6000 : lsn = Lsn(lsn.0 + 0x10);
5915 6000 : test_key.field6 = (blknum * STEP) as u32;
5916 6000 : let mut writer = tline.writer().await;
5917 6000 : writer
5918 6000 : .put(
5919 6000 : test_key,
5920 6000 : lsn,
5921 6000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
5922 6000 : &ctx,
5923 6000 : )
5924 6 : .await?;
5925 6000 : writer.finish_write(lsn);
5926 6000 : updated[blknum] = lsn;
5927 6000 : drop(writer);
5928 6 : }
5929 6 : // One range covering NUM_KEYS * STEP key slots starting at `base_key`.
5930 6 : let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
5931 6 :
5932 72 : for iter in 0..=10 {
5933 6 : // Read all the blocks
5934 66000 : for (blknum, last_lsn) in updated.iter().enumerate() {
5935 66000 : test_key.field6 = (blknum * STEP) as u32;
5936 66000 : assert_eq!(
5937 66000 : tline.get(test_key, lsn, &ctx).await?,
5938 66000 : test_img(&format!("{} at {}", blknum, last_lsn))
5939 6 : );
5940 6 : }
5941 6 : // Scan the whole range in one vectored read; it must return exactly NUM_KEYS keys, each on a STEP boundary.
5942 66 : let mut cnt = 0;
5943 66000 : for (key, value) in tline
5944 66 : .get_vectored_impl(
5945 66 : keyspace.clone(),
5946 66 : lsn,
5947 66 : &mut ValuesReconstructState::default(),
5948 66 : &ctx,
5949 66 : )
5950 12607 : .await?
5951 6 : {
5952 66000 : let blknum = key.field6 as usize;
5953 66000 : let value = value?;
5954 66000 : assert!(blknum % STEP == 0);
5955 66000 : let blknum = blknum / STEP;
5956 66000 : assert_eq!(
5957 66000 : value,
5958 66000 : test_img(&format!("{} at {}", blknum, updated[blknum]))
5959 66000 : );
5960 66000 : cnt += 1;
5961 6 : }
5962 6 :
5963 66 : assert_eq!(cnt, NUM_KEYS);
5964 6 : // Overwrite NUM_KEYS randomly chosen keys at fresh LSNs.
5965 66066 : for _ in 0..NUM_KEYS {
5966 66000 : lsn = Lsn(lsn.0 + 0x10);
5967 66000 : let blknum = thread_rng().gen_range(0..NUM_KEYS);
5968 66000 : test_key.field6 = (blknum * STEP) as u32;
5969 66000 : let mut writer = tline.writer().await;
5970 66000 : writer
5971 66000 : .put(
5972 66000 : test_key,
5973 66000 : lsn,
5974 66000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
5975 66000 : &ctx,
5976 66000 : )
5977 422 : .await?;
5978 66000 : writer.finish_write(lsn);
5979 66000 : drop(writer);
5980 66000 : updated[blknum] = lsn;
5981 6 : }
5982 6 :
5983 6 : // Perform two cycles of flush, compact, and GC
5984 198 : for round in 0..2 {
5985 132 : tline.freeze_and_flush().await?;
5986 132 : tline
5987 132 : .compact(
5988 132 : &cancel,
5989 132 : if iter % 5 == 0 && round == 0 {
5990 18 : let mut flags = EnumSet::new();
5991 18 : flags.insert(CompactFlags::ForceImageLayerCreation);
5992 18 : flags.insert(CompactFlags::ForceRepartition);
5993 18 : flags
5994 6 : } else {
5995 114 : EnumSet::empty()
5996 6 : },
5997 132 : &ctx,
5998 6 : )
5999 23966 : .await?;
6000 132 : tenant
6001 132 : .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
6002 121 : .await?;
6003 6 : }
6004 6 : }
6005 6 :
6006 6 : Ok(())
6007 6 : }
6008 :
6009 : #[tokio::test]
6010 6 : async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
6011 6 : let harness = TenantHarness::create("test_metadata_compaction_trigger").await?;
6012 24 : let (tenant, ctx) = harness.load().await;
6013 6 : let tline = tenant
6014 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
6015 12 : .await?;
6016 6 :
6017 6 : let cancel = CancellationToken::new();
6018 6 :
6019 6 : let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
6020 6 : base_key.field1 = AUX_KEY_PREFIX;
6021 6 : let test_key = base_key;
6022 6 : let mut lsn = Lsn(0x10);
6023 6 :
6024 126 : for _ in 0..20 {
6025 120 : lsn = Lsn(lsn.0 + 0x10);
6026 120 : let mut writer = tline.writer().await;
6027 120 : writer
6028 120 : .put(
6029 120 : test_key,
6030 120 : lsn,
6031 120 : &Value::Image(test_img(&format!("{} at {}", 0, lsn))),
6032 120 : &ctx,
6033 120 : )
6034 60 : .await?;
6035 120 : writer.finish_write(lsn);
6036 120 : drop(writer);
6037 120 : tline.freeze_and_flush().await?; // force create a delta layer
6038 6 : }
6039 6 :
6040 6 : let before_num_l0_delta_files =
6041 6 : tline.layers.read().await.layer_map()?.level0_deltas().len();
6042 6 :
6043 330 : tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
6044 6 :
6045 6 : let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
6046 6 :
6047 6 : assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
6048 6 :
6049 6 : assert_eq!(
6050 12 : tline.get(test_key, lsn, &ctx).await?,
6051 6 : test_img(&format!("{} at {}", 0, lsn))
6052 6 : );
6053 6 :
6054 6 : Ok(())
6055 6 : }
6056 :
6057 : #[tokio::test]
6058 6 : async fn test_branch_copies_dirty_aux_file_flag() {
6059 6 : let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag")
6060 6 : .await
6061 6 : .unwrap();
6062 6 :
6063 6 : // the default aux file policy to switch is v2 if not set by the admins
6064 6 : assert_eq!(
6065 6 : harness.tenant_conf.switch_aux_file_policy,
6066 6 : AuxFilePolicy::default_tenant_config()
6067 6 : );
6068 24 : let (tenant, ctx) = harness.load().await;
6069 6 :
6070 6 : let mut lsn = Lsn(0x08);
6071 6 :
6072 6 : let tline: Arc<Timeline> = tenant
6073 6 : .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
6074 12 : .await
6075 6 : .unwrap();
6076 6 :
6077 6 : // no aux file is written at this point, so the persistent flag should be unset
6078 6 : assert_eq!(tline.last_aux_file_policy.load(), None);
6079 6 :
6080 6 : {
6081 6 : lsn += 8;
6082 6 : let mut modification = tline.begin_modification(lsn);
6083 6 : modification
6084 6 : .put_file("pg_logical/mappings/test1", b"first", &ctx)
6085 6 : .await
6086 6 : .unwrap();
6087 6 : modification.commit(&ctx).await.unwrap();
6088 6 : }
6089 6 :
6090 6 : // there is no tenant manager to pass the configuration through, so lets mimic it
6091 6 : tenant.set_new_location_config(
6092 6 : AttachedTenantConf::try_from(LocationConf::attached_single(
6093 6 : TenantConfOpt {
6094 6 : switch_aux_file_policy: Some(AuxFilePolicy::V2),
6095 6 : ..Default::default()
6096 6 : },
6097 6 : tenant.generation,
6098 6 : &pageserver_api::models::ShardParameters::default(),
6099 6 : ))
6100 6 : .unwrap(),
6101 6 : );
6102 6 :
6103 6 : assert_eq!(
6104 6 : tline.get_switch_aux_file_policy(),
6105 6 : AuxFilePolicy::V2,
6106 6 : "wanted state has been updated"
6107 6 : );
6108 6 : assert_eq!(
6109 6 : tline.last_aux_file_policy.load(),
6110 6 : Some(AuxFilePolicy::V2),
6111 6 : "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
6112 6 : );
6113 6 :
6114 6 : // we can read everything from the storage
6115 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6116 6 : assert_eq!(
6117 6 : files.get("pg_logical/mappings/test1"),
6118 6 : Some(&bytes::Bytes::from_static(b"first"))
6119 6 : );
6120 6 :
6121 6 : {
6122 6 : lsn += 8;
6123 6 : let mut modification = tline.begin_modification(lsn);
6124 6 : modification
6125 6 : .put_file("pg_logical/mappings/test2", b"second", &ctx)
6126 6 : .await
6127 6 : .unwrap();
6128 6 : modification.commit(&ctx).await.unwrap();
6129 6 : }
6130 6 :
6131 6 : assert_eq!(
6132 6 : tline.last_aux_file_policy.load(),
6133 6 : Some(AuxFilePolicy::V2),
6134 6 : "keep v2 storage format when new files are written"
6135 6 : );
6136 6 :
6137 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6138 6 : assert_eq!(
6139 6 : files.get("pg_logical/mappings/test2"),
6140 6 : Some(&bytes::Bytes::from_static(b"second"))
6141 6 : );
6142 6 :
6143 6 : let child = tenant
6144 6 : .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
6145 6 : .await
6146 6 : .unwrap();
6147 6 :
6148 6 : // child copies the last flag even if that is not on remote storage yet
6149 6 : assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
6150 6 : assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
6151 6 :
6152 6 : let files = child.list_aux_files(lsn, &ctx).await.unwrap();
6153 6 : assert_eq!(files.get("pg_logical/mappings/test1"), None);
6154 6 : assert_eq!(files.get("pg_logical/mappings/test2"), None);
6155 6 :
6156 6 : // even if we crash here without flushing parent timeline with it's new
6157 6 : // last_aux_file_policy we are safe, because child was never meant to access ancestor's
6158 6 : // files. the ancestor can even switch back to V1 because of a migration safely.
6159 6 : }
6160 :
6161 : #[tokio::test]
6162 6 : async fn aux_file_policy_switch() {
6163 6 : let mut harness = TenantHarness::create("aux_file_policy_switch")
6164 6 : .await
6165 6 : .unwrap();
6166 6 : harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
6167 24 : let (tenant, ctx) = harness.load().await;
6168 6 :
6169 6 : let mut lsn = Lsn(0x08);
6170 6 :
6171 6 : let tline: Arc<Timeline> = tenant
6172 6 : .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
6173 12 : .await
6174 6 : .unwrap();
6175 6 :
6176 6 : assert_eq!(
6177 6 : tline.last_aux_file_policy.load(),
6178 6 : None,
6179 6 : "no aux file is written so it should be unset"
6180 6 : );
6181 6 :
6182 6 : {
6183 6 : lsn += 8;
6184 6 : let mut modification = tline.begin_modification(lsn);
6185 6 : modification
6186 6 : .put_file("pg_logical/mappings/test1", b"first", &ctx)
6187 15 : .await
6188 6 : .unwrap();
6189 6 : modification.commit(&ctx).await.unwrap();
6190 6 : }
6191 6 :
6192 6 : // there is no tenant manager to pass the configuration through, so lets mimic it
6193 6 : tenant.set_new_location_config(
6194 6 : AttachedTenantConf::try_from(LocationConf::attached_single(
6195 6 : TenantConfOpt {
6196 6 : switch_aux_file_policy: Some(AuxFilePolicy::V2),
6197 6 : ..Default::default()
6198 6 : },
6199 6 : tenant.generation,
6200 6 : &pageserver_api::models::ShardParameters::default(),
6201 6 : ))
6202 6 : .unwrap(),
6203 6 : );
6204 6 :
6205 6 : assert_eq!(
6206 6 : tline.get_switch_aux_file_policy(),
6207 6 : AuxFilePolicy::V2,
6208 6 : "wanted state has been updated"
6209 6 : );
6210 6 : assert_eq!(
6211 6 : tline.last_aux_file_policy.load(),
6212 6 : Some(AuxFilePolicy::CrossValidation),
6213 6 : "dirty index_part.json reflected state is yet to be updated"
6214 6 : );
6215 6 :
6216 6 : // we can still read the auxfile v1 before we ingest anything new
6217 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6218 6 : assert_eq!(
6219 6 : files.get("pg_logical/mappings/test1"),
6220 6 : Some(&bytes::Bytes::from_static(b"first"))
6221 6 : );
6222 6 :
6223 6 : {
6224 6 : lsn += 8;
6225 6 : let mut modification = tline.begin_modification(lsn);
6226 6 : modification
6227 6 : .put_file("pg_logical/mappings/test2", b"second", &ctx)
6228 6 : .await
6229 6 : .unwrap();
6230 6 : modification.commit(&ctx).await.unwrap();
6231 6 : }
6232 6 :
6233 6 : assert_eq!(
6234 6 : tline.last_aux_file_policy.load(),
6235 6 : Some(AuxFilePolicy::V2),
6236 6 : "ingesting a file should apply the wanted switch state when applicable"
6237 6 : );
6238 6 :
6239 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6240 6 : assert_eq!(
6241 6 : files.get("pg_logical/mappings/test1"),
6242 6 : Some(&bytes::Bytes::from_static(b"first")),
6243 6 : "cross validation writes to both v1 and v2 so this should be available in v2"
6244 6 : );
6245 6 : assert_eq!(
6246 6 : files.get("pg_logical/mappings/test2"),
6247 6 : Some(&bytes::Bytes::from_static(b"second"))
6248 6 : );
6249 6 :
6250 6 : // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file)
6251 6 : tenant.set_new_location_config(
6252 6 : AttachedTenantConf::try_from(LocationConf::attached_single(
6253 6 : TenantConfOpt {
6254 6 : switch_aux_file_policy: Some(AuxFilePolicy::V1),
6255 6 : ..Default::default()
6256 6 : },
6257 6 : tenant.generation,
6258 6 : &pageserver_api::models::ShardParameters::default(),
6259 6 : ))
6260 6 : .unwrap(),
6261 6 : );
6262 6 :
6263 6 : {
6264 6 : lsn += 8;
6265 6 : let mut modification = tline.begin_modification(lsn);
6266 6 : modification
6267 6 : .put_file("pg_logical/mappings/test2", b"third", &ctx)
6268 6 : .await
6269 6 : .unwrap();
6270 6 : modification.commit(&ctx).await.unwrap();
6271 6 : }
6272 6 :
6273 6 : assert_eq!(
6274 6 : tline.get_switch_aux_file_policy(),
6275 6 : AuxFilePolicy::V1,
6276 6 : "wanted state has been updated again, even if invalid request"
6277 6 : );
6278 6 :
6279 6 : assert_eq!(
6280 6 : tline.last_aux_file_policy.load(),
6281 6 : Some(AuxFilePolicy::V2),
6282 6 : "ingesting a file should apply the wanted switch state when applicable"
6283 6 : );
6284 6 :
6285 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6286 6 : assert_eq!(
6287 6 : files.get("pg_logical/mappings/test1"),
6288 6 : Some(&bytes::Bytes::from_static(b"first"))
6289 6 : );
6290 6 : assert_eq!(
6291 6 : files.get("pg_logical/mappings/test2"),
6292 6 : Some(&bytes::Bytes::from_static(b"third"))
6293 6 : );
6294 6 :
6295 6 : // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file)
6296 6 : tenant.set_new_location_config(
6297 6 : AttachedTenantConf::try_from(LocationConf::attached_single(
6298 6 : TenantConfOpt {
6299 6 : switch_aux_file_policy: Some(AuxFilePolicy::V2),
6300 6 : ..Default::default()
6301 6 : },
6302 6 : tenant.generation,
6303 6 : &pageserver_api::models::ShardParameters::default(),
6304 6 : ))
6305 6 : .unwrap(),
6306 6 : );
6307 6 :
6308 6 : {
6309 6 : lsn += 8;
6310 6 : let mut modification = tline.begin_modification(lsn);
6311 6 : modification
6312 6 : .put_file("pg_logical/mappings/test3", b"last", &ctx)
6313 6 : .await
6314 6 : .unwrap();
6315 6 : modification.commit(&ctx).await.unwrap();
6316 6 : }
6317 6 :
6318 6 : assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2);
6319 6 :
6320 6 : assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
6321 6 :
6322 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6323 6 : assert_eq!(
6324 6 : files.get("pg_logical/mappings/test1"),
6325 6 : Some(&bytes::Bytes::from_static(b"first"))
6326 6 : );
6327 6 : assert_eq!(
6328 6 : files.get("pg_logical/mappings/test2"),
6329 6 : Some(&bytes::Bytes::from_static(b"third"))
6330 6 : );
6331 6 : assert_eq!(
6332 6 : files.get("pg_logical/mappings/test3"),
6333 6 : Some(&bytes::Bytes::from_static(b"last"))
6334 6 : );
6335 6 : }
6336 :
6337 : #[tokio::test]
6338 6 : async fn aux_file_policy_force_switch() {
6339 6 : let mut harness = TenantHarness::create("aux_file_policy_force_switch")
6340 6 : .await
6341 6 : .unwrap();
6342 6 : harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
6343 24 : let (tenant, ctx) = harness.load().await;
6344 6 :
6345 6 : let mut lsn = Lsn(0x08);
6346 6 :
6347 6 : let tline: Arc<Timeline> = tenant
6348 6 : .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
6349 12 : .await
6350 6 : .unwrap();
6351 6 :
6352 6 : assert_eq!(
6353 6 : tline.last_aux_file_policy.load(),
6354 6 : None,
6355 6 : "no aux file is written so it should be unset"
6356 6 : );
6357 6 :
6358 6 : {
6359 6 : lsn += 8;
6360 6 : let mut modification = tline.begin_modification(lsn);
6361 6 : modification
6362 6 : .put_file("pg_logical/mappings/test1", b"first", &ctx)
6363 15 : .await
6364 6 : .unwrap();
6365 6 : modification.commit(&ctx).await.unwrap();
6366 6 : }
6367 6 :
6368 6 : tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
6369 6 :
6370 6 : assert_eq!(
6371 6 : tline.last_aux_file_policy.load(),
6372 6 : Some(AuxFilePolicy::V2),
6373 6 : "dirty index_part.json reflected state is yet to be updated"
6374 6 : );
6375 6 :
6376 6 : // lose all data from v1
6377 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6378 6 : assert_eq!(files.get("pg_logical/mappings/test1"), None);
6379 6 :
6380 6 : {
6381 6 : lsn += 8;
6382 6 : let mut modification = tline.begin_modification(lsn);
6383 6 : modification
6384 6 : .put_file("pg_logical/mappings/test2", b"second", &ctx)
6385 6 : .await
6386 6 : .unwrap();
6387 6 : modification.commit(&ctx).await.unwrap();
6388 6 : }
6389 6 :
6390 6 : // read data ingested in v2
6391 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6392 6 : assert_eq!(
6393 6 : files.get("pg_logical/mappings/test2"),
6394 6 : Some(&bytes::Bytes::from_static(b"second"))
6395 6 : );
6396 6 : // lose all data from v1
6397 6 : assert_eq!(files.get("pg_logical/mappings/test1"), None);
6398 6 : }
6399 :
6400 : #[tokio::test]
6401 6 : async fn aux_file_policy_auto_detect() {
6402 6 : let mut harness = TenantHarness::create("aux_file_policy_auto_detect")
6403 6 : .await
6404 6 : .unwrap();
6405 6 : harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
6406 24 : let (tenant, ctx) = harness.load().await;
6407 6 :
6408 6 : let mut lsn = Lsn(0x08);
6409 6 :
6410 6 : let tline: Arc<Timeline> = tenant
6411 6 : .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
6412 12 : .await
6413 6 : .unwrap();
6414 6 :
6415 6 : assert_eq!(
6416 6 : tline.last_aux_file_policy.load(),
6417 6 : None,
6418 6 : "no aux file is written so it should be unset"
6419 6 : );
6420 6 :
6421 6 : {
6422 6 : lsn += 8;
6423 6 : let mut modification = tline.begin_modification(lsn);
6424 6 : let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
6425 6 : files: vec![(
6426 6 : "test_file".to_string(),
6427 6 : Bytes::copy_from_slice(b"test_file"),
6428 6 : )]
6429 6 : .into_iter()
6430 6 : .collect(),
6431 6 : })
6432 6 : .unwrap();
6433 6 : modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
6434 6 : modification.commit(&ctx).await.unwrap();
6435 6 : }
6436 6 :
6437 6 : {
6438 6 : lsn += 8;
6439 6 : let mut modification = tline.begin_modification(lsn);
6440 6 : modification
6441 6 : .put_file("pg_logical/mappings/test1", b"first", &ctx)
6442 6 : .await
6443 6 : .unwrap();
6444 6 : modification.commit(&ctx).await.unwrap();
6445 6 : }
6446 6 :
6447 6 : assert_eq!(
6448 6 : tline.last_aux_file_policy.load(),
6449 6 : Some(AuxFilePolicy::V1),
6450 6 : "keep using v1 because there are aux files writting with v1"
6451 6 : );
6452 6 :
6453 6 : // we can still read the auxfile v1
6454 6 : let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
6455 6 : assert_eq!(
6456 6 : files.get("pg_logical/mappings/test1"),
6457 6 : Some(&bytes::Bytes::from_static(b"first"))
6458 6 : );
6459 6 : assert_eq!(
6460 6 : files.get("test_file"),
6461 6 : Some(&bytes::Bytes::from_static(b"test_file"))
6462 6 : );
6463 6 : }
6464 :
6465 : #[tokio::test]
6466 6 : async fn test_metadata_image_creation() -> anyhow::Result<()> {
6467 6 : let harness = TenantHarness::create("test_metadata_image_creation").await?;
6468 24 : let (tenant, ctx) = harness.load().await;
6469 6 : let tline = tenant
6470 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
6471 12 : .await?;
6472 6 :
6473 6 : const NUM_KEYS: usize = 1000;
6474 6 : const STEP: usize = 10000; // random update + scan base_key + idx * STEP
6475 6 :
6476 6 : let cancel = CancellationToken::new();
6477 6 :
6478 6 : let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
6479 6 : assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
6480 6 : let mut test_key = base_key;
6481 6 : let mut lsn = Lsn(0x10);
6482 6 :
6483 24 : async fn scan_with_statistics(
6484 24 : tline: &Timeline,
6485 24 : keyspace: &KeySpace,
6486 24 : lsn: Lsn,
6487 24 : ctx: &RequestContext,
6488 24 : ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
6489 24 : let mut reconstruct_state = ValuesReconstructState::default();
6490 24 : let res = tline
6491 24 : .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
6492 4059 : .await?;
6493 24 : Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
6494 24 : }
6495 6 :
6496 6 : #[allow(clippy::needless_range_loop)]
6497 6006 : for blknum in 0..NUM_KEYS {
6498 6000 : lsn = Lsn(lsn.0 + 0x10);
6499 6000 : test_key.field6 = (blknum * STEP) as u32;
6500 6000 : let mut writer = tline.writer().await;
6501 6000 : writer
6502 6000 : .put(
6503 6000 : test_key,
6504 6000 : lsn,
6505 6000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
6506 6000 : &ctx,
6507 6000 : )
6508 6 : .await?;
6509 6000 : writer.finish_write(lsn);
6510 6000 : drop(writer);
6511 6 : }
6512 6 :
6513 6 : let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
6514 6 :
6515 66 : for iter in 1..=10 {
6516 60060 : for _ in 0..NUM_KEYS {
6517 60000 : lsn = Lsn(lsn.0 + 0x10);
6518 60000 : let blknum = thread_rng().gen_range(0..NUM_KEYS);
6519 60000 : test_key.field6 = (blknum * STEP) as u32;
6520 60000 : let mut writer = tline.writer().await;
6521 60000 : writer
6522 60000 : .put(
6523 60000 : test_key,
6524 60000 : lsn,
6525 60000 : &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
6526 60000 : &ctx,
6527 60000 : )
6528 104 : .await?;
6529 60000 : writer.finish_write(lsn);
6530 60000 : drop(writer);
6531 6 : }
6532 6 :
6533 60 : tline.freeze_and_flush().await?;
6534 6 :
6535 60 : if iter % 5 == 0 {
6536 12 : let (_, before_delta_file_accessed) =
6537 4035 : scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
6538 12 : tline
6539 12 : .compact(
6540 12 : &cancel,
6541 12 : {
6542 12 : let mut flags = EnumSet::new();
6543 12 : flags.insert(CompactFlags::ForceImageLayerCreation);
6544 12 : flags.insert(CompactFlags::ForceRepartition);
6545 12 : flags
6546 12 : },
6547 12 : &ctx,
6548 12 : )
6549 17237 : .await?;
6550 12 : let (_, after_delta_file_accessed) =
6551 24 : scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
6552 12 : assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
6553 6 : // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances.
6554 12 : assert!(
6555 12 : after_delta_file_accessed <= 2,
6556 6 : "after_delta_file_accessed={after_delta_file_accessed}"
6557 6 : );
6558 48 : }
6559 6 : }
6560 6 :
6561 6 : Ok(())
6562 6 : }
6563 :
6564 : #[tokio::test]
6565 6 : async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
6566 6 : let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?;
6567 24 : let (tenant, ctx) = harness.load().await;
6568 6 :
6569 6 : let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
6570 6 : let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
6571 6 : let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
6572 6 :
6573 6 : let tline = tenant
6574 6 : .create_test_timeline_with_layers(
6575 6 : TIMELINE_ID,
6576 6 : Lsn(0x10),
6577 6 : DEFAULT_PG_VERSION,
6578 6 : &ctx,
6579 6 : Vec::new(), // delta layers
6580 6 : vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
6581 6 : Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
6582 6 : )
6583 33 : .await?;
6584 6 : tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next())));
6585 6 :
6586 6 : let child = tenant
6587 6 : .branch_timeline_test_with_layers(
6588 6 : &tline,
6589 6 : NEW_TIMELINE_ID,
6590 6 : Some(Lsn(0x20)),
6591 6 : &ctx,
6592 6 : Vec::new(), // delta layers
6593 6 : vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers
6594 6 : Lsn(0x30),
6595 6 : )
6596 21 : .await
6597 6 : .unwrap();
6598 6 :
6599 6 : let lsn = Lsn(0x30);
6600 6 :
6601 6 : // test vectored get on parent timeline
6602 6 : assert_eq!(
6603 12 : get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
6604 6 : Some(test_img("data key 1"))
6605 6 : );
6606 6 : assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx)
6607 9 : .await
6608 6 : .unwrap_err()
6609 6 : .is_missing_key_error());
6610 6 : assert!(
6611 6 : get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx)
6612 6 : .await
6613 6 : .unwrap_err()
6614 6 : .is_missing_key_error()
6615 6 : );
6616 6 :
6617 6 : // test vectored get on child timeline
6618 6 : assert_eq!(
6619 6 : get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
6620 6 : Some(test_img("data key 1"))
6621 6 : );
6622 6 : assert_eq!(
6623 12 : get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
6624 6 : Some(test_img("data key 2"))
6625 6 : );
6626 6 : assert!(
6627 6 : get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx)
6628 6 : .await
6629 6 : .unwrap_err()
6630 6 : .is_missing_key_error()
6631 6 : );
6632 6 :
6633 6 : Ok(())
6634 6 : }
6635 :
6636 : #[tokio::test]
6637 6 : async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
6638 6 : let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
6639 24 : let (tenant, ctx) = harness.load().await;
6640 6 :
6641 6 : let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
6642 6 : let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
6643 6 : let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
6644 6 : assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
6645 6 :
6646 6 : let tline = tenant
6647 6 : .create_test_timeline_with_layers(
6648 6 : TIMELINE_ID,
6649 6 : Lsn(0x10),
6650 6 : DEFAULT_PG_VERSION,
6651 6 : &ctx,
6652 6 : Vec::new(), // delta layers
6653 6 : vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
6654 6 : Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
6655 6 : )
6656 33 : .await?;
6657 6 :
6658 6 : let child = tenant
6659 6 : .branch_timeline_test_with_layers(
6660 6 : &tline,
6661 6 : NEW_TIMELINE_ID,
6662 6 : Some(Lsn(0x20)),
6663 6 : &ctx,
6664 6 : Vec::new(), // delta layers
6665 6 : vec![(
6666 6 : Lsn(0x30),
6667 6 : vec![(base_key_child, test_img("metadata key 2"))],
6668 6 : )], // image layers
6669 6 : Lsn(0x30),
6670 6 : )
6671 21 : .await
6672 6 : .unwrap();
6673 6 :
6674 6 : let lsn = Lsn(0x30);
6675 6 :
6676 6 : // test vectored get on parent timeline
6677 6 : assert_eq!(
6678 12 : get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
6679 6 : Some(test_img("metadata key 1"))
6680 6 : );
6681 6 : assert_eq!(
6682 6 : get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?,
6683 6 : None
6684 6 : );
6685 6 : assert_eq!(
6686 6 : get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
6687 6 : None
6688 6 : );
6689 6 :
6690 6 : // test vectored get on child timeline
6691 6 : assert_eq!(
6692 6 : get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
6693 6 : None
6694 6 : );
6695 6 : assert_eq!(
6696 12 : get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
6697 6 : Some(test_img("metadata key 2"))
6698 6 : );
6699 6 : assert_eq!(
6700 6 : get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
6701 6 : None
6702 6 : );
6703 6 :
6704 6 : Ok(())
6705 6 : }
6706 :
6707 108 : async fn get_vectored_impl_wrapper(
6708 108 : tline: &Arc<Timeline>,
6709 108 : key: Key,
6710 108 : lsn: Lsn,
6711 108 : ctx: &RequestContext,
6712 108 : ) -> Result<Option<Bytes>, GetVectoredError> {
6713 108 : let mut reconstruct_state = ValuesReconstructState::new();
6714 108 : let mut res = tline
6715 108 : .get_vectored_impl(
6716 108 : KeySpace::single(key..key.next()),
6717 108 : lsn,
6718 108 : &mut reconstruct_state,
6719 108 : ctx,
6720 108 : )
6721 99 : .await?;
6722 90 : Ok(res.pop_last().map(|(k, v)| {
6723 54 : assert_eq!(k, key);
6724 54 : v.unwrap()
6725 90 : }))
6726 108 : }
6727 :
6728 : #[tokio::test]
6729 6 : async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
6730 6 : let harness = TenantHarness::create("test_metadata_tombstone_reads").await?;
6731 24 : let (tenant, ctx) = harness.load().await;
6732 6 : let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
6733 6 : let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
6734 6 : let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
6735 6 : let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
6736 6 :
6737 6 : // We emulate the situation that the compaction algorithm creates an image layer that removes the tombstones
6738 6 : // Lsn 0x30 key0, key3, no key1+key2
6739 6 : // Lsn 0x20 key1+key2 tomestones
6740 6 : // Lsn 0x10 key1 in image, key2 in delta
6741 6 : let tline = tenant
6742 6 : .create_test_timeline_with_layers(
6743 6 : TIMELINE_ID,
6744 6 : Lsn(0x10),
6745 6 : DEFAULT_PG_VERSION,
6746 6 : &ctx,
6747 6 : // delta layers
6748 6 : vec![
6749 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6750 6 : Lsn(0x10)..Lsn(0x20),
6751 6 : vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
6752 6 : ),
6753 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6754 6 : Lsn(0x20)..Lsn(0x30),
6755 6 : vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
6756 6 : ),
6757 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6758 6 : Lsn(0x20)..Lsn(0x30),
6759 6 : vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
6760 6 : ),
6761 6 : ],
6762 6 : // image layers
6763 6 : vec![
6764 6 : (Lsn(0x10), vec![(key1, test_img("metadata key 1"))]),
6765 6 : (
6766 6 : Lsn(0x30),
6767 6 : vec![
6768 6 : (key0, test_img("metadata key 0")),
6769 6 : (key3, test_img("metadata key 3")),
6770 6 : ],
6771 6 : ),
6772 6 : ],
6773 6 : Lsn(0x30),
6774 6 : )
6775 114 : .await?;
6776 6 :
6777 6 : let lsn = Lsn(0x30);
6778 6 : let old_lsn = Lsn(0x20);
6779 6 :
6780 6 : assert_eq!(
6781 12 : get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?,
6782 6 : Some(test_img("metadata key 0"))
6783 6 : );
6784 6 : assert_eq!(
6785 6 : get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?,
6786 6 : None,
6787 6 : );
6788 6 : assert_eq!(
6789 6 : get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?,
6790 6 : None,
6791 6 : );
6792 6 : assert_eq!(
6793 12 : get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?,
6794 6 : Some(Bytes::new()),
6795 6 : );
6796 6 : assert_eq!(
6797 12 : get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?,
6798 6 : Some(Bytes::new()),
6799 6 : );
6800 6 : assert_eq!(
6801 6 : get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?,
6802 6 : Some(test_img("metadata key 3"))
6803 6 : );
6804 6 :
6805 6 : Ok(())
6806 6 : }
6807 :
6808 : #[tokio::test]
6809 6 : async fn test_metadata_tombstone_image_creation() {
6810 6 : let harness = TenantHarness::create("test_metadata_tombstone_image_creation")
6811 6 : .await
6812 6 : .unwrap();
6813 24 : let (tenant, ctx) = harness.load().await;
6814 6 :
6815 6 : let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
6816 6 : let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
6817 6 : let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
6818 6 : let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
6819 6 :
6820 6 : let tline = tenant
6821 6 : .create_test_timeline_with_layers(
6822 6 : TIMELINE_ID,
6823 6 : Lsn(0x10),
6824 6 : DEFAULT_PG_VERSION,
6825 6 : &ctx,
6826 6 : // delta layers
6827 6 : vec![
6828 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6829 6 : Lsn(0x10)..Lsn(0x20),
6830 6 : vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
6831 6 : ),
6832 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6833 6 : Lsn(0x20)..Lsn(0x30),
6834 6 : vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
6835 6 : ),
6836 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6837 6 : Lsn(0x20)..Lsn(0x30),
6838 6 : vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
6839 6 : ),
6840 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6841 6 : Lsn(0x30)..Lsn(0x40),
6842 6 : vec![
6843 6 : (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
6844 6 : (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
6845 6 : ],
6846 6 : ),
6847 6 : ],
6848 6 : // image layers
6849 6 : vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
6850 6 : Lsn(0x40),
6851 6 : )
6852 105 : .await
6853 6 : .unwrap();
6854 6 :
6855 6 : let cancel = CancellationToken::new();
6856 6 :
6857 6 : tline
6858 6 : .compact(
6859 6 : &cancel,
6860 6 : {
6861 6 : let mut flags = EnumSet::new();
6862 6 : flags.insert(CompactFlags::ForceImageLayerCreation);
6863 6 : flags.insert(CompactFlags::ForceRepartition);
6864 6 : flags
6865 6 : },
6866 6 : &ctx,
6867 6 : )
6868 165 : .await
6869 6 : .unwrap();
6870 6 :
6871 6 : // Image layers are created at last_record_lsn
6872 6 : let images = tline
6873 6 : .inspect_image_layers(Lsn(0x40), &ctx)
6874 24 : .await
6875 6 : .unwrap()
6876 6 : .into_iter()
6877 54 : .filter(|(k, _)| k.is_metadata_key())
6878 6 : .collect::<Vec<_>>();
6879 6 : assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
6880 6 : }
6881 :
6882 : #[tokio::test]
6883 6 : async fn test_metadata_tombstone_empty_image_creation() {
6884 6 : let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")
6885 6 : .await
6886 6 : .unwrap();
6887 24 : let (tenant, ctx) = harness.load().await;
6888 6 :
6889 6 : let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
6890 6 : let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
6891 6 :
6892 6 : let tline = tenant
6893 6 : .create_test_timeline_with_layers(
6894 6 : TIMELINE_ID,
6895 6 : Lsn(0x10),
6896 6 : DEFAULT_PG_VERSION,
6897 6 : &ctx,
6898 6 : // delta layers
6899 6 : vec![
6900 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6901 6 : Lsn(0x10)..Lsn(0x20),
6902 6 : vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
6903 6 : ),
6904 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6905 6 : Lsn(0x20)..Lsn(0x30),
6906 6 : vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
6907 6 : ),
6908 6 : DeltaLayerTestDesc::new_with_inferred_key_range(
6909 6 : Lsn(0x20)..Lsn(0x30),
6910 6 : vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
6911 6 : ),
6912 6 : ],
6913 6 : // image layers
6914 6 : vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
6915 6 : Lsn(0x30),
6916 6 : )
6917 87 : .await
6918 6 : .unwrap();
6919 6 :
6920 6 : let cancel = CancellationToken::new();
6921 6 :
6922 6 : tline
6923 6 : .compact(
6924 6 : &cancel,
6925 6 : {
6926 6 : let mut flags = EnumSet::new();
6927 6 : flags.insert(CompactFlags::ForceImageLayerCreation);
6928 6 : flags.insert(CompactFlags::ForceRepartition);
6929 6 : flags
6930 6 : },
6931 6 : &ctx,
6932 6 : )
6933 129 : .await
6934 6 : .unwrap();
6935 6 :
6936 6 : // Image layers are created at last_record_lsn
6937 6 : let images = tline
6938 6 : .inspect_image_layers(Lsn(0x30), &ctx)
6939 12 : .await
6940 6 : .unwrap()
6941 6 : .into_iter()
6942 42 : .filter(|(k, _)| k.is_metadata_key())
6943 6 : .collect::<Vec<_>>();
6944 6 : assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
6945 6 : }
6946 :
6947 : #[tokio::test]
6948 6 : async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
6949 6 : let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
6950 24 : let (tenant, ctx) = harness.load().await;
6951 6 :
6952 306 : fn get_key(id: u32) -> Key {
6953 306 : // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
6954 306 : let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
6955 306 : key.field6 = id;
6956 306 : key
6957 306 : }
6958 6 :
6959 6 : // We create
6960 6 : // - one bottom-most image layer,
6961 6 : // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
6962 6 : // - a delta layer D2 crossing the GC horizon with data only below the horizon,
6963 6 : // - a delta layer D3 above the horizon.
6964 6 : //
6965 6 : // | D3 |
6966 6 : // | D1 |
6967 6 : // -| |-- gc horizon -----------------
6968 6 : // | | | D2 |
6969 6 : // --------- img layer ------------------
6970 6 : //
6971 6 : // What we should expact from this compaction is:
6972 6 : // | D3 |
6973 6 : // | Part of D1 |
6974 6 : // --------- img layer with D1+D2 at GC horizon------------------
6975 6 :
6976 6 : // img layer at 0x10
6977 6 : let img_layer = (0..10)
6978 60 : .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
6979 6 : .collect_vec();
6980 6 :
6981 6 : let delta1 = vec![
6982 6 : (
6983 6 : get_key(1),
6984 6 : Lsn(0x20),
6985 6 : Value::Image(Bytes::from("value 1@0x20")),
6986 6 : ),
6987 6 : (
6988 6 : get_key(2),
6989 6 : Lsn(0x30),
6990 6 : Value::Image(Bytes::from("value 2@0x30")),
6991 6 : ),
6992 6 : (
6993 6 : get_key(3),
6994 6 : Lsn(0x40),
6995 6 : Value::Image(Bytes::from("value 3@0x40")),
6996 6 : ),
6997 6 : ];
6998 6 : let delta2 = vec![
6999 6 : (
7000 6 : get_key(5),
7001 6 : Lsn(0x20),
7002 6 : Value::Image(Bytes::from("value 5@0x20")),
7003 6 : ),
7004 6 : (
7005 6 : get_key(6),
7006 6 : Lsn(0x20),
7007 6 : Value::Image(Bytes::from("value 6@0x20")),
7008 6 : ),
7009 6 : ];
7010 6 : let delta3 = vec![
7011 6 : (
7012 6 : get_key(8),
7013 6 : Lsn(0x48),
7014 6 : Value::Image(Bytes::from("value 8@0x48")),
7015 6 : ),
7016 6 : (
7017 6 : get_key(9),
7018 6 : Lsn(0x48),
7019 6 : Value::Image(Bytes::from("value 9@0x48")),
7020 6 : ),
7021 6 : ];
7022 6 :
7023 6 : let tline = tenant
7024 6 : .create_test_timeline_with_layers(
7025 6 : TIMELINE_ID,
7026 6 : Lsn(0x10),
7027 6 : DEFAULT_PG_VERSION,
7028 6 : &ctx,
7029 6 : vec![
7030 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
7031 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
7032 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
7033 6 : ], // delta layers
7034 6 : vec![(Lsn(0x10), img_layer)], // image layers
7035 6 : Lsn(0x50),
7036 6 : )
7037 141 : .await?;
7038 6 : {
7039 6 : // Update GC info
7040 6 : let mut guard = tline.gc_info.write().unwrap();
7041 6 : guard.cutoffs.time = Lsn(0x30);
7042 6 : guard.cutoffs.space = Lsn(0x30);
7043 6 : }
7044 6 :
7045 6 : let expected_result = [
7046 6 : Bytes::from_static(b"value 0@0x10"),
7047 6 : Bytes::from_static(b"value 1@0x20"),
7048 6 : Bytes::from_static(b"value 2@0x30"),
7049 6 : Bytes::from_static(b"value 3@0x40"),
7050 6 : Bytes::from_static(b"value 4@0x10"),
7051 6 : Bytes::from_static(b"value 5@0x20"),
7052 6 : Bytes::from_static(b"value 6@0x20"),
7053 6 : Bytes::from_static(b"value 7@0x10"),
7054 6 : Bytes::from_static(b"value 8@0x48"),
7055 6 : Bytes::from_static(b"value 9@0x48"),
7056 6 : ];
7057 6 :
7058 60 : for (idx, expected) in expected_result.iter().enumerate() {
7059 60 : assert_eq!(
7060 60 : tline
7061 60 : .get(get_key(idx as u32), Lsn(0x50), &ctx)
7062 69 : .await
7063 60 : .unwrap(),
7064 6 : expected
7065 6 : );
7066 6 : }
7067 6 :
7068 6 : let cancel = CancellationToken::new();
7069 6 : tline
7070 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
7071 159 : .await
7072 6 : .unwrap();
7073 6 :
7074 60 : for (idx, expected) in expected_result.iter().enumerate() {
7075 60 : assert_eq!(
7076 60 : tline
7077 60 : .get(get_key(idx as u32), Lsn(0x50), &ctx)
7078 48 : .await
7079 60 : .unwrap(),
7080 6 : expected
7081 6 : );
7082 6 : }
7083 6 :
7084 6 : // Check if the image layer at the GC horizon contains exactly what we want
7085 6 : let image_at_gc_horizon = tline
7086 6 : .inspect_image_layers(Lsn(0x30), &ctx)
7087 6 : .await
7088 6 : .unwrap()
7089 6 : .into_iter()
7090 102 : .filter(|(k, _)| k.is_metadata_key())
7091 6 : .collect::<Vec<_>>();
7092 6 :
7093 6 : assert_eq!(image_at_gc_horizon.len(), 10);
7094 6 : let expected_result = [
7095 6 : Bytes::from_static(b"value 0@0x10"),
7096 6 : Bytes::from_static(b"value 1@0x20"),
7097 6 : Bytes::from_static(b"value 2@0x30"),
7098 6 : Bytes::from_static(b"value 3@0x10"),
7099 6 : Bytes::from_static(b"value 4@0x10"),
7100 6 : Bytes::from_static(b"value 5@0x20"),
7101 6 : Bytes::from_static(b"value 6@0x20"),
7102 6 : Bytes::from_static(b"value 7@0x10"),
7103 6 : Bytes::from_static(b"value 8@0x10"),
7104 6 : Bytes::from_static(b"value 9@0x10"),
7105 6 : ];
7106 66 : for idx in 0..10 {
7107 60 : assert_eq!(
7108 60 : image_at_gc_horizon[idx],
7109 60 : (get_key(idx as u32), expected_result[idx].clone())
7110 60 : );
7111 6 : }
7112 6 :
7113 6 : // Check if old layers are removed / new layers have the expected LSN
7114 6 : let mut all_layers = tline.inspect_historic_layers().await.unwrap();
7115 12 : all_layers.sort_by(|k1, k2| {
7116 12 : (
7117 12 : k1.is_delta,
7118 12 : k1.key_range.start,
7119 12 : k1.key_range.end,
7120 12 : k1.lsn_range.start,
7121 12 : k1.lsn_range.end,
7122 12 : )
7123 12 : .cmp(&(
7124 12 : k2.is_delta,
7125 12 : k2.key_range.start,
7126 12 : k2.key_range.end,
7127 12 : k2.lsn_range.start,
7128 12 : k2.lsn_range.end,
7129 12 : ))
7130 12 : });
7131 6 : assert_eq!(
7132 6 : all_layers,
7133 6 : vec![
7134 6 : // Image layer at GC horizon
7135 6 : PersistentLayerKey {
7136 6 : key_range: Key::MIN..Key::MAX,
7137 6 : lsn_range: Lsn(0x30)..Lsn(0x31),
7138 6 : is_delta: false
7139 6 : },
7140 6 : // The delta layer below the horizon
7141 6 : PersistentLayerKey {
7142 6 : key_range: get_key(3)..get_key(4),
7143 6 : lsn_range: Lsn(0x30)..Lsn(0x48),
7144 6 : is_delta: true
7145 6 : },
7146 6 : // The delta3 layer that should not be picked for the compaction
7147 6 : PersistentLayerKey {
7148 6 : key_range: get_key(8)..get_key(10),
7149 6 : lsn_range: Lsn(0x48)..Lsn(0x50),
7150 6 : is_delta: true
7151 6 : }
7152 6 : ]
7153 6 : );
7154 6 :
7155 6 : // increase GC horizon and compact again
7156 6 : {
7157 6 : // Update GC info
7158 6 : let mut guard = tline.gc_info.write().unwrap();
7159 6 : guard.cutoffs.time = Lsn(0x40);
7160 6 : guard.cutoffs.space = Lsn(0x40);
7161 6 : }
7162 6 : tline
7163 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
7164 126 : .await
7165 6 : .unwrap();
7166 6 :
7167 6 : Ok(())
7168 6 : }
7169 :
7170 : #[tokio::test]
7171 6 : async fn test_neon_test_record() -> anyhow::Result<()> {
7172 6 : let harness = TenantHarness::create("test_neon_test_record").await?;
7173 24 : let (tenant, ctx) = harness.load().await;
7174 6 :
7175 72 : fn get_key(id: u32) -> Key {
7176 72 : // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
7177 72 : let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
7178 72 : key.field6 = id;
7179 72 : key
7180 72 : }
7181 6 :
7182 6 : let delta1 = vec![
7183 6 : (
7184 6 : get_key(1),
7185 6 : Lsn(0x20),
7186 6 : Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
7187 6 : ),
7188 6 : (
7189 6 : get_key(1),
7190 6 : Lsn(0x30),
7191 6 : Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
7192 6 : ),
7193 6 : (get_key(2), Lsn(0x10), Value::Image("0x10".into())),
7194 6 : (
7195 6 : get_key(2),
7196 6 : Lsn(0x20),
7197 6 : Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
7198 6 : ),
7199 6 : (
7200 6 : get_key(2),
7201 6 : Lsn(0x30),
7202 6 : Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
7203 6 : ),
7204 6 : (get_key(3), Lsn(0x10), Value::Image("0x10".into())),
7205 6 : (
7206 6 : get_key(3),
7207 6 : Lsn(0x20),
7208 6 : Value::WalRecord(NeonWalRecord::wal_clear()),
7209 6 : ),
7210 6 : (get_key(4), Lsn(0x10), Value::Image("0x10".into())),
7211 6 : (
7212 6 : get_key(4),
7213 6 : Lsn(0x20),
7214 6 : Value::WalRecord(NeonWalRecord::wal_init()),
7215 6 : ),
7216 6 : ];
7217 6 : let image1 = vec![(get_key(1), "0x10".into())];
7218 6 :
7219 6 : let tline = tenant
7220 6 : .create_test_timeline_with_layers(
7221 6 : TIMELINE_ID,
7222 6 : Lsn(0x10),
7223 6 : DEFAULT_PG_VERSION,
7224 6 : &ctx,
7225 6 : vec![DeltaLayerTestDesc::new_with_inferred_key_range(
7226 6 : Lsn(0x10)..Lsn(0x40),
7227 6 : delta1,
7228 6 : )], // delta layers
7229 6 : vec![(Lsn(0x10), image1)], // image layers
7230 6 : Lsn(0x50),
7231 6 : )
7232 51 : .await?;
7233 6 :
7234 6 : assert_eq!(
7235 24 : tline.get(get_key(1), Lsn(0x50), &ctx).await?,
7236 6 : Bytes::from_static(b"0x10,0x20,0x30")
7237 6 : );
7238 6 : assert_eq!(
7239 6 : tline.get(get_key(2), Lsn(0x50), &ctx).await?,
7240 6 : Bytes::from_static(b"0x10,0x20,0x30")
7241 6 : );
7242 6 :
7243 6 : // Need to remove the limit of "Neon WAL redo requires base image".
7244 6 :
7245 6 : // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
7246 6 : // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
7247 6 :
7248 6 : Ok(())
7249 6 : }
7250 :
7251 : #[tokio::test]
7252 6 : async fn test_lsn_lease() -> anyhow::Result<()> {
7253 24 : let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
7254 6 : let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
7255 6 :
7256 6 : let end_lsn = Lsn(0x100);
7257 6 : let image_layers = (0x20..=0x90)
7258 6 : .step_by(0x10)
7259 48 : .map(|n| {
7260 48 : (
7261 48 : Lsn(n),
7262 48 : vec![(key, test_img(&format!("data key at {:x}", n)))],
7263 48 : )
7264 48 : })
7265 6 : .collect();
7266 6 :
7267 6 : let timeline = tenant
7268 6 : .create_test_timeline_with_layers(
7269 6 : TIMELINE_ID,
7270 6 : Lsn(0x10),
7271 6 : DEFAULT_PG_VERSION,
7272 6 : &ctx,
7273 6 : Vec::new(),
7274 6 : image_layers,
7275 6 : end_lsn,
7276 6 : )
7277 180 : .await?;
7278 6 :
7279 6 : let leased_lsns = [0x30, 0x50, 0x70];
7280 6 : let mut leases = Vec::new();
7281 18 : let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| {
7282 18 : leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?);
7283 18 : Ok(())
7284 18 : });
7285 6 :
7286 6 : // Renewing with shorter lease should not change the lease.
7287 6 : let updated_lease_0 =
7288 6 : timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?;
7289 6 : assert_eq!(updated_lease_0.valid_until, leases[0].valid_until);
7290 6 :
7291 6 : // Renewing with a long lease should renew lease with later expiration time.
7292 6 : let updated_lease_1 = timeline.make_lsn_lease(
7293 6 : Lsn(leased_lsns[1]),
7294 6 : timeline.get_lsn_lease_length() * 2,
7295 6 : &ctx,
7296 6 : )?;
7297 6 :
7298 6 : assert!(updated_lease_1.valid_until > leases[1].valid_until);
7299 6 :
7300 6 : // Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
7301 6 : info!(
7302 6 : "latest_gc_cutoff_lsn: {}",
7303 0 : *timeline.get_latest_gc_cutoff_lsn()
7304 6 : );
7305 6 : timeline.force_set_disk_consistent_lsn(end_lsn);
7306 6 :
7307 6 : let res = tenant
7308 6 : .gc_iteration(
7309 6 : Some(TIMELINE_ID),
7310 6 : 0,
7311 6 : Duration::ZERO,
7312 6 : &CancellationToken::new(),
7313 6 : &ctx,
7314 6 : )
7315 6 : .await?;
7316 6 :
7317 6 : // Keeping everything <= Lsn(0x80) b/c leases:
7318 6 : // 0/10: initdb layer
7319 6 : // (0/20..=0/70).step_by(0x10): image layers added when creating the timeline.
7320 6 : assert_eq!(res.layers_needed_by_leases, 7);
7321 6 : // Keeping 0/90 b/c it is the latest layer.
7322 6 : assert_eq!(res.layers_not_updated, 1);
7323 6 : // Removed 0/80.
7324 6 : assert_eq!(res.layers_removed, 1);
7325 6 :
7326 6 : // Make lease on a already GC-ed LSN.
7327 6 : // 0/80 does not have a valid lease + is below latest_gc_cutoff
7328 6 : assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
7329 6 : let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx);
7330 6 : assert!(res.is_err());
7331 6 :
7332 6 : // Should still be able to renew a currently valid lease
7333 6 : // Assumption: original lease to is still valid for 0/50.
7334 6 : let _ =
7335 6 : timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?;
7336 6 :
7337 6 : Ok(())
7338 6 : }
7339 :
7340 : #[tokio::test]
7341 6 : async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
7342 6 : let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
7343 24 : let (tenant, ctx) = harness.load().await;
7344 6 :
7345 354 : fn get_key(id: u32) -> Key {
7346 354 : // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
7347 354 : let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
7348 354 : key.field6 = id;
7349 354 : key
7350 354 : }
7351 6 :
7352 6 : // We create
7353 6 : // - one bottom-most image layer,
7354 6 : // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
7355 6 : // - a delta layer D2 crossing the GC horizon with data only below the horizon,
7356 6 : // - a delta layer D3 above the horizon.
7357 6 : //
7358 6 : // | D3 |
7359 6 : // | D1 |
7360 6 : // -| |-- gc horizon -----------------
7361 6 : // | | | D2 |
7362 6 : // --------- img layer ------------------
7363 6 : //
7364 6 : // What we should expact from this compaction is:
7365 6 : // | D3 |
7366 6 : // | Part of D1 |
7367 6 : // --------- img layer with D1+D2 at GC horizon------------------
7368 6 :
7369 6 : // img layer at 0x10
7370 6 : let img_layer = (0..10)
7371 60 : .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
7372 6 : .collect_vec();
7373 6 :
7374 6 : let delta1 = vec![
7375 6 : (
7376 6 : get_key(1),
7377 6 : Lsn(0x20),
7378 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
7379 6 : ),
7380 6 : (
7381 6 : get_key(2),
7382 6 : Lsn(0x30),
7383 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
7384 6 : ),
7385 6 : (
7386 6 : get_key(3),
7387 6 : Lsn(0x28),
7388 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
7389 6 : ),
7390 6 : (
7391 6 : get_key(3),
7392 6 : Lsn(0x30),
7393 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
7394 6 : ),
7395 6 : (
7396 6 : get_key(3),
7397 6 : Lsn(0x40),
7398 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
7399 6 : ),
7400 6 : ];
7401 6 : let delta2 = vec![
7402 6 : (
7403 6 : get_key(5),
7404 6 : Lsn(0x20),
7405 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
7406 6 : ),
7407 6 : (
7408 6 : get_key(6),
7409 6 : Lsn(0x20),
7410 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
7411 6 : ),
7412 6 : ];
7413 6 : let delta3 = vec![
7414 6 : (
7415 6 : get_key(8),
7416 6 : Lsn(0x48),
7417 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
7418 6 : ),
7419 6 : (
7420 6 : get_key(9),
7421 6 : Lsn(0x48),
7422 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
7423 6 : ),
7424 6 : ];
7425 6 :
7426 6 : let tline = tenant
7427 6 : .create_test_timeline_with_layers(
7428 6 : TIMELINE_ID,
7429 6 : Lsn(0x10),
7430 6 : DEFAULT_PG_VERSION,
7431 6 : &ctx,
7432 6 : vec![
7433 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
7434 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
7435 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
7436 6 : ], // delta layers
7437 6 : vec![(Lsn(0x10), img_layer)], // image layers
7438 6 : Lsn(0x50),
7439 6 : )
7440 141 : .await?;
7441 6 : {
7442 6 : // Update GC info
7443 6 : let mut guard = tline.gc_info.write().unwrap();
7444 6 : *guard = GcInfo {
7445 6 : retain_lsns: vec![],
7446 6 : cutoffs: GcCutoffs {
7447 6 : time: Lsn(0x30),
7448 6 : space: Lsn(0x30),
7449 6 : },
7450 6 : leases: Default::default(),
7451 6 : within_ancestor_pitr: false,
7452 6 : };
7453 6 : }
7454 6 :
7455 6 : let expected_result = [
7456 6 : Bytes::from_static(b"value 0@0x10"),
7457 6 : Bytes::from_static(b"value 1@0x10@0x20"),
7458 6 : Bytes::from_static(b"value 2@0x10@0x30"),
7459 6 : Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
7460 6 : Bytes::from_static(b"value 4@0x10"),
7461 6 : Bytes::from_static(b"value 5@0x10@0x20"),
7462 6 : Bytes::from_static(b"value 6@0x10@0x20"),
7463 6 : Bytes::from_static(b"value 7@0x10"),
7464 6 : Bytes::from_static(b"value 8@0x10@0x48"),
7465 6 : Bytes::from_static(b"value 9@0x10@0x48"),
7466 6 : ];
7467 6 :
7468 6 : let expected_result_at_gc_horizon = [
7469 6 : Bytes::from_static(b"value 0@0x10"),
7470 6 : Bytes::from_static(b"value 1@0x10@0x20"),
7471 6 : Bytes::from_static(b"value 2@0x10@0x30"),
7472 6 : Bytes::from_static(b"value 3@0x10@0x28@0x30"),
7473 6 : Bytes::from_static(b"value 4@0x10"),
7474 6 : Bytes::from_static(b"value 5@0x10@0x20"),
7475 6 : Bytes::from_static(b"value 6@0x10@0x20"),
7476 6 : Bytes::from_static(b"value 7@0x10"),
7477 6 : Bytes::from_static(b"value 8@0x10"),
7478 6 : Bytes::from_static(b"value 9@0x10"),
7479 6 : ];
7480 6 :
7481 66 : for idx in 0..10 {
7482 60 : assert_eq!(
7483 60 : tline
7484 60 : .get(get_key(idx as u32), Lsn(0x50), &ctx)
7485 93 : .await
7486 60 : .unwrap(),
7487 60 : &expected_result[idx]
7488 6 : );
7489 60 : assert_eq!(
7490 60 : tline
7491 60 : .get(get_key(idx as u32), Lsn(0x30), &ctx)
7492 45 : .await
7493 60 : .unwrap(),
7494 60 : &expected_result_at_gc_horizon[idx]
7495 6 : );
7496 6 : }
7497 6 :
7498 6 : let cancel = CancellationToken::new();
7499 6 : tline
7500 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
7501 156 : .await
7502 6 : .unwrap();
7503 6 :
7504 66 : for idx in 0..10 {
7505 60 : assert_eq!(
7506 60 : tline
7507 60 : .get(get_key(idx as u32), Lsn(0x50), &ctx)
7508 60 : .await
7509 60 : .unwrap(),
7510 60 : &expected_result[idx]
7511 6 : );
7512 60 : assert_eq!(
7513 60 : tline
7514 60 : .get(get_key(idx as u32), Lsn(0x30), &ctx)
7515 30 : .await
7516 60 : .unwrap(),
7517 60 : &expected_result_at_gc_horizon[idx]
7518 6 : );
7519 6 : }
7520 6 :
7521 6 : // increase GC horizon and compact again
7522 6 : {
7523 6 : // Update GC info
7524 6 : let mut guard = tline.gc_info.write().unwrap();
7525 6 : guard.cutoffs.time = Lsn(0x40);
7526 6 : guard.cutoffs.space = Lsn(0x40);
7527 6 : }
7528 6 : tline
7529 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
7530 126 : .await
7531 6 : .unwrap();
7532 6 :
7533 6 : Ok(())
7534 6 : }
7535 :
7536 : #[tokio::test]
7537 6 : async fn test_generate_key_retention() -> anyhow::Result<()> {
7538 6 : let harness = TenantHarness::create("test_generate_key_retention").await?;
7539 24 : let (tenant, ctx) = harness.load().await;
7540 6 : let tline = tenant
7541 6 : .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
7542 12 : .await?;
7543 6 : tline.force_advance_lsn(Lsn(0x70));
7544 6 : let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
7545 6 : let history = vec![
7546 6 : (
7547 6 : key,
7548 6 : Lsn(0x10),
7549 6 : Value::Image(Bytes::copy_from_slice(b"0x10")),
7550 6 : ),
7551 6 : (
7552 6 : key,
7553 6 : Lsn(0x20),
7554 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
7555 6 : ),
7556 6 : (
7557 6 : key,
7558 6 : Lsn(0x30),
7559 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
7560 6 : ),
7561 6 : (
7562 6 : key,
7563 6 : Lsn(0x40),
7564 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
7565 6 : ),
7566 6 : (
7567 6 : key,
7568 6 : Lsn(0x50),
7569 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x50")),
7570 6 : ),
7571 6 : (
7572 6 : key,
7573 6 : Lsn(0x60),
7574 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
7575 6 : ),
7576 6 : (
7577 6 : key,
7578 6 : Lsn(0x70),
7579 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
7580 6 : ),
7581 6 : (
7582 6 : key,
7583 6 : Lsn(0x80),
7584 6 : Value::Image(Bytes::copy_from_slice(
7585 6 : b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80",
7586 6 : )),
7587 6 : ),
7588 6 : (
7589 6 : key,
7590 6 : Lsn(0x90),
7591 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
7592 6 : ),
7593 6 : ];
7594 6 : let res = tline
7595 6 : .generate_key_retention(
7596 6 : key,
7597 6 : &history,
7598 6 : Lsn(0x60),
7599 6 : &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
7600 6 : 3,
7601 6 : None,
7602 6 : )
7603 6 : .await
7604 6 : .unwrap();
7605 6 : let expected_res = KeyHistoryRetention {
7606 6 : below_horizon: vec![
7607 6 : (
7608 6 : Lsn(0x20),
7609 6 : KeyLogAtLsn(vec![(
7610 6 : Lsn(0x20),
7611 6 : Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
7612 6 : )]),
7613 6 : ),
7614 6 : (
7615 6 : Lsn(0x40),
7616 6 : KeyLogAtLsn(vec![
7617 6 : (
7618 6 : Lsn(0x30),
7619 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
7620 6 : ),
7621 6 : (
7622 6 : Lsn(0x40),
7623 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
7624 6 : ),
7625 6 : ]),
7626 6 : ),
7627 6 : (
7628 6 : Lsn(0x50),
7629 6 : KeyLogAtLsn(vec![(
7630 6 : Lsn(0x50),
7631 6 : Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")),
7632 6 : )]),
7633 6 : ),
7634 6 : (
7635 6 : Lsn(0x60),
7636 6 : KeyLogAtLsn(vec![(
7637 6 : Lsn(0x60),
7638 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
7639 6 : )]),
7640 6 : ),
7641 6 : ],
7642 6 : above_horizon: KeyLogAtLsn(vec![
7643 6 : (
7644 6 : Lsn(0x70),
7645 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
7646 6 : ),
7647 6 : (
7648 6 : Lsn(0x80),
7649 6 : Value::Image(Bytes::copy_from_slice(
7650 6 : b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80",
7651 6 : )),
7652 6 : ),
7653 6 : (
7654 6 : Lsn(0x90),
7655 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
7656 6 : ),
7657 6 : ]),
7658 6 : };
7659 6 : assert_eq!(res, expected_res);
7660 6 :
7661 6 : // We expect GC-compaction to run with the original GC. This would create a situation that
7662 6 : // the original GC algorithm removes some delta layers b/c there are full image coverage,
7663 6 : // therefore causing some keys to have an incomplete history below the lowest retain LSN.
7664 6 : // For example, we have
7665 6 : // ```plain
7666 6 : // init delta @ 0x10, image @ 0x20, delta @ 0x30 (gc_horizon), image @ 0x40.
7667 6 : // ```
7668 6 : // Now the GC horizon moves up, and we have
7669 6 : // ```plain
7670 6 : // init delta @ 0x10, image @ 0x20, delta @ 0x30, image @ 0x40 (gc_horizon)
7671 6 : // ```
7672 6 : // The original GC algorithm kicks in, and removes delta @ 0x10, image @ 0x20.
7673 6 : // We will end up with
7674 6 : // ```plain
7675 6 : // delta @ 0x30, image @ 0x40 (gc_horizon)
7676 6 : // ```
7677 6 : // Now we run the GC-compaction, and this key does not have a full history.
7678 6 : // We should be able to handle this partial history and drop everything before the
7679 6 : // gc_horizon image.
7680 6 :
7681 6 : let history = vec![
7682 6 : (
7683 6 : key,
7684 6 : Lsn(0x20),
7685 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
7686 6 : ),
7687 6 : (
7688 6 : key,
7689 6 : Lsn(0x30),
7690 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
7691 6 : ),
7692 6 : (
7693 6 : key,
7694 6 : Lsn(0x40),
7695 6 : Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")),
7696 6 : ),
7697 6 : (
7698 6 : key,
7699 6 : Lsn(0x50),
7700 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x50")),
7701 6 : ),
7702 6 : (
7703 6 : key,
7704 6 : Lsn(0x60),
7705 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
7706 6 : ),
7707 6 : (
7708 6 : key,
7709 6 : Lsn(0x70),
7710 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
7711 6 : ),
7712 6 : (
7713 6 : key,
7714 6 : Lsn(0x80),
7715 6 : Value::Image(Bytes::copy_from_slice(
7716 6 : b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80",
7717 6 : )),
7718 6 : ),
7719 6 : (
7720 6 : key,
7721 6 : Lsn(0x90),
7722 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
7723 6 : ),
7724 6 : ];
7725 6 : let res = tline
7726 6 : .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
7727 6 : .await
7728 6 : .unwrap();
7729 6 : let expected_res = KeyHistoryRetention {
7730 6 : below_horizon: vec![
7731 6 : (
7732 6 : Lsn(0x40),
7733 6 : KeyLogAtLsn(vec![(
7734 6 : Lsn(0x40),
7735 6 : Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")),
7736 6 : )]),
7737 6 : ),
7738 6 : (
7739 6 : Lsn(0x50),
7740 6 : KeyLogAtLsn(vec![(
7741 6 : Lsn(0x50),
7742 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x50")),
7743 6 : )]),
7744 6 : ),
7745 6 : (
7746 6 : Lsn(0x60),
7747 6 : KeyLogAtLsn(vec![(
7748 6 : Lsn(0x60),
7749 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
7750 6 : )]),
7751 6 : ),
7752 6 : ],
7753 6 : above_horizon: KeyLogAtLsn(vec![
7754 6 : (
7755 6 : Lsn(0x70),
7756 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
7757 6 : ),
7758 6 : (
7759 6 : Lsn(0x80),
7760 6 : Value::Image(Bytes::copy_from_slice(
7761 6 : b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80",
7762 6 : )),
7763 6 : ),
7764 6 : (
7765 6 : Lsn(0x90),
7766 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
7767 6 : ),
7768 6 : ]),
7769 6 : };
7770 6 : assert_eq!(res, expected_res);
7771 6 :
7772 6 : // In case of branch compaction, the branch itself does not have the full history, and we need to provide
7773 6 : // the ancestor image in the test case.
7774 6 :
7775 6 : let history = vec![
7776 6 : (
7777 6 : key,
7778 6 : Lsn(0x20),
7779 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
7780 6 : ),
7781 6 : (
7782 6 : key,
7783 6 : Lsn(0x30),
7784 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
7785 6 : ),
7786 6 : (
7787 6 : key,
7788 6 : Lsn(0x40),
7789 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
7790 6 : ),
7791 6 : (
7792 6 : key,
7793 6 : Lsn(0x70),
7794 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
7795 6 : ),
7796 6 : ];
7797 6 : let res = tline
7798 6 : .generate_key_retention(
7799 6 : key,
7800 6 : &history,
7801 6 : Lsn(0x60),
7802 6 : &[],
7803 6 : 3,
7804 6 : Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
7805 6 : )
7806 6 : .await
7807 6 : .unwrap();
7808 6 : let expected_res = KeyHistoryRetention {
7809 6 : below_horizon: vec![(
7810 6 : Lsn(0x60),
7811 6 : KeyLogAtLsn(vec![(
7812 6 : Lsn(0x60),
7813 6 : Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
7814 6 : )]),
7815 6 : )],
7816 6 : above_horizon: KeyLogAtLsn(vec![(
7817 6 : Lsn(0x70),
7818 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
7819 6 : )]),
7820 6 : };
7821 6 : assert_eq!(res, expected_res);
7822 6 :
7823 6 : let history = vec![
7824 6 : (
7825 6 : key,
7826 6 : Lsn(0x20),
7827 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
7828 6 : ),
7829 6 : (
7830 6 : key,
7831 6 : Lsn(0x40),
7832 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
7833 6 : ),
7834 6 : (
7835 6 : key,
7836 6 : Lsn(0x60),
7837 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
7838 6 : ),
7839 6 : (
7840 6 : key,
7841 6 : Lsn(0x70),
7842 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
7843 6 : ),
7844 6 : ];
7845 6 : let res = tline
7846 6 : .generate_key_retention(
7847 6 : key,
7848 6 : &history,
7849 6 : Lsn(0x60),
7850 6 : &[Lsn(0x30)],
7851 6 : 3,
7852 6 : Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
7853 6 : )
7854 6 : .await
7855 6 : .unwrap();
7856 6 : let expected_res = KeyHistoryRetention {
7857 6 : below_horizon: vec![
7858 6 : (
7859 6 : Lsn(0x30),
7860 6 : KeyLogAtLsn(vec![(
7861 6 : Lsn(0x20),
7862 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
7863 6 : )]),
7864 6 : ),
7865 6 : (
7866 6 : Lsn(0x60),
7867 6 : KeyLogAtLsn(vec![(
7868 6 : Lsn(0x60),
7869 6 : Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
7870 6 : )]),
7871 6 : ),
7872 6 : ],
7873 6 : above_horizon: KeyLogAtLsn(vec![(
7874 6 : Lsn(0x70),
7875 6 : Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
7876 6 : )]),
7877 6 : };
7878 6 : assert_eq!(res, expected_res);
7879 6 :
7880 6 : Ok(())
7881 6 : }
7882 :
7883 : #[tokio::test]
7884 6 : async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> {
7885 6 : let harness =
7886 6 : TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?;
7887 24 : let (tenant, ctx) = harness.load().await;
7888 6 :
7889 1554 : fn get_key(id: u32) -> Key {
7890 1554 : // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
7891 1554 : let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
7892 1554 : key.field6 = id;
7893 1554 : key
7894 1554 : }
7895 6 :
7896 6 : let img_layer = (0..10)
7897 60 : .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
7898 6 : .collect_vec();
7899 6 :
7900 6 : let delta1 = vec![
7901 6 : (
7902 6 : get_key(1),
7903 6 : Lsn(0x20),
7904 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
7905 6 : ),
7906 6 : (
7907 6 : get_key(2),
7908 6 : Lsn(0x30),
7909 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
7910 6 : ),
7911 6 : (
7912 6 : get_key(3),
7913 6 : Lsn(0x28),
7914 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
7915 6 : ),
7916 6 : (
7917 6 : get_key(3),
7918 6 : Lsn(0x30),
7919 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
7920 6 : ),
7921 6 : (
7922 6 : get_key(3),
7923 6 : Lsn(0x40),
7924 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
7925 6 : ),
7926 6 : ];
7927 6 : let delta2 = vec![
7928 6 : (
7929 6 : get_key(5),
7930 6 : Lsn(0x20),
7931 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
7932 6 : ),
7933 6 : (
7934 6 : get_key(6),
7935 6 : Lsn(0x20),
7936 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
7937 6 : ),
7938 6 : ];
7939 6 : let delta3 = vec![
7940 6 : (
7941 6 : get_key(8),
7942 6 : Lsn(0x48),
7943 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
7944 6 : ),
7945 6 : (
7946 6 : get_key(9),
7947 6 : Lsn(0x48),
7948 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
7949 6 : ),
7950 6 : ];
7951 6 :
7952 6 : let tline = tenant
7953 6 : .create_test_timeline_with_layers(
7954 6 : TIMELINE_ID,
7955 6 : Lsn(0x10),
7956 6 : DEFAULT_PG_VERSION,
7957 6 : &ctx,
7958 6 : vec![
7959 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
7960 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
7961 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
7962 6 : ], // delta layers
7963 6 : vec![(Lsn(0x10), img_layer)], // image layers
7964 6 : Lsn(0x50),
7965 6 : )
7966 141 : .await?;
7967 6 : {
7968 6 : // Update GC info
7969 6 : let mut guard = tline.gc_info.write().unwrap();
7970 6 : *guard = GcInfo {
7971 6 : retain_lsns: vec![
7972 6 : (Lsn(0x10), tline.timeline_id),
7973 6 : (Lsn(0x20), tline.timeline_id),
7974 6 : ],
7975 6 : cutoffs: GcCutoffs {
7976 6 : time: Lsn(0x30),
7977 6 : space: Lsn(0x30),
7978 6 : },
7979 6 : leases: Default::default(),
7980 6 : within_ancestor_pitr: false,
7981 6 : };
7982 6 : }
7983 6 :
7984 6 : let expected_result = [
7985 6 : Bytes::from_static(b"value 0@0x10"),
7986 6 : Bytes::from_static(b"value 1@0x10@0x20"),
7987 6 : Bytes::from_static(b"value 2@0x10@0x30"),
7988 6 : Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
7989 6 : Bytes::from_static(b"value 4@0x10"),
7990 6 : Bytes::from_static(b"value 5@0x10@0x20"),
7991 6 : Bytes::from_static(b"value 6@0x10@0x20"),
7992 6 : Bytes::from_static(b"value 7@0x10"),
7993 6 : Bytes::from_static(b"value 8@0x10@0x48"),
7994 6 : Bytes::from_static(b"value 9@0x10@0x48"),
7995 6 : ];
7996 6 :
7997 6 : let expected_result_at_gc_horizon = [
7998 6 : Bytes::from_static(b"value 0@0x10"),
7999 6 : Bytes::from_static(b"value 1@0x10@0x20"),
8000 6 : Bytes::from_static(b"value 2@0x10@0x30"),
8001 6 : Bytes::from_static(b"value 3@0x10@0x28@0x30"),
8002 6 : Bytes::from_static(b"value 4@0x10"),
8003 6 : Bytes::from_static(b"value 5@0x10@0x20"),
8004 6 : Bytes::from_static(b"value 6@0x10@0x20"),
8005 6 : Bytes::from_static(b"value 7@0x10"),
8006 6 : Bytes::from_static(b"value 8@0x10"),
8007 6 : Bytes::from_static(b"value 9@0x10"),
8008 6 : ];
8009 6 :
8010 6 : let expected_result_at_lsn_20 = [
8011 6 : Bytes::from_static(b"value 0@0x10"),
8012 6 : Bytes::from_static(b"value 1@0x10@0x20"),
8013 6 : Bytes::from_static(b"value 2@0x10"),
8014 6 : Bytes::from_static(b"value 3@0x10"),
8015 6 : Bytes::from_static(b"value 4@0x10"),
8016 6 : Bytes::from_static(b"value 5@0x10@0x20"),
8017 6 : Bytes::from_static(b"value 6@0x10@0x20"),
8018 6 : Bytes::from_static(b"value 7@0x10"),
8019 6 : Bytes::from_static(b"value 8@0x10"),
8020 6 : Bytes::from_static(b"value 9@0x10"),
8021 6 : ];
8022 6 :
8023 6 : let expected_result_at_lsn_10 = [
8024 6 : Bytes::from_static(b"value 0@0x10"),
8025 6 : Bytes::from_static(b"value 1@0x10"),
8026 6 : Bytes::from_static(b"value 2@0x10"),
8027 6 : Bytes::from_static(b"value 3@0x10"),
8028 6 : Bytes::from_static(b"value 4@0x10"),
8029 6 : Bytes::from_static(b"value 5@0x10"),
8030 6 : Bytes::from_static(b"value 6@0x10"),
8031 6 : Bytes::from_static(b"value 7@0x10"),
8032 6 : Bytes::from_static(b"value 8@0x10"),
8033 6 : Bytes::from_static(b"value 9@0x10"),
8034 6 : ];
8035 6 :
8036 36 : let verify_result = || async {
8037 36 : let gc_horizon = {
8038 36 : let gc_info = tline.gc_info.read().unwrap();
8039 36 : gc_info.cutoffs.time
8040 6 : };
8041 396 : for idx in 0..10 {
8042 360 : assert_eq!(
8043 360 : tline
8044 360 : .get(get_key(idx as u32), Lsn(0x50), &ctx)
8045 372 : .await
8046 360 : .unwrap(),
8047 360 : &expected_result[idx]
8048 6 : );
8049 360 : assert_eq!(
8050 360 : tline
8051 360 : .get(get_key(idx as u32), gc_horizon, &ctx)
8052 288 : .await
8053 360 : .unwrap(),
8054 360 : &expected_result_at_gc_horizon[idx]
8055 6 : );
8056 360 : assert_eq!(
8057 360 : tline
8058 360 : .get(get_key(idx as u32), Lsn(0x20), &ctx)
8059 243 : .await
8060 360 : .unwrap(),
8061 360 : &expected_result_at_lsn_20[idx]
8062 6 : );
8063 360 : assert_eq!(
8064 360 : tline
8065 360 : .get(get_key(idx as u32), Lsn(0x10), &ctx)
8066 180 : .await
8067 360 : .unwrap(),
8068 360 : &expected_result_at_lsn_10[idx]
8069 6 : );
8070 6 : }
8071 72 : };
8072 6 :
8073 207 : verify_result().await;
8074 6 :
8075 6 : let cancel = CancellationToken::new();
8076 6 : let mut dryrun_flags = EnumSet::new();
8077 6 : dryrun_flags.insert(CompactFlags::DryRun);
8078 6 :
8079 6 : tline
8080 6 : .compact_with_gc(&cancel, dryrun_flags, &ctx)
8081 132 : .await
8082 6 : .unwrap();
8083 6 : // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
8084 6 : // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
8085 171 : verify_result().await;
8086 6 :
8087 6 : tline
8088 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
8089 150 : .await
8090 6 : .unwrap();
8091 192 : verify_result().await;
8092 6 :
8093 6 : // compact again
8094 6 : tline
8095 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
8096 114 : .await
8097 6 : .unwrap();
8098 171 : verify_result().await;
8099 6 :
8100 6 : // increase GC horizon and compact again
8101 6 : {
8102 6 : // Update GC info
8103 6 : let mut guard = tline.gc_info.write().unwrap();
8104 6 : guard.cutoffs.time = Lsn(0x38);
8105 6 : guard.cutoffs.space = Lsn(0x38);
8106 6 : }
8107 6 : tline
8108 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
8109 117 : .await
8110 6 : .unwrap();
8111 171 : verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
8112 6 :
8113 6 : // not increasing the GC horizon and compact again
8114 6 : tline
8115 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
8116 117 : .await
8117 6 : .unwrap();
8118 171 : verify_result().await;
8119 6 :
8120 6 : Ok(())
8121 6 : }
8122 :
8123 : #[tokio::test]
8124 6 : async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
8125 6 : {
8126 6 : let harness =
8127 6 : TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
8128 6 : .await?;
8129 24 : let (tenant, ctx) = harness.load().await;
8130 6 :
8131 1056 : fn get_key(id: u32) -> Key {
8132 1056 : // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
8133 1056 : let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
8134 1056 : key.field6 = id;
8135 1056 : key
8136 1056 : }
8137 6 :
8138 6 : let img_layer = (0..10)
8139 60 : .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
8140 6 : .collect_vec();
8141 6 :
8142 6 : let delta1 = vec![
8143 6 : (
8144 6 : get_key(1),
8145 6 : Lsn(0x20),
8146 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
8147 6 : ),
8148 6 : (
8149 6 : get_key(1),
8150 6 : Lsn(0x28),
8151 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
8152 6 : ),
8153 6 : ];
8154 6 : let delta2 = vec![
8155 6 : (
8156 6 : get_key(1),
8157 6 : Lsn(0x30),
8158 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
8159 6 : ),
8160 6 : (
8161 6 : get_key(1),
8162 6 : Lsn(0x38),
8163 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
8164 6 : ),
8165 6 : ];
8166 6 : let delta3 = vec![
8167 6 : (
8168 6 : get_key(8),
8169 6 : Lsn(0x48),
8170 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
8171 6 : ),
8172 6 : (
8173 6 : get_key(9),
8174 6 : Lsn(0x48),
8175 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
8176 6 : ),
8177 6 : ];
8178 6 :
8179 6 : let tline = tenant
8180 6 : .create_test_timeline_with_layers(
8181 6 : TIMELINE_ID,
8182 6 : Lsn(0x10),
8183 6 : DEFAULT_PG_VERSION,
8184 6 : &ctx,
8185 6 : vec![
8186 6 : // delta1 and delta 2 only contain a single key but multiple updates
8187 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
8188 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
8189 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
8190 6 : ], // delta layers
8191 6 : vec![(Lsn(0x10), img_layer)], // image layers
8192 6 : Lsn(0x50),
8193 6 : )
8194 141 : .await?;
8195 6 : {
8196 6 : // Update GC info
8197 6 : let mut guard = tline.gc_info.write().unwrap();
8198 6 : *guard = GcInfo {
8199 6 : retain_lsns: vec![
8200 6 : (Lsn(0x10), tline.timeline_id),
8201 6 : (Lsn(0x20), tline.timeline_id),
8202 6 : ],
8203 6 : cutoffs: GcCutoffs {
8204 6 : time: Lsn(0x30),
8205 6 : space: Lsn(0x30),
8206 6 : },
8207 6 : leases: Default::default(),
8208 6 : within_ancestor_pitr: false,
8209 6 : };
8210 6 : }
8211 6 :
8212 6 : let expected_result = [
8213 6 : Bytes::from_static(b"value 0@0x10"),
8214 6 : Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
8215 6 : Bytes::from_static(b"value 2@0x10"),
8216 6 : Bytes::from_static(b"value 3@0x10"),
8217 6 : Bytes::from_static(b"value 4@0x10"),
8218 6 : Bytes::from_static(b"value 5@0x10"),
8219 6 : Bytes::from_static(b"value 6@0x10"),
8220 6 : Bytes::from_static(b"value 7@0x10"),
8221 6 : Bytes::from_static(b"value 8@0x10@0x48"),
8222 6 : Bytes::from_static(b"value 9@0x10@0x48"),
8223 6 : ];
8224 6 :
8225 6 : let expected_result_at_gc_horizon = [
8226 6 : Bytes::from_static(b"value 0@0x10"),
8227 6 : Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
8228 6 : Bytes::from_static(b"value 2@0x10"),
8229 6 : Bytes::from_static(b"value 3@0x10"),
8230 6 : Bytes::from_static(b"value 4@0x10"),
8231 6 : Bytes::from_static(b"value 5@0x10"),
8232 6 : Bytes::from_static(b"value 6@0x10"),
8233 6 : Bytes::from_static(b"value 7@0x10"),
8234 6 : Bytes::from_static(b"value 8@0x10"),
8235 6 : Bytes::from_static(b"value 9@0x10"),
8236 6 : ];
8237 6 :
8238 6 : let expected_result_at_lsn_20 = [
8239 6 : Bytes::from_static(b"value 0@0x10"),
8240 6 : Bytes::from_static(b"value 1@0x10@0x20"),
8241 6 : Bytes::from_static(b"value 2@0x10"),
8242 6 : Bytes::from_static(b"value 3@0x10"),
8243 6 : Bytes::from_static(b"value 4@0x10"),
8244 6 : Bytes::from_static(b"value 5@0x10"),
8245 6 : Bytes::from_static(b"value 6@0x10"),
8246 6 : Bytes::from_static(b"value 7@0x10"),
8247 6 : Bytes::from_static(b"value 8@0x10"),
8248 6 : Bytes::from_static(b"value 9@0x10"),
8249 6 : ];
8250 6 :
8251 6 : let expected_result_at_lsn_10 = [
8252 6 : Bytes::from_static(b"value 0@0x10"),
8253 6 : Bytes::from_static(b"value 1@0x10"),
8254 6 : Bytes::from_static(b"value 2@0x10"),
8255 6 : Bytes::from_static(b"value 3@0x10"),
8256 6 : Bytes::from_static(b"value 4@0x10"),
8257 6 : Bytes::from_static(b"value 5@0x10"),
8258 6 : Bytes::from_static(b"value 6@0x10"),
8259 6 : Bytes::from_static(b"value 7@0x10"),
8260 6 : Bytes::from_static(b"value 8@0x10"),
8261 6 : Bytes::from_static(b"value 9@0x10"),
8262 6 : ];
8263 6 :
8264 24 : let verify_result = || async {
8265 24 : let gc_horizon = {
8266 24 : let gc_info = tline.gc_info.read().unwrap();
8267 24 : gc_info.cutoffs.time
8268 6 : };
8269 264 : for idx in 0..10 {
8270 240 : assert_eq!(
8271 240 : tline
8272 240 : .get(get_key(idx as u32), Lsn(0x50), &ctx)
8273 222 : .await
8274 240 : .unwrap(),
8275 240 : &expected_result[idx]
8276 6 : );
8277 240 : assert_eq!(
8278 240 : tline
8279 240 : .get(get_key(idx as u32), gc_horizon, &ctx)
8280 147 : .await
8281 240 : .unwrap(),
8282 240 : &expected_result_at_gc_horizon[idx]
8283 6 : );
8284 240 : assert_eq!(
8285 240 : tline
8286 240 : .get(get_key(idx as u32), Lsn(0x20), &ctx)
8287 138 : .await
8288 240 : .unwrap(),
8289 240 : &expected_result_at_lsn_20[idx]
8290 6 : );
8291 240 : assert_eq!(
8292 240 : tline
8293 240 : .get(get_key(idx as u32), Lsn(0x10), &ctx)
8294 123 : .await
8295 240 : .unwrap(),
8296 240 : &expected_result_at_lsn_10[idx]
8297 6 : );
8298 6 : }
8299 48 : };
8300 6 :
8301 183 : verify_result().await;
8302 6 :
8303 6 : let cancel = CancellationToken::new();
8304 6 : let mut dryrun_flags = EnumSet::new();
8305 6 : dryrun_flags.insert(CompactFlags::DryRun);
8306 6 :
8307 6 : tline
8308 6 : .compact_with_gc(&cancel, dryrun_flags, &ctx)
8309 135 : .await
8310 6 : .unwrap();
8311 6 : // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
8312 6 : // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
8313 147 : verify_result().await;
8314 6 :
8315 6 : tline
8316 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
8317 153 : .await
8318 6 : .unwrap();
8319 159 : verify_result().await;
8320 6 :
8321 6 : // compact again
8322 6 : tline
8323 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
8324 117 : .await
8325 6 : .unwrap();
8326 141 : verify_result().await;
8327 6 :
8328 6 : Ok(())
8329 6 : }
8330 :
8331 : #[tokio::test]
8332 6 : async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
8333 6 : let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
8334 24 : let (tenant, ctx) = harness.load().await;
8335 6 :
8336 378 : fn get_key(id: u32) -> Key {
8337 378 : let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
8338 378 : key.field6 = id;
8339 378 : key
8340 378 : }
8341 6 :
8342 6 : let img_layer = (0..10)
8343 60 : .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
8344 6 : .collect_vec();
8345 6 :
8346 6 : let delta1 = vec![
8347 6 : (
8348 6 : get_key(1),
8349 6 : Lsn(0x20),
8350 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
8351 6 : ),
8352 6 : (
8353 6 : get_key(2),
8354 6 : Lsn(0x30),
8355 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
8356 6 : ),
8357 6 : (
8358 6 : get_key(3),
8359 6 : Lsn(0x28),
8360 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
8361 6 : ),
8362 6 : (
8363 6 : get_key(3),
8364 6 : Lsn(0x30),
8365 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
8366 6 : ),
8367 6 : (
8368 6 : get_key(3),
8369 6 : Lsn(0x40),
8370 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
8371 6 : ),
8372 6 : ];
8373 6 : let delta2 = vec![
8374 6 : (
8375 6 : get_key(5),
8376 6 : Lsn(0x20),
8377 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
8378 6 : ),
8379 6 : (
8380 6 : get_key(6),
8381 6 : Lsn(0x20),
8382 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
8383 6 : ),
8384 6 : ];
8385 6 : let delta3 = vec![
8386 6 : (
8387 6 : get_key(8),
8388 6 : Lsn(0x48),
8389 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
8390 6 : ),
8391 6 : (
8392 6 : get_key(9),
8393 6 : Lsn(0x48),
8394 6 : Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
8395 6 : ),
8396 6 : ];
8397 6 :
8398 6 : let parent_tline = tenant
8399 6 : .create_test_timeline_with_layers(
8400 6 : TIMELINE_ID,
8401 6 : Lsn(0x10),
8402 6 : DEFAULT_PG_VERSION,
8403 6 : &ctx,
8404 6 : vec![], // delta layers
8405 6 : vec![(Lsn(0x18), img_layer)], // image layers
8406 6 : Lsn(0x18),
8407 6 : )
8408 87 : .await?;
8409 6 :
8410 6 : parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
8411 6 :
8412 6 : let branch_tline = tenant
8413 6 : .branch_timeline_test_with_layers(
8414 6 : &parent_tline,
8415 6 : NEW_TIMELINE_ID,
8416 6 : Some(Lsn(0x18)),
8417 6 : &ctx,
8418 6 : vec![
8419 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
8420 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
8421 6 : DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
8422 6 : ], // delta layers
8423 6 : vec![], // image layers
8424 6 : Lsn(0x50),
8425 6 : )
8426 54 : .await?;
8427 6 :
8428 6 : branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
8429 6 :
8430 6 : {
8431 6 : // Update GC info
8432 6 : let mut guard = parent_tline.gc_info.write().unwrap();
8433 6 : *guard = GcInfo {
8434 6 : retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
8435 6 : cutoffs: GcCutoffs {
8436 6 : time: Lsn(0x10),
8437 6 : space: Lsn(0x10),
8438 6 : },
8439 6 : leases: Default::default(),
8440 6 : within_ancestor_pitr: false,
8441 6 : };
8442 6 : }
8443 6 :
8444 6 : {
8445 6 : // Update GC info
8446 6 : let mut guard = branch_tline.gc_info.write().unwrap();
8447 6 : *guard = GcInfo {
8448 6 : retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
8449 6 : cutoffs: GcCutoffs {
8450 6 : time: Lsn(0x50),
8451 6 : space: Lsn(0x50),
8452 6 : },
8453 6 : leases: Default::default(),
8454 6 : within_ancestor_pitr: false,
8455 6 : };
8456 6 : }
8457 6 :
8458 6 : let expected_result_at_gc_horizon = [
8459 6 : Bytes::from_static(b"value 0@0x10"),
8460 6 : Bytes::from_static(b"value 1@0x10@0x20"),
8461 6 : Bytes::from_static(b"value 2@0x10@0x30"),
8462 6 : Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
8463 6 : Bytes::from_static(b"value 4@0x10"),
8464 6 : Bytes::from_static(b"value 5@0x10@0x20"),
8465 6 : Bytes::from_static(b"value 6@0x10@0x20"),
8466 6 : Bytes::from_static(b"value 7@0x10"),
8467 6 : Bytes::from_static(b"value 8@0x10@0x48"),
8468 6 : Bytes::from_static(b"value 9@0x10@0x48"),
8469 6 : ];
8470 6 :
8471 6 : let expected_result_at_lsn_40 = [
8472 6 : Bytes::from_static(b"value 0@0x10"),
8473 6 : Bytes::from_static(b"value 1@0x10@0x20"),
8474 6 : Bytes::from_static(b"value 2@0x10@0x30"),
8475 6 : Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
8476 6 : Bytes::from_static(b"value 4@0x10"),
8477 6 : Bytes::from_static(b"value 5@0x10@0x20"),
8478 6 : Bytes::from_static(b"value 6@0x10@0x20"),
8479 6 : Bytes::from_static(b"value 7@0x10"),
8480 6 : Bytes::from_static(b"value 8@0x10"),
8481 6 : Bytes::from_static(b"value 9@0x10"),
8482 6 : ];
8483 6 :
8484 12 : let verify_result = || async {
8485 132 : for idx in 0..10 {
8486 120 : assert_eq!(
8487 120 : branch_tline
8488 120 : .get(get_key(idx as u32), Lsn(0x50), &ctx)
8489 150 : .await
8490 120 : .unwrap(),
8491 120 : &expected_result_at_gc_horizon[idx]
8492 6 : );
8493 120 : assert_eq!(
8494 120 : branch_tline
8495 120 : .get(get_key(idx as u32), Lsn(0x40), &ctx)
8496 99 : .await
8497 120 : .unwrap(),
8498 120 : &expected_result_at_lsn_40[idx]
8499 6 : );
8500 6 : }
8501 24 : };
8502 6 :
8503 138 : verify_result().await;
8504 6 :
8505 6 : let cancel = CancellationToken::new();
8506 6 : branch_tline
8507 6 : .compact_with_gc(&cancel, EnumSet::new(), &ctx)
8508 48 : .await
8509 6 : .unwrap();
8510 6 :
8511 111 : verify_result().await;
8512 6 :
8513 6 : Ok(())
8514 6 : }
8515 :
8516 : // Regression test for https://github.com/neondatabase/neon/issues/9012
8517 : // Create an image arrangement where we have to read at different LSN ranges
8518 : // from a delta layer. This is achieved by overlapping an image layer on top of
8519 : // a delta layer. Like so:
8520 : //
8521 : // A B
8522 : // +----------------+ -> delta_layer
8523 : // | | ^ lsn
8524 : // | =========|-> nested_image_layer |
8525 : // | C | |
8526 : // +----------------+ |
8527 : // ======== -> baseline_image_layer +-------> key
8528 : //
8529 : //
8530 : // When querying the key range [A, B) we need to read at different LSN ranges
8531 : // for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
8532 : #[tokio::test]
8533 6 : async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
8534 6 : let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
8535 24 : let (tenant, ctx) = harness.load().await;
8536 6 :
8537 6 : let will_init_keys = [2, 6];
8538 132 : fn get_key(id: u32) -> Key {
8539 132 : let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
8540 132 : key.field6 = id;
8541 132 : key
8542 132 : }
8543 6 :
8544 6 : let mut expected_key_values = HashMap::new();
8545 6 :
8546 6 : let baseline_image_layer_lsn = Lsn(0x10);
8547 6 : let mut baseline_img_layer = Vec::new();
8548 36 : for i in 0..5 {
8549 30 : let key = get_key(i);
8550 30 : let value = format!("value {i}@{baseline_image_layer_lsn}");
8551 30 :
8552 30 : let removed = expected_key_values.insert(key, value.clone());
8553 30 : assert!(removed.is_none());
8554 6 :
8555 30 : baseline_img_layer.push((key, Bytes::from(value)));
8556 6 : }
8557 6 :
8558 6 : let nested_image_layer_lsn = Lsn(0x50);
8559 6 : let mut nested_img_layer = Vec::new();
8560 36 : for i in 5..10 {
8561 30 : let key = get_key(i);
8562 30 : let value = format!("value {i}@{nested_image_layer_lsn}");
8563 30 :
8564 30 : let removed = expected_key_values.insert(key, value.clone());
8565 30 : assert!(removed.is_none());
8566 6 :
8567 30 : nested_img_layer.push((key, Bytes::from(value)));
8568 6 : }
8569 6 :
8570 6 : let mut delta_layer_spec = Vec::default();
8571 6 : let delta_layer_start_lsn = Lsn(0x20);
8572 6 : let mut delta_layer_end_lsn = delta_layer_start_lsn;
8573 6 :
8574 66 : for i in 0..10 {
8575 60 : let key = get_key(i);
8576 60 : let key_in_nested = nested_img_layer
8577 60 : .iter()
8578 240 : .any(|(key_with_img, _)| *key_with_img == key);
8579 60 : let lsn = {
8580 60 : if key_in_nested {
8581 30 : Lsn(nested_image_layer_lsn.0 + 0x10)
8582 6 : } else {
8583 30 : delta_layer_start_lsn
8584 6 : }
8585 6 : };
8586 6 :
8587 60 : let will_init = will_init_keys.contains(&i);
8588 60 : if will_init {
8589 12 : delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
8590 12 :
8591 12 : expected_key_values.insert(key, "".to_string());
8592 48 : } else {
8593 48 : let delta = format!("@{lsn}");
8594 48 : delta_layer_spec.push((
8595 48 : key,
8596 48 : lsn,
8597 48 : Value::WalRecord(NeonWalRecord::wal_append(&delta)),
8598 48 : ));
8599 48 :
8600 48 : expected_key_values
8601 48 : .get_mut(&key)
8602 48 : .expect("An image exists for each key")
8603 48 : .push_str(delta.as_str());
8604 48 : }
8605 60 : delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
8606 6 : }
8607 6 :
8608 6 : delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
8609 6 :
8610 6 : assert!(
8611 6 : nested_image_layer_lsn > delta_layer_start_lsn
8612 6 : && nested_image_layer_lsn < delta_layer_end_lsn
8613 6 : );
8614 6 :
8615 6 : let tline = tenant
8616 6 : .create_test_timeline_with_layers(
8617 6 : TIMELINE_ID,
8618 6 : baseline_image_layer_lsn,
8619 6 : DEFAULT_PG_VERSION,
8620 6 : &ctx,
8621 6 : vec![DeltaLayerTestDesc::new_with_inferred_key_range(
8622 6 : delta_layer_start_lsn..delta_layer_end_lsn,
8623 6 : delta_layer_spec,
8624 6 : )], // delta layers
8625 6 : vec![
8626 6 : (baseline_image_layer_lsn, baseline_img_layer),
8627 6 : (nested_image_layer_lsn, nested_img_layer),
8628 6 : ], // image layers
8629 6 : delta_layer_end_lsn,
8630 6 : )
8631 120 : .await?;
8632 6 :
8633 6 : let keyspace = KeySpace::single(get_key(0)..get_key(10));
8634 6 : let results = tline
8635 6 : .get_vectored(keyspace, delta_layer_end_lsn, &ctx)
8636 43 : .await
8637 6 : .expect("No vectored errors");
8638 66 : for (key, res) in results {
8639 60 : let value = res.expect("No key errors");
8640 60 : let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
8641 60 : assert_eq!(value, Bytes::from(expected_value));
8642 6 : }
8643 6 :
8644 6 : Ok(())
8645 6 : }
8646 : }
|