LCOV - code coverage report
Current view: top level - pageserver/src - pgdatadir_mapping.rs (source / functions) Coverage Total Hit
Test: feead26e04cdef6e988ff1765b1cb7075eb48d3d.info Lines: 54.5 % 1895 1032
Test Date: 2025-02-28 12:11:00 Functions: 42.6 % 195 83

            Line data    Source code
       1              : //!
       2              : //! This provides an abstraction to store PostgreSQL relations and other files
       3              : //! in the key-value store that implements the Repository interface.
       4              : //!
       5              : //! (TODO: The line between PUT-functions here and walingest.rs is a bit blurry, as
       6              : //! walingest.rs handles a few things like implicit relation creation and extension.
       7              : //! Clarify that)
       8              : //!
       9              : use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
      10              : use std::ops::{ControlFlow, Range};
      11              : 
      12              : use anyhow::{Context, ensure};
      13              : use bytes::{Buf, Bytes, BytesMut};
      14              : use enum_map::Enum;
      15              : use itertools::Itertools;
      16              : use pageserver_api::key::{
      17              :     AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
      18              :     TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
      19              :     rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, relmap_file_key,
      20              :     repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
      21              :     slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
      22              : };
      23              : use pageserver_api::keyspace::SparseKeySpace;
      24              : use pageserver_api::record::NeonWalRecord;
      25              : use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
      26              : use pageserver_api::shard::ShardIdentity;
      27              : use pageserver_api::value::Value;
      28              : use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
      29              : use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId};
      30              : use serde::{Deserialize, Serialize};
      31              : use strum::IntoEnumIterator;
      32              : use tokio_util::sync::CancellationToken;
      33              : use tracing::{debug, info, trace, warn};
      34              : use utils::bin_ser::{BeSer, DeserializeError};
      35              : use utils::lsn::Lsn;
      36              : use utils::pausable_failpoint;
      37              : use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
      38              : 
      39              : use super::tenant::{PageReconstructError, Timeline};
      40              : use crate::aux_file;
      41              : use crate::context::RequestContext;
      42              : use crate::keyspace::{KeySpace, KeySpaceAccum};
      43              : use crate::metrics::{
      44              :     RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
      45              : };
      46              : use crate::span::{
      47              :     debug_assert_current_span_has_tenant_and_timeline_id,
      48              :     debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
      49              : };
      50              : use crate::tenant::storage_layer::IoConcurrency;
      51              : use crate::tenant::timeline::GetVectoredError;
      52              : 
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a
/// full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;

/// Max number of aux-file-related delta layers. The compaction will create a new image layer
/// once this threshold is reached.
pub const MAX_AUX_FILE_V2_DELTAS: usize = 16;
      58              : 
/// Outcome of resolving a timestamp to an LSN.
///
/// Every variant carries an [`Lsn`]; what that LSN means relative to the queried
/// timestamp depends on the variant, as described below.
#[derive(Debug)]
pub enum LsnForTimestamp {
    /// Found commits both before and after the given timestamp
    Present(Lsn),

    /// Found no commits after the given timestamp, this means
    /// that the newest data in the branch is older than the given
    /// timestamp.
    ///
    /// All commits <= LSN happened before the given timestamp
    Future(Lsn),

    /// The queried timestamp is older than the horizon we look back to (PITR)
    ///
    /// All commits > LSN happened after the given timestamp,
    /// but any commits < LSN might have happened before or after
    /// the given timestamp. We don't know because no data before
    /// the given lsn is available.
    Past(Lsn),

    /// We have found no commit with a timestamp,
    /// so we can't return anything meaningful.
    ///
    /// The associated LSN is the lower bound value we can safely
    /// create branches on, but no statement is made if it is
    /// older or newer than the timestamp.
    ///
    /// This variant can e.g. be returned right after a
    /// cluster import.
    NoData(Lsn),
}
      90              : 
/// Errors that can occur while calculating the logical size.
#[derive(Debug, thiserror::Error)]
pub(crate) enum CalculateLogicalSizeError {
    /// The calculation was cancelled.
    #[error("cancelled")]
    Cancelled,

    /// Something went wrong while reading the metadata we use to calculate logical size
    /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
    /// in the `From` implementation for this variant.
    #[error(transparent)]
    PageRead(PageReconstructError),

    /// Something went wrong deserializing metadata that we read to calculate logical size
    #[error("decode error: {0}")]
    Decode(#[from] DeserializeError),
}
     106              : 
/// Errors that can occur while collecting a keyspace.
#[derive(Debug, thiserror::Error)]
pub(crate) enum CollectKeySpaceError {
    /// A value that was read could not be deserialized.
    #[error(transparent)]
    Decode(#[from] DeserializeError),
    /// Reading a page failed for a reason other than cancellation.
    /// `PageReconstructError::Cancelled` is mapped to [`Self::Cancelled`] by the
    /// `From<PageReconstructError>` implementation instead of ending up here.
    #[error(transparent)]
    PageRead(PageReconstructError),
    /// The operation was cancelled.
    #[error("cancelled")]
    Cancelled,
}
     116              : 
     117              : impl From<PageReconstructError> for CollectKeySpaceError {
     118            0 :     fn from(err: PageReconstructError) -> Self {
     119            0 :         match err {
     120            0 :             PageReconstructError::Cancelled => Self::Cancelled,
     121            0 :             err => Self::PageRead(err),
     122              :         }
     123            0 :     }
     124              : }
     125              : 
     126              : impl From<PageReconstructError> for CalculateLogicalSizeError {
     127            0 :     fn from(pre: PageReconstructError) -> Self {
     128            0 :         match pre {
     129            0 :             PageReconstructError::Cancelled => Self::Cancelled,
     130            0 :             _ => Self::PageRead(pre),
     131              :         }
     132            0 :     }
     133              : }
     134              : 
/// Errors concerning operations on relations.
#[derive(Debug, thiserror::Error)]
pub enum RelationError {
    /// The relation already exists.
    #[error("Relation Already Exists")]
    AlreadyExists,
    /// The relation tag is invalid. The read paths in this file return this
    /// (wrapped in `PageReconstructError::Other`) when `tag.relnode == 0`.
    #[error("invalid relnode")]
    InvalidRelnode,
    /// Any other failure, carried as an opaque `anyhow::Error`.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}
     144              : 
     145              : ///
     146              : /// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
     147              : /// and other special kinds of files, in a versioned key-value store. The
     148              : /// Timeline struct provides the key-value store.
     149              : ///
     150              : /// This is a separate impl, so that we can easily include all these functions in a Timeline
     151              : /// implementation, and might be moved into a separate struct later.
     152              : impl Timeline {
     153              :     /// Start ingesting a WAL record, or other atomic modification of
     154              :     /// the timeline.
     155              :     ///
     156              :     /// This provides a transaction-like interface to perform a bunch
     157              :     /// of modifications atomically.
     158              :     ///
     159              :     /// To ingest a WAL record, call begin_modification(lsn) to get a
     160              :     /// DatadirModification object. Use the functions in the object to
     161              :     /// modify the repository state, updating all the pages and metadata
     162              :     /// that the WAL record affects. When you're done, call commit() to
     163              :     /// commit the changes.
     164              :     ///
     165              :     /// Lsn stored in modification is advanced by `ingest_record` and
     166              :     /// is used by `commit()` to update `last_record_lsn`.
     167              :     ///
     168              :     /// Calling commit() will flush all the changes and reset the state,
     169              :     /// so the `DatadirModification` struct can be reused to perform the next modification.
     170              :     ///
     171              :     /// Note that any pending modifications you make through the
     172              :     /// modification object won't be visible to calls to the 'get' and list
     173              :     /// functions of the timeline until you finish! And if you update the
     174              :     /// same page twice, the last update wins.
     175              :     ///
     176       536820 :     pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification
     177       536820 :     where
     178       536820 :         Self: Sized,
     179       536820 :     {
     180       536820 :         DatadirModification {
     181       536820 :             tline: self,
     182       536820 :             pending_lsns: Vec::new(),
     183       536820 :             pending_metadata_pages: HashMap::new(),
     184       536820 :             pending_data_batch: None,
     185       536820 :             pending_deletions: Vec::new(),
     186       536820 :             pending_nblocks: 0,
     187       536820 :             pending_directory_entries: Vec::new(),
     188       536820 :             pending_metadata_bytes: 0,
     189       536820 :             lsn,
     190       536820 :         }
     191       536820 :     }
     192              : 
     193              :     //------------------------------------------------------------------------------
     194              :     // Public GET functions
     195              :     //------------------------------------------------------------------------------
     196              : 
     197              :     /// Look up given page version.
     198        36768 :     pub(crate) async fn get_rel_page_at_lsn(
     199        36768 :         &self,
     200        36768 :         tag: RelTag,
     201        36768 :         blknum: BlockNumber,
     202        36768 :         version: Version<'_>,
     203        36768 :         ctx: &RequestContext,
     204        36768 :         io_concurrency: IoConcurrency,
     205        36768 :     ) -> Result<Bytes, PageReconstructError> {
     206        36768 :         match version {
     207        36768 :             Version::Lsn(effective_lsn) => {
     208        36768 :                 let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
     209        36768 :                 let res = self
     210        36768 :                     .get_rel_page_at_lsn_batched(
     211        36768 :                         pages.iter().map(|(tag, blknum)| (tag, blknum)),
     212        36768 :                         effective_lsn,
     213        36768 :                         io_concurrency.clone(),
     214        36768 :                         ctx,
     215        36768 :                     )
     216        36768 :                     .await;
     217        36768 :                 assert_eq!(res.len(), 1);
     218        36768 :                 res.into_iter().next().unwrap()
     219              :             }
     220            0 :             Version::Modified(modification) => {
     221            0 :                 if tag.relnode == 0 {
     222            0 :                     return Err(PageReconstructError::Other(
     223            0 :                         RelationError::InvalidRelnode.into(),
     224            0 :                     ));
     225            0 :                 }
     226              : 
     227            0 :                 let nblocks = self.get_rel_size(tag, version, ctx).await?;
     228            0 :                 if blknum >= nblocks {
     229            0 :                     debug!(
     230            0 :                         "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
     231            0 :                         tag,
     232            0 :                         blknum,
     233            0 :                         version.get_lsn(),
     234              :                         nblocks
     235              :                     );
     236            0 :                     return Ok(ZERO_PAGE.clone());
     237            0 :                 }
     238            0 : 
     239            0 :                 let key = rel_block_to_key(tag, blknum);
     240            0 :                 modification.get(key, ctx).await
     241              :             }
     242              :         }
     243        36768 :     }
     244              : 
    /// Like [`Self::get_rel_page_at_lsn`], but returns a batch of pages.
    ///
    /// The ordering of the returned vec corresponds to the ordering of `pages`.
    ///
    /// Each requested page gets its own `Result` in the output:
    ///
    /// * `tag.relnode == 0` yields `PageReconstructError::Other(InvalidRelnode)`.
    /// * A failure to determine the relation size yields that error.
    /// * A block number at or beyond the relation size yields an all-zeros page.
    /// * All remaining pages are fetched with a single `get_vectored` call. If the same
    ///   key is requested more than once, the first requesting slot receives the original
    ///   result and the other slots receive a clone of the page, or, for errors other
    ///   than `Cancelled`, an `Other` error describing the original one.
    /// * If `get_vectored` fails as a whole, every page that was part of the vectored
    ///   read receives an error derived from that failure.
    ///
    /// # Panics
    ///
    /// Panics if `get_vectored` returns a key that was not requested (or the same key
    /// twice), or if not every response slot was filled by the time the function returns.
    pub(crate) async fn get_rel_page_at_lsn_batched(
        &self,
        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber)>,
        effective_lsn: Lsn,
        io_concurrency: IoConcurrency,
        ctx: &RequestContext,
    ) -> Vec<Result<Bytes, PageReconstructError>> {
        debug_assert_current_span_has_tenant_and_timeline_id();

        // Number of response slots initialized so far; must reach `page_count` before
        // the vec's length is set at the end.
        let mut slots_filled = 0;
        let page_count = pages.len();

        // Would be nice to use smallvec here but it doesn't provide the spare_capacity_mut() API.
        let mut result = Vec::with_capacity(pages.len());
        // Uninitialized slots of `result`; slot `i` belongs to the `i`-th requested page.
        let result_slots = result.spare_capacity_mut();

        // Maps each key that needs a storage read to all response slots that asked for it.
        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[usize; 1]>> = BTreeMap::default();
        for (response_slot_idx, (tag, blknum)) in pages.enumerate() {
            if tag.relnode == 0 {
                result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                    RelationError::InvalidRelnode.into(),
                )));

                slots_filled += 1;
                continue;
            }

            let nblocks = match self
                .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx)
                .await
            {
                Ok(nblocks) => nblocks,
                Err(err) => {
                    result_slots[response_slot_idx].write(Err(err));
                    slots_filled += 1;
                    continue;
                }
            };

            if *blknum >= nblocks {
                debug!(
                    "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
                    tag, blknum, effective_lsn, nblocks
                );
                result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
                slots_filled += 1;
                continue;
            }

            let key = rel_block_to_key(*tag, *blknum);

            let key_slots = keys_slots.entry(key).or_default();
            key_slots.push(response_slot_idx);
        }

        let keyspace = {
            // add_key requires monotonicity
            // (the `BTreeMap` iterates its keys in ascending order)
            let mut acc = KeySpaceAccum::new();
            for key in keys_slots
                .keys()
                // in fact it requires strong monotonicity
                .dedup()
            {
                acc.add_key(*key);
            }
            acc.to_keyspace()
        };

        match self
            .get_vectored(keyspace, effective_lsn, io_concurrency, ctx)
            .await
        {
            Ok(results) => {
                for (key, res) in results {
                    // Removing the entry means a key returned twice would panic here
                    // rather than write the same slots again.
                    let mut key_slots = keys_slots.remove(&key).unwrap().into_iter();
                    let first_slot = key_slots.next().unwrap();

                    // Additional requests for the same key get a copy of the result; the
                    // original `res` is moved into the first slot afterwards.
                    for slot in key_slots {
                        let clone = match &res {
                            Ok(buf) => Ok(buf.clone()),
                            Err(err) => Err(match err {
                                PageReconstructError::Cancelled => PageReconstructError::Cancelled,

                                x @ PageReconstructError::Other(_)
                                | x @ PageReconstructError::AncestorLsnTimeout(_)
                                | x @ PageReconstructError::WalRedo(_)
                                | x @ PageReconstructError::MissingKey(_) => {
                                    PageReconstructError::Other(anyhow::anyhow!(
                                        "there was more than one request for this key in the batch, error logged once: {x:?}"
                                    ))
                                }
                            }),
                        };

                        result_slots[slot].write(clone);
                        slots_filled += 1;
                    }

                    result_slots[first_slot].write(res);
                    slots_filled += 1;
                }
            }
            Err(err) => {
                // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
                // (We enforce the max batch size outside of this function, in the code that constructs the batch request.)
                for slot in keys_slots.values().flatten() {
                    // this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
                    // but without taking ownership of the GetVectoredError
                    let err = match &err {
                        GetVectoredError::Cancelled => Err(PageReconstructError::Cancelled),
                        // TODO: restructure get_vectored API to make this error per-key
                        GetVectoredError::MissingKey(err) => {
                            Err(PageReconstructError::Other(anyhow::anyhow!(
                                "whole vectored get request failed because one or more of the requested keys were missing: {err:?}"
                            )))
                        }
                        // TODO: restructure get_vectored API to make this error per-key
                        GetVectoredError::GetReadyAncestorError(err) => {
                            Err(PageReconstructError::Other(anyhow::anyhow!(
                                "whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"
                            )))
                        }
                        // TODO: restructure get_vectored API to make this error per-key
                        GetVectoredError::Other(err) => Err(PageReconstructError::Other(
                            anyhow::anyhow!("whole vectored get request failed: {err:?}"),
                        )),
                        // TODO: we can prevent this error class by moving this check into the type system
                        GetVectoredError::InvalidLsn(e) => {
                            Err(anyhow::anyhow!("invalid LSN: {e:?}").into())
                        }
                        // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS
                        // TODO: we can prevent this error class by moving this check into the type system
                        GetVectoredError::Oversized(err) => {
                            Err(anyhow::anyhow!("batching oversized: {err:?}").into())
                        }
                    };

                    result_slots[*slot].write(err);
                }

                // Account for every slot written in the loop above in one go.
                slots_filled += keys_slots.values().map(|slots| slots.len()).sum::<usize>();
            }
        };

        // Every slot index comes from `enumerate()` exactly once and is either written
        // directly in the first loop or recorded in exactly one `keys_slots` entry, so a
        // matching count means every slot has been initialized.
        assert_eq!(slots_filled, page_count);
        // SAFETY:
        // 1. `result` and any of its uninit members are not read from until this point
        // 2. The length below is tracked at run-time and matches the number of requested pages.
        unsafe {
            result.set_len(page_count);
        }

        result
    }
     402              : 
     403              :     /// Get size of a database in blocks. This is only accurate on shard 0. It will undercount on
     404              :     /// other shards, by only accounting for relations the shard has pages for, and only accounting
     405              :     /// for pages up to the highest page number it has stored.
     406            0 :     pub(crate) async fn get_db_size(
     407            0 :         &self,
     408            0 :         spcnode: Oid,
     409            0 :         dbnode: Oid,
     410            0 :         version: Version<'_>,
     411            0 :         ctx: &RequestContext,
     412            0 :     ) -> Result<usize, PageReconstructError> {
     413            0 :         let mut total_blocks = 0;
     414              : 
     415            0 :         let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
     416              : 
     417            0 :         for rel in rels {
     418            0 :             let n_blocks = self.get_rel_size(rel, version, ctx).await?;
     419            0 :             total_blocks += n_blocks as usize;
     420              :         }
     421            0 :         Ok(total_blocks)
     422            0 :     }
     423              : 
     424              :     /// Get size of a relation file. The relation must exist, otherwise an error is returned.
     425              :     ///
     426              :     /// This is only accurate on shard 0. On other shards, it will return the size up to the highest
     427              :     /// page number stored in the shard.
     428        48868 :     pub(crate) async fn get_rel_size(
     429        48868 :         &self,
     430        48868 :         tag: RelTag,
     431        48868 :         version: Version<'_>,
     432        48868 :         ctx: &RequestContext,
     433        48868 :     ) -> Result<BlockNumber, PageReconstructError> {
     434        48868 :         if tag.relnode == 0 {
     435            0 :             return Err(PageReconstructError::Other(
     436            0 :                 RelationError::InvalidRelnode.into(),
     437            0 :             ));
     438        48868 :         }
     439              : 
     440        48868 :         if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
     441        38588 :             return Ok(nblocks);
     442        10280 :         }
     443        10280 : 
     444        10280 :         if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
     445            0 :             && !self.get_rel_exists(tag, version, ctx).await?
     446              :         {
     447              :             // FIXME: Postgres sometimes calls smgrcreate() to create
     448              :             // FSM, and smgrnblocks() on it immediately afterwards,
     449              :             // without extending it.  Tolerate that by claiming that
     450              :             // any non-existent FSM fork has size 0.
     451            0 :             return Ok(0);
     452        10280 :         }
     453        10280 : 
     454        10280 :         let key = rel_size_to_key(tag);
     455        10280 :         let mut buf = version.get(self, key, ctx).await?;
     456        10272 :         let nblocks = buf.get_u32_le();
     457        10272 : 
     458        10272 :         self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
     459        10272 : 
     460        10272 :         Ok(nblocks)
     461        48868 :     }
     462              : 
     463              :     /// Does the relation exist?
     464              :     ///
     465              :     /// Only shard 0 has a full view of the relations. Other shards only know about relations that
     466              :     /// the shard stores pages for.
     467        12100 :     pub(crate) async fn get_rel_exists(
     468        12100 :         &self,
     469        12100 :         tag: RelTag,
     470        12100 :         version: Version<'_>,
     471        12100 :         ctx: &RequestContext,
     472        12100 :     ) -> Result<bool, PageReconstructError> {
     473        12100 :         if tag.relnode == 0 {
     474            0 :             return Err(PageReconstructError::Other(
     475            0 :                 RelationError::InvalidRelnode.into(),
     476            0 :             ));
     477        12100 :         }
     478              : 
     479              :         // first try to lookup relation in cache
     480        12100 :         if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
     481        12064 :             return Ok(true);
     482           36 :         }
     483              :         // then check if the database was already initialized.
     484              :         // get_rel_exists can be called before dbdir is created.
     485           36 :         let buf = version.get(self, DBDIR_KEY, ctx).await?;
     486           36 :         let dbdirs = DbDirectory::des(&buf)?.dbdirs;
     487           36 :         if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
     488            0 :             return Ok(false);
     489           36 :         }
     490           36 : 
     491           36 :         // Read path: first read the new reldir keyspace. Early return if the relation exists.
     492           36 :         // Otherwise, read the old reldir keyspace.
     493           36 :         // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
     494           36 : 
     495           36 :         if self.get_rel_size_v2_enabled() {
     496              :             // fetch directory listing (new)
     497            0 :             let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
     498            0 :             let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
     499            0 :                 .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
     500            0 :             let exists_v2 = buf == RelDirExists::Exists;
     501            0 :             // Fast path: if the relation exists in the new format, return true.
     502            0 :             // TODO: we should have a verification mode that checks both keyspaces
     503            0 :             // to ensure the relation only exists in one of them.
     504            0 :             if exists_v2 {
     505            0 :                 return Ok(true);
     506            0 :             }
     507           36 :         }
     508              : 
     509              :         // fetch directory listing (old)
     510              : 
     511           36 :         let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
     512           36 :         let buf = version.get(self, key, ctx).await?;
     513              : 
     514           36 :         let dir = RelDirectory::des(&buf)?;
     515           36 :         let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum));
     516           36 :         Ok(exists_v1)
     517        12100 :     }
     518              : 
     519              :     /// Get a list of all existing relations in given tablespace and database.
     520              :     ///
     521              :     /// Only shard 0 has a full view of the relations. Other shards only know about relations that
     522              :     /// the shard stores pages for.
     523              :     ///
     524              :     /// # Cancel-Safety
     525              :     ///
     526              :     /// This method is cancellation-safe.
     527            0 :     pub(crate) async fn list_rels(
     528            0 :         &self,
     529            0 :         spcnode: Oid,
     530            0 :         dbnode: Oid,
     531            0 :         version: Version<'_>,
     532            0 :         ctx: &RequestContext,
     533            0 :     ) -> Result<HashSet<RelTag>, PageReconstructError> {
     534            0 :         // fetch directory listing (old)
     535            0 :         let key = rel_dir_to_key(spcnode, dbnode);
     536            0 :         let buf = version.get(self, key, ctx).await?;
     537              : 
     538            0 :         let dir = RelDirectory::des(&buf)?;
     539            0 :         let rels_v1: HashSet<RelTag> =
     540            0 :             HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
     541            0 :                 spcnode,
     542            0 :                 dbnode,
     543            0 :                 relnode: *relnode,
     544            0 :                 forknum: *forknum,
     545            0 :             }));
     546            0 : 
     547            0 :         if !self.get_rel_size_v2_enabled() {
     548            0 :             return Ok(rels_v1);
     549            0 :         }
     550            0 : 
     551            0 :         // scan directory listing (new), merge with the old results
     552            0 :         let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
     553            0 :         let io_concurrency = IoConcurrency::spawn_from_conf(
     554            0 :             self.conf,
     555            0 :             self.gate
     556            0 :                 .enter()
     557            0 :                 .map_err(|_| PageReconstructError::Cancelled)?,
     558              :         );
     559            0 :         let results = self
     560            0 :             .scan(
     561            0 :                 KeySpace::single(key_range),
     562            0 :                 version.get_lsn(),
     563            0 :                 ctx,
     564            0 :                 io_concurrency,
     565            0 :             )
     566            0 :             .await?;
     567            0 :         let mut rels = rels_v1;
     568            0 :         for (key, val) in results {
     569            0 :             let val = RelDirExists::decode(&val?)
     570            0 :                 .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
     571            0 :             assert_eq!(key.field6, 1);
     572            0 :             assert_eq!(key.field2, spcnode);
     573            0 :             assert_eq!(key.field3, dbnode);
     574            0 :             let tag = RelTag {
     575            0 :                 spcnode,
     576            0 :                 dbnode,
     577            0 :                 relnode: key.field4,
     578            0 :                 forknum: key.field5,
     579            0 :             };
     580            0 :             if val == RelDirExists::Removed {
     581            0 :                 debug_assert!(!rels.contains(&tag), "removed reltag in v2");
     582            0 :                 continue;
     583            0 :             }
     584            0 :             let did_not_contain = rels.insert(tag);
     585            0 :             debug_assert!(did_not_contain, "duplicate reltag in v2");
     586              :         }
     587            0 :         Ok(rels)
     588            0 :     }
     589              : 
     590              :     /// Get the whole SLRU segment
     591            0 :     pub(crate) async fn get_slru_segment(
     592            0 :         &self,
     593            0 :         kind: SlruKind,
     594            0 :         segno: u32,
     595            0 :         lsn: Lsn,
     596            0 :         ctx: &RequestContext,
     597            0 :     ) -> Result<Bytes, PageReconstructError> {
     598            0 :         assert!(self.tenant_shard_id.is_shard_zero());
     599            0 :         let n_blocks = self
     600            0 :             .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
     601            0 :             .await?;
     602            0 :         let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
     603            0 :         for blkno in 0..n_blocks {
     604            0 :             let block = self
     605            0 :                 .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
     606            0 :                 .await?;
     607            0 :             segment.extend_from_slice(&block[..BLCKSZ as usize]);
     608              :         }
     609            0 :         Ok(segment.freeze())
     610            0 :     }
     611              : 
     612              :     /// Look up given SLRU page version.
     613            0 :     pub(crate) async fn get_slru_page_at_lsn(
     614            0 :         &self,
     615            0 :         kind: SlruKind,
     616            0 :         segno: u32,
     617            0 :         blknum: BlockNumber,
     618            0 :         lsn: Lsn,
     619            0 :         ctx: &RequestContext,
     620            0 :     ) -> Result<Bytes, PageReconstructError> {
     621            0 :         assert!(self.tenant_shard_id.is_shard_zero());
     622            0 :         let key = slru_block_to_key(kind, segno, blknum);
     623            0 :         self.get(key, lsn, ctx).await
     624            0 :     }
     625              : 
     626              :     /// Get size of an SLRU segment
     627            0 :     pub(crate) async fn get_slru_segment_size(
     628            0 :         &self,
     629            0 :         kind: SlruKind,
     630            0 :         segno: u32,
     631            0 :         version: Version<'_>,
     632            0 :         ctx: &RequestContext,
     633            0 :     ) -> Result<BlockNumber, PageReconstructError> {
     634            0 :         assert!(self.tenant_shard_id.is_shard_zero());
     635            0 :         let key = slru_segment_size_to_key(kind, segno);
     636            0 :         let mut buf = version.get(self, key, ctx).await?;
     637            0 :         Ok(buf.get_u32_le())
     638            0 :     }
     639              : 
     640              :     /// Check whether an SLRU segment exists
     641            0 :     pub(crate) async fn get_slru_segment_exists(
     642            0 :         &self,
     643            0 :         kind: SlruKind,
     644            0 :         segno: u32,
     645            0 :         version: Version<'_>,
     646            0 :         ctx: &RequestContext,
     647            0 :     ) -> Result<bool, PageReconstructError> {
     648            0 :         assert!(self.tenant_shard_id.is_shard_zero());
     649              :         // fetch directory listing
     650            0 :         let key = slru_dir_to_key(kind);
     651            0 :         let buf = version.get(self, key, ctx).await?;
     652              : 
     653            0 :         let dir = SlruSegmentDirectory::des(&buf)?;
     654            0 :         Ok(dir.segments.contains(&segno))
     655            0 :     }
     656              : 
     657              :     /// Locate LSN, such that all transactions that committed before
     658              :     /// 'search_timestamp' are visible, but nothing newer is.
     659              :     ///
     660              :     /// This is not exact. Commit timestamps are not guaranteed to be ordered,
     661              :     /// so it's not well defined which LSN you get if there were multiple commits
     662              :     /// "in flight" at that point in time.
     663              :     ///
     664            0 :     pub(crate) async fn find_lsn_for_timestamp(
     665            0 :         &self,
     666            0 :         search_timestamp: TimestampTz,
     667            0 :         cancel: &CancellationToken,
     668            0 :         ctx: &RequestContext,
     669            0 :     ) -> Result<LsnForTimestamp, PageReconstructError> {
     670            0 :         pausable_failpoint!("find-lsn-for-timestamp-pausable");
     671              : 
     672            0 :         let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
     673            0 :         let gc_cutoff_planned = {
     674            0 :             let gc_info = self.gc_info.read().unwrap();
     675            0 :             gc_info.min_cutoff()
     676            0 :         };
     677            0 :         // Usually the planned cutoff is newer than the cutoff of the last gc run,
     678            0 :         // but let's be defensive.
     679            0 :         let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard);
     680            0 :         // We use this method to figure out the branching LSN for the new branch, but the
     681            0 :         // GC cutoff could be before the branching point and we cannot create a new branch
     682            0 :         // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
     683            0 :         // on the safe side.
     684            0 :         let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn());
     685            0 :         let max_lsn = self.get_last_record_lsn();
     686            0 : 
     687            0 :         // LSNs are always 8-byte aligned. low/mid/high represent the
     688            0 :         // LSN divided by 8.
     689            0 :         let mut low = min_lsn.0 / 8;
     690            0 :         let mut high = max_lsn.0 / 8 + 1;
     691            0 : 
     692            0 :         let mut found_smaller = false;
     693            0 :         let mut found_larger = false;
     694              : 
     695            0 :         while low < high {
     696            0 :             if cancel.is_cancelled() {
     697            0 :                 return Err(PageReconstructError::Cancelled);
     698            0 :             }
     699            0 :             // cannot overflow, high and low are both smaller than u64::MAX / 2
     700            0 :             let mid = (high + low) / 2;
     701              : 
     702            0 :             let cmp = match self
     703            0 :                 .is_latest_commit_timestamp_ge_than(
     704            0 :                     search_timestamp,
     705            0 :                     Lsn(mid * 8),
     706            0 :                     &mut found_smaller,
     707            0 :                     &mut found_larger,
     708            0 :                     ctx,
     709            0 :                 )
     710            0 :                 .await
     711              :             {
     712            0 :                 Ok(res) => res,
     713            0 :                 Err(PageReconstructError::MissingKey(e)) => {
     714            0 :                     warn!(
     715            0 :                         "Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}",
     716              :                         e
     717              :                     );
     718              :                     // Having logged the error, report that we didn't find any commit timestamps smaller than the request.
     719            0 :                     return Ok(LsnForTimestamp::Past(min_lsn));
     720              :                 }
     721            0 :                 Err(e) => return Err(e),
     722              :             };
     723              : 
     724            0 :             if cmp {
     725            0 :                 high = mid;
     726            0 :             } else {
     727            0 :                 low = mid + 1;
     728            0 :             }
     729              :         }
     730              : 
     731              :         // If `found_smaller == true`, `low = t + 1` where `t` is the target LSN,
     732              :         // i.e. the LSN of the last commit record before or at `search_timestamp`.
     733              :         // Remove one from `low` to get `t`.
     734              :         //
     735              :         // FIXME: it would be better to get the LSN of the previous commit.
     736              :         // Otherwise, if you restore to the returned LSN, the database will
     737              :         // include physical changes from later commits that will be marked
     738              :         // as aborted, and will need to be vacuumed away.
     739            0 :         let commit_lsn = Lsn((low - 1) * 8);
     740            0 :         match (found_smaller, found_larger) {
     741              :             (false, false) => {
     742              :                 // This can happen if no commit records have been processed yet, e.g.
     743              :                 // just after importing a cluster.
     744            0 :                 Ok(LsnForTimestamp::NoData(min_lsn))
     745              :             }
     746              :             (false, true) => {
     747              :                 // Didn't find any commit timestamps smaller than the request
     748            0 :                 Ok(LsnForTimestamp::Past(min_lsn))
     749              :             }
     750            0 :             (true, _) if commit_lsn < min_lsn => {
     751            0 :                 // the search above did set found_smaller to true but it never increased the lsn.
     752            0 :                 // Then, low is still the old min_lsn, and the subtraction above gave a value
     753            0 :                 // below the min_lsn. We should never do that.
     754            0 :                 Ok(LsnForTimestamp::Past(min_lsn))
     755              :             }
     756              :             (true, false) => {
     757              :                 // Only found commits with timestamps smaller than the request.
     758              :                 // It's still a valid case for branch creation, return it.
     759              :                 // And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
     760              :                 // case, anyway.
     761            0 :                 Ok(LsnForTimestamp::Future(commit_lsn))
     762              :             }
     763            0 :             (true, true) => Ok(LsnForTimestamp::Present(commit_lsn)),
     764              :         }
     765            0 :     }
     766              : 
     767              :     /// Subroutine of find_lsn_for_timestamp(). Returns true if there are any
     768              :     /// commits that committed at or after 'search_timestamp', at LSN 'probe_lsn'.
     769              :     ///
     770              :     /// Additionally, sets 'found_smaller'/'found_larger' if it encounters any commits
     771              :     /// with a smaller/larger timestamp.
     772              :     ///
     773            0 :     pub(crate) async fn is_latest_commit_timestamp_ge_than(
     774            0 :         &self,
     775            0 :         search_timestamp: TimestampTz,
     776            0 :         probe_lsn: Lsn,
     777            0 :         found_smaller: &mut bool,
     778            0 :         found_larger: &mut bool,
     779            0 :         ctx: &RequestContext,
     780            0 :     ) -> Result<bool, PageReconstructError> {
     781            0 :         self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
     782            0 :             if timestamp >= search_timestamp {
     783            0 :                 *found_larger = true;
     784            0 :                 return ControlFlow::Break(true);
     785            0 :             } else {
     786            0 :                 *found_smaller = true;
     787            0 :             }
     788            0 :             ControlFlow::Continue(())
     789            0 :         })
     790            0 :         .await
     791            0 :     }
     792              : 
     793              :     /// Obtain the latest timestamp found for the given lsn.
     794              :     ///
     795              :     /// Returns `None` if the lsn has no timestamps; otherwise returns the maximum of the timestamps found.
     796            0 :     pub(crate) async fn get_timestamp_for_lsn(
     797            0 :         &self,
     798            0 :         probe_lsn: Lsn,
     799            0 :         ctx: &RequestContext,
     800            0 :     ) -> Result<Option<TimestampTz>, PageReconstructError> {
     801            0 :         let mut max: Option<TimestampTz> = None;
     802            0 :         self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
     803            0 :             if let Some(max_prev) = max {
     804            0 :                 max = Some(max_prev.max(timestamp));
     805            0 :             } else {
     806            0 :                 max = Some(timestamp);
     807            0 :             }
     808            0 :             ControlFlow::Continue(())
     809            0 :         })
     810            0 :         .await?;
     811              : 
     812            0 :         Ok(max)
     813            0 :     }
     814              : 
     815              :     /// Runs the given function on all the timestamps for a given lsn
     816              :     ///
     817              :     /// The return value is either given by the closure, or set to the `Default`
     818              :     /// impl's output.
     819            0 :     async fn map_all_timestamps<T: Default>(
     820            0 :         &self,
     821            0 :         probe_lsn: Lsn,
     822            0 :         ctx: &RequestContext,
     823            0 :         mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
     824            0 :     ) -> Result<T, PageReconstructError> {
     825            0 :         for segno in self
     826            0 :             .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
     827            0 :             .await?
     828              :         {
     829            0 :             let nblocks = self
     830            0 :                 .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
     831            0 :                 .await?;
     832            0 :             for blknum in (0..nblocks).rev() {
     833            0 :                 let clog_page = self
     834            0 :                     .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
     835            0 :                     .await?;
     836              : 
     837            0 :                 if clog_page.len() == BLCKSZ as usize + 8 {
     838            0 :                     let mut timestamp_bytes = [0u8; 8];
     839            0 :                     timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
     840            0 :                     let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
     841            0 : 
     842            0 :                     match f(timestamp) {
     843            0 :                         ControlFlow::Break(b) => return Ok(b),
     844            0 :                         ControlFlow::Continue(()) => (),
     845              :                     }
     846            0 :                 }
     847              :             }
     848              :         }
     849            0 :         Ok(Default::default())
     850            0 :     }
     851              : 
     852            0 :     pub(crate) async fn get_slru_keyspace(
     853            0 :         &self,
     854            0 :         version: Version<'_>,
     855            0 :         ctx: &RequestContext,
     856            0 :     ) -> Result<KeySpace, PageReconstructError> {
     857            0 :         let mut accum = KeySpaceAccum::new();
     858              : 
     859            0 :         for kind in SlruKind::iter() {
     860            0 :             let mut segments: Vec<u32> = self
     861            0 :                 .list_slru_segments(kind, version, ctx)
     862            0 :                 .await?
     863            0 :                 .into_iter()
     864            0 :                 .collect();
     865            0 :             segments.sort_unstable();
     866              : 
     867            0 :             for seg in segments {
     868            0 :                 let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;
     869              : 
     870            0 :                 accum.add_range(
     871            0 :                     slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
     872            0 :                 );
     873              :             }
     874              :         }
     875              : 
     876            0 :         Ok(accum.to_keyspace())
     877            0 :     }
     878              : 
     879              :     /// Get a list of SLRU segments
     880            0 :     pub(crate) async fn list_slru_segments(
     881            0 :         &self,
     882            0 :         kind: SlruKind,
     883            0 :         version: Version<'_>,
     884            0 :         ctx: &RequestContext,
     885            0 :     ) -> Result<HashSet<u32>, PageReconstructError> {
     886            0 :         // fetch directory entry
     887            0 :         let key = slru_dir_to_key(kind);
     888              : 
     889            0 :         let buf = version.get(self, key, ctx).await?;
     890            0 :         Ok(SlruSegmentDirectory::des(&buf)?.segments)
     891            0 :     }
     892              : 
     893            0 :     pub(crate) async fn get_relmap_file(
     894            0 :         &self,
     895            0 :         spcnode: Oid,
     896            0 :         dbnode: Oid,
     897            0 :         version: Version<'_>,
     898            0 :         ctx: &RequestContext,
     899            0 :     ) -> Result<Bytes, PageReconstructError> {
     900            0 :         let key = relmap_file_key(spcnode, dbnode);
     901              : 
     902            0 :         let buf = version.get(self, key, ctx).await?;
     903            0 :         Ok(buf)
     904            0 :     }
     905              : 
     906          644 :     pub(crate) async fn list_dbdirs(
     907          644 :         &self,
     908          644 :         lsn: Lsn,
     909          644 :         ctx: &RequestContext,
     910          644 :     ) -> Result<HashMap<(Oid, Oid), bool>, PageReconstructError> {
     911              :         // fetch directory entry
     912          644 :         let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
     913              : 
     914          644 :         Ok(DbDirectory::des(&buf)?.dbdirs)
     915          644 :     }
     916              : 
     917            0 :     pub(crate) async fn get_twophase_file(
     918            0 :         &self,
     919            0 :         xid: u64,
     920            0 :         lsn: Lsn,
     921            0 :         ctx: &RequestContext,
     922            0 :     ) -> Result<Bytes, PageReconstructError> {
     923            0 :         let key = twophase_file_key(xid);
     924            0 :         let buf = self.get(key, lsn, ctx).await?;
     925            0 :         Ok(buf)
     926            0 :     }
     927              : 
     928          648 :     pub(crate) async fn list_twophase_files(
     929          648 :         &self,
     930          648 :         lsn: Lsn,
     931          648 :         ctx: &RequestContext,
     932          648 :     ) -> Result<HashSet<u64>, PageReconstructError> {
     933              :         // fetch directory entry
     934          648 :         let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
     935              : 
     936          648 :         if self.pg_version >= 17 {
     937            0 :             Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
     938              :         } else {
     939          648 :             Ok(TwoPhaseDirectory::des(&buf)?
     940              :                 .xids
     941          648 :                 .iter()
     942          648 :                 .map(|x| u64::from(*x))
     943          648 :                 .collect())
     944              :         }
     945          648 :     }
     946              : 
     947            0 :     pub(crate) async fn get_control_file(
     948            0 :         &self,
     949            0 :         lsn: Lsn,
     950            0 :         ctx: &RequestContext,
     951            0 :     ) -> Result<Bytes, PageReconstructError> {
     952            0 :         self.get(CONTROLFILE_KEY, lsn, ctx).await
     953            0 :     }
     954              : 
     955           24 :     pub(crate) async fn get_checkpoint(
     956           24 :         &self,
     957           24 :         lsn: Lsn,
     958           24 :         ctx: &RequestContext,
     959           24 :     ) -> Result<Bytes, PageReconstructError> {
     960           24 :         self.get(CHECKPOINT_KEY, lsn, ctx).await
     961           24 :     }
     962              : 
     963           24 :     async fn list_aux_files_v2(
     964           24 :         &self,
     965           24 :         lsn: Lsn,
     966           24 :         ctx: &RequestContext,
     967           24 :         io_concurrency: IoConcurrency,
     968           24 :     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
     969           24 :         let kv = self
     970           24 :             .scan(
     971           24 :                 KeySpace::single(Key::metadata_aux_key_range()),
     972           24 :                 lsn,
     973           24 :                 ctx,
     974           24 :                 io_concurrency,
     975           24 :             )
     976           24 :             .await?;
     977           24 :         let mut result = HashMap::new();
     978           24 :         let mut sz = 0;
     979           60 :         for (_, v) in kv {
     980           36 :             let v = v?;
     981           36 :             let v = aux_file::decode_file_value_bytes(&v)
     982           36 :                 .context("value decode")
     983           36 :                 .map_err(PageReconstructError::Other)?;
     984           68 :             for (fname, content) in v {
     985           32 :                 sz += fname.len();
     986           32 :                 sz += content.len();
     987           32 :                 result.insert(fname, content);
     988           32 :             }
     989              :         }
     990           24 :         self.aux_file_size_estimator.on_initial(sz);
     991           24 :         Ok(result)
     992           24 :     }
     993              : 
     994            0 :     pub(crate) async fn trigger_aux_file_size_computation(
     995            0 :         &self,
     996            0 :         lsn: Lsn,
     997            0 :         ctx: &RequestContext,
     998            0 :         io_concurrency: IoConcurrency,
     999            0 :     ) -> Result<(), PageReconstructError> {
    1000            0 :         self.list_aux_files_v2(lsn, ctx, io_concurrency).await?;
    1001            0 :         Ok(())
    1002            0 :     }
    1003              : 
    1004           24 :     pub(crate) async fn list_aux_files(
    1005           24 :         &self,
    1006           24 :         lsn: Lsn,
    1007           24 :         ctx: &RequestContext,
    1008           24 :         io_concurrency: IoConcurrency,
    1009           24 :     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
    1010           24 :         self.list_aux_files_v2(lsn, ctx, io_concurrency).await
    1011           24 :     }
    1012              : 
    1013            0 :     pub(crate) async fn get_replorigins(
    1014            0 :         &self,
    1015            0 :         lsn: Lsn,
    1016            0 :         ctx: &RequestContext,
    1017            0 :         io_concurrency: IoConcurrency,
    1018            0 :     ) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
    1019            0 :         let kv = self
    1020            0 :             .scan(
    1021            0 :                 KeySpace::single(repl_origin_key_range()),
    1022            0 :                 lsn,
    1023            0 :                 ctx,
    1024            0 :                 io_concurrency,
    1025            0 :             )
    1026            0 :             .await?;
    1027            0 :         let mut result = HashMap::new();
    1028            0 :         for (k, v) in kv {
    1029            0 :             let v = v?;
    1030            0 :             let origin_id = k.field6 as RepOriginId;
    1031            0 :             let origin_lsn = Lsn::des(&v).unwrap();
    1032            0 :             if origin_lsn != Lsn::INVALID {
    1033            0 :                 result.insert(origin_id, origin_lsn);
    1034            0 :             }
    1035              :         }
    1036            0 :         Ok(result)
    1037            0 :     }
    1038              : 
    1039              :     /// Does the same as get_current_logical_size, but computed on demand by summing relation sizes at `lsn`.
    1040              :     /// Used to initialize the logical size tracking on startup.
    1041              :     ///
    1042              :     /// Only relation blocks are counted currently. That excludes metadata,
    1043              :     /// SLRUs, twophase files etc.
    1044              :     /// The result is in bytes: the summed relation block counts multiplied by `BLCKSZ`.
    1045              :     /// # Cancel-Safety
    1046              :     ///
    1047              :     /// This method is cancellation-safe.
    1048            0 :     pub(crate) async fn get_current_logical_size_non_incremental(
    1049            0 :         &self,
    1050            0 :         lsn: Lsn,
    1051            0 :         ctx: &RequestContext,
    1052            0 :     ) -> Result<u64, CalculateLogicalSizeError> {
    1053            0 :         debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
    1054            0 : 
    1055            0 :         fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) });
    1056              : 
    1057              :         // Fetch list of database dirs and iterate them
    1058            0 :         let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
    1059            0 :         let dbdir = DbDirectory::des(&buf)?;
    1060              : 
    1061            0 :         let mut total_size: u64 = 0; // running total, in blocks
    1062            0 :         for (spcnode, dbnode) in dbdir.dbdirs.keys() {
    1063            0 :             for rel in self
    1064            0 :                 .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
    1065            0 :                 .await?
    1066              :             {
    1067            0 :                 if self.cancel.is_cancelled() { // checked once per relation so a long walk can be aborted
    1068            0 :                     return Err(CalculateLogicalSizeError::Cancelled);
    1069            0 :                 }
    1070            0 :                 let relsize_key = rel_size_to_key(rel);
    1071            0 :                 let mut buf = self.get(relsize_key, lsn, ctx).await?;
    1072            0 :                 let relsize = buf.get_u32_le(); // NOTE(review): `Buf::get_u32_le` panics if fewer than 4 bytes remain
    1073            0 : 
    1074            0 :                 total_size += relsize as u64;
    1075              :             }
    1076              :         }
    1077            0 :         Ok(total_size * BLCKSZ as u64)
    1078            0 :     }
    1079              : 
    1080              :     /// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
    1081              :     /// for gc-compaction.
    1082              :     ///
    1083              :     /// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it
    1084              :     /// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to
    1085              :     /// be kept only for a specific range of LSN.
    1086              :     ///
    1087              :     /// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at
    1088              :     /// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range
    1089              :     /// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to
    1090              :     /// determine which keys to retain/drop for gc-compaction.
    1091              :     ///
    1092              :     /// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace
    1093              :     /// to be retained for each branch LSN. Note that the current implementation takes no LSN argument and reads nothing from storage.
    1094              :     ///
    1095              :     /// The return value is (dense keyspace, sparse keyspace).
    1096          104 :     pub(crate) async fn collect_gc_compaction_keyspace(
    1097          104 :         &self,
    1098          104 :     ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
    1099          104 :         let metadata_key_begin = Key::metadata_key_range().start;
    1100          104 :         let aux_v1_key = AUX_FILES_KEY;
    1101          104 :         let dense_keyspace = KeySpace {
    1102          104 :             ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin], // everything below the metadata range except the single AUX_FILES_KEY
    1103          104 :         };
    1104          104 :         Ok((
    1105          104 :             dense_keyspace,
    1106          104 :             SparseKeySpace(KeySpace::single(Key::metadata_key_range())), // the whole metadata range is treated as sparse
    1107          104 :         ))
    1108          104 :     }
    1109              : 
    1110              :     ///
    1111              :     /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    1112              :     /// Anything that's not listed may be removed from the underlying storage (from
    1113              :     /// that LSN forwards).
    1114              :     ///
    1115              :     /// The return value is (dense keyspace, sparse keyspace).
    1116          644 :     pub(crate) async fn collect_keyspace(
    1117          644 :         &self,
    1118          644 :         lsn: Lsn,
    1119          644 :         ctx: &RequestContext,
    1120          644 :     ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
    1121          644 :         // Accumulate every dense key and key range that is in use at `lsn`
    1122          644 :         let mut result = KeySpaceAccum::new();
    1123          644 : 
    1124          644 :         // The dbdir metadata always exists
    1125          644 :         result.add_key(DBDIR_KEY);
    1126              : 
    1127              :         // Fetch list of database dirs and iterate them
    1128          644 :         let dbdir = self.list_dbdirs(lsn, ctx).await?;
    1129          644 :         let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
    1130          644 : 
    1131          644 :         dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b)); // presumably so keys reach the accumulator in ascending order — TODO confirm
    1132          644 :         for ((spcnode, dbnode), has_relmap_file) in dbs {
    1133            0 :             if has_relmap_file {
    1134            0 :                 result.add_key(relmap_file_key(spcnode, dbnode));
    1135            0 :             }
    1136            0 :             result.add_key(rel_dir_to_key(spcnode, dbnode));
    1137              : 
    1138            0 :             let mut rels: Vec<RelTag> = self
    1139            0 :                 .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
    1140            0 :                 .await?
    1141            0 :                 .into_iter()
    1142            0 :                 .collect();
    1143            0 :             rels.sort_unstable();
    1144            0 :             for rel in rels {
    1145            0 :                 let relsize_key = rel_size_to_key(rel);
    1146            0 :                 let mut buf = self.get(relsize_key, lsn, ctx).await?;
    1147            0 :                 let relsize = buf.get_u32_le(); // NOTE(review): `Buf::get_u32_le` panics if fewer than 4 bytes remain
    1148            0 : 
    1149            0 :                 result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); // half-open range: blocks 0..relsize
    1150            0 :                 result.add_key(relsize_key);
    1151              :             }
    1152              :         }
    1153              : 
    1154              :         // Iterate SLRUs next (only collected on shard zero)
    1155          644 :         if self.tenant_shard_id.is_shard_zero() {
    1156         1896 :             for kind in [
    1157          632 :                 SlruKind::Clog,
    1158          632 :                 SlruKind::MultiXactMembers,
    1159          632 :                 SlruKind::MultiXactOffsets,
    1160              :             ] {
    1161         1896 :                 let slrudir_key = slru_dir_to_key(kind);
    1162         1896 :                 result.add_key(slrudir_key);
    1163         1896 :                 let buf = self.get(slrudir_key, lsn, ctx).await?;
    1164         1896 :                 let dir = SlruSegmentDirectory::des(&buf)?;
    1165         1896 :                 let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
    1166         1896 :                 segments.sort_unstable();
    1167         1896 :                 for segno in segments {
    1168            0 :                     let segsize_key = slru_segment_size_to_key(kind, segno);
    1169            0 :                     let mut buf = self.get(segsize_key, lsn, ctx).await?;
    1170            0 :                     let segsize = buf.get_u32_le();
    1171            0 : 
    1172            0 :                     result.add_range(
    1173            0 :                         slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize),
    1174            0 :                     );
    1175            0 :                     result.add_key(segsize_key);
    1176              :                 }
    1177              :             }
    1178           12 :         }
    1179              : 
    1180              :         // Then pg_twophase
    1181          644 :         result.add_key(TWOPHASEDIR_KEY);
    1182              : 
    1183          644 :         let mut xids: Vec<u64> = self
    1184          644 :             .list_twophase_files(lsn, ctx)
    1185          644 :             .await?
    1186          644 :             .iter()
    1187          644 :             .cloned()
    1188          644 :             .collect();
    1189          644 :         xids.sort_unstable();
    1190          644 :         for xid in xids {
    1191            0 :             result.add_key(twophase_file_key(xid));
    1192            0 :         }
    1193              : 
    1194          644 :         result.add_key(CONTROLFILE_KEY);
    1195          644 :         result.add_key(CHECKPOINT_KEY);
    1196          644 : 
    1197          644 :         // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
    1198          644 :         // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
    1199          644 :         // and the keys will not be garbage-collected.
    1200          644 :         #[cfg(test)]
    1201          644 :         {
    1202          644 :             let guard = self.extra_test_dense_keyspace.load();
    1203          644 :             for kr in &guard.ranges {
    1204            0 :                 result.add_range(kr.clone());
    1205            0 :             }
    1206            0 :         }
    1207            0 : 
    1208          644 :         let dense_keyspace = result.to_keyspace();
    1209          644 :         let sparse_keyspace = SparseKeySpace(KeySpace {
    1210          644 :             ranges: vec![
    1211          644 :                 Key::metadata_aux_key_range(),
    1212          644 :                 repl_origin_key_range(),
    1213          644 :                 Key::rel_dir_sparse_key_range(),
    1214          644 :             ],
    1215          644 :         });
    1216          644 : 
    1217          644 :         if cfg!(debug_assertions) {
    1218              :             // Verify that the sparse keyspaces are ordered and non-overlapping.
    1219              : 
    1220              :             // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
    1221              :             // category of sparse keys is split into its own image/delta files. If there
    1222              :             // are overlapping keyspaces, they will be automatically merged by keyspace accum,
    1223              :             // and we want the developer to keep the keyspaces separated.
    1224              : 
    1225          644 :             let ranges = &sparse_keyspace.0.ranges;
    1226              : 
    1227              :             // TODO: use a single overlaps_with across the codebase
    1228         1932 :             fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    1229         1932 :                 !(a.end <= b.start || b.end <= a.start)
    1230         1932 :             }
    1231         1932 :             for i in 0..ranges.len() {
    1232         1932 :                 for j in 0..i {
    1233         1932 :                     if overlaps_with(&ranges[i], &ranges[j]) {
    1234            0 :                         panic!(
    1235            0 :                             "overlapping sparse keyspace: {}..{} and {}..{}",
    1236            0 :                             ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
    1237            0 :                         );
    1238         1932 :                     }
    1239              :                 }
    1240              :             }
    1241         1288 :             for i in 1..ranges.len() {
    1242         1288 :                 assert!(
    1243         1288 :                     ranges[i - 1].end <= ranges[i].start,
    1244            0 :                     "unordered sparse keyspace: {}..{} and {}..{}",
    1245            0 :                     ranges[i - 1].start,
    1246            0 :                     ranges[i - 1].end,
    1247            0 :                     ranges[i].start,
    1248            0 :                     ranges[i].end
    1249              :                 );
    1250              :             }
    1251            0 :         }
    1252              : 
    1253          644 :         Ok((dense_keyspace, sparse_keyspace))
    1254          644 :     }
    1255              : 
    1256              :     /// Get the cached size of a relation if the cache entry was not updated after the specified LSN (i.e. `lsn >= cached_lsn`); otherwise `None`.
    1257       897080 :     pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
    1258       897080 :         let rel_size_cache = self.rel_size_cache.read().unwrap();
    1259       897080 :         if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
    1260       897036 :             if lsn >= *cached_lsn {
    1261       886744 :                 RELSIZE_CACHE_HITS.inc();
    1262       886744 :                 return Some(*nblocks);
    1263        10292 :             }
    1264        10292 :             RELSIZE_CACHE_MISSES_OLD.inc(); // an entry exists, but it is newer than the requested LSN
    1265           44 :         }
    1266        10336 :         RELSIZE_CACHE_MISSES.inc(); // counted for both "entry too new" and "no entry" misses
    1267        10336 :         None
    1268       897080 :     }
    1269              : 
    1270              :     /// Update the cached relation size unless the cache already holds a more recent entry or `lsn` is older than `complete_as_of`.
    1271        10272 :     pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
    1272        10272 :         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
    1273        10272 : 
    1274        10272 :         if lsn < rel_size_cache.complete_as_of {
    1275              :             // Do not cache old values. It's safe to cache the size on read, as long as
    1276              :             // the read was at an LSN since we started the WAL ingestion. Reasoning: we
    1277              :             // never evict values from the cache, so if the relation size changed after
    1278              :             // 'lsn', the new value is already in the cache.
    1279            0 :             return;
    1280        10272 :         }
    1281        10272 : 
    1282        10272 :         match rel_size_cache.map.entry(tag) {
    1283        10272 :             hash_map::Entry::Occupied(mut entry) => {
    1284        10272 :                 let cached_lsn = entry.get_mut(); // despite the name, this is the whole `(lsn, nblocks)` entry
    1285        10272 :                 if lsn >= cached_lsn.0 { // only overwrite with an equal or newer LSN
    1286            0 :                     *cached_lsn = (lsn, nblocks);
    1287        10272 :                 }
    1288              :             }
    1289            0 :             hash_map::Entry::Vacant(entry) => {
    1290            0 :                 entry.insert((lsn, nblocks));
    1291            0 :                 RELSIZE_CACHE_ENTRIES.inc(); // the entry gauge only changes when a new tag is added
    1292            0 :             }
    1293              :         }
    1294        10272 :     }
    1295              : 
    1296              :     /// Store the cached relation size unconditionally, replacing any existing entry for `tag` regardless of its LSN.
    1297       565440 :     pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
    1298       565440 :         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
    1299       565440 :         if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() { // `None` means the tag was not cached before
    1300         3840 :             RELSIZE_CACHE_ENTRIES.inc();
    1301       561600 :         }
    1302       565440 :     }
    1303              : 
    1304              :     /// Remove the cached relation size for `tag`, if any; the entry gauge is decremented only when something was actually removed.
    1305            4 :     pub fn remove_cached_rel_size(&self, tag: &RelTag) {
    1306            4 :         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
    1307            4 :         if rel_size_cache.map.remove(tag).is_some() {
    1308            4 :             RELSIZE_CACHE_ENTRIES.dec();
    1309            4 :         }
    1310            4 :     }
    1311              : }
    1312              : 
    1313              : /// DatadirModification represents an operation to ingest an atomic set of
    1314              : /// updates to the repository.
    1315              : ///
    1316              : /// It is created by the 'begin_record' function. It is called for each WAL
    1317              : /// record, so that all the modifications made by one WAL record appear atomic.
    1318              : pub struct DatadirModification<'a> {
    1319              :     /// The timeline this modification applies to. You can access this to
    1320              :     /// read the state, but note that any pending updates are *not* reflected
    1321              :     /// in the state in 'tline' yet.
    1322              :     pub tline: &'a Timeline,
    1323              : 
    1324              :     /// Current LSN of the modification (only ever moved forward by `set_lsn`)
    1325              :     lsn: Lsn,
    1326              : 
    1327              :     // The modifications are not applied directly to the underlying key-value store.
    1328              :     // The put-functions add the modifications here, and they are flushed to the
    1329              :     // underlying key-value store by the 'finish' function.
    1330              :     pending_lsns: Vec<Lsn>, // previous values of `lsn`, pushed by `set_lsn` each time the LSN advances
    1331              :     pending_deletions: Vec<(Range<Key>, Lsn)>,
    1332              :     pending_nblocks: i64, // signed; presumably the net change in block count — TODO confirm
    1333              : 
    1334              :     /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
    1335              :     /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
    1336              :     pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
    1337              : 
    1338              :     /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
    1339              :     /// which keys are stored here.
    1340              :     pending_data_batch: Option<SerializedValueBatch>,
    1341              : 
    1342              :     /// For special "directory" keys that store key-value maps, track the size of the map
    1343              :     /// if it was updated in this modification.
    1344              :     pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>,
    1345              : 
    1346              :     /// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
    1347              :     pending_metadata_bytes: usize,
    1348              : }
    1349              : 
    1350              : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    1351              : pub enum MetricsUpdate { // paired with a `DirectoryKind` in `pending_directory_entries`
    1352              :     /// Set the metric to this value
    1353              :     Set(u64),
    1354              :     /// Increment the metric by this value
    1355              :     Add(u64),
    1356              :     /// Decrement the metric by this value
    1357              :     Sub(u64),
    1358              : }
    1359              : 
    1360              : impl DatadirModification<'_> {
    1361              :     // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
    1362              :     // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
    1363              :     // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
    1364              :     pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024; // 8 MiB
    1365              : 
    1366              :     /// Get the current LSN of this modification
    1367       836116 :     pub(crate) fn get_lsn(&self) -> Lsn {
    1368       836116 :         self.lsn
    1369       836116 :     }
    1370              :     /// Approximate size of the pending writes: the data batch's buffer size (0 when there is no batch) plus `pending_metadata_bytes`.
    1371            0 :     pub(crate) fn approx_pending_bytes(&self) -> usize {
    1372            0 :         self.pending_data_batch
    1373            0 :             .as_ref()
    1374            0 :             .map_or(0, |b| b.buffer_size())
    1375            0 :             + self.pending_metadata_bytes
    1376            0 :     }
    1377              :     /// True if a pending data batch exists and reports `has_data()`; pending metadata writes are not consulted here.
    1378            0 :     pub(crate) fn has_dirty_data(&self) -> bool {
    1379            0 :         self.pending_data_batch
    1380            0 :             .as_ref()
    1381            0 :             .is_some_and(|b| b.has_data())
    1382            0 :     }
    1383              : 
    1384              :     /// Returns statistics about the currently pending modifications: image and delta counts, kept separately for metadata pages and for the data batch.
    1385            0 :     pub(crate) fn stats(&self) -> DatadirModificationStats {
    1386            0 :         let mut stats = DatadirModificationStats::default();
    1387            0 :         for (_, _, value) in self.pending_metadata_pages.values().flatten() {
    1388            0 :             match value {
    1389            0 :                 Value::Image(_) => stats.metadata_images += 1,
    1390            0 :                 Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1, // a will_init record is counted as an image
    1391            0 :                 Value::WalRecord(_) => stats.metadata_deltas += 1,
    1392              :             }
    1393              :         }
    1394            0 :         for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
    1395            0 :             match valuemeta {
    1396            0 :                 ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
    1397            0 :                 ValueMeta::Serialized(_) => stats.data_deltas += 1,
    1398            0 :                 ValueMeta::Observed(_) => {} // observed-only entries are not counted
    1399              :             }
    1400              :         }
    1401            0 :         stats
    1402            0 :     }
    1403              : 
    1404              :     /// Set the current lsn. Errors if `lsn` is older than the current one; setting the same LSN again is a no-op.
    1405       291716 :     pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
    1406       291716 :         ensure!(
    1407       291716 :             lsn >= self.lsn,
    1408            0 :             "setting an older lsn {} than {} is not allowed",
    1409              :             lsn,
    1410              :             self.lsn
    1411              :         );
    1412              : 
    1413       291716 :         if lsn > self.lsn {
    1414       291716 :             self.pending_lsns.push(self.lsn); // remember the LSN we are moving away from
    1415       291716 :             self.lsn = lsn;
    1416       291716 :         }
    1417       291716 :         Ok(())
    1418       291716 :     }
    1419              : 
    1420              :     /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
    1421              :     /// keys that represent literal blocks that postgres can read.  So data includes relation blocks and
    1422              :     /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
    1423              :     ///
    1424              :     /// The distinction is important because data keys are handled on a fast path where dirty writes are
    1425              :     /// not readable until this modification is committed, whereas metadata keys are visible for read
    1426              :     /// via [`Self::get`] as soon as their record has been ingested.
    1427      1701248 :     fn is_data_key(key: &Key) -> bool {
    1428      1701248 :         key.is_rel_block_key() || key.is_slru_block_key() // any other key is treated as metadata
    1429      1701248 :     }
    1430              : 
    1431              :     /// Initialize a completely new repository.
    1432              :     ///
    1433              :     /// This inserts the directory metadata entries that are assumed to
    1434              :     /// always exist, and queues a `MetricsUpdate::Set(0)` for each directory kind it writes.
    1435          416 :     pub fn init_empty(&mut self) -> anyhow::Result<()> {
    1436          416 :         let buf = DbDirectory::ser(&DbDirectory {
    1437          416 :             dbdirs: HashMap::new(),
    1438          416 :         })?;
    1439          416 :         self.pending_directory_entries
    1440          416 :             .push((DirectoryKind::Db, MetricsUpdate::Set(0)));
    1441          416 :         self.put(DBDIR_KEY, Value::Image(buf.into()));
    1442              : 
    1443          416 :         let buf = if self.tline.pg_version >= 17 { // a different twophase directory struct is serialized from v17 on
    1444            0 :             TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
    1445            0 :                 xids: HashSet::new(),
    1446            0 :             })
    1447              :         } else {
    1448          416 :             TwoPhaseDirectory::ser(&TwoPhaseDirectory {
    1449          416 :                 xids: HashSet::new(),
    1450          416 :             })
    1451            0 :         }?;
    1452          416 :         self.pending_directory_entries
    1453          416 :             .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0)));
    1454          416 :         self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
    1455              : 
    1456          416 :         let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
    1457          416 :         let empty_dir = Value::Image(buf);
    1458          416 : 
    1459          416 :         // Initialize SLRUs on shard 0 only: creating these on other shards would be
    1460          416 :         // harmless but they'd just be dropped on later compaction.
    1461          416 :         if self.tline.tenant_shard_id.is_shard_zero() {
    1462          404 :             self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
    1463          404 :             self.pending_directory_entries.push((
    1464          404 :                 DirectoryKind::SlruSegment(SlruKind::Clog),
    1465          404 :                 MetricsUpdate::Set(0),
    1466          404 :             ));
    1467          404 :             self.put(
    1468          404 :                 slru_dir_to_key(SlruKind::MultiXactMembers),
    1469          404 :                 empty_dir.clone(),
    1470          404 :             );
    1471          404 :             self.pending_directory_entries.push((
    1472          404 :                 DirectoryKind::SlruSegment(SlruKind::MultiXactMembers), // must match the directory written just above (was `Clog`, a copy-paste slip)
    1473          404 :                 MetricsUpdate::Set(0),
    1474          404 :             ));
    1475          404 :             self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
    1476          404 :             self.pending_directory_entries.push((
    1477          404 :                 DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets),
    1478          404 :                 MetricsUpdate::Set(0),
    1479          404 :             ));
    1480          404 :         }
    1481              : 
    1482          416 :         Ok(())
    1483          416 :     }
    1484              :     /// Test-only initializer: runs `init_empty`, then stores placeholder control-file and checkpoint values.
    1485              :     #[cfg(test)]
    1486          412 :     pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
    1487          412 :         self.init_empty()?;
    1488          412 :         self.put_control_file(bytes::Bytes::from_static(
    1489          412 :             b"control_file contents do not matter",
    1490          412 :         ))
    1491          412 :         .context("put_control_file")?;
    1492          412 :         self.put_checkpoint(bytes::Bytes::from_static(
    1493          412 :             b"checkpoint_file contents do not matter",
    1494          412 :         ))
    1495          412 :         .context("put_checkpoint_file")?;
    1496          412 :         Ok(())
    1497          412 :     }
    1498              : 
    1499              :     /// Creates a relation if it is not already present.
    1500              :     /// Returns the current size of the relation in blocks (0 when the relation had to be created).
    1501       836112 :     pub(crate) async fn create_relation_if_required(
    1502       836112 :         &mut self,
    1503       836112 :         rel: RelTag,
    1504       836112 :         ctx: &RequestContext,
    1505       836112 :     ) -> Result<u32, PageReconstructError> {
    1506              :         // Get current size and put rel creation if rel doesn't exist
    1507              :         //
    1508              :         // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
    1509              :         //       check the cache too. This is because eagerly checking the cache results in
    1510              :         //       less work overall and 10% better performance. It's more work on cache miss
    1511              :         //       but cache miss is rare.
    1512       836112 :         if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
    1513       836092 :             Ok(nblocks)
    1514           20 :         } else if !self
    1515           20 :             .tline
    1516           20 :             .get_rel_exists(rel, Version::Modified(self), ctx) // `Version::Modified` so this modification's own pending writes are visible
    1517           20 :             .await?
    1518              :         {
    1519              :             // create it with 0 size initially; the caller is expected to extend it afterwards
    1520           20 :             self.put_rel_creation(rel, 0, ctx)
    1521           20 :                 .await
    1522           20 :                 .context("Relation Error")?;
    1523           20 :             Ok(0)
    1524              :         } else {
    1525            0 :             self.tline
    1526            0 :                 .get_rel_size(rel, Version::Modified(self), ctx)
    1527            0 :                 .await
    1528              :         }
    1529       836112 :     }
    1530              : 
    1531              :     /// Given a block number for a relation (which represents a newly written block),
    1532              :     /// the previous block count of the relation, and the shard info, find the gaps
    1533              :     /// that were created by the newly written block if any. Returns `None` when no gap block maps to this shard.
    1534       291340 :     fn find_gaps(
    1535       291340 :         rel: RelTag,
    1536       291340 :         blkno: u32,
    1537       291340 :         previous_nblocks: u32,
    1538       291340 :         shard: &ShardIdentity,
    1539       291340 :     ) -> Option<KeySpace> {
    1540       291340 :         let mut key = rel_block_to_key(rel, blkno); // template key; only the block number field is rewritten below
    1541       291340 :         let mut gap_accum = None; // created lazily so the no-gap case allocates nothing
    1542              : 
    1543       291340 :         for gap_blkno in previous_nblocks..blkno { // empty when blkno <= previous_nblocks
    1544           64 :             key.field6 = gap_blkno;
    1545           64 : 
    1546           64 :             if shard.get_shard_number(&key) != shard.number { // skip gap blocks that map to a different shard
    1547           16 :                 continue;
    1548           48 :             }
    1549           48 : 
    1550           48 :             gap_accum
    1551           48 :                 .get_or_insert_with(KeySpaceAccum::new)
    1552           48 :                 .add_key(key);
    1553              :         }
    1554              : 
    1555       291340 :         gap_accum.map(|accum| accum.to_keyspace())
    1556       291340 :     }
    1557              : 
    1558       291704 :     pub async fn ingest_batch(
    1559       291704 :         &mut self,
    1560       291704 :         mut batch: SerializedValueBatch,
    1561       291704 :         // TODO(vlad): remove this argument and replace the shard check with is_key_local
    1562       291704 :         shard: &ShardIdentity,
    1563       291704 :         ctx: &RequestContext,
    1564       291704 :     ) -> anyhow::Result<()> {
    1565       291704 :         let mut gaps_at_lsns = Vec::default();
    1566              : 
    1567       291704 :         for meta in batch.metadata.iter() {
    1568       291284 :             let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
    1569       291284 :             let new_nblocks = blkno + 1;
    1570              : 
    1571       291284 :             let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
    1572       291284 :             if new_nblocks > old_nblocks {
    1573         4780 :                 self.put_rel_extend(rel, new_nblocks, ctx).await?;
    1574       286504 :             }
    1575              : 
    1576       291284 :             if let Some(gaps) = Self::find_gaps(rel, blkno, old_nblocks, shard) {
    1577            0 :                 gaps_at_lsns.push((gaps, meta.lsn()));
    1578       291284 :             }
    1579              :         }
    1580              : 
    1581       291704 :         if !gaps_at_lsns.is_empty() {
    1582            0 :             batch.zero_gaps(gaps_at_lsns);
    1583       291704 :         }
    1584              : 
    1585       291704 :         match self.pending_data_batch.as_mut() {
    1586           40 :             Some(pending_batch) => {
    1587           40 :                 pending_batch.extend(batch);
    1588           40 :             }
    1589       291664 :             None if batch.has_data() => {
    1590       291260 :                 self.pending_data_batch = Some(batch);
    1591       291260 :             }
    1592          404 :             None => {
    1593          404 :                 // Nothing to initialize the batch with
    1594          404 :             }
    1595              :         }
    1596              : 
    1597       291704 :         Ok(())
    1598       291704 :     }
    1599              : 
    1600              :     /// Put a new page version that can be constructed from a WAL record
    1601              :     ///
    1602              :     /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
    1603              :     /// current end-of-file. It's up to the caller to check that the relation size
    1604              :     /// matches the blocks inserted!
    1605           24 :     pub fn put_rel_wal_record(
    1606           24 :         &mut self,
    1607           24 :         rel: RelTag,
    1608           24 :         blknum: BlockNumber,
    1609           24 :         rec: NeonWalRecord,
    1610           24 :     ) -> anyhow::Result<()> {
    1611           24 :         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
    1612           24 :         self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
    1613           24 :         Ok(())
    1614           24 :     }
    1615              : 
    1616              :     // Same, but for an SLRU.
    1617           16 :     pub fn put_slru_wal_record(
    1618           16 :         &mut self,
    1619           16 :         kind: SlruKind,
    1620           16 :         segno: u32,
    1621           16 :         blknum: BlockNumber,
    1622           16 :         rec: NeonWalRecord,
    1623           16 :     ) -> anyhow::Result<()> {
    1624           16 :         if !self.tline.tenant_shard_id.is_shard_zero() {
    1625            0 :             return Ok(());
    1626           16 :         }
    1627           16 : 
    1628           16 :         self.put(
    1629           16 :             slru_block_to_key(kind, segno, blknum),
    1630           16 :             Value::WalRecord(rec),
    1631           16 :         );
    1632           16 :         Ok(())
    1633           16 :     }
    1634              : 
    1635              :     /// Like put_wal_record, but with ready-made image of the page.
    1636       555684 :     pub fn put_rel_page_image(
    1637       555684 :         &mut self,
    1638       555684 :         rel: RelTag,
    1639       555684 :         blknum: BlockNumber,
    1640       555684 :         img: Bytes,
    1641       555684 :     ) -> anyhow::Result<()> {
    1642       555684 :         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
    1643       555684 :         let key = rel_block_to_key(rel, blknum);
    1644       555684 :         if !key.is_valid_key_on_write_path() {
    1645            0 :             anyhow::bail!(
    1646            0 :                 "the request contains data not supported by pageserver at {}",
    1647            0 :                 key
    1648            0 :             );
    1649       555684 :         }
    1650       555684 :         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
    1651       555684 :         Ok(())
    1652       555684 :     }
    1653              : 
    1654           12 :     pub fn put_slru_page_image(
    1655           12 :         &mut self,
    1656           12 :         kind: SlruKind,
    1657           12 :         segno: u32,
    1658           12 :         blknum: BlockNumber,
    1659           12 :         img: Bytes,
    1660           12 :     ) -> anyhow::Result<()> {
    1661           12 :         assert!(self.tline.tenant_shard_id.is_shard_zero());
    1662              : 
    1663           12 :         let key = slru_block_to_key(kind, segno, blknum);
    1664           12 :         if !key.is_valid_key_on_write_path() {
    1665            0 :             anyhow::bail!(
    1666            0 :                 "the request contains data not supported by pageserver at {}",
    1667            0 :                 key
    1668            0 :             );
    1669           12 :         }
    1670           12 :         self.put(key, Value::Image(img));
    1671           12 :         Ok(())
    1672           12 :     }
    1673              : 
    1674         5996 :     pub(crate) fn put_rel_page_image_zero(
    1675         5996 :         &mut self,
    1676         5996 :         rel: RelTag,
    1677         5996 :         blknum: BlockNumber,
    1678         5996 :     ) -> anyhow::Result<()> {
    1679         5996 :         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
    1680         5996 :         let key = rel_block_to_key(rel, blknum);
    1681         5996 :         if !key.is_valid_key_on_write_path() {
    1682            0 :             anyhow::bail!(
    1683            0 :                 "the request contains data not supported by pageserver: {} @ {}",
    1684            0 :                 key,
    1685            0 :                 self.lsn
    1686            0 :             );
    1687         5996 :         }
    1688         5996 : 
    1689         5996 :         let batch = self
    1690         5996 :             .pending_data_batch
    1691         5996 :             .get_or_insert_with(SerializedValueBatch::default);
    1692         5996 : 
    1693         5996 :         batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
    1694         5996 : 
    1695         5996 :         Ok(())
    1696         5996 :     }
    1697              : 
    1698            0 :     pub(crate) fn put_slru_page_image_zero(
    1699            0 :         &mut self,
    1700            0 :         kind: SlruKind,
    1701            0 :         segno: u32,
    1702            0 :         blknum: BlockNumber,
    1703            0 :     ) -> anyhow::Result<()> {
    1704            0 :         assert!(self.tline.tenant_shard_id.is_shard_zero());
    1705            0 :         let key = slru_block_to_key(kind, segno, blknum);
    1706            0 :         if !key.is_valid_key_on_write_path() {
    1707            0 :             anyhow::bail!(
    1708            0 :                 "the request contains data not supported by pageserver: {} @ {}",
    1709            0 :                 key,
    1710            0 :                 self.lsn
    1711            0 :             );
    1712            0 :         }
    1713            0 : 
    1714            0 :         let batch = self
    1715            0 :             .pending_data_batch
    1716            0 :             .get_or_insert_with(SerializedValueBatch::default);
    1717            0 : 
    1718            0 :         batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
    1719            0 : 
    1720            0 :         Ok(())
    1721            0 :     }
    1722              : 
    1723              :     /// Store a relmapper file (pg_filenode.map) in the repository
    1724           32 :     pub async fn put_relmap_file(
    1725           32 :         &mut self,
    1726           32 :         spcnode: Oid,
    1727           32 :         dbnode: Oid,
    1728           32 :         img: Bytes,
    1729           32 :         ctx: &RequestContext,
    1730           32 :     ) -> anyhow::Result<()> {
    1731              :         // Add it to the directory (if it doesn't exist already)
    1732           32 :         let buf = self.get(DBDIR_KEY, ctx).await?;
    1733           32 :         let mut dbdir = DbDirectory::des(&buf)?;
    1734              : 
    1735           32 :         let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
    1736           32 :         if r.is_none() || r == Some(false) {
    1737              :             // The dbdir entry didn't exist, or it contained a
    1738              :             // 'false'. The 'insert' call already updated it with
    1739              :             // 'true', now write the updated 'dbdirs' map back.
    1740           32 :             let buf = DbDirectory::ser(&dbdir)?;
    1741           32 :             self.put(DBDIR_KEY, Value::Image(buf.into()));
    1742            0 :         }
    1743           32 :         if r.is_none() {
    1744              :             // Create RelDirectory
    1745              :             // TODO: if we have fully migrated to v2, no need to create this directory
    1746           16 :             let buf = RelDirectory::ser(&RelDirectory {
    1747           16 :                 rels: HashSet::new(),
    1748           16 :             })?;
    1749           16 :             self.pending_directory_entries
    1750           16 :                 .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
    1751           16 :             if self.tline.get_rel_size_v2_enabled() {
    1752            0 :                 self.pending_directory_entries
    1753            0 :                     .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
    1754           16 :             }
    1755           16 :             self.put(
    1756           16 :                 rel_dir_to_key(spcnode, dbnode),
    1757           16 :                 Value::Image(Bytes::from(buf)),
    1758           16 :             );
    1759           16 :         }
    1760              : 
    1761           32 :         self.put(relmap_file_key(spcnode, dbnode), Value::Image(img));
    1762           32 :         Ok(())
    1763           32 :     }
    1764              : 
    1765            0 :     pub async fn put_twophase_file(
    1766            0 :         &mut self,
    1767            0 :         xid: u64,
    1768            0 :         img: Bytes,
    1769            0 :         ctx: &RequestContext,
    1770            0 :     ) -> anyhow::Result<()> {
    1771              :         // Add it to the directory entry
    1772            0 :         let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
    1773            0 :         let newdirbuf = if self.tline.pg_version >= 17 {
    1774            0 :             let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
    1775            0 :             if !dir.xids.insert(xid) {
    1776            0 :                 anyhow::bail!("twophase file for xid {} already exists", xid);
    1777            0 :             }
    1778            0 :             self.pending_directory_entries.push((
    1779            0 :                 DirectoryKind::TwoPhase,
    1780            0 :                 MetricsUpdate::Set(dir.xids.len() as u64),
    1781            0 :             ));
    1782            0 :             Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
    1783              :         } else {
    1784            0 :             let xid = xid as u32;
    1785            0 :             let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
    1786            0 :             if !dir.xids.insert(xid) {
    1787            0 :                 anyhow::bail!("twophase file for xid {} already exists", xid);
    1788            0 :             }
    1789            0 :             self.pending_directory_entries.push((
    1790            0 :                 DirectoryKind::TwoPhase,
    1791            0 :                 MetricsUpdate::Set(dir.xids.len() as u64),
    1792            0 :             ));
    1793            0 :             Bytes::from(TwoPhaseDirectory::ser(&dir)?)
    1794              :         };
    1795            0 :         self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
    1796            0 : 
    1797            0 :         self.put(twophase_file_key(xid), Value::Image(img));
    1798            0 :         Ok(())
    1799            0 :     }
    1800              : 
    1801            0 :     pub async fn set_replorigin(
    1802            0 :         &mut self,
    1803            0 :         origin_id: RepOriginId,
    1804            0 :         origin_lsn: Lsn,
    1805            0 :     ) -> anyhow::Result<()> {
    1806            0 :         let key = repl_origin_key(origin_id);
    1807            0 :         self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
    1808            0 :         Ok(())
    1809            0 :     }
    1810              : 
    1811            0 :     pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
    1812            0 :         self.set_replorigin(origin_id, Lsn::INVALID).await
    1813            0 :     }
    1814              : 
    1815          416 :     pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
    1816          416 :         self.put(CONTROLFILE_KEY, Value::Image(img));
    1817          416 :         Ok(())
    1818          416 :     }
    1819              : 
    1820          444 :     pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
    1821          444 :         self.put(CHECKPOINT_KEY, Value::Image(img));
    1822          444 :         Ok(())
    1823          444 :     }
    1824              : 
    1825            0 :     pub async fn drop_dbdir(
    1826            0 :         &mut self,
    1827            0 :         spcnode: Oid,
    1828            0 :         dbnode: Oid,
    1829            0 :         ctx: &RequestContext,
    1830            0 :     ) -> anyhow::Result<()> {
    1831            0 :         let total_blocks = self
    1832            0 :             .tline
    1833            0 :             .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
    1834            0 :             .await?;
    1835              : 
    1836              :         // Remove entry from dbdir
    1837            0 :         let buf = self.get(DBDIR_KEY, ctx).await?;
    1838            0 :         let mut dir = DbDirectory::des(&buf)?;
    1839            0 :         if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
    1840            0 :             let buf = DbDirectory::ser(&dir)?;
    1841            0 :             self.pending_directory_entries.push((
    1842            0 :                 DirectoryKind::Db,
    1843            0 :                 MetricsUpdate::Set(dir.dbdirs.len() as u64),
    1844            0 :             ));
    1845            0 :             self.put(DBDIR_KEY, Value::Image(buf.into()));
    1846              :         } else {
    1847            0 :             warn!(
    1848            0 :                 "dropped dbdir for spcnode {} dbnode {} did not exist in db directory",
    1849              :                 spcnode, dbnode
    1850              :             );
    1851              :         }
    1852              : 
    1853              :         // Update logical database size.
    1854            0 :         self.pending_nblocks -= total_blocks as i64;
    1855            0 : 
    1856            0 :         // Delete all relations and metadata files for the spcnode/dnode
    1857            0 :         self.delete(dbdir_key_range(spcnode, dbnode));
    1858            0 :         Ok(())
    1859            0 :     }
    1860              : 
    1861              :     /// Create a relation fork.
    1862              :     ///
    1863              :     /// 'nblocks' is the initial size.
    1864         3840 :     pub async fn put_rel_creation(
    1865         3840 :         &mut self,
    1866         3840 :         rel: RelTag,
    1867         3840 :         nblocks: BlockNumber,
    1868         3840 :         ctx: &RequestContext,
    1869         3840 :     ) -> Result<(), RelationError> {
    1870         3840 :         if rel.relnode == 0 {
    1871            0 :             return Err(RelationError::InvalidRelnode);
    1872         3840 :         }
    1873              :         // It's possible that this is the first rel for this db in this
    1874              :         // tablespace.  Create the reldir entry for it if so.
    1875         3840 :         let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
    1876         3840 :             .context("deserialize db")?;
    1877              : 
    1878         3840 :         let dbdir_exists =
    1879         3840 :             if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
    1880              :                 // Didn't exist. Update dbdir
    1881           16 :                 e.insert(false);
    1882           16 :                 let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
    1883           16 :                 self.pending_directory_entries.push((
    1884           16 :                     DirectoryKind::Db,
    1885           16 :                     MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
    1886           16 :                 ));
    1887           16 :                 self.put(DBDIR_KEY, Value::Image(buf.into()));
    1888           16 :                 false
    1889              :             } else {
    1890         3824 :                 true
    1891              :             };
    1892              : 
    1893         3840 :         let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
    1894         3840 :         let mut rel_dir = if !dbdir_exists {
    1895              :             // Create the RelDirectory
    1896           16 :             RelDirectory::default()
    1897              :         } else {
    1898              :             // reldir already exists, fetch it
    1899         3824 :             RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
    1900         3824 :                 .context("deserialize db")?
    1901              :         };
    1902              : 
    1903              :         // Add the new relation to the rel directory entry, and write it back
    1904         3840 :         if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
    1905            0 :             return Err(RelationError::AlreadyExists);
    1906         3840 :         }
    1907         3840 : 
    1908         3840 :         if self.tline.get_rel_size_v2_enabled() {
    1909            0 :             let sparse_rel_dir_key =
    1910            0 :                 rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
    1911              :             // check if the rel_dir_key exists in v2
    1912            0 :             let val = self
    1913            0 :                 .sparse_get(sparse_rel_dir_key, ctx)
    1914            0 :                 .await
    1915            0 :                 .map_err(|e| RelationError::Other(e.into()))?;
    1916            0 :             let val = RelDirExists::decode_option(val)
    1917            0 :                 .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
    1918            0 :             if val == RelDirExists::Exists {
    1919            0 :                 return Err(RelationError::AlreadyExists);
    1920            0 :             }
    1921            0 :             self.put(
    1922            0 :                 sparse_rel_dir_key,
    1923            0 :                 Value::Image(RelDirExists::Exists.encode()),
    1924            0 :             );
    1925            0 :             if !dbdir_exists {
    1926            0 :                 self.pending_directory_entries
    1927            0 :                     .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
    1928            0 :                 self.pending_directory_entries
    1929            0 :                     .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
    1930            0 :                 // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation.
    1931            0 :                 // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there
    1932            0 :                 // will be key not found errors if we don't create an empty one for rel_size_v2.
    1933            0 :                 self.put(
    1934            0 :                     rel_dir_key,
    1935            0 :                     Value::Image(Bytes::from(
    1936            0 :                         RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
    1937              :                     )),
    1938              :                 );
    1939            0 :             }
    1940            0 :             self.pending_directory_entries
    1941            0 :                 .push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
    1942              :         } else {
    1943         3840 :             if !dbdir_exists {
    1944           16 :                 self.pending_directory_entries
    1945           16 :                     .push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
    1946         3824 :             }
    1947         3840 :             self.pending_directory_entries
    1948         3840 :                 .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
    1949         3840 :             self.put(
    1950         3840 :                 rel_dir_key,
    1951         3840 :                 Value::Image(Bytes::from(
    1952         3840 :                     RelDirectory::ser(&rel_dir).context("serialize")?,
    1953              :                 )),
    1954              :             );
    1955              :         }
    1956              :         // Put size
    1957         3840 :         let size_key = rel_size_to_key(rel);
    1958         3840 :         let buf = nblocks.to_le_bytes();
    1959         3840 :         self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
    1960         3840 : 
    1961         3840 :         self.pending_nblocks += nblocks as i64;
    1962         3840 : 
    1963         3840 :         // Update relation size cache
    1964         3840 :         self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
    1965         3840 : 
    1966         3840 :         // Even if nblocks > 0, we don't insert any actual blocks here. That's up to the
    1967         3840 :         // caller.
    1968         3840 :         Ok(())
    1969         3840 :     }
    1970              : 
    1971              :     /// Truncate relation
    1972        12024 :     pub async fn put_rel_truncation(
    1973        12024 :         &mut self,
    1974        12024 :         rel: RelTag,
    1975        12024 :         nblocks: BlockNumber,
    1976        12024 :         ctx: &RequestContext,
    1977        12024 :     ) -> anyhow::Result<()> {
    1978        12024 :         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
    1979        12024 :         if self
    1980        12024 :             .tline
    1981        12024 :             .get_rel_exists(rel, Version::Modified(self), ctx)
    1982        12024 :             .await?
    1983              :         {
    1984        12024 :             let size_key = rel_size_to_key(rel);
    1985              :             // Fetch the old size first
    1986        12024 :             let old_size = self.get(size_key, ctx).await?.get_u32_le();
    1987        12024 : 
    1988        12024 :             // Update the entry with the new size.
    1989        12024 :             let buf = nblocks.to_le_bytes();
    1990        12024 :             self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
    1991        12024 : 
    1992        12024 :             // Update relation size cache
    1993        12024 :             self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
    1994        12024 : 
    1995        12024 :             // Update logical database size.
    1996        12024 :             self.pending_nblocks -= old_size as i64 - nblocks as i64;
    1997            0 :         }
    1998        12024 :         Ok(())
    1999        12024 :     }
    2000              : 
    2001              :     /// Extend relation
    2002              :     /// If new size is smaller, do nothing.
    2003       553360 :     pub async fn put_rel_extend(
    2004       553360 :         &mut self,
    2005       553360 :         rel: RelTag,
    2006       553360 :         nblocks: BlockNumber,
    2007       553360 :         ctx: &RequestContext,
    2008       553360 :     ) -> anyhow::Result<()> {
    2009       553360 :         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
    2010              : 
    2011              :         // Put size
    2012       553360 :         let size_key = rel_size_to_key(rel);
    2013       553360 :         let old_size = self.get(size_key, ctx).await?.get_u32_le();
    2014       553360 : 
    2015       553360 :         // only extend relation here. never decrease the size
    2016       553360 :         if nblocks > old_size {
    2017       549576 :             let buf = nblocks.to_le_bytes();
    2018       549576 :             self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
    2019       549576 : 
    2020       549576 :             // Update relation size cache
    2021       549576 :             self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
    2022       549576 : 
    2023       549576 :             self.pending_nblocks += nblocks as i64 - old_size as i64;
    2024       549576 :         }
    2025       553360 :         Ok(())
    2026       553360 :     }
    2027              : 
    2028              :     /// Drop some relations
    2029           20 :     pub(crate) async fn put_rel_drops(
    2030           20 :         &mut self,
    2031           20 :         drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
    2032           20 :         ctx: &RequestContext,
    2033           20 :     ) -> anyhow::Result<()> {
    2034           24 :         for ((spc_node, db_node), rel_tags) in drop_relations {
    2035            4 :             let dir_key = rel_dir_to_key(spc_node, db_node);
    2036            4 :             let buf = self.get(dir_key, ctx).await?;
    2037            4 :             let mut dir = RelDirectory::des(&buf)?;
    2038              : 
    2039            4 :             let mut dirty = false;
    2040            8 :             for rel_tag in rel_tags {
    2041            4 :                 let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
    2042            4 :                     self.pending_directory_entries
    2043            4 :                         .push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
    2044            4 :                     dirty = true;
    2045            4 :                     true
    2046            0 :                 } else if self.tline.get_rel_size_v2_enabled() {
    2047              :                     // The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
    2048              :                     // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
    2049              :                     // logic).
    2050            0 :                     let key =
    2051            0 :                         rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
    2052            0 :                     let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
    2053            0 :                         .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
    2054            0 :                     if val == RelDirExists::Exists {
    2055            0 :                         self.pending_directory_entries
    2056            0 :                             .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
    2057            0 :                         // put tombstone
    2058            0 :                         self.put(key, Value::Image(RelDirExists::Removed.encode()));
    2059            0 :                         // no need to set dirty to true
    2060            0 :                         true
    2061              :                     } else {
    2062            0 :                         false
    2063              :                     }
    2064              :                 } else {
    2065            0 :                     false
    2066              :                 };
    2067              : 
    2068            4 :                 if found {
    2069              :                     // update logical size
    2070            4 :                     let size_key = rel_size_to_key(rel_tag);
    2071            4 :                     let old_size = self.get(size_key, ctx).await?.get_u32_le();
    2072            4 :                     self.pending_nblocks -= old_size as i64;
    2073            4 : 
    2074            4 :                     // Remove entry from relation size cache
    2075            4 :                     self.tline.remove_cached_rel_size(&rel_tag);
    2076            4 : 
    2077            4 :                     // Delete size entry, as well as all blocks
    2078            4 :                     self.delete(rel_key_range(rel_tag));
    2079            0 :                 }
    2080              :             }
    2081              : 
    2082            4 :             if dirty {
    2083            4 :                 self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
    2084            0 :             }
    2085              :         }
    2086              : 
    2087           20 :         Ok(())
    2088           20 :     }
    2089              : 
    2090           12 :     pub async fn put_slru_segment_creation(
    2091           12 :         &mut self,
    2092           12 :         kind: SlruKind,
    2093           12 :         segno: u32,
    2094           12 :         nblocks: BlockNumber,
    2095           12 :         ctx: &RequestContext,
    2096           12 :     ) -> anyhow::Result<()> {
    2097           12 :         assert!(self.tline.tenant_shard_id.is_shard_zero());
    2098              : 
    2099              :         // Add it to the directory entry
    2100           12 :         let dir_key = slru_dir_to_key(kind);
    2101           12 :         let buf = self.get(dir_key, ctx).await?;
    2102           12 :         let mut dir = SlruSegmentDirectory::des(&buf)?;
    2103              : 
    2104           12 :         if !dir.segments.insert(segno) {
    2105            0 :             anyhow::bail!("slru segment {kind:?}/{segno} already exists");
    2106           12 :         }
    2107           12 :         self.pending_directory_entries.push((
    2108           12 :             DirectoryKind::SlruSegment(kind),
    2109           12 :             MetricsUpdate::Set(dir.segments.len() as u64),
    2110           12 :         ));
    2111           12 :         self.put(
    2112           12 :             dir_key,
    2113           12 :             Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
    2114              :         );
    2115              : 
    2116              :         // Put size
    2117           12 :         let size_key = slru_segment_size_to_key(kind, segno);
    2118           12 :         let buf = nblocks.to_le_bytes();
    2119           12 :         self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
    2120           12 : 
    2121           12 :         // even if nblocks > 0, we don't insert any actual blocks here
    2122           12 : 
    2123           12 :         Ok(())
    2124           12 :     }
    2125              : 
    2126              :     /// Extend SLRU segment
    2127            0 :     pub fn put_slru_extend(
    2128            0 :         &mut self,
    2129            0 :         kind: SlruKind,
    2130            0 :         segno: u32,
    2131            0 :         nblocks: BlockNumber,
    2132            0 :     ) -> anyhow::Result<()> {
    2133            0 :         assert!(self.tline.tenant_shard_id.is_shard_zero());
    2134              : 
    2135              :         // Put size
    2136            0 :         let size_key = slru_segment_size_to_key(kind, segno);
    2137            0 :         let buf = nblocks.to_le_bytes();
    2138            0 :         self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
    2139            0 :         Ok(())
    2140            0 :     }
    2141              : 
    2142              :     /// This method is used for marking truncated SLRU files
    2143            0 :     pub async fn drop_slru_segment(
    2144            0 :         &mut self,
    2145            0 :         kind: SlruKind,
    2146            0 :         segno: u32,
    2147            0 :         ctx: &RequestContext,
    2148            0 :     ) -> anyhow::Result<()> {
    2149            0 :         // Remove it from the directory entry
    2150            0 :         let dir_key = slru_dir_to_key(kind);
    2151            0 :         let buf = self.get(dir_key, ctx).await?;
    2152            0 :         let mut dir = SlruSegmentDirectory::des(&buf)?;
    2153              : 
    2154            0 :         if !dir.segments.remove(&segno) {
    2155            0 :             warn!("slru segment {:?}/{} does not exist", kind, segno);
    2156            0 :         }
    2157            0 :         self.pending_directory_entries.push((
    2158            0 :             DirectoryKind::SlruSegment(kind),
    2159            0 :             MetricsUpdate::Set(dir.segments.len() as u64),
    2160            0 :         ));
    2161            0 :         self.put(
    2162            0 :             dir_key,
    2163            0 :             Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
    2164              :         );
    2165              : 
    2166              :         // Delete size entry, as well as all blocks
    2167            0 :         self.delete(slru_segment_key_range(kind, segno));
    2168            0 : 
    2169            0 :         Ok(())
    2170            0 :     }
    2171              : 
    2172              :     /// Drop a relmapper file (pg_filenode.map)
    2173            0 :     pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
    2174            0 :         // TODO
    2175            0 :         Ok(())
    2176            0 :     }
    2177              : 
    2178              :     /// Drop a twophase file: remove its XID from the twophase directory entry and delete its keys
    2179            0 :     pub async fn drop_twophase_file(
    2180            0 :         &mut self,
    2181            0 :         xid: u64,
    2182            0 :         ctx: &RequestContext,
    2183            0 :     ) -> anyhow::Result<()> {
    2184              :         // Remove it from the directory entry
    2185            0 :         let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
    2186            0 :         let newdirbuf = if self.tline.pg_version >= 17 {
    2187            0 :             let mut dir = TwoPhaseDirectoryV17::des(&buf)?;
    2188              : 
    2189            0 :             if !dir.xids.remove(&xid) {
    2190            0 :                 warn!("twophase file for xid {} does not exist", xid);
    2191            0 :             }
    2192            0 :             self.pending_directory_entries.push((
    2193            0 :                 DirectoryKind::TwoPhase,
    2194            0 :                 MetricsUpdate::Set(dir.xids.len() as u64),
    2195            0 :             ));
    2196            0 :             Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
    2197              :         } else {
    2198            0 :             let xid: u32 = u32::try_from(xid)?;
    2199            0 :             let mut dir = TwoPhaseDirectory::des(&buf)?;
    2200              : 
    2201            0 :             if !dir.xids.remove(&xid) {
    2202            0 :                 warn!("twophase file for xid {} does not exist", xid);
    2203            0 :             }
    2204            0 :             self.pending_directory_entries.push((
    2205            0 :                 DirectoryKind::TwoPhase,
    2206            0 :                 MetricsUpdate::Set(dir.xids.len() as u64),
    2207            0 :             ));
    2208            0 :             Bytes::from(TwoPhaseDirectory::ser(&dir)?)
    2209              :         };
    2210            0 :         self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
    2211            0 : 
    2212            0 :         // Delete it
    2213            0 :         self.delete(twophase_key_range(xid));
    2214            0 : 
    2215            0 :         Ok(())
    2216            0 :     }
    2217              : 
    2218           32 :     pub async fn put_file(
    2219           32 :         &mut self,
    2220           32 :         path: &str,
    2221           32 :         content: &[u8],
    2222           32 :         ctx: &RequestContext,
    2223           32 :     ) -> anyhow::Result<()> {
    2224           32 :         let key = aux_file::encode_aux_file_key(path);
    2225              :         // retrieve the key from the engine
    2226           32 :         let old_val = match self.get(key, ctx).await {
    2227            8 :             Ok(val) => Some(val),
    2228           24 :             Err(PageReconstructError::MissingKey(_)) => None,
    2229            0 :             Err(e) => return Err(e.into()),
    2230              :         };
    2231           32 :         let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
    2232            8 :             aux_file::decode_file_value(old_val)?
    2233              :         } else {
    2234           24 :             Vec::new()
    2235              :         };
    2236           32 :         let mut other_files = Vec::with_capacity(files.len());
    2237           32 :         let mut modifying_file = None;
    2238           40 :         for file @ (p, content) in files {
    2239            8 :             if path == p {
    2240            8 :                 assert!(
    2241            8 :                     modifying_file.is_none(),
    2242            0 :                     "duplicated entries found for {}",
    2243              :                     path
    2244              :                 );
    2245            8 :                 modifying_file = Some(content);
    2246            0 :             } else {
    2247            0 :                 other_files.push(file);
    2248            0 :             }
    2249              :         }
    2250           32 :         let mut new_files = other_files;
    2251           32 :         match (modifying_file, content.is_empty()) {
    2252            4 :             (Some(old_content), false) => {
    2253            4 :                 self.tline
    2254            4 :                     .aux_file_size_estimator
    2255            4 :                     .on_update(old_content.len(), content.len());
    2256            4 :                 new_files.push((path, content));
    2257            4 :             }
    2258            4 :             (Some(old_content), true) => {
    2259            4 :                 self.tline
    2260            4 :                     .aux_file_size_estimator
    2261            4 :                     .on_remove(old_content.len());
    2262            4 :                 // not adding the file key to the final `new_files` vec.
    2263            4 :             }
    2264           24 :             (None, false) => {
    2265           24 :                 self.tline.aux_file_size_estimator.on_add(content.len());
    2266           24 :                 new_files.push((path, content));
    2267           24 :             }
    2268              :             // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit.
    2269              :             // Compute doesn't know if previous version of this file exists or not, so
    2270              :             // attempt to delete non-existing file can cause this message.
    2271              :             // To avoid false alarms, log it as info rather than warning.
    2272            0 :             (None, true) if path.starts_with("pg_stat/") => {
    2273            0 :                 info!("removing non-existing pg_stat file: {}", path)
    2274              :             }
    2275            0 :             (None, true) => warn!("removing non-existing aux file: {}", path),
    2276              :         }
    2277           32 :         let new_val = aux_file::encode_file_value(&new_files)?;
    2278           32 :         self.put(key, Value::Image(new_val.into()));
    2279           32 : 
    2280           32 :         Ok(())
    2281           32 :     }
    2282              : 
    2283              :     ///
    2284              :     /// Flush changes accumulated so far to the underlying repository.
    2285              :     ///
    2286              :     /// Usually, changes made in DatadirModification are atomic, but this allows
    2287              :     /// you to flush them to the underlying repository before the final `commit`.
    2288              :     /// That allows freeing up the memory used to hold the pending changes.
    2289              :     ///
    2290              :     /// Currently only used during bulk import of a data directory. In that
    2291              :     /// context, breaking the atomicity is OK. If the import is interrupted, the
    2292              :     /// whole import fails and the timeline will be deleted anyway.
    2293              :     /// (Or to be precise, it will be left behind for debugging purposes and
    2294              :     /// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
    2295              :     ///
    2296              :     /// Note: A consequence of flushing the pending operations is that they
    2297              :     /// won't be visible to subsequent operations until `commit`. The function
    2298              :     /// retains all the metadata, but data pages are flushed. That's again OK
    2299              :     /// for bulk import, where you are just loading data pages and won't try to
    2300              :     /// modify the same pages twice.
    2301         3860 :     pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
    2302         3860 :         // Unless we have accumulated a decent amount of changes, it's not worth it
    2303         3860 :         // to scan through the pending_updates list.
    2304         3860 :         let pending_nblocks = self.pending_nblocks;
    2305         3860 :         if pending_nblocks < 10000 {
    2306         3860 :             return Ok(());
    2307            0 :         }
    2308              : 
    2309            0 :         let mut writer = self.tline.writer().await;
    2310              : 
    2311              :         // Flush relation and SLRU data blocks, keep metadata.
    2312            0 :         if let Some(batch) = self.pending_data_batch.take() {
    2313            0 :             tracing::debug!(
    2314            0 :                 "Flushing batch with max_lsn={}. Last record LSN is {}",
    2315            0 :                 batch.max_lsn,
    2316            0 :                 self.tline.get_last_record_lsn()
    2317              :             );
    2318              : 
    2319              :             // This bails out on first error without modifying pending_updates.
    2320              :             // That's Ok, cf this function's doc comment.
    2321            0 :             writer.put_batch(batch, ctx).await?;
    2322            0 :         }
    2323              : 
    2324            0 :         if pending_nblocks != 0 {
    2325            0 :             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
    2326            0 :             self.pending_nblocks = 0;
    2327            0 :         }
    2328              : 
    2329            0 :         for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
    2330            0 :             writer.update_directory_entries_count(kind, count);
    2331            0 :         }
    2332              : 
    2333            0 :         Ok(())
    2334         3860 :     }
    2335              : 
    2336              :     ///
    2337              :     /// Finish this atomic update, writing all the updated keys to the
    2338              :     /// underlying timeline.
    2339              :     /// All the modifications in this atomic update are stamped by the specified LSN.
    2340              :     ///
    2341      1486188 :     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
    2342      1486188 :         let mut writer = self.tline.writer().await;
    2343              : 
    2344      1486188 :         let pending_nblocks = self.pending_nblocks;
    2345      1486188 :         self.pending_nblocks = 0;
    2346              : 
    2347              :         // Ordering: the items in this batch do not need to be in any global order, but values for
    2348              :         // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
    2349              :         // this to do efficient updates to its index.  See [`wal_decoder::serialized_batch`] for
    2350              :         // more details.
    2351              : 
    2352      1486188 :         let metadata_batch = {
    2353      1486188 :             let pending_meta = self
    2354      1486188 :                 .pending_metadata_pages
    2355      1486188 :                 .drain()
    2356      1486188 :                 .flat_map(|(key, values)| {
    2357       548036 :                     values
    2358       548036 :                         .into_iter()
    2359       548036 :                         .map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
    2360      1486188 :                 })
    2361      1486188 :                 .collect::<Vec<_>>();
    2362      1486188 : 
    2363      1486188 :             if pending_meta.is_empty() {
    2364       944556 :                 None
    2365              :             } else {
    2366       541632 :                 Some(SerializedValueBatch::from_values(pending_meta))
    2367              :             }
    2368              :         };
    2369              : 
    2370      1486188 :         let data_batch = self.pending_data_batch.take();
    2371              : 
    2372      1486188 :         let maybe_batch = match (data_batch, metadata_batch) {
    2373       529112 :             (Some(mut data), Some(metadata)) => {
    2374       529112 :                 data.extend(metadata);
    2375       529112 :                 Some(data)
    2376              :             }
    2377       286524 :             (Some(data), None) => Some(data),
    2378        12520 :             (None, Some(metadata)) => Some(metadata),
    2379       658032 :             (None, None) => None,
    2380              :         };
    2381              : 
    2382      1486188 :         if let Some(batch) = maybe_batch {
    2383       828156 :             tracing::debug!(
    2384            0 :                 "Flushing batch with max_lsn={}. Last record LSN is {}",
    2385            0 :                 batch.max_lsn,
    2386            0 :                 self.tline.get_last_record_lsn()
    2387              :             );
    2388              : 
    2389              :             // This bails out on first error without modifying pending_updates.
    2390              :             // That's Ok, cf this function's doc comment.
    2391       828156 :             writer.put_batch(batch, ctx).await?;
    2392       658032 :         }
    2393              : 
    2394      1486188 :         if !self.pending_deletions.is_empty() {
    2395            4 :             writer.delete_batch(&self.pending_deletions, ctx).await?;
    2396            4 :             self.pending_deletions.clear();
    2397      1486184 :         }
    2398              : 
    2399      1486188 :         self.pending_lsns.push(self.lsn);
    2400      1777904 :         for pending_lsn in self.pending_lsns.drain(..) {
    2401      1777904 :             // TODO(vlad): pretty sure the comment below is not valid anymore
    2402      1777904 :             // and we can call finish write with the latest LSN
    2403      1777904 :             //
    2404      1777904 :             // Ideally, we should be able to call writer.finish_write() only once
    2405      1777904 :             // with the highest LSN. However, the last_record_lsn variable in the
    2406      1777904 :             // timeline keeps track of the latest LSN and the immediate previous LSN
    2407      1777904 :             // so we need to record every LSN to not leave a gap between them.
    2408      1777904 :             writer.finish_write(pending_lsn);
    2409      1777904 :         }
    2410              : 
    2411      1486188 :         if pending_nblocks != 0 {
    2412       541140 :             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
    2413       945048 :         }
    2414              : 
    2415      1486188 :         for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
    2416         5948 :             writer.update_directory_entries_count(kind, count);
    2417         5948 :         }
    2418              : 
    2419      1486188 :         self.pending_metadata_bytes = 0;
    2420      1486188 : 
    2421      1486188 :         Ok(())
    2422      1486188 :     }
    2423              : 
    2424       583408 :     pub(crate) fn len(&self) -> usize {
    2425       583408 :         self.pending_metadata_pages.len()
    2426       583408 :             + self.pending_data_batch.as_ref().map_or(0, |b| b.len())
    2427       583408 :             + self.pending_deletions.len()
    2428       583408 :     }
    2429              : 
    2430              :     /// Read a page from the Timeline we are writing to.  For metadata pages, this passes through
    2431              :     /// a cache in Self, which makes writes earlier in this modification visible to WAL records later
    2432              :     /// in the modification.
    2433              :     ///
    2434              :     /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data
    2435              :     /// page must ensure that the pages they read are already committed in Timeline, for example
    2436              :     /// DB create operations are always preceded by a call to commit().  This is special cased because
    2437              :     /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes,
    2438              :     /// and not data pages.
    2439       573172 :     async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
    2440       573172 :         if !Self::is_data_key(&key) {
    2441              :             // Have we already updated the same key? Read the latest pending updated
    2442              :             // version in that case.
    2443              :             //
    2444              :             // Note: we don't check pending_deletions. It is an error to request a
    2445              :             // value that has been removed, deletion only avoids leaking storage.
    2446       573172 :             if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) {
    2447        31856 :                 if let Some((_, _, value)) = values.last() {
    2448        31856 :                     return if let Value::Image(img) = value {
    2449        31856 :                         Ok(img.clone())
    2450              :                     } else {
    2451              :                         // Currently, we never need to read back a WAL record that we
    2452              :                         // inserted in the same "transaction". All the metadata updates
    2453              :                         // work directly with Images, and we never need to read actual
    2454              :                         // data pages. We could handle this if we had to, by calling
    2455              :                         // the walredo manager, but let's keep it simple for now.
    2456            0 :                         Err(PageReconstructError::Other(anyhow::anyhow!(
    2457            0 :                             "unexpected pending WAL record"
    2458            0 :                         )))
    2459              :                     };
    2460            0 :                 }
    2461       541316 :             }
    2462              :         } else {
    2463              :             // This is an expensive check, so we only do it in debug mode. If reading a data key,
    2464              :             // this key should never be present in pending_data_batch. We ensure this by committing
    2465              :             // modifications before ingesting DB create operations, which are the only kind that reads
    2466              :             // data pages during ingest.
    2467            0 :             if cfg!(debug_assertions) {
    2468            0 :                 assert!(
    2469            0 :                     !self
    2470            0 :                         .pending_data_batch
    2471            0 :                         .as_ref()
    2472            0 :                         .is_some_and(|b| b.updates_key(&key))
    2473            0 :                 );
    2474            0 :             }
    2475              :         }
    2476              : 
    2477              :         // Metadata page cache miss, or we're reading a data page.
    2478       541316 :         let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
    2479       541316 :         self.tline.get(key, lsn, ctx).await
    2480       573172 :     }
    2481              : 
    2482              :     /// Get a key from the sparse keyspace. Automatically converts the missing key error
    2483              :     /// and the empty value into None.
    2484            0 :     async fn sparse_get(
    2485            0 :         &self,
    2486            0 :         key: Key,
    2487            0 :         ctx: &RequestContext,
    2488            0 :     ) -> Result<Option<Bytes>, PageReconstructError> {
    2489            0 :         let val = self.get(key, ctx).await;
    2490            0 :         match val {
    2491            0 :             Ok(val) if val.is_empty() => Ok(None),
    2492            0 :             Ok(val) => Ok(Some(val)),
    2493            0 :             Err(PageReconstructError::MissingKey(_)) => Ok(None),
    2494            0 :             Err(e) => Err(e),
    2495              :         }
    2496            0 :     }
    2497              : 
    2498      1128076 :     fn put(&mut self, key: Key, val: Value) {
    2499      1128076 :         if Self::is_data_key(&key) {
    2500       555736 :             self.put_data(key.to_compact(), val)
    2501              :         } else {
    2502       572340 :             self.put_metadata(key.to_compact(), val)
    2503              :         }
    2504      1128076 :     }
    2505              : 
    2506       555736 :     fn put_data(&mut self, key: CompactKey, val: Value) {
    2507       555736 :         let batch = self
    2508       555736 :             .pending_data_batch
    2509       555736 :             .get_or_insert_with(SerializedValueBatch::default);
    2510       555736 :         batch.put(key, val, self.lsn);
    2511       555736 :     }
    2512              : 
    2513       572340 :     fn put_metadata(&mut self, key: CompactKey, val: Value) {
    2514       572340 :         let values = self.pending_metadata_pages.entry(key).or_default();
    2515              :         // Replace the previous value if it exists at the same lsn
    2516       572340 :         if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
    2517        24304 :             if *last_lsn == self.lsn {
    2518              :                 // Update the pending_metadata_bytes contribution from this entry, and update the serialized size in place
    2519        24304 :                 self.pending_metadata_bytes -= *last_value_ser_size;
    2520        24304 :                 *last_value_ser_size = val.serialized_size().unwrap() as usize;
    2521        24304 :                 self.pending_metadata_bytes += *last_value_ser_size;
    2522        24304 : 
    2523        24304 :                 // Use the latest value, this replaces any earlier write to the same (key,lsn), such as might
    2524        24304 :                 // have been generated by synthesized zero page writes prior to the first real write to a page.
    2525        24304 :                 *last_value = val;
    2526        24304 :                 return;
    2527            0 :             }
    2528       548036 :         }
    2529              : 
    2530       548036 :         let val_serialized_size = val.serialized_size().unwrap() as usize;
    2531       548036 :         self.pending_metadata_bytes += val_serialized_size;
    2532       548036 :         values.push((self.lsn, val_serialized_size, val));
    2533       548036 : 
    2534       548036 :         if key == CHECKPOINT_KEY.to_compact() {
    2535          444 :             tracing::debug!("Checkpoint key added to pending with size {val_serialized_size}");
    2536       547592 :         }
    2537       572340 :     }
    2538              : 
    2539            4 :     fn delete(&mut self, key_range: Range<Key>) {
    2540            4 :         trace!("DELETE {}-{}", key_range.start, key_range.end);
    2541            4 :         self.pending_deletions.push((key_range, self.lsn));
    2542            4 :     }
    2543              : }
    2544              : 
    2545              : /// Statistics for a DatadirModification.
    2546              : #[derive(Default)]
    2547              : pub struct DatadirModificationStats {
    2548              :     pub metadata_images: u64,
    2549              :     pub metadata_deltas: u64,
    2550              :     pub data_images: u64,
    2551              :     pub data_deltas: u64,
    2552              : }
    2553              : 
    2554              : /// This struct facilitates accessing either a committed key from the timeline at a
    2555              : /// specific LSN, or the latest uncommitted key from a pending modification.
    2556              : ///
    2557              : /// During WAL ingestion, the records from multiple LSNs may be batched in the same
    2558              : /// modification before being flushed to the timeline. Hence, the routines in WalIngest
    2559              : /// need to look up the keys in the modification first before looking them up in the
    2560              : /// timeline to not miss the latest updates.
    2561              : #[derive(Clone, Copy)]
    2562              : pub enum Version<'a> {
    2563              :     Lsn(Lsn),
    2564              :     Modified(&'a DatadirModification<'a>),
    2565              : }
    2566              : 
    2567              : impl Version<'_> {
    2568        10352 :     async fn get(
    2569        10352 :         &self,
    2570        10352 :         timeline: &Timeline,
    2571        10352 :         key: Key,
    2572        10352 :         ctx: &RequestContext,
    2573        10352 :     ) -> Result<Bytes, PageReconstructError> {
    2574        10352 :         match self {
    2575        10312 :             Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
    2576           40 :             Version::Modified(modification) => modification.get(key, ctx).await,
    2577              :         }
    2578        10352 :     }
    2579              : 
    2580              :     /// Get a key from the sparse keyspace. Automatically converts the missing key error
    2581              :     /// and the empty value into None.
    2582            0 :     async fn sparse_get(
    2583            0 :         &self,
    2584            0 :         timeline: &Timeline,
    2585            0 :         key: Key,
    2586            0 :         ctx: &RequestContext,
    2587            0 :     ) -> Result<Option<Bytes>, PageReconstructError> {
    2588            0 :         let val = self.get(timeline, key, ctx).await;
    2589            0 :         match val {
    2590            0 :             Ok(val) if val.is_empty() => Ok(None),
    2591            0 :             Ok(val) => Ok(Some(val)),
    2592            0 :             Err(PageReconstructError::MissingKey(_)) => Ok(None),
    2593            0 :             Err(e) => Err(e),
    2594              :         }
    2595            0 :     }
    2596              : 
    2597        71240 :     fn get_lsn(&self) -> Lsn {
    2598        71240 :         match self {
    2599        59148 :             Version::Lsn(lsn) => *lsn,
    2600        12092 :             Version::Modified(modification) => modification.lsn,
    2601              :         }
    2602        71240 :     }
    2603              : }
    2604              : 
    2605              : //--- Metadata structs stored in key-value pairs in the repository.
    2606              : 
    2607            0 : #[derive(Debug, Serialize, Deserialize)]
    2608              : pub(crate) struct DbDirectory {
    2609              :     // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist)
    2610              :     pub(crate) dbdirs: HashMap<(Oid, Oid), bool>,
    2611              : }
    2612              : 
    2613              : // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
    2614              : // pg_twophase files were expanded from 32-bit XIDs to 64-bit XIDs.  Previously, the files
    2615              : // were named like "pg_twophase/000002E5", now they're like
    2616              : // "pg_twophase/0000000A000002E4".
    2617              : 
    2618            0 : #[derive(Debug, Serialize, Deserialize)]
    2619              : pub(crate) struct TwoPhaseDirectory {
    2620              :     pub(crate) xids: HashSet<TransactionId>,
    2621              : }
    2622              : 
    2623            0 : #[derive(Debug, Serialize, Deserialize)]
    2624              : struct TwoPhaseDirectoryV17 {
    2625              :     xids: HashSet<u64>,
    2626              : }
    2627              : 
    2628            0 : #[derive(Debug, Serialize, Deserialize, Default)]
    2629              : pub(crate) struct RelDirectory {
    2630              :     // Set of relations that exist. (relfilenode, forknum)
    2631              :     //
    2632              :     // TODO: Store it as a btree or radix tree or something else that spans multiple
    2633              :     // key-value pairs, if you have a lot of relations
    2634              :     pub(crate) rels: HashSet<(Oid, u8)>,
    2635              : }
    2636              : 
    2637            0 : #[derive(Debug, Serialize, Deserialize)]
    2638              : struct RelSizeEntry {
    2639              :     nblocks: u32,
    2640              : }
    2641              : 
    2642            0 : #[derive(Debug, Serialize, Deserialize, Default)]
    2643              : pub(crate) struct SlruSegmentDirectory {
    2644              :     // Set of SLRU segments that exist.
    2645              :     pub(crate) segments: HashSet<u32>,
    2646              : }
    2647              : 
    2648              : #[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
    2649              : #[repr(u8)]
    2650              : pub(crate) enum DirectoryKind {
    2651              :     Db,
    2652              :     TwoPhase,
    2653              :     Rel,
    2654              :     AuxFiles,
    2655              :     SlruSegment(SlruKind),
    2656              :     RelV2,
    2657              : }
    2658              : 
    2659              : impl DirectoryKind {
    2660              :     pub(crate) const KINDS_NUM: usize = <DirectoryKind as Enum>::LENGTH;
    2661        17848 :     pub(crate) fn offset(&self) -> usize {
    2662        17848 :         self.into_usize()
    2663        17848 :     }
    2664              : }
    2665              : 
    2666              : static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
    2667              : 
    2668              : #[allow(clippy::bool_assert_comparison)]
    2669              : #[cfg(test)]
    2670              : mod tests {
    2671              :     use hex_literal::hex;
    2672              :     use pageserver_api::models::ShardParameters;
    2673              :     use pageserver_api::shard::ShardStripeSize;
    2674              :     use utils::id::TimelineId;
    2675              :     use utils::shard::{ShardCount, ShardNumber};
    2676              : 
    2677              :     use super::*;
    2678              :     use crate::DEFAULT_PG_VERSION;
    2679              :     use crate::tenant::harness::TenantHarness;
    2680              : 
    2681              :     /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline
    2682              :     #[tokio::test]
    2683            4 :     async fn aux_files_round_trip() -> anyhow::Result<()> {
    2684            4 :         let name = "aux_files_round_trip";
    2685            4 :         let harness = TenantHarness::create(name).await?;
    2686            4 : 
    2687            4 :         pub const TIMELINE_ID: TimelineId =
    2688            4 :             TimelineId::from_array(hex!("11223344556677881122334455667788"));
    2689            4 : 
    2690            4 :         let (tenant, ctx) = harness.load().await;
    2691            4 :         let tline = tenant
    2692            4 :             .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
    2693            4 :             .await?;
    2694            4 :         let tline = tline.raw_timeline().unwrap();
    2695            4 : 
    2696            4 :         // First modification: insert two keys
    2697            4 :         let mut modification = tline.begin_modification(Lsn(0x1000));
    2698            4 :         modification.put_file("foo/bar1", b"content1", &ctx).await?;
    2699            4 :         modification.set_lsn(Lsn(0x1008))?;
    2700            4 :         modification.put_file("foo/bar2", b"content2", &ctx).await?;
    2701            4 :         modification.commit(&ctx).await?;
    2702            4 :         let expect_1008 = HashMap::from([
    2703            4 :             ("foo/bar1".to_string(), Bytes::from_static(b"content1")),
    2704            4 :             ("foo/bar2".to_string(), Bytes::from_static(b"content2")),
    2705            4 :         ]);
    2706            4 : 
    2707            4 :         let io_concurrency = IoConcurrency::spawn_for_test();
    2708            4 : 
    2709            4 :         let readback = tline
    2710            4 :             .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone())
    2711            4 :             .await?;
    2712            4 :         assert_eq!(readback, expect_1008);
    2713            4 : 
    2714            4 :         // Second modification: update one key, remove the other
    2715            4 :         let mut modification = tline.begin_modification(Lsn(0x2000));
    2716            4 :         modification.put_file("foo/bar1", b"content3", &ctx).await?;
    2717            4 :         modification.set_lsn(Lsn(0x2008))?;
    2718            4 :         modification.put_file("foo/bar2", b"", &ctx).await?;
    2719            4 :         modification.commit(&ctx).await?;
    2720            4 :         let expect_2008 =
    2721            4 :             HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
    2722            4 : 
    2723            4 :         let readback = tline
    2724            4 :             .list_aux_files(Lsn(0x2008), &ctx, io_concurrency.clone())
    2725            4 :             .await?;
    2726            4 :         assert_eq!(readback, expect_2008);
    2727            4 : 
    2728            4 :         // Reading back in time works
    2729            4 :         let readback = tline
    2730            4 :             .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone())
    2731            4 :             .await?;
    2732            4 :         assert_eq!(readback, expect_1008);
    2733            4 : 
    2734            4 :         Ok(())
    2735            4 :     }
    2736              : 
    2737              :     #[test]
    2738            4 :     fn gap_finding() {
    2739            4 :         let rel = RelTag {
    2740            4 :             spcnode: 1663,
    2741            4 :             dbnode: 208101,
    2742            4 :             relnode: 2620,
    2743            4 :             forknum: 0,
    2744            4 :         };
    2745            4 :         let base_blkno = 1;
    2746            4 : 
    2747            4 :         let base_key = rel_block_to_key(rel, base_blkno);
    2748            4 :         let before_base_key = rel_block_to_key(rel, base_blkno - 1);
    2749            4 : 
    2750            4 :         let shard = ShardIdentity::unsharded();
    2751            4 : 
    2752            4 :         let mut previous_nblocks = 0;
    2753           44 :         for i in 0..10 {
    2754           40 :             let crnt_blkno = base_blkno + i;
    2755           40 :             let gaps = DatadirModification::find_gaps(rel, crnt_blkno, previous_nblocks, &shard);
    2756           40 : 
    2757           40 :             previous_nblocks = crnt_blkno + 1;
    2758           40 : 
    2759           40 :             if i == 0 {
    2760              :                 // The first block we write is 1, so we should find the gap.
    2761            4 :                 assert_eq!(gaps.unwrap(), KeySpace::single(before_base_key..base_key));
    2762              :             } else {
    2763           36 :                 assert!(gaps.is_none());
    2764              :             }
    2765              :         }
    2766              : 
    2767              :         // This is an update to an already existing block. No gaps here.
    2768            4 :         let update_blkno = 5;
    2769            4 :         let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
    2770            4 :         assert!(gaps.is_none());
    2771              : 
    2772              :         // This is an update past the current end block.
    2773            4 :         let after_gap_blkno = 20;
    2774            4 :         let gaps = DatadirModification::find_gaps(rel, after_gap_blkno, previous_nblocks, &shard);
    2775            4 : 
    2776            4 :         let gap_start_key = rel_block_to_key(rel, previous_nblocks);
    2777            4 :         let after_gap_key = rel_block_to_key(rel, after_gap_blkno);
    2778            4 :         assert_eq!(
    2779            4 :             gaps.unwrap(),
    2780            4 :             KeySpace::single(gap_start_key..after_gap_key)
    2781            4 :         );
    2782            4 :     }
    2783              : 
    2784              :     #[test]
    2785            4 :     fn sharded_gap_finding() {
    2786            4 :         let rel = RelTag {
    2787            4 :             spcnode: 1663,
    2788            4 :             dbnode: 208101,
    2789            4 :             relnode: 2620,
    2790            4 :             forknum: 0,
    2791            4 :         };
    2792            4 : 
    2793            4 :         let first_blkno = 6;
    2794            4 : 
    2795            4 :         // This shard will get the even blocks
    2796            4 :         let shard = ShardIdentity::from_params(
    2797            4 :             ShardNumber(0),
    2798            4 :             &ShardParameters {
    2799            4 :                 count: ShardCount(2),
    2800            4 :                 stripe_size: ShardStripeSize(1),
    2801            4 :             },
    2802            4 :         );
    2803            4 : 
    2804            4 :         // Only keys belonging to this shard are considered as gaps.
    2805            4 :         let mut previous_nblocks = 0;
    2806            4 :         let gaps =
    2807            4 :             DatadirModification::find_gaps(rel, first_blkno, previous_nblocks, &shard).unwrap();
    2808            4 :         assert!(!gaps.ranges.is_empty());
    2809           12 :         for gap_range in gaps.ranges {
    2810            8 :             let mut k = gap_range.start;
    2811           16 :             while k != gap_range.end {
    2812            8 :                 assert_eq!(shard.get_shard_number(&k), shard.number);
    2813            8 :                 k = k.next();
    2814              :             }
    2815              :         }
    2816              : 
    2817            4 :         previous_nblocks = first_blkno;
    2818            4 : 
    2819            4 :         let update_blkno = 2;
    2820            4 :         let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
    2821            4 :         assert!(gaps.is_none());
    2822            4 :     }
    2823              : 
    2824              :     /*
    2825              :         fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
    2826              :             let incremental = timeline.get_current_logical_size();
    2827              :             let non_incremental = timeline
    2828              :                 .get_current_logical_size_non_incremental(lsn)
    2829              :                 .unwrap();
    2830              :             assert_eq!(incremental, non_incremental);
    2831              :         }
    2832              :     */
    2833              : 
    2834              :     /*
    2835              :     ///
    2836              :     /// Test list_rels() function, with branches and dropped relations
    2837              :     ///
    2838              :     #[test]
    2839              :     fn test_list_rels_drop() -> Result<()> {
    2840              :         let repo = RepoHarness::create("test_list_rels_drop")?.load();
    2841              :         let tline = create_empty_timeline(repo, TIMELINE_ID)?;
    2842              :         const TESTDB: u32 = 111;
    2843              : 
    2844              :         // Import initial dummy checkpoint record, otherwise the get_timeline() call
    2845              :         // after branching fails below
    2846              :         let mut writer = tline.begin_record(Lsn(0x10));
    2847              :         writer.put_checkpoint(ZERO_CHECKPOINT.clone())?;
    2848              :         writer.finish()?;
    2849              : 
    2850              :         // Create a relation on the timeline
    2851              :         let mut writer = tline.begin_record(Lsn(0x20));
    2852              :         writer.put_rel_page_image(TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
    2853              :         writer.finish()?;
    2854              : 
    2855              :         let writer = tline.begin_record(Lsn(0x00));
    2856              :         writer.finish()?;
    2857              : 
    2858              :         // Check that list_rels() lists it after LSN 2, but not before it
    2859              :         assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A));
    2860              :         assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A));
    2861              :         assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));
    2862              : 
    2863              :         // Create a branch, check that the relation is visible there
    2864              :         repo.branch_timeline(&tline, NEW_TIMELINE_ID, Lsn(0x30))?;
    2865              :         let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
    2866              :             Some(timeline) => timeline,
    2867              :             None => panic!("Should have a local timeline"),
    2868              :         };
    2869              :         let newtline = DatadirTimelineImpl::new(newtline);
    2870              :         assert!(newtline
    2871              :             .list_rels(0, TESTDB, Lsn(0x30))?
    2872              :             .contains(&TESTREL_A));
    2873              : 
    2874              :         // Drop it on the branch
    2875              :         let mut new_writer = newtline.begin_record(Lsn(0x40));
    2876              :         new_writer.drop_relation(TESTREL_A)?;
    2877              :         new_writer.finish()?;
    2878              : 
    2879              :         // Check that it's no longer listed on the branch after the point where it was dropped
    2880              :         assert!(newtline
    2881              :             .list_rels(0, TESTDB, Lsn(0x30))?
    2882              :             .contains(&TESTREL_A));
    2883              :         assert!(!newtline
    2884              :             .list_rels(0, TESTDB, Lsn(0x40))?
    2885              :             .contains(&TESTREL_A));
    2886              : 
    2887              :         // Run checkpoint and garbage collection and check that it's still not visible
    2888              :         newtline.checkpoint(CheckpointConfig::Forced)?;
    2889              :         repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
    2890              : 
    2891              :         assert!(!newtline
    2892              :             .list_rels(0, TESTDB, Lsn(0x40))?
    2893              :             .contains(&TESTREL_A));
    2894              : 
    2895              :         Ok(())
    2896              :     }
    2897              :      */
    2898              : 
    2899              :     /*
    2900              :     #[test]
    2901              :     fn test_read_beyond_eof() -> Result<()> {
    2902              :         let repo = RepoHarness::create("test_read_beyond_eof")?.load();
    2903              :         let tline = create_test_timeline(repo, TIMELINE_ID)?;
    2904              : 
    2905              :         make_some_layers(&tline, Lsn(0x20))?;
    2906              :         let mut writer = tline.begin_record(Lsn(0x60));
    2907              :         walingest.put_rel_page_image(
    2908              :             &mut writer,
    2909              :             TESTREL_A,
    2910              :             0,
    2911              :             TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x60))),
    2912              :         )?;
    2913              :         writer.finish()?;
    2914              : 
    2915              :         // Test read before rel creation. Should error out.
    2916              :         assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10), false).is_err());
    2917              : 
    2918              :         // Read block beyond end of relation at different points in time.
    2919              :         // These reads should fall into different delta, image, and in-memory layers.
    2920              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20), false)?, ZERO_PAGE);
    2921              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25), false)?, ZERO_PAGE);
    2922              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30), false)?, ZERO_PAGE);
    2923              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35), false)?, ZERO_PAGE);
    2924              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, ZERO_PAGE);
    2925              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45), false)?, ZERO_PAGE);
    2926              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, ZERO_PAGE);
    2927              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55), false)?, ZERO_PAGE);
    2928              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, ZERO_PAGE);
    2929              : 
    2930              :         // Test on an in-memory layer with no preceding layer
    2931              :         let mut writer = tline.begin_record(Lsn(0x70));
    2932              :         walingest.put_rel_page_image(
    2933              :             &mut writer,
    2934              :             TESTREL_B,
    2935              :             0,
    2936              :             TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))),
    2937              :         )?;
    2938              :         writer.finish()?;
    2939              : 
    2940              :         assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70), false)?, ZERO_PAGE);
    2941              : 
    2942              :         Ok(())
    2943              :     }
    2944              :      */
    2945              : }
        

Generated by: LCOV version 2.1-beta