LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - inmemory_layer.rs (source / functions)
Test:       472031e0b71f3195f7f21b1f2b20de09fd07bb56.info
Test Date:  2025-05-26 10:37:33
            Coverage    Total    Hit
Lines:        90.8 %      530    481
Functions:    83.7 %       49     41

            Line data    Source code
       1              : //! An in-memory layer stores recently received key-value pairs.
       2              : //!
       3              : //! The "in-memory" part of the name is a bit misleading: the actual page versions are
       4              : //! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
       5              : //! its position in the file, is kept in memory, though.
       6              : //!
       7              : use std::cmp::Ordering;
       8              : use std::collections::{BTreeMap, HashMap};
       9              : use std::fmt::Write;
      10              : use std::ops::Range;
      11              : use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering};
      12              : use std::sync::{Arc, OnceLock};
      13              : use std::time::Instant;
      14              : 
      15              : use anyhow::Result;
      16              : use camino::Utf8PathBuf;
      17              : use pageserver_api::key::{CompactKey, Key};
      18              : use pageserver_api::keyspace::KeySpace;
      19              : use pageserver_api::models::InMemoryLayerInfo;
      20              : use pageserver_api::shard::TenantShardId;
      21              : use tokio::sync::RwLock;
      22              : use tokio_util::sync::CancellationToken;
      23              : use tracing::*;
      24              : use utils::id::TimelineId;
      25              : use utils::lsn::Lsn;
      26              : use utils::vec_map::VecMap;
      27              : use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
      28              : 
      29              : use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
      30              : use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize};
      31              : use crate::config::PageServerConf;
      32              : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
      33              : // avoid binding to Write (conflicts with std::io::Write)
      34              : // while being able to use std::fmt::Write's methods
      35              : use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
      36              : use crate::tenant::ephemeral_file::EphemeralFile;
      37              : use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
      38              : use crate::tenant::timeline::GetVectoredError;
      39              : use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
      40              : use crate::{l0_flush, page_cache};
      41              : 
      42              : pub(crate) mod vectored_dio_read;
      43              : 
      44              : #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
      45              : pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
      46              : 
      47              : pub struct InMemoryLayer {
      48              :     conf: &'static PageServerConf,
      49              :     tenant_shard_id: TenantShardId,
      50              :     timeline_id: TimelineId,
      51              :     file_id: InMemoryLayerFileId,
      52              : 
      53              :     /// This layer contains all the changes from 'start_lsn'. The
      54              :     /// start is inclusive.
      55              :     start_lsn: Lsn,
      56              : 
      57              :     /// Frozen layers have an exclusive end LSN.
      58              :     /// Writes are only allowed when this is `None`.
      59              :     pub(crate) end_lsn: OnceLock<Lsn>,
      60              : 
       61              :     /// Used for the traversal path. Cached string representation of the in-memory layer, set once it is frozen.
      62              :     frozen_local_path_str: OnceLock<Arc<str>>,
      63              : 
      64              :     opened_at: Instant,
      65              : 
      66              :     /// All versions of all pages in the layer are kept here. Indexed
      67              :     /// by block number and LSN. The [`IndexEntry`] is an offset into the
      68              :     /// ephemeral file where the page version is stored.
      69              :     ///
      70              :     /// We use a separate lock for the index to reduce the critical section
      71              :     /// during which reads cannot be planned.
      72              :     ///
      73              :     /// If you need access to both the index and the underlying file at the same time,
      74              :     /// respect the following locking order to avoid deadlocks:
      75              :     /// 1. [`InMemoryLayer::inner`]
      76              :     /// 2. [`InMemoryLayer::index`]
      77              :     ///
      78              :     /// Note that the file backing [`InMemoryLayer::inner`] is append-only,
       79              :     /// so it is not necessary to hold both locks at the same time.
       80              :     /// This avoids holding the index lock across IO, which is crucial for keeping read tail latency low (a short sketch of this pattern follows the struct definition).
      81              :     /// In particular:
      82              :     /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
      83              :     /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
      84              :     index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
      85              : 
      86              :     /// The above fields never change, except for `end_lsn`, which is only set once,
      87              :     /// and `index` (see rationale there).
      88              :     /// All other changing parts are in `inner`, and protected by a mutex.
      89              :     inner: RwLock<InMemoryLayerInner>,
      90              : 
      91              :     estimated_in_mem_size: AtomicU64,
      92              : }
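// A minimal sketch of the locking pattern documented above, reusing the tokio::sync::RwLock
// import from the top of this file. The types and names here are hypothetical stand-ins for
// `inner` and `index`, not the real ones. Because the backing store is append-only, the write
// path can release `inner` before taking `index`, so neither lock is held across the other's IO.
struct SketchLayer {
    inner: RwLock<Vec<u8>>,         // stands in for InMemoryLayerInner (the ephemeral file)
    index: RwLock<Vec<(u64, u64)>>, // stands in for the (key, offset) index
}

impl SketchLayer {
    async fn append(&self, key: u64, bytes: &[u8]) {
        // 1. Lock `inner`, perform the append-only write, and remember the offset.
        let pos = {
            let mut inner = self.inner.write().await;
            let pos = inner.len() as u64;
            inner.extend_from_slice(bytes);
            pos
        }; // `inner` is released here; the offset stays valid because the file only grows.
        // 2. Only then lock `index` and publish the new entry.
        self.index.write().await.push((key, pos));
    }
}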
      93              : 
      94              : impl std::fmt::Debug for InMemoryLayer {
      95            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      96            0 :         f.debug_struct("InMemoryLayer")
      97            0 :             .field("start_lsn", &self.start_lsn)
      98            0 :             .field("end_lsn", &self.end_lsn)
      99            0 :             .field("inner", &self.inner)
     100            0 :             .finish()
     101            0 :     }
     102              : }
     103              : 
     104              : pub struct InMemoryLayerInner {
     105              :     /// The values are stored in a serialized format in this file.
     106              :     /// Each serialized Value is preceded by a 'u32' length field.
      107              :     /// The [`InMemoryLayer::index`] map stores offsets into this file.
     108              :     file: EphemeralFile,
     109              : 
     110              :     resource_units: GlobalResourceUnits,
     111              : }
     112              : 
     113              : /// Support the same max blob length as blob_io, because ultimately
     114              : /// all the InMemoryLayer contents end up being written into a delta layer,
      115              : /// using the [`crate::tenant::blob_io`] module.
     116              : const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
     117              : const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
     118              :     let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
     119              :     let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
     120              :     assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
     121              :     trailing_ones
     122              : };
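// A worked instance of the bit-width computation above, using a hypothetical length limit
// of 0x0FFF_FFFF; the real value comes from blob_io::MAX_SUPPORTED_BLOB_LEN and may differ.
// A limit of the form 2^n - 1 has n trailing ones and (64 - n) leading zeros on a 64-bit
// target, so the assertion holds and the constant evaluates to n.
const EXAMPLE_LEN_LIMIT: usize = 0x0FFF_FFFF; // 2^28 - 1
const EXAMPLE_LEN_BITS: usize = {
    let trailing_ones = EXAMPLE_LEN_LIMIT.trailing_ones() as usize; // 28
    let leading_zeroes = EXAMPLE_LEN_LIMIT.leading_zeros() as usize; // 36 on a 64-bit target
    assert!(trailing_ones + leading_zeroes == usize::BITS as usize);
    trailing_ones // == 28
};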
     123              : 
     124              : /// See [`InMemoryLayer::index`].
     125              : ///
     126              : /// For memory efficiency, the data is packed into a u64.
     127              : ///
     128              : /// Layout:
     129              : /// - 1 bit: `will_init`
     130              : /// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
     131              : /// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
     132              : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
     133              : pub struct IndexEntry(u64);
     134              : 
     135              : impl IndexEntry {
     136              :     /// See [`Self::MAX_SUPPORTED_POS`].
     137              :     const MAX_SUPPORTED_POS_BITS: usize = {
     138              :         let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
     139              :         if remainder < 32 {
     140              :             panic!("pos can be u32 as per type system, support that");
     141              :         }
     142              :         remainder
     143              :     };
     144              :     /// The maximum supported blob offset that can be represented by [`Self`].
     145              :     /// See also [`Self::validate_checkpoint_distance`].
     146              :     const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
     147              : 
     148              :     // Layout
     149              :     const WILL_INIT_RANGE: Range<usize> = 0..1;
     150              :     const LEN_RANGE: Range<usize> =
     151              :         Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
     152              :     const POS_RANGE: Range<usize> =
     153              :         Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
     154              :     const _ASSERT: () = {
     155              :         if Self::POS_RANGE.end != 64 {
     156              :             panic!("we don't want undefined bits for our own sanity")
     157              :         }
     158              :     };
     159              : 
     160              :     /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
     161              :     ///
      162              :     /// The only reason this can happen in the system is if the [`InMemoryLayer`] grows too large.
     163              :     /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
     164              :     ///
     165              :     /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
     166              :     /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
     167              :     ///
      168              :     /// TODO: ideally this check should happen at config parsing time (and in the request handler when a change to checkpoint distance is requested).
      169              :     /// When cleaning this up, also look into the S3 max file size check that is performed in the delta layer writer.
     170              :     #[inline(always)]
     171      2565230 :     fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
     172      2565230 :         let IndexEntryNewArgs {
     173      2565230 :             base_offset,
     174      2565230 :             batch_offset,
     175      2565230 :             len,
     176      2565230 :             will_init,
     177      2565230 :         } = arg;
     178              : 
     179      2565230 :         let pos = base_offset
     180      2565230 :             .checked_add(batch_offset)
     181      2565230 :             .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
     182              : 
     183      2565230 :         if pos.into_usize() > Self::MAX_SUPPORTED_POS {
     184            4 :             anyhow::bail!(
     185            4 :                 "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
     186            4 :                 max = Self::MAX_SUPPORTED_POS
     187            4 :             );
     188      2565226 :         }
     189      2565226 : 
     190      2565226 :         if len > MAX_SUPPORTED_BLOB_LEN {
     191            1 :             anyhow::bail!(
     192            1 :                 "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
     193            1 :             );
     194      2565225 :         }
     195      2565225 : 
     196      2565225 :         let mut data: u64 = 0;
     197              :         use bit_field::BitField;
     198      2565225 :         data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
     199      2565225 :         data.set_bits(Self::LEN_RANGE, len.into_u64());
     200      2565225 :         data.set_bits(Self::POS_RANGE, pos);
     201      2565225 : 
     202      2565225 :         Ok(Self(data))
     203      2565230 :     }
     204              : 
     205              :     #[inline(always)]
     206      3149462 :     fn unpack(&self) -> IndexEntryUnpacked {
     207              :         use bit_field::BitField;
     208      3149462 :         IndexEntryUnpacked {
     209      3149462 :             will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
     210      3149462 :             len: self.0.get_bits(Self::LEN_RANGE),
     211      3149462 :             pos: self.0.get_bits(Self::POS_RANGE),
     212      3149462 :         }
     213      3149462 :     }
     214              : 
     215              :     /// See [`Self::new`].
     216          126 :     pub(crate) const fn validate_checkpoint_distance(
     217          126 :         checkpoint_distance: u64,
     218          126 :     ) -> Result<(), &'static str> {
     219          126 :         if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
     220            0 :             return Err("exceeds the maximum supported value");
     221          126 :         }
     222          126 :         let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
     223          126 :         if res.is_none() {
     224            0 :             return Err(
     225            0 :                 "checkpoint distance + max supported blob len overflows in-memory addition",
     226            0 :             );
     227          126 :         }
     228          126 : 
     229          126 :         // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
     230          126 : 
     231          126 :         Ok(())
     232          126 :     }
     233              : 
     234              :     const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
     235              :         let res = Self::validate_checkpoint_distance(
     236              :             pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
     237              :         );
     238              :         if res.is_err() {
     239              :             panic!("default checkpoint distance is valid")
     240              :         }
     241              :     };
     242              : }
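// A hedged usage sketch for the validation described above: reject a checkpoint distance at
// configuration time if IndexEntry could not address offsets produced under it. The function
// name and the place it would be called from are illustrative, not part of the real config path.
fn sketch_validate_checkpoint_distance_config(checkpoint_distance: u64) -> anyhow::Result<()> {
    IndexEntry::validate_checkpoint_distance(checkpoint_distance)
        .map_err(|msg| anyhow::anyhow!("invalid checkpoint_distance {checkpoint_distance}: {msg}"))
}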
     243              : 
     244              : /// Args to [`IndexEntry::new`].
     245              : #[derive(Clone, Copy)]
     246              : struct IndexEntryNewArgs {
     247              :     base_offset: u64,
     248              :     batch_offset: u64,
     249              :     len: usize,
     250              :     will_init: bool,
     251              : }
     252              : 
     253              : /// Unpacked representation of the bitfielded [`IndexEntry`].
     254              : #[derive(Clone, Copy, PartialEq, Eq, Debug)]
     255              : struct IndexEntryUnpacked {
     256              :     will_init: bool,
     257              :     len: u64,
     258              :     pos: u64,
     259              : }
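// A minimal sketch of the packing scheme documented on IndexEntry, written with plain shifts
// and masks instead of the bit_field crate. The 28-bit `len` width is an assumption chosen
// only to keep the sketch concrete; the real widths are MAX_SUPPORTED_BLOB_LEN_BITS and
// MAX_SUPPORTED_POS_BITS.
const SKETCH_LEN_BITS: u32 = 28;
const SKETCH_POS_BITS: u32 = 64 - 1 - SKETCH_LEN_BITS; // 35

fn sketch_pack(will_init: bool, len: u64, pos: u64) -> u64 {
    debug_assert!(len < (1u64 << SKETCH_LEN_BITS));
    debug_assert!(pos < (1u64 << SKETCH_POS_BITS));
    // bit 0: will_init, bits 1..=28: len, bits 29..=63: pos
    (will_init as u64) | (len << 1) | (pos << (1 + SKETCH_LEN_BITS))
}

fn sketch_unpack(packed: u64) -> (bool, u64, u64) {
    let will_init = packed & 1 != 0;
    let len = (packed >> 1) & ((1u64 << SKETCH_LEN_BITS) - 1);
    let pos = packed >> (1 + SKETCH_LEN_BITS);
    (will_init, len, pos)
}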
     260              : 
     261              : impl std::fmt::Debug for InMemoryLayerInner {
     262            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     263            0 :         f.debug_struct("InMemoryLayerInner").finish()
     264            0 :     }
     265              : }
     266              : 
     267              : /// State shared by all in-memory (ephemeral) layers.  Updated infrequently during background ticks in Timeline,
     268              : /// to minimize contention.
     269              : ///
     270              : /// This global state is used to implement behaviors that require a global view of the system, e.g.
     271              : /// rolling layers proactively to limit the total amount of dirty data.
     272              : pub(crate) struct GlobalResources {
     273              :     // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
     274              :     // Zero means unlimited.
     275              :     pub(crate) max_dirty_bytes: AtomicU64,
     276              :     // How many bytes are in all EphemeralFile objects
     277              :     dirty_bytes: AtomicU64,
     278              :     // How many layers are contributing to dirty_bytes
     279              :     dirty_layers: AtomicUsize,
     280              : }
     281              : 
     282              : // Per-timeline RAII struct for its contribution to [`GlobalResources`]
     283              : struct GlobalResourceUnits {
     284              :     // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
     285              :     // for decrementing the global counter by this many bytes when dropped.
     286              :     dirty_bytes: u64,
     287              : }
     288              : 
     289              : impl GlobalResourceUnits {
     290              :     // Hint for the layer append path to update us when the layer size differs from the last
      291              :     // call to publish_size by this much.  If we don't reach this threshold, we'll still get
     292              :     // updated when the Timeline "ticks" in the background.
     293              :     const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
     294              : 
     295          661 :     fn new() -> Self {
     296          661 :         GLOBAL_RESOURCES
     297          661 :             .dirty_layers
     298          661 :             .fetch_add(1, AtomicOrdering::Relaxed);
     299          661 :         Self { dirty_bytes: 0 }
     300          661 :     }
     301              : 
     302              :     /// Do not call this frequently: all timelines will write to these same global atomics,
     303              :     /// so this is a relatively expensive operation.  Wait at least a few seconds between calls.
     304              :     ///
     305              :     /// Returns the effective layer size limit that should be applied, if any, to keep
     306              :     /// the total number of dirty bytes below the configured maximum.
     307          599 :     fn publish_size(&mut self, size: u64) -> Option<u64> {
     308          599 :         let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
     309          594 :             Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
     310              :             Ordering::Greater => {
     311            4 :                 let delta = size - self.dirty_bytes;
     312            4 :                 let old = GLOBAL_RESOURCES
     313            4 :                     .dirty_bytes
     314            4 :                     .fetch_add(delta, AtomicOrdering::Relaxed);
     315            4 :                 old + delta
     316              :             }
     317              :             Ordering::Less => {
     318            1 :                 let delta = self.dirty_bytes - size;
     319            1 :                 let old = GLOBAL_RESOURCES
     320            1 :                     .dirty_bytes
     321            1 :                     .fetch_sub(delta, AtomicOrdering::Relaxed);
     322            1 :                 old - delta
     323              :             }
     324              :         };
     325              : 
     326              :         // This is a sloppy update: concurrent updates to the counter will race, and the exact
     327              :         // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
     328              :         // That's okay: as long as the metric contains some recent value, it doesn't have to always
     329              :         // be literally the last update.
     330          599 :         TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
     331          599 : 
     332          599 :         self.dirty_bytes = size;
     333          599 : 
     334          599 :         let max_dirty_bytes = GLOBAL_RESOURCES
     335          599 :             .max_dirty_bytes
     336          599 :             .load(AtomicOrdering::Relaxed);
     337          599 :         if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
      338              :             // sized layers will be eligible for freezing.  They will be frozen in the order they
     339              :             // sized layers will be elegible for freezing.  They will be frozen in the order they
     340              :             // next enter publish_size.
     341            0 :             Some(
     342            0 :                 new_global_dirty_bytes
     343            0 :                     / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
     344            0 :             )
     345              :         } else {
     346          599 :             None
     347              :         }
     348          599 :     }
     349              : 
     350              :     // Call publish_size if the input size differs from last published size by more than
     351              :     // the drift limit
     352      2402128 :     fn maybe_publish_size(&mut self, size: u64) {
     353      2402128 :         let publish = match size.cmp(&self.dirty_bytes) {
     354            0 :             Ordering::Equal => false,
     355      2402128 :             Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
     356            0 :             Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
     357              :         };
     358              : 
     359      2402128 :         if publish {
     360            4 :             self.publish_size(size);
     361      2402124 :         }
     362      2402128 :     }
     363              : }
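// A minimal sketch of the drift-threshold idea behind maybe_publish_size above, reusing the
// AtomicU64 / AtomicOrdering imports from the top of this file: the per-layer counter only
// touches the shared atomic when its local size has moved by more than the drift limit since
// the last publication. The names here are illustrative, not the real ones.
static SKETCH_GLOBAL_DIRTY_BYTES: AtomicU64 = AtomicU64::new(0);
const SKETCH_MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;

struct SketchUnits {
    published: u64, // last size this layer pushed into the global counter
}

impl SketchUnits {
    fn maybe_publish(&mut self, size: u64) {
        if size.abs_diff(self.published) > SKETCH_MAX_SIZE_DRIFT {
            // Adjust the global counter by the delta, then remember what was published.
            if size > self.published {
                SKETCH_GLOBAL_DIRTY_BYTES.fetch_add(size - self.published, AtomicOrdering::Relaxed);
            } else {
                SKETCH_GLOBAL_DIRTY_BYTES.fetch_sub(self.published - size, AtomicOrdering::Relaxed);
            }
            self.published = size;
        }
    }
}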
     364              : 
     365              : impl Drop for GlobalResourceUnits {
     366          595 :     fn drop(&mut self) {
     367          595 :         GLOBAL_RESOURCES
     368          595 :             .dirty_layers
     369          595 :             .fetch_sub(1, AtomicOrdering::Relaxed);
     370          595 : 
     371          595 :         // Subtract our contribution to the global total dirty bytes
     372          595 :         self.publish_size(0);
     373          595 :     }
     374              : }
     375              : 
     376              : pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
     377              :     max_dirty_bytes: AtomicU64::new(0),
     378              :     dirty_bytes: AtomicU64::new(0),
     379              :     dirty_layers: AtomicUsize::new(0),
     380              : };
     381              : 
     382              : impl InMemoryLayer {
     383       311044 :     pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
     384       311044 :         self.file_id
     385       311044 :     }
     386              : 
     387          597 :     pub(crate) fn get_timeline_id(&self) -> TimelineId {
     388          597 :         self.timeline_id
     389          597 :     }
     390              : 
     391         1190 :     pub(crate) fn info(&self) -> InMemoryLayerInfo {
     392         1190 :         let lsn_start = self.start_lsn;
     393              : 
     394         1190 :         if let Some(&lsn_end) = self.end_lsn.get() {
     395         1189 :             InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
     396              :         } else {
     397            1 :             InMemoryLayerInfo::Open { lsn_start }
     398              :         }
     399         1190 :     }
     400              : 
     401         1186 :     pub(crate) fn try_len(&self) -> Option<u64> {
     402         1186 :         self.inner.try_read().map(|i| i.file.len()).ok()
     403         1186 :     }
     404              : 
     405      2402128 :     pub(crate) fn assert_writable(&self) {
     406      2402128 :         assert!(self.end_lsn.get().is_none());
     407      2402128 :     }
     408              : 
     409      1212296 :     pub(crate) fn end_lsn_or_max(&self) -> Lsn {
     410      1212296 :         self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
     411      1212296 :     }
     412              : 
     413      1211703 :     pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
     414      1211703 :         self.start_lsn..self.end_lsn_or_max()
     415      1211703 :     }
     416              : 
     417              :     /// debugging function to print out the contents of the layer
     418              :     ///
      419              :     /// this is likely completely unused
     420            0 :     pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
     421            0 :         let end_str = self.end_lsn_or_max();
     422            0 : 
     423            0 :         println!(
     424            0 :             "----- in-memory layer for tli {} LSNs {}-{} ----",
     425            0 :             self.timeline_id, self.start_lsn, end_str,
     426            0 :         );
     427            0 : 
     428            0 :         Ok(())
     429            0 :     }
     430              : 
     431              :     // Look up the keys in the provided keyspace and update
     432              :     // the reconstruct state with whatever is found.
     433       307204 :     pub(crate) async fn get_values_reconstruct_data(
     434       307204 :         self: &Arc<InMemoryLayer>,
     435       307204 :         keyspace: KeySpace,
     436       307204 :         lsn_range: Range<Lsn>,
     437       307204 :         reconstruct_state: &mut ValuesReconstructState,
     438       307204 :         ctx: &RequestContext,
     439       307204 :     ) -> Result<(), GetVectoredError> {
     440       307204 :         let ctx = RequestContextBuilder::from(ctx)
     441       307204 :             .page_content_kind(PageContentKind::InMemoryLayer)
     442       307204 :             .attached_child();
     443              : 
     444       307204 :         let index = self.index.read().await;
     445              : 
     446              :         struct ValueRead {
     447              :             entry_lsn: Lsn,
     448              :             read: vectored_dio_read::LogicalRead<Vec<u8>>,
     449              :         }
     450       307204 :         let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
     451       307204 :         let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
     452              : 
     453       310851 :         for range in keyspace.ranges.iter() {
     454       310851 :             for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
     455       263874 :                 let key = Key::from_compact(*key);
     456       263874 :                 let slice = vec_map.slice_range(lsn_range.clone());
     457              : 
     458       956684 :                 for (entry_lsn, index_entry) in slice.iter().rev() {
     459              :                     let IndexEntryUnpacked {
     460       956684 :                         pos,
     461       956684 :                         len,
     462       956684 :                         will_init,
     463       956684 :                     } = index_entry.unpack();
     464       956684 : 
     465       956684 :                     reads.entry(key).or_default().push(ValueRead {
     466       956684 :                         entry_lsn: *entry_lsn,
     467       956684 :                         read: vectored_dio_read::LogicalRead::new(
     468       956684 :                             pos,
     469       956684 :                             Vec::with_capacity(len as usize),
     470       956684 :                         ),
     471       956684 :                     });
     472       956684 : 
     473       956684 :                     let io = reconstruct_state.update_key(&key, *entry_lsn, will_init);
     474       956684 :                     ios.insert((key, *entry_lsn), io);
     475       956684 : 
     476       956684 :                     if will_init {
     477       253522 :                         break;
     478       703162 :                     }
     479              :                 }
     480              :             }
     481              :         }
     482       307204 :         drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
     483       307204 :         let read_from = Arc::clone(self);
     484       307204 :         let read_ctx = ctx.attached_child();
     485       307204 :         reconstruct_state
     486       307204 :             .spawn_io(async move {
     487       307204 :                 let inner = read_from.inner.read().await;
     488       307204 :                 let f = vectored_dio_read::execute(
     489       307204 :                     &inner.file,
     490       307204 :                     reads
     491       307204 :                         .iter()
     492       956684 :                         .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
     493       307204 :                     &read_ctx,
     494       307204 :                 );
     495       307204 :                 send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
     496       307204 :                     .await;
     497              : 
     498       571076 :                 for (key, value_reads) in reads {
     499      1220556 :                     for ValueRead { entry_lsn, read } in value_reads {
     500       956684 :                         let io = ios.remove(&(key, entry_lsn)).expect("sender must exist");
     501       956684 :                         match read.into_result().expect("we run execute() above") {
     502            0 :                             Err(e) => {
     503            0 :                                 io.complete(Err(std::io::Error::new(
     504            0 :                                     e.kind(),
     505            0 :                                     "dio vec read failed",
     506            0 :                                 )));
     507            0 :                             }
     508       956684 :                             Ok(value_buf) => {
     509       956684 :                                 io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into())));
     510       956684 :                             }
     511              :                         }
     512              :                     }
     513              :                 }
     514              : 
     515       307204 :                 assert!(ios.is_empty());
     516              : 
     517              :                 // Keep layer existent until this IO is done;
     518              :                 // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
     519              :                 // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
     520              :                 // drop for consistency among all three layer types.
     521       307204 :                 drop(inner);
     522       307204 :                 drop(read_from);
     523       307204 :             })
     524       307204 :             .await;
     525              : 
     526       307204 :         Ok(())
     527       307204 :     }
     528              : }
     529              : 
     530         1190 : fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
     531         1190 :     write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
     532         1190 : }
     533              : 
     534          597 : fn inmem_layer_log_display(
     535          597 :     mut f: impl Write,
     536          597 :     timeline: TimelineId,
     537          597 :     start_lsn: Lsn,
     538          597 :     end_lsn: Lsn,
     539          597 : ) -> std::fmt::Result {
     540          597 :     write!(f, "timeline {} in-memory ", timeline)?;
     541          597 :     inmem_layer_display(f, start_lsn, end_lsn)
     542          597 : }
     543              : 
     544              : impl std::fmt::Display for InMemoryLayer {
     545          593 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     546          593 :         let end_lsn = self.end_lsn_or_max();
     547          593 :         inmem_layer_display(f, self.start_lsn, end_lsn)
     548          593 :     }
     549              : }
     550              : 
     551              : impl InMemoryLayer {
     552              :     /// Get layer size.
     553          656 :     pub async fn size(&self) -> Result<u64> {
     554          656 :         let inner = self.inner.read().await;
     555          656 :         Ok(inner.file.len())
     556          656 :     }
     557              : 
     558          602 :     pub fn estimated_in_mem_size(&self) -> u64 {
     559          602 :         self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
     560          602 :     }
     561              : 
     562              :     /// Create a new, empty, in-memory layer
     563          661 :     pub async fn create(
     564          661 :         conf: &'static PageServerConf,
     565          661 :         timeline_id: TimelineId,
     566          661 :         tenant_shard_id: TenantShardId,
     567          661 :         start_lsn: Lsn,
     568          661 :         gate: &utils::sync::gate::Gate,
     569          661 :         cancel: &CancellationToken,
     570          661 :         ctx: &RequestContext,
     571          661 :     ) -> Result<InMemoryLayer> {
     572          661 :         trace!(
     573            0 :             "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"
     574              :         );
     575              : 
     576          661 :         let file =
     577          661 :             EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, cancel, ctx).await?;
     578          661 :         let key = InMemoryLayerFileId(file.page_cache_file_id());
     579          661 : 
     580          661 :         Ok(InMemoryLayer {
     581          661 :             file_id: key,
     582          661 :             frozen_local_path_str: OnceLock::new(),
     583          661 :             conf,
     584          661 :             timeline_id,
     585          661 :             tenant_shard_id,
     586          661 :             start_lsn,
     587          661 :             end_lsn: OnceLock::new(),
     588          661 :             opened_at: Instant::now(),
     589          661 :             index: RwLock::new(BTreeMap::new()),
     590          661 :             inner: RwLock::new(InMemoryLayerInner {
     591          661 :                 file,
     592          661 :                 resource_units: GlobalResourceUnits::new(),
     593          661 :             }),
     594          661 :             estimated_in_mem_size: AtomicU64::new(0),
     595          661 :         })
     596          661 :     }
     597              : 
     598              :     /// Write path.
     599              :     ///
      600              :     /// Errors are not retryable: the [`InMemoryLayer`] must be discarded and must not be read from.
      601              :     /// The reason they are not retryable is that [`EphemeralFile`] writes are not retryable.
      602              :     /// TODO: they could be made retryable if we aborted the process on EphemeralFile write errors.
     603      2402128 :     pub async fn put_batch(
     604      2402128 :         &self,
     605      2402128 :         serialized_batch: SerializedValueBatch,
     606      2402128 :         ctx: &RequestContext,
     607      2402128 :     ) -> anyhow::Result<()> {
     608      2402128 :         let (base_offset, metadata) = {
     609      2402128 :             let mut inner = self.inner.write().await;
     610      2402128 :             self.assert_writable();
     611      2402128 : 
     612      2402128 :             let base_offset = inner.file.len();
     613      2402128 : 
     614      2402128 :             let SerializedValueBatch {
     615      2402128 :                 raw,
     616      2402128 :                 metadata,
     617      2402128 :                 max_lsn: _,
     618      2402128 :                 len: _,
     619      2402128 :             } = serialized_batch;
     620      2402128 : 
     621      2402128 :             // Write the batch to the file
     622      2402128 :             inner.file.write_raw(&raw, ctx).await?;
     623      2402128 :             let new_size = inner.file.len();
     624      2402128 : 
     625      2402128 :             let expected_new_len = base_offset
     626      2402128 :                 .checked_add(raw.len().into_u64())
     627      2402128 :                 // write_raw would error if we were to overflow u64.
     628      2402128 :                 // also IndexEntry and higher levels in
      629      2402128 :                 // the code don't allow the file to grow that large
     630      2402128 :                 .unwrap();
     631      2402128 :             assert_eq!(new_size, expected_new_len);
     632              : 
     633      2402128 :             inner.resource_units.maybe_publish_size(new_size);
     634      2402128 : 
     635      2402128 :             (base_offset, metadata)
     636              :         };
     637              : 
     638              :         // Update the index with the new entries
     639      2402128 :         let mut index = self.index.write().await;
     640              : 
     641      4967333 :         for meta in metadata {
     642              :             let SerializedValueMeta {
     643      2565205 :                 key,
     644      2565205 :                 lsn,
     645      2565205 :                 batch_offset,
     646      2565205 :                 len,
     647      2565205 :                 will_init,
     648      2565205 :             } = match meta {
     649      2565205 :                 ValueMeta::Serialized(ser) => ser,
     650              :                 ValueMeta::Observed(_) => {
     651            0 :                     continue;
     652              :                 }
     653              :             };
     654              : 
     655              :             // Add the base_offset to the batch's index entries which are relative to the batch start.
     656      2565205 :             let index_entry = IndexEntry::new(IndexEntryNewArgs {
     657      2565205 :                 base_offset,
     658      2565205 :                 batch_offset,
     659      2565205 :                 len,
     660      2565205 :                 will_init,
     661      2565205 :             })?;
     662              : 
     663      2565205 :             let vec_map = index.entry(key).or_default();
     664      2565205 :             let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
     665      2565205 :             if old.is_some() {
     666              :                 // This should not break anything, but is unexpected: ingestion code aims to filter out
     667              :                 // multiple writes to the same key at the same LSN.  This happens in cases where our
      668              :                 // ingestion code generates some write like an empty page, and we see a write from postgres
     669              :                 // to the same key in the same wal record.  If one such write makes it through, we
     670              :                 // index the most recent write, implicitly ignoring the earlier write.  We log a warning
     671              :                 // because this case is unexpected, and we would like tests to fail if this happens.
     672            0 :                 warn!("Key {} at {} written twice at same LSN", key, lsn);
     673      2565205 :             }
     674      2565205 :             self.estimated_in_mem_size.fetch_add(
     675      2565205 :                 (std::mem::size_of::<CompactKey>()
     676      2565205 :                     + std::mem::size_of::<Lsn>()
     677      2565205 :                     + std::mem::size_of::<IndexEntry>()) as u64,
     678      2565205 :                 AtomicOrdering::Relaxed,
     679      2565205 :             );
     680              :         }
     681              : 
     682      2402128 :         Ok(())
     683      2402128 :     }
     684              : 
     685      2401506 :     pub(crate) fn get_opened_at(&self) -> Instant {
     686      2401506 :         self.opened_at
     687      2401506 :     }
     688              : 
     689            0 :     pub(crate) async fn tick(&self) -> Option<u64> {
     690            0 :         let mut inner = self.inner.write().await;
     691            0 :         let size = inner.file.len();
     692            0 :         inner.resource_units.publish_size(size)
     693            0 :     }
     694              : 
     695            1 :     pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
     696            1 :         // TODO: Currently, we just leak the storage for any deleted keys
     697            1 :         Ok(())
     698            1 :     }
     699              : 
     700              :     /// Records the end_lsn for non-dropped layers.
     701              :     /// `end_lsn` is exclusive
     702              :     ///
     703              :     /// A note on locking:
     704              :     /// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing
     705              :     /// writes while freezing the layer. This is enforced at a higher level via
     706              :     /// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
     707              :     /// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
     708              :     ///    Timeline::write_lock for its lifetime. The rolling is handled in
     709              :     ///    [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function
     710              :     ///    so can't be called from different threads.
     711              :     /// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
     712              :     ///    This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
     713              :     ///    hence there can be no concurrent writes
     714          597 :     pub async fn freeze(&self, end_lsn: Lsn) {
     715          597 :         assert!(
     716          597 :             self.start_lsn < end_lsn,
     717            0 :             "{} >= {}",
     718              :             self.start_lsn,
     719              :             end_lsn
     720              :         );
     721          597 :         self.end_lsn.set(end_lsn).expect("end_lsn set only once");
     722          597 : 
     723          597 :         self.frozen_local_path_str
     724          597 :             .set({
     725          597 :                 let mut buf = String::new();
     726          597 :                 inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
     727          597 :                     .unwrap();
     728          597 :                 buf.into()
     729          597 :             })
     730          597 :             .expect("frozen_local_path_str set only once");
     731              : 
     732              :         #[cfg(debug_assertions)]
     733              :         {
     734          597 :             let index = self.index.read().await;
     735      2128078 :             for vec_map in index.values() {
     736      2213322 :                 for (lsn, _) in vec_map.as_slice() {
     737      2213322 :                     assert!(*lsn < end_lsn);
     738              :                 }
     739              :             }
     740              :         }
     741          597 :     }
     742              : 
     743              :     /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
      744              :     /// layer will only contain the keys in the range the user specifies, and this function
      745              :     /// may return `None` if there are no matching keys.
     746              :     ///
     747              :     /// Returns a new delta layer with all the same data as this in-memory layer
     748          484 :     pub async fn write_to_disk(
     749          484 :         &self,
     750          484 :         ctx: &RequestContext,
     751          484 :         key_range: Option<Range<Key>>,
     752          484 :         l0_flush_global_state: &l0_flush::Inner,
     753          484 :         gate: &utils::sync::gate::Gate,
     754          484 :         cancel: CancellationToken,
     755          484 :     ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
     756              :         // Grab the lock in read-mode. We hold it over the I/O, but because this
     757              :         // layer is not writeable anymore, no one should be trying to acquire the
     758              :         // write lock on it, so we shouldn't block anyone. See the comment on
     759              :         // [`InMemoryLayer::freeze`] to understand how locking between the append path
     760              :         // and layer flushing works.
     761          484 :         let inner = self.inner.read().await;
     762          484 :         let index = self.index.read().await;
     763              : 
     764              :         use l0_flush::Inner;
     765          484 :         let _concurrency_permit = match l0_flush_global_state {
     766          484 :             Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
     767              :         };
     768              : 
     769          484 :         let end_lsn = *self.end_lsn.get().unwrap();
     770              : 
     771          484 :         let key_count = if let Some(key_range) = key_range {
     772            0 :             let key_range = key_range.start.to_compact()..key_range.end.to_compact();
     773            0 : 
     774            0 :             index.iter().filter(|(k, _)| key_range.contains(k)).count()
     775              :         } else {
     776          484 :             index.len()
     777              :         };
     778          484 :         if key_count == 0 {
     779            0 :             return Ok(None);
     780          484 :         }
     781              : 
     782          484 :         let mut delta_layer_writer = DeltaLayerWriter::new(
     783          484 :             self.conf,
     784          484 :             self.timeline_id,
     785          484 :             self.tenant_shard_id,
     786          484 :             Key::MIN,
     787          484 :             self.start_lsn..end_lsn,
     788          484 :             gate,
     789          484 :             cancel,
     790          484 :             ctx,
     791          484 :         )
     792          484 :         .await?;
     793              : 
     794          484 :         match l0_flush_global_state {
     795              :             l0_flush::Inner::Direct { .. } => {
     796          484 :                 let file_contents = inner.file.load_to_io_buf(ctx).await?;
     797          484 :                 let file_contents = file_contents.freeze();
     798              : 
     799      2127116 :                 for (key, vec_map) in index.iter() {
     800              :                     // Write all page versions
     801      2192758 :                     for (lsn, entry) in vec_map
     802      2127116 :                         .as_slice()
     803      2127116 :                         .iter()
     804      2192758 :                         .map(|(lsn, entry)| (lsn, entry.unpack()))
     805              :                     {
     806              :                         let IndexEntryUnpacked {
     807      2192758 :                             pos,
     808      2192758 :                             len,
     809      2192758 :                             will_init,
     810      2192758 :                         } = entry;
     811      2192758 :                         let buf = file_contents.slice(pos as usize..(pos + len) as usize);
     812      2192758 :                         let (_buf, res) = delta_layer_writer
     813      2192758 :                             .put_value_bytes(
     814      2192758 :                                 Key::from_compact(*key),
     815      2192758 :                                 *lsn,
     816      2192758 :                                 buf.slice_len(),
     817      2192758 :                                 will_init,
     818      2192758 :                                 ctx,
     819      2192758 :                             )
     820      2192758 :                             .await;
     821      2192758 :                         res?;
     822              :                     }
     823              :                 }
     824              :             }
     825              :         }
     826              : 
     827              :         // MAX is used here because we identify L0 layers by full key range
     828          484 :         let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
     829              : 
      830              :         // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
     831              :         //
      832              :         // If we didn't, and our caller dropped this future, tokio-epoll-uring would extend the lifetime of
     833              :         // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
      834              :         // Thus, we'd have more concurrent `Vec<u8>`s in existence than the semaphore allows.
     835              :         //
     836              :         // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
     837              :         // we dirtied when writing to the filesystem have been flushed and marked !dirty.
     838          484 :         drop(_concurrency_permit);
     839          484 : 
     840          484 :         Ok(Some((desc, path)))
     841          484 :     }
     842              : }
     843              : 
     844              : #[cfg(test)]
     845              : mod tests {
     846              :     use super::*;
     847              : 
     848              :     #[test]
     849            1 :     fn test_index_entry() {
     850              :         const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
     851              :         use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked};
     852              : 
     853           20 :         let roundtrip = |args, expect: Unpacked| {
     854           20 :             let res = IndexEntry::new(args).expect("this tests expects no errors");
     855           20 :             let IndexEntryUnpacked {
     856           20 :                 will_init,
     857           20 :                 len,
     858           20 :                 pos,
     859           20 :             } = res.unpack();
     860           20 :             assert_eq!(will_init, expect.will_init);
     861           20 :             assert_eq!(len, expect.len);
     862           20 :             assert_eq!(pos, expect.pos);
     863           20 :         };
     864              : 
     865              :         // basic roundtrip
     866            3 :         for pos in [0, MAX_SUPPORTED_POS] {
     867            6 :             for len in [0, MAX_SUPPORTED_BLOB_LEN] {
     868           12 :                 for will_init in [true, false] {
     869            8 :                     let expect = Unpacked {
     870            8 :                         will_init,
     871            8 :                         len: len.into_u64(),
     872            8 :                         pos: pos.into_u64(),
     873            8 :                     };
     874            8 :                     roundtrip(
     875            8 :                         Args {
     876            8 :                             will_init,
     877            8 :                             base_offset: pos.into_u64(),
     878            8 :                             batch_offset: 0,
     879            8 :                             len,
     880            8 :                         },
     881            8 :                         expect,
     882            8 :                     );
     883            8 :                     roundtrip(
     884            8 :                         Args {
     885            8 :                             will_init,
     886            8 :                             base_offset: 0,
     887            8 :                             batch_offset: pos.into_u64(),
     888            8 :                             len,
     889            8 :                         },
     890            8 :                         expect,
     891            8 :                     );
     892            8 :                 }
     893              :             }
     894              :         }
     895              : 
     896              :         // too-large len
     897            1 :         let too_large = Args {
     898            1 :             will_init: false,
     899            1 :             len: MAX_SUPPORTED_BLOB_LEN + 1,
     900            1 :             base_offset: 0,
     901            1 :             batch_offset: 0,
     902            1 :         };
     903            1 :         assert!(IndexEntry::new(too_large).is_err());
     904              : 
     905              :         // too-large pos
     906              :         {
     907            1 :             let too_large = Args {
     908            1 :                 will_init: false,
     909            1 :                 len: 0,
     910            1 :                 base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
     911            1 :                 batch_offset: 0,
     912            1 :             };
     913            1 :             assert!(IndexEntry::new(too_large).is_err());
     914            1 :             let too_large = Args {
     915            1 :                 will_init: false,
     916            1 :                 len: 0,
     917            1 :                 base_offset: 0,
     918            1 :                 batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
     919            1 :             };
     920            1 :             assert!(IndexEntry::new(too_large).is_err());
     921              :         }
     922              : 
     923              :         // too large (base_offset + batch_offset)
     924              :         {
     925            1 :             let too_large = Args {
     926            1 :                 will_init: false,
     927            1 :                 len: 0,
     928            1 :                 base_offset: MAX_SUPPORTED_POS.into_u64(),
     929            1 :                 batch_offset: 1,
     930            1 :             };
     931            1 :             assert!(IndexEntry::new(too_large).is_err());
     932            1 :             let too_large = Args {
     933            1 :                 will_init: false,
     934            1 :                 len: 0,
     935            1 :                 base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
     936            1 :                 batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
     937            1 :             };
     938            1 :             assert!(IndexEntry::new(too_large).is_err());
     939              :         }
     940              : 
     941              :         // valid special cases
     942              :         // - area past the max supported pos that is accessible by len
     943            3 :         for len in [1, MAX_SUPPORTED_BLOB_LEN] {
     944            2 :             roundtrip(
     945            2 :                 Args {
     946            2 :                     will_init: false,
     947            2 :                     len,
     948            2 :                     base_offset: MAX_SUPPORTED_POS.into_u64(),
     949            2 :                     batch_offset: 0,
     950            2 :                 },
     951            2 :                 Unpacked {
     952            2 :                     will_init: false,
     953            2 :                     len: len as u64,
     954            2 :                     pos: MAX_SUPPORTED_POS.into_u64(),
     955            2 :                 },
     956            2 :             );
     957            2 :             roundtrip(
     958            2 :                 Args {
     959            2 :                     will_init: false,
     960            2 :                     len,
     961            2 :                     base_offset: 0,
     962            2 :                     batch_offset: MAX_SUPPORTED_POS.into_u64(),
     963            2 :                 },
     964            2 :                 Unpacked {
     965            2 :                     will_init: false,
     966            2 :                     len: len as u64,
     967            2 :                     pos: MAX_SUPPORTED_POS.into_u64(),
     968            2 :                 },
     969            2 :             );
     970            2 :         }
     971            1 :     }
     972              : }
        

Generated by: LCOV version 2.1-beta