Line data Source code
1 : //! An in-memory layer stores recently received key-value pairs.
2 : //!
3 : //! The "in-memory" part of the name is a bit misleading: the actual page versions are
4 : //! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
5 : //! its position in the file, is kept in memory, though.
6 : //!
7 : use std::cmp::Ordering;
8 : use std::collections::{BTreeMap, HashMap};
9 : use std::fmt::Write;
10 : use std::ops::Range;
11 : use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering};
12 : use std::sync::{Arc, OnceLock};
13 : use std::time::Instant;
14 :
15 : use anyhow::Result;
16 : use camino::Utf8PathBuf;
17 : use pageserver_api::key::{CompactKey, Key};
18 : use pageserver_api::keyspace::KeySpace;
19 : use pageserver_api::models::InMemoryLayerInfo;
20 : use pageserver_api::shard::TenantShardId;
21 : use tokio::sync::RwLock;
22 : use tokio_util::sync::CancellationToken;
23 : use tracing::*;
24 : use utils::id::TimelineId;
25 : use utils::lsn::Lsn;
26 : use utils::vec_map::VecMap;
27 : use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
28 :
29 : use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
30 : use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize};
31 : use crate::config::PageServerConf;
32 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
33 : // avoid binding to Write (conflicts with std::io::Write)
34 : // while being able to use std::fmt::Write's methods
35 : use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
36 : use crate::tenant::ephemeral_file::EphemeralFile;
37 : use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
38 : use crate::tenant::timeline::GetVectoredError;
39 : use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
40 : use crate::{l0_flush, page_cache};
41 :
42 : pub(crate) mod vectored_dio_read;
43 :
44 : #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
45 : pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
46 :
47 : pub struct InMemoryLayer {
48 : conf: &'static PageServerConf,
49 : tenant_shard_id: TenantShardId,
50 : timeline_id: TimelineId,
51 : file_id: InMemoryLayerFileId,
52 :
53 : /// This layer contains all the changes from 'start_lsn'. The
54 : /// start is inclusive.
55 : start_lsn: Lsn,
56 :
57 : /// Frozen layers have an exclusive end LSN.
58 : /// Writes are only allowed when this is `None`.
59 : pub(crate) end_lsn: OnceLock<Lsn>,
60 :
61 : /// Used for the traversal path. Cached string representation of the in-memory layer once it is frozen.
62 : frozen_local_path_str: OnceLock<Arc<str>>,
63 :
64 : opened_at: Instant,
65 :
66 : /// The above fields never change, except for `end_lsn`, which is only set once.
67 : /// All other changing parts are in `inner`, and protected by a read-write lock.
68 : inner: RwLock<InMemoryLayerInner>,
69 :
70 : estimated_in_mem_size: AtomicU64,
71 : }
72 :
73 : impl std::fmt::Debug for InMemoryLayer {
74 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
75 0 : f.debug_struct("InMemoryLayer")
76 0 : .field("start_lsn", &self.start_lsn)
77 0 : .field("end_lsn", &self.end_lsn)
78 0 : .field("inner", &self.inner)
79 0 : .finish()
80 0 : }
81 : }
82 :
83 : pub struct InMemoryLayerInner {
84 : /// All versions of all pages in the layer are kept here. Indexed
85 : /// by key and LSN. The [`IndexEntry`] is an offset into the
86 : /// ephemeral file where the page version is stored.
87 : index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
88 :
89 : /// The values are stored in a serialized format in this file.
90 : /// Each serialized Value is preceded by a 'u32' length field.
91 : /// The [`InMemoryLayerInner::index`] map stores offsets into this file.
92 : file: EphemeralFile,
93 :
94 : resource_units: GlobalResourceUnits,
95 : }
96 :
97 : /// Support the same max blob length as blob_io, because ultimately
98 : /// all the InMemoryLayer contents end up being written into a delta layer,
99 : /// using [`crate::tenant::blob_io`].
100 : const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
101 : const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
102 : let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
103 : let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
104 : assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
105 : trailing_ones
106 : };
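
// Worked example (with a hypothetical value, since the actual constant lives in
// blob_io): if MAX_SUPPORTED_BLOB_LEN were 0x0FFF_FFFF (2^28 - 1), then
// trailing_ones() = 28 and leading_zeros() = 36 on a 64-bit usize, the assert
// above holds, and `len` would occupy 28 bits of the packed [`IndexEntry`].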
107 :
108 : /// See [`InMemoryLayerInner::index`].
109 : ///
110 : /// For memory efficiency, the data is packed into a u64.
111 : ///
112 : /// Layout:
113 : /// - 1 bit: `will_init`
114 : /// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
115 : /// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
116 : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
117 : pub struct IndexEntry(u64);
118 :
119 : impl IndexEntry {
120 : /// See [`Self::MAX_SUPPORTED_POS`].
121 : const MAX_SUPPORTED_POS_BITS: usize = {
122 : let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
123 : if remainder < 32 {
124 : panic!("pos can be u32 as per type system, support that");
125 : }
126 : remainder
127 : };
128 : /// The maximum supported blob offset that can be represented by [`Self`].
129 : /// See also [`Self::validate_checkpoint_distance`].
130 : const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
131 :
132 : // Layout
133 : const WILL_INIT_RANGE: Range<usize> = 0..1;
134 : const LEN_RANGE: Range<usize> =
135 : Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
136 : const POS_RANGE: Range<usize> =
137 : Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
138 : const _ASSERT: () = {
139 : if Self::POS_RANGE.end != 64 {
140 : panic!("we don't want undefined bits for our own sanity")
141 : }
142 : };
143 :
144 : /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
145 : ///
146 : /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too large.
147 : /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
148 : ///
149 : /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
150 : /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
151 : ///
152 : /// TODO: ideally, this check should happen at config parsing time (and in the request handler when a change to checkpoint distance is requested).
153 : /// When cleaning this up, also look into the S3 max file size check that is performed in the delta layer writer.
154 : #[inline(always)]
155 30782640 : fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
156 30782640 : let IndexEntryNewArgs {
157 30782640 : base_offset,
158 30782640 : batch_offset,
159 30782640 : len,
160 30782640 : will_init,
161 30782640 : } = arg;
162 :
163 30782640 : let pos = base_offset
164 30782640 : .checked_add(batch_offset)
165 30782640 : .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
166 :
167 30782640 : if pos.into_usize() > Self::MAX_SUPPORTED_POS {
168 48 : anyhow::bail!(
169 48 : "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
170 48 : max = Self::MAX_SUPPORTED_POS
171 48 : );
172 30782592 : }
173 30782592 :
174 30782592 : if len > MAX_SUPPORTED_BLOB_LEN {
175 12 : anyhow::bail!(
176 12 : "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
177 12 : );
178 30782580 : }
179 30782580 :
180 30782580 : let mut data: u64 = 0;
181 : use bit_field::BitField;
182 30782580 : data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
183 30782580 : data.set_bits(Self::LEN_RANGE, len.into_u64());
184 30782580 : data.set_bits(Self::POS_RANGE, pos);
185 30782580 :
186 30782580 : Ok(Self(data))
187 30782640 : }
188 :
189 : #[inline(always)]
190 37825581 : fn unpack(&self) -> IndexEntryUnpacked {
191 : use bit_field::BitField;
192 37825581 : IndexEntryUnpacked {
193 37825581 : will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
194 37825581 : len: self.0.get_bits(Self::LEN_RANGE),
195 37825581 : pos: self.0.get_bits(Self::POS_RANGE),
196 37825581 : }
197 37825581 : }
198 :
199 : /// See [`Self::new`].
200 1500 : pub(crate) const fn validate_checkpoint_distance(
201 1500 : checkpoint_distance: u64,
202 1500 : ) -> Result<(), &'static str> {
203 1500 : if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
204 0 : return Err("exceeds the maximum supported value");
205 1500 : }
206 1500 : let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
207 1500 : if res.is_none() {
208 0 : return Err(
209 0 : "checkpoint distance + max supported blob len overflows in-memory addition",
210 0 : );
211 1500 : }
212 1500 :
213 1500 : // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
214 1500 :
215 1500 : Ok(())
216 1500 : }
217 :
218 : const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
219 : let res = Self::validate_checkpoint_distance(
220 : pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
221 : );
222 : if res.is_err() {
223 : panic!("default checkpoint distance is valid")
224 : }
225 : };
226 : }
227 :
228 : /// Args to [`IndexEntry::new`].
229 : #[derive(Clone, Copy)]
230 : struct IndexEntryNewArgs {
231 : base_offset: u64,
232 : batch_offset: u64,
233 : len: usize,
234 : will_init: bool,
235 : }
236 :
237 : /// Unpacked representation of the bitfielded [`IndexEntry`].
238 : #[derive(Clone, Copy, PartialEq, Eq, Debug)]
239 : struct IndexEntryUnpacked {
240 : will_init: bool,
241 : len: u64,
242 : pos: u64,
243 : }
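
// Illustrative sketch (not part of the original source): how a value's position
// round-trips through the bit-packed representation. `base_offset` is the length
// of the ephemeral file before the batch was appended, `batch_offset` is the
// value's offset within that batch, and the stored `pos` is their sum.
#[cfg(test)]
#[allow(dead_code)]
fn index_entry_packing_example() {
    let entry = IndexEntry::new(IndexEntryNewArgs {
        base_offset: 4096, // bytes already in the ephemeral file
        batch_offset: 128, // offset of this value inside the serialized batch
        len: 512,          // length of the serialized value
        will_init: true,   // the value is an image / init record
    })
    .expect("offsets far below IndexEntry::MAX_SUPPORTED_POS");

    let IndexEntryUnpacked { pos, len, will_init } = entry.unpack();
    assert_eq!(pos, 4096 + 128);
    assert_eq!(len, 512);
    assert!(will_init);
}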
244 :
245 : impl std::fmt::Debug for InMemoryLayerInner {
246 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
247 0 : f.debug_struct("InMemoryLayerInner").finish()
248 0 : }
249 : }
250 :
251 : /// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
252 : /// to minimize contention.
253 : ///
254 : /// This global state is used to implement behaviors that require a global view of the system, e.g.
255 : /// rolling layers proactively to limit the total amount of dirty data.
256 : pub(crate) struct GlobalResources {
257 : // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
258 : // Zero means unlimited.
259 : pub(crate) max_dirty_bytes: AtomicU64,
260 : // How many bytes are in all EphemeralFile objects
261 : dirty_bytes: AtomicU64,
262 : // How many layers are contributing to dirty_bytes
263 : dirty_layers: AtomicUsize,
264 : }
265 :
266 : // Per-timeline RAII struct for its contribution to [`GlobalResources`]
267 : struct GlobalResourceUnits {
268 : // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
269 : // for decrementing the global counter by this many bytes when dropped.
270 : dirty_bytes: u64,
271 : }
272 :
273 : impl GlobalResourceUnits {
274 : // Hint for the layer append path to update us when the layer size differs from the last
275 : // call to update_size by this much. If we don't reach this threshold, we'll still get
276 : // updated when the Timeline "ticks" in the background.
277 : const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
278 :
279 7908 : fn new() -> Self {
280 7908 : GLOBAL_RESOURCES
281 7908 : .dirty_layers
282 7908 : .fetch_add(1, AtomicOrdering::Relaxed);
283 7908 : Self { dirty_bytes: 0 }
284 7908 : }
285 :
286 : /// Do not call this frequently: all timelines will write to these same global atomics,
287 : /// so this is a relatively expensive operation. Wait at least a few seconds between calls.
288 : ///
289 : /// Returns the effective layer size limit that should be applied, if any, to keep
290 : /// the total number of dirty bytes below the configured maximum.
291 7176 : fn publish_size(&mut self, size: u64) -> Option<u64> {
292 7176 : let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
293 7116 : Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
294 : Ordering::Greater => {
295 48 : let delta = size - self.dirty_bytes;
296 48 : let old = GLOBAL_RESOURCES
297 48 : .dirty_bytes
298 48 : .fetch_add(delta, AtomicOrdering::Relaxed);
299 48 : old + delta
300 : }
301 : Ordering::Less => {
302 12 : let delta = self.dirty_bytes - size;
303 12 : let old = GLOBAL_RESOURCES
304 12 : .dirty_bytes
305 12 : .fetch_sub(delta, AtomicOrdering::Relaxed);
306 12 : old - delta
307 : }
308 : };
309 :
310 : // This is a sloppy update: concurrent updates to the counter will race, and the exact
311 : // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
312 : // That's okay: as long as the metric contains some recent value, it doesn't have to always
313 : // be literally the last update.
314 7176 : TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
315 7176 :
316 7176 : self.dirty_bytes = size;
317 7176 :
318 7176 : let max_dirty_bytes = GLOBAL_RESOURCES
319 7176 : .max_dirty_bytes
320 7176 : .load(AtomicOrdering::Relaxed);
321 7176 : if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
322 : // Set the layer file limit to the average layer size: this implies that all above-average
323 : // sized layers will be eligible for freezing. They will be frozen in the order they
324 : // next enter publish_size.
325 0 : Some(
326 0 : new_global_dirty_bytes
327 0 : / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
328 0 : )
329 : } else {
330 7176 : None
331 : }
332 7176 : }
333 :
334 : // Call publish_size if the input size differs from last published size by more than
335 : // the drift limit
336 28825500 : fn maybe_publish_size(&mut self, size: u64) {
337 28825500 : let publish = match size.cmp(&self.dirty_bytes) {
338 0 : Ordering::Equal => false,
339 28825500 : Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
340 0 : Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
341 : };
342 :
343 28825500 : if publish {
344 48 : self.publish_size(size);
345 28825452 : }
346 28825500 : }
347 : }
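
// Hedged sketch of how the value returned by `publish_size` is meant to be used
// (the real decision lives in the Timeline tick/roll path; this helper is
// hypothetical and only illustrates the contract): when the global dirty-byte
// budget is exceeded, `publish_size` returns the average layer size, and layers
// above that size become candidates for freezing.
#[cfg(test)]
#[allow(dead_code)]
fn should_roll_for_global_pressure(units: &mut GlobalResourceUnits, layer_size: u64) -> bool {
    match units.publish_size(layer_size) {
        // Global dirty bytes exceed the configured maximum: roll this layer if
        // it is larger than the returned limit.
        Some(effective_limit) => layer_size > effective_limit,
        // No global pressure; leave the decision to the checkpoint-distance logic.
        None => false,
    }
}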
348 :
349 : impl Drop for GlobalResourceUnits {
350 7128 : fn drop(&mut self) {
351 7128 : GLOBAL_RESOURCES
352 7128 : .dirty_layers
353 7128 : .fetch_sub(1, AtomicOrdering::Relaxed);
354 7128 :
355 7128 : // Subtract our contribution to the global total dirty bytes
356 7128 : self.publish_size(0);
357 7128 : }
358 : }
359 :
360 : pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
361 : max_dirty_bytes: AtomicU64::new(0),
362 : dirty_bytes: AtomicU64::new(0),
363 : dirty_layers: AtomicUsize::new(0),
364 : };
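
// Hypothetical initialization sketch: the dirty-byte budget is an atomic so it
// can be set from configuration at runtime (0 keeps it unlimited). Only
// `GLOBAL_RESOURCES.max_dirty_bytes` comes from this file; the function itself
// is an assumption for illustration.
#[cfg(test)]
#[allow(dead_code)]
fn set_global_dirty_bytes_limit(limit: Option<u64>) {
    GLOBAL_RESOURCES
        .max_dirty_bytes
        .store(limit.unwrap_or(0), AtomicOrdering::Relaxed);
}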
365 :
366 : impl InMemoryLayer {
367 3763267 : pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
368 3763267 : self.file_id
369 3763267 : }
370 :
371 7152 : pub(crate) fn get_timeline_id(&self) -> TimelineId {
372 7152 : self.timeline_id
373 7152 : }
374 :
375 14256 : pub(crate) fn info(&self) -> InMemoryLayerInfo {
376 14256 : let lsn_start = self.start_lsn;
377 :
378 14256 : if let Some(&lsn_end) = self.end_lsn.get() {
379 14244 : InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
380 : } else {
381 12 : InMemoryLayerInfo::Open { lsn_start }
382 : }
383 14256 : }
384 :
385 14208 : pub(crate) fn try_len(&self) -> Option<u64> {
386 14208 : self.inner.try_read().map(|i| i.file.len()).ok()
387 14208 : }
388 :
389 28825500 : pub(crate) fn assert_writable(&self) {
390 28825500 : assert!(self.end_lsn.get().is_none());
391 28825500 : }
392 :
393 14641790 : pub(crate) fn end_lsn_or_max(&self) -> Lsn {
394 14641790 : self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
395 14641790 : }
396 :
397 14634686 : pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
398 14634686 : self.start_lsn..self.end_lsn_or_max()
399 14634686 : }
400 :
401 : /// debugging function to print out the contents of the layer
402 : ///
403 : /// this is likely completely unused
404 0 : pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
405 0 : let end_str = self.end_lsn_or_max();
406 0 :
407 0 : println!(
408 0 : "----- in-memory layer for tli {} LSNs {}-{} ----",
409 0 : self.timeline_id, self.start_lsn, end_str,
410 0 : );
411 0 :
412 0 : Ok(())
413 0 : }
414 :
415 : // Look up the keys in the provided keyspace and update
416 : // the reconstruct state with whatever is found.
417 3717211 : pub(crate) async fn get_values_reconstruct_data(
418 3717211 : self: &Arc<InMemoryLayer>,
419 3717211 : keyspace: KeySpace,
420 3717211 : lsn_range: Range<Lsn>,
421 3717211 : reconstruct_state: &mut ValuesReconstructState,
422 3717211 : ctx: &RequestContext,
423 3717211 : ) -> Result<(), GetVectoredError> {
424 3717211 : let ctx = RequestContextBuilder::from(ctx)
425 3717211 : .page_content_kind(PageContentKind::InMemoryLayer)
426 3717211 : .attached_child();
427 :
428 3717211 : let inner = self.inner.read().await;
429 :
430 : struct ValueRead {
431 : entry_lsn: Lsn,
432 : read: vectored_dio_read::LogicalRead<Vec<u8>>,
433 : }
434 3717211 : let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
435 3717211 : let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
436 :
437 3760963 : for range in keyspace.ranges.iter() {
438 3760963 : for (key, vec_map) in inner
439 3760963 : .index
440 3760963 : .range(range.start.to_compact()..range.end.to_compact())
441 : {
442 3198525 : let key = Key::from_compact(*key);
443 3198525 : let slice = vec_map.slice_range(lsn_range.clone());
444 :
445 11512245 : for (entry_lsn, index_entry) in slice.iter().rev() {
446 : let IndexEntryUnpacked {
447 11512245 : pos,
448 11512245 : len,
449 11512245 : will_init,
450 11512245 : } = index_entry.unpack();
451 11512245 :
452 11512245 : reads.entry(key).or_default().push(ValueRead {
453 11512245 : entry_lsn: *entry_lsn,
454 11512245 : read: vectored_dio_read::LogicalRead::new(
455 11512245 : pos,
456 11512245 : Vec::with_capacity(len as usize),
457 11512245 : ),
458 11512245 : });
459 11512245 :
460 11512245 : let io = reconstruct_state.update_key(&key, *entry_lsn, will_init);
461 11512245 : ios.insert((key, *entry_lsn), io);
462 11512245 :
463 11512245 : if will_init {
464 3074301 : break;
465 8437944 : }
466 : }
467 : }
468 : }
469 3717211 : drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
470 3717211 : let read_from = Arc::clone(self);
471 3717211 : let read_ctx = ctx.attached_child();
472 3717211 : reconstruct_state
473 3717211 : .spawn_io(async move {
474 3717211 : let inner = read_from.inner.read().await;
475 3717211 : let f = vectored_dio_read::execute(
476 3717211 : &inner.file,
477 3717211 : reads
478 3717211 : .iter()
479 11512245 : .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
480 3717211 : &read_ctx,
481 3717211 : );
482 3717211 : send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
483 3717211 : .await;
484 :
485 6915712 : for (key, value_reads) in reads {
486 14710746 : for ValueRead { entry_lsn, read } in value_reads {
487 11512245 : let io = ios.remove(&(key, entry_lsn)).expect("sender must exist");
488 11512245 : match read.into_result().expect("we run execute() above") {
489 0 : Err(e) => {
490 0 : io.complete(Err(std::io::Error::new(
491 0 : e.kind(),
492 0 : "dio vec read failed",
493 0 : )));
494 0 : }
495 11512245 : Ok(value_buf) => {
496 11512245 : io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into())));
497 11512245 : }
498 : }
499 : }
500 : }
501 :
502 3717211 : assert!(ios.is_empty());
503 :
504 : // Keep layer existent until this IO is done;
505 : // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
506 : // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
507 : // drop for consistency among all three layer types.
508 3717211 : drop(inner);
509 3717211 : drop(read_from);
510 3717211 : })
511 3717211 : .await;
512 :
513 3717211 : Ok(())
514 3717211 : }
515 : }
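
// Minimal sketch (hypothetical helper, not in the original source) of the
// per-key traversal rule used by `get_values_reconstruct_data` above: walk the
// versions from newest to oldest and stop at the first `will_init` record,
// because an init record (page image) does not need any older history.
#[cfg(test)]
#[allow(dead_code)]
fn versions_needed_for_reconstruction(
    versions: &[(Lsn, IndexEntry)], // ascending by LSN, as stored in the VecMap
) -> Vec<(Lsn, IndexEntryUnpacked)> {
    let mut needed = Vec::new();
    for (lsn, entry) in versions.iter().rev() {
        let unpacked = entry.unpack();
        needed.push((*lsn, unpacked));
        if unpacked.will_init {
            break; // older versions are irrelevant for reconstruction
        }
    }
    needed
}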
516 :
517 14256 : fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
518 14256 : write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
519 14256 : }
520 :
521 7152 : fn inmem_layer_log_display(
522 7152 : mut f: impl Write,
523 7152 : timeline: TimelineId,
524 7152 : start_lsn: Lsn,
525 7152 : end_lsn: Lsn,
526 7152 : ) -> std::fmt::Result {
527 7152 : write!(f, "timeline {} in-memory ", timeline)?;
528 7152 : inmem_layer_display(f, start_lsn, end_lsn)
529 7152 : }
530 :
531 : impl std::fmt::Display for InMemoryLayer {
532 7104 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
533 7104 : let end_lsn = self.end_lsn_or_max();
534 7104 : inmem_layer_display(f, self.start_lsn, end_lsn)
535 7104 : }
536 : }
537 :
538 : impl InMemoryLayer {
539 : /// Get layer size.
540 7848 : pub async fn size(&self) -> Result<u64> {
541 7848 : let inner = self.inner.read().await;
542 7848 : Ok(inner.file.len())
543 7848 : }
544 :
545 7264 : pub fn estimated_in_mem_size(&self) -> u64 {
546 7264 : self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
547 7264 : }
548 :
549 : /// Create a new, empty, in-memory layer
550 7908 : pub async fn create(
551 7908 : conf: &'static PageServerConf,
552 7908 : timeline_id: TimelineId,
553 7908 : tenant_shard_id: TenantShardId,
554 7908 : start_lsn: Lsn,
555 7908 : gate: &utils::sync::gate::Gate,
556 7908 : cancel: &CancellationToken,
557 7908 : ctx: &RequestContext,
558 7908 : ) -> Result<InMemoryLayer> {
559 7908 : trace!(
560 0 : "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"
561 : );
562 :
563 7908 : let file =
564 7908 : EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, cancel, ctx).await?;
565 7908 : let key = InMemoryLayerFileId(file.page_cache_file_id());
566 7908 :
567 7908 : Ok(InMemoryLayer {
568 7908 : file_id: key,
569 7908 : frozen_local_path_str: OnceLock::new(),
570 7908 : conf,
571 7908 : timeline_id,
572 7908 : tenant_shard_id,
573 7908 : start_lsn,
574 7908 : end_lsn: OnceLock::new(),
575 7908 : opened_at: Instant::now(),
576 7908 : inner: RwLock::new(InMemoryLayerInner {
577 7908 : index: BTreeMap::new(),
578 7908 : file,
579 7908 : resource_units: GlobalResourceUnits::new(),
580 7908 : }),
581 7908 : estimated_in_mem_size: AtomicU64::new(0),
582 7908 : })
583 7908 : }
584 :
585 : /// Write path.
586 : ///
587 : /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
588 : /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
589 : /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
590 28825500 : pub async fn put_batch(
591 28825500 : &self,
592 28825500 : serialized_batch: SerializedValueBatch,
593 28825500 : ctx: &RequestContext,
594 28825500 : ) -> anyhow::Result<()> {
595 28825500 : let mut inner = self.inner.write().await;
596 28825500 : self.assert_writable();
597 28825500 :
598 28825500 : let base_offset = inner.file.len();
599 28825500 :
600 28825500 : let SerializedValueBatch {
601 28825500 : raw,
602 28825500 : metadata,
603 28825500 : max_lsn: _,
604 28825500 : len: _,
605 28825500 : } = serialized_batch;
606 28825500 :
607 28825500 : // Write the batch to the file
608 28825500 : inner.file.write_raw(&raw, ctx).await?;
609 28825500 : let new_size = inner.file.len();
610 28825500 :
611 28825500 : let expected_new_len = base_offset
612 28825500 : .checked_add(raw.len().into_u64())
613 28825500 : // write_raw would error if we were to overflow u64.
614 28825500 : // also IndexEntry and higher levels in
615 28825500 : // the code don't allow the file to grow that large
616 28825500 : .unwrap();
617 28825500 : assert_eq!(new_size, expected_new_len);
618 :
619 : // Update the index with the new entries
620 59607840 : for meta in metadata {
621 : let SerializedValueMeta {
622 30782340 : key,
623 30782340 : lsn,
624 30782340 : batch_offset,
625 30782340 : len,
626 30782340 : will_init,
627 30782340 : } = match meta {
628 30782340 : ValueMeta::Serialized(ser) => ser,
629 : ValueMeta::Observed(_) => {
630 0 : continue;
631 : }
632 : };
633 :
634 : // Add the base_offset to the batch's index entries which are relative to the batch start.
635 30782340 : let index_entry = IndexEntry::new(IndexEntryNewArgs {
636 30782340 : base_offset,
637 30782340 : batch_offset,
638 30782340 : len,
639 30782340 : will_init,
640 30782340 : })?;
641 :
642 30782340 : let vec_map = inner.index.entry(key).or_default();
643 30782340 : let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
644 30782340 : if old.is_some() {
645 : // This should not break anything, but is unexpected: ingestion code aims to filter out
646 : // multiple writes to the same key at the same LSN. This happens in cases where our
647 : // ingestion code generates some write like an empty page, and we see a write from Postgres
648 : // to the same key in the same wal record. If one such write makes it through, we
649 : // index the most recent write, implicitly ignoring the earlier write. We log a warning
650 : // because this case is unexpected, and we would like tests to fail if this happens.
651 0 : warn!("Key {} at {} written twice at same LSN", key, lsn);
652 30782340 : }
653 30782340 : self.estimated_in_mem_size.fetch_add(
654 30782340 : (std::mem::size_of::<CompactKey>()
655 30782340 : + std::mem::size_of::<Lsn>()
656 30782340 : + std::mem::size_of::<IndexEntry>()) as u64,
657 30782340 : AtomicOrdering::Relaxed,
658 30782340 : );
659 : }
660 :
661 28825500 : inner.resource_units.maybe_publish_size(new_size);
662 28825500 :
663 28825500 : Ok(())
664 28825500 : }
665 :
666 28818060 : pub(crate) fn get_opened_at(&self) -> Instant {
667 28818060 : self.opened_at
668 28818060 : }
669 :
670 0 : pub(crate) async fn tick(&self) -> Option<u64> {
671 0 : let mut inner = self.inner.write().await;
672 0 : let size = inner.file.len();
673 0 : inner.resource_units.publish_size(size)
674 0 : }
675 :
676 12 : pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
677 12 : // TODO: Currently, we just leak the storage for any deleted keys
678 12 : Ok(())
679 12 : }
680 :
681 : /// Records the end_lsn for non-dropped layers.
682 : /// `end_lsn` is exclusive
683 7152 : pub async fn freeze(&self, end_lsn: Lsn) {
684 7152 : assert!(
685 7152 : self.start_lsn < end_lsn,
686 0 : "{} >= {}",
687 : self.start_lsn,
688 : end_lsn
689 : );
690 7152 : self.end_lsn.set(end_lsn).expect("end_lsn set only once");
691 7152 :
692 7152 : self.frozen_local_path_str
693 7152 : .set({
694 7152 : let mut buf = String::new();
695 7152 : inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
696 7152 : .unwrap();
697 7152 : buf.into()
698 7152 : })
699 7152 : .expect("frozen_local_path_str set only once");
700 :
701 : #[cfg(debug_assertions)]
702 : {
703 7152 : let inner = self.inner.write().await;
704 25539443 : for vec_map in inner.index.values() {
705 26559780 : for (lsn, _) in vec_map.as_slice() {
706 26559780 : assert!(*lsn < end_lsn);
707 : }
708 : }
709 : }
710 7152 : }
711 :
712 : /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
713 : /// layer will only contain the key range the user specifies; the function returns `None`
714 : /// if there are no matching keys.
715 : ///
716 : /// Returns a new delta layer with all the same data as this in-memory layer
717 5808 : pub async fn write_to_disk(
718 5808 : &self,
719 5808 : ctx: &RequestContext,
720 5808 : key_range: Option<Range<Key>>,
721 5808 : l0_flush_global_state: &l0_flush::Inner,
722 5808 : gate: &utils::sync::gate::Gate,
723 5808 : cancel: CancellationToken,
724 5808 : ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
725 : // Grab the lock in read-mode. We hold it over the I/O, but because this
726 : // layer is not writeable anymore, no one should be trying to acquire the
727 : // write lock on it, so we shouldn't block anyone. There's one exception
728 : // though: another thread might have grabbed a reference to this layer
729 : // in `get_layer_for_write` just before the checkpointer called
730 : // `freeze`, and then `write_to_disk` on it. When the thread gets the
731 : // lock, it will see that it's not writeable anymore and retry, but it
732 : // would have to wait until we release it. That race condition is very
733 : // rare though, so we just accept the potential latency hit for now.
734 5808 : let inner = self.inner.read().await;
735 :
736 : use l0_flush::Inner;
737 5808 : let _concurrency_permit = match l0_flush_global_state {
738 5808 : Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
739 : };
740 :
741 5808 : let end_lsn = *self.end_lsn.get().unwrap();
742 :
743 5808 : let key_count = if let Some(key_range) = key_range {
744 0 : let key_range = key_range.start.to_compact()..key_range.end.to_compact();
745 0 :
746 0 : inner
747 0 : .index
748 0 : .iter()
749 0 : .filter(|(k, _)| key_range.contains(k))
750 0 : .count()
751 : } else {
752 5808 : inner.index.len()
753 : };
754 5808 : if key_count == 0 {
755 0 : return Ok(None);
756 5808 : }
757 :
758 5808 : let mut delta_layer_writer = DeltaLayerWriter::new(
759 5808 : self.conf,
760 5808 : self.timeline_id,
761 5808 : self.tenant_shard_id,
762 5808 : Key::MIN,
763 5808 : self.start_lsn..end_lsn,
764 5808 : gate,
765 5808 : cancel,
766 5808 : ctx,
767 5808 : )
768 5808 : .await?;
769 :
770 5808 : match l0_flush_global_state {
771 : l0_flush::Inner::Direct { .. } => {
772 5808 : let file_contents = inner.file.load_to_io_buf(ctx).await?;
773 5808 : let file_contents = file_contents.freeze();
774 :
775 25527983 : for (key, vec_map) in inner.index.iter() {
776 : // Write all page versions
777 26313096 : for (lsn, entry) in vec_map
778 25527983 : .as_slice()
779 25527983 : .iter()
780 26313096 : .map(|(lsn, entry)| (lsn, entry.unpack()))
781 : {
782 : let IndexEntryUnpacked {
783 26313096 : pos,
784 26313096 : len,
785 26313096 : will_init,
786 26313096 : } = entry;
787 26313096 : let buf = file_contents.slice(pos as usize..(pos + len) as usize);
788 26313096 : let (_buf, res) = delta_layer_writer
789 26313096 : .put_value_bytes(
790 26313096 : Key::from_compact(*key),
791 26313096 : *lsn,
792 26313096 : buf.slice_len(),
793 26313096 : will_init,
794 26313096 : ctx,
795 26313096 : )
796 26313096 : .await;
797 26313096 : res?;
798 : }
799 : }
800 : }
801 : }
802 :
803 : // MAX is used here because we identify L0 layers by full key range
804 5808 : let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
805 :
806 : // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
807 : //
808 : // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
809 : // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
810 : // Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
811 : //
812 : // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
813 : // we dirtied when writing to the filesystem have been flushed and marked !dirty.
814 5808 : drop(_concurrency_permit);
815 5808 :
816 5808 : Ok(Some((desc, path)))
817 5808 : }
818 : }
819 :
820 : #[cfg(test)]
821 : mod tests {
822 : use super::*;
823 :
824 : #[test]
825 12 : fn test_index_entry() {
826 : const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
827 : use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked};
828 :
829 240 : let roundtrip = |args, expect: Unpacked| {
830 240 : let res = IndexEntry::new(args).expect("this tests expects no errors");
831 240 : let IndexEntryUnpacked {
832 240 : will_init,
833 240 : len,
834 240 : pos,
835 240 : } = res.unpack();
836 240 : assert_eq!(will_init, expect.will_init);
837 240 : assert_eq!(len, expect.len);
838 240 : assert_eq!(pos, expect.pos);
839 240 : };
840 :
841 : // basic roundtrip
842 36 : for pos in [0, MAX_SUPPORTED_POS] {
843 72 : for len in [0, MAX_SUPPORTED_BLOB_LEN] {
844 144 : for will_init in [true, false] {
845 96 : let expect = Unpacked {
846 96 : will_init,
847 96 : len: len.into_u64(),
848 96 : pos: pos.into_u64(),
849 96 : };
850 96 : roundtrip(
851 96 : Args {
852 96 : will_init,
853 96 : base_offset: pos.into_u64(),
854 96 : batch_offset: 0,
855 96 : len,
856 96 : },
857 96 : expect,
858 96 : );
859 96 : roundtrip(
860 96 : Args {
861 96 : will_init,
862 96 : base_offset: 0,
863 96 : batch_offset: pos.into_u64(),
864 96 : len,
865 96 : },
866 96 : expect,
867 96 : );
868 96 : }
869 : }
870 : }
871 :
872 : // too-large len
873 12 : let too_large = Args {
874 12 : will_init: false,
875 12 : len: MAX_SUPPORTED_BLOB_LEN + 1,
876 12 : base_offset: 0,
877 12 : batch_offset: 0,
878 12 : };
879 12 : assert!(IndexEntry::new(too_large).is_err());
880 :
881 : // too-large pos
882 : {
883 12 : let too_large = Args {
884 12 : will_init: false,
885 12 : len: 0,
886 12 : base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
887 12 : batch_offset: 0,
888 12 : };
889 12 : assert!(IndexEntry::new(too_large).is_err());
890 12 : let too_large = Args {
891 12 : will_init: false,
892 12 : len: 0,
893 12 : base_offset: 0,
894 12 : batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
895 12 : };
896 12 : assert!(IndexEntry::new(too_large).is_err());
897 : }
898 :
899 : // too large (base_offset + batch_offset)
900 : {
901 12 : let too_large = Args {
902 12 : will_init: false,
903 12 : len: 0,
904 12 : base_offset: MAX_SUPPORTED_POS.into_u64(),
905 12 : batch_offset: 1,
906 12 : };
907 12 : assert!(IndexEntry::new(too_large).is_err());
908 12 : let too_large = Args {
909 12 : will_init: false,
910 12 : len: 0,
911 12 : base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
912 12 : batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
913 12 : };
914 12 : assert!(IndexEntry::new(too_large).is_err());
915 : }
916 :
917 : // valid special cases
918 : // - area past the max supported pos that is accessible by len
919 36 : for len in [1, MAX_SUPPORTED_BLOB_LEN] {
920 24 : roundtrip(
921 24 : Args {
922 24 : will_init: false,
923 24 : len,
924 24 : base_offset: MAX_SUPPORTED_POS.into_u64(),
925 24 : batch_offset: 0,
926 24 : },
927 24 : Unpacked {
928 24 : will_init: false,
929 24 : len: len as u64,
930 24 : pos: MAX_SUPPORTED_POS.into_u64(),
931 24 : },
932 24 : );
933 24 : roundtrip(
934 24 : Args {
935 24 : will_init: false,
936 24 : len,
937 24 : base_offset: 0,
938 24 : batch_offset: MAX_SUPPORTED_POS.into_u64(),
939 24 : },
940 24 : Unpacked {
941 24 : will_init: false,
942 24 : len: len as u64,
943 24 : pos: MAX_SUPPORTED_POS.into_u64(),
944 24 : },
945 24 : );
946 24 : }
947 12 : }
948 : }