Line data Source code
1 : //! An in-memory layer stores recently received key-value pairs.
2 : //!
3 : //! The "in-memory" part of the name is a bit misleading: the actual page versions are
4 : //! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
5 : //! its position in the file, is kept in memory, though.
6 : //!
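//! A minimal editorial sketch of that split, with hypothetical names (`Sketch`, `put`, `get`)
//! and a plain `Vec<u8>` standing in for the ephemeral file:
//!
//! ```
//! use std::collections::BTreeMap;
//!
//! struct Sketch {
//!     file: Vec<u8>,                            // stand-in for the on-disk ephemeral file
//!     index: BTreeMap<(u64, u64), (u64, u64)>,  // (key, lsn) -> (offset, len), kept in memory
//! }
//!
//! impl Sketch {
//!     fn put(&mut self, key: u64, lsn: u64, value: &[u8]) {
//!         let offset = self.file.len() as u64;
//!         self.file.extend_from_slice(value);
//!         self.index.insert((key, lsn), (offset, value.len() as u64));
//!     }
//!
//!     fn get(&self, key: u64, lsn: u64) -> Option<&[u8]> {
//!         let &(off, len) = self.index.get(&(key, lsn))?;
//!         Some(&self.file[off as usize..(off + len) as usize])
//!     }
//! }
//! ```
//!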
7 : use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
8 : use crate::config::PageServerConf;
9 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
10 : use crate::repository::{Key, Value};
11 : use crate::tenant::ephemeral_file::EphemeralFile;
12 : use crate::tenant::timeline::GetVectoredError;
13 : use crate::tenant::PageReconstructError;
14 : use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
15 : use crate::{l0_flush, page_cache};
16 : use anyhow::{anyhow, Context, Result};
17 : use bytes::Bytes;
18 : use camino::Utf8PathBuf;
19 : use pageserver_api::key::CompactKey;
20 : use pageserver_api::keyspace::KeySpace;
21 : use pageserver_api::models::InMemoryLayerInfo;
22 : use pageserver_api::shard::TenantShardId;
23 : use std::collections::{BTreeMap, HashMap};
24 : use std::sync::{Arc, OnceLock};
25 : use std::time::Instant;
26 : use tracing::*;
27 : use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
28 : // avoid binding to Write (conflicts with std::io::Write)
29 : // while being able to use std::fmt::Write's methods
30 : use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
31 : use std::cmp::Ordering;
32 : use std::fmt::Write;
33 : use std::ops::Range;
34 : use std::sync::atomic::Ordering as AtomicOrdering;
35 : use std::sync::atomic::{AtomicU64, AtomicUsize};
36 : use tokio::sync::RwLock;
37 :
38 : use super::{
39 : DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
40 : };
41 :
42 : pub(crate) mod vectored_dio_read;
43 :
44 : #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
45 : pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
46 :
47 : pub struct InMemoryLayer {
48 : conf: &'static PageServerConf,
49 : tenant_shard_id: TenantShardId,
50 : timeline_id: TimelineId,
51 : file_id: InMemoryLayerFileId,
52 :
53 : /// This layer contains all the changes from 'start_lsn'. The
54 : /// start is inclusive.
55 : start_lsn: Lsn,
56 :
57 : /// Frozen layers have an exclusive end LSN.
58 : /// Writes are only allowed when this is `None`.
59 : pub(crate) end_lsn: OnceLock<Lsn>,
60 :
61 : /// Used for the traversal path. Cached string representation of the in-memory layer, set once it is frozen.
62 : frozen_local_path_str: OnceLock<Arc<str>>,
63 :
64 : opened_at: Instant,
65 :
66 : /// The above fields never change, except for `end_lsn`, which is only set once.
67 : /// All other mutable state is in `inner`, protected by an `RwLock`.
68 : inner: RwLock<InMemoryLayerInner>,
69 : }
70 :
71 : impl std::fmt::Debug for InMemoryLayer {
72 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
73 0 : f.debug_struct("InMemoryLayer")
74 0 : .field("start_lsn", &self.start_lsn)
75 0 : .field("end_lsn", &self.end_lsn)
76 0 : .field("inner", &self.inner)
77 0 : .finish()
78 0 : }
79 : }
80 :
81 : pub struct InMemoryLayerInner {
82 : /// All versions of all pages in the layer are kept here. Indexed
83 : /// by block number and LSN. The [`IndexEntry`] is an offset into the
84 : /// ephemeral file where the page version is stored.
85 : index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
86 :
87 : /// The values are stored in a serialized format in this file.
88 : /// The [`InMemoryLayerInner::index`] map above records, for each key and LSN,
89 : /// the offset and length of the serialized `Value` within this file.
90 : file: EphemeralFile,
91 :
92 : resource_units: GlobalResourceUnits,
93 : }
94 :
95 : /// Support the same max blob length as blob_io, because ultimately
96 : /// all the InMemoryLayer contents end up being written into a delta layer,
97 : /// using [`crate::tenant::blob_io`].
98 : const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
99 : const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
100 : let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
101 : let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
102 : assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
103 : trailing_ones
104 : };
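// Editorial note (illustration with a hypothetical value): `trailing_ones()` of a limit of the
// form 2^n - 1 yields n, and the assert above doubles as a check that the limit really has that
// all-ones form. For example, a limit of 0x0fff_ffff has 28 trailing ones and, on a 64-bit
// target, 36 leading zeros, so 28 + 36 == 64 and MAX_SUPPORTED_BLOB_LEN_BITS would be 28.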
105 :
106 : /// See [`InMemoryLayerInner::index`].
107 : ///
108 : /// For memory efficiency, the data is packed into a u64.
109 : ///
110 : /// Layout:
111 : /// - 1 bit: `will_init`
112 : /// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
113 : /// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
114 : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
115 : pub struct IndexEntry(u64);
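// Editorial sketch, not part of the original source: a standalone demonstration of the same
// bit-packing scheme using the `bit_field` crate. The 28-bit `len` width used below is an
// assumption for illustration only; the real width is `MAX_SUPPORTED_BLOB_LEN_BITS`.
#[allow(dead_code)]
fn index_entry_packing_sketch() {
    use bit_field::BitField;
    let (will_init, len, pos) = (true, 8192u64, 4096u64);
    let mut data: u64 = 0;
    data.set_bits(0..1, u64::from(will_init)); // 1 bit: will_init
    data.set_bits(1..29, len);                 // next 28 bits: len (assumed width)
    data.set_bits(29..64, pos);                // remaining 35 bits: pos
    assert_eq!(data.get_bits(0..1), 1);
    assert_eq!(data.get_bits(1..29), len);
    assert_eq!(data.get_bits(29..64), pos);
}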
116 :
117 : impl IndexEntry {
118 : /// See [`Self::MAX_SUPPORTED_POS`].
119 : const MAX_SUPPORTED_POS_BITS: usize = {
120 : let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
121 : if remainder < 32 {
122 : panic!("pos can be u32 as per type system, support that");
123 : }
124 : remainder
125 : };
126 : /// The maximum supported blob offset that can be represented by [`Self`].
127 : /// See also [`Self::validate_checkpoint_distance`].
128 : const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
129 :
130 : // Layout
131 : const WILL_INIT_RANGE: Range<usize> = 0..1;
132 : const LEN_RANGE: Range<usize> =
133 : Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
134 : const POS_RANGE: Range<usize> =
135 : Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
136 : const _ASSERT: () = {
137 : if Self::POS_RANGE.end != 64 {
138 : panic!("we don't want undefined bits for our own sanity")
139 : }
140 : };
141 :
142 : /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
143 : ///
144 : /// The only reason that can happen in the system is if the [`InMemoryLayer`] grows too large.
145 : /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
146 : ///
147 : /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
148 : /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
149 : ///
150 : /// TODO: ideally this check should happen at config parsing time (and in the request handler when a change to checkpoint distance is requested).
151 : /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer.
152 : #[inline(always)]
153 30543126 : fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
154 30543126 : let IndexEntryNewArgs {
155 30543126 : base_offset,
156 30543126 : batch_offset,
157 30543126 : len,
158 30543126 : will_init,
159 30543126 : } = arg;
160 :
161 30543126 : let pos = base_offset
162 30543126 : .checked_add(batch_offset)
163 30543126 : .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
164 :
165 30543126 : if pos.into_usize() > Self::MAX_SUPPORTED_POS {
166 24 : anyhow::bail!(
167 24 : "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
168 24 : max = Self::MAX_SUPPORTED_POS
169 24 : );
170 30543102 : }
171 30543102 :
172 30543102 : if len > MAX_SUPPORTED_BLOB_LEN {
173 6 : anyhow::bail!(
174 6 : "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
175 6 : );
176 30543096 : }
177 30543096 :
178 30543096 : let mut data: u64 = 0;
179 : use bit_field::BitField;
180 30543096 : data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
181 30543096 : data.set_bits(Self::LEN_RANGE, len.into_u64());
182 30543096 : data.set_bits(Self::POS_RANGE, pos);
183 30543096 :
184 30543096 : Ok(Self(data))
185 30543126 : }
186 :
187 : #[inline(always)]
188 29925488 : fn unpack(&self) -> IndexEntryUnpacked {
189 : use bit_field::BitField;
190 29925488 : IndexEntryUnpacked {
191 29925488 : will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
192 29925488 : len: self.0.get_bits(Self::LEN_RANGE),
193 29925488 : pos: self.0.get_bits(Self::POS_RANGE),
194 29925488 : }
195 29925488 : }
196 :
197 : /// See [`Self::new`].
198 618 : pub(crate) const fn validate_checkpoint_distance(
199 618 : checkpoint_distance: u64,
200 618 : ) -> Result<(), &'static str> {
201 618 : if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
202 0 : return Err("exceeds the maximum supported value");
203 618 : }
204 618 : let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
205 618 : if res.is_none() {
206 0 : return Err(
207 0 : "checkpoint distance + max supported blob len overflows in-memory addition",
208 0 : );
209 618 : }
210 618 :
211 618 : // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
212 618 :
213 618 : Ok(())
214 618 : }
215 :
216 : const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
217 : let res = Self::validate_checkpoint_distance(
218 : pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
219 : );
220 : if res.is_err() {
221 : panic!("default checkpoint distance is valid")
222 : }
223 : };
224 : }
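// Editorial sketch (hypothetical helper, illustration only): how a startup or config-change
// path might use the validation above, per the doc comment on [`IndexEntry::new`].
#[allow(dead_code)]
fn validate_new_checkpoint_distance_sketch(new_distance: u64) -> anyhow::Result<()> {
    IndexEntry::validate_checkpoint_distance(new_distance)
        .map_err(|msg| anyhow::anyhow!("invalid checkpoint_distance {new_distance}: {msg}"))
}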
225 :
226 : /// Args to [`IndexEntry::new`].
227 : #[derive(Clone, Copy)]
228 : struct IndexEntryNewArgs {
229 : base_offset: u64,
230 : batch_offset: u64,
231 : len: usize,
232 : will_init: bool,
233 : }
234 :
235 : /// Unpacked representation of the bitfielded [`IndexEntry`].
236 : #[derive(Clone, Copy, PartialEq, Eq, Debug)]
237 : struct IndexEntryUnpacked {
238 : will_init: bool,
239 : len: u64,
240 : pos: u64,
241 : }
242 :
243 : impl std::fmt::Debug for InMemoryLayerInner {
244 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
245 0 : f.debug_struct("InMemoryLayerInner").finish()
246 0 : }
247 : }
248 :
249 : /// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
250 : /// to minimize contention.
251 : ///
252 : /// This global state is used to implement behaviors that require a global view of the system, e.g.
253 : /// rolling layers proactively to limit the total amount of dirty data.
254 : pub(crate) struct GlobalResources {
255 : // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
256 : // Zero means unlimited.
257 : pub(crate) max_dirty_bytes: AtomicU64,
258 : // How many bytes are in all EphemeralFile objects
259 : dirty_bytes: AtomicU64,
260 : // How many layers are contributing to dirty_bytes
261 : dirty_layers: AtomicUsize,
262 : }
263 :
264 : // Per-timeline RAII struct for its contribution to [`GlobalResources`]
265 : struct GlobalResourceUnits {
266 : // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
267 : // for decrementing the global counter by this many bytes when dropped.
268 : dirty_bytes: u64,
269 : }
270 :
271 : impl GlobalResourceUnits {
272 : // Hint for the layer append path to update us when the layer size differs from the last
273 : // call to publish_size by this much. If we don't reach this threshold, we'll still get
274 : // updated when the Timeline "ticks" in the background.
275 : const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
276 :
277 3816 : fn new() -> Self {
278 3816 : GLOBAL_RESOURCES
279 3816 : .dirty_layers
280 3816 : .fetch_add(1, AtomicOrdering::Relaxed);
281 3816 : Self { dirty_bytes: 0 }
282 3816 : }
283 :
284 : /// Do not call this frequently: all timelines will write to these same global atomics,
285 : /// so this is a relatively expensive operation. Wait at least a few seconds between calls.
286 : ///
287 : /// Returns the effective layer size limit that should be applied, if any, to keep
288 : /// the total number of dirty bytes below the configured maximum.
289 3456 : fn publish_size(&mut self, size: u64) -> Option<u64> {
290 3456 : let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
291 3426 : Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
292 : Ordering::Greater => {
293 24 : let delta = size - self.dirty_bytes;
294 24 : let old = GLOBAL_RESOURCES
295 24 : .dirty_bytes
296 24 : .fetch_add(delta, AtomicOrdering::Relaxed);
297 24 : old + delta
298 : }
299 : Ordering::Less => {
300 6 : let delta = self.dirty_bytes - size;
301 6 : let old = GLOBAL_RESOURCES
302 6 : .dirty_bytes
303 6 : .fetch_sub(delta, AtomicOrdering::Relaxed);
304 6 : old - delta
305 : }
306 : };
307 :
308 : // This is a sloppy update: concurrent updates to the counter will race, and the
309 : // metric might not reflect the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
310 : // That's okay: as long as the metric contains some recent value, it doesn't have to always
311 : // be literally the last update.
312 3456 : TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
313 3456 :
314 3456 : self.dirty_bytes = size;
315 3456 :
316 3456 : let max_dirty_bytes = GLOBAL_RESOURCES
317 3456 : .max_dirty_bytes
318 3456 : .load(AtomicOrdering::Relaxed);
319 3456 : if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
320 : // Set the layer file limit to the average layer size: this implies that all above-average
321 : // sized layers will be eligible for freezing. They will be frozen in the order they
322 : // next enter publish_size.
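// Editorial example with made-up numbers: if max_dirty_bytes is 1000 MB and
// new_global_dirty_bytes is 2000 MB across 10 dirty layers, the returned limit is
// 200 MB, so any layer larger than 200 MB becomes eligible for freezing when it
// next publishes its size.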
323 0 : Some(
324 0 : new_global_dirty_bytes
325 0 : / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
326 0 : )
327 : } else {
328 3456 : None
329 : }
330 3456 : }
331 :
332 : // Call publish_size if the input size differs from last published size by more than
333 : // the drift limit
334 14412648 : fn maybe_publish_size(&mut self, size: u64) {
335 14412648 : let publish = match size.cmp(&self.dirty_bytes) {
336 0 : Ordering::Equal => false,
337 14412648 : Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
338 0 : Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
339 : };
340 :
341 14412648 : if publish {
342 24 : self.publish_size(size);
343 14412624 : }
344 14412648 : }
345 : }
346 :
347 : impl Drop for GlobalResourceUnits {
348 3432 : fn drop(&mut self) {
349 3432 : GLOBAL_RESOURCES
350 3432 : .dirty_layers
351 3432 : .fetch_sub(1, AtomicOrdering::Relaxed);
352 3432 :
353 3432 : // Subtract our contribution to the global total dirty bytes
354 3432 : self.publish_size(0);
355 3432 : }
356 : }
357 :
358 : pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
359 : max_dirty_bytes: AtomicU64::new(0),
360 : dirty_bytes: AtomicU64::new(0),
361 : dirty_layers: AtomicUsize::new(0),
362 : };
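// Editorial sketch (hypothetical names): the general shape of the pattern above — a
// process-wide relaxed counter plus a per-owner "last published" value, so each owner
// publishes only deltas and can remove its contribution without a global recomputation.
#[allow(dead_code)]
fn publish_delta_sketch(global: &AtomicU64, last_published: &mut u64, size: u64) {
    if size >= *last_published {
        global.fetch_add(size - *last_published, AtomicOrdering::Relaxed);
    } else {
        global.fetch_sub(*last_published - size, AtomicOrdering::Relaxed);
    }
    *last_published = size;
}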
363 :
364 : impl InMemoryLayer {
365 1819447 : pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
366 1819447 : self.file_id
367 1819447 : }
368 :
369 3426 : pub(crate) fn get_timeline_id(&self) -> TimelineId {
370 3426 : self.timeline_id
371 3426 : }
372 :
373 0 : pub(crate) fn info(&self) -> InMemoryLayerInfo {
374 0 : let lsn_start = self.start_lsn;
375 :
376 0 : if let Some(&lsn_end) = self.end_lsn.get() {
377 0 : InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
378 : } else {
379 0 : InMemoryLayerInfo::Open { lsn_start }
380 : }
381 0 : }
382 :
383 0 : pub(crate) fn try_len(&self) -> Option<u64> {
384 0 : self.inner.try_read().map(|i| i.file.len()).ok()
385 0 : }
386 :
387 14412648 : pub(crate) fn assert_writable(&self) {
388 14412648 : assert!(self.end_lsn.get().is_none());
389 14412648 : }
390 :
391 4561450 : pub(crate) fn end_lsn_or_max(&self) -> Lsn {
392 4561450 : self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
393 4561450 : }
394 :
395 4558024 : pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
396 4558024 : self.start_lsn..self.end_lsn_or_max()
397 4558024 : }
398 :
399 : /// Debugging function to print out the contents of the layer.
400 : ///
401 : /// This is likely completely unused.
402 0 : pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
403 0 : let end_str = self.end_lsn_or_max();
404 0 :
405 0 : println!(
406 0 : "----- in-memory layer for tli {} LSNs {}-{} ----",
407 0 : self.timeline_id, self.start_lsn, end_str,
408 0 : );
409 0 :
410 0 : Ok(())
411 0 : }
412 :
413 : // Look up the keys in the provided keyspace and update
414 : // the reconstruct state with whatever is found.
415 : //
416 : // If the key is cached, go no further than the cached Lsn.
417 1819447 : pub(crate) async fn get_values_reconstruct_data(
418 1819447 : &self,
419 1819447 : keyspace: KeySpace,
420 1819447 : end_lsn: Lsn,
421 1819447 : reconstruct_state: &mut ValuesReconstructState,
422 1819447 : ctx: &RequestContext,
423 1819447 : ) -> Result<(), GetVectoredError> {
424 1819447 : let ctx = RequestContextBuilder::extend(ctx)
425 1819447 : .page_content_kind(PageContentKind::InMemoryLayer)
426 1819447 : .build();
427 :
428 1819447 : let inner = self.inner.read().await;
429 :
430 : struct ValueRead {
431 : entry_lsn: Lsn,
432 : read: vectored_dio_read::LogicalRead<Vec<u8>>,
433 : }
434 1819447 : let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
435 :
436 1819447 : for range in keyspace.ranges.iter() {
437 1819447 : for (key, vec_map) in inner
438 1819447 : .index
439 1819447 : .range(range.start.to_compact()..range.end.to_compact())
440 : {
441 1497338 : let key = Key::from_compact(*key);
442 1497338 : let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
443 0 : Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
444 1497338 : None => self.start_lsn..end_lsn,
445 : };
446 :
447 1497338 : let slice = vec_map.slice_range(lsn_range);
448 :
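                // Walk the versions newest-to-oldest; once a record with `will_init` set is
                // found (the value fully initializes the page), older versions are not needed
                // for reconstruction, so we stop collecting reads for this key (`break` below).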
449 1497338 : for (entry_lsn, index_entry) in slice.iter().rev() {
450 : let IndexEntryUnpacked {
451 1497332 : pos,
452 1497332 : len,
453 1497332 : will_init,
454 1497332 : } = index_entry.unpack();
455 1497332 : reads.entry(key).or_default().push(ValueRead {
456 1497332 : entry_lsn: *entry_lsn,
457 1497332 : read: vectored_dio_read::LogicalRead::new(
458 1497332 : pos,
459 1497332 : Vec::with_capacity(len as usize),
460 1497332 : ),
461 1497332 : });
462 1497332 : if will_init {
463 1497320 : break;
464 12 : }
465 : }
466 : }
467 : }
468 :
469 : // Execute the reads.
470 :
471 1819447 : let f = vectored_dio_read::execute(
472 1819447 : &inner.file,
473 1819447 : reads
474 1819447 : .iter()
475 1819447 : .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
476 1819447 : &ctx,
477 1819447 : );
478 1819447 : send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
479 257308 : .await;
480 :
481 : // Process results into the reconstruct state
482 3316773 : 'next_key: for (key, value_reads) in reads {
483 1497338 : for ValueRead { entry_lsn, read } in value_reads {
484 1497332 : match read.into_result().expect("we run execute() above") {
485 0 : Err(e) => {
486 0 : reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
487 0 : continue 'next_key;
488 : }
489 1497332 : Ok(value_buf) => {
490 1497332 : let value = Value::des(&value_buf);
491 1497332 : if let Err(e) = value {
492 0 : reconstruct_state
493 0 : .on_key_error(key, PageReconstructError::from(anyhow!(e)));
494 0 : continue 'next_key;
495 1497332 : }
496 1497332 :
497 1497332 : let key_situation =
498 1497332 : reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
499 1497332 : if key_situation == ValueReconstructSituation::Complete {
500 : // TODO: metric to see if we fetched more values than necessary
501 1497320 : continue 'next_key;
502 12 : }
503 :
504 : // process the next value in the next iteration of the loop
505 : }
506 : }
507 : }
508 : }
509 :
510 1819447 : reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
511 1819447 :
512 1819447 : Ok(())
513 1819447 : }
514 : }
515 :
516 : /// Offset of a particular Value within a serialized batch.
517 : struct SerializedBatchOffset {
518 : key: CompactKey,
519 : lsn: Lsn,
520 : // TODO: separate type when we start serde-serializing this value, to avoid coupling
521 : // in-memory representation to serialization format.
522 : index_entry: IndexEntry,
523 : }
524 :
525 : pub struct SerializedBatch {
526 : /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
527 : pub(crate) raw: Vec<u8>,
528 :
529 : /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
530 : offsets: Vec<SerializedBatchOffset>,
531 :
532 : /// The highest LSN of any value in the batch
533 : pub(crate) max_lsn: Lsn,
534 : }
535 :
536 : impl SerializedBatch {
537 14412648 : pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
538 14412648 : // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
539 14412648 : // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
540 15271488 : let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
541 14412648 : let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
542 14412648 :
543 14412648 : let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
544 14412648 : let mut max_lsn: Lsn = Lsn(0);
545 29684136 : for (key, lsn, val_ser_size, val) in batch {
546 15271488 : let relative_off = cursor.position();
547 15271488 :
548 15271488 : val.ser_into(&mut cursor)
549 15271488 : .expect("Writing into in-memory buffer is infallible");
550 15271488 :
551 15271488 : offsets.push(SerializedBatchOffset {
552 15271488 : key,
553 15271488 : lsn,
554 15271488 : index_entry: IndexEntry::new(IndexEntryNewArgs {
555 15271488 : base_offset: 0,
556 15271488 : batch_offset: relative_off,
557 15271488 : len: val_ser_size,
558 15271488 : will_init: val.will_init(),
559 15271488 : })
560 15271488 : .context("higher-level code ensures that values are within supported ranges")?,
561 : });
562 15271488 : max_lsn = std::cmp::max(max_lsn, lsn);
563 : }
564 :
565 14412648 : let buffer = cursor.into_inner();
566 14412648 :
567 14412648 : // Assert that we didn't do any extra allocations while building the buffer.
568 14412648 : debug_assert!(buffer.len() <= buffer_size);
569 :
570 14412648 : Ok(Self {
571 14412648 : raw: buffer,
572 14412648 : offsets,
573 14412648 : max_lsn,
574 14412648 : })
575 14412648 : }
576 : }
577 :
578 6852 : fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
579 6852 : write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
580 6852 : }
581 :
582 3426 : fn inmem_layer_log_display(
583 3426 : mut f: impl Write,
584 3426 : timeline: TimelineId,
585 3426 : start_lsn: Lsn,
586 3426 : end_lsn: Lsn,
587 3426 : ) -> std::fmt::Result {
588 3426 : write!(f, "timeline {} in-memory ", timeline)?;
589 3426 : inmem_layer_display(f, start_lsn, end_lsn)
590 3426 : }
591 :
592 : impl std::fmt::Display for InMemoryLayer {
593 3426 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
594 3426 : let end_lsn = self.end_lsn_or_max();
595 3426 : inmem_layer_display(f, self.start_lsn, end_lsn)
596 3426 : }
597 : }
598 :
599 : impl InMemoryLayer {
600 : /// Get layer size.
601 3816 : pub async fn size(&self) -> Result<u64> {
602 3816 : let inner = self.inner.read().await;
603 3816 : Ok(inner.file.len())
604 3816 : }
605 :
606 : /// Create a new, empty, in-memory layer
607 3816 : pub async fn create(
608 3816 : conf: &'static PageServerConf,
609 3816 : timeline_id: TimelineId,
610 3816 : tenant_shard_id: TenantShardId,
611 3816 : start_lsn: Lsn,
612 3816 : gate_guard: utils::sync::gate::GateGuard,
613 3816 : ctx: &RequestContext,
614 3816 : ) -> Result<InMemoryLayer> {
615 3816 : trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
616 :
617 3816 : let file =
618 3816 : EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
619 3816 : let key = InMemoryLayerFileId(file.page_cache_file_id());
620 3816 :
621 3816 : Ok(InMemoryLayer {
622 3816 : file_id: key,
623 3816 : frozen_local_path_str: OnceLock::new(),
624 3816 : conf,
625 3816 : timeline_id,
626 3816 : tenant_shard_id,
627 3816 : start_lsn,
628 3816 : end_lsn: OnceLock::new(),
629 3816 : opened_at: Instant::now(),
630 3816 : inner: RwLock::new(InMemoryLayerInner {
631 3816 : index: BTreeMap::new(),
632 3816 : file,
633 3816 : resource_units: GlobalResourceUnits::new(),
634 3816 : }),
635 3816 : })
636 3816 : }
637 :
638 : /// Write path.
639 : ///
640 : /// Errors are not retryable: the [`InMemoryLayer`] must be discarded and must not be read from again.
641 : /// The reason they are not retryable is that the underlying [`EphemeralFile`] writes are not retryable.
642 : /// TODO: this could be made retryable if we aborted the process on EphemeralFile write errors.
643 14412648 : pub async fn put_batch(
644 14412648 : &self,
645 14412648 : serialized_batch: SerializedBatch,
646 14412648 : ctx: &RequestContext,
647 14412648 : ) -> anyhow::Result<()> {
648 14412648 : let mut inner = self.inner.write().await;
649 14412648 : self.assert_writable();
650 14412648 :
651 14412648 : let base_offset = inner.file.len();
652 14412648 :
653 14412648 : let SerializedBatch {
654 14412648 : raw,
655 14412648 : mut offsets,
656 14412648 : max_lsn: _,
657 14412648 : } = serialized_batch;
658 :
659 : // Add the base_offset to the batch's index entries, which are relative to the batch start.
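        // Editorial example with made-up sizes: for three values of serialized sizes 100, 200,
        // and 50 bytes, `from_values` recorded relative offsets 0, 100, and 300; if the file
        // currently holds 10_000 bytes (base_offset), the rebased entries point at absolute
        // positions 10_000, 10_100, and 10_300.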
660 29684136 : for offset in &mut offsets {
661 : let IndexEntryUnpacked {
662 15271488 : will_init,
663 15271488 : len,
664 15271488 : pos,
665 15271488 : } = offset.index_entry.unpack();
666 15271488 : offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
667 15271488 : base_offset,
668 15271488 : batch_offset: pos,
669 15271488 : len: len.into_usize(),
670 15271488 : will_init,
671 15271488 : })?;
672 : }
673 :
674 : // Write the batch to the file
675 14412648 : inner.file.write_raw(&raw, ctx).await?;
676 14412648 : let new_size = inner.file.len();
677 14412648 : let expected_new_len = base_offset
678 14412648 : .checked_add(raw.len().into_u64())
679 14412648 : // write_raw would error if we were to overflow u64.
680 14412648 : // Also, IndexEntry and higher levels in the code
681 14412648 : // don't allow the file to grow that large.
682 14412648 : .unwrap();
683 14412648 : assert_eq!(new_size, expected_new_len);
684 :
685 : // Update the index with the new entries
686 : for SerializedBatchOffset {
687 15271488 : key,
688 15271488 : lsn,
689 15271488 : index_entry,
690 29684136 : } in offsets
691 : {
692 15271488 : let vec_map = inner.index.entry(key).or_default();
693 15271488 : let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
694 15271488 : if old.is_some() {
695 : // This should not break anything, but is unexpected: ingestion code aims to filter out
696 : // multiple writes to the same key at the same LSN. This happens in cases where our
697 : // ingestion code generates some write like an empty page, and we see a write from postgres
698 : // to the same key in the same WAL record. If one such write makes it through, we
699 : // index the most recent write, implicitly ignoring the earlier write. We log a warning
700 : // because this case is unexpected, and we would like tests to fail if this happens.
701 0 : warn!("Key {} at {} written twice at same LSN", key, lsn);
702 15271488 : }
703 : }
704 :
705 14412648 : inner.resource_units.maybe_publish_size(new_size);
706 14412648 :
707 14412648 : Ok(())
708 14412648 : }
709 :
710 14409060 : pub(crate) fn get_opened_at(&self) -> Instant {
711 14409060 : self.opened_at
712 14409060 : }
713 :
714 0 : pub(crate) async fn tick(&self) -> Option<u64> {
715 0 : let mut inner = self.inner.write().await;
716 0 : let size = inner.file.len();
717 0 : inner.resource_units.publish_size(size)
718 0 : }
719 :
720 6 : pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
721 6 : // TODO: Currently, we just leak the storage for any deleted keys
722 6 : Ok(())
723 6 : }
724 :
725 : /// Records the end_lsn for non-dropped layers.
726 : /// `end_lsn` is exclusive
727 3426 : pub async fn freeze(&self, end_lsn: Lsn) {
728 3426 : assert!(
729 3426 : self.start_lsn < end_lsn,
730 0 : "{} >= {}",
731 : self.start_lsn,
732 : end_lsn
733 : );
734 3426 : self.end_lsn.set(end_lsn).expect("end_lsn set only once");
735 3426 :
736 3426 : self.frozen_local_path_str
737 3426 : .set({
738 3426 : let mut buf = String::new();
739 3426 : inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
740 3426 : .unwrap();
741 3426 : buf.into()
742 3426 : })
743 3426 : .expect("frozen_local_path_str set only once");
744 :
745 : #[cfg(debug_assertions)]
746 : {
747 3426 : let inner = self.inner.write().await;
748 12768066 : for vec_map in inner.index.values() {
749 13160214 : for (lsn, _) in vec_map.as_slice() {
750 13160214 : assert!(*lsn < end_lsn);
751 : }
752 : }
753 : }
754 3426 : }
755 :
756 : /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
757 : /// layer will only contain the specified key range; `None` is returned if there
758 : /// are no matching keys.
759 : ///
760 : /// Returns a new delta layer with all the same data as this in-memory layer
761 2904 : pub async fn write_to_disk(
762 2904 : &self,
763 2904 : ctx: &RequestContext,
764 2904 : key_range: Option<Range<Key>>,
765 2904 : l0_flush_global_state: &l0_flush::Inner,
766 2904 : ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
767 : // Grab the lock in read-mode. We hold it over the I/O, but because this
768 : // layer is not writeable anymore, no one should be trying to acquire the
769 : // write lock on it, so we shouldn't block anyone. There's one exception
770 : // though: another thread might have grabbed a reference to this layer
771 : // in `get_layer_for_write` just before the checkpointer called
772 : // `freeze`, and then `write_to_disk` on it. When the thread gets the
773 : // lock, it will see that it's not writeable anymore and retry, but it
774 : // would have to wait until we release it. That race condition is very
775 : // rare though, so we just accept the potential latency hit for now.
776 2904 : let inner = self.inner.read().await;
777 :
778 : use l0_flush::Inner;
779 2904 : let _concurrency_permit = match l0_flush_global_state {
780 2904 : Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
781 : };
782 :
783 2904 : let end_lsn = *self.end_lsn.get().unwrap();
784 :
785 2904 : let key_count = if let Some(key_range) = key_range {
786 0 : let key_range = key_range.start.to_compact()..key_range.end.to_compact();
787 0 :
788 0 : inner
789 0 : .index
790 0 : .iter()
791 0 : .filter(|(k, _)| key_range.contains(k))
792 0 : .count()
793 : } else {
794 2904 : inner.index.len()
795 : };
796 2904 : if key_count == 0 {
797 0 : return Ok(None);
798 2904 : }
799 :
800 2904 : let mut delta_layer_writer = DeltaLayerWriter::new(
801 2904 : self.conf,
802 2904 : self.timeline_id,
803 2904 : self.tenant_shard_id,
804 2904 : Key::MIN,
805 2904 : self.start_lsn..end_lsn,
806 2904 : ctx,
807 2904 : )
808 1483 : .await?;
809 :
810 2904 : match l0_flush_global_state {
811 : l0_flush::Inner::Direct { .. } => {
812 2904 : let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
813 :
814 2904 : let file_contents = Bytes::from(file_contents);
815 :
816 12764400 : for (key, vec_map) in inner.index.iter() {
817 : // Write all page versions
818 13156548 : for (lsn, entry) in vec_map
819 12764400 : .as_slice()
820 12764400 : .iter()
821 13156548 : .map(|(lsn, entry)| (lsn, entry.unpack()))
822 : {
823 : let IndexEntryUnpacked {
824 13156548 : pos,
825 13156548 : len,
826 13156548 : will_init,
827 13156548 : } = entry;
828 13156548 : let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
829 13156548 : let (_buf, res) = delta_layer_writer
830 13156548 : .put_value_bytes(
831 13156548 : Key::from_compact(*key),
832 13156548 : *lsn,
833 13156548 : buf.slice_len(),
834 13156548 : will_init,
835 13156548 : ctx,
836 13156548 : )
837 8185 : .await;
838 13156548 : res?;
839 : }
840 : }
841 : }
842 : }
843 :
844 : // MAX is used here because we identify L0 layers by full key range
845 19920 : let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
846 :
847 : // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
848 : //
849 : // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
850 : // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
851 : // Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
852 : //
853 : // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
854 : // we dirtied when writing to the filesystem have been flushed and marked !dirty.
855 2904 : drop(_concurrency_permit);
856 2904 :
857 2904 : Ok(Some((desc, path)))
858 2904 : }
859 : }
860 :
861 : #[cfg(test)]
862 : mod tests {
863 : use super::*;
864 :
865 : #[test]
866 6 : fn test_index_entry() {
867 : const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
868 : use IndexEntryNewArgs as Args;
869 : use IndexEntryUnpacked as Unpacked;
870 :
871 120 : let roundtrip = |args, expect: Unpacked| {
872 120 : let res = IndexEntry::new(args).expect("this tests expects no errors");
873 120 : let IndexEntryUnpacked {
874 120 : will_init,
875 120 : len,
876 120 : pos,
877 120 : } = res.unpack();
878 120 : assert_eq!(will_init, expect.will_init);
879 120 : assert_eq!(len, expect.len);
880 120 : assert_eq!(pos, expect.pos);
881 120 : };
882 :
883 : // basic roundtrip
884 18 : for pos in [0, MAX_SUPPORTED_POS] {
885 36 : for len in [0, MAX_SUPPORTED_BLOB_LEN] {
886 72 : for will_init in [true, false] {
887 48 : let expect = Unpacked {
888 48 : will_init,
889 48 : len: len.into_u64(),
890 48 : pos: pos.into_u64(),
891 48 : };
892 48 : roundtrip(
893 48 : Args {
894 48 : will_init,
895 48 : base_offset: pos.into_u64(),
896 48 : batch_offset: 0,
897 48 : len,
898 48 : },
899 48 : expect,
900 48 : );
901 48 : roundtrip(
902 48 : Args {
903 48 : will_init,
904 48 : base_offset: 0,
905 48 : batch_offset: pos.into_u64(),
906 48 : len,
907 48 : },
908 48 : expect,
909 48 : );
910 48 : }
911 : }
912 : }
913 :
914 : // too-large len
915 6 : let too_large = Args {
916 6 : will_init: false,
917 6 : len: MAX_SUPPORTED_BLOB_LEN + 1,
918 6 : base_offset: 0,
919 6 : batch_offset: 0,
920 6 : };
921 6 : assert!(IndexEntry::new(too_large).is_err());
922 :
923 : // too-large pos
924 : {
925 6 : let too_large = Args {
926 6 : will_init: false,
927 6 : len: 0,
928 6 : base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
929 6 : batch_offset: 0,
930 6 : };
931 6 : assert!(IndexEntry::new(too_large).is_err());
932 6 : let too_large = Args {
933 6 : will_init: false,
934 6 : len: 0,
935 6 : base_offset: 0,
936 6 : batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
937 6 : };
938 6 : assert!(IndexEntry::new(too_large).is_err());
939 : }
940 :
941 : // too large (base_offset + batch_offset)
942 : {
943 6 : let too_large = Args {
944 6 : will_init: false,
945 6 : len: 0,
946 6 : base_offset: MAX_SUPPORTED_POS.into_u64(),
947 6 : batch_offset: 1,
948 6 : };
949 6 : assert!(IndexEntry::new(too_large).is_err());
950 6 : let too_large = Args {
951 6 : will_init: false,
952 6 : len: 0,
953 6 : base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
954 6 : batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
955 6 : };
956 6 : assert!(IndexEntry::new(too_large).is_err());
957 : }
958 :
959 : // valid special cases
960 : // - area past the max supported pos that is accessible by len
961 18 : for len in [1, MAX_SUPPORTED_BLOB_LEN] {
962 12 : roundtrip(
963 12 : Args {
964 12 : will_init: false,
965 12 : len,
966 12 : base_offset: MAX_SUPPORTED_POS.into_u64(),
967 12 : batch_offset: 0,
968 12 : },
969 12 : Unpacked {
970 12 : will_init: false,
971 12 : len: len as u64,
972 12 : pos: MAX_SUPPORTED_POS.into_u64(),
973 12 : },
974 12 : );
975 12 : roundtrip(
976 12 : Args {
977 12 : will_init: false,
978 12 : len,
979 12 : base_offset: 0,
980 12 : batch_offset: MAX_SUPPORTED_POS.into_u64(),
981 12 : },
982 12 : Unpacked {
983 12 : will_init: false,
984 12 : len: len as u64,
985 12 : pos: MAX_SUPPORTED_POS.into_u64(),
986 12 : },
987 12 : );
988 12 : }
989 6 : }
990 : }