LCOV - code coverage report
Current view: top level - pageserver/src/tenant - storage_layer.rs (source / functions) Coverage Total Hit
Test: b9d67f908f91f00e353a27440ba89f642a869959.info Lines: 76.0 % 350 266
Test Date: 2024-11-19 21:44:13 Functions: 76.0 % 50 38

            Line data    Source code
       1              : //! Common traits and structs for layers
       2              : 
       3              : pub mod batch_split_writer;
       4              : pub mod delta_layer;
       5              : pub mod filter_iterator;
       6              : pub mod image_layer;
       7              : pub mod inmemory_layer;
       8              : pub(crate) mod layer;
       9              : mod layer_desc;
      10              : mod layer_name;
      11              : pub mod merge_iterator;
      12              : 
      13              : use crate::context::{AccessStatsBehavior, RequestContext};
      14              : use bytes::Bytes;
      15              : use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
      16              : use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
      17              : use pageserver_api::record::NeonWalRecord;
      18              : use pageserver_api::value::Value;
      19              : use std::cmp::{Ordering, Reverse};
      20              : use std::collections::hash_map::Entry;
      21              : use std::collections::{BinaryHeap, HashMap};
      22              : use std::ops::Range;
      23              : use std::sync::Arc;
      24              : use std::time::{Duration, SystemTime, UNIX_EPOCH};
      25              : 
      26              : use utils::lsn::Lsn;
      27              : 
      28              : pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
      29              : pub use image_layer::{ImageLayer, ImageLayerWriter};
      30              : pub use inmemory_layer::InMemoryLayer;
      31              : pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
      32              : pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
      33              : 
      34              : pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
      35              : 
      36              : use self::inmemory_layer::InMemoryLayerFileId;
      37              : 
      38              : use super::timeline::GetVectoredError;
      39              : use super::PageReconstructError;
      40              : 
      41            0 : pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
      42            0 : where
      43            0 :     T: PartialOrd<T>,
      44            0 : {
      45            0 :     if a.start < b.start {
      46            0 :         a.end > b.start
      47              :     } else {
      48            0 :         b.end > a.start
      49              :     }
      50            0 : }
      51              : 
      52              : /// Struct used to communicate across calls to 'get_value_reconstruct_data'.
      53              : ///
      54              : /// Before first call, you can fill in 'img' if you have an older cached
      55              : /// version of the page available. That can save work in
      56              : /// 'get_value_reconstruct_data', as it can stop searching for page versions
      57              : /// when all the WAL records going back to the cached image have been collected.
      58              : ///
      59              : /// When get_value_reconstruct_data returns Complete, 'img' is set to an image
      60              : /// of the page, or the oldest WAL record in 'records' is a will_init-type
      61              : /// record that initializes the page without requiring a previous image.
      62              : ///
      63              : /// If 'get_value_reconstruct_data' returns Continue, some 'records' may have
      64              : /// been collected, but there are more records outside the current layer. Pass
      65              : /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
      66              : /// call, to collect more records.
      67              : ///
      68              : #[derive(Debug, Default)]
      69              : pub(crate) struct ValueReconstructState {
      70              :     pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
      71              :     pub(crate) img: Option<(Lsn, Bytes)>,
      72              : }
      73              : 
      74              : #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
      75              : pub(crate) enum ValueReconstructSituation {
      76              :     Complete,
      77              :     #[default]
      78              :     Continue,
      79              : }
      80              : 
      81              : /// Reconstruct data accumulated for a single key during a vectored get
      82              : #[derive(Debug, Default, Clone)]
      83              : pub(crate) struct VectoredValueReconstructState {
      84              :     pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
      85              :     pub(crate) img: Option<(Lsn, Bytes)>,
      86              : 
      87              :     situation: ValueReconstructSituation,
      88              : }
      89              : 
      90              : impl VectoredValueReconstructState {
      91        72575 :     fn get_cached_lsn(&self) -> Option<Lsn> {
      92        72575 :         self.img.as_ref().map(|img| img.0)
      93        72575 :     }
      94              : }
      95              : 
      96              : impl From<VectoredValueReconstructState> for ValueReconstructState {
      97       666926 :     fn from(mut state: VectoredValueReconstructState) -> Self {
      98       666926 :         // walredo expects the records to be descending in terms of Lsn
      99       666926 :         state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
     100       666926 : 
     101       666926 :         ValueReconstructState {
     102       666926 :             records: state.records,
     103       666926 :             img: state.img,
     104       666926 :         }
     105       666926 :     }
     106              : }
     107              : 
     108              : /// Bag of data accumulated during a vectored get.
     109              : pub(crate) struct ValuesReconstructState {
     110              :     /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
     111              :     /// should not expect to get anything from this hashmap.
     112              :     pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
     113              :     /// The keys which are already retrieved
     114              :     keys_done: KeySpaceRandomAccum,
     115              : 
     116              :     /// The keys covered by the image layers
     117              :     keys_with_image_coverage: Option<Range<Key>>,
     118              : 
     119              :     // Statistics that are still accessible as a caller of `get_vectored_impl`.
     120              :     layers_visited: u32,
     121              :     delta_layers_visited: u32,
     122              : }
     123              : 
     124              : impl ValuesReconstructState {
     125       626830 :     pub(crate) fn new() -> Self {
     126       626830 :         Self {
     127       626830 :             keys: HashMap::new(),
     128       626830 :             keys_done: KeySpaceRandomAccum::new(),
     129       626830 :             keys_with_image_coverage: None,
     130       626830 :             layers_visited: 0,
     131       626830 :             delta_layers_visited: 0,
     132       626830 :         }
     133       626830 :     }
     134              : 
     135              :     /// Associate a key with the error which it encountered and mark it as done
     136            0 :     pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
     137            0 :         let previous = self.keys.insert(key, Err(err));
     138            0 :         if let Some(Ok(state)) = previous {
     139            0 :             if state.situation == ValueReconstructSituation::Continue {
     140            0 :                 self.keys_done.add_key(key);
     141            0 :             }
     142            0 :         }
     143            0 :     }
     144              : 
     145       846270 :     pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
     146       846270 :         self.layers_visited += 1;
     147       846270 :         if let ReadableLayer::PersistentLayer(layer) = layer {
     148       239920 :             if layer.layer_desc().is_delta() {
     149       218112 :                 self.delta_layers_visited += 1;
     150       218112 :             }
     151       606350 :         }
     152       846270 :     }
     153              : 
     154          200 :     pub(crate) fn get_delta_layers_visited(&self) -> u32 {
     155          200 :         self.delta_layers_visited
     156          200 :     }
     157              : 
     158       626600 :     pub(crate) fn get_layers_visited(&self) -> u32 {
     159       626600 :         self.layers_visited
     160       626600 :     }
     161              : 
     162              :     /// This function is called after reading a keyspace from a layer.
     163              :     /// It checks if the read path has now moved past the cached Lsn for any keys.
     164              :     ///
     165              :     /// Implementation note: We intentionally iterate over the keys for which we've
     166              :     /// already collected some reconstruct data. This avoids scaling complexity with
     167              :     /// the size of the search space.
     168       824462 :     pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) {
     169       824462 :         for (key, value) in self.keys.iter_mut() {
     170       704290 :             if !keyspace.contains(key) {
     171          128 :                 continue;
     172       704162 :             }
     173              : 
     174       704162 :             if let Ok(state) = value {
     175       704162 :                 if state.situation != ValueReconstructSituation::Complete
     176          354 :                     && state.get_cached_lsn() >= Some(advanced_to)
     177            0 :                 {
     178            0 :                     state.situation = ValueReconstructSituation::Complete;
     179            0 :                     self.keys_done.add_key(*key);
     180       704162 :                 }
     181            0 :             }
     182              :         }
     183       824462 :     }
     184              : 
     185              :     /// On hitting image layer, we can mark all keys in this range as done, because
     186              :     /// if the image layer does not contain a key, it is deleted/never added.
     187        21820 :     pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
     188        21820 :         let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
     189        21820 :         assert_eq!(
     190              :             prev_val, None,
     191            0 :             "should consume the keyspace before the next iteration"
     192              :         );
     193        21820 :     }
     194              : 
     195              :     /// Update the state collected for a given key.
     196              :     /// Returns true if this was the last value needed for the key and false otherwise.
     197              :     ///
     198              :     /// If the key is done after the update, mark it as such.
     199              :     ///
     200              :     /// If the key is in the sparse keyspace (i.e., aux files), we do not track it in
     201              :     /// `keys_done`.
     202       702949 :     pub(crate) fn update_key(
     203       702949 :         &mut self,
     204       702949 :         key: &Key,
     205       702949 :         lsn: Lsn,
     206       702949 :         value: Value,
     207       702949 :     ) -> ValueReconstructSituation {
     208       702949 :         let state = self
     209       702949 :             .keys
     210       702949 :             .entry(*key)
     211       702949 :             .or_insert(Ok(VectoredValueReconstructState::default()));
     212       702949 :         let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
     213       702949 :         if let Ok(state) = state {
     214       702949 :             let key_done = match state.situation {
     215              :                 ValueReconstructSituation::Complete => {
     216        35497 :                     if is_sparse_key {
     217              :                         // Sparse keyspace might be visited multiple times because
     218              :                         // we don't track unmapped keyspaces.
     219        35497 :                         return ValueReconstructSituation::Complete;
     220              :                     } else {
     221            0 :                         unreachable!()
     222              :                     }
     223              :                 }
     224       667452 :                 ValueReconstructSituation::Continue => match value {
     225       666952 :                     Value::Image(img) => {
     226       666952 :                         state.img = Some((lsn, img));
     227       666952 :                         true
     228              :                     }
     229          500 :                     Value::WalRecord(rec) => {
     230          500 :                         debug_assert!(
     231          500 :                             Some(lsn) > state.get_cached_lsn(),
     232            0 :                             "Attempt to collect a record below cached LSN for walredo: {} < {}",
     233            0 :                             lsn,
     234            0 :                             state
     235            0 :                                 .get_cached_lsn()
     236            0 :                                 .expect("Assertion can only fire if a cached lsn is present")
     237              :                         );
     238              : 
     239          500 :                         let will_init = rec.will_init();
     240          500 :                         state.records.push((lsn, rec));
     241          500 :                         will_init
     242              :                     }
     243              :                 },
     244              :             };
     245              : 
     246       667452 :             if key_done && state.situation == ValueReconstructSituation::Continue {
     247       666996 :                 state.situation = ValueReconstructSituation::Complete;
     248       666996 :                 if !is_sparse_key {
     249       603920 :                     self.keys_done.add_key(*key);
     250       603920 :                 }
     251          456 :             }
     252              : 
     253       667452 :             state.situation
     254              :         } else {
     255            0 :             ValueReconstructSituation::Complete
     256              :         }
     257       702949 :     }
     258              : 
     259              :     /// Returns the Lsn at which this key is cached if one exists.
     260              :     /// The read path should go no further than this Lsn for the given key.
     261      1126396 :     pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
     262      1126396 :         self.keys
     263      1126396 :             .get(key)
     264      1126396 :             .and_then(|k| k.as_ref().ok())
     265      1126396 :             .and_then(|state| state.get_cached_lsn())
     266      1126396 :     }
     267              : 
     268              :     /// Returns the key space describing the keys that have
     269              :     /// been marked as completed since the last call to this function.
     270              :     /// Returns individual keys done, and the image layer coverage.
     271      1698539 :     pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
     272      1698539 :         (
     273      1698539 :             self.keys_done.consume_keyspace(),
     274      1698539 :             self.keys_with_image_coverage.take(),
     275      1698539 :         )
     276      1698539 :     }
     277              : }
     278              : 
     279              : impl Default for ValuesReconstructState {
     280          246 :     fn default() -> Self {
     281          246 :         Self::new()
     282          246 :     }
     283              : }
     284              : 
     285              : /// A key that uniquely identifies a layer in a timeline
     286              : #[derive(Debug, PartialEq, Eq, Clone, Hash)]
     287              : pub(crate) enum LayerId {
     288              :     PersitentLayerId(PersistentLayerKey),
     289              :     InMemoryLayerId(InMemoryLayerFileId),
     290              : }
     291              : 
     292              : /// Uniquely identify a layer visit by the layer
     293              : /// and LSN floor (or start LSN) of the reads.
     294              : /// The layer itself is not enough since we may
     295              : /// have different LSN lower bounds for delta layer reads.
     296              : #[derive(Debug, PartialEq, Eq, Clone, Hash)]
     297              : struct LayerToVisitId {
     298              :     layer_id: LayerId,
     299              :     lsn_floor: Lsn,
     300              : }
     301              : 
     302              : /// Layer wrapper for the read path. Note that it is valid
     303              : /// to use these layers even after external operations have
     304              : /// been performed on them (compaction, freeze, etc.).
     305              : #[derive(Debug)]
     306              : pub(crate) enum ReadableLayer {
     307              :     PersistentLayer(Layer),
     308              :     InMemoryLayer(Arc<InMemoryLayer>),
     309              : }
     310              : 
     311              : /// A partial description of a read to be done.
     312              : #[derive(Debug, Clone)]
     313              : struct LayerVisit {
     314              :     /// An id used to resolve the readable layer within the fringe
     315              :     layer_to_visit_id: LayerToVisitId,
     316              :     /// Lsn range for the read, used for selecting the next read
     317              :     lsn_range: Range<Lsn>,
     318              : }
     319              : 
     320              : /// Data structure which maintains a fringe of layers for the
     321              : /// read path. The fringe is the set of layers which intersects
     322              : /// the current keyspace that the search is descending on.
     323              : /// Each layer tracks the keyspace that intersects it.
     324              : ///
     325              : /// The fringe must appear sorted by Lsn. Hence, it uses
     326              : /// a two-layer indexing scheme.
     327              : #[derive(Debug)]
     328              : pub(crate) struct LayerFringe {
     329              :     planned_visits_by_lsn: BinaryHeap<LayerVisit>,
     330              :     visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
     331              : }
     332              : 
     333              : #[derive(Debug)]
     334              : struct LayerVisitReads {
     335              :     layer: ReadableLayer,
     336              :     target_keyspace: KeySpaceRandomAccum,
     337              : }
     338              : 
     339              : impl LayerFringe {
     340       852269 :     pub(crate) fn new() -> Self {
     341       852269 :         LayerFringe {
     342       852269 :             planned_visits_by_lsn: BinaryHeap::new(),
     343       852269 :             visit_reads: HashMap::new(),
     344       852269 :         }
     345       852269 :     }
     346              : 
     347      1698539 :     pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
     348      1698539 :         let read_desc = match self.planned_visits_by_lsn.pop() {
     349       846270 :             Some(desc) => desc,
     350       852269 :             None => return None,
     351              :         };
     352              : 
     353       846270 :         let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);
     354       846270 : 
     355       846270 :         match removed {
     356              :             Some((
     357              :                 _,
     358              :                 LayerVisitReads {
     359       846270 :                     layer,
     360       846270 :                     mut target_keyspace,
     361       846270 :                 },
     362       846270 :             )) => Some((
     363       846270 :                 layer,
     364       846270 :                 target_keyspace.consume_keyspace(),
     365       846270 :                 read_desc.lsn_range,
     366       846270 :             )),
     367            0 :             None => unreachable!("fringe internals are always consistent"),
     368              :         }
     369      1698539 :     }
     370              : 
     371       846284 :     pub(crate) fn update(
     372       846284 :         &mut self,
     373       846284 :         layer: ReadableLayer,
     374       846284 :         keyspace: KeySpace,
     375       846284 :         lsn_range: Range<Lsn>,
     376       846284 :     ) {
     377       846284 :         let layer_to_visit_id = LayerToVisitId {
     378       846284 :             layer_id: layer.id(),
     379       846284 :             lsn_floor: lsn_range.start,
     380       846284 :         };
     381       846284 : 
     382       846284 :         let entry = self.visit_reads.entry(layer_to_visit_id.clone());
     383       846284 :         match entry {
     384           14 :             Entry::Occupied(mut entry) => {
     385           14 :                 entry.get_mut().target_keyspace.add_keyspace(keyspace);
     386           14 :             }
     387       846270 :             Entry::Vacant(entry) => {
     388       846270 :                 self.planned_visits_by_lsn.push(LayerVisit {
     389       846270 :                     lsn_range,
     390       846270 :                     layer_to_visit_id: layer_to_visit_id.clone(),
     391       846270 :                 });
     392       846270 :                 let mut accum = KeySpaceRandomAccum::new();
     393       846270 :                 accum.add_keyspace(keyspace);
     394       846270 :                 entry.insert(LayerVisitReads {
     395       846270 :                     layer,
     396       846270 :                     target_keyspace: accum,
     397       846270 :                 });
     398       846270 :             }
     399              :         }
     400       846284 :     }
     401              : }
     402              : 
     403              : impl Default for LayerFringe {
     404            0 :     fn default() -> Self {
     405            0 :         Self::new()
     406            0 :     }
     407              : }
     408              : 
     409              : impl Ord for LayerVisit {
     410           30 :     fn cmp(&self, other: &Self) -> Ordering {
     411           30 :         let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
     412           30 :         if ord == std::cmp::Ordering::Equal {
     413           22 :             self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
     414              :         } else {
     415            8 :             ord
     416              :         }
     417           30 :     }
     418              : }
     419              : 
     420              : impl PartialOrd for LayerVisit {
     421           30 :     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
     422           30 :         Some(self.cmp(other))
     423           30 :     }
     424              : }
     425              : 
     426              : impl PartialEq for LayerVisit {
     427            0 :     fn eq(&self, other: &Self) -> bool {
     428            0 :         self.lsn_range == other.lsn_range
     429            0 :     }
     430              : }
     431              : 
     432              : impl Eq for LayerVisit {}
     433              : 
     434              : impl ReadableLayer {
     435       846284 :     pub(crate) fn id(&self) -> LayerId {
     436       846284 :         match self {
     437       239934 :             Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
     438       606350 :             Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
     439              :         }
     440       846284 :     }
     441              : 
     442       846270 :     pub(crate) async fn get_values_reconstruct_data(
     443       846270 :         &self,
     444       846270 :         keyspace: KeySpace,
     445       846270 :         lsn_range: Range<Lsn>,
     446       846270 :         reconstruct_state: &mut ValuesReconstructState,
     447       846270 :         ctx: &RequestContext,
     448       846270 :     ) -> Result<(), GetVectoredError> {
     449       846270 :         match self {
     450       239920 :             ReadableLayer::PersistentLayer(layer) => {
     451       239920 :                 layer
     452       239920 :                     .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
     453        95970 :                     .await
     454              :             }
     455       606350 :             ReadableLayer::InMemoryLayer(layer) => {
     456       606350 :                 layer
     457       606350 :                     .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
     458        87081 :                     .await
     459              :             }
     460              :         }
     461       846270 :     }
     462              : }
     463              : 
     464              : /// Layers contain a hint indicating whether they are likely to be used for reads.
     465              : ///
     466              : /// This is a hint rather than an authoritative value, so that we do not have to update it synchronously
     467              : /// when changing the visibility of layers (for example when creating a branch that makes some previously
     468              : /// covered layers visible).  It should be used for cache management but not for correctness-critical checks.
     469              : #[derive(Debug, Clone, PartialEq, Eq)]
     470              : pub enum LayerVisibilityHint {
     471              :     /// A Visible layer might be read while serving a read, because there is not an image layer between it
     472              :     /// and a readable LSN (the tip of the branch or a child's branch point)
     473              :     Visible,
     474              :     /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
     475              :     /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
     476              :     Covered,
     477              : }
     478              : 
     479              : pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
     480              : 
     481            0 : #[derive(Clone, Copy, strum_macros::EnumString)]
     482              : pub(crate) enum LayerAccessStatsReset {
     483              :     NoReset,
     484              :     AllStats,
     485              : }
     486              : 
     487              : impl Default for LayerAccessStats {
     488         1756 :     fn default() -> Self {
     489         1756 :         // Default value is to assume resident since creation time, and visible.
     490         1756 :         let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now());
     491         1756 :         value |= 0x1 << Self::VISIBILITY_SHIFT;
     492         1756 : 
     493         1756 :         Self(std::sync::atomic::AtomicU64::new(value))
     494         1756 :     }
     495              : }
     496              : 
     497              : // Efficient store of two very-low-resolution timestamps and some bits.  Used for storing last access time and
     498              : // last residence change time.
     499              : impl LayerAccessStats {
     500              :     // How many high bits to drop from a u32 timestamp?
     501              :     // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use
     502              :     //   after that, this software has been very successful!)
     503              :     // - Dropping the top bit is implicitly safe because unix timestamps are meant to be
     504              :     // stored in an i32, so they never used it.
     505              :     // - Dropping the next two bits is safe because this code is only running on systems in
     506              :     // years >= 2024, and these bits have been 1 since 2021
     507              :     //
     508              :     // Therefore we may store only 29 bits for a timestamp with one second resolution.  We do
     509              :     // this truncation to make space for some flags in the high bits of our u64.
     510              :     const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1;
     511              :     const TS_MASK: u32 = 0x1f_ff_ff_ff;
     512              :     const TS_ONES: u32 = 0x60_00_00_00;
     513              : 
     514              :     const ATIME_SHIFT: u32 = 0;
     515              :     const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS;
     516              :     const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS;
     517              : 
     518       240012 :     fn write_bits(&self, mask: u64, value: u64) -> u64 {
     519       240012 :         self.0
     520       240012 :             .fetch_update(
     521       240012 :                 // TODO: decide what orderings are correct
     522       240012 :                 std::sync::atomic::Ordering::Relaxed,
     523       240012 :                 std::sync::atomic::Ordering::Relaxed,
     524       240012 :                 |v| Some((v & !mask) | (value & mask)),
     525       240012 :             )
     526       240012 :             .expect("Inner function is infallible")
     527       240012 :     }
     528              : 
     529       241436 :     fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) {
     530       241436 :         // Keep only the low TS_MASK bits of the timestamp in seconds; the high bits are dropped
     531       241436 :         let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64);
     532       241436 : 
     533       241436 :         ((Self::TS_MASK as u64) << shift, timestamp << shift)
     534       241436 :     }
     535              : 
     536           62 :     fn read_low_res_timestamp(&self, shift: u32) -> Option<SystemTime> { // decode the timestamp field that starts at bit `shift`
     537           62 :         let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
     538           62 : 
     539           62 :         let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift; // isolate the field and move it down to bit 0
     540           62 :         if ts_bits == 0 {
     541           24 :             None // an all-zero field is reported as "no timestamp"
     542              :         } else {
     543           38 :             Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64))) // put the TS_ONES bits back to rebuild the seconds value
     544              :         }
     545           62 :     }
     546              : 
     547              :     /// Record a change in layer residency, stamping the residence time field with `now`.
     548              :     ///
     549              :     /// Recording the event must happen while holding the layer map lock, to
     550              :     /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs)
     551              :     /// can do an "imitate access" to this layer before it observes `now-latest_activity() > threshold`.
     552              :     ///
     553              :     /// If we instead recorded the residence event with a timestamp taken before grabbing the layer map lock,
     554              :     /// the following race could happen:
     555              :     ///
     556              :     /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp.
     557              :     /// - Eviction: Run the "imitate access" logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
     558              :     /// - Compact: Grab the layer map lock, add the new L1 to the layer map and remove the L0s, release the layer map lock.
     559              :     /// - Eviction: Observes the new L1 layer, whose only activity timestamp is the LayerCreate event.
     560           26 :     pub(crate) fn record_residence_event_at(&self, now: SystemTime) {
     561           26 :         let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now); // mask/value for the residence time field
     562           26 :         self.write_bits(mask, value); // the previous word returned by write_bits is not needed here
     563           26 :     }
     564              : 
     565           24 :     pub(crate) fn record_residence_event(&self) { // same as record_residence_event_at, stamped with the current system time
     566           24 :         self.record_residence_event_at(SystemTime::now())
     567           24 :     }
     568              : 
     569       239654 :     fn record_access_at(&self, now: SystemTime) -> bool { // returns true when the layer was not already marked Visible
     570       239654 :         let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
     571       239654 : 
     572       239654 :         // A layer which is accessed must be visible, so the visibility bit is set as part of the same write.
     573       239654 :         mask |= 0x1 << Self::VISIBILITY_SHIFT;
     574       239654 :         value |= 0x1 << Self::VISIBILITY_SHIFT;
     575       239654 : 
     576       239654 :         let old_bits = self.write_bits(mask, value); // one atomic update stores both the access time and the visibility bit
     577            2 :         !matches!(
     578       239654 :             self.decode_visibility(old_bits),
     579              :             LayerVisibilityHint::Visible
     580              :         )
     581       239654 :     }
     582              : 
     583              :     /// Record an access at the current system time, unless `ctx` requests [`AccessStatsBehavior::Skip`].
     584              :     /// Returns true if this access implicitly changed the layer's visibility to Visible; always false when skipped.
     585       239932 :     pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
     586       239932 :         if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
     587          284 :             return false; // skipped accesses leave both the timestamp and the visibility untouched
     588       239648 :         }
     589       239648 : 
     590       239648 :         self.record_access_at(SystemTime::now())
     591       239932 :     }
     592              : 
     593            0 :     fn as_api_model( // build the API representation of these stats, optionally clearing the timestamps afterwards
     594            0 :         &self,
     595            0 :         reset: LayerAccessStatsReset,
     596            0 :     ) -> pageserver_api::models::LayerAccessStats {
     597            0 :         let ret = pageserver_api::models::LayerAccessStats { // snapshot is taken before any reset below
     598            0 :             access_time: self
     599            0 :                 .read_low_res_timestamp(Self::ATIME_SHIFT)
     600            0 :                 .unwrap_or(UNIX_EPOCH), // a missing access time is reported as UNIX_EPOCH
     601            0 :             residence_time: self
     602            0 :                 .read_low_res_timestamp(Self::RTIME_SHIFT)
     603            0 :                 .unwrap_or(UNIX_EPOCH), // a missing residence time is reported as UNIX_EPOCH
     604            0 :             visible: matches!(self.visibility(), LayerVisibilityHint::Visible),
     605              :         };
     606            0 :         match reset {
     607            0 :             LayerAccessStatsReset::NoReset => {}
     608            0 :             LayerAccessStatsReset::AllStats => { // zero both timestamp fields; the visibility bit is not part of either mask
     609            0 :                 self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0);
     610            0 :                 self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0); // separate atomic update from the one above
     611            0 :             }
     612              :         }
     613            0 :         ret
     614            0 :     }
     615              : 
     616              :     /// Get the latest access timestamp, falling back to the latest residence event.  The latest residence event
     617              :     /// will be this Layer's construction time, if its residence hasn't changed since then.  Panics if neither timestamp is set.
     618           16 :     pub(crate) fn latest_activity(&self) -> SystemTime {
     619           16 :         if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) {
     620            6 :             t
     621              :         } else {
     622           10 :             self.read_low_res_timestamp(Self::RTIME_SHIFT)
     623           10 :                 .expect("Residence time is set on construction") // NOTE(review): as_api_model with AllStats zeroes this field, which would make this panic; confirm
     624              :         }
     625           16 :     }
     626              : 
     627              :     /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
     628              :     ///
     629              :     /// This indicates whether the layer has been used for some purpose that would motivate
     630              :     /// us to keep it on disk, such as for serving a getpage request.
     631           18 :     fn accessed(&self) -> bool {
     632           18 :         // Consider it accessed if the most recent access is at least as recent as
     633           18 :         // the most recent change in residence status.
     634           18 :         match (
     635           18 :             self.read_low_res_timestamp(Self::ATIME_SHIFT),
     636           18 :             self.read_low_res_timestamp(Self::RTIME_SHIFT),
     637              :         ) {
     638           14 :             (None, _) => false, // no access time recorded
     639            0 :             (Some(_), None) => true, // an access time but no residence time to compare against
     640            4 :             (Some(a), Some(r)) => a >= r, // equal timestamps count as accessed
     641              :         }
     642           18 :     }
     643              : 
     644              :     /// Helper for extracting the visibility hint from a raw value of our inner u64 (a set bit means Visible)
     645       240571 :     fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
     646       240571 :         match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
     647       240549 :             1 => LayerVisibilityHint::Visible,
     648           22 :             0 => LayerVisibilityHint::Covered,
     649            0 :             _ => unreachable!(), // `& 0x1` leaves only 0 or 1
     650              :         }
     651       240571 :     }
     652              : 
     653              :     /// Store `visibility` in the visibility bit and return the hint that was stored before this call
     654          332 :     pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
     655          332 :         let value = match visibility {
     656          280 :             LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
     657           52 :             LayerVisibilityHint::Covered => 0x0,
     658              :         };
     659              : 
     660          332 :         let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); // the mask covers only the visibility bit, so the timestamps are preserved
     661          332 :         self.decode_visibility(old_bits)
     662          332 :     }
     663              : 
     664          585 :     pub(crate) fn visibility(&self) -> LayerVisibilityHint { // current visibility hint, decoded from a relaxed load of the inner value
     665          585 :         let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
     666          585 :         self.decode_visibility(read)
     667          585 :     }
     668              : }
     669              : 
     670              : /// Implemented by types that can hand out a reference to their [`PersistentLayerDesc`].
     671              : pub(crate) trait AsLayerDesc {
     672              :     /// Borrow the layer descriptor.
     673              :     fn layer_desc(&self) -> &PersistentLayerDesc;
     674              : }
     675              : 
     676              : pub mod tests { // conversions from layer names to descriptors, filled in with fixed placeholder values
     677              :     use pageserver_api::shard::TenantShardId;
     678              :     use utils::id::TimelineId;
     679              : 
     680              :     use super::*;
     681              : 
     682              :     impl From<DeltaLayerName> for PersistentLayerDesc {
     683            0 :         fn from(value: DeltaLayerName) -> Self {
     684            0 :             PersistentLayerDesc::new_delta(
     685            0 :                 TenantShardId::from([0; 18]), // all-zero tenant shard id
     686            0 :                 TimelineId::from_array([0; 16]), // all-zero timeline id
     687            0 :                 value.key_range,
     688            0 :                 value.lsn_range,
     689            0 :                 233, // presumably a placeholder file size; verify against new_delta
     690            0 :             )
     691            0 :         }
     692              :     }
     693              : 
     694              :     impl From<ImageLayerName> for PersistentLayerDesc {
     695            0 :         fn from(value: ImageLayerName) -> Self {
     696            0 :             PersistentLayerDesc::new_img(
     697            0 :                 TenantShardId::from([0; 18]), // all-zero tenant shard id
     698            0 :                 TimelineId::from_array([0; 16]), // all-zero timeline id
     699            0 :                 value.key_range,
     700            0 :                 value.lsn,
     701            0 :                 233, // presumably a placeholder file size; verify against new_img
     702            0 :             )
     703            0 :         }
     704              :     }
     705              : 
     706              :     impl From<LayerName> for PersistentLayerDesc {
     707            0 :         fn from(value: LayerName) -> Self {
     708            0 :             match value { // delegate to the variant-specific conversion above
     709            0 :                 LayerName::Delta(d) => Self::from(d),
     710            0 :                 LayerName::Image(i) => Self::from(i),
     711              :             }
     712            0 :         }
     713              :     }
     714              : }
     715              : 
     716              : /// Newtype around a borrowed `Range` whose `Debug` output is built from the bounds' `Display` impls.
     717              : ///
     718              : /// Useful with `Key`, whose `{:?}` output is too verbose for printing multiple layers.
     719              : struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
     720              : 
     721              : impl<T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'_, T> {
     722            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     723            0 :         write!(f, "{}..{}", self.0.start, self.0.end) // renders as `start..end`
     724            0 :     }
     725              : }
        

Generated by: LCOV version 2.1-beta