LCOV - code coverage report
Current view: top level - pageserver/src/tenant - storage_layer.rs (source / functions) Coverage Total Hit
Test: 8ac049b474321fdc72ddcb56d7165153a1a900e8.info Lines: 83.2 % 185 154
Test Date: 2023-09-06 10:18:01 Functions: 64.0 % 50 32

            Line data    Source code
       1              : //! Common traits and structs for layers
       2              : 
       3              : pub mod delta_layer;
       4              : mod filename;
       5              : mod image_layer;
       6              : mod inmemory_layer;
       7              : mod layer_desc;
       8              : mod remote_layer;
       9              : 
      10              : use crate::config::PageServerConf;
      11              : use crate::context::{AccessStatsBehavior, RequestContext};
      12              : use crate::repository::Key;
      13              : use crate::task_mgr::TaskKind;
      14              : use crate::walrecord::NeonWalRecord;
      15              : use anyhow::Result;
      16              : use bytes::Bytes;
      17              : use enum_map::EnumMap;
      18              : use enumset::EnumSet;
      19              : use once_cell::sync::Lazy;
      20              : use pageserver_api::models::LayerAccessKind;
      21              : use pageserver_api::models::{
      22              :     HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
      23              : };
      24              : use std::ops::Range;
      25              : use std::path::PathBuf;
      26              : use std::sync::{Arc, Mutex};
      27              : use std::time::{Duration, SystemTime, UNIX_EPOCH};
      28              : use tracing::warn;
      29              : use utils::history_buffer::HistoryBufferWithDropCounter;
      30              : use utils::rate_limit::RateLimit;
      31              : 
      32              : use utils::{
      33              :     id::{TenantId, TimelineId},
      34              :     lsn::Lsn,
      35              : };
      36              : 
      37              : pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
      38              : pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
      39              : pub use image_layer::{ImageLayer, ImageLayerWriter};
      40              : pub use inmemory_layer::InMemoryLayer;
      41              : pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
      42              : pub use remote_layer::RemoteLayer;
      43              : 
      44            0 : pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
      45            0 : where
      46            0 :     T: PartialOrd<T>,
      47            0 : {
      48            0 :     if a.start < b.start {
      49            0 :         a.end > b.start
      50              :     } else {
      51            0 :         b.end > a.start
      52              :     }
      53            0 : }
      54              : 
      55              : /// Struct used to communicate across calls to 'get_value_reconstruct_data'.
      56              : ///
      57              : /// Before first call, you can fill in 'page_img' if you have an older cached
      58              : /// version of the page available. That can save work in
      59              : /// 'get_value_reconstruct_data', as it can stop searching for page versions
      60              : /// when all the WAL records going back to the cached image have been collected.
      61              : ///
      62              : /// When get_value_reconstruct_data returns Complete, 'img' is set to an image
      63              : /// of the page, or the oldest WAL record in 'records' is a will_init-type
      64              : /// record that initializes the page without requiring a previous image.
      65              : ///
      66              : /// If 'get_value_reconstruct_data' returns Continue, some 'records' may have
      67              : /// been collected, but there are more records outside the current layer. Pass
      68              : /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
      69              : /// call, to collect more records.
      70              : ///
      71            0 : #[derive(Debug)]
      72              : pub struct ValueReconstructState {
      73              :     pub records: Vec<(Lsn, NeonWalRecord)>,
      74              :     pub img: Option<(Lsn, Bytes)>,
      75              : }
      76              : 
      77              : /// Return value from Layer::get_value_reconstruct_data
      78            4 : #[derive(Clone, Copy, Debug)]
      79              : pub enum ValueReconstructResult {
      80              :     /// Got all the data needed to reconstruct the requested page
      81              :     Complete,
      82              :     /// This layer didn't contain all the required data, the caller should look up
      83              :     /// the predecessor layer at the returned LSN and collect more data from there.
      84              :     Continue,
      85              : 
      86              :     /// This layer didn't contain data needed to reconstruct the page version at
      87              :     /// the returned LSN. This is usually considered an error, but might be OK
      88              :     /// in some circumstances.
      89              :     Missing,
      90              : }
      91              : 
      92            0 : #[derive(Debug)]
      93              : pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
      94              : 
      95              : /// This struct holds two instances of [`LayerAccessStatsInner`].
      96              : /// Accesses are recorded to both instances.
      97              : /// The `for_scraping_api` instance can be reset from the management API via [`LayerAccessStatsReset`].
      98              : /// The `for_eviction_policy` is never reset.
      99        22960 : #[derive(Debug, Default, Clone)]
     100              : struct LayerAccessStatsLocked {
     101              :     for_scraping_api: LayerAccessStatsInner,
     102              :     for_eviction_policy: LayerAccessStatsInner,
     103              : }
     104              : 
     105              : impl LayerAccessStatsLocked {
     106     43844889 :     fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
     107     43844889 :         [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
     108     43844889 :     }
     109              : }
     110              : 
     111        45920 : #[derive(Debug, Default, Clone)]
     112              : struct LayerAccessStatsInner {
     113              :     first_access: Option<LayerAccessStatFullDetails>,
     114              :     count_by_access_kind: EnumMap<LayerAccessKind, u64>,
     115              :     task_kind_flag: EnumSet<TaskKind>,
     116              :     last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
     117              :     last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
     118              : }
     119              : 
     120         1130 : #[derive(Debug, Clone, Copy)]
     121              : pub(crate) struct LayerAccessStatFullDetails {
     122              :     pub(crate) when: SystemTime,
     123              :     pub(crate) task_kind: TaskKind,
     124              :     pub(crate) access_kind: LayerAccessKind,
     125              : }
     126              : 
     127            0 : #[derive(Clone, Copy, strum_macros::EnumString)]
     128              : pub enum LayerAccessStatsReset {
     129              :     NoReset,
     130              :     JustTaskKindFlags,
     131              :     AllStats,
     132              : }
     133              : 
     134         8585 : fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
     135         8585 :     ts.duration_since(UNIX_EPOCH)
     136         8585 :         .expect("better to die in this unlikely case than report false stats")
     137         8585 :         .as_millis()
     138         8585 :         .try_into()
     139         8585 :         .expect("64 bits is enough for few more years")
     140         8585 : }
     141              : 
     142              : impl LayerAccessStatFullDetails {
     143         8585 :     fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
     144         8585 :         let Self {
     145         8585 :             when,
     146         8585 :             task_kind,
     147         8585 :             access_kind,
     148         8585 :         } = self;
     149         8585 :         pageserver_api::models::LayerAccessStatFullDetails {
     150         8585 :             when_millis_since_epoch: system_time_to_millis_since_epoch(when),
     151         8585 :             task_kind: task_kind.into(), // into static str, powered by strum_macros
     152         8585 :             access_kind: *access_kind,
     153         8585 :         }
     154         8585 :     }
     155              : }
     156              : 
     157              : impl LayerAccessStats {
     158              :     /// Create an empty stats object.
     159              :     ///
     160              :     /// The caller is responsible for recording a residence event
     161              :     /// using [`record_residence_event`] before calling `latest_activity`.
     162              :     /// If they don't, [`latest_activity`] will return `None`.
     163              :     ///
     164              :     /// [`record_residence_event`]: Self::record_residence_event
     165              :     /// [`latest_activity`]: Self::latest_activity
     166        16620 :     pub(crate) fn empty_will_record_residence_event_later() -> Self {
     167        16620 :         LayerAccessStats(Mutex::default())
     168        16620 :     }
     169              : 
     170              :     /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
     171              :     ///
     172              :     /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
     173              :     ///
     174              :     /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
     175              :     /// [`record_residence_event`]: Self::record_residence_event
     176         6340 :     pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
     177         6340 :         let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
     178         6340 :         new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
     179         6340 :         new
     180         6340 :     }
     181              : 
     182              :     /// Creates a clone of `self` and records `new_status` in the clone.
     183              :     ///
     184              :     /// The `new_status` is not recorded in `self`.
     185              :     ///
     186              :     /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
     187              :     ///
     188              :     /// [`record_residence_event`]: Self::record_residence_event
     189         1217 :     pub(crate) fn clone_for_residence_change(
     190         1217 :         &self,
     191         1217 :         new_status: LayerResidenceStatus,
     192         1217 :     ) -> LayerAccessStats {
     193         1217 :         let clone = {
     194         1217 :             let inner = self.0.lock().unwrap();
     195         1217 :             inner.clone()
     196         1217 :         };
     197         1217 :         let new = LayerAccessStats(Mutex::new(clone));
     198         1217 :         new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
     199         1217 :         new
     200         1217 :     }
     201              : 
     202              :     /// Record a change in layer residency.
     203              :     ///
     204              :     /// Recording the event must happen while holding the layer map lock to
     205              :     /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs)
     206              :     /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`.
     207              :     ///
     208              :     /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock,
     209              :     /// the following race could happen:
     210              :     ///
     211              :     /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp.
     212              :     /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
     213              :     /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
     214              :     /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
     215              :     ///
     216        26186 :     pub(crate) fn record_residence_event(
     217        26186 :         &self,
     218        26186 :         status: LayerResidenceStatus,
     219        26186 :         reason: LayerResidenceEventReason,
     220        26186 :     ) {
     221        26186 :         let mut locked = self.0.lock().unwrap();
     222        52372 :         locked.iter_mut().for_each(|inner| {
     223        52372 :             inner
     224        52372 :                 .last_residence_changes
     225        52372 :                 .write(LayerResidenceEvent::new(status, reason))
     226        52372 :         });
     227        26186 :     }
     228              : 
     229     45700285 :     fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
     230     45700285 :         if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
     231      1881582 :             return;
     232     43818703 :         }
     233     43818703 : 
     234     43818703 :         let this_access = LayerAccessStatFullDetails {
     235     43818703 :             when: SystemTime::now(),
     236     43818703 :             task_kind: ctx.task_kind(),
     237     43818703 :             access_kind,
     238     43818703 :         };
     239     43818703 : 
     240     43818703 :         let mut locked = self.0.lock().unwrap();
     241     87637444 :         locked.iter_mut().for_each(|inner| {
     242     87637444 :             inner.first_access.get_or_insert(this_access);
     243     87637444 :             inner.count_by_access_kind[access_kind] += 1;
     244     87637444 :             inner.task_kind_flag |= ctx.task_kind();
     245     87637444 :             inner.last_accesses.write(this_access);
     246     87637444 :         })
     247     45700285 :     }
     248              : 
     249         2273 :     fn as_api_model(
     250         2273 :         &self,
     251         2273 :         reset: LayerAccessStatsReset,
     252         2273 :     ) -> pageserver_api::models::LayerAccessStats {
     253         2273 :         let mut locked = self.0.lock().unwrap();
     254         2273 :         let inner = &mut locked.for_scraping_api;
     255         2273 :         let LayerAccessStatsInner {
     256         2273 :             first_access,
     257         2273 :             count_by_access_kind,
     258         2273 :             task_kind_flag,
     259         2273 :             last_accesses,
     260         2273 :             last_residence_changes,
     261         2273 :         } = inner;
     262         2273 :         let ret = pageserver_api::models::LayerAccessStats {
     263         2273 :             access_count_by_access_kind: count_by_access_kind
     264         2273 :                 .iter()
     265         9092 :                 .map(|(kind, count)| (kind, *count))
     266         2273 :                 .collect(),
     267         2273 :             task_kind_access_flag: task_kind_flag
     268         2273 :                 .iter()
     269         2273 :                 .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
     270         2273 :                 .collect(),
     271         2273 :             first: first_access.as_ref().map(|a| a.as_api_model()),
     272         8074 :             accesses_history: last_accesses.map(|m| m.as_api_model()),
     273         2273 :             residence_events_history: last_residence_changes.clone(),
     274         2273 :         };
     275         2273 :         match reset {
     276         2273 :             LayerAccessStatsReset::NoReset => (),
     277            0 :             LayerAccessStatsReset::JustTaskKindFlags => {
     278            0 :                 inner.task_kind_flag.clear();
     279            0 :             }
     280            0 :             LayerAccessStatsReset::AllStats => {
     281            0 :                 *inner = LayerAccessStatsInner::default();
     282            0 :             }
     283              :         }
     284         2273 :         ret
     285         2273 :     }
     286              : 
     287              :     /// Get the latest access timestamp, falling back to latest residence event.
     288              :     ///
     289              :     /// This function can only return `None` if there has not yet been a call to the
     290              :     /// [`record_residence_event`] method. That would generally be considered an
     291              :     /// implementation error. This function logs a rate-limited warning in that case.
     292              :     ///
     293              :     /// TODO: use type system to avoid the need for `fallback`.
     294              :     /// The approach in <https://github.com/neondatabase/neon/pull/3775>
     295              :     /// could be used to enforce that a residence event is recorded
     296              :     /// before a layer is added to the layer map. We could also have
     297              :     /// a layer wrapper type that holds the LayerAccessStats, and ensure
     298              :     /// that that type can only be produced by inserting into the layer map.
     299              :     ///
     300              :     /// [`record_residence_event`]: Self::record_residence_event
     301         1237 :     pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
     302         1237 :         let locked = self.0.lock().unwrap();
     303         1237 :         let inner = &locked.for_eviction_policy;
     304         1237 :         match inner.last_accesses.recent() {
     305          783 :             Some(a) => Some(a.when),
     306          454 :             None => match inner.last_residence_changes.recent() {
     307          454 :                 Some(e) => Some(e.timestamp),
     308              :                 None => {
     309              :                     static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
     310            0 :                         Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
     311            0 :                     let mut guard = WARN_RATE_LIMIT.lock().unwrap();
     312            0 :                     guard.0 += 1;
     313            0 :                     let occurences = guard.0;
     314            0 :                     guard.1.call(move || {
     315            0 :                         warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
     316            0 :                     });
     317            0 :                     None
     318              :                 }
     319              :             },
     320              :         }
     321         1237 :     }
     322              : }
     323              : 
     324              : /// Supertrait of the [`Layer`] trait that captures the bare minimum interface
     325              : /// required by [`LayerMap`](super::layer_map::LayerMap).
     326              : ///
     327              : /// All layers should implement a minimal `std::fmt::Debug` without tenant or
     328              : /// timeline names, because those are known in the context of which the layers
     329              : /// are used in (timeline).
     330              : #[async_trait::async_trait]
     331              : pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
     332              :     ///
     333              :     /// Return data needed to reconstruct given page at LSN.
     334              :     ///
     335              :     /// It is up to the caller to collect more data from previous layer and
     336              :     /// perform WAL redo, if necessary.
     337              :     ///
     338              :     /// See ValueReconstructResult for possible return values. The collected data
     339              :     /// is appended to reconstruct_data; the caller should pass an empty struct
     340              :     /// on first call, or a struct with a cached older image of the page if one
     341              :     /// is available. If this returns ValueReconstructResult::Continue, look up
     342              :     /// the predecessor layer and call again with the same 'reconstruct_data' to
     343              :     /// collect more data.
     344              :     async fn get_value_reconstruct_data(
     345              :         &self,
     346              :         key: Key,
     347              :         lsn_range: Range<Lsn>,
     348              :         reconstruct_data: &mut ValueReconstructState,
     349              :         ctx: &RequestContext,
     350              :     ) -> Result<ValueReconstructResult>;
     351              : }
     352              : 
     353              : /// Get a layer descriptor from a layer.
     354              : pub trait AsLayerDesc {
     355              :     /// Get the layer descriptor.
     356              :     fn layer_desc(&self) -> &PersistentLayerDesc;
     357              : }
     358              : 
     359              : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
     360              : /// range of LSNs.
     361              : ///
     362              : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
     363              : /// layers are used to ingest incoming WAL, and provide fast access to the
     364              : /// recent page versions. On-disk layers are stored as files on disk, and are
     365              : /// immutable. This trait presents the common functionality of in-memory and
     366              : /// on-disk layers.
     367              : ///
     368              : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
     369              : /// A delta layer contains all modifications within a range of LSNs and keys.
     370              : /// An image layer is a snapshot of all the data in a key-range, at a single
     371              : /// LSN.
     372              : pub trait PersistentLayer: Layer + AsLayerDesc {
     373              :     /// File name used for this layer, both in the pageserver's local filesystem
     374              :     /// state as well as in the remote storage.
     375        15719 :     fn filename(&self) -> LayerFileName {
     376        15719 :         self.layer_desc().filename()
     377        15719 :     }
     378              : 
     379              :     // Path to the layer file in the local filesystem.
     380              :     // `None` for `RemoteLayer`.
     381              :     fn local_path(&self) -> Option<PathBuf>;
     382              : 
     383              :     /// Permanently remove this layer from disk.
     384              :     fn delete_resident_layer_file(&self) -> Result<()>;
     385              : 
     386            4 :     fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
     387            4 :         None
     388            4 :     }
     389              : 
     390            0 :     fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
     391            0 :         None
     392            0 :     }
     393              : 
     394     45706926 :     fn is_remote_layer(&self) -> bool {
     395     45706926 :         false
     396     45706926 :     }
     397              : 
     398              :     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
     399              : 
     400              :     fn access_stats(&self) -> &LayerAccessStats;
     401              : }
     402              : 
     403     45696324 : pub fn downcast_remote_layer(
     404     45696324 :     layer: &Arc<dyn PersistentLayer>,
     405     45696324 : ) -> Option<std::sync::Arc<RemoteLayer>> {
     406     45696324 :     if layer.is_remote_layer() {
     407          993 :         Arc::clone(layer).downcast_remote_layer()
     408              :     } else {
     409     45695331 :         None
     410              :     }
     411     45696324 : }
     412              : 
     413              : pub mod tests {
     414              :     use super::*;
     415              : 
     416              :     impl From<DeltaFileName> for PersistentLayerDesc {
     417            3 :         fn from(value: DeltaFileName) -> Self {
     418            3 :             PersistentLayerDesc::new_delta(
     419            3 :                 TenantId::from_array([0; 16]),
     420            3 :                 TimelineId::from_array([0; 16]),
     421            3 :                 value.key_range,
     422            3 :                 value.lsn_range,
     423            3 :                 233,
     424            3 :             )
     425            3 :         }
     426              :     }
     427              : 
     428              :     impl From<ImageFileName> for PersistentLayerDesc {
     429            1 :         fn from(value: ImageFileName) -> Self {
     430            1 :             PersistentLayerDesc::new_img(
     431            1 :                 TenantId::from_array([0; 16]),
     432            1 :                 TimelineId::from_array([0; 16]),
     433            1 :                 value.key_range,
     434            1 :                 value.lsn,
     435            1 :                 233,
     436            1 :             )
     437            1 :         }
     438              :     }
     439              : 
     440              :     impl From<LayerFileName> for PersistentLayerDesc {
     441            4 :         fn from(value: LayerFileName) -> Self {
     442            4 :             match value {
     443            3 :                 LayerFileName::Delta(d) => Self::from(d),
     444            1 :                 LayerFileName::Image(i) => Self::from(i),
     445              :             }
     446            4 :         }
     447              :     }
     448              : }
     449              : 
     450              : /// Helper enum to hold a PageServerConf, or a path
     451              : ///
     452              : /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
     453              : /// global config, and paths to layer files are constructed using the tenant/timeline
     454              : /// path from the config. But in the 'pagectl' binary, we need to construct a Layer
     455              : /// struct for a file on disk, without having a page server running, so that we have no
     456              : /// config. In that case, we use the Path variant to hold the full path to the file on
     457              : /// disk.
     458              : enum PathOrConf {
     459              :     Path(PathBuf),
     460              :     Conf(&'static PageServerConf),
     461              : }
     462              : 
     463              : /// Range wrapping newtype, which uses display to render Debug.
     464              : ///
     465              : /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
     466              : struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
     467              : 
     468              : impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
     469            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     470            0 :         write!(f, "{}..{}", self.0.start, self.0.end)
     471            0 :     }
     472              : }
        

Generated by: LCOV version 2.1-beta