LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - layer.rs (source / functions) Coverage Total Hit
Test: 190869232aac3a234374e5bb62582e91cf5f5818.info Lines: 63.0 % 1094 689
Test Date: 2024-02-23 13:21:27 Functions: 61.3 % 155 95

            Line data    Source code
       1              : use anyhow::Context;
       2              : use camino::{Utf8Path, Utf8PathBuf};
       3              : use pageserver_api::keyspace::KeySpace;
       4              : use pageserver_api::models::{
       5              :     HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
       6              : };
       7              : use pageserver_api::shard::ShardIndex;
       8              : use std::ops::Range;
       9              : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
      10              : use std::sync::{Arc, Weak};
      11              : use std::time::SystemTime;
      12              : use tracing::Instrument;
      13              : use utils::lsn::Lsn;
      14              : use utils::sync::heavier_once_cell;
      15              : 
      16              : use crate::config::PageServerConf;
      17              : use crate::context::RequestContext;
      18              : use crate::repository::Key;
      19              : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
      20              : use crate::tenant::timeline::GetVectoredError;
      21              : use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
      22              : 
      23              : use super::delta_layer::{self, DeltaEntry};
      24              : use super::image_layer;
      25              : use super::{
      26              :     AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
      27              :     ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
      28              : };
      29              : 
      30              : use utils::generation::Generation;
      31              : 
      32              : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
      33              : /// range of LSNs.
      34              : ///
      35              : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
      36              : /// layers are used to ingest incoming WAL, and provide fast access to the
      37              : /// recent page versions. On-disk layers are stored as files on disk, and are
       38              : /// immutable. This type represents the on-disk kind, while the in-memory kind is represented by
      39              : /// [`InMemoryLayer`].
      40              : ///
      41              : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
      42              : /// A delta layer contains all modifications within a range of LSNs and keys.
      43              : /// An image layer is a snapshot of all the data in a key-range, at a single
      44              : /// LSN.
      45              : ///
      46              : /// This type models the on-disk layers, which can be evicted and on-demand downloaded.
      47              : ///
      48              : /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
      49       251424 : #[derive(Clone)]
      50              : pub(crate) struct Layer(Arc<LayerInner>);
      51              : 
      52              : impl std::fmt::Display for Layer {
      53         1158 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      54         1158 :         if matches!(self.0.generation, Generation::Broken) {
      55            0 :             write!(f, "{}-broken", self.layer_desc().short_id())
      56              :         } else {
      57         1158 :             write!(
      58         1158 :                 f,
      59         1158 :                 "{}{}",
      60         1158 :                 self.layer_desc().short_id(),
      61         1158 :                 self.0.generation.get_suffix()
      62         1158 :             )
      63              :         }
      64         1158 :     }
      65              : }
      66              : 
      67              : impl std::fmt::Debug for Layer {
      68            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      69            0 :         write!(f, "{}", self)
      70            0 :     }
      71              : }
      72              : 
      73              : impl AsLayerDesc for Layer {
      74       377701 :     fn layer_desc(&self) -> &PersistentLayerDesc {
      75       377701 :         self.0.layer_desc()
      76       377701 :     }
      77              : }
      78              : 
      79              : impl Layer {
      80              :     /// Creates a layer value for a file we know to not be resident.
      81            0 :     pub(crate) fn for_evicted(
      82            0 :         conf: &'static PageServerConf,
      83            0 :         timeline: &Arc<Timeline>,
      84            0 :         file_name: LayerFileName,
      85            0 :         metadata: LayerFileMetadata,
      86            0 :     ) -> Self {
      87            0 :         let desc = PersistentLayerDesc::from_filename(
      88            0 :             timeline.tenant_shard_id,
      89            0 :             timeline.timeline_id,
      90            0 :             file_name,
      91            0 :             metadata.file_size(),
      92            0 :         );
      93            0 : 
      94            0 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
      95            0 : 
      96            0 :         let owner = Layer(Arc::new(LayerInner::new(
      97            0 :             conf,
      98            0 :             timeline,
      99            0 :             access_stats,
     100            0 :             desc,
     101            0 :             None,
     102            0 :             metadata.generation,
     103            0 :             metadata.shard,
     104            0 :         )));
     105              : 
     106            0 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
     107              : 
     108            0 :         owner
     109            0 :     }
     110              : 
     111              :     /// Creates a Layer value for a file we know to be resident in timeline directory.
     112           24 :     pub(crate) fn for_resident(
     113           24 :         conf: &'static PageServerConf,
     114           24 :         timeline: &Arc<Timeline>,
     115           24 :         file_name: LayerFileName,
     116           24 :         metadata: LayerFileMetadata,
     117           24 :     ) -> ResidentLayer {
     118           24 :         let desc = PersistentLayerDesc::from_filename(
     119           24 :             timeline.tenant_shard_id,
     120           24 :             timeline.timeline_id,
     121           24 :             file_name,
     122           24 :             metadata.file_size(),
     123           24 :         );
     124           24 : 
     125           24 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
     126           24 : 
     127           24 :         let mut resident = None;
     128           24 : 
     129           24 :         let owner = Layer(Arc::new_cyclic(|owner| {
     130           24 :             let inner = Arc::new(DownloadedLayer {
     131           24 :                 owner: owner.clone(),
     132           24 :                 kind: tokio::sync::OnceCell::default(),
     133           24 :                 version: 0,
     134           24 :             });
     135           24 :             resident = Some(inner.clone());
     136           24 : 
     137           24 :             LayerInner::new(
     138           24 :                 conf,
     139           24 :                 timeline,
     140           24 :                 access_stats,
     141           24 :                 desc,
     142           24 :                 Some(inner),
     143           24 :                 metadata.generation,
     144           24 :                 metadata.shard,
     145           24 :             )
     146           24 :         }));
     147           24 : 
     148           24 :         let downloaded = resident.expect("just initialized");
     149              : 
     150           24 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
     151              : 
     152           24 :         timeline
     153           24 :             .metrics
     154           24 :             .resident_physical_size_add(metadata.file_size());
     155           24 : 
     156           24 :         ResidentLayer { downloaded, owner }
     157           24 :     }
     158              : 
     159              :     /// Creates a Layer value for freshly written out new layer file by renaming it from a
     160              :     /// temporary path.
     161          550 :     pub(crate) fn finish_creating(
     162          550 :         conf: &'static PageServerConf,
     163          550 :         timeline: &Arc<Timeline>,
     164          550 :         desc: PersistentLayerDesc,
     165          550 :         temp_path: &Utf8Path,
     166          550 :     ) -> anyhow::Result<ResidentLayer> {
     167          550 :         let mut resident = None;
     168          550 : 
     169          550 :         let owner = Layer(Arc::new_cyclic(|owner| {
     170          550 :             let inner = Arc::new(DownloadedLayer {
     171          550 :                 owner: owner.clone(),
     172          550 :                 kind: tokio::sync::OnceCell::default(),
     173          550 :                 version: 0,
     174          550 :             });
     175          550 :             resident = Some(inner.clone());
     176          550 :             let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
     177          550 :             access_stats.record_residence_event(
     178          550 :                 LayerResidenceStatus::Resident,
     179          550 :                 LayerResidenceEventReason::LayerCreate,
     180          550 :             );
     181          550 :             LayerInner::new(
     182          550 :                 conf,
     183          550 :                 timeline,
     184          550 :                 access_stats,
     185          550 :                 desc,
     186          550 :                 Some(inner),
     187          550 :                 timeline.generation,
     188          550 :                 timeline.get_shard_index(),
     189          550 :             )
     190          550 :         }));
     191          550 : 
     192          550 :         let downloaded = resident.expect("just initialized");
     193          550 : 
     194          550 :         // if the rename works, the path is as expected
     195          550 :         std::fs::rename(temp_path, owner.local_path())
     196          550 :             .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
     197              : 
     198          550 :         Ok(ResidentLayer { downloaded, owner })
     199          550 :     }
     200              : 
     201              :     /// Requests the layer to be evicted and waits for this to be done.
     202              :     ///
     203              :     /// If the file is not resident, an [`EvictionError::NotFound`] is returned.
     204              :     ///
      205              :     /// If, due to bad luck or blocking of the executor, we miss the actual eviction and the layer is
     206              :     /// re-downloaded, [`EvictionError::Downloaded`] is returned.
     207              :     ///
      208              :     /// Technically cancellation safe, but cancelling might shift the viewpoint of which generation
      209              :     /// of the download-evict cycle is observed on retry.
     210            4 :     pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
     211            4 :         self.0.evict_and_wait().await
     212            4 :     }
     213              : 
      214              :     /// Delete the layer file when `self` gets dropped, and also try to schedule a remote index upload
      215              :     /// at that point.
     216              :     ///
     217              :     /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
     218              :     /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
     219              :     /// the value this is called on gets dropped.
     220              :     ///
     221              :     /// This is ensured by both of those methods accepting references to Layer.
     222              :     ///
     223              :     /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
     224              :     /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
     225          300 :     pub(crate) fn delete_on_drop(&self) {
     226          300 :         self.0.delete_on_drop();
     227          300 :     }
     228              : 
     229              :     /// Return data needed to reconstruct given page at LSN.
     230              :     ///
     231              :     /// It is up to the caller to collect more data from the previous layer and
     232              :     /// perform WAL redo, if necessary.
     233              :     ///
     234              :     /// # Cancellation-Safety
     235              :     ///
     236              :     /// This method is cancellation-safe.
     237       123851 :     pub(crate) async fn get_value_reconstruct_data(
     238       123851 :         &self,
     239       123851 :         key: Key,
     240       123851 :         lsn_range: Range<Lsn>,
     241       123851 :         reconstruct_data: &mut ValueReconstructState,
     242       123851 :         ctx: &RequestContext,
     243       123851 :     ) -> anyhow::Result<ValueReconstructResult> {
     244              :         use anyhow::ensure;
     245              : 
     246       123851 :         let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     247       123851 :         self.0
     248       123851 :             .access_stats
     249       123851 :             .record_access(LayerAccessKind::GetValueReconstructData, ctx);
     250       123851 : 
     251       123851 :         if self.layer_desc().is_delta {
     252       123339 :             ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
     253       123339 :             ensure!(self.layer_desc().key_range.contains(&key));
     254              :         } else {
     255          512 :             ensure!(self.layer_desc().key_range.contains(&key));
     256          512 :             ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
     257          512 :             ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
     258              :         }
     259              : 
     260       123851 :         layer
     261       123851 :             .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
     262       123851 :             .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
     263        23596 :             .await
     264       123851 :             .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     265       123851 :     }
     266              : 
     267           10 :     pub(crate) async fn get_values_reconstruct_data(
     268           10 :         &self,
     269           10 :         keyspace: KeySpace,
     270           10 :         end_lsn: Lsn,
     271           10 :         reconstruct_data: &mut ValuesReconstructState,
     272           10 :         ctx: &RequestContext,
     273           10 :     ) -> Result<(), GetVectoredError> {
     274           10 :         let layer = self
     275           10 :             .0
     276           10 :             .get_or_maybe_download(true, Some(ctx))
     277            0 :             .await
     278           10 :             .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
     279              : 
     280           10 :         self.0
     281           10 :             .access_stats
     282           10 :             .record_access(LayerAccessKind::GetValueReconstructData, ctx);
     283           10 : 
     284           10 :         layer
     285           10 :             .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx)
     286           10 :             .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
     287           28 :             .await
     288           10 :     }
     289              : 
     290              :     /// Download the layer if evicted.
     291              :     ///
     292              :     /// Will not error when the layer is already downloaded.
     293            0 :     pub(crate) async fn download(&self) -> anyhow::Result<()> {
     294            0 :         self.0.get_or_maybe_download(true, None).await?;
     295            0 :         Ok(())
     296            0 :     }
     297              : 
     298              :     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     299              :     /// while the guard exists.
     300              :     ///
     301              :     /// Returns None if the layer is currently evicted.
     302            4 :     pub(crate) async fn keep_resident(&self) -> anyhow::Result<Option<ResidentLayer>> {
     303            4 :         let downloaded = match self.0.get_or_maybe_download(false, None).await {
     304            2 :             Ok(d) => d,
     305              :             // technically there are a lot of possible errors, but in practice it should only be
     306              :             // DownloadRequired which is tripped up. could work to improve this situation
     307              :             // statically later.
     308            2 :             Err(DownloadError::DownloadRequired) => return Ok(None),
     309            0 :             Err(e) => return Err(e.into()),
     310              :         };
     311              : 
     312            2 :         Ok(Some(ResidentLayer {
     313            2 :             downloaded,
     314            2 :             owner: self.clone(),
     315            2 :         }))
     316            4 :     }
     317              : 
     318              :     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
     319          300 :     pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
     320          300 :         let downloaded = self.0.get_or_maybe_download(true, None).await?;
     321              : 
     322          300 :         Ok(ResidentLayer {
     323          300 :             downloaded,
     324          300 :             owner: self.clone(),
     325          300 :         })
     326          300 :     }
     327              : 
     328            0 :     pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     329            0 :         self.0.info(reset)
     330            0 :     }
     331              : 
     332            0 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
     333            0 :         &self.0.access_stats
     334            0 :     }
     335              : 
     336          654 :     pub(crate) fn local_path(&self) -> &Utf8Path {
     337          654 :         &self.0.path
     338          654 :     }
     339              : 
     340          558 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
     341          558 :         self.0.metadata()
     342          558 :     }
     343              : 
     344              :     /// Traditional debug dumping facility
     345              :     #[allow(unused)]
     346            4 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
     347            4 :         self.0.desc.dump();
     348            4 : 
     349            4 :         if verbose {
     350              :             // for now, unconditionally download everything, even if that might not be wanted.
     351            4 :             let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     352            8 :             l.dump(&self.0, ctx).await?
     353            0 :         }
     354              : 
     355            4 :         Ok(())
     356            4 :     }
     357              : 
     358              :     /// Waits until this layer has been dropped (and if needed, local file deletion and remote
     359              :     /// deletion scheduling has completed).
     360              :     ///
     361              :     /// Does not start local deletion, use [`Self::delete_on_drop`] for that
      362              :     /// separately.
     363              :     #[cfg(feature = "testing")]
     364            0 :     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
     365            0 :         let mut rx = self.0.status.subscribe();
     366              : 
     367            0 :         async move {
     368              :             loop {
     369            0 :                 if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
     370            0 :                     break;
     371            0 :                 }
     372              :             }
     373            0 :         }
     374            0 :     }
     375              : }
     376              : 
     377              : /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
     378              : ///
     379              : /// However when we want something evicted, we cannot evict it right away as there might be current
     380              : /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
     381              : /// read with [`Layer::get_value_reconstruct_data`].
     382              : ///
     383              : /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
     384            0 : #[derive(Debug)]
     385              : enum ResidentOrWantedEvicted {
     386              :     Resident(Arc<DownloadedLayer>),
     387              :     WantedEvicted(Weak<DownloadedLayer>, usize),
     388              : }
     389              : 
     390              : impl ResidentOrWantedEvicted {
     391       124167 :     fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
     392       124167 :         match self {
     393       124167 :             ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
     394            0 :             ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
     395            0 :                 Some(strong) => {
     396            0 :                     LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
     397            0 : 
     398            0 :                     *self = ResidentOrWantedEvicted::Resident(strong.clone());
     399            0 : 
     400            0 :                     Some((strong, true))
     401              :                 }
     402            0 :                 None => None,
     403              :             },
     404              :         }
     405       124167 :     }
     406              : 
     407              :     /// When eviction is first requested, drop down to holding a [`Weak`].
     408              :     ///
     409              :     /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
     410              :     /// drop the possibly last strong reference outside of the mutex of
     411              :     /// heavier_once_cell::OnceCell.
     412            2 :     fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
     413            2 :         match self {
     414            2 :             ResidentOrWantedEvicted::Resident(strong) => {
     415            2 :                 let weak = Arc::downgrade(strong);
     416            2 :                 let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
     417            2 :                 std::mem::swap(self, &mut temp);
     418            2 :                 match temp {
     419            2 :                     ResidentOrWantedEvicted::Resident(strong) => Some(strong),
     420            0 :                     ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
     421              :                 }
     422              :             }
     423            0 :             ResidentOrWantedEvicted::WantedEvicted(..) => None,
     424              :         }
     425            2 :     }
     426              : }
     427              : 
     428              : struct LayerInner {
     429              :     /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
     430              :     /// [`Self::path`].
     431              :     conf: &'static PageServerConf,
     432              : 
     433              :     /// Full path to the file; unclear if this should exist anymore.
     434              :     path: Utf8PathBuf,
     435              : 
     436              :     desc: PersistentLayerDesc,
     437              : 
     438              :     /// Timeline access is needed for remote timeline client and metrics.
     439              :     timeline: Weak<Timeline>,
     440              : 
     441              :     /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
     442              :     have_remote_client: bool,
     443              : 
     444              :     access_stats: LayerAccessStats,
     445              : 
     446              :     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
     447              :     /// Initialization and deinitialization are done while holding a permit.
     448              :     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
     449              : 
      450              :     /// Do we want to delete this locally and remotely when `LayerInner` is dropped?
     451              :     wanted_deleted: AtomicBool,
     452              : 
     453              :     /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
     454              :     /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
     455              :     /// [`LayerInner::on_downloaded_layer_drop`].
     456              :     wanted_evicted: AtomicBool,
     457              : 
     458              :     /// Version is to make sure we will only evict a specific download of a file.
     459              :     ///
     460              :     /// Incremented for each download, stored in `DownloadedLayer::version` or
     461              :     /// `ResidentOrWantedEvicted::WantedEvicted`.
     462              :     version: AtomicUsize,
     463              : 
     464              :     /// Allow subscribing to when the layer actually gets evicted.
     465              :     status: tokio::sync::broadcast::Sender<Status>,
     466              : 
     467              :     /// Counter for exponential backoff with the download
     468              :     consecutive_failures: AtomicUsize,
     469              : 
     470              :     /// The generation of this Layer.
     471              :     ///
     472              :     /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
     473              :     /// for created layers from [`Timeline::generation`].
     474              :     generation: Generation,
     475              : 
     476              :     /// The shard of this Layer.
     477              :     ///
     478              :     /// For layers created in this process, this will always be the [`ShardIndex`] of the
      479              :     /// current `ShardIdentity` (TODO: add link once it's introduced).
     480              :     ///
     481              :     /// For loaded layers, this may be some other value if the tenant has undergone
     482              :     /// a shard split since the layer was originally written.
     483              :     shard: ShardIndex,
     484              : 
     485              :     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
     486              : }
     487              : 
     488              : impl std::fmt::Display for LayerInner {
     489            4 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     490            4 :         write!(f, "{}", self.layer_desc().short_id())
     491            4 :     }
     492              : }
     493              : 
     494              : impl AsLayerDesc for LayerInner {
     495       379205 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     496       379205 :         &self.desc
     497       379205 :     }
     498              : }
     499              : 
     500            2 : #[derive(Debug, Clone, Copy)]
     501              : enum Status {
     502              :     Evicted,
     503              :     Downloaded,
     504              : }
     505              : 
     506              : impl Drop for LayerInner {
     507          324 :     fn drop(&mut self) {
     508          324 :         if !*self.wanted_deleted.get_mut() {
     509              :             // should we try to evict if the last wish was for eviction?
      510              :             // feels like there's some hazard of overcrowding near shutdown, but we don't
     511              :             // run drops during shutdown (yet)
     512           24 :             return;
     513          300 :         }
     514              : 
     515          300 :         let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
     516              : 
     517          300 :         let path = std::mem::take(&mut self.path);
     518          300 :         let file_name = self.layer_desc().filename();
     519          300 :         let file_size = self.layer_desc().file_size;
     520          300 :         let timeline = self.timeline.clone();
     521          300 :         let meta = self.metadata();
     522          300 :         let status = self.status.clone();
     523          300 : 
     524          300 :         crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
     525          300 :             let _g = span.entered();
     526          300 : 
     527          300 :             // carry this until we are finished for [`Layer::wait_drop`] support
     528          300 :             let _status = status;
     529              : 
     530          300 :             let removed = match std::fs::remove_file(path) {
     531          300 :                 Ok(()) => true,
     532            0 :                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
     533            0 :                     // until we no longer do detaches by removing all local files before removing the
     534            0 :                     // tenant from the global map, we will always get these errors even if we knew what
     535            0 :                     // is the latest state.
     536            0 :                     //
     537            0 :                     // we currently do not track the latest state, so we'll also end up here on evicted
     538            0 :                     // layers.
     539            0 :                     false
     540              :                 }
     541            0 :                 Err(e) => {
     542            0 :                     tracing::error!("failed to remove wanted deleted layer: {e}");
     543            0 :                     LAYER_IMPL_METRICS.inc_delete_removes_failed();
     544            0 :                     false
     545              :                 }
     546              :             };
     547              : 
     548          300 :             if let Some(timeline) = timeline.upgrade() {
     549          300 :                 if removed {
     550          300 :                     timeline.metrics.resident_physical_size_sub(file_size);
     551          300 :                 }
     552          300 :                 if let Some(remote_client) = timeline.remote_client.as_ref() {
     553          300 :                     let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
     554              : 
     555          300 :                     if let Err(e) = res {
     556              :                         // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
     557              :                         // demonstrating this deadlock (without spawn_blocking): stop will drop
     558              :                         // queued items, which will have ResidentLayer's, and those drops would try
     559              :                         // to re-entrantly lock the RemoteTimelineClient inner state.
     560            0 :                         if !timeline.is_active() {
     561            0 :                             tracing::info!("scheduling deletion on drop failed: {e:#}");
     562              :                         } else {
     563            0 :                             tracing::warn!("scheduling deletion on drop failed: {e:#}");
     564              :                         }
     565            0 :                         LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
     566          300 :                     } else {
     567          300 :                         LAYER_IMPL_METRICS.inc_completed_deletes();
     568          300 :                     }
     569            0 :                 }
     570            0 :             } else {
     571            0 :                 // no need to nag that timeline is gone: under normal situation on
     572            0 :                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
     573            0 :                 LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
     574            0 :             }
     575          300 :         });
     576          324 :     }
     577              : }
     578              : 
     579              : impl LayerInner {
     580          574 :     fn new(
     581          574 :         conf: &'static PageServerConf,
     582          574 :         timeline: &Arc<Timeline>,
     583          574 :         access_stats: LayerAccessStats,
     584          574 :         desc: PersistentLayerDesc,
     585          574 :         downloaded: Option<Arc<DownloadedLayer>>,
     586          574 :         generation: Generation,
     587          574 :         shard: ShardIndex,
     588          574 :     ) -> Self {
     589          574 :         let path = conf
     590          574 :             .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
     591          574 :             .join(desc.filename().to_string());
     592              :         // a given DownloadedLayer makes the layer start out resident, carrying over its version
     593          574 :         let (inner, version) = if let Some(inner) = downloaded {
     594          574 :             let version = inner.version;
     595          574 :             let resident = ResidentOrWantedEvicted::Resident(inner);
     596          574 :             (heavier_once_cell::OnceCell::new(resident), version)
     597              :         } else {
     598            0 :             (heavier_once_cell::OnceCell::default(), 0)
     599              :         };
     600              :         // only a weak reference to the timeline is kept; of the status channel only the sender is kept
     601          574 :         LayerInner {
     602          574 :             conf,
     603          574 :             path,
     604          574 :             desc,
     605          574 :             timeline: Arc::downgrade(timeline),
     606          574 :             have_remote_client: timeline.remote_client.is_some(),
     607          574 :             access_stats,
     608          574 :             wanted_deleted: AtomicBool::new(false),
     609          574 :             wanted_evicted: AtomicBool::new(false),
     610          574 :             inner,
     611          574 :             version: AtomicUsize::new(version),
     612          574 :             status: tokio::sync::broadcast::channel(1).0,
     613          574 :             consecutive_failures: AtomicUsize::new(0),
     614          574 :             generation,
     615          574 :             shard,
     616          574 :             last_evicted_at: std::sync::Mutex::default(),
     617          574 :         }
     618          574 :     }
     619              :     /// Marks the layer to be deleted when this `LayerInner` gets dropped, see the `Drop` impl.
     620          300 :     fn delete_on_drop(&self) {
     621          300 :         let res =
     622          300 :             self.wanted_deleted
     623          300 :                 .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
     624          300 :         // only the call which flipped the flag is counted as having started the delete
     625          300 :         if res.is_ok() {
     626          300 :             LAYER_IMPL_METRICS.inc_started_deletes();
     627          300 :         }
     628          300 :     }
     629              : 
     630              :     /// Cancellation safe, however dropping the future and calling this method again might result
     631              :     /// in a new attempt to evict OR join the previously started attempt.
     632            4 :     pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
     633            4 :         use tokio::sync::broadcast::error::RecvError;
     634            4 :         // callers must only request eviction for layers created with a remote client
     635            4 :         assert!(self.have_remote_client);
     636              :         // subscribe before expressing the wish, so that the outcome sent afterwards is received
     637            4 :         let mut rx = self.status.subscribe();
     638              : 
     639            2 :         let strong = {
     640            4 :             match self.inner.get() {
     641            2 :                 Some(mut either) => {
     642            2 :                     self.wanted_evicted.store(true, Ordering::Relaxed);
     643            2 :                     either.downgrade()
     644              :                 }
     645            2 :                 None => return Err(EvictionError::NotFound),
     646              :             }
     647              :         };
     648              : 
     649            2 :         if strong.is_some() {
     650            2 :             // drop the DownloadedLayer outside of holding the guard
     651            2 :             drop(strong);
     652            2 :             LAYER_IMPL_METRICS.inc_started_evictions();
     653            2 :         }
     654              :         // wait for the outcome: the eviction completed, or a download or access cancelled it
     655            2 :         match rx.recv().await {
     656            2 :             Ok(Status::Evicted) => Ok(()),
     657            0 :             Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
     658              :             Err(RecvError::Closed) => {
     659            0 :                 unreachable!("sender cannot be dropped while we are in &self method")
     660              :             }
     661              :             Err(RecvError::Lagged(_)) => {
     662              :                 // this is quite unlikely, but we are blocking a lot in the async context, so
     663              :                 // we might be missing this because we are stuck on a LIFO slot on a thread
     664              :                 // which is busy blocking for a 1TB database create_image_layers.
     665              :                 //
     666              :                 // use however late (compared to the initial expressing of wanted) as the
     667              :                 // "outcome" now
     668            0 :                 LAYER_IMPL_METRICS.inc_broadcast_lagged();
     669            0 :                 match self.inner.get() {
     670            0 :                     Some(_) => Err(EvictionError::Downloaded),
     671            0 :                     None => Ok(()),
     672              :                 }
     673              :             }
     674              :         }
     675            4 :     }
     676              : 
     677              :     /// Cancellation safe.
     678       124169 :     async fn get_or_maybe_download(
     679       124169 :         self: &Arc<Self>,
     680       124169 :         allow_download: bool,
     681       124169 :         ctx: Option<&RequestContext>,
     682       124169 :     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
     683       124169 :         let mut init_permit = None;
     684              :         // `init_permit` is only set at the end of a loop iteration, for use by the next one
     685              :         loop {
     686       124169 :             let download = move |permit| {
     687            2 :                 async move {
     688            2 :                     // disable any scheduled but not yet running eviction deletions for this
     689            2 :                     let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
     690            2 : 
     691            2 :                     // count cancellations, which currently remain largely unexpected
     692            2 :                     let init_cancelled =
     693            2 :                         scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
     694            2 : 
     695            2 :                     // no need to make the evict_and_wait wait for the actual download to complete
     696            2 :                     drop(self.status.send(Status::Downloaded));
     697              : 
     698            2 :                     let timeline = self
     699            2 :                         .timeline
     700            2 :                         .upgrade()
     701            2 :                         .ok_or_else(|| DownloadError::TimelineShutdown)?;
     702              : 
     703              :                     // FIXME: grab a gate
     704              : 
     705            2 :                     let can_ever_evict = timeline.remote_client.as_ref().is_some();
     706              : 
     707              :                     // check if we really need to be downloaded; could have been already downloaded by a
     708              :                     // cancelled previous attempt.
     709            2 :                     let needs_download = self
     710            2 :                         .needs_download()
     711            2 :                         .await
     712            2 :                         .map_err(DownloadError::PreStatFailed)?;
     713              : 
     714            2 :                     let permit = if let Some(reason) = needs_download {
     715            2 :                         if let NeedsDownload::NotFile(ft) = reason {
     716            0 :                             return Err(DownloadError::NotFile(ft));
     717            2 :                         }
     718            2 : 
     719            2 :                         // only reset this after we've decided we really need to download. otherwise it'd
     720            2 :                         // be impossible to mark cancelled downloads for eviction, like one could imagine
     721            2 :                         // we would like to do for prefetching which was not needed.
     722            2 :                         self.wanted_evicted.store(false, Ordering::Release);
     723            2 : 
     724            2 :                         if !can_ever_evict {
     725            0 :                             return Err(DownloadError::NoRemoteStorage);
     726            2 :                         }
     727              : 
     728            2 :                         if let Some(ctx) = ctx {
     729            0 :                             self.check_expected_download(ctx)?;
     730            2 :                         }
     731              : 
     732            2 :                         if !allow_download {
     733              :                             // this does look weird, but for LayerInner the "downloading" means also changing
     734              :                             // internal once related state ...
     735            2 :                             return Err(DownloadError::DownloadRequired);
     736            0 :                         }
     737            0 : 
     738            0 :                         tracing::info!(%reason, "downloading on-demand");
     739              : 
     740            0 :                         self.spawn_download_and_wait(timeline, permit).await?
     741              :                     } else {
     742              :                         // the file is present locally, probably by a previous but cancelled call to
     743              :                         // get_or_maybe_download. alternatively we might be running without remote storage.
     744            0 :                         LAYER_IMPL_METRICS.inc_init_needed_no_download();
     745            0 : 
     746            0 :                         permit
     747              :                     };
     748              : 
     749            0 :                     let since_last_eviction =
     750            0 :                         self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
     751            0 :                     if let Some(since_last_eviction) = since_last_eviction {
     752            0 :                         // FIXME: this will not always be recorded correctly until #6028 (the no
     753            0 :                         // download needed branch above)
     754            0 :                         LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
     755            0 :                     }
     756              : 
     757            0 :                     let res = Arc::new(DownloadedLayer {
     758            0 :                         owner: Arc::downgrade(self),
     759            0 :                         kind: tokio::sync::OnceCell::default(),
     760            0 :                         version: next_version,
     761            0 :                     });
     762            0 : 
     763            0 :                     self.access_stats.record_residence_event(
     764            0 :                         LayerResidenceStatus::Resident,
     765            0 :                         LayerResidenceEventReason::ResidenceChange,
     766            0 :                     );
     767            0 : 
     768            0 :                     let waiters = self.inner.initializer_count();
     769            0 :                     if waiters > 0 {
     770            0 :                         tracing::info!(
     771            0 :                             waiters,
     772            0 :                             "completing the on-demand download for other tasks"
     773            0 :                         );
     774            0 :                     }
     775              : 
     776            0 :                     scopeguard::ScopeGuard::into_inner(init_cancelled);
     777            0 : 
     778            0 :                     Ok((ResidentOrWantedEvicted::Resident(res), permit))
     779            2 :                 }
     780            2 :                 .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
     781            2 :             };
     782              : 
     783       124169 :             if let Some(init_permit) = init_permit.take() {
     784              :                 // use the already held initialization permit because it is impossible to hit the
     785              :                 // below paths anymore essentially limiting the max loop iterations to 2.
     786            0 :                 let (value, init_permit) = download(init_permit).await?;
     787            0 :                 let mut guard = self.inner.set(value, init_permit);
     788            0 :                 let (strong, _upgraded) = guard
     789            0 :                     .get_and_upgrade()
     790            0 :                     .expect("init creates strong reference, we held the init permit");
     791            0 :                 return Ok(strong);
     792       124169 :             }
     793              : 
     794            0 :             let (weak, permit) = {
     795       124169 :                 let mut locked = self.inner.get_or_init(download).await?;
     796              :                 // the cell is initialized here; a strong reference may still need upgrading back
     797       124167 :                 if let Some((strong, upgraded)) = locked.get_and_upgrade() {
     798       124167 :                     if upgraded {
     799            0 :                         // when upgraded back, the Arc<DownloadedLayer> is still available, but
     800            0 :                         // previously an `evict_and_wait` was received.
     801            0 :                         self.wanted_evicted.store(false, Ordering::Relaxed);
     802            0 : 
     803            0 :                         // error out any `evict_and_wait`
     804            0 :                         drop(self.status.send(Status::Downloaded));
     805            0 :                         LAYER_IMPL_METRICS
     806            0 :                             .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
     807       124167 :                     }
     808              : 
     809       124167 :                     return Ok(strong);
     810              :                 } else {
     811              :                     // path to here: the evict_blocking is stuck on spawn_blocking queue.
     812              :                     //
     813              :                     // reset the contents, deactivating the eviction and causing a
     814              :                     // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
     815            0 :                     locked.take_and_deinit()
     816            0 :                 }
     817            0 :             };
     818            0 : 
     819            0 :             // unlock first, then drop the weak, but because upgrade failed, we
     820            0 :             // know it cannot be a problem.
     821            0 : 
     822            0 :             assert!(
     823            0 :                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
     824            0 :                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
     825              :             );
     826              : 
     827            0 :             init_permit = Some(permit);
     828            0 : 
     829            0 :             LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
     830              :         }
     831       124169 :     }
     832              : 
     833              :     /// Nag or fail per RequestContext policy
     834            0 :     fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
     835            0 :         use crate::context::DownloadBehavior::*;
     836            0 :         let b = ctx.download_behavior();
     837            0 :         match b {
     838            0 :             Download => Ok(()),
     839              :             Warn | Error => {
     840            0 :                 tracing::info!(
     841            0 :                     "unexpectedly on-demand downloading for task kind {:?}",
     842            0 :                     ctx.task_kind()
     843            0 :                 );
     844            0 :                 crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
     845              :                 // `Error` only fails the download unless the config downgrades errors to warnings
     846            0 :                 let really_error =
     847            0 :                     matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
     848              : 
     849            0 :                 if really_error {
     850              :                     // this check is only probabilistic, seems like a flakiness footgun
     851            0 :                     Err(DownloadError::ContextAndConfigReallyDeniesDownloads)
     852              :                 } else {
     853            0 :                     Ok(())
     854              :                 }
     855              :             }
     856              :         }
     857            0 :     }
     858              : 
     859              :     /// Actual download, at most one is executed at the time.
     860            0 :     async fn spawn_download_and_wait(
     861            0 :         self: &Arc<Self>,
     862            0 :         timeline: Arc<Timeline>,
     863            0 :         permit: heavier_once_cell::InitPermit,
     864            0 :     ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
     865            0 :         debug_assert_current_span_has_tenant_and_timeline_id();
     866            0 : 
     867            0 :         let task_name = format!("download layer {}", self);
     868            0 :         // the spawned task sends back the download result together with the init permit
     869            0 :         let (tx, rx) = tokio::sync::oneshot::channel();
     870            0 : 
     871            0 :         // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
     872            0 :         // block tenant::mgr::remove_tenant_from_memory.
     873            0 : 
     874            0 :         let this: Arc<Self> = self.clone();
     875            0 : 
     876            0 :         crate::task_mgr::spawn(
     877            0 :             &tokio::runtime::Handle::current(),
     878            0 :             crate::task_mgr::TaskKind::RemoteDownloadTask,
     879            0 :             Some(self.desc.tenant_shard_id),
     880            0 :             Some(self.desc.timeline_id),
     881            0 :             &task_name,
     882            0 :             false,
     883            0 :             async move {
     884            0 : 
     885            0 :                 let client = timeline
     886            0 :                     .remote_client
     887            0 :                     .as_ref()
     888            0 :                     .expect("checked above with have_remote_client");
     889              : 
     890            0 :                 let result = client.download_layer_file(
     891            0 :                     &this.desc.filename(),
     892            0 :                     &this.metadata(),
     893            0 :                     &crate::task_mgr::shutdown_token()
     894            0 :                 )
     895            0 :                 .await;
     896              : 
     897            0 :                 let result = match result {
     898            0 :                     Ok(size) => {
     899            0 :                         timeline.metrics.resident_physical_size_add(size);
     900            0 :                         Ok(())
     901              :                     }
     902            0 :                     Err(e) => {
     903            0 :                         let consecutive_failures =
     904            0 :                             this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
     905            0 : 
     906            0 :                         let backoff = utils::backoff::exponential_backoff_duration_seconds(
     907            0 :                             consecutive_failures.min(u32::MAX as usize) as u32,
     908            0 :                             1.5,
     909            0 :                             60.0,
     910            0 :                         );
     911            0 : 
     912            0 :                         let backoff = std::time::Duration::from_secs_f64(backoff);
     913            0 :                         // the error is reported only after the backoff elapsed or a cancellation fired
     914            0 :                         tokio::select! {
     915            0 :                             _ = tokio::time::sleep(backoff) => {},
     916            0 :                             _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
     917            0 :                             _ = timeline.cancel.cancelled() => {},
     918            0 :                         };
     919              : 
     920            0 :                         Err(e)
     921              :                     }
     922              :                 };
     923              :                 // sending fails only if the receiving side (our caller) has been dropped
     924            0 :                 if let Err(res) = tx.send((result, permit)) {
     925            0 :                     match res {
     926            0 :                         (Ok(()), _) => {
     927            0 :                             // our caller is cancellation safe so this is fine; if someone
     928            0 :                             // else requests the layer, they'll find it already downloaded.
     929            0 :                             //
     930            0 :                             // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
     931            0 :                             //
     932            0 :                             // FIXME(#6028): however, could be that we should consider marking the
     933            0 :                             // layer for eviction? alas, cannot: because only DownloadedLayer will
     934            0 :                             // handle that.
     935            0 :                         },
     936            0 :                         (Err(e), _) => {
     937            0 :                             // our caller is cancellation safe, but we might be racing with
     938            0 :                             // another attempt to initialize. before we have cancellation
     939            0 :                             // token support: these attempts should converge regardless of
     940            0 :                             // their completion order.
     941            0 :                             tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}");
     942            0 :                             LAYER_IMPL_METRICS.inc_download_failed_without_requester();
     943              :                         }
     944              :                     }
     945            0 :                 }
     946              : 
     947            0 :                 Ok(())
     948            0 :             }
     949            0 :             .in_current_span(),
     950            0 :         );
     951            0 :         match rx.await {
     952            0 :             Ok((Ok(()), permit)) => {
     953            0 :                 if let Some(reason) = self
     954            0 :                     .needs_download()
     955            0 :                     .await
     956            0 :                     .map_err(DownloadError::PostStatFailed)?
     957              :                 {
     958              :                     // this is really a bug in needs_download or remote timeline client
     959            0 :                     panic!("post-condition failed: needs_download returned {reason:?}");
     960            0 :                 }
     961            0 :                 // a successful download resets the failure count used for the backoff
     962            0 :                 self.consecutive_failures.store(0, Ordering::Relaxed);
     963            0 :                 tracing::info!("on-demand download successful");
     964              : 
     965            0 :                 Ok(permit)
     966              :             }
     967            0 :             Ok((Err(e), _permit)) => {
     968            0 :                 // sleep already happened in the spawned task, if it was not cancelled
     969            0 :                 let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
     970            0 : 
     971            0 :                 match e.downcast_ref::<remote_storage::DownloadError>() {
     972              :                     // If the download failed due to its cancellation token,
     973              :                     // propagate the cancellation error upstream.
     974              :                     Some(remote_storage::DownloadError::Cancelled) => {
     975            0 :                         Err(DownloadError::DownloadCancelled)
     976              :                     }
     977              :                     _ => {
     978            0 :                         tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
     979            0 :                         Err(DownloadError::DownloadFailed)
     980              :                     }
     981              :                 }
     982              :             }
     983            0 :             Err(_gone) => Err(DownloadError::DownloadCancelled),
     984              :         }
     985            0 :     }
     986              :     /// Stats the local file; `Ok(None)` if it is a regular file of the expected size, else the reason to download.
     987            2 :     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
     988            2 :         match tokio::fs::metadata(&self.path).await {
     989            0 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     990            2 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
     991            0 :             Err(e) => Err(e),
     992              :         }
     993            2 :     }
     994              :     /// Same checks as `needs_download`, but reads the metadata with a blocking call.
     995           24 :     fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
     996           24 :         match self.path.metadata() {
     997           24 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     998            0 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
     999            0 :             Err(e) => Err(e),
    1000              :         }
    1001           24 :     }
    1002              :     /// Checks that `m` describes a regular file whose length equals the layer descriptor's `file_size`.
    1003           24 :     fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
    1004           24 :         // in future, this should include sha2-256 validation of the file.
    1005           24 :         if !m.is_file() {
    1006            0 :             Err(NeedsDownload::NotFile(m.file_type()))
    1007           24 :         } else if m.len() != self.desc.file_size {
    1008            0 :             Err(NeedsDownload::WrongSize {
    1009            0 :                 actual: m.len(),
    1010            0 :                 expected: self.desc.file_size,
    1011            0 :             })
    1012              :         } else {
    1013           24 :             Ok(())
    1014              :         }
    1015           24 :     }
    1016              : 
    1017            0 :     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
    1018            0 :         let layer_file_name = self.desc.filename().file_name();
    1019            0 : 
    1020            0 :         // this is not accurate: we could have the file locally but there was a cancellation
    1021            0 :         // and now we are not in sync, or we are currently downloading it.
    1022            0 :         let remote = self.inner.get().is_none();
    1023            0 : 
    1024            0 :         let access_stats = self.access_stats.as_api_model(reset);
    1025            0 : 
    1026            0 :         if self.desc.is_delta {
    1027            0 :             let lsn_range = &self.desc.lsn_range;
    1028            0 : 
    1029            0 :             HistoricLayerInfo::Delta {
    1030            0 :                 layer_file_name,
    1031            0 :                 layer_file_size: self.desc.file_size,
    1032            0 :                 lsn_start: lsn_range.start,
    1033            0 :                 lsn_end: lsn_range.end,
    1034            0 :                 remote,
    1035            0 :                 access_stats,
    1036            0 :             }
    1037              :         } else {
    1038            0 :             let lsn = self.desc.image_layer_lsn();
    1039            0 : 
    1040            0 :             HistoricLayerInfo::Image {
    1041            0 :                 layer_file_name,
    1042            0 :                 layer_file_size: self.desc.file_size,
    1043            0 :                 lsn_start: lsn,
    1044            0 :                 remote,
    1045            0 :                 access_stats,
    1046            0 :             }
    1047              :         }
    1048            0 :     }
    1049              : 
    1050              :     /// `DownloadedLayer` is being dropped, so it calls this method.
    1051            2 :     fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
    1052            2 :         let delete = self.wanted_deleted.load(Ordering::Acquire);
    1053            2 :         let evict = self.wanted_evicted.load(Ordering::Acquire);
    1054            2 :         let can_evict = self.have_remote_client;
    1055            2 : 
    1056            2 :         if delete {
    1057            0 :             // do nothing now, only in LayerInner::drop -- this was originally implemented because
    1058            0 :             // we could have already scheduled the deletion at the time.
    1059            0 :             //
    1060            0 :             // FIXME: this is not true anymore, we can safely evict wanted deleted files.
    1061            2 :         } else if can_evict && evict {
    1062            2 :             let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
    1063              : 
    1064              :             // downgrade for queueing, in case there's a tear down already ongoing we should not
    1065              :             // hold it alive.
    1066            2 :             let this = Arc::downgrade(&self);
    1067            2 :             drop(self);
    1068            2 : 
    1069            2 :             // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
    1070            2 :             // drop while the `self.inner` is being locked, leading to a deadlock.
    1071            2 : 
    1072            2 :             crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
    1073            2 :                 let _g = span.entered();
    1074              : 
    1075              :                 // if LayerInner is already dropped here, do nothing because the delete on drop
    1076              :                 // has already run while we were in the queue
    1077            2 :                 let Some(this) = this.upgrade() else {
    1078            0 :                     LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
    1079            0 :                     return;
    1080              :                 };
    1081            2 :                 match this.evict_blocking(version) {
    1082            2 :                     Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
    1083            0 :                     Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
    1084              :                 }
    1085            2 :             });
    1086            0 :         }
    1087            2 :     }
    1088              : 
    1089            2 :     fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
    1090              :         // deleted or detached timeline, don't do anything.
    1091            2 :         let Some(timeline) = self.timeline.upgrade() else {
    1092            0 :             return Err(EvictionCancelled::TimelineGone);
    1093              :         };
    1094              : 
    1095              :         // to avoid starting a new download while we evict, keep holding on to the
    1096              :         // permit.
    1097            2 :         let _permit = {
    1098            2 :             let maybe_downloaded = self.inner.get();
    1099              : 
    1100            2 :             let (_weak, permit) = match maybe_downloaded {
    1101            2 :                 Some(mut guard) => {
    1102            2 :                     if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
    1103            2 :                         if *version == only_version {
    1104            2 :                             guard.take_and_deinit()
    1105              :                         } else {
    1106              :                             // this was not for us; maybe there's another eviction job
    1107              :                             // TODO: does it make any sense to stall here? unique versions do not
    1108              :                             // matter, we only want to make sure not to evict a resident, which we
    1109              :                             // are not doing.
    1110            0 :                             return Err(EvictionCancelled::VersionCheckFailed);
    1111              :                         }
    1112              :                     } else {
    1113            0 :                         return Err(EvictionCancelled::AlreadyReinitialized);
    1114              :                     }
    1115              :                 }
    1116              :                 None => {
    1117              :                     // already deinitialized, perhaps get_or_maybe_download did this and is
    1118              :                     // currently waiting to reinitialize it
    1119            0 :                     return Err(EvictionCancelled::LostToDownload);
    1120              :                 }
    1121              :             };
    1122              : 
    1123            2 :             permit
    1124            2 :         };
    1125            2 : 
    1126            2 :         // now accesses to inner.get_or_init wait on the semaphore or the `_permit`
    1127            2 : 
    1128            2 :         self.access_stats.record_residence_event(
    1129            2 :             LayerResidenceStatus::Evicted,
    1130            2 :             LayerResidenceEventReason::ResidenceChange,
    1131            2 :         );
    1132              : 
    1133            2 :         let res = match capture_mtime_and_remove(&self.path) {
    1134            2 :             Ok(local_layer_mtime) => {
    1135            2 :                 let duration = SystemTime::now().duration_since(local_layer_mtime);
    1136            2 :                 match duration {
    1137            2 :                     Ok(elapsed) => {
    1138            2 :                         timeline
    1139            2 :                             .metrics
    1140            2 :                             .evictions_with_low_residence_duration
    1141            2 :                             .read()
    1142            2 :                             .unwrap()
    1143            2 :                             .observe(elapsed);
    1144            2 :                         tracing::info!(
    1145            2 :                             residence_millis = elapsed.as_millis(),
    1146            2 :                             "evicted layer after known residence period"
    1147            2 :                         );
    1148              :                     }
    1149              :                     Err(_) => {
    1150            0 :                         tracing::info!("evicted layer after unknown residence period");
    1151              :                     }
    1152              :                 }
    1153            2 :                 timeline.metrics.evictions.inc();
    1154            2 :                 timeline
    1155            2 :                     .metrics
    1156            2 :                     .resident_physical_size_sub(self.desc.file_size);
    1157            2 : 
    1158            2 :                 Ok(())
    1159              :             }
    1160            0 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
    1161            0 :                 tracing::error!(
    1162            0 :                     layer_size = %self.desc.file_size,
    1163            0 :                     "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
    1164            0 :                 );
    1165            0 :                 Err(EvictionCancelled::FileNotFound)
    1166              :             }
    1167            0 :             Err(e) => {
    1168            0 :                 tracing::error!("failed to evict file from disk: {e:#}");
    1169            0 :                 Err(EvictionCancelled::RemoveFailed)
    1170              :             }
    1171              :         };
    1172              : 
    1173              :         // we are still holding the permit, so no new spawn_download_and_wait can happen
    1174            2 :         drop(self.status.send(Status::Evicted));
    1175            2 : 
    1176            2 :         *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
    1177            2 : 
    1178            2 :         res
    1179            2 :     }
    1180              : 
    1181          858 :     fn metadata(&self) -> LayerFileMetadata {
    1182          858 :         LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
    1183          858 :     }
    1184              : }
    1185              : 
    1186            2 : fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
    1187            2 :     let m = path.metadata()?;
    1188            2 :     let local_layer_mtime = m.modified()?;
    1189            2 :     std::fs::remove_file(path)?;
    1190            2 :     Ok(local_layer_mtime)
    1191            2 : }
    1192              : 
    1193            0 : #[derive(Debug, thiserror::Error)]
    1194              : pub(crate) enum EvictionError {
    1195              :     #[error("layer was already evicted")]
    1196              :     NotFound,
    1197              : 
    1198              :     /// Evictions must always lose to downloads in races, and this time it happened.
    1199              :     #[error("layer was downloaded instead")]
    1200              :     Downloaded,
    1201              : }
    1202              : 
    1203              : /// Error internal to the [`LayerInner::get_or_maybe_download`]
    1204            0 : #[derive(Debug, thiserror::Error)]
    1205              : pub(crate) enum DownloadError {
    1206              :     #[error("timeline has already shutdown")]
    1207              :     TimelineShutdown,
    1208              :     #[error("no remote storage configured")]
    1209              :     NoRemoteStorage,
    1210              :     #[error("context denies downloading")]
    1211              :     ContextAndConfigReallyDeniesDownloads,
    1212              :     #[error("downloading is really required but not allowed by this method")]
    1213              :     DownloadRequired,
    1214              :     #[error("layer path exists, but it is not a file: {0:?}")]
    1215              :     NotFile(std::fs::FileType),
    1216              :     /// Why no error here? Because it will be reported by page_service. We should have also done
    1217              :     /// retries already.
    1218              :     #[error("downloading evicted layer file failed")]
    1219              :     DownloadFailed,
    1220              :     #[error("downloading failed, possibly for shutdown")]
    1221              :     DownloadCancelled,
    1222              :     #[error("pre-condition: stat before download failed")]
    1223              :     PreStatFailed(#[source] std::io::Error),
    1224              :     #[error("post-condition: stat after download failed")]
    1225              :     PostStatFailed(#[source] std::io::Error),
    1226              : }
    1227              : 
    1228            0 : #[derive(Debug, PartialEq)]
    1229              : pub(crate) enum NeedsDownload {
    1230              :     NotFound,
    1231              :     NotFile(std::fs::FileType),
    1232              :     WrongSize { actual: u64, expected: u64 },
    1233              : }
    1234              : 
    1235              : impl std::fmt::Display for NeedsDownload {
    1236            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1237            0 :         match self {
    1238            0 :             NeedsDownload::NotFound => write!(f, "file was not found"),
    1239            0 :             NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
    1240            0 :             NeedsDownload::WrongSize { actual, expected } => {
    1241            0 :                 write!(f, "file size mismatch {actual} vs. {expected}")
    1242              :             }
    1243              :         }
    1244            0 :     }
    1245              : }
    1246              : 
    1247              : /// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
    1248              : pub(crate) struct DownloadedLayer {
    1249              :     owner: Weak<LayerInner>,
    1250              :     // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
    1251              :     // DownloadedLayer
    1252              :     kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    1253              :     version: usize,
    1254              : }
    1255              : 
    1256              : impl std::fmt::Debug for DownloadedLayer {
    1257            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1258            0 :         f.debug_struct("DownloadedLayer")
    1259            0 :             // owner omitted because it is always "Weak"
    1260            0 :             .field("kind", &self.kind)
    1261            0 :             .field("version", &self.version)
    1262            0 :             .finish()
    1263            0 :     }
    1264              : }
    1265              : 
    1266              : impl Drop for DownloadedLayer {
    1267          326 :     fn drop(&mut self) {
    1268          326 :         if let Some(owner) = self.owner.upgrade() {
    1269            2 :             owner.on_downloaded_layer_drop(self.version);
    1270          324 :         } else {
    1271          324 :             // no need to do anything, we are shutting down
    1272          324 :         }
    1273          326 :     }
    1274              : }
    1275              : 
    1276              : impl DownloadedLayer {
    1277              :     /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
    1278              :     /// initialize it permanently.
    1279              :     ///
    1280              :     /// `owner` parameter is a strong reference to the same `LayerInner` as the
    1281              :     /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
    1282              :     /// we will always have the LayerInner on the callstack, so we can just use it.
    1283       124165 :     async fn get<'a>(
    1284       124165 :         &'a self,
    1285       124165 :         owner: &Arc<LayerInner>,
    1286       124165 :         ctx: &RequestContext,
    1287       124165 :     ) -> anyhow::Result<&'a LayerKind> {
    1288       124165 :         let init = || async {
    1289          450 :             assert_eq!(
    1290          450 :                 Weak::as_ptr(&self.owner),
    1291          450 :                 Arc::as_ptr(owner),
    1292            0 :                 "these are the same, just avoiding the upgrade"
    1293              :             );
    1294              : 
    1295          450 :             let res = if owner.desc.is_delta {
    1296          440 :                 let summary = Some(delta_layer::Summary::expected(
    1297          440 :                     owner.desc.tenant_shard_id.tenant_id,
    1298          440 :                     owner.desc.timeline_id,
    1299          440 :                     owner.desc.key_range.clone(),
    1300          440 :                     owner.desc.lsn_range.clone(),
    1301          440 :                 ));
    1302          440 :                 delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
    1303          440 :                     .await
    1304          440 :                     .map(|res| res.map(LayerKind::Delta))
    1305              :             } else {
    1306           10 :                 let lsn = owner.desc.image_layer_lsn();
    1307           10 :                 let summary = Some(image_layer::Summary::expected(
    1308           10 :                     owner.desc.tenant_shard_id.tenant_id,
    1309           10 :                     owner.desc.timeline_id,
    1310           10 :                     owner.desc.key_range.clone(),
    1311           10 :                     lsn,
    1312           10 :                 ));
    1313           10 :                 image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
    1314           10 :                     .await
    1315           10 :                     .map(|res| res.map(LayerKind::Image))
    1316              :             };
    1317              : 
    1318          450 :             match res {
    1319          450 :                 Ok(Ok(layer)) => Ok(Ok(layer)),
    1320            0 :                 Ok(Err(transient)) => Err(transient),
    1321            0 :                 Err(permanent) => {
    1322            0 :                     LAYER_IMPL_METRICS.inc_permanent_loading_failures();
    1323            0 :                     // TODO(#5815): we are not logging all errors, so temporarily log them **once**
    1324            0 :                     // here as well
    1325            0 :                     let permanent = permanent.context("load layer");
    1326            0 :                     tracing::error!("layer loading failed permanently: {permanent:#}");
    1327            0 :                     Ok(Err(permanent))
    1328              :                 }
    1329              :             }
    1330          900 :         };
    1331       124165 :         self.kind
    1332       124165 :             .get_or_try_init(init)
    1333              :             // return transient errors using `?`
    1334          450 :             .await?
    1335       124165 :             .as_ref()
    1336       124165 :             .map_err(|e| {
    1337            0 :                 // errors are not cloneable, cannot but stringify
    1338            0 :                 // test_broken_timeline matches this string
    1339            0 :                 anyhow::anyhow!("layer loading failed: {e:#}")
    1340       124165 :             })
    1341       124165 :     }
    1342              : 
    1343       123851 :     async fn get_value_reconstruct_data(
    1344       123851 :         &self,
    1345       123851 :         key: Key,
    1346       123851 :         lsn_range: Range<Lsn>,
    1347       123851 :         reconstruct_data: &mut ValueReconstructState,
    1348       123851 :         owner: &Arc<LayerInner>,
    1349       123851 :         ctx: &RequestContext,
    1350       123851 :     ) -> anyhow::Result<ValueReconstructResult> {
    1351       123851 :         use LayerKind::*;
    1352       123851 : 
    1353       123851 :         match self.get(owner, ctx).await? {
    1354       123339 :             Delta(d) => {
    1355       123339 :                 d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
    1356        22918 :                     .await
    1357              :             }
    1358          512 :             Image(i) => {
    1359          512 :                 i.get_value_reconstruct_data(key, reconstruct_data, ctx)
    1360          452 :                     .await
    1361              :             }
    1362              :         }
    1363       123851 :     }
    1364              : 
    1365           10 :     async fn get_values_reconstruct_data(
    1366           10 :         &self,
    1367           10 :         keyspace: KeySpace,
    1368           10 :         end_lsn: Lsn,
    1369           10 :         reconstruct_data: &mut ValuesReconstructState,
    1370           10 :         owner: &Arc<LayerInner>,
    1371           10 :         ctx: &RequestContext,
    1372           10 :     ) -> Result<(), GetVectoredError> {
    1373           10 :         use LayerKind::*;
    1374           10 : 
    1375           10 :         match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
    1376           10 :             Delta(d) => {
    1377           10 :                 d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx)
    1378           18 :                     .await
    1379              :             }
    1380            0 :             Image(i) => {
    1381            0 :                 i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
    1382            0 :                     .await
    1383              :             }
    1384              :         }
    1385           10 :     }
    1386              : 
    1387            4 :     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
    1388            4 :         use LayerKind::*;
    1389            4 :         match self.get(owner, ctx).await? {
    1390            4 :             Delta(d) => d.dump(ctx).await?,
    1391            0 :             Image(i) => i.dump(ctx).await?,
    1392              :         }
    1393              : 
    1394            4 :         Ok(())
    1395            4 :     }
    1396              : }
    1397              : 
    1398              : /// Wrapper around an actual layer implementation.
    1399            0 : #[derive(Debug)]
    1400              : enum LayerKind {
    1401              :     Delta(delta_layer::DeltaLayerInner),
    1402              :     Image(image_layer::ImageLayerInner),
    1403              : }
    1404              : 
    1405              : /// Guard for forcing a layer to be resident while it exists.
    1406          518 : #[derive(Clone)]
    1407              : pub(crate) struct ResidentLayer {
    1408              :     owner: Layer,
    1409              :     downloaded: Arc<DownloadedLayer>,
    1410              : }
    1411              : 
    1412              : impl std::fmt::Display for ResidentLayer {
    1413         1158 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1414         1158 :         write!(f, "{}", self.owner)
    1415         1158 :     }
    1416              : }
    1417              : 
    1418              : impl std::fmt::Debug for ResidentLayer {
    1419            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1420            0 :         write!(f, "{}", self.owner)
    1421            0 :     }
    1422              : }
    1423              : 
    1424              : impl ResidentLayer {
    1425              :     /// Release the eviction guard, converting back into a plain [`Layer`].
    1426              :     ///
    1427              :     /// You can access the [`Layer`] also by using `as_ref`.
    1428          318 :     pub(crate) fn drop_eviction_guard(self) -> Layer {
    1429          318 :         self.into()
    1430          318 :     }
    1431              : 
    1432              :     /// Loads all keys stored in the layer. Returns key, lsn and value size.
    1433          600 :     #[tracing::instrument(skip_all, fields(layer=%self))]
    1434              :     pub(crate) async fn load_keys<'a>(
    1435              :         &'a self,
    1436              :         ctx: &RequestContext,
    1437              :     ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
    1438              :         use LayerKind::*;
    1439              : 
    1440              :         let owner = &self.owner.0;
    1441              : 
    1442              :         match self.downloaded.get(owner, ctx).await? {
    1443              :             Delta(ref d) => {
    1444              :                 owner
    1445              :                     .access_stats
    1446              :                     .record_access(LayerAccessKind::KeyIter, ctx);
    1447              : 
    1448              :                 // this is valid because the DownloadedLayer::kind is a OnceCell, not a
    1449              :                 // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
    1450              :                 // while it's being held.
    1451              :                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
    1452              :                     .await
    1453              :                     .context("Layer index is corrupted")
    1454              :             }
    1455              :             Image(_) => anyhow::bail!("cannot load_keys on a image layer"),
    1456              :         }
    1457              :     }
    1458              : 
    1459         1104 :     pub(crate) fn local_path(&self) -> &Utf8Path {
    1460         1104 :         &self.owner.0.path
    1461         1104 :     }
    1462              : 
    1463          558 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
    1464          558 :         self.owner.metadata()
    1465          558 :     }
    1466              : }
    1467              : 
    1468              : impl AsLayerDesc for ResidentLayer {
    1469         1272 :     fn layer_desc(&self) -> &PersistentLayerDesc {
    1470         1272 :         self.owner.layer_desc()
    1471         1272 :     }
    1472              : }
    1473              : 
    1474              : impl AsRef<Layer> for ResidentLayer {
    1475          580 :     fn as_ref(&self) -> &Layer {
    1476          580 :         &self.owner
    1477          580 :     }
    1478              : }
    1479              : 
    1480              : /// Drop the eviction guard.
    1481              : impl From<ResidentLayer> for Layer {
    1482          318 :     fn from(value: ResidentLayer) -> Self {
    1483          318 :         value.owner
    1484          318 :     }
    1485              : }
    1486              : 
    1487              : use metrics::IntCounter;
    1488              : 
    1489              : pub(crate) struct LayerImplMetrics {
    1490              :     started_evictions: IntCounter,
    1491              :     completed_evictions: IntCounter,
    1492              :     cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
    1493              : 
    1494              :     started_deletes: IntCounter,
    1495              :     completed_deletes: IntCounter,
    1496              :     failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
    1497              : 
    1498              :     rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
    1499              :     inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
    1500              :     redownload_after: metrics::Histogram,
    1501              : }
    1502              : 
    1503              : impl Default for LayerImplMetrics {
    1504            8 :     fn default() -> Self {
    1505            8 :         use enum_map::Enum;
    1506            8 : 
    1507            8 :         // reminder: these will be pageserver_layer_* with "_total" suffix
    1508            8 : 
    1509            8 :         let started_evictions = metrics::register_int_counter!(
    1510            8 :             "pageserver_layer_started_evictions",
    1511            8 :             "Evictions started in the Layer implementation"
    1512            8 :         )
    1513            8 :         .unwrap();
    1514            8 :         let completed_evictions = metrics::register_int_counter!(
    1515            8 :             "pageserver_layer_completed_evictions",
    1516            8 :             "Evictions completed in the Layer implementation"
    1517            8 :         )
    1518            8 :         .unwrap();
    1519            8 : 
    1520            8 :         let cancelled_evictions = metrics::register_int_counter_vec!(
    1521            8 :             "pageserver_layer_cancelled_evictions_count",
    1522            8 :             "Different reasons for evictions to have been cancelled or failed",
    1523            8 :             &["reason"]
    1524            8 :         )
    1525            8 :         .unwrap();
    1526            8 : 
    1527           64 :         let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1528           64 :             let reason = EvictionCancelled::from_usize(i);
    1529           64 :             let s = reason.as_str();
    1530           64 :             cancelled_evictions.with_label_values(&[s])
    1531           64 :         }));
    1532            8 : 
    1533            8 :         let started_deletes = metrics::register_int_counter!(
    1534            8 :             "pageserver_layer_started_deletes",
    1535            8 :             "Deletions on drop pending in the Layer implementation"
    1536            8 :         )
    1537            8 :         .unwrap();
    1538            8 :         let completed_deletes = metrics::register_int_counter!(
    1539            8 :             "pageserver_layer_completed_deletes",
    1540            8 :             "Deletions on drop completed in the Layer implementation"
    1541            8 :         )
    1542            8 :         .unwrap();
    1543            8 : 
    1544            8 :         let failed_deletes = metrics::register_int_counter_vec!(
    1545            8 :             "pageserver_layer_failed_deletes_count",
    1546            8 :             "Different reasons for deletions on drop to have failed",
    1547            8 :             &["reason"]
    1548            8 :         )
    1549            8 :         .unwrap();
    1550            8 : 
    1551           16 :         let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1552           16 :             let reason = DeleteFailed::from_usize(i);
    1553           16 :             let s = reason.as_str();
    1554           16 :             failed_deletes.with_label_values(&[s])
    1555           16 :         }));
    1556            8 : 
    1557            8 :         let rare_counters = metrics::register_int_counter_vec!(
    1558            8 :             "pageserver_layer_assumed_rare_count",
    1559            8 :             "Times unexpected or assumed rare event happened",
    1560            8 :             &["event"]
    1561            8 :         )
    1562            8 :         .unwrap();
    1563            8 : 
    1564           56 :         let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1565           56 :             let event = RareEvent::from_usize(i);
    1566           56 :             let s = event.as_str();
    1567           56 :             rare_counters.with_label_values(&[s])
    1568           56 :         }));
    1569            8 : 
    1570            8 :         let inits_cancelled = metrics::register_int_counter!(
    1571            8 :             "pageserver_layer_inits_cancelled_count",
    1572            8 :             "Times Layer initialization was cancelled",
    1573            8 :         )
    1574            8 :         .unwrap();
    1575            8 : 
    1576            8 :         let redownload_after = {
    1577            8 :             let minute = 60.0;
    1578            8 :             let hour = 60.0 * minute;
    1579            8 :             metrics::register_histogram!(
    1580            8 :                 "pageserver_layer_redownloaded_after",
    1581            8 :                 "Time between evicting and re-downloading.",
    1582            8 :                 vec![
    1583            8 :                     10.0,
    1584            8 :                     30.0,
    1585            8 :                     minute,
    1586            8 :                     5.0 * minute,
    1587            8 :                     15.0 * minute,
    1588            8 :                     30.0 * minute,
    1589            8 :                     hour,
    1590            8 :                     12.0 * hour,
    1591            8 :                 ]
    1592            8 :             )
    1593            8 :             .unwrap()
    1594            8 :         };
    1595            8 : 
    1596            8 :         Self {
    1597            8 :             started_evictions,
    1598            8 :             completed_evictions,
    1599            8 :             cancelled_evictions,
    1600            8 : 
    1601            8 :             started_deletes,
    1602            8 :             completed_deletes,
    1603            8 :             failed_deletes,
    1604            8 : 
    1605            8 :             rare_counters,
    1606            8 :             inits_cancelled,
    1607            8 :             redownload_after,
    1608            8 :         }
    1609            8 :     }
    1610              : }
    1611              : 
    1612              : impl LayerImplMetrics {
    1613            2 :     fn inc_started_evictions(&self) {
    1614            2 :         self.started_evictions.inc();
    1615            2 :     }
    1616            2 :     fn inc_completed_evictions(&self) {
    1617            2 :         self.completed_evictions.inc();
    1618            2 :     }
    1619            0 :     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
    1620            0 :         self.cancelled_evictions[reason].inc()
    1621            0 :     }
    1622              : 
    1623          300 :     fn inc_started_deletes(&self) {
    1624          300 :         self.started_deletes.inc();
    1625          300 :     }
    1626          300 :     fn inc_completed_deletes(&self) {
    1627          300 :         self.completed_deletes.inc();
    1628          300 :     }
    1629            0 :     fn inc_deletes_failed(&self, reason: DeleteFailed) {
    1630            0 :         self.failed_deletes[reason].inc();
    1631            0 :     }
    1632              : 
    1633              :     /// Counted separately from failed layer deletes because we will complete the layer deletion
    1634              :     /// attempt regardless of failure to delete local file.
    1635            0 :     fn inc_delete_removes_failed(&self) {
    1636            0 :         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
    1637            0 :     }
    1638              : 
    1639              :     /// Expected to be rare because it requires a race with `evict_blocking` and `get_or_maybe_download`.
    1640            0 :     fn inc_retried_get_or_maybe_download(&self) {
    1641            0 :         self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
    1642            0 :     }
    1643              : 
    1644              :     /// Expected rare because cancellations are unexpected, and failures are unexpected
    1645            0 :     fn inc_download_failed_without_requester(&self) {
    1646            0 :         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
    1647            0 :     }
    1648              : 
    1649              :     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
    1650              :     ///
    1651              :     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    1652              :     /// Option.
    1653            0 :     fn inc_raced_wanted_evicted_accesses(&self) {
    1654            0 :         self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
    1655            0 :     }
    1656              : 
    1657              :     /// These are only expected for [`Self::inc_init_cancelled`] amount when
    1658              :     /// running with remote storage.
    1659            0 :     fn inc_init_needed_no_download(&self) {
    1660            0 :         self.rare_counters[RareEvent::InitWithoutDownload].inc();
    1661            0 :     }
    1662              : 
    1663              :     /// Expected rare because all layer files should be readable and good
    1664            0 :     fn inc_permanent_loading_failures(&self) {
    1665            0 :         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
    1666            0 :     }
    1667              : 
    1668            0 :     fn inc_broadcast_lagged(&self) {
    1669            0 :         self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
    1670            0 :     }
    1671              : 
    1672            2 :     fn inc_init_cancelled(&self) {
    1673            2 :         self.inits_cancelled.inc()
    1674            2 :     }
    1675              : 
    1676            0 :     fn record_redownloaded_after(&self, duration: std::time::Duration) {
    1677            0 :         self.redownload_after.observe(duration.as_secs_f64())
    1678            0 :     }
    1679              : }
    1680              : 
    1681           64 : #[derive(enum_map::Enum)]
    1682              : enum EvictionCancelled {
    1683              :     LayerGone,
    1684              :     TimelineGone,
    1685              :     VersionCheckFailed,
    1686              :     FileNotFound,
    1687              :     RemoveFailed,
    1688              :     AlreadyReinitialized,
    1689              :     /// Not evicted because of a pending reinitialization
    1690              :     LostToDownload,
    1691              :     /// After eviction, there was a new layer access which cancelled the eviction.
    1692              :     UpgradedBackOnAccess,
    1693              : }
    1694              : 
    1695              : impl EvictionCancelled {
    1696           64 :     fn as_str(&self) -> &'static str {
    1697           64 :         match self {
    1698            8 :             EvictionCancelled::LayerGone => "layer_gone",
    1699            8 :             EvictionCancelled::TimelineGone => "timeline_gone",
    1700            8 :             EvictionCancelled::VersionCheckFailed => "version_check_fail",
    1701            8 :             EvictionCancelled::FileNotFound => "file_not_found",
    1702            8 :             EvictionCancelled::RemoveFailed => "remove_failed",
    1703            8 :             EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
    1704            8 :             EvictionCancelled::LostToDownload => "lost_to_download",
    1705            8 :             EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
    1706              :         }
    1707           64 :     }
    1708              : }
    1709              : 
    1710           16 : #[derive(enum_map::Enum)]
    1711              : enum DeleteFailed {
    1712              :     TimelineGone,
    1713              :     DeleteSchedulingFailed,
    1714              : }
    1715              : 
    1716              : impl DeleteFailed {
    1717           16 :     fn as_str(&self) -> &'static str {
    1718           16 :         match self {
    1719            8 :             DeleteFailed::TimelineGone => "timeline_gone",
    1720            8 :             DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
    1721              :         }
    1722           16 :     }
    1723              : }
    1724              : 
    1725           56 : #[derive(enum_map::Enum)]
    1726              : enum RareEvent {
    1727              :     RemoveOnDropFailed,
    1728              :     RetriedGetOrMaybeDownload,
    1729              :     DownloadFailedWithoutRequester,
    1730              :     UpgradedWantedEvicted,
    1731              :     InitWithoutDownload,
    1732              :     PermanentLoadingFailure,
    1733              :     EvictAndWaitLagged,
    1734              : }
    1735              : 
    1736              : impl RareEvent {
    1737           56 :     fn as_str(&self) -> &'static str {
    1738           56 :         use RareEvent::*;
    1739           56 : 
    1740           56 :         match self {
    1741            8 :             RemoveOnDropFailed => "remove_on_drop_failed",
    1742            8 :             RetriedGetOrMaybeDownload => "retried_gomd",
    1743            8 :             DownloadFailedWithoutRequester => "download_failed_without",
    1744            8 :             UpgradedWantedEvicted => "raced_wanted_evicted",
    1745            8 :             InitWithoutDownload => "init_needed_no_download",
    1746            8 :             PermanentLoadingFailure => "permanent_loading_failure",
    1747            8 :             EvictAndWaitLagged => "broadcast_lagged",
    1748              :         }
    1749           56 :     }
    1750              : }
    1751              : 
    1752              : pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    1753              :     once_cell::sync::Lazy::new(LayerImplMetrics::default);
        

Generated by: LCOV version 2.1-beta