LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - layer.rs (source / functions) Coverage Total Hit
Test: 32f4a56327bc9da697706839ed4836b2a00a408f.info Lines: 86.3 % 1062 917
Test Date: 2024-02-07 07:37:29 Functions: 80.9 % 152 123

            Line data    Source code
       1              : use anyhow::Context;
       2              : use camino::{Utf8Path, Utf8PathBuf};
       3              : use pageserver_api::models::{
       4              :     HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
       5              : };
       6              : use pageserver_api::shard::ShardIndex;
       7              : use std::ops::Range;
       8              : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
       9              : use std::sync::{Arc, Weak};
      10              : use std::time::SystemTime;
      11              : use tracing::Instrument;
      12              : use utils::lsn::Lsn;
      13              : use utils::sync::heavier_once_cell;
      14              : 
      15              : use crate::config::PageServerConf;
      16              : use crate::context::RequestContext;
      17              : use crate::repository::Key;
      18              : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
      19              : use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
      20              : 
      21              : use super::delta_layer::{self, DeltaEntry};
      22              : use super::image_layer;
      23              : use super::{
      24              :     AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
      25              :     ValueReconstructResult, ValueReconstructState,
      26              : };
      27              : 
      28              : use utils::generation::Generation;
      29              : 
      30              : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
      31              : /// range of LSNs.
      32              : ///
      33              : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
      34              : /// layers are used to ingest incoming WAL, and provide fast access to the
      35              : /// recent page versions. On-disk layers are stored as files on disk, and are
      36              : /// immutable. This type represents the on-disk kind, while the in-memory kind is represented by
      37              : /// [`InMemoryLayer`].
      38              : ///
      39              : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
      40              : /// A delta layer contains all modifications within a range of LSNs and keys.
      41              : /// An image layer is a snapshot of all the data in a key-range, at a single
      42              : /// LSN.
      43              : ///
      44              : /// This type models the on-disk layers, which can be evicted and on-demand downloaded.
      45              : ///
      46              : /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
      47     48023087 : #[derive(Clone)]
      48              : pub(crate) struct Layer(Arc<LayerInner>); // cloning only bumps the `Arc` refcount; all clones share one `LayerInner`
      49              : 
      50              : impl std::fmt::Display for Layer { // formats as the layer's short id followed by a generation marker
      51        34162 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      52        34162 :         if matches!(self.0.generation, Generation::Broken) { // broken generation: fixed "-broken" marker instead of `get_suffix()`
      53            0 :             write!(f, "{}-broken", self.layer_desc().short_id())
      54              :         } else {
      55        34162 :             write!(
      56        34162 :                 f,
      57        34162 :                 "{}{}",
      58        34162 :                 self.layer_desc().short_id(),
      59        34162 :                 self.0.generation.get_suffix()
      60        34162 :             )
      61              :         }
      62        34162 :     }
      63              : }
      64              : 
      65              : impl std::fmt::Debug for Layer {
      66            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      67            0 :         write!(f, "{}", self) // Debug output reuses the Display form
      68            0 :     }
      69              : }
      70              : 
      71              : impl AsLayerDesc for Layer {
      72     73257565 :     fn layer_desc(&self) -> &PersistentLayerDesc {
      73     73257565 :         self.0.layer_desc() // forwards to the `LayerInner` implementation
      74     73257565 :     }
      75              : }
      76              : 
      77              : impl Layer {
      78              :     /// Creates a layer value for a file we know to not be resident.
      79        44117 :     pub(crate) fn for_evicted(
      80        44117 :         conf: &'static PageServerConf,
      81        44117 :         timeline: &Arc<Timeline>,
      82        44117 :         file_name: LayerFileName,
      83        44117 :         metadata: LayerFileMetadata,
      84        44117 :     ) -> Self {
      85        44117 :         let desc = PersistentLayerDesc::from_filename( // descriptor is derived from the file name plus the size recorded in `metadata`
      86        44117 :             timeline.tenant_shard_id,
      87        44117 :             timeline.timeline_id,
      88        44117 :             file_name,
      89        44117 :             metadata.file_size(),
      90        44117 :         );
      91        44117 : 
      92        44117 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
      93        44117 : 
      94        44117 :         let owner = Layer(Arc::new(LayerInner::new(
      95        44117 :             conf,
      96        44117 :             timeline,
      97        44117 :             access_stats,
      98        44117 :             desc,
      99        44117 :             None, // no `DownloadedLayer` to start with, in contrast to `for_resident`
     100        44117 :             metadata.generation,
     101        44117 :             metadata.shard,
     102        44117 :         )));
     103              : 
     104        44117 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_some()); // debug builds only: a layer built as evicted must yield `Some` here
     105              : 
     106        44117 :         owner
     107        44117 :     }
     108              : 
     109              :     /// Creates a Layer value for a file we know to be resident in timeline directory.
     110        12719 :     pub(crate) fn for_resident(
     111        12719 :         conf: &'static PageServerConf,
     112        12719 :         timeline: &Arc<Timeline>,
     113        12719 :         file_name: LayerFileName,
     114        12719 :         metadata: LayerFileMetadata,
     115        12719 :     ) -> ResidentLayer {
     116        12719 :         let desc = PersistentLayerDesc::from_filename(
     117        12719 :             timeline.tenant_shard_id,
     118        12719 :             timeline.timeline_id,
     119        12719 :             file_name,
     120        12719 :             metadata.file_size(),
     121        12719 :         );
     122        12719 : 
     123        12719 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
     124        12719 : 
     125        12719 :         let mut resident = None; // filled in by the closure below so the strong `Arc<DownloadedLayer>` can be returned to the caller
     126        12719 : 
     127        12719 :         let owner = Layer(Arc::new_cyclic(|owner| { // `new_cyclic` passes a `Weak` to the `LayerInner` under construction, letting `DownloadedLayer` point back at its owner
     128        12719 :             let inner = Arc::new(DownloadedLayer {
     129        12719 :                 owner: owner.clone(),
     130        12719 :                 kind: tokio::sync::OnceCell::default(),
     131        12719 :                 version: 0,
     132        12719 :             });
     133        12719 :             resident = Some(inner.clone());
     134        12719 : 
     135        12719 :             LayerInner::new(
     136        12719 :                 conf,
     137        12719 :                 timeline,
     138        12719 :                 access_stats,
     139        12719 :                 desc,
     140        12719 :                 Some(inner),
     141        12719 :                 metadata.generation,
     142        12719 :                 metadata.shard,
     143        12719 :             )
     144        12719 :         }));
     145        12719 : 
     146        12719 :         let downloaded = resident.expect("just initialized");
     147              : 
     148        12719 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_none()); // debug builds only: a layer built as resident must yield `None` here
     149              : 
     150        12719 :         timeline
     151        12719 :             .metrics
     152        12719 :             .resident_physical_size_add(metadata.file_size()); // the file size from `metadata` is added to the timeline's resident-size metric
     153        12719 : 
     154        12719 :         ResidentLayer { downloaded, owner }
     155        12719 :     }
     156              : 
     157              :     /// Creates a Layer value for a freshly written out new layer file by renaming it from a
     158              :     /// temporary path. Returns an error (with context naming the layer) if the rename fails.
     159        21944 :     pub(crate) fn finish_creating(
     160        21944 :         conf: &'static PageServerConf,
     161        21944 :         timeline: &Arc<Timeline>,
     162        21944 :         desc: PersistentLayerDesc,
     163        21944 :         temp_path: &Utf8Path,
     164        21944 :     ) -> anyhow::Result<ResidentLayer> {
     165        21944 :         let mut resident = None;
     166        21944 : 
     167        21944 :         let owner = Layer(Arc::new_cyclic(|owner| { // `owner` is a `Weak` to the `LayerInner` being built; stored as the back-reference below
     168        21944 :             let inner = Arc::new(DownloadedLayer {
     169        21944 :                 owner: owner.clone(),
     170        21944 :                 kind: tokio::sync::OnceCell::default(),
     171        21944 :                 version: 0,
     172        21944 :             });
     173        21944 :             resident = Some(inner.clone());
     174        21944 :             let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
     175        21944 :             access_stats.record_residence_event( // the residence event is recorded right here, with reason `LayerCreate`
     176        21944 :                 LayerResidenceStatus::Resident,
     177        21944 :                 LayerResidenceEventReason::LayerCreate,
     178        21944 :             );
     179        21944 :             LayerInner::new(
     180        21944 :                 conf,
     181        21944 :                 timeline,
     182        21944 :                 access_stats,
     183        21944 :                 desc,
     184        21944 :                 Some(inner),
     185        21944 :                 timeline.generation, // generation and shard come from the timeline: there is no `LayerFileMetadata` argument here
     186        21944 :                 timeline.get_shard_index(),
     187        21944 :             )
     188        21944 :         }));
     189        21944 : 
     190        21944 :         let downloaded = resident.expect("just initialized");
     191        21944 : 
     192        21944 :         // if the rename works, the path is as expected
     193        21944 :         std::fs::rename(temp_path, owner.local_path())
     194        21944 :             .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
     195              : 
     196        21944 :         Ok(ResidentLayer { downloaded, owner })
     197        21944 :     }
     198              : 
     199              :     /// Requests the layer to be evicted and waits for this to be done.
     200              :     ///
     201              :     /// If the file is not resident, an [`EvictionError::NotFound`] is returned.
     202              :     ///
     203              :     /// If, due to bad luck or blocking of the executor, we miss the actual eviction and the layer is
     204              :     /// re-downloaded, [`EvictionError::Downloaded`] is returned.
     205              :     ///
     206              :     /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
     207              :     /// of download-evict cycle on retry.
     208         2563 :     pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
     209         2563 :         self.0.evict_and_wait().await
     210         2563 :     }
     211              : 
     212              :     /// Delete the layer file when `self` gets dropped, and also try to schedule a remote index
     213              :     /// upload then.
     214              :     ///
     215              :     /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
     216              :     /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
     217              :     /// the value this is called on gets dropped.
     218              :     ///
     219              :     /// This is ensured by both of those methods accepting references to Layer.
     220              :     ///
     221              :     /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
     222              :     /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
     223         5410 :     pub(crate) fn delete_on_drop(&self) {
     224         5410 :         self.0.delete_on_drop();
     225         5410 :     }
     226              : 
     227              :     /// Return data needed to reconstruct given page at LSN.
     228              :     ///
     229              :     /// It is up to the caller to collect more data from the previous layer and
     230              :     /// perform WAL redo, if necessary.
     231              :     ///
     232              :     /// # Cancellation-Safety
     233              :     ///
     234              :     /// This method is cancellation-safe.
     235     23934556 :     pub(crate) async fn get_value_reconstruct_data(
     236     23934556 :         &self,
     237     23934556 :         key: Key,
     238     23934556 :         lsn_range: Range<Lsn>,
     239     23934556 :         reconstruct_data: &mut ValueReconstructState,
     240     23934556 :         ctx: &RequestContext,
     241     23934556 :     ) -> anyhow::Result<ValueReconstructResult> {
     242              :         use anyhow::ensure;
     243              : 
     244     23934549 :         let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; // first argument `true`: downloading is allowed here (compare `keep_resident`, which passes `false`)
     245     23934538 :         self.0
     246     23934538 :             .access_stats
     247     23934538 :             .record_access(LayerAccessKind::GetValueReconstructData, ctx);
     248     23934538 : 
     249     23934538 :         if self.layer_desc().is_delta { // the `ensure!`s below turn a request outside this layer's key/LSN bounds into an error
     250     22799318 :             ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
     251     22799318 :             ensure!(self.layer_desc().key_range.contains(&key));
     252              :         } else {
     253      1135220 :             ensure!(self.layer_desc().key_range.contains(&key));
     254      1135220 :             ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
     255      1135220 :             ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
     256              :         }
     257              : 
     258     23934538 :         layer
     259     23934538 :             .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
     260     23934538 :             .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
     261      1011840 :             .await
     262     23934535 :             .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     263     23934537 :     }
     264              : 
     265              :     /// Download the layer if evicted.
     266              :     ///
     267              :     /// Will not error when the layer is already downloaded.
     268           12 :     pub(crate) async fn download(&self) -> anyhow::Result<()> {
     269           32 :         self.0.get_or_maybe_download(true, None).await?; // the returned `DownloadedLayer` handle is discarded; only success or failure is reported
     270            7 :         Ok(())
     271           12 :     }
     272              : 
     273              :     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     274              :     /// while the guard exists.
     275              :     ///
     276              :     /// Returns None if the layer is currently evicted.
     277         4738 :     pub(crate) async fn keep_resident(&self) -> anyhow::Result<Option<ResidentLayer>> {
     278         4738 :         let downloaded = match self.0.get_or_maybe_download(false, None).await { // first argument `false`: an evicted layer shows up as `DownloadRequired` below
     279         4150 :             Ok(d) => d,
     280              :             // technically there are a lot of possible errors, but in practice it should only be
     281              :             // DownloadRequired which is tripped up. could work to improve this situation
     282              :             // statically later.
     283          588 :             Err(DownloadError::DownloadRequired) => return Ok(None),
     284            0 :             Err(e) => return Err(e.into()),
     285              :         };
     286              : 
     287         4150 :         Ok(Some(ResidentLayer {
     288         4150 :             downloaded,
     289         4150 :             owner: self.clone(),
     290         4150 :         }))
     291         4738 :     }
     292              : 
     293              :     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
     294         4268 :     pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
     295         4268 :         let downloaded = self.0.get_or_maybe_download(true, None).await?; // any error from `get_or_maybe_download` is propagated to the caller
     296              : 
     297         4268 :         Ok(ResidentLayer {
     298         4268 :             downloaded,
     299         4268 :             owner: self.clone(),
     300         4268 :         })
     301         4268 :     }
     302              : 
     303         3014 :     pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { // forwards to `LayerInner::info`, passing `reset` through unchanged
     304         3014 :         self.0.info(reset).await
     305         3014 :     }
     306              : 
     307         4148 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats { // borrows the access statistics kept in the shared `LayerInner`
     308         4148 :         &self.0.access_stats
     309         4148 :     }
     310              : 
     311        22995 :     pub(crate) fn local_path(&self) -> &Utf8Path { // borrows the stored full path of the layer file; performs no filesystem access
     312        22995 :         &self.0.path
     313        22995 :     }
     314              : 
     315        21990 :     pub(crate) fn metadata(&self) -> LayerFileMetadata { // forwards to `LayerInner::metadata`
     316        21990 :         self.0.metadata()
     317        21990 :     }
     318              : 
     319              :     /// Traditional debug dumping facility: always dumps the descriptor; with `verbose` also dumps the layer contents.
     320              :     #[allow(unused)]
     321            4 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
     322            4 :         self.0.desc.dump();
     323            4 : 
     324            4 :         if verbose {
     325              :             // for now, unconditionally download everything, even if that might not be wanted.
     326            4 :             let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     327            8 :             l.dump(&self.0, ctx).await?
     328            0 :         }
     329              : 
     330            4 :         Ok(())
     331            4 :     }
     332              : 
     333              :     /// Waits until this layer has been dropped (and if needed, local file deletion and remote
     334              :     /// deletion scheduling has completed).
     335              :     ///
     336              :     /// Does not start local deletion, use [`Self::delete_on_drop`] for that
     337              :     /// separately.
     338              :     #[cfg(feature = "testing")]
     339         1031 :     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
     340         1031 :         let mut rx = self.0.status.subscribe(); // subscribe before returning, so the future does not borrow `self`
     341              : 
     342         1031 :         async move {
     343              :             loop {
     344         1031 :                 if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await { // `Closed` is reported once every `Sender` clone is gone; any other result keeps looping
     345         1031 :                     break;
     346            0 :                 }
     347              :             }
     348         1031 :         }
     349         1031 :     }
     350              : }
     351              : 
     352              : /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
     353              : ///
     354              : /// However when we want something evicted, we cannot evict it right away as there might be current
     355              : /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
     356              : /// read with [`Layer::get_value_reconstruct_data`].
     357              : ///
     358              : /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
     359            0 : #[derive(Debug)]
     360              : enum ResidentOrWantedEvicted {
     361              :     Resident(Arc<DownloadedLayer>), // strong reference
     362              :     WantedEvicted(Weak<DownloadedLayer>, usize), // weak reference plus the `DownloadedLayer::version` captured by `downgrade`
     363              : }
     364              : 
     365              : impl ResidentOrWantedEvicted {
     366     23942974 :     fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> { // strong reference plus "was upgraded from WantedEvicted" flag; `None` when the weak reference is dead
     367     23942974 :         match self {
     368     23942974 :             ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
     369            0 :             ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
     370            0 :                 Some(strong) => {
     371            0 :                     LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
     372            0 : 
     373            0 :                     *self = ResidentOrWantedEvicted::Resident(strong.clone()); // the access wins the race: switch back to holding a strong reference
     374            0 : 
     375            0 :                     Some((strong, true))
     376              :                 }
     377            0 :                 None => None,
     378              :             },
     379              :         }
     380     23942974 :     }
     381              : 
     382              :     /// When eviction is first requested, drop down to holding a [`Weak`].
     383              :     ///
     384              :     /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
     385              :     /// drop the possibly last strong reference outside of the mutex of
     386              :     /// heavier_once_cell::OnceCell.
     387         2561 :     fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
     388         2561 :         match self {
     389         2561 :             ResidentOrWantedEvicted::Resident(strong) => {
     390         2561 :                 let weak = Arc::downgrade(strong);
     391         2561 :                 let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
     392         2561 :                 std::mem::swap(self, &mut temp); // after the swap `self` is `WantedEvicted` and `temp` holds the previous `Resident` value
     393         2561 :                 match temp {
     394         2561 :                     ResidentOrWantedEvicted::Resident(strong) => Some(strong),
     395            0 :                     ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
     396              :                 }
     397              :             }
     398            0 :             ResidentOrWantedEvicted::WantedEvicted(..) => None, // eviction was already requested earlier
     399              :         }
     400         2561 :     }
     401              : }
     402              : 
     403              : struct LayerInner {
     404              :     /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
     405              :     /// [`Self::path`].
     406              :     conf: &'static PageServerConf,
     407              : 
     408              :     /// Full path to the file; unclear if this should exist anymore.
     409              :     path: Utf8PathBuf,
     410              : 
     411              :     desc: PersistentLayerDesc,
     412              : 
     413              :     /// Timeline access is needed for remote timeline client and metrics.
     414              :     timeline: Weak<Timeline>,
     415              : 
     416              :     /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
     417              :     have_remote_client: bool,
     418              : 
     419              :     access_stats: LayerAccessStats,
     420              : 
     421              :     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
     422              :     /// Initialization and deinitialization are done while holding a permit.
     423              :     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
     424              : 
     425              :     /// Do we want to delete this locally and remotely when `LayerInner` is dropped?
     426              :     wanted_deleted: AtomicBool,
     427              : 
     428              :     /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
     429              :     /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
     430              :     /// [`LayerInner::on_downloaded_layer_drop`].
     431              :     wanted_evicted: AtomicBool,
     432              : 
     433              :     /// Version is to make sure we will only evict a specific download of a file.
     434              :     ///
     435              :     /// Incremented for each download, stored in `DownloadedLayer::version` or
     436              :     /// `ResidentOrWantedEvicted::WantedEvicted`.
     437              :     version: AtomicUsize,
     438              : 
     439              :     /// Allow subscribing to when the layer actually gets evicted.
     440              :     status: tokio::sync::broadcast::Sender<Status>,
     441              : 
     442              :     /// Counter for exponential backoff with the download
     443              :     consecutive_failures: AtomicUsize,
     444              : 
     445              :     /// The generation of this Layer.
     446              :     ///
     447              :     /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
     448              :     /// for created layers from [`Timeline::generation`].
     449              :     generation: Generation,
     450              : 
     451              :     /// The shard of this Layer.
     452              :     ///
     453              :     /// For layers created in this process, this will always be the [`ShardIndex`] of the
     454              :     /// current `ShardIdentity` (TODO: add link once it's introduced).
     455              :     ///
     456              :     /// For loaded layers, this may be some other value if the tenant has undergone
     457              :     /// a shard split since the layer was originally written.
     458              :     shard: ShardIndex,
     459              : 
     460              :     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>, // NOTE(review): presumably the time of the most recent eviction, `None` before the first one; confirm at the write sites
     461              : }
     462              : 
     463              : impl std::fmt::Display for LayerInner {
     464        22717 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     465        22717 :         write!(f, "{}", self.layer_desc().short_id()) // short id only; unlike `Layer`'s Display no generation suffix is appended
     466        22717 :     }
     467              : }
     468              : 
     469              : impl AsLayerDesc for LayerInner {
     470     73307332 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     471     73307332 :         &self.desc // borrows the descriptor stored in this struct
     472     73307332 :     }
     473              : }
     474              : 
     475         2561 : #[derive(Debug, Clone, Copy)]
     476              : enum Status { // message type of the `LayerInner::status` broadcast channel
     477              :     Evicted,
     478              :     Downloaded,
     479              : }
     480              : 
     481              : impl Drop for LayerInner {
     482        48874 :     fn drop(&mut self) {
     483        48874 :         if !*self.wanted_deleted.get_mut() { // `get_mut` reads the flag through `&mut self`, no atomic operation needed
     484              :             // should we try to evict if the last wish was for eviction?
     485              :             // feels like there's some hazard of overcrowding near shutdown near by, but we don't
     486              :             // run drops during shutdown (yet)
     487        43464 :             return;
     488         5410 :         }
     489              : 
     490         5410 :         let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
     491              : 
     492         5410 :         let path = std::mem::take(&mut self.path); // moves the path out of `self` (leaving a default one behind) so the closure below can own it
     493         5410 :         let file_name = self.layer_desc().filename();
     494         5410 :         let file_size = self.layer_desc().file_size;
     495         5410 :         let timeline = self.timeline.clone();
     496         5410 :         let meta = self.metadata();
     497         5410 :         let status = self.status.clone();
     498         5410 : 
     499         5410 :         crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { // file removal and deletion scheduling run on a blocking task, not on the thread doing the drop
     500         5410 :             let _g = span.entered();
     501         5410 : 
     502         5410 :             // carry this until we are finished for [`Layer::wait_drop`] support
     503         5410 :             let _status = status;
     504              : 
     505         5410 :             let removed = match std::fs::remove_file(path) {
     506         5408 :                 Ok(()) => true,
     507            2 :                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
     508            2 :                     // until we no longer do detaches by removing all local files before removing the
     509            2 :                     // tenant from the global map, we will always get these errors even if we knew what
     510            2 :                     // is the latest state.
     511            2 :                     //
     512            2 :                     // we currently do not track the latest state, so we'll also end up here on evicted
     513            2 :                     // layers.
     514            2 :                     false
     515              :                 }
     516            0 :                 Err(e) => {
     517            0 :                     tracing::error!("failed to remove wanted deleted layer: {e}");
     518            0 :                     LAYER_IMPL_METRICS.inc_delete_removes_failed();
     519            0 :                     false
     520              :                 }
     521              :             };
     522              : 
     523         5410 :             if let Some(timeline) = timeline.upgrade() {
     524         5410 :                 if removed { // the resident-size metric is reduced only when a file was actually removed
     525         5408 :                     timeline.metrics.resident_physical_size_sub(file_size);
     526         5408 :                 }
     527         5410 :                 if let Some(remote_client) = timeline.remote_client.as_ref() {
     528         5410 :                     let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
     529              : 
     530         5410 :                     if let Err(e) = res {
     531              :                         // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
     532              :                         // demonstrating this deadlock (without spawn_blocking): stop will drop
     533              :                         // queued items, which will have ResidentLayer's, and those drops would try
     534              :                         // to re-entrantly lock the RemoteTimelineClient inner state.
     535           14 :                         if !timeline.is_active() { // same message either way; only the log level differs with timeline activity
     536           14 :                             tracing::info!("scheduling deletion on drop failed: {e:#}");
     537              :                         } else {
     538            0 :                             tracing::warn!("scheduling deletion on drop failed: {e:#}");
     539              :                         }
     540           14 :                         LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
     541         5396 :                     } else {
     542         5396 :                         LAYER_IMPL_METRICS.inc_completed_deletes();
     543         5396 :                     }
     544            0 :                 }
     545            0 :             } else {
     546            0 :                 // no need to nag that timeline is gone: under normal situation on
     547            0 :                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
     548            0 :                 LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
     549            0 :             }
     550         5410 :         });
     551        48874 :     }
     552              : }
     553              : 
     554              : impl LayerInner {
     555        78780 :     fn new( // Builds the in-memory state for one layer file of `timeline`.
     556        78780 :         conf: &'static PageServerConf,
     557        78780 :         timeline: &Arc<Timeline>,
     558        78780 :         access_stats: LayerAccessStats,
     559        78780 :         desc: PersistentLayerDesc,
     560        78780 :         downloaded: Option<Arc<DownloadedLayer>>,
     561        78780 :         generation: Generation,
     562        78780 :         shard: ShardIndex,
     563        78780 :     ) -> Self {
     564        78780 :         let path = conf
     565        78780 :             .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
     566        78780 :             .join(desc.filename().to_string());
     567              :         // with `downloaded` the cell starts initialized as Resident and adopts its version; otherwise it starts empty at version 0
     568        78780 :         let (inner, version) = if let Some(inner) = downloaded {
     569        34663 :             let version = inner.version;
     570        34663 :             let resident = ResidentOrWantedEvicted::Resident(inner);
     571        34663 :             (heavier_once_cell::OnceCell::new(resident), version)
     572              :         } else {
     573        44117 :             (heavier_once_cell::OnceCell::default(), 0)
     574              :         };
     575              :         // only a Weak to the timeline is stored; `status` keeps just the sender half of a capacity-1 broadcast channel
     576        78780 :         LayerInner {
     577        78780 :             conf,
     578        78780 :             path,
     579        78780 :             desc,
     580        78780 :             timeline: Arc::downgrade(timeline),
     581        78780 :             have_remote_client: timeline.remote_client.is_some(),
     582        78780 :             access_stats,
     583        78780 :             wanted_deleted: AtomicBool::new(false),
     584        78780 :             wanted_evicted: AtomicBool::new(false),
     585        78780 :             inner,
     586        78780 :             version: AtomicUsize::new(version),
     587        78780 :             status: tokio::sync::broadcast::channel(1).0,
     588        78780 :             consecutive_failures: AtomicUsize::new(0),
     589        78780 :             generation,
     590        78780 :             shard,
     591        78780 :             last_evicted_at: std::sync::Mutex::default(),
     592        78780 :         }
     593        78780 :     }
     594              : 
     595         5410 :     fn delete_on_drop(&self) { // Sets the `wanted_deleted` flag; this method itself removes nothing.
     596         5410 :         let res =
     597         5410 :             self.wanted_deleted
     598         5410 :                 .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
     599         5410 :         // `res` is Ok only for the call that flipped false -> true, so repeated calls count one started delete
     600         5410 :         if res.is_ok() {
     601         5410 :             LAYER_IMPL_METRICS.inc_started_deletes();
     602         5410 :         }
     603         5410 :     }
     604              : 
     605              :     /// Requests eviction and waits for its outcome. Cancellation safe, however dropping the future and
     606              :     /// calling this method again might result in a new attempt to evict OR join the previously started attempt.
     607         2563 :     pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
     608         2563 :         use tokio::sync::broadcast::error::RecvError;
     609         2563 : 
     610         2563 :         assert!(self.have_remote_client);
     611              :         // subscribe before expressing the wish below, so that the outcome sent afterwards is received
     612         2563 :         let mut rx = self.status.subscribe();
     613              : 
     614         2561 :         let strong = {
     615         2563 :             match self.inner.get_mut().await {
     616         2561 :                 Some(mut either) => {
     617         2561 :                     self.wanted_evicted.store(true, Ordering::Relaxed);
     618         2561 :                     ResidentOrWantedEvicted::downgrade(&mut either)
     619              :                 }
     620            2 :                 None => return Err(EvictionError::NotFound),
     621              :             }
     622              :         };
     623              :         // `strong` is Some only when this call did the downgrade; only then is a started eviction counted
     624         2561 :         if strong.is_some() {
     625         2561 :             // drop the DownloadedLayer outside of holding the guard
     626         2561 :             drop(strong);
     627         2561 :             LAYER_IMPL_METRICS.inc_started_evictions();
     628         2561 :         }
     629              :         // the first status received decides the result: Evicted => Ok, Downloaded => Err
     630         2561 :         match rx.recv().await {
     631         2561 :             Ok(Status::Evicted) => Ok(()),
     632            0 :             Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
     633              :             Err(RecvError::Closed) => {
     634            0 :                 unreachable!("sender cannot be dropped while we are in &self method")
     635              :             }
     636              :             Err(RecvError::Lagged(_)) => {
     637              :                 // this is quite unlikely, but we are blocking a lot in the async context, so
     638              :                 // we might be missing this because we are stuck on a LIFO slot on a thread
     639              :                 // which is busy blocking for a 1TB database create_image_layers.
     640              :                 //
     641              :                 // use however late (compared to the initial expressing of wanted) as the
     642              :                 // "outcome" now
     643            0 :                 LAYER_IMPL_METRICS.inc_broadcast_lagged();
     644            0 :                 match self.inner.get_mut().await {
     645            0 :                     Some(_) => Err(EvictionError::Downloaded),
     646            0 :                     None => Ok(()),
     647              :                 }
     648              :             }
     649              :         }
     650         2563 :     }
     651              : 
     652              :     /// Returns the resident `DownloadedLayer`, initializing (and if needed and allowed, downloading) it first. Cancellation safe.
     653     23943578 :     async fn get_or_maybe_download(
     654     23943578 :         self: &Arc<Self>,
     655     23943578 :         allow_download: bool,
     656     23943578 :         ctx: Option<&RequestContext>,
     657     23943578 :     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
     658     23943571 :         let mut init_permit = None;
     659              :         // `init_permit` becomes Some only for a retry, after a stale WantedEvicted was deinitialized at the end of the loop
     660              :         loop {
     661     23943571 :             let download = move |permit| {
     662        10372 :                 async move {
     663        10372 :                     // disable any scheduled but not yet running eviction deletions for this
     664        10372 :                     let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
     665        10372 : 
     666        10372 :                     // count cancellations, which currently remain largely unexpected
     667        10372 :                     let init_cancelled =
     668        10372 :                         scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
     669        10372 : 
     670        10372 :                     // no need to make the evict_and_wait wait for the actual download to complete
     671        10372 :                     drop(self.status.send(Status::Downloaded));
     672              : 
     673        10372 :                     let timeline = self
     674        10372 :                         .timeline
     675        10372 :                         .upgrade()
     676        10372 :                         .ok_or_else(|| DownloadError::TimelineShutdown)?;
     677              : 
     678              :                     // FIXME: grab a gate
     679              : 
     680        10372 :                     let can_ever_evict = timeline.remote_client.as_ref().is_some();
     681              : 
     682              :                     // check if we really need to be downloaded; could have been already downloaded by a
     683              :                     // cancelled previous attempt.
     684        10372 :                     let needs_download = self
     685        10372 :                         .needs_download()
     686        10006 :                         .await
     687        10372 :                         .map_err(DownloadError::PreStatFailed)?;
     688              : 
     689        10372 :                     let permit = if let Some(reason) = needs_download {
     690        10372 :                         if let NeedsDownload::NotFile(ft) = reason {
     691            0 :                             return Err(DownloadError::NotFile(ft));
     692        10372 :                         }
     693        10372 : 
     694        10372 :                         // only reset this after we've decided we really need to download. otherwise it'd
     695        10372 :                         // be impossible to mark cancelled downloads for eviction, like one could imagine
     696        10372 :                         // we would like to do for prefetching which was not needed.
     697        10372 :                         self.wanted_evicted.store(false, Ordering::Release);
     698        10372 : 
     699        10372 :                         if !can_ever_evict {
     700            0 :                             return Err(DownloadError::NoRemoteStorage);
     701        10372 :                         }
     702              : 
     703        10372 :                         if let Some(ctx) = ctx {
     704         9769 :                             self.check_expected_download(ctx)?;
     705          603 :                         }
     706              : 
     707        10372 :                         if !allow_download {
     708              :                             // this does look weird, but for LayerInner the "downloading" means also changing
     709              :                             // internal once related state ...
     710          588 :                             return Err(DownloadError::DownloadRequired);
     711         9784 :                         }
     712         9784 : 
     713         9784 :                         tracing::info!(%reason, "downloading on-demand");
     714              : 
     715        20612 :                         self.spawn_download_and_wait(timeline, permit).await?
     716              :                     } else {
     717              :                         // the file is present locally, probably by a previous but cancelled call to
     718              :                         // get_or_maybe_download. alternatively we might be running without remote storage.
     719            0 :                         LAYER_IMPL_METRICS.inc_init_needed_no_download();
     720            0 : 
     721            0 :                         permit
     722              :                     };
     723              : 
     724         9768 :                     let since_last_eviction =
     725         9768 :                         self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
     726         9768 :                     if let Some(since_last_eviction) = since_last_eviction {
     727          123 :                         // FIXME: this will not always be recorded correctly until #6028 (the no
     728          123 :                         // download needed branch above)
     729          123 :                         LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
     730         9645 :                     }
     731              : 
     732         9768 :                     let res = Arc::new(DownloadedLayer {
     733         9768 :                         owner: Arc::downgrade(self),
     734         9768 :                         kind: tokio::sync::OnceCell::default(),
     735         9768 :                         version: next_version,
     736         9768 :                     });
     737         9768 : 
     738         9768 :                     self.access_stats.record_residence_event(
     739         9768 :                         LayerResidenceStatus::Resident,
     740         9768 :                         LayerResidenceEventReason::ResidenceChange,
     741         9768 :                     );
     742         9768 : 
     743         9768 :                     let waiters = self.inner.initializer_count();
     744         9768 :                     if waiters > 0 {
     745          333 :                         tracing::info!(
     746          333 :                             waiters,
     747          333 :                             "completing the on-demand download for other tasks"
     748          333 :                         );
     749         9435 :                     }
     750              :                     // success: defuse the guard so that `inc_init_cancelled` is not counted
     751         9768 :                     scopeguard::ScopeGuard::into_inner(init_cancelled);
     752         9768 : 
     753         9768 :                     Ok((ResidentOrWantedEvicted::Resident(res), permit))
     754        10363 :                 }
     755        10372 :                 .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
     756        10372 :             };
     757              : 
     758     23943571 :             if let Some(init_permit) = init_permit.take() {
     759              :                 // use the already held initialization permit because it is impossible to hit the
     760              :                 // below paths anymore essentially limiting the max loop iterations to 2.
     761            0 :                 let (value, init_permit) = download(init_permit).await?;
     762            0 :                 let mut guard = self.inner.set(value, init_permit).await;
     763            0 :                 let (strong, _upgraded) = guard
     764            0 :                     .get_and_upgrade()
     765            0 :                     .expect("init creates strong reference, we held the init permit");
     766            0 :                 return Ok(strong);
     767     23943571 :             }
     768              : 
     769            0 :             let (weak, permit) = {
     770     23943571 :                 let mut locked = self.inner.get_mut_or_init(download).await?;
     771              : 
     772     23942967 :                 if let Some((strong, upgraded)) = locked.get_and_upgrade() {
     773     23942967 :                     if upgraded {
     774            0 :                         // when upgraded back, the Arc<DownloadedLayer> is still available, but
     775            0 :                         // previously a `evict_and_wait` was received.
     776            0 :                         self.wanted_evicted.store(false, Ordering::Relaxed);
     777            0 : 
     778            0 :                         // error out any `evict_and_wait`
     779            0 :                         drop(self.status.send(Status::Downloaded));
     780            0 :                         LAYER_IMPL_METRICS
     781            0 :                             .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
     782     23942967 :                     }
     783              : 
     784     23942967 :                     return Ok(strong);
     785              :                 } else {
     786              :                     // path to here: the evict_blocking is stuck on spawn_blocking queue.
     787              :                     //
     788              :                     // reset the contents, deactivating the eviction and causing a
     789              :                     // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
     790            0 :                     locked.take_and_deinit()
     791            0 :                 }
     792            0 :             };
     793            0 : 
     794            0 :             // unlock first, then drop the weak, but because upgrade failed, we
     795            0 :             // know it cannot be a problem.
     796            0 : 
     797            0 :             assert!(
     798            0 :                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
     799            0 :                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
     800              :             );
     801              : 
     802            0 :             init_permit = Some(permit);
     803            0 : 
     804            0 :             LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
     805              :         }
     806     23943562 :     }
     807              : 
     808              :     /// Nag or fail per RequestContext policy: `Download` passes; `Warn`/`Error` log and count; `Error` fails unless the config downgrades it
     809         9769 :     fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
     810         9769 :         use crate::context::DownloadBehavior::*;
     811         9769 :         let b = ctx.download_behavior();
     812         9769 :         match b {
     813         9769 :             Download => Ok(()),
     814              :             Warn | Error => {
     815            0 :                 tracing::info!(
     816            0 :                     "unexpectedly on-demand downloading for task kind {:?}",
     817            0 :                     ctx.task_kind()
     818            0 :                 );
     819            0 :                 crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
     820              :                 // `Error` is only enforced when `ondemand_download_behavior_treat_error_as_warn` is off
     821            0 :                 let really_error =
     822            0 :                     matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
     823              : 
     824            0 :                 if really_error {
     825              :                     // this check is only probabilistic, seems like flakiness footgun
     826            0 :                     Err(DownloadError::ContextAndConfigReallyDeniesDownloads)
     827              :                 } else {
     828            0 :                     Ok(())
     829              :                 }
     830              :             }
     831              :         }
     832         9769 :     }
     833              : 
     834              :     /// Actual download, run in a separate `task_mgr` task whose result is awaited here; at most one is executed at a time.
     835         9784 :     async fn spawn_download_and_wait(
     836         9784 :         self: &Arc<Self>,
     837         9784 :         timeline: Arc<Timeline>,
     838         9784 :         permit: heavier_once_cell::InitPermit,
     839         9784 :     ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
     840         9784 :         debug_assert_current_span_has_tenant_and_timeline_id();
     841         9784 : 
     842         9784 :         let task_name = format!("download layer {}", self);
     843         9784 : 
     844         9784 :         let (tx, rx) = tokio::sync::oneshot::channel();
     845         9784 : 
     846         9784 :         // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
     847         9784 :         // block tenant::mgr::remove_tenant_from_memory.
     848         9784 : 
     849         9784 :         let this: Arc<Self> = self.clone();
     850         9784 : 
     851         9784 :         crate::task_mgr::spawn(
     852         9784 :             &tokio::runtime::Handle::current(),
     853         9784 :             crate::task_mgr::TaskKind::RemoteDownloadTask,
     854         9784 :             Some(self.desc.tenant_shard_id),
     855         9784 :             Some(self.desc.timeline_id),
     856         9784 :             &task_name,
     857         9784 :             false,
     858         9784 :             async move {
     859         9784 :                 // NOTE(review): the expect message names `have_remote_client`; the caller visible in this file checks `can_ever_evict` instead
     860         9784 :                 let client = timeline
     861         9784 :                     .remote_client
     862         9784 :                     .as_ref()
     863         9784 :                     .expect("checked above with have_remote_client");
     864              : 
     865         9784 :                 let result = client.download_layer_file(
     866         9784 :                     &this.desc.filename(),
     867         9784 :                     &this.metadata(),
     868         9784 :                     &crate::task_mgr::shutdown_token()
     869         9784 :                 )
     870       406494 :                 .await;
     871              : 
     872         9783 :                 let result = match result {
     873         9769 :                     Ok(size) => {
     874         9769 :                         timeline.metrics.resident_physical_size_add(size);
     875         9769 :                         Ok(())
     876              :                     }
     877           14 :                     Err(e) => {
     878           14 :                         let consecutive_failures =
     879           14 :                             this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
     880           14 : 
     881           14 :                         let backoff = utils::backoff::exponential_backoff_duration_seconds(
     882           14 :                             consecutive_failures.min(u32::MAX as usize) as u32,
     883           14 :                             1.5,
     884           14 :                             60.0,
     885           14 :                         );
     886           14 :                         // the failure is reported only after the backoff sleep, unless shutdown or timeline cancellation comes first
     887           14 :                         let backoff = std::time::Duration::from_secs_f64(backoff);
     888           14 : 
     889           20 :                         tokio::select! {
     890           20 :                             _ = tokio::time::sleep(backoff) => {},
     891           20 :                             _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
     892           20 :                             _ = timeline.cancel.cancelled() => {},
     893           20 :                         };
     894              : 
     895           14 :                         Err(e)
     896              :                     }
     897              :                 };
     898              :                 // a failed send means the receiving side (`rx`) is gone; the tuple comes back in `res`
     899         9783 :                 if let Err(res) = tx.send((result, permit)) {
     900            8 :                     match res {
     901            1 :                         (Ok(()), _) => {
     902            1 :                             // our caller is cancellation safe so this is fine; if someone
     903            1 :                             // else requests the layer, they'll find it already downloaded.
     904            1 :                             //
     905            1 :                             // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
     906            1 :                             //
     907            1 :                             // FIXME(#6028): however, could be that we should consider marking the
     908            1 :                             // layer for eviction? alas, cannot: because only DownloadedLayer will
     909            1 :                             // handle that.
     910            1 :                         },
     911            7 :                         (Err(e), _) => {
     912            7 :                             // our caller is cancellation safe, but we might be racing with
     913            7 :                             // another attempt to initialize. before we have cancellation
     914            7 :                             // token support: these attempts should converge regardless of
     915            7 :                             // their completion order.
     916            7 :                             tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}");
     917            7 :                             LAYER_IMPL_METRICS.inc_download_failed_without_requester();
     918              :                         }
     919              :                     }
     920         9775 :                 }
     921              :                 // the task itself always completes with Ok; the download outcome travels through `tx`
     922         9783 :                 Ok(())
     923         9784 :             }
     924         9784 :             .in_current_span(),
     925         9784 :         );
     926        11208 :         match rx.await {
     927         9768 :             Ok((Ok(()), permit)) => {
     928         9768 :                 if let Some(reason) = self
     929         9768 :                     .needs_download()
     930         9404 :                     .await
     931         9768 :                     .map_err(DownloadError::PostStatFailed)?
     932              :                 {
     933              :                     // this is really a bug in needs_download or remote timeline client
     934            0 :                     panic!("post-condition failed: needs_download returned {reason:?}");
     935         9768 :                 }
     936         9768 : 
     937         9768 :                 self.consecutive_failures.store(0, Ordering::Relaxed);
     938         9768 :                 tracing::info!("on-demand download successful");
     939              : 
     940         9768 :                 Ok(permit)
     941              :             }
     942            7 :             Ok((Err(e), _permit)) => {
     943            7 :                 // sleep already happened in the spawned task, if it was not cancelled
     944            7 :                 let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
     945            7 : 
     946            7 :                 match e.downcast_ref::<remote_storage::DownloadError>() {
     947              :                     // If the download failed due to its cancellation token,
     948              :                     // propagate the cancellation error upstream.
     949              :                     Some(remote_storage::DownloadError::Cancelled) => {
     950            1 :                         Err(DownloadError::DownloadCancelled)
     951              :                     }
     952              :                     _ => {
     953            6 :                         tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
     954            6 :                         Err(DownloadError::DownloadFailed)
     955              :                     }
     956              :                 }
     957              :             }
     958            0 :             Err(_gone) => Err(DownloadError::DownloadCancelled),
     959              :         }
     960         9775 :     }
     961              : 
     962        20140 :     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> { // Async stat of `self.path`: Ok(None) = file is fine, Ok(Some(reason)) = download needed.
     963        20140 :         match tokio::fs::metadata(&self.path).await {
     964         9768 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     965        10372 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)), // a missing file is a reason, not an error
     966            0 :             Err(e) => Err(e),
     967              :         }
     968        20140 :     }
     969              : 
     970        56836 :     fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> { // Same decision as `needs_download`, but stats `self.path` with a blocking call.
     971        56836 :         match self.path.metadata() {
     972        12719 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     973        44117 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)), // a missing file is a reason, not an error
     974            0 :             Err(e) => Err(e),
     975              :         }
     976        56836 :     }
     977              : 
     978        22487 :     fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> { // Ok when `m` is a regular file of exactly `desc.file_size` bytes; Err carries the reason otherwise.
     979        22487 :         // in future, this should include sha2-256 validation of the file.
     980        22487 :         if !m.is_file() {
     981            0 :             Err(NeedsDownload::NotFile(m.file_type()))
     982        22487 :         } else if m.len() != self.desc.file_size {
     983            0 :             Err(NeedsDownload::WrongSize {
     984            0 :                 actual: m.len(),
     985            0 :                 expected: self.desc.file_size,
     986            0 :             })
     987              :         } else {
     988        22487 :             Ok(())
     989              :         }
     990        22487 :     }
     991              : 
     992         3014 :     async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { // Describes this layer as a `HistoricLayerInfo`, Delta or Image per `desc.is_delta`.
     993         3014 :         let layer_file_name = self.desc.filename().file_name();
     994              : 
     995              :         // this is not accurate: we could have the file locally but there was a cancellation
     996              :         // and now we are not in sync, or we are currently downloading it.
     997         3014 :         let remote = self.inner.get_mut().await.is_none();
     998         3014 :         // `reset` is passed through to the access stats conversion
     999         3014 :         let access_stats = self.access_stats.as_api_model(reset);
    1000         3014 : 
    1001         3014 :         if self.desc.is_delta {
    1002         2431 :             let lsn_range = &self.desc.lsn_range;
    1003         2431 : 
    1004         2431 :             HistoricLayerInfo::Delta {
    1005         2431 :                 layer_file_name,
    1006         2431 :                 layer_file_size: self.desc.file_size,
    1007         2431 :                 lsn_start: lsn_range.start,
    1008         2431 :                 lsn_end: lsn_range.end,
    1009         2431 :                 remote,
    1010         2431 :                 access_stats,
    1011         2431 :             }
    1012              :         } else {
    1013          583 :             let lsn = self.desc.image_layer_lsn();
    1014          583 :             // an image layer reports a single LSN as `lsn_start`, with no `lsn_end`
    1015          583 :             HistoricLayerInfo::Image {
    1016          583 :                 layer_file_name,
    1017          583 :                 layer_file_size: self.desc.file_size,
    1018          583 :                 lsn_start: lsn,
    1019          583 :                 remote,
    1020          583 :                 access_stats,
    1021          583 :             }
    1022              :         }
    1023         3014 :     }
    1024              : 
    /// `DownloadedLayer` is being dropped, so it calls this method.
    ///
    /// Reads the `wanted_deleted` and `wanted_evicted` flags and, when only eviction is wanted
    /// and a remote client is available, queues the eviction of the local file on the
    /// background runtime's blocking pool. `version` is the version of the dropped
    /// `DownloadedLayer`; it is forwarded to `evict_blocking`, which compares it against the
    /// version stored with the wanted-evicted state.
    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
        let delete = self.wanted_deleted.load(Ordering::Acquire);
        let evict = self.wanted_evicted.load(Ordering::Acquire);
        let can_evict = self.have_remote_client;

        if delete {
            // do nothing now, only in LayerInner::drop -- this was originally implemented because
            // we could have already scheduled the deletion at the time.
            //
            // FIXME: this is not true anymore, we can safely evict wanted deleted files.
        } else if can_evict && evict {
            // `parent: None` makes this a root span, detached from whatever span is current in
            // the code that dropped the `DownloadedLayer`.
            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);

            // downgrade for queueing, in case there's a tear down already ongoing we should not
            // hold it alive.
            let this = Arc::downgrade(&self);
            drop(self);

            // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
            // drop while the `self.inner` is being locked, leading to a deadlock.

            crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
                let _g = span.entered();

                // if LayerInner is already dropped here, do nothing because the delete on drop
                // has already run while we were in queue
                let Some(this) = this.upgrade() else {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
                    return;
                };
                // `evict_blocking` is an async fn; drive it to completion on this blocking
                // thread and count the outcome.
                match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
                    Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
                    Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
                }
            });
        }
    }
    1063              : 
    /// Removes the local layer file if the layer is still wanted evicted for exactly
    /// `only_version`.
    ///
    /// This is an `async fn`; `on_downloaded_layer_drop` drives it with `block_on` from a
    /// blocking task.
    ///
    /// Returns `Ok(())` when the file was removed. Otherwise returns the
    /// [`EvictionCancelled`] reason: the timeline is gone, the layer is no longer in the
    /// wanted-evicted state for this version, or removing the file failed.
    async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
        // deleted or detached timeline, don't do anything.
        let Some(timeline) = self.timeline.upgrade() else {
            return Err(EvictionCancelled::TimelineGone);
        };

        // to avoid starting a new download while we evict, keep holding on to the
        // permit.
        let _permit = {
            let maybe_downloaded = self.inner.get_mut().await;

            let (_weak, permit) = match maybe_downloaded {
                Some(mut guard) => {
                    if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
                        if *version == only_version {
                            // this eviction is ours: deinitialize `inner` and keep the permit
                            guard.take_and_deinit()
                        } else {
                            // this was not for us; maybe there's another eviction job
                            // TODO: does it make any sense to stall here? unique versions do not
                            // matter, we only want to make sure not to evict a resident, which we
                            // are not doing.
                            return Err(EvictionCancelled::VersionCheckFailed);
                        }
                    } else {
                        // `inner` is initialized but not in the `WantedEvicted` state
                        return Err(EvictionCancelled::AlreadyReinitialized);
                    }
                }
                None => {
                    // already deinitialized, perhaps get_or_maybe_download did this and is
                    // currently waiting to reinitialize it
                    return Err(EvictionCancelled::LostToDownload);
                }
            };

            permit
        };

        // now accesses to inner.get_or_init wait on the semaphore or the `_permit`

        // NOTE: the residence event is recorded before the file removal is attempted, so it
        // is recorded even if the removal below fails.
        self.access_stats.record_residence_event(
            LayerResidenceStatus::Evicted,
            LayerResidenceEventReason::ResidenceChange,
        );

        let res = match capture_mtime_and_remove(&self.path) {
            Ok(local_layer_mtime) => {
                // residence duration is measured from the file's mtime until now;
                // `duration_since` fails if the mtime is later than the current system time.
                let duration = SystemTime::now().duration_since(local_layer_mtime);
                match duration {
                    Ok(elapsed) => {
                        timeline
                            .metrics
                            .evictions_with_low_residence_duration
                            .read()
                            .unwrap()
                            .observe(elapsed);
                        tracing::info!(
                            residence_millis = elapsed.as_millis(),
                            "evicted layer after known residence period"
                        );
                    }
                    Err(_) => {
                        tracing::info!("evicted layer after unknown residence period");
                    }
                }
                timeline.metrics.evictions.inc();
                timeline
                    .metrics
                    .resident_physical_size_sub(self.desc.file_size);

                Ok(())
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                tracing::error!(
                    layer_size = %self.desc.file_size,
                    "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
                );
                Err(EvictionCancelled::FileNotFound)
            }
            Err(e) => {
                tracing::error!("failed to evict file from disk: {e:#}");
                Err(EvictionCancelled::RemoveFailed)
            }
        };

        // we are still holding the permit, so no new spawn_download_and_wait can happen
        //
        // the send result is deliberately discarded; this and the timestamp update below run
        // regardless of whether `res` is `Ok` or `Err`.
        drop(self.status.send(Status::Evicted));

        *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());

        res
    }
    1155              : 
    1156        37184 :     fn metadata(&self) -> LayerFileMetadata {
    1157        37184 :         LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
    1158        37184 :     }
    1159              : }
    1160              : 
    1161         2561 : fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
    1162         2561 :     let m = path.metadata()?;
    1163         2561 :     let local_layer_mtime = m.modified()?;
    1164         2561 :     std::fs::remove_file(path)?;
    1165         2561 :     Ok(local_layer_mtime)
    1166         2561 : }
    1167              : 
/// Reasons why an eviction did not take place.
// NOTE(review): the code returning this error is not visible in this part of the file;
// presumably it is the eviction entry point on `Layer` -- confirm.
#[derive(Debug, thiserror::Error)]
pub(crate) enum EvictionError {
    /// There was nothing to evict: the layer was already evicted.
    #[error("layer was already evicted")]
    NotFound,

    /// Evictions must always lose to downloads in races, and this time it happened.
    #[error("layer was downloaded instead")]
    Downloaded,
}
    1177              : 
/// Error internal to the [`LayerInner::get_or_maybe_download`]
#[derive(Debug, thiserror::Error)]
enum DownloadError {
    #[error("timeline has already shutdown")]
    TimelineShutdown,
    #[error("no remote storage configured")]
    NoRemoteStorage,
    #[error("context denies downloading")]
    ContextAndConfigReallyDeniesDownloads,
    #[error("downloading is really required but not allowed by this method")]
    DownloadRequired,
    /// Carries the file type that was found at the layer path instead of a regular file.
    #[error("layer path exists, but it is not a file: {0:?}")]
    NotFile(std::fs::FileType),
    /// Why no error here? Because it will be reported by page_service. We should have also
    /// done retries already.
    #[error("downloading evicted layer file failed")]
    DownloadFailed,
    #[error("downloading failed, possibly for shutdown")]
    DownloadCancelled,
    /// The wrapped I/O error is exposed as the error source.
    #[error("pre-condition: stat before download failed")]
    PreStatFailed(#[source] std::io::Error),
    /// The wrapped I/O error is exposed as the error source.
    #[error("post-condition: stat after download failed")]
    PostStatFailed(#[source] std::io::Error),
}
    1202              : 
/// Why the local layer file cannot be used as it is; see the `Display` impl for the
/// human-readable form of each variant.
#[derive(Debug, PartialEq)]
pub(crate) enum NeedsDownload {
    /// The file was not found.
    NotFound,
    /// The path exists but is not a regular file; carries the file type that was found.
    NotFile(std::fs::FileType),
    /// The file length (`actual`) differs from the size in the layer descriptor (`expected`).
    WrongSize { actual: u64, expected: u64 },
}
    1209              : 
    1210              : impl std::fmt::Display for NeedsDownload {
    1211         9784 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1212         9784 :         match self {
    1213         9784 :             NeedsDownload::NotFound => write!(f, "file was not found"),
    1214            0 :             NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
    1215            0 :             NeedsDownload::WrongSize { actual, expected } => {
    1216            0 :                 write!(f, "file size mismatch {actual} vs. {expected}")
    1217              :             }
    1218              :         }
    1219         9784 :     }
    1220              : }
    1221              : 
/// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
///
/// Dropping it notifies the owning `LayerInner` (if that is still alive) through
/// `LayerInner::on_downloaded_layer_drop`.
pub(crate) struct DownloadedLayer {
    /// Weak reference back to the owning `LayerInner`; upgraded on drop.
    owner: Weak<LayerInner>,
    // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
    // DownloadedLayer
    /// Lazily loaded layer contents. A stored `Err` is a permanent loading failure.
    kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    /// Passed to `LayerInner::on_downloaded_layer_drop` when this value is dropped.
    version: usize,
}
    1230              : 
    1231              : impl std::fmt::Debug for DownloadedLayer {
    1232            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1233            0 :         f.debug_struct("DownloadedLayer")
    1234            0 :             // owner omitted because it is always "Weak"
    1235            0 :             .field("kind", &self.kind)
    1236            0 :             .field("version", &self.version)
    1237            0 :             .finish()
    1238            0 :     }
    1239              : }
    1240              : 
    1241              : impl Drop for DownloadedLayer {
    1242        25539 :     fn drop(&mut self) {
    1243        25539 :         if let Some(owner) = self.owner.upgrade() {
    1244         2561 :             owner.on_downloaded_layer_drop(self.version);
    1245        22978 :         } else {
    1246        22978 :             // no need to do anything, we are shutting down
    1247        22978 :         }
    1248        25539 :     }
    1249              : }
    1250              : 
    1251              : impl DownloadedLayer {
    1252              :     /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
    1253              :     /// initialize it permanently.
    1254              :     ///
    1255              :     /// `owner` parameter is a strong reference at the same `LayerInner` as the
    1256              :     /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
    1257              :     /// we will always have the LayerInner on the callstack, so we can just use it.
    1258     23938777 :     async fn get<'a>(
    1259     23938777 :         &'a self,
    1260     23938777 :         owner: &Arc<LayerInner>,
    1261     23938777 :         ctx: &RequestContext,
    1262     23938777 :     ) -> anyhow::Result<&'a LayerKind> {
    1263     23938769 :         let init = || async {
    1264        34407 :             assert_eq!(
    1265        34407 :                 Weak::as_ptr(&self.owner),
    1266        34407 :                 Arc::as_ptr(owner),
    1267     23938769 :                 "these are the same, just avoiding the upgrade"
    1268     23938769 :             );
    1269     23938769 : 
    1270     23938769 :             let res = if owner.desc.is_delta {
    1271     23938769 :                 let summary = Some(delta_layer::Summary::expected(
    1272        12046 :                     owner.desc.tenant_shard_id.tenant_id,
    1273        12046 :                     owner.desc.timeline_id,
    1274        12046 :                     owner.desc.key_range.clone(),
    1275        12046 :                     owner.desc.lsn_range.clone(),
    1276        12046 :                 ));
    1277        12046 :                 delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
    1278     23938769 :                     .await
    1279     23938769 :                     .map(|res| res.map(LayerKind::Delta))
    1280     23938769 :             } else {
    1281     23938769 :                 let lsn = owner.desc.image_layer_lsn();
    1282        22361 :                 let summary = Some(image_layer::Summary::expected(
    1283        22361 :                     owner.desc.tenant_shard_id.tenant_id,
    1284        22361 :                     owner.desc.timeline_id,
    1285        22361 :                     owner.desc.key_range.clone(),
    1286        22361 :                     lsn,
    1287        22361 :                 ));
    1288        22361 :                 image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
    1289     23938769 :                     .await
    1290     23938769 :                     .map(|res| res.map(LayerKind::Image))
    1291     23938769 :             };
    1292     23938769 : 
    1293     23938769 :             match res {
    1294     23938769 :                 Ok(Ok(layer)) => Ok(Ok(layer)),
    1295     23938769 :                 Ok(Err(transient)) => Err(transient),
    1296     23938769 :                 Err(permanent) => {
    1297            1 :                     LAYER_IMPL_METRICS.inc_permanent_loading_failures();
    1298            1 :                     // TODO(#5815): we are not logging all errors, so temporarily log them **once**
    1299            1 :                     // here as well
    1300            1 :                     let permanent = permanent.context("load layer");
    1301            1 :                     tracing::error!("layer loading failed permanently: {permanent:#}");
    1302     23938769 :                     Ok(Err(permanent))
    1303     23938769 :                 }
    1304     23938769 :             }
    1305     23938769 :         };
    1306     23938769 :         self.kind
    1307     23938769 :             .get_or_try_init(init)
    1308              :             // return transient errors using `?`
    1309         1503 :             .await?
    1310     23938769 :             .as_ref()
    1311     23938769 :             .map_err(|e| {
    1312           20 :                 // errors are not clonabled, cannot but stringify
    1313           20 :                 // test_broken_timeline matches this string
    1314           20 :                 anyhow::anyhow!("layer loading failed: {e:#}")
    1315     23938769 :             })
    1316     23938769 :     }
    1317              : 
    1318     23934545 :     async fn get_value_reconstruct_data(
    1319     23934545 :         &self,
    1320     23934545 :         key: Key,
    1321     23934545 :         lsn_range: Range<Lsn>,
    1322     23934545 :         reconstruct_data: &mut ValueReconstructState,
    1323     23934545 :         owner: &Arc<LayerInner>,
    1324     23934545 :         ctx: &RequestContext,
    1325     23934545 :     ) -> anyhow::Result<ValueReconstructResult> {
    1326     23934537 :         use LayerKind::*;
    1327     23934537 : 
    1328     23934537 :         match self.get(owner, ctx).await? {
    1329     22799298 :             Delta(d) => {
    1330     22799298 :                 d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
    1331       995899 :                     .await
    1332              :             }
    1333      1135219 :             Image(i) => {
    1334      1135219 :                 i.get_value_reconstruct_data(key, reconstruct_data, ctx)
    1335        14554 :                     .await
    1336              :             }
    1337              :         }
    1338     23934535 :     }
    1339              : 
    1340            4 :     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
    1341            4 :         use LayerKind::*;
    1342            4 :         match self.get(owner, ctx).await? {
    1343            4 :             Delta(d) => d.dump(ctx).await?,
    1344            0 :             Image(i) => i.dump(ctx).await?,
    1345              :         }
    1346              : 
    1347            4 :         Ok(())
    1348            4 :     }
    1349              : }
    1350              : 
/// Wrapper around an actual layer implementation.
#[derive(Debug)]
enum LayerKind {
    /// A loaded delta layer.
    Delta(delta_layer::DeltaLayerInner),
    /// A loaded image layer.
    Image(image_layer::ImageLayerInner),
}
    1357              : 
/// Guard for forcing a layer to be resident while it exists.
///
/// Converting it back into a [`Layer`] (see `drop_eviction_guard`) releases the guard.
#[derive(Clone)]
pub(crate) struct ResidentLayer {
    /// The layer this guard belongs to.
    owner: Layer,
    /// Strong reference to the downloaded state, held for as long as the guard exists.
    downloaded: Arc<DownloadedLayer>,
}
    1364              : 
    1365              : impl std::fmt::Display for ResidentLayer {
    1366        34127 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1367        34127 :         write!(f, "{}", self.owner)
    1368        34127 :     }
    1369              : }
    1370              : 
    1371              : impl std::fmt::Debug for ResidentLayer {
    1372            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1373            0 :         write!(f, "{}", self.owner)
    1374            0 :     }
    1375              : }
    1376              : 
    1377              : impl ResidentLayer {
    1378              :     /// Release the eviction guard, converting back into a plain [`Layer`].
    1379              :     ///
    1380              :     /// You can access the [`Layer`] also by using `as_ref`.
    1381        20893 :     pub(crate) fn drop_eviction_guard(self) -> Layer {
    1382        20893 :         self.into()
    1383        20893 :     }
    1384              : 
    1385              :     /// Loads all keys stored in the layer. Returns key, lsn and value size.
    1386            0 :     #[tracing::instrument(skip_all, fields(layer=%self))]
    1387              :     pub(crate) async fn load_keys<'a>(
    1388              :         &'a self,
    1389              :         ctx: &RequestContext,
    1390              :     ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
    1391              :         use LayerKind::*;
    1392              : 
    1393              :         let owner = &self.owner.0;
    1394              : 
    1395              :         match self.downloaded.get(owner, ctx).await? {
    1396              :             Delta(ref d) => {
    1397              :                 owner
    1398              :                     .access_stats
    1399              :                     .record_access(LayerAccessKind::KeyIter, ctx);
    1400              : 
    1401              :                 // this is valid because the DownloadedLayer::kind is a OnceCell, not a
    1402              :                 // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
    1403              :                 // while it's being held.
    1404              :                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
    1405              :                     .await
    1406              :                     .context("Layer index is corrupted")
    1407              :             }
    1408              :             Image(_) => anyhow::bail!("cannot load_keys on a image layer"),
    1409              :         }
    1410              :     }
    1411              : 
    1412        45150 :     pub(crate) fn local_path(&self) -> &Utf8Path {
    1413        45150 :         &self.owner.0.path
    1414        45150 :     }
    1415              : 
    1416         3017 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
    1417         3017 :         self.owner.access_stats()
    1418         3017 :     }
    1419              : 
    1420        21990 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
    1421        21990 :         self.owner.metadata()
    1422        21990 :     }
    1423              : }
    1424              : 
    1425              : impl AsLayerDesc for ResidentLayer {
    1426        76762 :     fn layer_desc(&self) -> &PersistentLayerDesc {
    1427        76762 :         self.owner.layer_desc()
    1428        76762 :     }
    1429              : }
    1430              : 
    1431              : impl AsRef<Layer> for ResidentLayer {
    1432        32712 :     fn as_ref(&self) -> &Layer {
    1433        32712 :         &self.owner
    1434        32712 :     }
    1435              : }
    1436              : 
    1437              : /// Drop the eviction guard.
    1438              : impl From<ResidentLayer> for Layer {
    1439        20893 :     fn from(value: ResidentLayer) -> Self {
    1440        20893 :         value.owner
    1441        20893 :     }
    1442              : }
    1443              : 
    1444              : use metrics::IntCounter;
    1445              : 
/// Counters and a histogram describing evictions, deletions on drop and rare events in the
/// Layer implementation. All of them are registered in the `Default` impl.
pub(crate) struct LayerImplMetrics {
    /// Evictions started.
    started_evictions: IntCounter,
    /// Evictions completed.
    completed_evictions: IntCounter,
    /// Evictions cancelled or failed, one counter per [`EvictionCancelled`] reason.
    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,

    /// Deletions on drop pending.
    started_deletes: IntCounter,
    /// Deletions on drop completed.
    completed_deletes: IntCounter,
    /// Deletions on drop failed, one counter per [`DeleteFailed`] reason.
    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,

    /// Unexpected or assumed rare events, one counter per [`RareEvent`].
    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
    /// Times Layer initialization was cancelled.
    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
    /// Time between evicting and re-downloading.
    redownload_after: metrics::Histogram,
}
    1459              : 
    1460              : impl Default for LayerImplMetrics {
    1461          610 :     fn default() -> Self {
    1462          610 :         use enum_map::Enum;
    1463          610 : 
    1464          610 :         // reminder: these will be pageserver_layer_* with "_total" suffix
    1465          610 : 
    1466          610 :         let started_evictions = metrics::register_int_counter!(
    1467          610 :             "pageserver_layer_started_evictions",
    1468          610 :             "Evictions started in the Layer implementation"
    1469          610 :         )
    1470          610 :         .unwrap();
    1471          610 :         let completed_evictions = metrics::register_int_counter!(
    1472          610 :             "pageserver_layer_completed_evictions",
    1473          610 :             "Evictions completed in the Layer implementation"
    1474          610 :         )
    1475          610 :         .unwrap();
    1476          610 : 
    1477          610 :         let cancelled_evictions = metrics::register_int_counter_vec!(
    1478          610 :             "pageserver_layer_cancelled_evictions_count",
    1479          610 :             "Different reasons for evictions to have been cancelled or failed",
    1480          610 :             &["reason"]
    1481          610 :         )
    1482          610 :         .unwrap();
    1483          610 : 
    1484         4880 :         let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1485         4880 :             let reason = EvictionCancelled::from_usize(i);
    1486         4880 :             let s = reason.as_str();
    1487         4880 :             cancelled_evictions.with_label_values(&[s])
    1488         4880 :         }));
    1489          610 : 
    1490          610 :         let started_deletes = metrics::register_int_counter!(
    1491          610 :             "pageserver_layer_started_deletes",
    1492          610 :             "Deletions on drop pending in the Layer implementation"
    1493          610 :         )
    1494          610 :         .unwrap();
    1495          610 :         let completed_deletes = metrics::register_int_counter!(
    1496          610 :             "pageserver_layer_completed_deletes",
    1497          610 :             "Deletions on drop completed in the Layer implementation"
    1498          610 :         )
    1499          610 :         .unwrap();
    1500          610 : 
    1501          610 :         let failed_deletes = metrics::register_int_counter_vec!(
    1502          610 :             "pageserver_layer_failed_deletes_count",
    1503          610 :             "Different reasons for deletions on drop to have failed",
    1504          610 :             &["reason"]
    1505          610 :         )
    1506          610 :         .unwrap();
    1507          610 : 
    1508         1220 :         let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1509         1220 :             let reason = DeleteFailed::from_usize(i);
    1510         1220 :             let s = reason.as_str();
    1511         1220 :             failed_deletes.with_label_values(&[s])
    1512         1220 :         }));
    1513          610 : 
    1514          610 :         let rare_counters = metrics::register_int_counter_vec!(
    1515          610 :             "pageserver_layer_assumed_rare_count",
    1516          610 :             "Times unexpected or assumed rare event happened",
    1517          610 :             &["event"]
    1518          610 :         )
    1519          610 :         .unwrap();
    1520          610 : 
    1521         4270 :         let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1522         4270 :             let event = RareEvent::from_usize(i);
    1523         4270 :             let s = event.as_str();
    1524         4270 :             rare_counters.with_label_values(&[s])
    1525         4270 :         }));
    1526          610 : 
    1527          610 :         let inits_cancelled = metrics::register_int_counter!(
    1528          610 :             "pageserver_layer_inits_cancelled_count",
    1529          610 :             "Times Layer initialization was cancelled",
    1530          610 :         )
    1531          610 :         .unwrap();
    1532          610 : 
    1533          610 :         let redownload_after = {
    1534          610 :             let minute = 60.0;
    1535          610 :             let hour = 60.0 * minute;
    1536          610 :             metrics::register_histogram!(
    1537          610 :                 "pageserver_layer_redownloaded_after",
    1538          610 :                 "Time between evicting and re-downloading.",
    1539          610 :                 vec![
    1540          610 :                     10.0,
    1541          610 :                     30.0,
    1542          610 :                     minute,
    1543          610 :                     5.0 * minute,
    1544          610 :                     15.0 * minute,
    1545          610 :                     30.0 * minute,
    1546          610 :                     hour,
    1547          610 :                     12.0 * hour,
    1548          610 :                 ]
    1549          610 :             )
    1550          610 :             .unwrap()
    1551          610 :         };
    1552          610 : 
    1553          610 :         Self {
    1554          610 :             started_evictions,
    1555          610 :             completed_evictions,
    1556          610 :             cancelled_evictions,
    1557          610 : 
    1558          610 :             started_deletes,
    1559          610 :             completed_deletes,
    1560          610 :             failed_deletes,
    1561          610 : 
    1562          610 :             rare_counters,
    1563          610 :             inits_cancelled,
    1564          610 :             redownload_after,
    1565          610 :         }
    1566          610 :     }
    1567              : }
    1568              : 
    1569              : impl LayerImplMetrics {
    1570         2561 :     fn inc_started_evictions(&self) {
    1571         2561 :         self.started_evictions.inc();
    1572         2561 :     }
    1573         2561 :     fn inc_completed_evictions(&self) {
    1574         2561 :         self.completed_evictions.inc();
    1575         2561 :     }
    1576            0 :     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
    1577            0 :         self.cancelled_evictions[reason].inc()
    1578            0 :     }
    1579              : 
    1580         5410 :     fn inc_started_deletes(&self) {
    1581         5410 :         self.started_deletes.inc();
    1582         5410 :     }
    1583         5396 :     fn inc_completed_deletes(&self) {
    1584         5396 :         self.completed_deletes.inc();
    1585         5396 :     }
    1586           14 :     fn inc_deletes_failed(&self, reason: DeleteFailed) {
    1587           14 :         self.failed_deletes[reason].inc();
    1588           14 :     }
    1589              : 
    1590              :     /// Counted separately from failed layer deletes because we will complete the layer deletion
    1591              :     /// attempt regardless of failure to delete local file.
    1592            0 :     fn inc_delete_removes_failed(&self) {
    1593            0 :         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
    1594            0 :     }
    1595              : 
    1596              :     /// Expected rare because it requires a race with `evict_blocking` and `get_or_maybe_download`.
    1597            0 :     fn inc_retried_get_or_maybe_download(&self) {
    1598            0 :         self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
    1599            0 :     }
    1600              : 
    1601              :     /// Expected rare because cancellations are unexpected, and failures are unexpected
    1602            7 :     fn inc_download_failed_without_requester(&self) {
    1603            7 :         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
    1604            7 :     }
    1605              : 
    1606              :     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
    1607              :     ///
    1608              :     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    1609              :     /// Option.
    1610            0 :     fn inc_raced_wanted_evicted_accesses(&self) {
    1611            0 :         self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
    1612            0 :     }
    1613              : 
    1614              :     /// These are only expected to occur as many times as [`Self::inc_init_cancelled`] when
    1615              :     /// running with remote storage.
    1616            0 :     fn inc_init_needed_no_download(&self) {
    1617            0 :         self.rare_counters[RareEvent::InitWithoutDownload].inc();
    1618            0 :     }
    1619              : 
    1620              :     /// Expected rare because all layer files should be readable and good
    1621            1 :     fn inc_permanent_loading_failures(&self) {
    1622            1 :         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
    1623            1 :     }
    1624              : 
    1625            0 :     fn inc_broadcast_lagged(&self) {
    1626            0 :         self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
    1627            0 :     }
    1628              : 
    1629          603 :     fn inc_init_cancelled(&self) {
    1630          603 :         self.inits_cancelled.inc()
    1631          603 :     }
    1632              : 
    1633          123 :     fn record_redownloaded_after(&self, duration: std::time::Duration) {
    1634          123 :         self.redownload_after.observe(duration.as_secs_f64())
    1635          123 :     }
    1636              : }
    1637              : 
    1638         4880 : #[derive(enum_map::Enum)]
    1639              : enum EvictionCancelled {
    1640              :     LayerGone,
    1641              :     TimelineGone,
    1642              :     VersionCheckFailed,
    1643              :     FileNotFound,
    1644              :     RemoveFailed,
    1645              :     AlreadyReinitialized,
    1646              :     /// Not evicted because of a pending reinitialization
    1647              :     LostToDownload,
    1648              :     /// After eviction, there was a new layer access which cancelled the eviction.
    1649              :     UpgradedBackOnAccess,
    1650              : }
    1651              : 
    1652              : impl EvictionCancelled {
    1653         4880 :     fn as_str(&self) -> &'static str {
    1654         4880 :         match self {
    1655          610 :             EvictionCancelled::LayerGone => "layer_gone",
    1656          610 :             EvictionCancelled::TimelineGone => "timeline_gone",
    1657          610 :             EvictionCancelled::VersionCheckFailed => "version_check_fail",
    1658          610 :             EvictionCancelled::FileNotFound => "file_not_found",
    1659          610 :             EvictionCancelled::RemoveFailed => "remove_failed",
    1660          610 :             EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
    1661          610 :             EvictionCancelled::LostToDownload => "lost_to_download",
    1662          610 :             EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
    1663              :         }
    1664         4880 :     }
    1665              : }
    1666              : 
    1667         1234 : #[derive(enum_map::Enum)]
    1668              : enum DeleteFailed {
    1669              :     TimelineGone,
    1670              :     DeleteSchedulingFailed,
    1671              : }
    1672              : 
    1673              : impl DeleteFailed {
    1674         1220 :     fn as_str(&self) -> &'static str {
    1675         1220 :         match self {
    1676          610 :             DeleteFailed::TimelineGone => "timeline_gone",
    1677          610 :             DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
    1678              :         }
    1679         1220 :     }
    1680              : }
    1681              : 
    1682         4278 : #[derive(enum_map::Enum)]
    1683              : enum RareEvent {
    1684              :     RemoveOnDropFailed,
    1685              :     RetriedGetOrMaybeDownload,
    1686              :     DownloadFailedWithoutRequester,
    1687              :     UpgradedWantedEvicted,
    1688              :     InitWithoutDownload,
    1689              :     PermanentLoadingFailure,
    1690              :     EvictAndWaitLagged,
    1691              : }
    1692              : 
    1693              : impl RareEvent {
    1694         4270 :     fn as_str(&self) -> &'static str {
    1695         4270 :         use RareEvent::*;
    1696         4270 : 
    1697         4270 :         match self {
    1698          610 :             RemoveOnDropFailed => "remove_on_drop_failed",
    1699          610 :             RetriedGetOrMaybeDownload => "retried_gomd",
    1700          610 :             DownloadFailedWithoutRequester => "download_failed_without",
    1701          610 :             UpgradedWantedEvicted => "raced_wanted_evicted",
    1702          610 :             InitWithoutDownload => "init_needed_no_download",
    1703          610 :             PermanentLoadingFailure => "permanent_loading_failure",
    1704          610 :             EvictAndWaitLagged => "broadcast_lagged",
    1705              :         }
    1706         4270 :     }
    1707              : }
    1708              : 
    1709              : pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    1710              :     once_cell::sync::Lazy::new(LayerImplMetrics::default);
        

Generated by: LCOV version 2.1-beta