LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - layer.rs (source / functions) Coverage Total Hit
Test: aca8877be6ceba750c1be359ed71bc1799d52b30.info Lines: 86.1 % 1055 908
Test Date: 2024-02-14 18:05:35 Functions: 82.0 % 150 123

            Line data    Source code
       1              : use anyhow::Context;
       2              : use camino::{Utf8Path, Utf8PathBuf};
       3              : use pageserver_api::models::{
       4              :     HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
       5              : };
       6              : use pageserver_api::shard::ShardIndex;
       7              : use std::ops::Range;
       8              : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
       9              : use std::sync::{Arc, Weak};
      10              : use std::time::SystemTime;
      11              : use tracing::Instrument;
      12              : use utils::lsn::Lsn;
      13              : use utils::sync::heavier_once_cell;
      14              : 
      15              : use crate::config::PageServerConf;
      16              : use crate::context::RequestContext;
      17              : use crate::repository::Key;
      18              : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
      19              : use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
      20              : 
      21              : use super::delta_layer::{self, DeltaEntry};
      22              : use super::image_layer;
      23              : use super::{
      24              :     AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
      25              :     ValueReconstructResult, ValueReconstructState,
      26              : };
      27              : 
      28              : use utils::generation::Generation;
      29              : 
      30              : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
      31              : /// range of LSNs.
      32              : ///
      33              : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
      34              : /// layers are used to ingest incoming WAL, and provide fast access to the
      35              : /// recent page versions. On-disk layers are stored as files on disk, and are
      36              : /// immutable. This type represents the on-disk kind, while the in-memory kind is represented by
      37              : /// [`InMemoryLayer`].
      38              : ///
      39              : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
      40              : /// A delta layer contains all modifications within a range of LSNs and keys.
      41              : /// An image layer is a snapshot of all the data in a key-range, at a single
      42              : /// LSN.
      43              : ///
      44              : /// This type models the on-disk layers, which can be evicted and on-demand downloaded.
      45              : ///
      46              : /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
      47     33838544 : #[derive(Clone)]
      48              : pub(crate) struct Layer(Arc<LayerInner>);
      49              : 
      50              : impl std::fmt::Display for Layer {
      51        34303 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      52        34303 :         if matches!(self.0.generation, Generation::Broken) {
      53            0 :             write!(f, "{}-broken", self.layer_desc().short_id())
      54              :         } else {
      55        34303 :             write!(
      56        34303 :                 f,
      57        34303 :                 "{}{}",
      58        34303 :                 self.layer_desc().short_id(),
      59        34303 :                 self.0.generation.get_suffix()
      60        34303 :             )
      61              :         }
      62        34303 :     }
      63              : }
      64              : 
      65              : impl std::fmt::Debug for Layer {
      66            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      67            0 :         write!(f, "{}", self)
      68            0 :     }
      69              : }
      70              : 
      71              : impl AsLayerDesc for Layer {
      72     51268809 :     fn layer_desc(&self) -> &PersistentLayerDesc {
      73     51268809 :         self.0.layer_desc()
      74     51268809 :     }
      75              : }
      76              : 
      77              : impl Layer {
      78              :     /// Creates a layer value for a file we know to not be resident.
      79        40899 :     pub(crate) fn for_evicted(
      80        40899 :         conf: &'static PageServerConf,
      81        40899 :         timeline: &Arc<Timeline>,
      82        40899 :         file_name: LayerFileName,
      83        40899 :         metadata: LayerFileMetadata,
      84        40899 :     ) -> Self {
      85        40899 :         let desc = PersistentLayerDesc::from_filename(
      86        40899 :             timeline.tenant_shard_id,
      87        40899 :             timeline.timeline_id,
      88        40899 :             file_name,
      89        40899 :             metadata.file_size(),
      90        40899 :         );
      91        40899 : 
      92        40899 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
      93        40899 : 
      94        40899 :         let owner = Layer(Arc::new(LayerInner::new(
      95        40899 :             conf,
      96        40899 :             timeline,
      97        40899 :             access_stats,
      98        40899 :             desc,
      99        40899 :             None,
     100        40899 :             metadata.generation,
     101        40899 :             metadata.shard,
     102        40899 :         )));
     103              : 
     104        40899 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
     105              : 
     106        40899 :         owner
     107        40899 :     }
     108              : 
     109              :     /// Creates a Layer value for a file we know to be resident in timeline directory.
     110        12153 :     pub(crate) fn for_resident(
     111        12153 :         conf: &'static PageServerConf,
     112        12153 :         timeline: &Arc<Timeline>,
     113        12153 :         file_name: LayerFileName,
     114        12153 :         metadata: LayerFileMetadata,
     115        12153 :     ) -> ResidentLayer {
     116        12153 :         let desc = PersistentLayerDesc::from_filename(
     117        12153 :             timeline.tenant_shard_id,
     118        12153 :             timeline.timeline_id,
     119        12153 :             file_name,
     120        12153 :             metadata.file_size(),
     121        12153 :         );
     122        12153 : 
     123        12153 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
     124        12153 : 
     125        12153 :         let mut resident = None;
     126        12153 : 
     127        12153 :         let owner = Layer(Arc::new_cyclic(|owner| {
     128        12153 :             let inner = Arc::new(DownloadedLayer {
     129        12153 :                 owner: owner.clone(),
     130        12153 :                 kind: tokio::sync::OnceCell::default(),
     131        12153 :                 version: 0,
     132        12153 :             });
     133        12153 :             resident = Some(inner.clone());
     134        12153 : 
     135        12153 :             LayerInner::new(
     136        12153 :                 conf,
     137        12153 :                 timeline,
     138        12153 :                 access_stats,
     139        12153 :                 desc,
     140        12153 :                 Some(inner),
     141        12153 :                 metadata.generation,
     142        12153 :                 metadata.shard,
     143        12153 :             )
     144        12153 :         }));
     145        12153 : 
     146        12153 :         let downloaded = resident.expect("just initialized");
     147              : 
     148        12153 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
     149              : 
     150        12153 :         timeline
     151        12153 :             .metrics
     152        12153 :             .resident_physical_size_add(metadata.file_size());
     153        12153 : 
     154        12153 :         ResidentLayer { downloaded, owner }
     155        12153 :     }
     156              : 
     157              :     /// Creates a Layer value for freshly written out new layer file by renaming it from a
     158              :     /// temporary path.
     159        22304 :     pub(crate) fn finish_creating(
     160        22304 :         conf: &'static PageServerConf,
     161        22304 :         timeline: &Arc<Timeline>,
     162        22304 :         desc: PersistentLayerDesc,
     163        22304 :         temp_path: &Utf8Path,
     164        22304 :     ) -> anyhow::Result<ResidentLayer> {
     165        22304 :         let mut resident = None;
     166        22304 : 
     167        22304 :         let owner = Layer(Arc::new_cyclic(|owner| {
     168        22304 :             let inner = Arc::new(DownloadedLayer {
     169        22304 :                 owner: owner.clone(),
     170        22304 :                 kind: tokio::sync::OnceCell::default(),
     171        22304 :                 version: 0,
     172        22304 :             });
     173        22304 :             resident = Some(inner.clone());
     174        22304 :             let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
     175        22304 :             access_stats.record_residence_event(
     176        22304 :                 LayerResidenceStatus::Resident,
     177        22304 :                 LayerResidenceEventReason::LayerCreate,
     178        22304 :             );
     179        22304 :             LayerInner::new(
     180        22304 :                 conf,
     181        22304 :                 timeline,
     182        22304 :                 access_stats,
     183        22304 :                 desc,
     184        22304 :                 Some(inner),
     185        22304 :                 timeline.generation,
     186        22304 :                 timeline.get_shard_index(),
     187        22304 :             )
     188        22304 :         }));
     189        22304 : 
     190        22304 :         let downloaded = resident.expect("just initialized");
     191        22304 : 
     192        22304 :         // if the rename works, the path is as expected
     193        22304 :         std::fs::rename(temp_path, owner.local_path())
     194        22304 :             .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
     195              : 
     196        22304 :         Ok(ResidentLayer { downloaded, owner })
     197        22304 :     }
     198              : 
     199              :     /// Requests the layer to be evicted and waits for this to be done.
     200              :     ///
     201              :     /// If the file is not resident, an [`EvictionError::NotFound`] is returned.
     202              :     ///
     203              :     /// If, due to bad luck or blocking of the executor, we miss the actual eviction and the layer
     204              :     /// is re-downloaded, [`EvictionError::Downloaded`] is returned.
     205              :     ///
     206              :     /// Technically cancellation safe, but cancelling might change which generation of the
     207              :     /// download-evict cycle a retry ends up observing.
     208         2517 :     pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
     209         2517 :         self.0.evict_and_wait().await
     210         2517 :     }
     211              : 
     212              :     /// Delete the layer file when `self` gets dropped, and also try to schedule a remote index
     213              :     /// upload at that point.
     214              :     ///
     215              :     /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
     216              :     /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
     217              :     /// the value this is called on gets dropped.
     218              :     ///
     219              :     /// This is ensured by both of those methods accepting references to Layer.
     220              :     ///
     221              :     /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
     222              :     /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
     223         5157 :     pub(crate) fn delete_on_drop(&self) {
     224         5157 :         self.0.delete_on_drop();
     225         5157 :     }
     226              : 
     227              :     /// Return data needed to reconstruct given page at LSN.
     228              :     ///
     229              :     /// It is up to the caller to collect more data from the previous layer and
     230              :     /// perform WAL redo, if necessary.
     231              :     ///
     232              :     /// # Cancellation-Safety
     233              :     ///
     234              :     /// This method is cancellation-safe.
     235     16843799 :     pub(crate) async fn get_value_reconstruct_data(
     236     16843799 :         &self,
     237     16843799 :         key: Key,
     238     16843799 :         lsn_range: Range<Lsn>,
     239     16843799 :         reconstruct_data: &mut ValueReconstructState,
     240     16843799 :         ctx: &RequestContext,
     241     16843799 :     ) -> anyhow::Result<ValueReconstructResult> {
     242              :         use anyhow::ensure;
     243              : 
     244     16843765 :         let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     245     16843756 :         self.0
     246     16843756 :             .access_stats
     247     16843756 :             .record_access(LayerAccessKind::GetValueReconstructData, ctx);
     248     16843756 : 
     249     16843756 :         if self.layer_desc().is_delta {
     250     16415732 :             ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
     251     16415732 :             ensure!(self.layer_desc().key_range.contains(&key));
     252              :         } else {
     253       428024 :             ensure!(self.layer_desc().key_range.contains(&key));
     254       428024 :             ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
     255       428024 :             ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
     256              :         }
     257              : 
     258     16843756 :         layer
     259     16843756 :             .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
     260     16843756 :             .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
     261       992605 :             .await
     262     16843751 :             .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     263     16843752 :     }
     264              : 
     265              :     /// Download the layer if evicted.
     266              :     ///
     267              :     /// Will not error when the layer is already downloaded.
     268           12 :     pub(crate) async fn download(&self) -> anyhow::Result<()> {
     269           32 :         self.0.get_or_maybe_download(true, None).await?;
     270            7 :         Ok(())
     271           12 :     }
     272              : 
     273              :     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     274              :     /// while the guard exists.
     275              :     ///
     276              :     /// Returns None if the layer is currently evicted.
     277         4812 :     pub(crate) async fn keep_resident(&self) -> anyhow::Result<Option<ResidentLayer>> {
     278         4812 :         let downloaded = match self.0.get_or_maybe_download(false, None).await {
     279         4196 :             Ok(d) => d,
     280              :             // technically there are a lot of possible errors, but in practice it should only be
     281              :             // DownloadRequired which is tripped up. could work to improve this situation
     282              :             // statically later.
     283          616 :             Err(DownloadError::DownloadRequired) => return Ok(None),
     284            0 :             Err(e) => return Err(e.into()),
     285              :         };
     286              : 
     287         4196 :         Ok(Some(ResidentLayer {
     288         4196 :             downloaded,
     289         4196 :             owner: self.clone(),
     290         4196 :         }))
     291         4812 :     }
     292              : 
     293              :     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
     294         4132 :     pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
     295         4132 :         let downloaded = self.0.get_or_maybe_download(true, None).await?;
     296              : 
     297         4132 :         Ok(ResidentLayer {
     298         4132 :             downloaded,
     299         4132 :             owner: self.clone(),
     300         4132 :         })
     301         4132 :     }
     302              : 
     303         3050 :     pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     304         3050 :         self.0.info(reset)
     305         3050 :     }
     306              : 
     307         4194 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
     308         4194 :         &self.0.access_stats
     309         4194 :     }
     310              : 
     311        23733 :     pub(crate) fn local_path(&self) -> &Utf8Path {
     312        23733 :         &self.0.path
     313        23733 :     }
     314              : 
     315        24849 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
     316        24849 :         self.0.metadata()
     317        24849 :     }
     318              : 
     319              :     /// Traditional debug dumping facility
     320              :     #[allow(unused)]
     321            4 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
     322            4 :         self.0.desc.dump();
     323            4 : 
     324            4 :         if verbose {
     325              :             // for now, unconditionally download everything, even if that might not be wanted.
     326            4 :             let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     327            8 :             l.dump(&self.0, ctx).await?
     328            0 :         }
     329              : 
     330            4 :         Ok(())
     331            4 :     }
     332              : 
     333              :     /// Waits until this layer has been dropped (and if needed, local file deletion and remote
     334              :     /// deletion scheduling has completed).
     335              :     ///
     336              :     /// Does not start local deletion, use [`Self::delete_on_drop`] for that
     337              :     /// separately.
     338              :     #[cfg(feature = "testing")]
     339         1080 :     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
     340         1080 :         let mut rx = self.0.status.subscribe();
     341              : 
     342         1080 :         async move {
     343              :             loop {
     344         1080 :                 if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
     345         1080 :                     break;
     346            0 :                 }
     347              :             }
     348         1080 :         }
     349         1080 :     }
     350              : }
     351              : 
     352              : /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
     353              : ///
     354              : /// However when we want something evicted, we cannot evict it right away as there might be current
     355              : /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
     356              : /// read with [`Layer::get_value_reconstruct_data`].
     357              : ///
     358              : /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
     359            0 : #[derive(Debug)]
     360              : enum ResidentOrWantedEvicted {
     361              :     Resident(Arc<DownloadedLayer>),
     362              :     WantedEvicted(Weak<DownloadedLayer>, usize),
     363              : }
     364              : 
     365              : impl ResidentOrWantedEvicted {
     366     16852129 :     fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
     367     16852129 :         match self {
     368     16852129 :             ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
     369            0 :             ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
     370            0 :                 Some(strong) => {
     371            0 :                     LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
     372            0 : 
     373            0 :                     *self = ResidentOrWantedEvicted::Resident(strong.clone());
     374            0 : 
     375            0 :                     Some((strong, true))
     376              :                 }
     377            0 :                 None => None,
     378              :             },
     379              :         }
     380     16852129 :     }
     381              : 
     382              :     /// When eviction is first requested, drop down to holding a [`Weak`].
     383              :     ///
     384              :     /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
     385              :     /// drop the possibly last strong reference outside of the mutex of
     386              :     /// heavier_once_cell::OnceCell.
     387         2515 :     fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
     388         2515 :         match self {
     389         2515 :             ResidentOrWantedEvicted::Resident(strong) => {
     390         2515 :                 let weak = Arc::downgrade(strong);
     391         2515 :                 let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
     392         2515 :                 std::mem::swap(self, &mut temp);
     393         2515 :                 match temp {
     394         2515 :                     ResidentOrWantedEvicted::Resident(strong) => Some(strong),
     395            0 :                     ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
     396              :                 }
     397              :             }
     398            0 :             ResidentOrWantedEvicted::WantedEvicted(..) => None,
     399              :         }
     400         2515 :     }
     401              : }
     402              : 
     403              : struct LayerInner {
     404              :     /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
     405              :     /// [`Self::path`].
     406              :     conf: &'static PageServerConf,
     407              : 
     408              :     /// Full path to the file; unclear if this should exist anymore.
     409              :     path: Utf8PathBuf,
     410              : 
     411              :     desc: PersistentLayerDesc,
     412              : 
     413              :     /// Timeline access is needed for remote timeline client and metrics.
     414              :     timeline: Weak<Timeline>,
     415              : 
     416              :     /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
     417              :     have_remote_client: bool,
     418              : 
     419              :     access_stats: LayerAccessStats,
     420              : 
     421              :     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
     422              :     /// Initialization and deinitialization are done while holding a permit.
     423              :     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
     424              : 
     425              :     /// Do we want to delete this locally and remotely when `LayerInner` is dropped?
     426              :     wanted_deleted: AtomicBool,
     427              : 
     428              :     /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
     429              :     /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
     430              :     /// [`LayerInner::on_downloaded_layer_drop`].
     431              :     wanted_evicted: AtomicBool,
     432              : 
     433              :     /// Version is to make sure we will only evict a specific download of a file.
     434              :     ///
     435              :     /// Incremented for each download, stored in `DownloadedLayer::version` or
     436              :     /// `ResidentOrWantedEvicted::WantedEvicted`.
     437              :     version: AtomicUsize,
     438              : 
     439              :     /// Allow subscribing to when the layer actually gets evicted.
     440              :     status: tokio::sync::broadcast::Sender<Status>,
     441              : 
     442              :     /// Counter for exponential backoff with the download
     443              :     consecutive_failures: AtomicUsize,
     444              : 
     445              :     /// The generation of this Layer.
     446              :     ///
     447              :     /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
     448              :     /// for created layers from [`Timeline::generation`].
     449              :     generation: Generation,
     450              : 
     451              :     /// The shard of this Layer.
     452              :     ///
     453              :     /// For layers created in this process, this will always be the [`ShardIndex`] of the
     454              :     /// current `ShardIdentity` (TODO: add link once it's introduced).
     455              :     ///
     456              :     /// For loaded layers, this may be some other value if the tenant has undergone
     457              :     /// a shard split since the layer was originally written.
     458              :     shard: ShardIndex,
     459              : 
     460              :     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
     461              : }
     462              : 
     463              : impl std::fmt::Display for LayerInner {
     464        22070 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     465        22070 :         write!(f, "{}", self.layer_desc().short_id())
     466        22070 :     }
     467              : }
     468              : 
     469              : impl AsLayerDesc for LayerInner {
     470     51316664 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     471     51316664 :         &self.desc
     472     51316664 :     }
     473              : }
     474              : 
     475         2515 : #[derive(Debug, Clone, Copy)]
     476              : enum Status {
     477              :     Evicted,
     478              :     Downloaded,
     479              : }
     480              : 
     481              : impl Drop for LayerInner {
     482        44421 :     fn drop(&mut self) {
     483        44421 :         if !*self.wanted_deleted.get_mut() {
     484              :             // should we try to evict if the last wish was for eviction?
     485              :             // feels like there's some hazard of overcrowding near shutdown, but we don't
     486              :             // run drops during shutdown (yet)
     487        39264 :             return;
     488         5157 :         }
     489              : 
     490         5157 :         let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
     491              : 
     492         5157 :         let path = std::mem::take(&mut self.path);
     493         5157 :         let file_name = self.layer_desc().filename();
     494         5157 :         let file_size = self.layer_desc().file_size;
     495         5157 :         let timeline = self.timeline.clone();
     496         5157 :         let meta = self.metadata();
     497         5157 :         let status = self.status.clone();
     498         5157 : 
     499         5157 :         crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
     500         5157 :             let _g = span.entered();
     501         5157 : 
     502         5157 :             // carry this until we are finished for [`Layer::wait_drop`] support
     503         5157 :             let _status = status;
     504              : 
     505         5157 :             let removed = match std::fs::remove_file(path) {
     506         5155 :                 Ok(()) => true,
     507            2 :                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
     508            2 :                     // until we no longer do detaches by removing all local files before removing the
     509            2 :                     // tenant from the global map, we will always get these errors even if we knew what
     510            2 :                     // is the latest state.
     511            2 :                     //
     512            2 :                     // we currently do not track the latest state, so we'll also end up here on evicted
     513            2 :                     // layers.
     514            2 :                     false
     515              :                 }
     516            0 :                 Err(e) => {
     517            0 :                     tracing::error!("failed to remove wanted deleted layer: {e}");
     518            0 :                     LAYER_IMPL_METRICS.inc_delete_removes_failed();
     519            0 :                     false
     520              :                 }
     521              :             };
     522              : 
     523         5157 :             if let Some(timeline) = timeline.upgrade() {
     524         5157 :                 if removed {
     525         5155 :                     timeline.metrics.resident_physical_size_sub(file_size);
     526         5155 :                 }
     527         5157 :                 if let Some(remote_client) = timeline.remote_client.as_ref() {
     528         5157 :                     let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
     529              : 
     530         5157 :                     if let Err(e) = res {
     531              :                         // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
     532              :                         // demonstrating this deadlock (without spawn_blocking): stop will drop
     533              :                         // queued items, which will have ResidentLayer's, and those drops would try
     534              :                         // to re-entrantly lock the RemoteTimelineClient inner state.
     535           14 :                         if !timeline.is_active() {
     536           14 :                             tracing::info!("scheduling deletion on drop failed: {e:#}");
     537              :                         } else {
     538            0 :                             tracing::warn!("scheduling deletion on drop failed: {e:#}");
     539              :                         }
     540           14 :                         LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
     541         5143 :                     } else {
     542         5143 :                         LAYER_IMPL_METRICS.inc_completed_deletes();
     543         5143 :                     }
     544            0 :                 }
     545            0 :             } else {
     546            0 :                 // no need to nag that timeline is gone: under normal situation on
     547            0 :                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
     548            0 :                 LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
     549            0 :             }
     550         5157 :         });
     551        44421 :     }
     552              : }
     553              : 
     554              : impl LayerInner {
     555        75356 :     fn new(
     556        75356 :         conf: &'static PageServerConf,
     557        75356 :         timeline: &Arc<Timeline>,
     558        75356 :         access_stats: LayerAccessStats,
     559        75356 :         desc: PersistentLayerDesc,
     560        75356 :         downloaded: Option<Arc<DownloadedLayer>>,
     561        75356 :         generation: Generation,
     562        75356 :         shard: ShardIndex,
     563        75356 :     ) -> Self {
                                // Assembles the per-layer state. The local path is the timeline directory
                                // joined with the layer's file name.
     564        75356 :         let path = conf
     565        75356 :             .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
     566        75356 :             .join(desc.filename().to_string());
     567              : 
                                // with `downloaded` the once cell starts out initialized as resident and the
                                // version counter continues from that value's version; without it the cell
                                // starts empty and the version counter starts from zero.
     568        75356 :         let (inner, version) = if let Some(inner) = downloaded {
     569        34457 :             let version = inner.version;
     570        34457 :             let resident = ResidentOrWantedEvicted::Resident(inner);
     571        34457 :             (heavier_once_cell::OnceCell::new(resident), version)
     572              :         } else {
     573        40899 :             (heavier_once_cell::OnceCell::default(), 0)
     574              :         };
     575              : 
     576        75356 :         LayerInner {
     577        75356 :             conf,
     578        75356 :             path,
     579        75356 :             desc,
                                    // only a weak reference to the timeline is retained ...
     580        75356 :             timeline: Arc::downgrade(timeline),
                                    // ... so whether a remote client exists is captured once, here
     581        75356 :             have_remote_client: timeline.remote_client.is_some(),
     582        75356 :             access_stats,
     583        75356 :             wanted_deleted: AtomicBool::new(false),
     584        75356 :             wanted_evicted: AtomicBool::new(false),
     585        75356 :             inner,
     586        75356 :             version: AtomicUsize::new(version),
                                    // keep just the sender half of a capacity-1 broadcast channel; receivers are
                                    // created on demand with `subscribe()`
     587        75356 :             status: tokio::sync::broadcast::channel(1).0,
     588        75356 :             consecutive_failures: AtomicUsize::new(0),
     589        75356 :             generation,
     590        75356 :             shard,
     591        75356 :             last_evicted_at: std::sync::Mutex::default(),
     592        75356 :         }
     593        75356 :     }
     594              : 
     595         5157 :     fn delete_on_drop(&self) {
                                // Flags the layer as wanted-deleted; nothing is removed from disk here.
                                // NOTE(review): the flag is presumably acted upon in `LayerInner::drop`, whose
                                // start is not visible in this section -- confirm there.
                                //
                                // compare_exchange succeeds only for the false -> true transition, so repeated
                                // calls count as a single started delete in the metrics.
     596         5157 :         let res =
     597         5157 :             self.wanted_deleted
     598         5157 :                 .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
     599         5157 : 
     600         5157 :         if res.is_ok() {
     601         5157 :             LAYER_IMPL_METRICS.inc_started_deletes();
     602         5157 :         }
     603         5157 :     }
     604              : 
     605              :     /// Requests eviction of this layer and waits until an outcome is broadcast on `status`.
                            ///
                            /// Returns `Ok(())` once `Status::Evicted` is received, `Err(EvictionError::NotFound)`
                            /// when `inner` is not initialized at the time of the call, and
                            /// `Err(EvictionError::Downloaded)` when `Status::Downloaded` is received instead.
                            ///
                            /// # Panics
                            ///
                            /// Panics if the layer was created without a remote client (`have_remote_client`).
                            ///
                            /// Cancellation safe, however dropping the future and calling this method again might result
     606              :     /// in a new attempt to evict OR join the previously started attempt.
     607         2517 :     pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
     608         2517 :         use tokio::sync::broadcast::error::RecvError;
     609         2517 : 
     610         2517 :         assert!(self.have_remote_client);
     611              : 
                                // subscribe before expressing the wish to evict: a broadcast receiver only sees
                                // values sent after it was created.
     612         2517 :         let mut rx = self.status.subscribe();
     613              : 
     614         2515 :         let strong = {
     615         2517 :             match self.inner.get() {
     616         2515 :                 Some(mut either) => {
     617         2515 :                     self.wanted_evicted.store(true, Ordering::Relaxed);
                                            // NOTE(review): `downgrade` appears to hand back the strong reference
                                            // if one was still held -- confirm in ResidentOrWantedEvicted.
     618         2515 :                     either.downgrade()
     619              :                 }
     620            2 :                 None => return Err(EvictionError::NotFound),
     621              :             }
     622              :         };
     623              : 
     624         2515 :         if strong.is_some() {
     625         2515 :             // drop the DownloadedLayer outside of the scope which was holding the guard
     626         2515 :             drop(strong);
     627         2515 :             LAYER_IMPL_METRICS.inc_started_evictions();
     628         2515 :         }
     629              : 
     630         2515 :         match rx.recv().await {
     631         2515 :             Ok(Status::Evicted) => Ok(()),
     632            0 :             Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
     633              :             Err(RecvError::Closed) => {
     634            0 :                 unreachable!("sender cannot be dropped while we are in &self method")
     635              :             }
     636              :             Err(RecvError::Lagged(_)) => {
     637              :                 // this is quite unlikely, but we are blocking a lot in the async context, so
     638              :                 // we might be missing this because we are stuck on a LIFO slot on a thread
     639              :                 // which is busy blocking for a 1TB database create_image_layers.
     640              :                 //
     641              :                 // use the current state, however late it is compared to when the eviction
     642              :                 // was first requested, as the "outcome" now
     643            0 :                 LAYER_IMPL_METRICS.inc_broadcast_lagged();
     644            0 :                 match self.inner.get() {
     645            0 :                     Some(_) => Err(EvictionError::Downloaded),
     646            0 :                     None => Ok(()),
     647              :                 }
     648              :             }
     649              :         }
     650         2517 :     }
     651              : 
     652              :     /// Returns the resident `DownloadedLayer`, initializing `inner` first when it is empty.
                            ///
                            /// - `allow_download`: when `false`, a download that turns out to be needed is refused
                            ///   with `DownloadError::DownloadRequired`.
                            /// - `ctx`: when `Some`, a needed download is first vetted by `check_expected_download`.
                            ///
                            /// Cancellation safe.
     653     16852759 :     async fn get_or_maybe_download(
     654     16852759 :         self: &Arc<Self>,
     655     16852759 :         allow_download: bool,
     656     16852759 :         ctx: Option<&RequestContext>,
     657     16852759 :     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
     658     16852725 :         let mut init_permit = None;
     659              : 
     660              :         loop {
                                    // initializer for `inner`: takes the init permit and, on success, returns the
                                    // new resident value together with the permit.
     661     16852725 :             let download = move |permit| {
     662        10086 :                 async move {
     663        10086 :                     // disable any scheduled but not yet running eviction deletions for this
                                            // layer by moving on to the next version
     664        10086 :                     let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
     665        10086 : 
     666        10086 :                     // count cancellations, which currently remain largely unexpected
     667        10086 :                     let init_cancelled =
     668        10086 :                         scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
     669        10086 : 
     670        10086 :                     // no need to make the evict_and_wait wait for the actual download to complete
     671        10086 :                     drop(self.status.send(Status::Downloaded));
     672              : 
     673        10086 :                     let timeline = self
     674        10086 :                         .timeline
     675        10086 :                         .upgrade()
     676        10086 :                         .ok_or_else(|| DownloadError::TimelineShutdown)?;
     677              : 
     678              :                     // FIXME: grab a gate
     679              : 
     680        10086 :                     let can_ever_evict = timeline.remote_client.as_ref().is_some();
     681              : 
     682              :                     // check if we really need to be downloaded; could have been already downloaded by a
     683              :                     // cancelled previous attempt.
     684        10086 :                     let needs_download = self
     685        10086 :                         .needs_download()
     686         9679 :                         .await
     687        10086 :                         .map_err(DownloadError::PreStatFailed)?;
     688              : 
     689        10086 :                     let permit = if let Some(reason) = needs_download {
     690        10086 :                         if let NeedsDownload::NotFile(ft) = reason {
     691            0 :                             return Err(DownloadError::NotFile(ft));
     692        10086 :                         }
     693        10086 : 
     694        10086 :                         // only reset this after we've decided we really need to download. otherwise it'd
     695        10086 :                         // be impossible to mark cancelled downloads for eviction, like one could imagine
     696        10086 :                         // we would like to do for prefetching which was not needed.
     697        10086 :                         self.wanted_evicted.store(false, Ordering::Release);
     698        10086 : 
     699        10086 :                         if !can_ever_evict {
     700            0 :                             return Err(DownloadError::NoRemoteStorage);
     701        10086 :                         }
     702              : 
     703        10086 :                         if let Some(ctx) = ctx {
     704         9455 :                             self.check_expected_download(ctx)?;
     705          631 :                         }
     706              : 
     707        10086 :                         if !allow_download {
     708              :                             // this does look weird, but for LayerInner the "downloading" means also changing
     709              :                             // internal once related state ...
     710          616 :                             return Err(DownloadError::DownloadRequired);
     711         9470 :                         }
     712         9470 : 
     713         9470 :                         tracing::info!(%reason, "downloading on-demand");
     714              : 
     715        19751 :                         self.spawn_download_and_wait(timeline, permit).await?
     716              :                     } else {
     717              :                         // the file is present locally, probably by a previous but cancelled call to
     718              :                         // get_or_maybe_download. alternatively we might be running without remote storage.
     719            0 :                         LAYER_IMPL_METRICS.inc_init_needed_no_download();
     720            0 : 
     721            0 :                         permit
     722              :                     };
     723              : 
     724         9456 :                     let since_last_eviction =
     725         9456 :                         self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
     726         9456 :                     if let Some(since_last_eviction) = since_last_eviction {
     727          165 :                         // FIXME: this will not always be recorded correctly until #6028 (the no
     728          165 :                         // download needed branch above)
     729          165 :                         LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
     730         9291 :                     }
     731              : 
     732         9456 :                     let res = Arc::new(DownloadedLayer {
     733         9456 :                         owner: Arc::downgrade(self),
     734         9456 :                         kind: tokio::sync::OnceCell::default(),
     735         9456 :                         version: next_version,
     736         9456 :                     });
     737         9456 : 
     738         9456 :                     self.access_stats.record_residence_event(
     739         9456 :                         LayerResidenceStatus::Resident,
     740         9456 :                         LayerResidenceEventReason::ResidenceChange,
     741         9456 :                     );
     742         9456 : 
     743         9456 :                     let waiters = self.inner.initializer_count();
     744         9456 :                     if waiters > 0 {
     745          325 :                         tracing::info!(
     746          325 :                             waiters,
     747          325 :                             "completing the on-demand download for other tasks"
     748          325 :                         );
     749         9131 :                     }
     750              : 
                                            // defuse the guard: getting this far means the initialization was not
                                            // cancelled, so the cancellation counter must not be bumped.
     751         9456 :                     scopeguard::ScopeGuard::into_inner(init_cancelled);
     752         9456 : 
     753         9456 :                     Ok((ResidentOrWantedEvicted::Resident(res), permit))
     754        10078 :                 }
     755        10086 :                 .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
     756        10086 :             };
     757              : 
     758     16852725 :             if let Some(init_permit) = init_permit.take() {
     759              :                 // use the already held initialization permit because it is impossible to hit the
     760              :                 // below paths anymore essentially limiting the max loop iterations to 2.
     761            0 :                 let (value, init_permit) = download(init_permit).await?;
     762            0 :                 let mut guard = self.inner.set(value, init_permit);
     763            0 :                 let (strong, _upgraded) = guard
     764            0 :                     .get_and_upgrade()
     765            0 :                     .expect("init creates strong reference, we held the init permit");
     766            0 :                 return Ok(strong);
     767     16852725 :             }
     768              : 
     769            0 :             let (weak, permit) = {
     770     16852725 :                 let mut locked = self.inner.get_or_init(download).await?;
     771              : 
     772     16852095 :                 if let Some((strong, upgraded)) = locked.get_and_upgrade() {
     773     16852095 :                     if upgraded {
     774            0 :                         // when upgraded back, the Arc<DownloadedLayer> is still available, but
     775            0 :                         // previously an `evict_and_wait` was received.
     776            0 :                         self.wanted_evicted.store(false, Ordering::Relaxed);
     777            0 : 
     778            0 :                         // error out any `evict_and_wait`
     779            0 :                         drop(self.status.send(Status::Downloaded));
     780            0 :                         LAYER_IMPL_METRICS
     781            0 :                             .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
     782     16852095 :                     }
     783              : 
     784     16852095 :                     return Ok(strong);
     785              :                 } else {
     786              :                     // path to here: the evict_blocking is stuck on spawn_blocking queue.
     787              :                     //
     788              :                     // reset the contents, deactivating the eviction and causing a
     789              :                     // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
     790            0 :                     locked.take_and_deinit()
     791            0 :                 }
     792            0 :             };
     793            0 : 
     794            0 :             // unlock first, then drop the weak, but because upgrade failed, we
     795            0 :             // know it cannot be a problem.
     796            0 : 
     797            0 :             assert!(
     798            0 :                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
     799            0 :                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
     800              :             );
     801              : 
                                    // carry the permit into the next iteration, which re-initializes with it
     802            0 :             init_permit = Some(permit);
     803            0 : 
     804            0 :             LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
     805              :         }
     806     16852717 :     }
     807              : 
     808              :     /// Nag or fail per the `RequestContext` download policy.
                            ///
                            /// `Download` always passes. `Warn` and `Error` both log at info level and bump
                            /// `UNEXPECTED_ONDEMAND_DOWNLOADS`; only `Error` can fail, with
                            /// `DownloadError::ContextAndConfigReallyDeniesDownloads`, and only when the config flag
                            /// `ondemand_download_behavior_treat_error_as_warn` is not set.
     809         9455 :     fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
     810         9455 :         use crate::context::DownloadBehavior::*;
     811         9455 :         let b = ctx.download_behavior();
     812         9455 :         match b {
     813         9455 :             Download => Ok(()),
     814              :             Warn | Error => {
     815            0 :                 tracing::info!(
     816            0 :                     "unexpectedly on-demand downloading for task kind {:?}",
     817            0 :                     ctx.task_kind()
     818            0 :                 );
     819            0 :                 crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
     820              : 
     821            0 :                 let really_error =
     822            0 :                     matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
     823              : 
     824            0 :                 if really_error {
     825              :                     // this check is only probabilistic, seems like a flakiness footgun
     826            0 :                     Err(DownloadError::ContextAndConfigReallyDeniesDownloads)
     827              :                 } else {
     828            0 :                     Ok(())
     829              :                 }
     830              :             }
     831              :         }
     832         9455 :     }
     833              : 
     834              :     /// Actual download, at most one is executed at a time.
                            ///
                            /// Spawns the download as a `RemoteDownloadTask` through `task_mgr` and waits for its
                            /// result on a oneshot channel. The init `permit` travels into the task and comes back
                            /// with the result; it is returned to the caller only on success.
                            ///
                            /// Errors: `DownloadCancelled` when the download reports cancellation or the task goes
                            /// away without replying, `DownloadFailed` for any other download error, and
                            /// `PostStatFailed` when the local file cannot be stat'ed after a successful download.
                            ///
                            /// # Panics
                            ///
                            /// Panics if `needs_download` still reports a reason after a successful download.
     835         9469 :     async fn spawn_download_and_wait(
     836         9469 :         self: &Arc<Self>,
     837         9469 :         timeline: Arc<Timeline>,
     838         9469 :         permit: heavier_once_cell::InitPermit,
     839         9469 :     ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
     840         9469 :         debug_assert_current_span_has_tenant_and_timeline_id();
     841         9469 : 
     842         9469 :         let task_name = format!("download layer {}", self);
     843         9469 : 
     844         9469 :         let (tx, rx) = tokio::sync::oneshot::channel();
     845         9469 : 
     846         9469 :         // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
     847         9469 :         // block tenant::mgr::remove_tenant_from_memory.
     848         9469 : 
     849         9469 :         let this: Arc<Self> = self.clone();
     850         9469 : 
     851         9469 :         crate::task_mgr::spawn(
     852         9469 :             &tokio::runtime::Handle::current(),
     853         9469 :             crate::task_mgr::TaskKind::RemoteDownloadTask,
     854         9469 :             Some(self.desc.tenant_shard_id),
     855         9469 :             Some(self.desc.timeline_id),
     856         9469 :             &task_name,
     857         9469 :             false,
     858         9469 :             async move {
     859         9469 : 
     860         9469 :                 let client = timeline
     861         9469 :                     .remote_client
     862         9469 :                     .as_ref()
     863         9469 :                     .expect("checked above with have_remote_client");
     864              : 
     865         9469 :                 let result = client.download_layer_file(
     866         9469 :                     &this.desc.filename(),
     867         9469 :                     &this.metadata(),
     868         9469 :                     &crate::task_mgr::shutdown_token()
     869         9469 :                 )
     870       438981 :                 .await;
     871              : 
     872         9467 :                 let result = match result {
     873         9457 :                     Ok(size) => {
     874         9457 :                         timeline.metrics.resident_physical_size_add(size);
     875         9457 :                         Ok(())
     876              :                     }
     877           10 :                     Err(e) => {
                                                // fetch_add returns the count before this failure
     878           10 :                         let consecutive_failures =
     879           10 :                             this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
     880           10 : 
                                                // clamp to u32::MAX before narrowing so that the cast cannot truncate
     881           10 :                         let backoff = utils::backoff::exponential_backoff_duration_seconds(
     882           10 :                             consecutive_failures.min(u32::MAX as usize) as u32,
     883           10 :                             1.5,
     884           10 :                             60.0,
     885           10 :                         );
     886           10 : 
     887           10 :                         let backoff = std::time::Duration::from_secs_f64(backoff);
     888           10 : 
                                                // delay reporting the failure by the backoff, unless a shutdown or the
                                                // timeline's cancellation cuts the wait short
     889           16 :                         tokio::select! {
     890           16 :                             _ = tokio::time::sleep(backoff) => {},
     891           16 :                             _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
     892           16 :                             _ = timeline.cancel.cancelled() => {},
     893           16 :                         };
     894              : 
     895           10 :                         Err(e)
     896              :                     }
     897              :                 };
     898              : 
                                        // a failed send means the receiving side was dropped, i.e. the caller of
                                        // spawn_download_and_wait stopped waiting; the unsent value is handed back.
     899         9467 :                 if let Err(res) = tx.send((result, permit)) {
     900            5 :                     match res {
     901            1 :                         (Ok(()), _) => {
     902            1 :                             // our caller is cancellation safe so this is fine; if someone
     903            1 :                             // else requests the layer, they'll find it already downloaded.
     904            1 :                             //
     905            1 :                             // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
     906            1 :                             //
     907            1 :                             // FIXME(#6028): however, could be that we should consider marking the
     908            1 :                             // layer for eviction? alas, cannot: because only DownloadedLayer will
     909            1 :                             // handle that.
     910            1 :                         },
     911            4 :                         (Err(e), _) => {
     912            4 :                             // our caller is cancellation safe, but we might be racing with
     913            4 :                             // another attempt to initialize. before we have cancellation
     914            4 :                             // token support: these attempts should converge regardless of
     915            4 :                             // their completion order.
     916            4 :                             tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}");
     917            4 :                             LAYER_IMPL_METRICS.inc_download_failed_without_requester();
     918              :                         }
     919              :                     }
     920         9462 :                 }
     921              : 
     922         9467 :                 Ok(())
     923         9469 :             }
     924         9469 :             .in_current_span(),
     925         9469 :         );
     926        10760 :         match rx.await {
     927         9456 :             Ok((Ok(()), permit)) => {
                                        // verify the post-condition: the freshly downloaded file must now pass the
                                        // same check which requested the download.
     928         9456 :                 if let Some(reason) = self
     929         9456 :                     .needs_download()
     930         8991 :                     .await
     931         9456 :                     .map_err(DownloadError::PostStatFailed)?
     932              :                 {
     933              :                     // this is really a bug in needs_download or remote timeline client
     934            0 :                     panic!("post-condition failed: needs_download returned {reason:?}");
     935         9456 :                 }
     936         9456 : 
     937         9456 :                 self.consecutive_failures.store(0, Ordering::Relaxed);
     938         9456 :                 tracing::info!("on-demand download successful");
     939              : 
     940         9456 :                 Ok(permit)
     941              :             }
     942            6 :             Ok((Err(e), _permit)) => {
     943            6 :                 // sleep already happened in the spawned task, if it was not cancelled
     944            6 :                 let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
     945            6 : 
     946            6 :                 match e.downcast_ref::<remote_storage::DownloadError>() {
     947              :                     // If the download failed due to its cancellation token,
     948              :                     // propagate the cancellation error upstream.
     949              :                     Some(remote_storage::DownloadError::Cancelled) => {
     950            0 :                         Err(DownloadError::DownloadCancelled)
     951              :                     }
     952              :                     _ => {
     953            6 :                         tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
     954            6 :                         Err(DownloadError::DownloadFailed)
     955              :                     }
     956              :                 }
     957              :             }
                                    // the sender was dropped without a value ever being sent
     958            0 :             Err(_gone) => Err(DownloadError::DownloadCancelled),
     959              :         }
     960         9462 :     }
     961              : 
     962        19542 :     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
                                // Async stat of the local layer file. `Ok(None)` means the file passes
                                // `is_file_present_and_good_size`; `Ok(Some(reason))` says why a download is
                                // needed. A missing file is a reason, any other I/O error is returned as-is.
     963        19542 :         match tokio::fs::metadata(&self.path).await {
     964         9456 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     965        10086 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
     966            0 :             Err(e) => Err(e),
     967              :         }
     968        19542 :     }
     969              : 
     970        53052 :     fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
                                // Same outcome mapping as `needs_download`, but stats the file with a
                                // synchronous call instead of `tokio::fs`.
     971        53052 :         match self.path.metadata() {
     972        12153 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     973        40899 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
     974            0 :             Err(e) => Err(e),
     975              :         }
     976        53052 :     }
     977              : 
     978        21609 :     fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
                                // Validates already fetched metadata `m`: it must describe a regular file whose
                                // length equals `desc.file_size`. The `Err` value names the reason it is unusable.
                                //
     979        21609 :         // in future, this should include sha2-256 validation of the file.
     980        21609 :         if !m.is_file() {
     981            0 :             Err(NeedsDownload::NotFile(m.file_type()))
     982        21609 :         } else if m.len() != self.desc.file_size {
     983            0 :             Err(NeedsDownload::WrongSize {
     984            0 :                 actual: m.len(),
     985            0 :                 expected: self.desc.file_size,
     986            0 :             })
     987              :         } else {
     988        21609 :             Ok(())
     989              :         }
     990        21609 :     }
     991              : 
     992         3050 :     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
                                // Builds the API model describing this layer: a `Delta` entry carrying the LSN
                                // range when `desc.is_delta`, otherwise an `Image` entry carrying a single LSN.
                                // `reset` is forwarded to the access stats conversion.
     993         3050 :         let layer_file_name = self.desc.filename().file_name();
     994         3050 : 
     995         3050 :         // reported as remote whenever `inner` is uninitialized.
                                // this is not accurate: we could have the file locally but there was a cancellation
     996         3050 :         // and now we are not in sync, or we are currently downloading it.
     997         3050 :         let remote = self.inner.get().is_none();
     998         3050 : 
     999         3050 :         let access_stats = self.access_stats.as_api_model(reset);
    1000         3050 : 
    1001         3050 :         if self.desc.is_delta {
    1002         2467 :             let lsn_range = &self.desc.lsn_range;
    1003         2467 : 
    1004         2467 :             HistoricLayerInfo::Delta {
    1005         2467 :                 layer_file_name,
    1006         2467 :                 layer_file_size: self.desc.file_size,
    1007         2467 :                 lsn_start: lsn_range.start,
    1008         2467 :                 lsn_end: lsn_range.end,
    1009         2467 :                 remote,
    1010         2467 :                 access_stats,
    1011         2467 :             }
    1012              :         } else {
    1013          583 :             let lsn = self.desc.image_layer_lsn();
    1014          583 : 
    1015          583 :             HistoricLayerInfo::Image {
    1016          583 :                 layer_file_name,
    1017          583 :                 layer_file_size: self.desc.file_size,
    1018          583 :                 lsn_start: lsn,
    1019          583 :                 remote,
    1020          583 :                 access_stats,
    1021          583 :             }
    1022              :         }
    1023         3050 :     }
    1024              : 
    1025              :     /// `DownloadedLayer` is being dropped, so it calls this method.
                            ///
                            /// Reads the `wanted_deleted` and `wanted_evicted` flags. A wanted-deleted layer is left
                            /// alone here (see the comment in the first branch). A wanted-evicted layer, when
                            /// `have_remote_client` is set, gets an eviction job queued on `BACKGROUND_RUNTIME` via
                            /// `spawn_blocking`. `version` is the dropped `DownloadedLayer`'s version and is passed
                            /// on to `evict_blocking`.
    1026         2515 :     fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
    1027         2515 :         let delete = self.wanted_deleted.load(Ordering::Acquire);
    1028         2515 :         let evict = self.wanted_evicted.load(Ordering::Acquire);
    1029         2515 :         let can_evict = self.have_remote_client;
    1030         2515 : 
    1031         2515 :         if delete {
    1032            0 :             // do nothing now, only in LayerInner::drop -- this was originally implemented because
    1033            0 :             // we could have already scheduled the deletion at the time.
    1034            0 :             //
    1035            0 :             // FIXME: this is not true anymore, we can safely evict wanted deleted files.
    1036         2515 :         } else if can_evict && evict {
    1037         2515 :             let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
    1038              : 
    1039              :             // downgrade for queueing, in case there's a tear down already ongoing we should not
    1040              :             // hold it alive.
    1041         2515 :             let this = Arc::downgrade(&self);
    1042         2515 :             drop(self);
    1043         2515 : 
    1044         2515 :             // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
    1045         2515 :             // drop while the `self.inner` is being locked, leading to a deadlock.
    1046         2515 : 
    1047         2515 :             crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
    1048         2515 :                 let _g = span.entered();
    1049              : 
    1050              :                 // if LayerInner is already dropped here, do nothing because the delete on drop
    1051              :                 // has already run while we were in queue
    1052         2515 :                 let Some(this) = this.upgrade() else {
    1053            0 :                     LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
    1054            0 :                     return;
    1055              :                 };
                                        // every outcome of the queued job ends up in exactly one counter
    1056         2515 :                 match this.evict_blocking(version) {
    1057         2515 :                     Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
    1058            0 :                     Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
    1059              :                 }
    1060         2515 :             });
    1061            0 :         }
    1062         2515 :     }
    1063              : 
    1064         2515 :     fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
                                // Removes the local layer file, but only if `inner` currently holds
                                // `WantedEvicted` tagged with `only_version`. Every other state returns the
                                // matching `EvictionCancelled` reason before the file is touched.
                                //
                                // Once the removal is attempted, the residence event, the `Status::Evicted`
                                // notification and `last_evicted_at` are all updated regardless of whether the
                                // removal itself succeeded; only the returned `Result` and the timeline metrics
                                // differ.
    1065              :         // deleted or detached timeline, don't do anything.
    1066         2515 :         let Some(timeline) = self.timeline.upgrade() else {
    1067            0 :             return Err(EvictionCancelled::TimelineGone);
    1068              :         };
    1069              : 
    1070              :         // to avoid starting a new download while we evict, keep holding on to the
    1071              :         // permit.
    1072         2515 :         let _permit = {
    1073         2515 :             let maybe_downloaded = self.inner.get();
    1074              : 
    1075         2515 :             let (_weak, permit) = match maybe_downloaded {
    1076         2515 :                 Some(mut guard) => {
    1077         2515 :                     if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
    1078         2515 :                         if *version == only_version {
    1079         2515 :                             guard.take_and_deinit()
    1080              :                         } else {
    1081              :                             // this was not for us; maybe there's another eviction job
    1082              :                             // TODO: does it make any sense to stall here? unique versions do not
    1083              :                             // matter, we only want to make sure not to evict a resident, which we
    1084              :                             // are not doing.
    1085            0 :                             return Err(EvictionCancelled::VersionCheckFailed);
    1086              :                         }
    1087              :                     } else {
    1088            0 :                         return Err(EvictionCancelled::AlreadyReinitialized);
    1089              :                     }
    1090              :                 }
    1091              :                 None => {
    1092              :                     // already deinitialized, perhaps get_or_maybe_download did this and is
    1093              :                     // currently waiting to reinitialize it
    1094            0 :                     return Err(EvictionCancelled::LostToDownload);
    1095              :                 }
    1096              :             };
    1097              : 
    1098         2515 :             permit
    1099         2515 :         };
    1100         2515 : 
    1101         2515 :         // now accesses to inner.get_or_init wait on the semaphore or the `_permit`
    1102         2515 : 
                                // NOTE(review): the residence event is recorded before the removal is attempted,
                                // so it is also recorded when the removal below fails.
    1103         2515 :         self.access_stats.record_residence_event(
    1104         2515 :             LayerResidenceStatus::Evicted,
    1105         2515 :             LayerResidenceEventReason::ResidenceChange,
    1106         2515 :         );
    1107              : 
    1108         2515 :         let res = match capture_mtime_and_remove(&self.path) {
    1109         2515 :             Ok(local_layer_mtime) => {
                                        // `duration_since` fails when the mtime is later than the current time; in
                                        // that case the residence duration is not observed, only logged as unknown.
    1110         2515 :                 let duration = SystemTime::now().duration_since(local_layer_mtime);
    1111         2515 :                 match duration {
    1112         2515 :                     Ok(elapsed) => {
    1113         2515 :                         timeline
    1114         2515 :                             .metrics
    1115         2515 :                             .evictions_with_low_residence_duration
    1116         2515 :                             .read()
    1117         2515 :                             .unwrap()
    1118         2515 :                             .observe(elapsed);
    1119         2515 :                         tracing::info!(
    1120         2515 :                             residence_millis = elapsed.as_millis(),
    1121         2515 :                             "evicted layer after known residence period"
    1122         2515 :                         );
    1123              :                     }
    1124              :                     Err(_) => {
    1125            0 :                         tracing::info!("evicted layer after unknown residence period");
    1126              :                     }
    1127              :                 }
    1128         2515 :                 timeline.metrics.evictions.inc();
    1129         2515 :                 timeline
    1130         2515 :                     .metrics
    1131         2515 :                     .resident_physical_size_sub(self.desc.file_size);
    1132         2515 : 
    1133         2515 :                 Ok(())
    1134              :             }
    1135            0 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
    1136            0 :                 tracing::error!(
    1137            0 :                     layer_size = %self.desc.file_size,
    1138            0 :                     "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
    1139            0 :                 );
    1140            0 :                 Err(EvictionCancelled::FileNotFound)
    1141              :             }
    1142            0 :             Err(e) => {
    1143            0 :                 tracing::error!("failed to evict file from disk: {e:#}");
    1144            0 :                 Err(EvictionCancelled::RemoveFailed)
    1145              :             }
    1146              :         };
    1147              : 
    1148              :         // we are still holding the permit, so no new spawn_download_and_wait can happen
                                // the result of `send` is discarded on purpose by the explicit `drop`
    1149         2515 :         drop(self.status.send(Status::Evicted));
    1150         2515 : 
    1151         2515 :         *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
    1152         2515 : 
    1153         2515 :         res
    1154         2515 :     }
    1155              : 
    1156        39475 :     fn metadata(&self) -> LayerFileMetadata {
                                // Assembles a fresh `LayerFileMetadata` from the descriptor's file size and this
                                // layer's `generation` and `shard`.
    1157        39475 :         LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
    1158        39475 :     }
    1159              : }
    1160              : 
    1161         2515 : fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
                            // Reads the modification time of `path`, then removes the file and returns that
                            // mtime. Each step propagates its I/O error with `?`, so if the metadata or mtime
                            // cannot be read the file is not removed.
    1162         2515 :     let m = path.metadata()?;
    1163         2515 :     let local_layer_mtime = m.modified()?;
    1164         2515 :     std::fs::remove_file(path)?;
    1165         2515 :     Ok(local_layer_mtime)
    1166         2515 : }
    1167              : 
    1168            0 : #[derive(Debug, thiserror::Error)]
                        /// Reasons an eviction request did not evict the layer; the `#[error]` strings are the
                        /// `Display` output.
    1169              : pub(crate) enum EvictionError {
    1170              :     #[error("layer was already evicted")]
    1171              :     NotFound,
    1172              : 
    1173              :     /// Evictions must always lose to downloads in races, and this time it happened.
    1174              :     #[error("layer was downloaded instead")]
    1175              :     Downloaded,
    1176              : }
    1177              : 
    1178              : /// Error internal to the [`LayerInner::get_or_maybe_download`]
    1179           13 : #[derive(Debug, thiserror::Error)]
    1180              : enum DownloadError {
    1181              :     #[error("timeline has already shutdown")]
    1182              :     TimelineShutdown,
    1183              :     #[error("no remote storage configured")]
    1184              :     NoRemoteStorage,
    1185              :     #[error("context denies downloading")]
    1186              :     ContextAndConfigReallyDeniesDownloads,
    1187              :     #[error("downloading is really required but not allowed by this method")]
    1188              :     DownloadRequired,
    1189              :     #[error("layer path exists, but it is not a file: {0:?}")]
    1190              :     NotFile(std::fs::FileType),
    1191              :     /// Why no error here? Because it will be reported by page_service. We should have also done
    1192              :     /// retries already.
    1193              :     #[error("downloading evicted layer file failed")]
    1194              :     DownloadFailed,
    1195              :     #[error("downloading failed, possibly for shutdown")]
    1196              :     DownloadCancelled,
                            // the two stat failures keep the underlying `std::io::Error` as their `#[source]`
    1197              :     #[error("pre-condition: stat before download failed")]
    1198              :     PreStatFailed(#[source] std::io::Error),
    1199              :     #[error("post-condition: stat after download failed")]
    1200              :     PostStatFailed(#[source] std::io::Error),
    1201              : }
    1202              : 
    1203            0 : #[derive(Debug, PartialEq)]
                        /// Why the local layer file cannot be used as it is. `NotFile` and `WrongSize` are
                        /// produced by `is_file_present_and_good_size`.
    1204              : pub(crate) enum NeedsDownload {
    1205              :     NotFound,
                            /// The path exists but its metadata does not describe a regular file.
    1206              :     NotFile(std::fs::FileType),
                            /// `actual` is the length found on disk, `expected` the size in the layer descriptor.
    1207              :     WrongSize { actual: u64, expected: u64 },
    1208              : }
    1209              : 
    1210              : impl std::fmt::Display for NeedsDownload {
                            /// Writes a short human-readable reason; `NotFile` includes the file type in `Debug`
                            /// form and `WrongSize` prints the actual size before the expected one.
    1211         9470 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1212         9470 :         match self {
    1213         9470 :             NeedsDownload::NotFound => write!(f, "file was not found"),
    1214            0 :             NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
    1215            0 :             NeedsDownload::WrongSize { actual, expected } => {
    1216            0 :                 write!(f, "file size mismatch {actual} vs. {expected}")
    1217              :             }
    1218              :         }
    1219         9470 :     }
    1220              : }
    1221              : 
    1222              : /// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
    1223              : pub(crate) struct DownloadedLayer {
                            // Weak back-reference to the layer; upgraded in `Drop` to call
                            // `on_downloaded_layer_drop`, and compared by pointer in `get`.
    1224              :     owner: Weak<LayerInner>,
    1225              :     // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
    1226              :     // DownloadedLayer
    1227              :     kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
                            // Passed to `on_downloaded_layer_drop` when this value is dropped.
    1228              :     version: usize,
    1229              : }
    1230              : 
    1231              : impl std::fmt::Debug for DownloadedLayer {
                            /// Formats as a `DownloadedLayer` struct showing only `kind` and `version`.
    1232            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1233            0 :         f.debug_struct("DownloadedLayer")
    1234            0 :             // owner omitted because it is always "Weak"
    1235            0 :             .field("kind", &self.kind)
    1236            0 :             .field("version", &self.version)
    1237            0 :             .finish()
    1238            0 :     }
    1239              : }
    1240              : 
    1241              : impl Drop for DownloadedLayer {
                            /// Notifies the owning `LayerInner`, if it is still alive, that this downloaded
                            /// instance (identified by `version`) is going away.
    1242        24222 :     fn drop(&mut self) {
    1243        24222 :         if let Some(owner) = self.owner.upgrade() {
    1244         2515 :             owner.on_downloaded_layer_drop(self.version);
    1245        21707 :         } else {
    1246        21707 :             // no need to do anything, we are shutting down
    1247        21707 :         }
    1248        24222 :     }
    1249              : }
    1250              : 
    1251              : impl DownloadedLayer {
    1252              :     /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
    1253              :     /// initialize it permanently.
    1254              :     ///
    1255              :     /// `owner` parameter is a strong reference at the same `LayerInner` as the
    1256              :     /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
    1257              :     /// we will always have the LayerInner on the callstack, so we can just use it.
                            ///
                            /// Error handling, as implemented below: the loaders return a nested result whose outer
                            /// `Err` is treated as permanent and whose inner `Err` is treated as transient. A
                            /// transient error is returned from the initializer as `Err` and propagated to the
                            /// caller with `?`. A permanent error is counted, logged, and stored in `kind` as
                            /// `Ok(Err(..))`; callers then receive a newly formatted `anyhow` error built from it.
                            ///
                            /// The initializer asserts that `owner` and `self.owner` point at the same `LayerInner`.
    1258     16847898 :     async fn get<'a>(
    1259     16847898 :         &'a self,
    1260     16847898 :         owner: &Arc<LayerInner>,
    1261     16847898 :         ctx: &RequestContext,
    1262     16847898 :     ) -> anyhow::Result<&'a LayerKind> {
    1263     16847864 :         let init = || async {
    1264        33380 :             assert_eq!(
    1265        33380 :                 Weak::as_ptr(&self.owner),
    1266        33380 :                 Arc::as_ptr(owner),
    1267            0 :                 "these are the same, just avoiding the upgrade"
    1268              :             );
    1269              : 
                                    // both loaders are given the summary expected for this layer's descriptor
    1270        33380 :             let res = if owner.desc.is_delta {
    1271        12086 :                 let summary = Some(delta_layer::Summary::expected(
    1272        12086 :                     owner.desc.tenant_shard_id.tenant_id,
    1273        12086 :                     owner.desc.timeline_id,
    1274        12086 :                     owner.desc.key_range.clone(),
    1275        12086 :                     owner.desc.lsn_range.clone(),
    1276        12086 :                 ));
    1277        12086 :                 delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
    1278          454 :                     .await
    1279        12086 :                     .map(|res| res.map(LayerKind::Delta))
    1280              :             } else {
    1281        21294 :                 let lsn = owner.desc.image_layer_lsn();
    1282        21294 :                 let summary = Some(image_layer::Summary::expected(
    1283        21294 :                     owner.desc.tenant_shard_id.tenant_id,
    1284        21294 :                     owner.desc.timeline_id,
    1285        21294 :                     owner.desc.key_range.clone(),
    1286        21294 :                     lsn,
    1287        21294 :                 ));
    1288        21294 :                 image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
    1289          244 :                     .await
    1290        21294 :                     .map(|res| res.map(LayerKind::Image))
    1291              :             };
    1292              : 
                                    // the nesting is flipped here: for the cell, `Err` means "not initialized"
                                    // (transient) and `Ok(Err(..))` is the stored permanent failure
    1293        33379 :             match res {
    1294        33379 :                 Ok(Ok(layer)) => Ok(Ok(layer)),
    1295            0 :                 Ok(Err(transient)) => Err(transient),
    1296            1 :                 Err(permanent) => {
    1297            1 :                     LAYER_IMPL_METRICS.inc_permanent_loading_failures();
    1298            1 :                     // TODO(#5815): we are not logging all errors, so temporarily log them **once**
    1299            1 :                     // here as well
    1300            1 :                     let permanent = permanent.context("load layer");
    1301            1 :                     tracing::error!("layer loading failed permanently: {permanent:#}");
    1302            1 :                     Ok(Err(permanent))
    1303              :                 }
    1304              :             }
    1305        66760 :         };
    1306     16847864 :         self.kind
    1307     16847864 :             .get_or_try_init(init)
    1308              :             // return transient errors using `?`
    1309         1488 :             .await?
    1310     16847864 :             .as_ref()
    1311     16847864 :             .map_err(|e| {
    1312           20 :                 // errors are not clonable, so all we can do is stringify them
    1313           20 :                 // test_broken_timeline matches this string
    1314           20 :                 anyhow::anyhow!("layer loading failed: {e:#}")
    1315     16847864 :             })
    1316     16847864 :     }
    1317              : 
                            /// Resolves the loaded layer through [`Self::get`] and forwards the lookup to the delta
                            /// or image implementation. `lsn_range` is only passed on to delta layers.
    1318     16843790 :     async fn get_value_reconstruct_data(
    1319     16843790 :         &self,
    1320     16843790 :         key: Key,
    1321     16843790 :         lsn_range: Range<Lsn>,
    1322     16843790 :         reconstruct_data: &mut ValueReconstructState,
    1323     16843790 :         owner: &Arc<LayerInner>,
    1324     16843790 :         ctx: &RequestContext,
    1325     16843790 :     ) -> anyhow::Result<ValueReconstructResult> {
    1326     16843756 :         use LayerKind::*;
    1327     16843756 : 
    1328     16843756 :         match self.get(owner, ctx).await? {
    1329     16415713 :             Delta(d) => {
    1330     16415713 :                 d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
    1331       977966 :                     .await
    1332              :             }
    1333       428023 :             Image(i) => {
    1334       428023 :                 i.get_value_reconstruct_data(key, reconstruct_data, ctx)
    1335        13268 :                     .await
    1336              :             }
    1337              :         }
    1338     16843752 :     }
    1339              : 
                            /// Resolves the loaded layer through [`Self::get`] and calls its `dump`, propagating
                            /// any error from either step.
    1340            4 :     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
    1341            4 :         use LayerKind::*;
    1342            4 :         match self.get(owner, ctx).await? {
    1343            4 :             Delta(d) => d.dump(ctx).await?,
    1344            0 :             Image(i) => i.dump(ctx).await?,
    1345              :         }
    1346              : 
    1347            4 :         Ok(())
    1348            4 :     }
    1349              : }
    1350              : 
    1351              : /// Wrapper around an actual layer implementation.
                        ///
                        /// `DownloadedLayer::get` picks the variant from the layer descriptor's `is_delta` flag.
    1352            0 : #[derive(Debug)]
    1353              : enum LayerKind {
    1354              :     Delta(delta_layer::DeltaLayerInner),
    1355              :     Image(image_layer::ImageLayerInner),
    1356              : }
    1357              : 
    1358              : /// Guard for forcing a layer be resident while it exists.
    1359        26586 : #[derive(Clone)]
    1360              : pub(crate) struct ResidentLayer {
                            // The layer this guard belongs to; returned by `drop_eviction_guard` and `as_ref`.
    1361              :     owner: Layer,
                            // Strong reference to the downloaded state. `DownloadedLayer::drop`, which is what
                            // hands the layer over to eviction handling, cannot run while this is held.
    1362              :     downloaded: Arc<DownloadedLayer>,
    1363              : }
    1364              : 
    1365              : impl std::fmt::Display for ResidentLayer {
                            /// Delegates to the `Display` output of the owning [`Layer`].
    1366        34268 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1367        34268 :         write!(f, "{}", self.owner)
    1368        34268 :     }
    1369              : }
    1370              : 
    1371              : impl std::fmt::Debug for ResidentLayer {
                            /// Same output as `Display`: the owning [`Layer`] formatted with `{}`.
    1372            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1373            0 :         write!(f, "{}", self.owner)
    1374            0 :     }
    1375              : }
    1376              : 
    1377              : impl ResidentLayer {
    1378              :     /// Release the eviction guard, converting back into a plain [`Layer`].
    1379              :     ///
    1380              :     /// You can access the [`Layer`] also by using `as_ref`.
    1381        20216 :     pub(crate) fn drop_eviction_guard(self) -> Layer {
                                // goes through `From<ResidentLayer> for Layer`, which keeps only `owner`
    1382        20216 :         self.into()
    1383        20216 :     }
    1384              : 
    1385              :     /// Loads all keys stored in the layer. Returns key, lsn and value size.
                            ///
                            /// Only delta layers are supported: for an image layer this returns an error. For a
                            /// delta layer a `LayerAccessKind::KeyIter` access is recorded before the keys are read.
    1386         8208 :     #[tracing::instrument(skip_all, fields(layer=%self))]
    1387              :     pub(crate) async fn load_keys<'a>(
    1388              :         &'a self,
    1389              :         ctx: &RequestContext,
    1390              :     ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
    1391              :         use LayerKind::*;
    1392              : 
    1393              :         let owner = &self.owner.0;
    1394              : 
    1395              :         match self.downloaded.get(owner, ctx).await? {
    1396              :             Delta(ref d) => {
    1397              :                 owner
    1398              :                     .access_stats
    1399              :                     .record_access(LayerAccessKind::KeyIter, ctx);
    1400              : 
    1401              :                 // this is valid because the DownloadedLayer::kind is a OnceCell, not a
    1402              :                 // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
    1403              :                 // while it's being held.
    1404              :                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
    1405              :                     .await
    1406              :                     .context("Layer index is corrupted")
    1407              :             }
    1408              :             Image(_) => anyhow::bail!("cannot load_keys on a image layer"),
    1409              :         }
    1410              :     }
    1411              : 
                            /// Path of the layer file, borrowed from the owning `LayerInner`.
    1412        45907 :     pub(crate) fn local_path(&self) -> &Utf8Path {
    1413        45907 :         &self.owner.0.path
    1414        45907 :     }
    1415              : 
                            /// Returns the result of calling `metadata()` on the owning [`Layer`].
    1416        22334 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
    1417        22334 :         self.owner.metadata()
    1418        22334 :     }
    1419              : }
    1420              : 
    1421              : impl AsLayerDesc for ResidentLayer {
                            /// Returns the descriptor of the owning [`Layer`].
    1422        77442 :     fn layer_desc(&self) -> &PersistentLayerDesc {
    1423        77442 :         self.owner.layer_desc()
    1424        77442 :     }
    1425              : }
    1426              : 
    1427              : impl AsRef<Layer> for ResidentLayer {
                            /// Borrows the owning [`Layer`] without giving up the guard.
    1428        33052 :     fn as_ref(&self) -> &Layer {
    1429        33052 :         &self.owner
    1430        33052 :     }
    1431              : }
    1432              : 
    1433              : /// Drop the eviction guard.
                        ///
                        /// Only `owner` is moved out; the `downloaded` reference held by the guard is dropped.
    1434              : impl From<ResidentLayer> for Layer {
    1435        20216 :     fn from(value: ResidentLayer) -> Self {
    1436        20216 :         value.owner
    1437        20216 :     }
    1438              : }
    1439              : 
    1440              : use metrics::IntCounter;
    1441              : 
    1442              : pub(crate) struct LayerImplMetrics {
                            // Counters and one histogram for the Layer implementation; all of them are registered
                            // in the `Default` impl.
                            // Evictions started / completed in the Layer implementation.
    1443              :     started_evictions: IntCounter,
    1444              :     completed_evictions: IntCounter,
                            // One counter per `EvictionCancelled` reason.
    1445              :     cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
    1446              : 
                            // Deletions on drop: pending / completed.
    1447              :     started_deletes: IntCounter,
    1448              :     completed_deletes: IntCounter,
                            // One counter per `DeleteFailed` reason.
    1449              :     failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
    1450              : 
                            // One counter per `RareEvent`: unexpected or assumed-rare events.
    1451              :     rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
                            // Times Layer initialization was cancelled.
    1452              :     inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
                            // Time between evicting and re-downloading.
    1453              :     redownload_after: metrics::Histogram,
    1454              : }
    1455              : 
    1456              : impl Default for LayerImplMetrics {
                            /// Registers every metric of [`LayerImplMetrics`].
                            ///
                            /// Each registration result is `unwrap`ped, so a failed registration panics. For the
                            /// labelled counter vectors, one counter per enum variant is created up front, using the
                            /// variant's `as_str()` as the label value.
    1457          631 :     fn default() -> Self {
    1458          631 :         use enum_map::Enum;
    1459          631 : 
    1460          631 :         // reminder: these will be pageserver_layer_* with "_total" suffix
    1461          631 : 
    1462          631 :         let started_evictions = metrics::register_int_counter!(
    1463          631 :             "pageserver_layer_started_evictions",
    1464          631 :             "Evictions started in the Layer implementation"
    1465          631 :         )
    1466          631 :         .unwrap();
    1467          631 :         let completed_evictions = metrics::register_int_counter!(
    1468          631 :             "pageserver_layer_completed_evictions",
    1469          631 :             "Evictions completed in the Layer implementation"
    1470          631 :         )
    1471          631 :         .unwrap();
    1472          631 : 
    1473          631 :         let cancelled_evictions = metrics::register_int_counter_vec!(
    1474          631 :             "pageserver_layer_cancelled_evictions_count",
    1475          631 :             "Different reasons for evictions to have been cancelled or failed",
    1476          631 :             &["reason"]
    1477          631 :         )
    1478          631 :         .unwrap();
    1479          631 : 
                                // shadows the vec with a map holding one "reason"-labelled counter per variant
    1480         5048 :         let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1481         5048 :             let reason = EvictionCancelled::from_usize(i);
    1482         5048 :             let s = reason.as_str();
    1483         5048 :             cancelled_evictions.with_label_values(&[s])
    1484         5048 :         }));
    1485          631 : 
    1486          631 :         let started_deletes = metrics::register_int_counter!(
    1487          631 :             "pageserver_layer_started_deletes",
    1488          631 :             "Deletions on drop pending in the Layer implementation"
    1489          631 :         )
    1490          631 :         .unwrap();
    1491          631 :         let completed_deletes = metrics::register_int_counter!(
    1492          631 :             "pageserver_layer_completed_deletes",
    1493          631 :             "Deletions on drop completed in the Layer implementation"
    1494          631 :         )
    1495          631 :         .unwrap();
    1496          631 : 
    1497          631 :         let failed_deletes = metrics::register_int_counter_vec!(
    1498          631 :             "pageserver_layer_failed_deletes_count",
    1499          631 :             "Different reasons for deletions on drop to have failed",
    1500          631 :             &["reason"]
    1501          631 :         )
    1502          631 :         .unwrap();
    1503          631 : 
                                // same per-variant pattern as above, keyed by `DeleteFailed`
    1504         1262 :         let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1505         1262 :             let reason = DeleteFailed::from_usize(i);
    1506         1262 :             let s = reason.as_str();
    1507         1262 :             failed_deletes.with_label_values(&[s])
    1508         1262 :         }));
    1509          631 : 
    1510          631 :         let rare_counters = metrics::register_int_counter_vec!(
    1511          631 :             "pageserver_layer_assumed_rare_count",
    1512          631 :             "Times unexpected or assumed rare event happened",
    1513          631 :             &["event"]
    1514          631 :         )
    1515          631 :         .unwrap();
    1516          631 : 
                                // same per-variant pattern, keyed by `RareEvent` under the "event" label
    1517         4417 :         let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1518         4417 :             let event = RareEvent::from_usize(i);
    1519         4417 :             let s = event.as_str();
    1520         4417 :             rare_counters.with_label_values(&[s])
    1521         4417 :         }));
    1522          631 : 
    1523          631 :         let inits_cancelled = metrics::register_int_counter!(
    1524          631 :             "pageserver_layer_inits_cancelled_count",
    1525          631 :             "Times Layer initialization was cancelled",
    1526          631 :         )
    1527          631 :         .unwrap();
    1528          631 : 
                                // NOTE(review): bucket bounds are presumably seconds (`minute` is 60.0) -- confirm
                                // against the place where this histogram is observed.
    1529          631 :         let redownload_after = {
    1530          631 :             let minute = 60.0;
    1531          631 :             let hour = 60.0 * minute;
    1532          631 :             metrics::register_histogram!(
    1533          631 :                 "pageserver_layer_redownloaded_after",
    1534          631 :                 "Time between evicting and re-downloading.",
    1535          631 :                 vec![
    1536          631 :                     10.0,
    1537          631 :                     30.0,
    1538          631 :                     minute,
    1539          631 :                     5.0 * minute,
    1540          631 :                     15.0 * minute,
    1541          631 :                     30.0 * minute,
    1542          631 :                     hour,
    1543          631 :                     12.0 * hour,
    1544          631 :                 ]
    1545          631 :             )
    1546          631 :             .unwrap()
    1547          631 :         };
    1548          631 : 
    1549          631 :         Self {
    1550          631 :             started_evictions,
    1551          631 :             completed_evictions,
    1552          631 :             cancelled_evictions,
    1553          631 : 
    1554          631 :             started_deletes,
    1555          631 :             completed_deletes,
    1556          631 :             failed_deletes,
    1557          631 : 
    1558          631 :             rare_counters,
    1559          631 :             inits_cancelled,
    1560          631 :             redownload_after,
    1561          631 :         }
    1562          631 :     }
    1563              : }
    1564              : 
    1565              : impl LayerImplMetrics {
    1566         2515 :     fn inc_started_evictions(&self) {
    1567         2515 :         self.started_evictions.inc();
    1568         2515 :     }
    1569         2515 :     fn inc_completed_evictions(&self) {
    1570         2515 :         self.completed_evictions.inc();
    1571         2515 :     }
    1572            0 :     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
    1573            0 :         self.cancelled_evictions[reason].inc()
    1574            0 :     }
    1575              : 
    1576         5157 :     fn inc_started_deletes(&self) {
    1577         5157 :         self.started_deletes.inc();
    1578         5157 :     }
    1579         5143 :     fn inc_completed_deletes(&self) {
    1580         5143 :         self.completed_deletes.inc();
    1581         5143 :     }
    1582           14 :     fn inc_deletes_failed(&self, reason: DeleteFailed) {
    1583           14 :         self.failed_deletes[reason].inc();
    1584           14 :     }
    1585              : 
    1586              :     /// Counted separately from failed layer deletes because we will complete the layer deletion
    1587              :     /// attempt regardless of failure to delete local file.
    1588            0 :     fn inc_delete_removes_failed(&self) {
    1589            0 :         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
    1590            0 :     }
    1591              : 
    1592              :     /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
    1593            0 :     fn inc_retried_get_or_maybe_download(&self) {
    1594            0 :         self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
    1595            0 :     }
    1596              : 
    1597              :     /// Expected rare because cancellations are unexpected, and failures are unexpected
    1598            4 :     fn inc_download_failed_without_requester(&self) {
    1599            4 :         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
    1600            4 :     }
    1601              : 
    1602              :     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
    1603              :     ///
    1604              :     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    1605              :     /// Option.
    1606            0 :     fn inc_raced_wanted_evicted_accesses(&self) {
    1607            0 :         self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
    1608            0 :     }
    1609              : 
    1610              :     /// When running with remote storage, these are only expected to occur as many times as
    1611              :     /// [`Self::inc_init_cancelled`] has been counted.
    1612            0 :     fn inc_init_needed_no_download(&self) {
    1613            0 :         self.rare_counters[RareEvent::InitWithoutDownload].inc();
    1614            0 :     }
    1615              : 
    1616              :     /// Expected rare because all layer files should be readable and good
    1617            1 :     fn inc_permanent_loading_failures(&self) {
    1618            1 :         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
    1619            1 :     }
    1620              : 
    1621            0 :     fn inc_broadcast_lagged(&self) {
    1622            0 :         self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
    1623            0 :     }
    1624              : 
    1625          627 :     fn inc_init_cancelled(&self) {
    1626          627 :         self.inits_cancelled.inc()
    1627          627 :     }
    1628              : 
    1629          165 :     fn record_redownloaded_after(&self, duration: std::time::Duration) {
    1630          165 :         self.redownload_after.observe(duration.as_secs_f64())
    1631          165 :     }
    1632              : }
    1633              : 
    1634         5048 : #[derive(enum_map::Enum)]
    1635              : enum EvictionCancelled {
    1636              :     LayerGone,
    1637              :     TimelineGone,
    1638              :     VersionCheckFailed,
    1639              :     FileNotFound,
    1640              :     RemoveFailed,
    1641              :     AlreadyReinitialized,
    1642              :     /// Not evicted because of a pending reinitialization
    1643              :     LostToDownload,
    1644              :     /// After eviction, there was a new layer access which cancelled the eviction.
    1645              :     UpgradedBackOnAccess,
    1646              : }
    1647              : 
    1648              : impl EvictionCancelled {
    1649         5048 :     fn as_str(&self) -> &'static str {
    1650         5048 :         match self {
    1651          631 :             EvictionCancelled::LayerGone => "layer_gone",
    1652          631 :             EvictionCancelled::TimelineGone => "timeline_gone",
    1653          631 :             EvictionCancelled::VersionCheckFailed => "version_check_fail",
    1654          631 :             EvictionCancelled::FileNotFound => "file_not_found",
    1655          631 :             EvictionCancelled::RemoveFailed => "remove_failed",
    1656          631 :             EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
    1657          631 :             EvictionCancelled::LostToDownload => "lost_to_download",
    1658          631 :             EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
    1659              :         }
    1660         5048 :     }
    1661              : }
    1662              : 
    1663         1276 : #[derive(enum_map::Enum)]
    1664              : enum DeleteFailed {
    1665              :     TimelineGone,
    1666              :     DeleteSchedulingFailed,
    1667              : }
    1668              : 
    1669              : impl DeleteFailed {
    1670         1262 :     fn as_str(&self) -> &'static str {
    1671         1262 :         match self {
    1672          631 :             DeleteFailed::TimelineGone => "timeline_gone",
    1673          631 :             DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
    1674              :         }
    1675         1262 :     }
    1676              : }
    1677              : 
    1678         4422 : #[derive(enum_map::Enum)]
    1679              : enum RareEvent {
    1680              :     RemoveOnDropFailed,
    1681              :     RetriedGetOrMaybeDownload,
    1682              :     DownloadFailedWithoutRequester,
    1683              :     UpgradedWantedEvicted,
    1684              :     InitWithoutDownload,
    1685              :     PermanentLoadingFailure,
    1686              :     EvictAndWaitLagged,
    1687              : }
    1688              : 
    1689              : impl RareEvent {
    1690         4417 :     fn as_str(&self) -> &'static str {
    1691         4417 :         use RareEvent::*;
    1692         4417 : 
    1693         4417 :         match self {
    1694          631 :             RemoveOnDropFailed => "remove_on_drop_failed",
    1695          631 :             RetriedGetOrMaybeDownload => "retried_gomd",
    1696          631 :             DownloadFailedWithoutRequester => "download_failed_without",
    1697          631 :             UpgradedWantedEvicted => "raced_wanted_evicted",
    1698          631 :             InitWithoutDownload => "init_needed_no_download",
    1699          631 :             PermanentLoadingFailure => "permanent_loading_failure",
    1700          631 :             EvictAndWaitLagged => "broadcast_lagged",
    1701              :         }
    1702         4417 :     }
    1703              : }
    1704              : 
    1705              : pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    1706              :     once_cell::sync::Lazy::new(LayerImplMetrics::default);
        

Generated by: LCOV version 2.1-beta