LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - layer.rs (source / functions) Coverage Total Hit
Test: 322b88762cba8ea666f63cda880cccab6936bf37.info Lines: 70.1 % 1099 770
Test Date: 2024-02-29 11:57:12 Functions: 65.2 % 155 101

            Line data    Source code
       1              : use anyhow::Context;
       2              : use camino::{Utf8Path, Utf8PathBuf};
       3              : use pageserver_api::keyspace::KeySpace;
       4              : use pageserver_api::models::{
       5              :     HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
       6              : };
       7              : use pageserver_api::shard::ShardIndex;
       8              : use std::ops::Range;
       9              : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
      10              : use std::sync::{Arc, Weak};
      11              : use std::time::SystemTime;
      12              : use tracing::Instrument;
      13              : use utils::lsn::Lsn;
      14              : use utils::sync::heavier_once_cell;
      15              : 
      16              : use crate::config::PageServerConf;
      17              : use crate::context::RequestContext;
      18              : use crate::repository::Key;
      19              : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
      20              : use crate::tenant::timeline::GetVectoredError;
      21              : use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
      22              : 
      23              : use super::delta_layer::{self, DeltaEntry};
      24              : use super::image_layer;
      25              : use super::{
      26              :     AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
      27              :     ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
      28              : };
      29              : 
      30              : use utils::generation::Generation;
      31              : 
      32              : #[cfg(test)]
      33              : mod tests;
      34              : 
      35              : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
      36              : /// range of LSNs.
      37              : ///
      38              : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
      39              : /// layers are used to ingest incoming WAL, and provide fast access to the
      40              : /// recent page versions. On-disk layers are stored as files on disk, and are
      41              : /// immutable. This type represents the on-disk kind while the in-memory kind is represented by
      42              : /// [`InMemoryLayer`].
      43              : ///
      44              : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
      45              : /// A delta layer contains all modifications within a range of LSNs and keys.
      46              : /// An image layer is a snapshot of all the data in a key-range, at a single
      47              : /// LSN.
      48              : ///
      49              : /// This type models the on-disk layers, which can be evicted and on-demand downloaded.
      50              : ///
      51              : /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
      52       252100 : #[derive(Clone)]
      53              : pub(crate) struct Layer(Arc<LayerInner>);
      54              : 
      55              : impl std::fmt::Display for Layer {
      56         1162 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      57         1162 :         if matches!(self.0.generation, Generation::Broken) {
      58            0 :             write!(f, "{}-broken", self.layer_desc().short_id())
      59              :         } else {
      60         1162 :             write!(
      61         1162 :                 f,
      62         1162 :                 "{}{}",
      63         1162 :                 self.layer_desc().short_id(),
      64         1162 :                 self.0.generation.get_suffix()
      65         1162 :             )
      66              :         }
      67         1162 :     }
      68              : }
      69              : 
      70              : impl std::fmt::Debug for Layer {
      71            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      72            0 :         write!(f, "{}", self)
      73            0 :     }
      74              : }
      75              : 
      76              : impl AsLayerDesc for Layer {
      77       378256 :     fn layer_desc(&self) -> &PersistentLayerDesc {
      78       378256 :         self.0.layer_desc()
      79       378256 :     }
      80              : }
      81              : 
      82              : impl Layer {
      83              :     /// Creates a layer value for a file we know to not be resident.
      84            0 :     pub(crate) fn for_evicted(
      85            0 :         conf: &'static PageServerConf,
      86            0 :         timeline: &Arc<Timeline>,
      87            0 :         file_name: LayerFileName,
      88            0 :         metadata: LayerFileMetadata,
      89            0 :     ) -> Self {
      90            0 :         let desc = PersistentLayerDesc::from_filename(
      91            0 :             timeline.tenant_shard_id,
      92            0 :             timeline.timeline_id,
      93            0 :             file_name,
      94            0 :             metadata.file_size(),
      95            0 :         );
      96            0 : 
      97            0 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
      98            0 : 
      99            0 :         let owner = Layer(Arc::new(LayerInner::new(
     100            0 :             conf,
     101            0 :             timeline,
     102            0 :             access_stats,
     103            0 :             desc,
     104            0 :             None,
     105            0 :             metadata.generation,
     106            0 :             metadata.shard,
     107            0 :         )));
     108              : 
     109            0 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
     110              : 
     111            0 :         owner
     112            0 :     }
     113              : 
     114              :     /// Creates a Layer value for a file we know to be resident in timeline directory.
     115           24 :     pub(crate) fn for_resident(
     116           24 :         conf: &'static PageServerConf,
     117           24 :         timeline: &Arc<Timeline>,
     118           24 :         file_name: LayerFileName,
     119           24 :         metadata: LayerFileMetadata,
     120           24 :     ) -> ResidentLayer {
     121           24 :         let desc = PersistentLayerDesc::from_filename(
     122           24 :             timeline.tenant_shard_id,
     123           24 :             timeline.timeline_id,
     124           24 :             file_name,
     125           24 :             metadata.file_size(),
     126           24 :         );
     127           24 : 
     128           24 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
     129           24 : 
     130           24 :         let mut resident = None;
     131           24 : 
     132           24 :         let owner = Layer(Arc::new_cyclic(|owner| {
     133           24 :             let inner = Arc::new(DownloadedLayer {
     134           24 :                 owner: owner.clone(),
     135           24 :                 kind: tokio::sync::OnceCell::default(),
     136           24 :                 version: 0,
     137           24 :             });
     138           24 :             resident = Some(inner.clone());
     139           24 : 
     140           24 :             LayerInner::new(
     141           24 :                 conf,
     142           24 :                 timeline,
     143           24 :                 access_stats,
     144           24 :                 desc,
     145           24 :                 Some(inner),
     146           24 :                 metadata.generation,
     147           24 :                 metadata.shard,
     148           24 :             )
     149           24 :         }));
     150           24 : 
     151           24 :         let downloaded = resident.expect("just initialized");
     152              : 
     153           24 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
     154              : 
     155           24 :         timeline
     156           24 :             .metrics
     157           24 :             .resident_physical_size_add(metadata.file_size());
     158           24 : 
     159           24 :         ResidentLayer { downloaded, owner }
     160           24 :     }
     161              : 
     162              :     /// Creates a Layer value for freshly written out new layer file by renaming it from a
     163              :     /// temporary path.
     164          554 :     pub(crate) fn finish_creating(
     165          554 :         conf: &'static PageServerConf,
     166          554 :         timeline: &Arc<Timeline>,
     167          554 :         desc: PersistentLayerDesc,
     168          554 :         temp_path: &Utf8Path,
     169          554 :     ) -> anyhow::Result<ResidentLayer> {
     170          554 :         let mut resident = None;
     171          554 : 
     172          554 :         let owner = Layer(Arc::new_cyclic(|owner| {
     173          554 :             let inner = Arc::new(DownloadedLayer {
     174          554 :                 owner: owner.clone(),
     175          554 :                 kind: tokio::sync::OnceCell::default(),
     176          554 :                 version: 0,
     177          554 :             });
     178          554 :             resident = Some(inner.clone());
     179          554 :             let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
     180          554 :             access_stats.record_residence_event(
     181          554 :                 LayerResidenceStatus::Resident,
     182          554 :                 LayerResidenceEventReason::LayerCreate,
     183          554 :             );
     184          554 :             LayerInner::new(
     185          554 :                 conf,
     186          554 :                 timeline,
     187          554 :                 access_stats,
     188          554 :                 desc,
     189          554 :                 Some(inner),
     190          554 :                 timeline.generation,
     191          554 :                 timeline.get_shard_index(),
     192          554 :             )
     193          554 :         }));
     194          554 : 
     195          554 :         let downloaded = resident.expect("just initialized");
     196          554 : 
     197          554 :         // if the rename works, the path is as expected
     198          554 :         std::fs::rename(temp_path, owner.local_path())
     199          554 :             .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
     200              : 
     201          554 :         Ok(ResidentLayer { downloaded, owner })
     202          554 :     }
     203              : 
     204              :     /// Requests the layer to be evicted and waits for this to be done.
     205              :     ///
     206              :     /// If the file is not resident, an [`EvictionError::NotFound`] is returned.
     207              :     ///
     208              :     /// If, due to bad luck or blocking of the executor, we miss the actual eviction and the layer is
     209              :     /// re-downloaded, [`EvictionError::Downloaded`] is returned.
     210              :     ///
     211              :     /// Technically cancellation safe, but cancelling might change which generation
     212              :     /// of the download-evict cycle is observed on retry.
     213           10 :     pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
     214           14 :         self.0.evict_and_wait().await
     215           10 :     }
     216              : 
     217              :     /// Delete the layer file when `self` gets dropped, and also try to schedule a remote index upload
     218              :     /// at that point.
     219              :     ///
     220              :     /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
     221              :     /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
     222              :     /// the value this is called on gets dropped.
     223              :     ///
     224              :     /// This is ensured by both of those methods accepting references to Layer.
     225              :     ///
     226              :     /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
     227              :     /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
     228          304 :     pub(crate) fn delete_on_drop(&self) {
     229          304 :         self.0.delete_on_drop();
     230          304 :     }
     231              : 
     232              :     /// Return data needed to reconstruct given page at LSN.
     233              :     ///
     234              :     /// It is up to the caller to collect more data from the previous layer and
     235              :     /// perform WAL redo, if necessary.
     236              :     ///
     237              :     /// # Cancellation-Safety
     238              :     ///
     239              :     /// This method is cancellation-safe.
     240       124028 :     pub(crate) async fn get_value_reconstruct_data(
     241       124028 :         &self,
     242       124028 :         key: Key,
     243       124028 :         lsn_range: Range<Lsn>,
     244       124028 :         reconstruct_data: &mut ValueReconstructState,
     245       124028 :         ctx: &RequestContext,
     246       124028 :     ) -> anyhow::Result<ValueReconstructResult> {
     247              :         use anyhow::ensure;
     248              : 
     249       124028 :         let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     250       124028 :         self.0
     251       124028 :             .access_stats
     252       124028 :             .record_access(LayerAccessKind::GetValueReconstructData, ctx);
     253       124028 : 
     254       124028 :         if self.layer_desc().is_delta {
     255       123516 :             ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
     256       123516 :             ensure!(self.layer_desc().key_range.contains(&key));
     257              :         } else {
     258          512 :             ensure!(self.layer_desc().key_range.contains(&key));
     259          512 :             ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
     260          512 :             ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
     261              :         }
     262              : 
     263       124028 :         layer
     264       124028 :             .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
     265       124028 :             .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
     266        23310 :             .await
     267       124028 :             .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     268       124028 :     }
     269              : 
     270           10 :     pub(crate) async fn get_values_reconstruct_data(
     271           10 :         &self,
     272           10 :         keyspace: KeySpace,
     273           10 :         lsn_range: Range<Lsn>,
     274           10 :         reconstruct_data: &mut ValuesReconstructState,
     275           10 :         ctx: &RequestContext,
     276           10 :     ) -> Result<(), GetVectoredError> {
     277           10 :         let layer = self
     278           10 :             .0
     279           10 :             .get_or_maybe_download(true, Some(ctx))
     280            0 :             .await
     281           10 :             .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
     282              : 
     283           10 :         self.0
     284           10 :             .access_stats
     285           10 :             .record_access(LayerAccessKind::GetValueReconstructData, ctx);
     286           10 : 
     287           10 :         layer
     288           10 :             .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
     289           10 :             .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
     290           25 :             .await
     291           10 :     }
     292              : 
     293              :     /// Download the layer if evicted.
     294              :     ///
     295              :     /// Will not error when the layer is already downloaded.
     296            0 :     pub(crate) async fn download(&self) -> anyhow::Result<()> {
     297            0 :         self.0.get_or_maybe_download(true, None).await?;
     298            0 :         Ok(())
     299            0 :     }
     300              : 
     301              :     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     302              :     /// while the guard exists.
     303              :     ///
     304              :     /// Returns None if the layer is currently evicted.
     305           16 :     pub(crate) async fn keep_resident(&self) -> anyhow::Result<Option<ResidentLayer>> {
     306           16 :         let downloaded = match self.0.get_or_maybe_download(false, None).await {
     307           12 :             Ok(d) => d,
     308              :             // technically there are a lot of possible errors, but in practice it should only be
     309              :             // DownloadRequired which is tripped up. could work to improve this situation
     310              :             // statically later.
     311            4 :             Err(DownloadError::DownloadRequired) => return Ok(None),
     312            0 :             Err(e) => return Err(e.into()),
     313              :         };
     314              : 
     315           12 :         Ok(Some(ResidentLayer {
     316           12 :             downloaded,
     317           12 :             owner: self.clone(),
     318           12 :         }))
     319           16 :     }
     320              : 
     321              :     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
     322          300 :     pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
     323          300 :         let downloaded = self.0.get_or_maybe_download(true, None).await?;
     324              : 
     325          300 :         Ok(ResidentLayer {
     326          300 :             downloaded,
     327          300 :             owner: self.clone(),
     328          300 :         })
     329          300 :     }
     330              : 
     331            0 :     pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     332            0 :         self.0.info(reset)
     333            0 :     }
     334              : 
     335            0 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
     336            0 :         &self.0.access_stats
     337            0 :     }
     338              : 
     339          658 :     pub(crate) fn local_path(&self) -> &Utf8Path {
     340          658 :         &self.0.path
     341          658 :     }
     342              : 
     343          562 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
     344          562 :         self.0.metadata()
     345          562 :     }
     346              : 
     347              :     /// Traditional debug dumping facility
     348              :     #[allow(unused)]
     349            4 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
     350            4 :         self.0.desc.dump();
     351            4 : 
     352            4 :         if verbose {
     353              :             // for now, unconditionally download everything, even if that might not be wanted.
     354            4 :             let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     355            8 :             l.dump(&self.0, ctx).await?
     356            0 :         }
     357              : 
     358            4 :         Ok(())
     359            4 :     }
     360              : 
     361              :     /// Waits until this layer has been dropped (and if needed, local file deletion and remote
     362              :     /// deletion scheduling has completed).
     363              :     ///
     364              :     /// Does not start local deletion, use [`Self::delete_on_drop`] for that
     365              :     /// separately.
     366              :     #[cfg(feature = "testing")]
     367            0 :     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
     368            0 :         let mut rx = self.0.status.subscribe();
     369              : 
     370            0 :         async move {
     371              :             loop {
     372            0 :                 if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
     373            0 :                     break;
     374            0 :                 }
     375              :             }
     376            0 :         }
     377            0 :     }
     378              : }
     379              : 
     380              : /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
     381              : ///
     382              : /// However when we want something evicted, we cannot evict it right away as there might be current
     383              : /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
     384              : /// read with [`Layer::get_value_reconstruct_data`].
     385              : ///
     386              : /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
     387            0 : #[derive(Debug)]
     388              : enum ResidentOrWantedEvicted {
     389              :     Resident(Arc<DownloadedLayer>),
     390              :     WantedEvicted(Weak<DownloadedLayer>, usize),
     391              : }
     392              : 
     393              : impl ResidentOrWantedEvicted {
     394       124356 :     fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
     395       124356 :         match self {
     396       124354 :             ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
     397            2 :             ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
     398            0 :                 Some(strong) => {
     399            0 :                     LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
     400            0 : 
     401            0 :                     *self = ResidentOrWantedEvicted::Resident(strong.clone());
     402            0 : 
     403            0 :                     Some((strong, true))
     404              :                 }
     405            2 :                 None => None,
     406              :             },
     407              :         }
     408       124356 :     }
     409              : 
     410              :     /// When eviction is first requested, drop down to holding a [`Weak`].
     411              :     ///
     412              :     /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
     413              :     /// drop the possibly last strong reference outside of the mutex of
     414              :     /// heavier_once_cell::OnceCell.
     415            8 :     fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
     416            8 :         match self {
     417            8 :             ResidentOrWantedEvicted::Resident(strong) => {
     418            8 :                 let weak = Arc::downgrade(strong);
     419            8 :                 let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
     420            8 :                 std::mem::swap(self, &mut temp);
     421            8 :                 match temp {
     422            8 :                     ResidentOrWantedEvicted::Resident(strong) => Some(strong),
     423            0 :                     ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
     424              :                 }
     425              :             }
     426            0 :             ResidentOrWantedEvicted::WantedEvicted(..) => None,
     427              :         }
     428            8 :     }
     429              : }
     430              : 
     431              : struct LayerInner {
     432              :     /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
     433              :     /// [`Self::path`].
     434              :     conf: &'static PageServerConf,
     435              : 
     436              :     /// Full path to the file; unclear if this should exist anymore.
     437              :     path: Utf8PathBuf,
     438              : 
     439              :     desc: PersistentLayerDesc,
     440              : 
     441              :     /// Timeline access is needed for remote timeline client and metrics.
     442              :     timeline: Weak<Timeline>,
     443              : 
     444              :     /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
     445              :     have_remote_client: bool,
     446              : 
     447              :     access_stats: LayerAccessStats,
     448              : 
     449              :     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
     450              :     /// Initialization and deinitialization are done while holding a permit.
     451              :     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
     452              : 
     453              :     /// Do we want to delete this locally and remotely when `LayerInner` is dropped?
     454              :     wanted_deleted: AtomicBool,
     455              : 
     456              :     /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
     457              :     /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
     458              :     /// [`LayerInner::on_downloaded_layer_drop`].
     459              :     wanted_evicted: AtomicBool,
     460              : 
     461              :     /// Version is to make sure we will only evict a specific download of a file.
     462              :     ///
     463              :     /// Incremented for each download, stored in `DownloadedLayer::version` or
     464              :     /// `ResidentOrWantedEvicted::WantedEvicted`.
     465              :     version: AtomicUsize,
     466              : 
     467              :     /// Allow subscribing to when the layer actually gets evicted.
     468              :     status: tokio::sync::broadcast::Sender<Status>,
     469              : 
     470              :     /// Counter for exponential backoff with the download
     471              :     consecutive_failures: AtomicUsize,
     472              : 
     473              :     /// The generation of this Layer.
     474              :     ///
     475              :     /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
     476              :     /// for created layers from [`Timeline::generation`].
     477              :     generation: Generation,
     478              : 
     479              :     /// The shard of this Layer.
     480              :     ///
     481              :     /// For layers created in this process, this will always be the [`ShardIndex`] of the
     482              :     /// current `ShardIdentity` (TODO: add link once it's introduced).
     483              :     ///
     484              :     /// For loaded layers, this may be some other value if the tenant has undergone
     485              :     /// a shard split since the layer was originally written.
     486              :     shard: ShardIndex,
     487              : 
     488              :     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
     489              : }
     490              : 
     491              : impl std::fmt::Display for LayerInner {
     492           14 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     493           14 :         write!(f, "{}", self.layer_desc().short_id())
     494           14 :     }
     495              : }
     496              : 
     497              : impl AsLayerDesc for LayerInner {
     498       379780 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     499       379780 :         &self.desc
     500       379780 :     }
     501              : }
     502              : 
     503            6 : #[derive(Debug, Clone, Copy)]
     504              : enum Status {
     505              :     Evicted,
     506              :     Downloaded,
     507              : }
     508              : 
     509              : impl Drop for LayerInner {
     510          326 :     fn drop(&mut self) {
     511          326 :         if !*self.wanted_deleted.get_mut() {
     512              :             // should we try to evict if the last wish was for eviction?
     513              :             // feels like there's some hazard of overcrowding near shutdown near by, but we don't
     514              :             // run drops during shutdown (yet)
     515           24 :             return;
     516          302 :         }
     517              : 
     518          302 :         let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
     519              : 
     520          302 :         let path = std::mem::take(&mut self.path);
     521          302 :         let file_name = self.layer_desc().filename();
     522          302 :         let file_size = self.layer_desc().file_size;
     523          302 :         let timeline = self.timeline.clone();
     524          302 :         let meta = self.metadata();
     525          302 :         let status = self.status.clone();
     526          302 : 
     527          302 :         crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
     528          302 :             let _g = span.entered();
     529          302 : 
     530          302 :             // carry this until we are finished for [`Layer::wait_drop`] support
     531          302 :             let _status = status;
     532              : 
     533          302 :             let removed = match std::fs::remove_file(path) {
     534          300 :                 Ok(()) => true,
     535            2 :                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
     536            2 :                     // until we no longer do detaches by removing all local files before removing the
     537            2 :                     // tenant from the global map, we will always get these errors even if we knew what
     538            2 :                     // is the latest state.
     539            2 :                     //
     540            2 :                     // we currently do not track the latest state, so we'll also end up here on evicted
     541            2 :                     // layers.
     542            2 :                     false
     543              :                 }
     544            0 :                 Err(e) => {
     545            0 :                     tracing::error!("failed to remove wanted deleted layer: {e}");
     546            0 :                     LAYER_IMPL_METRICS.inc_delete_removes_failed();
     547            0 :                     false
     548              :                 }
     549              :             };
     550              : 
     551          302 :             if let Some(timeline) = timeline.upgrade() {
     552          302 :                 if removed {
     553          300 :                     timeline.metrics.resident_physical_size_sub(file_size);
     554          300 :                 }
     555          302 :                 if let Some(remote_client) = timeline.remote_client.as_ref() {
     556          302 :                     let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
     557              : 
     558          302 :                     if let Err(e) = res {
     559              :                         // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
     560              :                         // demonstrating this deadlock (without spawn_blocking): stop will drop
     561              :                         // queued items, which will have ResidentLayer's, and those drops would try
     562              :                         // to re-entrantly lock the RemoteTimelineClient inner state.
     563            0 :                         if !timeline.is_active() {
     564            0 :                             tracing::info!("scheduling deletion on drop failed: {e:#}");
     565              :                         } else {
     566            0 :                             tracing::warn!("scheduling deletion on drop failed: {e:#}");
     567              :                         }
     568            0 :                         LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
     569          302 :                     } else {
     570          302 :                         LAYER_IMPL_METRICS.inc_completed_deletes();
     571          302 :                     }
     572            0 :                 }
     573            0 :             } else {
     574            0 :                 // no need to nag that timeline is gone: under normal situation on
     575            0 :                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
     576            0 :                 LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
     577            0 :             }
     578          302 :         });
     579          326 :     }
     580              : }
     581              : 
     582              : impl LayerInner {
     583          578 :     fn new(
     584          578 :         conf: &'static PageServerConf,
     585          578 :         timeline: &Arc<Timeline>,
     586          578 :         access_stats: LayerAccessStats,
     587          578 :         desc: PersistentLayerDesc,
     588          578 :         downloaded: Option<Arc<DownloadedLayer>>,
     589          578 :         generation: Generation,
     590          578 :         shard: ShardIndex,
     591          578 :     ) -> Self {
     592          578 :         let path = conf
     593          578 :             .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
     594          578 :             .join(desc.filename().to_string());
     595              : 
     596          578 :         let (inner, version) = if let Some(inner) = downloaded {
     597          578 :             let version = inner.version;
     598          578 :             let resident = ResidentOrWantedEvicted::Resident(inner);
     599          578 :             (heavier_once_cell::OnceCell::new(resident), version)
     600              :         } else {
     601            0 :             (heavier_once_cell::OnceCell::default(), 0)
     602              :         };
     603              : 
     604          578 :         LayerInner {
     605          578 :             conf,
     606          578 :             path,
     607          578 :             desc,
     608          578 :             timeline: Arc::downgrade(timeline),
     609          578 :             have_remote_client: timeline.remote_client.is_some(),
     610          578 :             access_stats,
     611          578 :             wanted_deleted: AtomicBool::new(false),
     612          578 :             wanted_evicted: AtomicBool::new(false),
     613          578 :             inner,
     614          578 :             version: AtomicUsize::new(version),
     615          578 :             status: tokio::sync::broadcast::channel(1).0,
     616          578 :             consecutive_failures: AtomicUsize::new(0),
     617          578 :             generation,
     618          578 :             shard,
     619          578 :             last_evicted_at: std::sync::Mutex::default(),
     620          578 :         }
     621          578 :     }
     622              : 
     623          304 :     fn delete_on_drop(&self) {
     624          304 :         let res =
     625          304 :             self.wanted_deleted
     626          304 :                 .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
     627          304 : 
     628          304 :         if res.is_ok() {
     629          302 :             LAYER_IMPL_METRICS.inc_started_deletes();
     630          302 :         }
     631          304 :     }
     632              : 
     633              :     /// Cancellation safe, however dropping the future and calling this method again might result
     634              :     /// in a new attempt to evict OR join the previously started attempt.
     635           10 :     pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
     636           10 :         use tokio::sync::broadcast::error::RecvError;
     637           10 : 
     638           10 :         assert!(self.have_remote_client);
     639              : 
     640           10 :         let mut rx = self.status.subscribe();
     641              : 
     642            8 :         let strong = {
     643           10 :             match self.inner.get() {
     644            8 :                 Some(mut either) => {
     645            8 :                     self.wanted_evicted.store(true, Ordering::Relaxed);
     646            8 :                     either.downgrade()
     647              :                 }
     648            2 :                 None => return Err(EvictionError::NotFound),
     649              :             }
     650              :         };
     651              : 
     652            8 :         if strong.is_some() {
     653            8 :             // drop the DownloadedLayer outside of holding the guard
     654            8 :             drop(strong);
     655            8 :             LAYER_IMPL_METRICS.inc_started_evictions();
     656            8 :         }
     657              : 
     658           14 :         match rx.recv().await {
     659            4 :             Ok(Status::Evicted) => Ok(()),
     660            2 :             Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
     661              :             Err(RecvError::Closed) => {
     662            0 :                 unreachable!("sender cannot be dropped while we are in &self method")
     663              :             }
     664              :             Err(RecvError::Lagged(_)) => {
     665              :                 // this is quite unlikely, but we are blocking a lot in the async context, so
     666              :                 // we might be missing this because we are stuck on a LIFO slot on a thread
     667              :                 // which is busy blocking for a 1TB database create_image_layers.
     668              :                 //
     669              :                 // use however late (compared to the initial expressing of wanted) as the
     670              :                 // "outcome" now
     671            2 :                 LAYER_IMPL_METRICS.inc_broadcast_lagged();
     672            2 :                 match self.inner.get() {
     673            0 :                     Some(_) => Err(EvictionError::Downloaded),
     674            2 :                     None => Ok(()),
     675              :                 }
     676              :             }
     677              :         }
     678           10 :     }
     679              : 
     680              :     /// Cancellation safe.
     681       124358 :     async fn get_or_maybe_download(
     682       124358 :         self: &Arc<Self>,
     683       124358 :         allow_download: bool,
     684       124358 :         ctx: Option<&RequestContext>,
     685       124358 :     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
     686       124358 :         let mut init_permit = None;
     687              : 
     688              :         loop {
     689       124360 :             let download = move |permit| {
     690            6 :                 async move {
     691            6 :                     // disable any scheduled but not yet running eviction deletions for this
     692            6 :                     let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
     693            6 : 
     694            6 :                     // count cancellations, which currently remain largely unexpected
     695            6 :                     let init_cancelled =
     696            6 :                         scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
     697            6 : 
     698            6 :                     // no need to make the evict_and_wait wait for the actual download to complete
     699            6 :                     drop(self.status.send(Status::Downloaded));
     700              : 
     701            6 :                     let timeline = self
     702            6 :                         .timeline
     703            6 :                         .upgrade()
     704            6 :                         .ok_or_else(|| DownloadError::TimelineShutdown)?;
     705              : 
     706              :                     // FIXME: grab a gate
     707              : 
     708            6 :                     let can_ever_evict = timeline.remote_client.as_ref().is_some();
     709              : 
     710              :                     // check if we really need to be downloaded; could have been already downloaded by a
     711              :                     // cancelled previous attempt.
     712            6 :                     let needs_download = self
     713            6 :                         .needs_download()
     714            8 :                         .await
     715            6 :                         .map_err(DownloadError::PreStatFailed)?;
     716              : 
     717            6 :                     let permit = if let Some(reason) = needs_download {
     718            4 :                         if let NeedsDownload::NotFile(ft) = reason {
     719            0 :                             return Err(DownloadError::NotFile(ft));
     720            4 :                         }
     721            4 : 
     722            4 :                         // only reset this after we've decided we really need to download. otherwise it'd
     723            4 :                         // be impossible to mark cancelled downloads for eviction, like one could imagine
     724            4 :                         // we would like to do for prefetching which was not needed.
     725            4 :                         self.wanted_evicted.store(false, Ordering::Release);
     726            4 : 
     727            4 :                         if !can_ever_evict {
     728            0 :                             return Err(DownloadError::NoRemoteStorage);
     729            4 :                         }
     730              : 
     731            4 :                         if let Some(ctx) = ctx {
     732            0 :                             self.check_expected_download(ctx)?;
     733            4 :                         }
     734              : 
     735            4 :                         if !allow_download {
     736              :                             // this does look weird, but for LayerInner the "downloading" means also changing
     737              :                             // internal once related state ...
     738            4 :                             return Err(DownloadError::DownloadRequired);
     739            0 :                         }
     740            0 : 
     741            0 :                         tracing::info!(%reason, "downloading on-demand");
     742              : 
     743            0 :                         self.spawn_download_and_wait(timeline, permit).await?
     744              :                     } else {
     745              :                         // the file is present locally, probably by a previous but cancelled call to
     746              :                         // get_or_maybe_download. alternatively we might be running without remote storage.
     747            2 :                         LAYER_IMPL_METRICS.inc_init_needed_no_download();
     748            2 : 
     749            2 :                         permit
     750              :                     };
     751              : 
     752            2 :                     let since_last_eviction =
     753            2 :                         self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
     754            2 :                     if let Some(since_last_eviction) = since_last_eviction {
     755            0 :                         // FIXME: this will not always be recorded correctly until #6028 (the no
     756            0 :                         // download needed branch above)
     757            0 :                         LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
     758            2 :                     }
     759              : 
     760            2 :                     let res = Arc::new(DownloadedLayer {
     761            2 :                         owner: Arc::downgrade(self),
     762            2 :                         kind: tokio::sync::OnceCell::default(),
     763            2 :                         version: next_version,
     764            2 :                     });
     765            2 : 
     766            2 :                     self.access_stats.record_residence_event(
     767            2 :                         LayerResidenceStatus::Resident,
     768            2 :                         LayerResidenceEventReason::ResidenceChange,
     769            2 :                     );
     770            2 : 
     771            2 :                     let waiters = self.inner.initializer_count();
     772            2 :                     if waiters > 0 {
     773            0 :                         tracing::info!(
     774            0 :                             waiters,
     775            0 :                             "completing the on-demand download for other tasks"
     776            0 :                         );
     777            2 :                     }
     778              : 
     779            2 :                     scopeguard::ScopeGuard::into_inner(init_cancelled);
     780            2 : 
     781            2 :                     Ok((ResidentOrWantedEvicted::Resident(res), permit))
     782            6 :                 }
     783            6 :                 .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
     784            6 :             };
     785              : 
     786       124360 :             if let Some(init_permit) = init_permit.take() {
     787              :                 // use the already held initialization permit because it is impossible to hit the
     788              :                 // below paths anymore essentially limiting the max loop iterations to 2.
     789            4 :                 let (value, init_permit) = download(init_permit).await?;
     790            2 :                 let mut guard = self.inner.set(value, init_permit);
     791            2 :                 let (strong, _upgraded) = guard
     792            2 :                     .get_and_upgrade()
     793            2 :                     .expect("init creates strong reference, we held the init permit");
     794            2 :                 return Ok(strong);
     795       124358 :             }
     796              : 
     797            2 :             let (weak, permit) = {
     798       124358 :                 let mut locked = self.inner.get_or_init(download).await?;
     799              : 
     800       124354 :                 if let Some((strong, upgraded)) = locked.get_and_upgrade() {
     801       124352 :                     if upgraded {
     802            0 :                         // when upgraded back, the Arc<DownloadedLayer> is still available, but
     803            0 :                         // previously an `evict_and_wait` was received.
     804            0 :                         self.wanted_evicted.store(false, Ordering::Relaxed);
     805            0 : 
     806            0 :                         // error out any `evict_and_wait`
     807            0 :                         drop(self.status.send(Status::Downloaded));
     808            0 :                         LAYER_IMPL_METRICS
     809            0 :                             .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
     810       124352 :                     }
     811              : 
     812       124352 :                     return Ok(strong);
     813              :                 } else {
     814              :                     // path to here: the evict_blocking is stuck on spawn_blocking queue.
     815              :                     //
     816              :                     // reset the contents, deactivating the eviction and causing an
     817              :                     // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
     818            2 :                     locked.take_and_deinit()
     819            2 :                 }
     820            2 :             };
     821            2 : 
     822            2 :             // unlock first, then drop the weak, but because upgrade failed, we
     823            2 :             // know it cannot be a problem.
     824            2 : 
     825            2 :             assert!(
     826            2 :                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
     827            0 :                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
     828              :             );
     829              : 
     830            2 :             init_permit = Some(permit);
     831            2 : 
     832            2 :             LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
     833              :         }
     834       124358 :     }
     835              : 
     836              :     /// Nag or fail per RequestContext policy
     837            0 :     fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
     838            0 :         use crate::context::DownloadBehavior::*;
     839            0 :         let b = ctx.download_behavior();
     840            0 :         match b {
     841            0 :             Download => Ok(()),
     842              :             Warn | Error => {
     843            0 :                 tracing::info!(
     844            0 :                     "unexpectedly on-demand downloading for task kind {:?}",
     845            0 :                     ctx.task_kind()
     846            0 :                 );
     847            0 :                 crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
     848              : 
     849            0 :                 let really_error =
     850            0 :                     matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
     851              : 
     852            0 :                 if really_error {
     853              :                     // this check is only probabilistic, seems like a flakiness footgun
     854            0 :                     Err(DownloadError::ContextAndConfigReallyDeniesDownloads)
     855              :                 } else {
     856            0 :                     Ok(())
     857              :                 }
     858              :             }
     859              :         }
     860            0 :     }
     861              : 
     862              :     /// Actual download, at most one is executed at a time.
     863            0 :     async fn spawn_download_and_wait(
     864            0 :         self: &Arc<Self>,
     865            0 :         timeline: Arc<Timeline>,
     866            0 :         permit: heavier_once_cell::InitPermit,
     867            0 :     ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
     868            0 :         debug_assert_current_span_has_tenant_and_timeline_id();
     869            0 : 
     870            0 :         let task_name = format!("download layer {}", self);
     871            0 : 
     872            0 :         let (tx, rx) = tokio::sync::oneshot::channel();
     873            0 : 
     874            0 :         // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
     875            0 :         // block tenant::mgr::remove_tenant_from_memory.
     876            0 : 
     877            0 :         let this: Arc<Self> = self.clone();
     878            0 : 
     879            0 :         crate::task_mgr::spawn(
     880            0 :             &tokio::runtime::Handle::current(),
     881            0 :             crate::task_mgr::TaskKind::RemoteDownloadTask,
     882            0 :             Some(self.desc.tenant_shard_id),
     883            0 :             Some(self.desc.timeline_id),
     884            0 :             &task_name,
     885            0 :             false,
     886            0 :             async move {
     887            0 : 
     888            0 :                 let client = timeline
     889            0 :                     .remote_client
     890            0 :                     .as_ref()
     891            0 :                     .expect("checked above with have_remote_client");
     892              : 
     893            0 :                 let result = client.download_layer_file(
     894            0 :                     &this.desc.filename(),
     895            0 :                     &this.metadata(),
     896            0 :                     &crate::task_mgr::shutdown_token()
     897            0 :                 )
     898            0 :                 .await;
     899              : 
     900            0 :                 let result = match result {
     901            0 :                     Ok(size) => {
     902            0 :                         timeline.metrics.resident_physical_size_add(size);
     903            0 :                         Ok(())
     904              :                     }
     905            0 :                     Err(e) => {
     906            0 :                         let consecutive_failures =
     907            0 :                             this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
     908            0 : 
     909            0 :                         let backoff = utils::backoff::exponential_backoff_duration_seconds(
     910            0 :                             consecutive_failures.min(u32::MAX as usize) as u32,
     911            0 :                             1.5,
     912            0 :                             60.0,
     913            0 :                         );
     914            0 : 
     915            0 :                         let backoff = std::time::Duration::from_secs_f64(backoff);
     916            0 : 
     917            0 :                         tokio::select! {
     918            0 :                             _ = tokio::time::sleep(backoff) => {},
     919            0 :                             _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
     920            0 :                             _ = timeline.cancel.cancelled() => {},
     921            0 :                         };
     922              : 
     923            0 :                         Err(e)
     924              :                     }
     925              :                 };
     926              : 
     927            0 :                 if let Err(res) = tx.send((result, permit)) {
     928            0 :                     match res {
     929            0 :                         (Ok(()), _) => {
     930            0 :                             // our caller is cancellation safe so this is fine; if someone
     931            0 :                             // else requests the layer, they'll find it already downloaded.
     932            0 :                             //
     933            0 :                             // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
     934            0 :                             //
     935            0 :                             // FIXME(#6028): however, could be that we should consider marking the
     936            0 :                             // layer for eviction? alas, cannot: because only DownloadedLayer will
     937            0 :                             // handle that.
     938            0 :                         },
     939            0 :                         (Err(e), _) => {
     940            0 :                             // our caller is cancellation safe, but we might be racing with
     941            0 :                             // another attempt to initialize. before we have cancellation
     942            0 :                             // token support: these attempts should converge regardless of
     943            0 :                             // their completion order.
     944            0 :                             tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}");
     945            0 :                             LAYER_IMPL_METRICS.inc_download_failed_without_requester();
     946              :                         }
     947              :                     }
     948            0 :                 }
     949              : 
     950            0 :                 Ok(())
     951            0 :             }
     952            0 :             .in_current_span(),
     953            0 :         );
     954            0 :         match rx.await {
     955            0 :             Ok((Ok(()), permit)) => {
     956            0 :                 if let Some(reason) = self
     957            0 :                     .needs_download()
     958            0 :                     .await
     959            0 :                     .map_err(DownloadError::PostStatFailed)?
     960              :                 {
     961              :                     // this is really a bug in needs_download or remote timeline client
     962            0 :                     panic!("post-condition failed: needs_download returned {reason:?}");
     963            0 :                 }
     964            0 : 
     965            0 :                 self.consecutive_failures.store(0, Ordering::Relaxed);
     966            0 :                 tracing::info!("on-demand download successful");
     967              : 
     968            0 :                 Ok(permit)
     969              :             }
     970            0 :             Ok((Err(e), _permit)) => {
     971            0 :                 // sleep already happened in the spawned task, if it was not cancelled
     972            0 :                 let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
     973            0 : 
     974            0 :                 match e.downcast_ref::<remote_storage::DownloadError>() {
     975              :                     // If the download failed due to its cancellation token,
     976              :                     // propagate the cancellation error upstream.
     977              :                     Some(remote_storage::DownloadError::Cancelled) => {
     978            0 :                         Err(DownloadError::DownloadCancelled)
     979              :                     }
     980              :                     _ => {
     981            0 :                         tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
     982            0 :                         Err(DownloadError::DownloadFailed)
     983              :                     }
     984              :                 }
     985              :             }
     986            0 :             Err(_gone) => Err(DownloadError::DownloadCancelled),
     987              :         }
     988            0 :     }
     989              : 
     990            6 :     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
     991            8 :         match tokio::fs::metadata(&self.path).await {
     992            2 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     993            4 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
     994            0 :             Err(e) => Err(e),
     995              :         }
     996            6 :     }
     997              : 
     998           24 :     fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
     999           24 :         match self.path.metadata() {
    1000           24 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
    1001            0 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
    1002            0 :             Err(e) => Err(e),
    1003              :         }
    1004           24 :     }
    1005              : 
    1006           26 :     fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
    1007           26 :         // in future, this should include sha2-256 validation of the file.
    1008           26 :         if !m.is_file() {
    1009            0 :             Err(NeedsDownload::NotFile(m.file_type()))
    1010           26 :         } else if m.len() != self.desc.file_size {
    1011            0 :             Err(NeedsDownload::WrongSize {
    1012            0 :                 actual: m.len(),
    1013            0 :                 expected: self.desc.file_size,
    1014            0 :             })
    1015              :         } else {
    1016           26 :             Ok(())
    1017              :         }
    1018           26 :     }
    1019              : 
    1020            0 :     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
    1021            0 :         let layer_file_name = self.desc.filename().file_name();
    1022            0 : 
    1023            0 :         // this is not accurate: we could have the file locally but there was a cancellation
    1024            0 :         // and now we are not in sync, or we are currently downloading it.
    1025            0 :         let remote = self.inner.get().is_none();
    1026            0 : 
    1027            0 :         let access_stats = self.access_stats.as_api_model(reset);
    1028            0 : 
    1029            0 :         if self.desc.is_delta {
    1030            0 :             let lsn_range = &self.desc.lsn_range;
    1031            0 : 
    1032            0 :             HistoricLayerInfo::Delta {
    1033            0 :                 layer_file_name,
    1034            0 :                 layer_file_size: self.desc.file_size,
    1035            0 :                 lsn_start: lsn_range.start,
    1036            0 :                 lsn_end: lsn_range.end,
    1037            0 :                 remote,
    1038            0 :                 access_stats,
    1039            0 :             }
    1040              :         } else {
    1041            0 :             let lsn = self.desc.image_layer_lsn();
    1042            0 : 
    1043            0 :             HistoricLayerInfo::Image {
    1044            0 :                 layer_file_name,
    1045            0 :                 layer_file_size: self.desc.file_size,
    1046            0 :                 lsn_start: lsn,
    1047            0 :                 remote,
    1048            0 :                 access_stats,
    1049            0 :             }
    1050              :         }
    1051            0 :     }
    1052              : 
    1053              :     /// `DownloadedLayer` is being dropped, so it calls this method.
    1054            8 :     fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
    1055            8 :         let evict = self.wanted_evicted.load(Ordering::Acquire);
    1056            8 :         let can_evict = self.have_remote_client;
    1057            8 : 
    1058            8 :         if can_evict && evict {
    1059            8 :             let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
    1060              : 
    1061              :             // downgrade for queueing, in case there's a tear down already ongoing we should not
    1062              :             // hold it alive.
    1063            8 :             let this = Arc::downgrade(&self);
    1064            8 :             drop(self);
    1065            8 : 
    1066            8 :             // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
    1067            8 :             // drop while the `self.inner` is being locked, leading to a deadlock.
    1068            8 : 
    1069            8 :             crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
    1070            8 :                 let _g = span.entered();
    1071              : 
    1072              :                 // if LayerInner is already dropped here, do nothing because the delete on drop
    1073              :                 // has already ran while we were in queue
    1074            8 :                 let Some(this) = this.upgrade() else {
    1075            0 :                     LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
    1076            0 :                     return;
    1077              :                 };
    1078            8 :                 match this.evict_blocking(version) {
    1079            6 :                     Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
    1080            2 :                     Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
    1081              :                 }
    1082            8 :             });
    1083            0 :         }
    1084            8 :     }
    1085              : 
    /// Removes the local layer file, provided the layer is still in the wanted-evicted state
    /// for exactly `only_version`.
    ///
    /// Uses blocking file system calls; `on_downloaded_layer_drop` runs it inside a
    /// `spawn_blocking` task.
    ///
    /// Returns `Err(EvictionCancelled::..)` naming the reason whenever the eviction did not
    /// complete: the timeline is gone, `inner` was already deinitialized or reinitialized, the
    /// version did not match, or removing the file failed.
    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
        // deleted or detached timeline, don't do anything.
        let Some(timeline) = self.timeline.upgrade() else {
            return Err(EvictionCancelled::TimelineGone);
        };

        // to avoid starting a new download while we evict, keep holding on to the
        // permit.
        let _permit = {
            let maybe_downloaded = self.inner.get();

            let (_weak, permit) = match maybe_downloaded {
                Some(mut guard) => {
                    if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
                        if *version == only_version {
                            guard.take_and_deinit()
                        } else {
                            // this was not for us; maybe there's another eviction job
                            // TODO: does it make any sense to stall here? unique versions do not
                            // matter, we only want to make sure not to evict a resident, which we
                            // are not doing.
                            return Err(EvictionCancelled::VersionCheckFailed);
                        }
                    } else {
                        return Err(EvictionCancelled::AlreadyReinitialized);
                    }
                }
                None => {
                    // already deinitialized, perhaps get_or_maybe_download did this and is
                    // currently waiting to reinitialize it
                    return Err(EvictionCancelled::LostToDownload);
                }
            };

            // only the permit leaves this block; `_weak` is dropped at the closing brace
            permit
        };

        // now accesses to inner.get_or_init wait on the semaphore or the `_permit`

        // NOTE: the residence event is recorded before the file removal is attempted, so it
        // is recorded even when the removal below fails.
        self.access_stats.record_residence_event(
            LayerResidenceStatus::Evicted,
            LayerResidenceEventReason::ResidenceChange,
        );

        let res = match capture_mtime_and_remove(&self.path) {
            Ok(local_layer_mtime) => {
                // residence time is measured from the file's mtime; `duration_since` fails
                // when that mtime is later than the current system time
                let duration = SystemTime::now().duration_since(local_layer_mtime);
                match duration {
                    Ok(elapsed) => {
                        timeline
                            .metrics
                            .evictions_with_low_residence_duration
                            .read()
                            .unwrap()
                            .observe(elapsed);
                        tracing::info!(
                            residence_millis = elapsed.as_millis(),
                            "evicted layer after known residence period"
                        );
                    }
                    Err(_) => {
                        tracing::info!("evicted layer after unknown residence period");
                    }
                }
                timeline.metrics.evictions.inc();
                timeline
                    .metrics
                    .resident_physical_size_sub(self.desc.file_size);

                Ok(())
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                tracing::error!(
                    layer_size = %self.desc.file_size,
                    "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
                );
                Err(EvictionCancelled::FileNotFound)
            }
            Err(e) => {
                tracing::error!("failed to evict file from disk: {e:#}");
                Err(EvictionCancelled::RemoveFailed)
            }
        };

        // we are still holding the permit, so no new spawn_download_and_wait can happen
        //
        // the result of `send` is deliberately discarded. NOTE: this and the timestamp update
        // below also run on the failure paths, since `res` is only returned at the very end.
        drop(self.status.send(Status::Evicted));

        *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());

        res
    }
    1177              : 
    1178          864 :     fn metadata(&self) -> LayerFileMetadata {
    1179          864 :         LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
    1180          864 :     }
    1181              : }
    1182              : 
    1183            6 : fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
    1184            6 :     let m = path.metadata()?;
    1185            6 :     let local_layer_mtime = m.modified()?;
    1186            6 :     std::fs::remove_file(path)?;
    1187            6 :     Ok(local_layer_mtime)
    1188            6 : }
    1189              : 
/// Reasons for an eviction not taking place.
///
/// NOTE(review): the code producing these values is outside of this part of the file; the
/// variant descriptions below restate the error messages only.
#[derive(Debug, thiserror::Error)]
pub(crate) enum EvictionError {
    /// The layer had already been evicted.
    #[error("layer was already evicted")]
    NotFound,

    /// Evictions must always lose to downloads in races, and this time it happened.
    #[error("layer was downloaded instead")]
    Downloaded,
}
    1199              : 
/// Error internal to the [`LayerInner::get_or_maybe_download`]
#[derive(Debug, thiserror::Error)]
pub(crate) enum DownloadError {
    #[error("timeline has already shutdown")]
    TimelineShutdown,
    #[error("no remote storage configured")]
    NoRemoteStorage,
    #[error("context denies downloading")]
    ContextAndConfigReallyDeniesDownloads,
    #[error("downloading is really required but not allowed by this method")]
    DownloadRequired,
    /// Carries the file type that was found at the layer path.
    #[error("layer path exists, but it is not a file: {0:?}")]
    NotFile(std::fs::FileType),
    /// Why is there no underlying error here? Because it will be reported by page_service.
    /// Retries should also have been done already.
    #[error("downloading evicted layer file failed")]
    DownloadFailed,
    #[error("downloading failed, possibly for shutdown")]
    DownloadCancelled,
    /// The io error of the failed stat is kept as the error source.
    #[error("pre-condition: stat before download failed")]
    PreStatFailed(#[source] std::io::Error),
    /// The io error of the failed stat is kept as the error source.
    #[error("post-condition: stat after download failed")]
    PostStatFailed(#[source] std::io::Error),
}
    1224              : 
    1225            0 : #[derive(Debug, PartialEq)]
    1226              : pub(crate) enum NeedsDownload {
    1227              :     NotFound,
    1228              :     NotFile(std::fs::FileType),
    1229              :     WrongSize { actual: u64, expected: u64 },
    1230              : }
    1231              : 
    1232              : impl std::fmt::Display for NeedsDownload {
    1233            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1234            0 :         match self {
    1235            0 :             NeedsDownload::NotFound => write!(f, "file was not found"),
    1236            0 :             NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
    1237            0 :             NeedsDownload::WrongSize { actual, expected } => {
    1238            0 :                 write!(f, "file size mismatch {actual} vs. {expected}")
    1239              :             }
    1240              :         }
    1241            0 :     }
    1242              : }
    1243              : 
/// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
pub(crate) struct DownloadedLayer {
    /// Weak back-reference to the owning `LayerInner`; upgraded when this value is dropped to
    /// notify the owner.
    owner: Weak<LayerInner>,
    // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
    // DownloadedLayer
    //
    // Holds the loaded layer, or the permanent loading error.
    kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    /// Passed to `LayerInner::on_downloaded_layer_drop` on drop, which forwards it to
    /// `evict_blocking` for its version check.
    version: usize,
}
    1252              : 
    1253              : impl std::fmt::Debug for DownloadedLayer {
    1254            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1255            0 :         f.debug_struct("DownloadedLayer")
    1256            0 :             // owner omitted because it is always "Weak"
    1257            0 :             .field("kind", &self.kind)
    1258            0 :             .field("version", &self.version)
    1259            0 :             .finish()
    1260            0 :     }
    1261              : }
    1262              : 
    1263              : impl Drop for DownloadedLayer {
    1264          332 :     fn drop(&mut self) {
    1265          332 :         if let Some(owner) = self.owner.upgrade() {
    1266            8 :             owner.on_downloaded_layer_drop(self.version);
    1267          324 :         } else {
    1268          324 :             // no need to do anything, we are shutting down
    1269          324 :         }
    1270          332 :     }
    1271              : }
    1272              : 
    1273              : impl DownloadedLayer {
    1274              :     /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
    1275              :     /// initialize it permanently.
    1276              :     ///
    1277              :     /// `owner` parameter is a strong reference at the same `LayerInner` as the
    1278              :     /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
    1279              :     /// we will always have the LayerInner on the callstack, so we can just use it.
    1280       124342 :     async fn get<'a>(
    1281       124342 :         &'a self,
    1282       124342 :         owner: &Arc<LayerInner>,
    1283       124342 :         ctx: &RequestContext,
    1284       124342 :     ) -> anyhow::Result<&'a LayerKind> {
    1285       124342 :         let init = || async {
    1286          450 :             assert_eq!(
    1287          450 :                 Weak::as_ptr(&self.owner),
    1288          450 :                 Arc::as_ptr(owner),
    1289            0 :                 "these are the same, just avoiding the upgrade"
    1290              :             );
    1291              : 
    1292          450 :             let res = if owner.desc.is_delta {
    1293          440 :                 let summary = Some(delta_layer::Summary::expected(
    1294          440 :                     owner.desc.tenant_shard_id.tenant_id,
    1295          440 :                     owner.desc.timeline_id,
    1296          440 :                     owner.desc.key_range.clone(),
    1297          440 :                     owner.desc.lsn_range.clone(),
    1298          440 :                 ));
    1299          440 :                 delta_layer::DeltaLayerInner::load(
    1300          440 :                     &owner.path,
    1301          440 :                     summary,
    1302          440 :                     Some(owner.conf.max_vectored_read_bytes),
    1303          440 :                     ctx,
    1304          440 :                 )
    1305          441 :                 .await
    1306          440 :                 .map(|res| res.map(LayerKind::Delta))
    1307              :             } else {
    1308           10 :                 let lsn = owner.desc.image_layer_lsn();
    1309           10 :                 let summary = Some(image_layer::Summary::expected(
    1310           10 :                     owner.desc.tenant_shard_id.tenant_id,
    1311           10 :                     owner.desc.timeline_id,
    1312           10 :                     owner.desc.key_range.clone(),
    1313           10 :                     lsn,
    1314           10 :                 ));
    1315           10 :                 image_layer::ImageLayerInner::load(
    1316           10 :                     &owner.path,
    1317           10 :                     lsn,
    1318           10 :                     summary,
    1319           10 :                     Some(owner.conf.max_vectored_read_bytes),
    1320           10 :                     ctx,
    1321           10 :                 )
    1322           10 :                 .await
    1323           10 :                 .map(|res| res.map(LayerKind::Image))
    1324              :             };
    1325              : 
    1326          450 :             match res {
    1327          450 :                 Ok(Ok(layer)) => Ok(Ok(layer)),
    1328            0 :                 Ok(Err(transient)) => Err(transient),
    1329            0 :                 Err(permanent) => {
    1330            0 :                     LAYER_IMPL_METRICS.inc_permanent_loading_failures();
    1331            0 :                     // TODO(#5815): we are not logging all errors, so temporarily log them **once**
    1332            0 :                     // here as well
    1333            0 :                     let permanent = permanent.context("load layer");
    1334            0 :                     tracing::error!("layer loading failed permanently: {permanent:#}");
    1335            0 :                     Ok(Err(permanent))
    1336              :                 }
    1337              :             }
    1338          900 :         };
    1339       124342 :         self.kind
    1340       124342 :             .get_or_try_init(init)
    1341              :             // return transient errors using `?`
    1342          452 :             .await?
    1343       124342 :             .as_ref()
    1344       124342 :             .map_err(|e| {
    1345            0 :                 // errors are not clonabled, cannot but stringify
    1346            0 :                 // test_broken_timeline matches this string
    1347            0 :                 anyhow::anyhow!("layer loading failed: {e:#}")
    1348       124342 :             })
    1349       124342 :     }
    1350              : 
    1351       124028 :     async fn get_value_reconstruct_data(
    1352       124028 :         &self,
    1353       124028 :         key: Key,
    1354       124028 :         lsn_range: Range<Lsn>,
    1355       124028 :         reconstruct_data: &mut ValueReconstructState,
    1356       124028 :         owner: &Arc<LayerInner>,
    1357       124028 :         ctx: &RequestContext,
    1358       124028 :     ) -> anyhow::Result<ValueReconstructResult> {
    1359       124028 :         use LayerKind::*;
    1360       124028 : 
    1361       124028 :         match self.get(owner, ctx).await? {
    1362       123516 :             Delta(d) => {
    1363       123516 :                 d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
    1364        22647 :                     .await
    1365              :             }
    1366          512 :             Image(i) => {
    1367          512 :                 i.get_value_reconstruct_data(key, reconstruct_data, ctx)
    1368          435 :                     .await
    1369              :             }
    1370              :         }
    1371       124028 :     }
    1372              : 
    1373           10 :     async fn get_values_reconstruct_data(
    1374           10 :         &self,
    1375           10 :         keyspace: KeySpace,
    1376           10 :         lsn_range: Range<Lsn>,
    1377           10 :         reconstruct_data: &mut ValuesReconstructState,
    1378           10 :         owner: &Arc<LayerInner>,
    1379           10 :         ctx: &RequestContext,
    1380           10 :     ) -> Result<(), GetVectoredError> {
    1381           10 :         use LayerKind::*;
    1382           10 : 
    1383           10 :         match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
    1384           10 :             Delta(d) => {
    1385           10 :                 d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
    1386           15 :                     .await
    1387              :             }
    1388            0 :             Image(i) => {
    1389            0 :                 i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
    1390            0 :                     .await
    1391              :             }
    1392              :         }
    1393           10 :     }
    1394              : 
    1395            4 :     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
    1396            4 :         use LayerKind::*;
    1397            4 :         match self.get(owner, ctx).await? {
    1398            4 :             Delta(d) => d.dump(ctx).await?,
    1399            0 :             Image(i) => i.dump(ctx).await?,
    1400              :         }
    1401              : 
    1402            4 :         Ok(())
    1403            4 :     }
    1404              : }
    1405              : 
/// Wrapper around an actual layer implementation.
#[derive(Debug)]
enum LayerKind {
    /// A loaded delta layer.
    Delta(delta_layer::DeltaLayerInner),
    /// A loaded image layer.
    Image(image_layer::ImageLayerInner),
}
    1412              : 
/// Guard for forcing a layer be resident while it exists.
#[derive(Clone)]
pub(crate) struct ResidentLayer {
    /// The layer this guard belongs to.
    owner: Layer,
    /// Strong reference to the downloaded state of the layer.
    // NOTE(review): presumably holding this strong reference is what postpones eviction, as
    // `DownloadedLayer`'s drop is what queues it; confirm against the eviction code.
    downloaded: Arc<DownloadedLayer>,
}
    1419              : 
    1420              : impl std::fmt::Display for ResidentLayer {
    1421         1162 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1422         1162 :         write!(f, "{}", self.owner)
    1423         1162 :     }
    1424              : }
    1425              : 
    1426              : impl std::fmt::Debug for ResidentLayer {
    1427            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1428            0 :         write!(f, "{}", self.owner)
    1429            0 :     }
    1430              : }
    1431              : 
    1432              : impl ResidentLayer {
    1433              :     /// Release the eviction guard, converting back into a plain [`Layer`].
    1434              :     ///
    1435              :     /// You can access the [`Layer`] also by using `as_ref`.
    1436          322 :     pub(crate) fn drop_eviction_guard(self) -> Layer {
    1437          322 :         self.into()
    1438          322 :     }
    1439              : 
    1440              :     /// Loads all keys stored in the layer. Returns key, lsn and value size.
    1441          600 :     #[tracing::instrument(skip_all, fields(layer=%self))]
    1442              :     pub(crate) async fn load_keys<'a>(
    1443              :         &'a self,
    1444              :         ctx: &RequestContext,
    1445              :     ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
    1446              :         use LayerKind::*;
    1447              : 
    1448              :         let owner = &self.owner.0;
    1449              : 
    1450              :         match self.downloaded.get(owner, ctx).await? {
    1451              :             Delta(ref d) => {
    1452              :                 owner
    1453              :                     .access_stats
    1454              :                     .record_access(LayerAccessKind::KeyIter, ctx);
    1455              : 
    1456              :                 // this is valid because the DownloadedLayer::kind is a OnceCell, not a
    1457              :                 // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
    1458              :                 // while it's being held.
    1459              :                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
    1460              :                     .await
    1461              :                     .context("Layer index is corrupted")
    1462              :             }
    1463              :             Image(_) => anyhow::bail!("cannot load_keys on a image layer"),
    1464              :         }
    1465              :     }
    1466              : 
    1467         1109 :     pub(crate) fn local_path(&self) -> &Utf8Path {
    1468         1109 :         &self.owner.0.path
    1469         1109 :     }
    1470              : 
    1471          562 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
    1472          562 :         self.owner.metadata()
    1473          562 :     }
    1474              : }
    1475              : 
    1476              : impl AsLayerDesc for ResidentLayer {
    1477         1280 :     fn layer_desc(&self) -> &PersistentLayerDesc {
    1478         1280 :         self.owner.layer_desc()
    1479         1280 :     }
    1480              : }
    1481              : 
    1482              : impl AsRef<Layer> for ResidentLayer {
    1483          584 :     fn as_ref(&self) -> &Layer {
    1484          584 :         &self.owner
    1485          584 :     }
    1486              : }
    1487              : 
    1488              : /// Drop the eviction guard.
    1489              : impl From<ResidentLayer> for Layer {
    1490          322 :     fn from(value: ResidentLayer) -> Self {
    1491          322 :         value.owner
    1492          322 :     }
    1493              : }
    1494              : 
    1495              : use metrics::IntCounter;
    1496              : 
/// Counters and histograms of the `Layer` implementation; the metrics are registered in the
/// `Default` impl.
pub(crate) struct LayerImplMetrics {
    /// Registered as `pageserver_layer_started_evictions`.
    started_evictions: IntCounter,
    /// Registered as `pageserver_layer_completed_evictions`.
    completed_evictions: IntCounter,
    /// One counter per [`EvictionCancelled`] reason, registered as
    /// `pageserver_layer_cancelled_evictions_count` with a `reason` label.
    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,

    /// Registered as `pageserver_layer_started_deletes`.
    started_deletes: IntCounter,
    /// Registered as `pageserver_layer_completed_deletes`.
    completed_deletes: IntCounter,
    /// One counter per [`DeleteFailed`] reason, registered as
    /// `pageserver_layer_failed_deletes_count` with a `reason` label.
    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,

    /// One counter per [`RareEvent`], registered as `pageserver_layer_assumed_rare_count` with
    /// an `event` label.
    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
    /// Registered as `pageserver_layer_inits_cancelled_count`.
    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
    // NOTE(review): presumably the time between an eviction and the following redownload;
    // the registration is not visible here, confirm there.
    redownload_after: metrics::Histogram,
}
    1510              : 
    1511              : impl Default for LayerImplMetrics {
    1512           12 :     fn default() -> Self {
    1513           12 :         use enum_map::Enum;
    1514           12 : 
    1515           12 :         // reminder: these will be pageserver_layer_* with "_total" suffix
    1516           12 : 
    1517           12 :         let started_evictions = metrics::register_int_counter!(
    1518           12 :             "pageserver_layer_started_evictions",
    1519           12 :             "Evictions started in the Layer implementation"
    1520           12 :         )
    1521           12 :         .unwrap();
    1522           12 :         let completed_evictions = metrics::register_int_counter!(
    1523           12 :             "pageserver_layer_completed_evictions",
    1524           12 :             "Evictions completed in the Layer implementation"
    1525           12 :         )
    1526           12 :         .unwrap();
    1527           12 : 
    1528           12 :         let cancelled_evictions = metrics::register_int_counter_vec!(
    1529           12 :             "pageserver_layer_cancelled_evictions_count",
    1530           12 :             "Different reasons for evictions to have been cancelled or failed",
    1531           12 :             &["reason"]
    1532           12 :         )
    1533           12 :         .unwrap();
    1534           12 : 
    1535           96 :         let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1536           96 :             let reason = EvictionCancelled::from_usize(i);
    1537           96 :             let s = reason.as_str();
    1538           96 :             cancelled_evictions.with_label_values(&[s])
    1539           96 :         }));
    1540           12 : 
    1541           12 :         let started_deletes = metrics::register_int_counter!(
    1542           12 :             "pageserver_layer_started_deletes",
    1543           12 :             "Deletions on drop pending in the Layer implementation"
    1544           12 :         )
    1545           12 :         .unwrap();
    1546           12 :         let completed_deletes = metrics::register_int_counter!(
    1547           12 :             "pageserver_layer_completed_deletes",
    1548           12 :             "Deletions on drop completed in the Layer implementation"
    1549           12 :         )
    1550           12 :         .unwrap();
    1551           12 : 
    1552           12 :         let failed_deletes = metrics::register_int_counter_vec!(
    1553           12 :             "pageserver_layer_failed_deletes_count",
    1554           12 :             "Different reasons for deletions on drop to have failed",
    1555           12 :             &["reason"]
    1556           12 :         )
    1557           12 :         .unwrap();
    1558           12 : 
    1559           24 :         let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1560           24 :             let reason = DeleteFailed::from_usize(i);
    1561           24 :             let s = reason.as_str();
    1562           24 :             failed_deletes.with_label_values(&[s])
    1563           24 :         }));
    1564           12 : 
    1565           12 :         let rare_counters = metrics::register_int_counter_vec!(
    1566           12 :             "pageserver_layer_assumed_rare_count",
    1567           12 :             "Times unexpected or assumed rare event happened",
    1568           12 :             &["event"]
    1569           12 :         )
    1570           12 :         .unwrap();
    1571           12 : 
    1572           84 :         let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1573           84 :             let event = RareEvent::from_usize(i);
    1574           84 :             let s = event.as_str();
    1575           84 :             rare_counters.with_label_values(&[s])
    1576           84 :         }));
    1577           12 : 
    1578           12 :         let inits_cancelled = metrics::register_int_counter!(
    1579           12 :             "pageserver_layer_inits_cancelled_count",
    1580           12 :             "Times Layer initialization was cancelled",
    1581           12 :         )
    1582           12 :         .unwrap();
    1583           12 : 
    1584           12 :         let redownload_after = {
    1585           12 :             let minute = 60.0;
    1586           12 :             let hour = 60.0 * minute;
    1587           12 :             metrics::register_histogram!(
    1588           12 :                 "pageserver_layer_redownloaded_after",
    1589           12 :                 "Time between evicting and re-downloading.",
    1590           12 :                 vec![
    1591           12 :                     10.0,
    1592           12 :                     30.0,
    1593           12 :                     minute,
    1594           12 :                     5.0 * minute,
    1595           12 :                     15.0 * minute,
    1596           12 :                     30.0 * minute,
    1597           12 :                     hour,
    1598           12 :                     12.0 * hour,
    1599           12 :                 ]
    1600           12 :             )
    1601           12 :             .unwrap()
    1602           12 :         };
    1603           12 : 
    1604           12 :         Self {
    1605           12 :             started_evictions,
    1606           12 :             completed_evictions,
    1607           12 :             cancelled_evictions,
    1608           12 : 
    1609           12 :             started_deletes,
    1610           12 :             completed_deletes,
    1611           12 :             failed_deletes,
    1612           12 : 
    1613           12 :             rare_counters,
    1614           12 :             inits_cancelled,
    1615           12 :             redownload_after,
    1616           12 :         }
    1617           12 :     }
    1618              : }
    1619              : 
    1620              : impl LayerImplMetrics {
                            // Each helper below bumps exactly one counter, or feeds one histogram, held by
                            // this struct.

                            /// Increments the `started_evictions` counter.
    1621            8 :     fn inc_started_evictions(&self) {
    1622            8 :         self.started_evictions.inc();
    1623            8 :     }
                            /// Increments the `pageserver_layer_completed_evictions` counter.
    1624            6 :     fn inc_completed_evictions(&self) {
    1625            6 :         self.completed_evictions.inc();
    1626            6 :     }
                            /// Increments the `pageserver_layer_cancelled_evictions_count` counter whose
                            /// `reason` label belongs to the given `reason`.
    1627            2 :     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
    1628            2 :         self.cancelled_evictions[reason].inc()
    1629            2 :     }
    1630              : 
                            /// Increments the `pageserver_layer_started_deletes` counter.
    1631          302 :     fn inc_started_deletes(&self) {
    1632          302 :         self.started_deletes.inc();
    1633          302 :     }
                            /// Increments the `pageserver_layer_completed_deletes` counter.
    1634          302 :     fn inc_completed_deletes(&self) {
    1635          302 :         self.completed_deletes.inc();
    1636          302 :     }
                            /// Increments the `pageserver_layer_failed_deletes_count` counter whose `reason`
                            /// label belongs to the given `reason`.
    1637            0 :     fn inc_deletes_failed(&self, reason: DeleteFailed) {
    1638            0 :         self.failed_deletes[reason].inc();
    1639            0 :     }
    1640              : 
    1641              :     /// Counted separately from failed layer deletes because we will complete the layer deletion
    1642              :     /// attempt regardless of failure to delete local file.
    1643            0 :     fn inc_delete_removes_failed(&self) {
    1644            0 :         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
    1645            0 :     }
    1646              : 
    1647              :     /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
    1648            2 :     fn inc_retried_get_or_maybe_download(&self) {
    1649            2 :         self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
    1650            2 :     }
    1651              : 
    1652              :     /// Expected rare because cancellations are unexpected, and failures are unexpected
    1653            0 :     fn inc_download_failed_without_requester(&self) {
    1654            0 :         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
    1655            0 :     }
    1656              : 
    1657              :     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
    1658              :     ///
    1659              :     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    1660              :     /// Option.
    1661            0 :     fn inc_raced_wanted_evicted_accesses(&self) {
    1662            0 :         self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
    1663            0 :     }
    1664              : 
    1665              :     /// These are only expected for [`Self::inc_init_cancelled`] amount when
    1666              :     /// running with remote storage.
    1667            2 :     fn inc_init_needed_no_download(&self) {
    1668            2 :         self.rare_counters[RareEvent::InitWithoutDownload].inc();
    1669            2 :     }
    1670              : 
    1671              :     /// Expected rare because all layer files should be readable and good
    1672            0 :     fn inc_permanent_loading_failures(&self) {
    1673            0 :         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
    1674            0 :     }
    1675              : 
                            /// Increments the rare-event counter for `RareEvent::EvictAndWaitLagged`, which is
                            /// exported under the `broadcast_lagged` event label.
    1676            2 :     fn inc_broadcast_lagged(&self) {
    1677            2 :         self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
    1678            2 :     }
    1679              : 
                            /// Increments the `pageserver_layer_inits_cancelled_count` counter.
    1680            4 :     fn inc_init_cancelled(&self) {
    1681            4 :         self.inits_cancelled.inc()
    1682            4 :     }
    1683              : 
                            /// Records `duration`, converted to fractional seconds, in the
                            /// `pageserver_layer_redownloaded_after` histogram.
    1684            0 :     fn record_redownloaded_after(&self, duration: std::time::Duration) {
    1685            0 :         self.redownload_after.observe(duration.as_secs_f64())
    1686            0 :     }
    1687              : }
    1688              : 
    1689           98 : #[derive(enum_map::Enum)]
                        /// Reason recorded on the `pageserver_layer_cancelled_evictions_count` counter vector
                        /// ("Different reasons for evictions to have been cancelled or failed").
                        ///
                        /// `LayerImplMetrics` creates one counter per variant up front; the `reason` label
                        /// text for each variant comes from `EvictionCancelled::as_str`.
    1690              : enum EvictionCancelled {
    1691              :     LayerGone,
    1692              :     TimelineGone,
    1693              :     VersionCheckFailed,
    1694              :     FileNotFound,
    1695              :     RemoveFailed,
    1696              :     AlreadyReinitialized,
    1697              :     /// Not evicted because of a pending reinitialization
    1698              :     LostToDownload,
    1699              :     /// After eviction, there was a new layer access which cancelled the eviction.
    1700              :     UpgradedBackOnAccess,
    1701              : }
    1702              : 
    1703              : impl EvictionCancelled {
                            /// Returns the string used as the `reason` label value for this variant.
                            ///
                            /// The labels are not always the snake_case variant name:
                            /// `VersionCheckFailed` maps to `"version_check_fail"`.
    1704           96 :     fn as_str(&self) -> &'static str {
    1705           96 :         match self {
    1706           12 :             EvictionCancelled::LayerGone => "layer_gone",
    1707           12 :             EvictionCancelled::TimelineGone => "timeline_gone",
    1708           12 :             EvictionCancelled::VersionCheckFailed => "version_check_fail",
    1709           12 :             EvictionCancelled::FileNotFound => "file_not_found",
    1710           12 :             EvictionCancelled::RemoveFailed => "remove_failed",
    1711           12 :             EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
    1712           12 :             EvictionCancelled::LostToDownload => "lost_to_download",
    1713           12 :             EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
    1714              :         }
    1715           96 :     }
    1716              : }
    1717              : 
    1718           24 : #[derive(enum_map::Enum)]
                        /// Reason recorded on the `pageserver_layer_failed_deletes_count` counter vector
                        /// ("Different reasons for deletions on drop to have failed").
                        ///
                        /// `LayerImplMetrics` creates one counter per variant up front; the `reason` label
                        /// text for each variant comes from `DeleteFailed::as_str`.
    1719              : enum DeleteFailed {
    1720              :     TimelineGone,
    1721              :     DeleteSchedulingFailed,
    1722              : }
    1723              : 
    1724              : impl DeleteFailed {
                            /// Returns the string used as the `reason` label value for this variant.
    1725           24 :     fn as_str(&self) -> &'static str {
    1726           24 :         match self {
    1727           12 :             DeleteFailed::TimelineGone => "timeline_gone",
    1728           12 :             DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
    1729              :         }
    1730           24 :     }
    1731              : }
    1732              : 
    1733           90 : #[derive(enum_map::Enum)]
                        /// Event recorded on the `pageserver_layer_assumed_rare_count` counter vector
                        /// ("Times unexpected or assumed rare event happened").
                        ///
                        /// `LayerImplMetrics` creates one counter per variant up front; the `event` label
                        /// text for each variant comes from `RareEvent::as_str`.
    1734              : enum RareEvent {
    1735              :     RemoveOnDropFailed,
    1736              :     RetriedGetOrMaybeDownload,
    1737              :     DownloadFailedWithoutRequester,
    1738              :     UpgradedWantedEvicted,
    1739              :     InitWithoutDownload,
    1740              :     PermanentLoadingFailure,
    1741              :     EvictAndWaitLagged,
    1742              : }
    1743              : 
    1744              : impl RareEvent {
                            /// Returns the string used as the `event` label value for this variant.
                            ///
                            /// Several labels differ from the variant name, so search for the label text
                            /// rather than the variant when correlating with exported metrics: for example
                            /// `RetriedGetOrMaybeDownload` is `"retried_gomd"`, `UpgradedWantedEvicted` is
                            /// `"raced_wanted_evicted"` and `EvictAndWaitLagged` is `"broadcast_lagged"`.
    1745           84 :     fn as_str(&self) -> &'static str {
    1746           84 :         use RareEvent::*;
    1747           84 : 
    1748           84 :         match self {
    1749           12 :             RemoveOnDropFailed => "remove_on_drop_failed",
    1750           12 :             RetriedGetOrMaybeDownload => "retried_gomd",
    1751           12 :             DownloadFailedWithoutRequester => "download_failed_without",
    1752           12 :             UpgradedWantedEvicted => "raced_wanted_evicted",
    1753           12 :             InitWithoutDownload => "init_needed_no_download",
    1754           12 :             PermanentLoadingFailure => "permanent_loading_failure",
    1755           12 :             EvictAndWaitLagged => "broadcast_lagged",
    1756              :         }
    1757           84 :     }
    1758              : }
    1759              : 
    1760              : pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
                            // Crate-wide shared instance. `Lazy` defers `LayerImplMetrics::default` (which
                            // registers the metrics and unwraps the results) until the first access.
    1761              :     once_cell::sync::Lazy::new(LayerImplMetrics::default);
        

Generated by: LCOV version 2.1-beta