LCOV - differential code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - layer.rs (source / functions) Coverage Total Hit UBC CBC
Current: cd44433dd675caa99df17a61b18949c8387e2242.info Lines: 86.4 % 1066 921 145 921
Current Date: 2024-01-09 02:06:09 Functions: 80.5 % 149 120 29 120
Baseline: 66c52a629a0f4a503e193045e0df4c77139e344b.info
Baseline Date: 2024-01-08 15:34:46

           TLA  Line data    Source code
       1                 : use anyhow::Context;
       2                 : use camino::{Utf8Path, Utf8PathBuf};
       3                 : use pageserver_api::models::{
       4                 :     HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
       5                 : };
       6                 : use pageserver_api::shard::ShardIndex;
       7                 : use std::ops::Range;
       8                 : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
       9                 : use std::sync::{Arc, Weak};
      10                 : use std::time::SystemTime;
      11                 : use tracing::Instrument;
      12                 : use utils::lsn::Lsn;
      13                 : use utils::sync::heavier_once_cell;
      14                 : 
      15                 : use crate::config::PageServerConf;
      16                 : use crate::context::RequestContext;
      17                 : use crate::repository::Key;
      18                 : use crate::tenant::{remote_timeline_client::LayerFileMetadata, RemoteTimelineClient, Timeline};
      19                 : 
      20                 : use super::delta_layer::{self, DeltaEntry};
      21                 : use super::image_layer;
      22                 : use super::{
      23                 :     AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
      24                 :     ValueReconstructResult, ValueReconstructState,
      25                 : };
      26                 : 
      27                 : use utils::generation::Generation;
      28                 : 
      29                 : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
      30                 : /// range of LSNs.
      31                 : ///
      32                 : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
      33                 : /// layers are used to ingest incoming WAL, and provide fast access to the
      34                 : /// recent page versions. On-disk layers are stored as files on disk, and are
      35                 : /// immutable. This type represents the on-disk kind, while the in-memory kind is represented by
      36                 : /// [`InMemoryLayer`].
      37                 : ///
      38                 : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
      39                 : /// A delta layer contains all modifications within a range of LSNs and keys.
      40                 : /// An image layer is a snapshot of all the data in a key-range, at a single
      41                 : /// LSN.
      42                 : ///
      43                 : /// This type models the on-disk layers, which can be evicted and on-demand downloaded.
      44                 : ///
      45                 : /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
      46 CBC    31058985 : #[derive(Clone)]
      47                 : pub(crate) struct Layer(Arc<LayerInner>);
      48                 : 
      49                 : impl std::fmt::Display for Layer {
      50           31291 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      51           31291 :         if matches!(self.0.generation, Generation::Broken) {
      52 UBC           0 :             write!(f, "{}-broken", self.layer_desc().short_id())
      53                 :         } else {
      54 CBC       31291 :             write!(
      55           31291 :                 f,
      56           31291 :                 "{}{}",
      57           31291 :                 self.layer_desc().short_id(),
      58           31291 :                 self.0.generation.get_suffix()
      59           31291 :             )
      60                 :         }
      61           31291 :     }
      62                 : }
      63                 : 
      64                 : impl std::fmt::Debug for Layer {
      65 UBC           0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      66               0 :         write!(f, "{}", self)
      67               0 :     }
      68                 : }
      69                 : 
      70                 : impl AsLayerDesc for Layer {
      71 CBC    47123277 :     fn layer_desc(&self) -> &PersistentLayerDesc {
      72        47123277 :         self.0.layer_desc()
      73        47123277 :     }
      74                 : }
      75                 : 
      76                 : impl Layer {
      77                 :     /// Creates a layer value for a file we know to not be resident.
      78           43720 :     pub(crate) fn for_evicted(
      79           43720 :         conf: &'static PageServerConf,
      80           43720 :         timeline: &Arc<Timeline>,
      81           43720 :         file_name: LayerFileName,
      82           43720 :         metadata: LayerFileMetadata,
      83           43720 :     ) -> Self {
      84           43720 :         let desc = PersistentLayerDesc::from_filename(
      85           43720 :             timeline.tenant_shard_id,
      86           43720 :             timeline.timeline_id,
      87           43720 :             file_name,
      88           43720 :             metadata.file_size(),
      89           43720 :         );
      90           43720 : 
      91           43720 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
      92           43720 : 
      93           43720 :         let owner = Layer(Arc::new(LayerInner::new(
      94           43720 :             conf,
      95           43720 :             timeline,
      96           43720 :             access_stats,
      97           43720 :             desc,
      98           43720 :             None,
      99           43720 :             metadata.generation,
     100           43720 :             metadata.shard,
     101           43720 :         )));
     102                 : 
     103           43720 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
     104                 : 
     105           43720 :         owner
     106           43720 :     }
     107                 : 
     108                 :     /// Creates a Layer value for a file we know to be resident in timeline directory.
     109           14085 :     pub(crate) fn for_resident(
     110           14085 :         conf: &'static PageServerConf,
     111           14085 :         timeline: &Arc<Timeline>,
     112           14085 :         file_name: LayerFileName,
     113           14085 :         metadata: LayerFileMetadata,
     114           14085 :     ) -> ResidentLayer {
     115           14085 :         let desc = PersistentLayerDesc::from_filename(
     116           14085 :             timeline.tenant_shard_id,
     117           14085 :             timeline.timeline_id,
     118           14085 :             file_name,
     119           14085 :             metadata.file_size(),
     120           14085 :         );
     121           14085 : 
     122           14085 :         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
     123           14085 : 
     124           14085 :         let mut resident = None;
     125           14085 : 
     126           14085 :         let owner = Layer(Arc::new_cyclic(|owner| {
     127           14085 :             let inner = Arc::new(DownloadedLayer {
     128           14085 :                 owner: owner.clone(),
     129           14085 :                 kind: tokio::sync::OnceCell::default(),
     130           14085 :                 version: 0,
     131           14085 :             });
     132           14085 :             resident = Some(inner.clone());
     133           14085 : 
     134           14085 :             LayerInner::new(
     135           14085 :                 conf,
     136           14085 :                 timeline,
     137           14085 :                 access_stats,
     138           14085 :                 desc,
     139           14085 :                 Some(inner),
     140           14085 :                 metadata.generation,
     141           14085 :                 metadata.shard,
     142           14085 :             )
     143           14085 :         }));
     144           14085 : 
     145           14085 :         let downloaded = resident.expect("just initialized");
     146                 : 
     147           14085 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
     148                 : 
     149           14085 :         timeline
     150           14085 :             .metrics
     151           14085 :             .resident_physical_size_add(metadata.file_size());
     152           14085 : 
     153           14085 :         ResidentLayer { downloaded, owner }
     154           14085 :     }
     155                 : 
     156                 :     /// Creates a Layer value for freshly written out new layer file by renaming it from a
     157                 :     /// temporary path.
     158           20448 :     pub(crate) fn finish_creating(
     159           20448 :         conf: &'static PageServerConf,
     160           20448 :         timeline: &Arc<Timeline>,
     161           20448 :         desc: PersistentLayerDesc,
     162           20448 :         temp_path: &Utf8Path,
     163           20448 :     ) -> anyhow::Result<ResidentLayer> {
     164           20448 :         let mut resident = None;
     165           20448 : 
     166           20448 :         let owner = Layer(Arc::new_cyclic(|owner| {
     167           20448 :             let inner = Arc::new(DownloadedLayer {
     168           20448 :                 owner: owner.clone(),
     169           20448 :                 kind: tokio::sync::OnceCell::default(),
     170           20448 :                 version: 0,
     171           20448 :             });
     172           20448 :             resident = Some(inner.clone());
     173           20448 :             let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
     174           20448 :             access_stats.record_residence_event(
     175           20448 :                 LayerResidenceStatus::Resident,
     176           20448 :                 LayerResidenceEventReason::LayerCreate,
     177           20448 :             );
     178           20448 :             LayerInner::new(
     179           20448 :                 conf,
     180           20448 :                 timeline,
     181           20448 :                 access_stats,
     182           20448 :                 desc,
     183           20448 :                 Some(inner),
     184           20448 :                 timeline.generation,
     185           20448 :                 timeline.get_shard_index(),
     186           20448 :             )
     187           20448 :         }));
     188           20448 : 
     189           20448 :         let downloaded = resident.expect("just initialized");
     190           20448 : 
     191           20448 :         // if the rename works, the path is as expected
     192           20448 :         std::fs::rename(temp_path, owner.local_path())
     193           20448 :             .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
     194                 : 
     195           20448 :         Ok(ResidentLayer { downloaded, owner })
     196           20448 :     }
     197                 : 
     198                 :     /// Requests the layer to be evicted and waits for this to be done.
     199                 :     ///
     200                 :     /// If the file is not resident, an [`EvictionError::NotFound`] is returned.
     201                 :     ///
     202                 :     /// If, through bad luck or blocking of the executor, we miss the actual eviction and the layer is
     203                 :     /// re-downloaded, [`EvictionError::Downloaded`] is returned.
     204                 :     ///
     205                 :     /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
     206                 :     /// of download-evict cycle on retry.
     207            2539 :     pub(crate) async fn evict_and_wait(
     208            2539 :         &self,
     209            2539 :         rtc: &RemoteTimelineClient,
     210            2539 :     ) -> Result<(), EvictionError> {
     211            2539 :         self.0.evict_and_wait(rtc).await
     212            2539 :     }
     213                 : 
     214                 :     /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
     215                 :     /// then.
     216                 :     ///
     217                 :     /// On drop, this will cause a call to [`RemoteTimelineClient::schedule_deletion_of_unlinked`].
     218                 :     /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
     219                 :     /// the value this is called on gets dropped.
     220                 :     ///
     221                 :     /// This is ensured by both of those methods accepting references to Layer.
     222                 :     ///
     223                 :     /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
     224                 :     /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
     225            5005 :     pub(crate) fn delete_on_drop(&self) {
     226            5005 :         self.0.delete_on_drop();
     227            5005 :     }
     228                 : 
     229                 :     /// Return data needed to reconstruct given page at LSN.
     230                 :     ///
     231                 :     /// It is up to the caller to collect more data from the previous layer and
     232                 :     /// perform WAL redo, if necessary.
     233                 :     ///
     234                 :     /// # Cancellation-Safety
     235                 :     ///
     236                 :     /// This method is cancellation-safe.
     237        15454855 :     pub(crate) async fn get_value_reconstruct_data(
     238        15454855 :         &self,
     239        15454855 :         key: Key,
     240        15454855 :         lsn_range: Range<Lsn>,
     241        15454855 :         reconstruct_data: &mut ValueReconstructState,
     242        15454855 :         ctx: &RequestContext,
     243        15454875 :     ) -> anyhow::Result<ValueReconstructResult> {
     244                 :         use anyhow::ensure;
     245                 : 
     246        15454875 :         let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     247        15454865 :         self.0
     248        15454865 :             .access_stats
     249        15454865 :             .record_access(LayerAccessKind::GetValueReconstructData, ctx);
     250        15454865 : 
     251        15454865 :         if self.layer_desc().is_delta {
     252        15002765 :             ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
     253        15002765 :             ensure!(self.layer_desc().key_range.contains(&key));
     254                 :         } else {
     255          452100 :             ensure!(self.layer_desc().key_range.contains(&key));
     256          452100 :             ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
     257          452100 :             ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
     258                 :         }
     259                 : 
     260        15454865 :         layer
     261        15454865 :             .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
     262        15454865 :             .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
     263          862444 :             .await
     264        15454863 :             .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     265        15454864 :     }
     266                 : 
     267                 :     /// Download the layer if evicted.
     268                 :     ///
     269                 :     /// Will not error when the layer is already downloaded.
     270              12 :     pub(crate) async fn download(&self) -> anyhow::Result<()> {
     271              32 :         self.0.get_or_maybe_download(true, None).await?;
     272               7 :         Ok(())
     273              12 :     }
     274                 : 
     275                 :     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     276                 :     /// while the guard exists.
     277                 :     ///
     278                 :     /// Returns None if the layer is currently evicted.
     279            4795 :     pub(crate) async fn keep_resident(&self) -> anyhow::Result<Option<ResidentLayer>> {
     280            4795 :         let downloaded = match self.0.get_or_maybe_download(false, None).await {
     281            4171 :             Ok(d) => d,
     282                 :             // technically there are a lot of possible errors, but in practice it should only be
     283                 :             // DownloadRequired which is tripped up. could work to improve this situation
     284                 :             // statically later.
     285             624 :             Err(DownloadError::DownloadRequired) => return Ok(None),
     286 UBC           0 :             Err(e) => return Err(e.into()),
     287                 :         };
     288                 : 
     289 CBC        4171 :         Ok(Some(ResidentLayer {
     290            4171 :             downloaded,
     291            4171 :             owner: self.clone(),
     292            4171 :         }))
     293            4795 :     }
     294                 : 
     295                 :     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
     296            3710 :     pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
     297            3710 :         let downloaded = self.0.get_or_maybe_download(true, None).await?;
     298                 : 
     299            3710 :         Ok(ResidentLayer {
     300            3710 :             downloaded,
     301            3710 :             owner: self.clone(),
     302            3710 :         })
     303            3710 :     }
     304                 : 
     305            2974 :     pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     306            2974 :         self.0.info(reset)
     307            2974 :     }
     308                 : 
     309            4170 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
     310            4170 :         &self.0.access_stats
     311            4170 :     }
     312                 : 
     313           21343 :     pub(crate) fn local_path(&self) -> &Utf8Path {
     314           21343 :         &self.0.path
     315           21343 :     }
     316                 : 
     317           20463 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
     318           20463 :         self.0.metadata()
     319           20463 :     }
     320                 : 
     321                 :     /// Traditional debug dumping facility
     322                 :     #[allow(unused)]
     323               2 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
     324               2 :         self.0.desc.dump();
     325               2 : 
     326               2 :         if verbose {
     327                 :             // for now, unconditionally download everything, even if that might not be wanted.
     328               2 :             let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
     329               2 :             l.dump(&self.0, ctx).await?
     330 UBC           0 :         }
     331                 : 
     332 CBC           2 :         Ok(())
     333               2 :     }
     334                 : 
     335                 :     /// Waits until this layer has been dropped (and if needed, local file deletion and remote
     336                 :     /// deletion scheduling has completed).
     337                 :     ///
     338                 :     /// Does not start local deletion, use [`Self::delete_on_drop`] for that
     339                 :     /// separately.
     340                 :     #[cfg(feature = "testing")]
     341             702 :     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
     342             702 :         let mut rx = self.0.status.subscribe();
     343                 : 
     344             702 :         async move {
     345                 :             loop {
     346             702 :                 if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
     347             702 :                     break;
     348 UBC           0 :                 }
     349                 :             }
     350 CBC         702 :         }
     351             702 :     }
     352                 : }
     353                 : 
     354                 : /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
     355                 : ///
     356                 : /// However when we want something evicted, we cannot evict it right away as there might be current
     357                 : /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
     358                 : /// read with [`Layer::get_value_reconstruct_data`].
     359                 : ///
     360                 : /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
     361 UBC           0 : #[derive(Debug)]
     362                 : enum ResidentOrWantedEvicted {
     363                 :     Resident(Arc<DownloadedLayer>),
     364                 :     WantedEvicted(Weak<DownloadedLayer>, usize),
     365                 : }
     366                 : 
     367                 : impl ResidentOrWantedEvicted {
     368 CBC    15462735 :     fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
     369        15462735 :         match self {
     370        15462735 :             ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
     371 UBC           0 :             ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
     372               0 :                 Some(strong) => {
     373               0 :                     LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
     374               0 : 
     375               0 :                     *self = ResidentOrWantedEvicted::Resident(strong.clone());
     376               0 : 
     377               0 :                     Some((strong, true))
     378                 :                 }
     379               0 :                 None => None,
     380                 :             },
     381                 :         }
     382 CBC    15462735 :     }
     383                 : 
     384                 :     /// When eviction is first requested, drop down to holding a [`Weak`].
     385                 :     ///
     386                 :     /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
     387                 :     /// drop the possibly last strong reference outside of the mutex of
     388                 :     /// heavier_once_cell::OnceCell.
     389            2538 :     fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
     390            2538 :         match self {
     391            2538 :             ResidentOrWantedEvicted::Resident(strong) => {
     392            2538 :                 let weak = Arc::downgrade(strong);
     393            2538 :                 let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
     394            2538 :                 std::mem::swap(self, &mut temp);
     395            2538 :                 match temp {
     396            2538 :                     ResidentOrWantedEvicted::Resident(strong) => Some(strong),
     397 UBC           0 :                     ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
     398                 :                 }
     399                 :             }
     400               0 :             ResidentOrWantedEvicted::WantedEvicted(..) => None,
     401                 :         }
     402 CBC        2538 :     }
     403                 : }
     404                 : 
     405                 : struct LayerInner {
     406                 :     /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
     407                 :     /// [`Self::path`].
     408                 :     conf: &'static PageServerConf,
     409                 : 
     410                 :     /// Full path to the file; unclear if this should exist anymore.
     411                 :     path: Utf8PathBuf,
     412                 : 
     413                 :     desc: PersistentLayerDesc,
     414                 : 
     415                 :     /// Timeline access is needed for remote timeline client and metrics.
     416                 :     timeline: Weak<Timeline>,
     417                 : 
     418                 :     /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
     419                 :     have_remote_client: bool,
     420                 : 
     421                 :     access_stats: LayerAccessStats,
     422                 : 
     423                 :     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
     424                 :     /// Initialization and deinitialization are done while holding a permit.
     425                 :     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
     426                 : 
     427                 :     /// Do we want to delete this locally and remotely when `LayerInner` is dropped?
     428                 :     wanted_deleted: AtomicBool,
     429                 : 
     430                 :     /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
     431                 :     /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
     432                 :     /// [`LayerInner::on_downloaded_layer_drop`].
     433                 :     wanted_evicted: AtomicBool,
     434                 : 
     435                 :     /// Version is to make sure we will only evict a specific download of a file.
     436                 :     ///
     437                 :     /// Incremented for each download, stored in `DownloadedLayer::version` or
     438                 :     /// `ResidentOrWantedEvicted::WantedEvicted`.
     439                 :     version: AtomicUsize,
     440                 : 
     441                 :     /// Allow subscribing to when the layer actually gets evicted.
     442                 :     status: tokio::sync::broadcast::Sender<Status>,
     443                 : 
     444                 :     /// Counter for exponential backoff with the download
     445                 :     consecutive_failures: AtomicUsize,
     446                 : 
     447                 :     /// The generation of this Layer.
     448                 :     ///
     449                 :     /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
     450                 :     /// for created layers from [`Timeline::generation`].
     451                 :     generation: Generation,
     452                 : 
     453                 :     /// The shard of this Layer.
     454                 :     ///
     455                 :     /// For layers created in this process, this will always be the [`ShardIndex`] of the
     456                 :     /// current `ShardIdentity` (TODO: add link once it's introduced).
     457                 :     ///
     458                 :     /// For loaded layers, this may be some other value if the tenant has undergone
     459                 :     /// a shard split since the layer was originally written.
     460                 :     shard: ShardIndex,
     461                 : 
     462                 :     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
     463                 : }
     464                 : 
     465                 : impl std::fmt::Display for LayerInner {
     466           21984 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     467           21984 :         write!(f, "{}", self.layer_desc().short_id())
     468           21984 :     }
     469                 : }
     470                 : 
     471                 : impl AsLayerDesc for LayerInner {
     472        47166786 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     473        47166786 :         &self.desc
     474        47166786 :     }
     475                 : }
     476                 : 
     477            2538 : #[derive(Debug, Clone, Copy)]
     478                 : enum Status {
     479                 :     Evicted,
     480                 :     Downloaded,
     481                 : }
     482                 : 
     483                 : impl Drop for LayerInner {
     484           46995 :     fn drop(&mut self) { // if `wanted_deleted` is set: remove the local file and schedule the remote deletion
     485           46995 :         if !*self.wanted_deleted.get_mut() {
     486                 :             // not marked for deletion, nothing to do. should we try to evict if the last wish
     487                 :             // was for eviction? feels like there's some hazard of overcrowding near shutdown,
     488                 :             // but we don't run drops during shutdown (yet)
     489           42690 :             return;
     490            4305 :         }
     491                 : 
     492            4305 :         let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
     493                 : 
     494            4305 :         let path = std::mem::take(&mut self.path); // move the path out of `self` for the blocking task below
     495            4305 :         let file_name = self.layer_desc().filename();
     496            4305 :         let file_size = self.layer_desc().file_size;
     497            4305 :         let timeline = self.timeline.clone();
     498            4305 :         let meta = self.metadata();
     499            4305 :         let status = self.status.clone();
     500            4305 : 
     501            4305 :         crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
     502            4305 :             let _g = span.entered();
     503            4305 : 
     504            4305 :             // keep this sender clone alive until we are finished, for [`Layer::wait_drop`] support
     505            4305 :             let _status = status;
     506                 : 
     507            4305 :             let removed = match std::fs::remove_file(path) {
     508            4303 :                 Ok(()) => true,
     509               2 :                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
     510               2 :                     // until we no longer do detaches by removing all local files before removing the
     511               2 :                     // tenant from the global map, we will always get these errors even if we knew what
     512               2 :                     // the latest state is.
     513               2 :                     //
     514               2 :                     // we currently do not track the latest state, so we'll also end up here on evicted
     515               2 :                     // layers.
     516               2 :                     false
     517                 :                 }
     518 UBC           0 :                 Err(e) => {
     519               0 :                     tracing::error!("failed to remove wanted deleted layer: {e}");
     520               0 :                     LAYER_IMPL_METRICS.inc_delete_removes_failed();
     521               0 :                     false
     522                 :                 }
     523                 :             };
     524                 : 
     525 CBC        4305 :             if let Some(timeline) = timeline.upgrade() {
     526            4305 :                 if removed {
     527            4303 :                     timeline.metrics.resident_physical_size_sub(file_size);
     528            4303 :                 }
     529            4305 :                 if let Some(remote_client) = timeline.remote_client.as_ref() {
     530            4305 :                     let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
     531                 : 
     532            4305 :                     if let Err(e) = res {
     533                 :                         // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
     534                 :                         // demonstrating this deadlock (without spawn_blocking): stop will drop
     535                 :                         // queued items, which will have ResidentLayer's, and those drops would try
     536                 :                         // to re-entrantly lock the RemoteTimelineClient inner state.
     537               7 :                         if !timeline.is_active() {
     538               7 :                             tracing::info!("scheduling deletion on drop failed: {e:#}");
     539                 :                         } else {
     540 UBC           0 :                             tracing::warn!("scheduling deletion on drop failed: {e:#}");
     541                 :                         }
     542 CBC           7 :                         LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
     543            4298 :                     } else {
     544            4298 :                         LAYER_IMPL_METRICS.inc_completed_deletes();
     545            4298 :                     }
     546 UBC           0 :                 }
     547               0 :             } else {
     548               0 :                 // no need to nag that the timeline is gone: under normal circumstances, on
     549               0 :                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
     550               0 :                 LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
     551               0 :             }
     552 CBC        4305 :         });
     553           46995 :     }
     554                 : }
     555                 : 
     556                 : impl LayerInner {
     557           78253 :     fn new( // builds the in-memory layer state; `downloaded` is `Some` when the layer starts out resident
     558           78253 :         conf: &'static PageServerConf,
     559           78253 :         timeline: &Arc<Timeline>,
     560           78253 :         access_stats: LayerAccessStats,
     561           78253 :         desc: PersistentLayerDesc,
     562           78253 :         downloaded: Option<Arc<DownloadedLayer>>,
     563           78253 :         generation: Generation,
     564           78253 :         shard: ShardIndex,
     565           78253 :     ) -> Self {
     566           78253 :         let path = conf
     567           78253 :             .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
     568           78253 :             .join(desc.filename().to_string()); // local path: timeline directory joined with the layer file name
     569                 : 
     570           78253 :         let (inner, version) = if let Some(inner) = downloaded {
     571           34533 :             let version = inner.version;
     572           34533 :             let resident = ResidentOrWantedEvicted::Resident(inner);
     573           34533 :             (heavier_once_cell::OnceCell::new(resident), version)
     574                 :         } else {
     575           43720 :             (heavier_once_cell::OnceCell::default(), 0) // not resident: uninitialized cell, version starts at 0
     576                 :         };
     577                 : 
     578           78253 :         LayerInner {
     579           78253 :             conf,
     580           78253 :             path,
     581           78253 :             desc,
     582           78253 :             timeline: Arc::downgrade(timeline),
     583           78253 :             have_remote_client: timeline.remote_client.is_some(),
     584           78253 :             access_stats,
     585           78253 :             wanted_deleted: AtomicBool::new(false),
     586           78253 :             wanted_evicted: AtomicBool::new(false),
     587           78253 :             inner,
     588           78253 :             version: AtomicUsize::new(version),
     589           78253 :             status: tokio::sync::broadcast::channel(1).0, // keep only the sender; receivers are created with `subscribe()`
     590           78253 :             consecutive_failures: AtomicUsize::new(0),
     591           78253 :             generation,
     592           78253 :             shard,
     593           78253 :             last_evicted_at: std::sync::Mutex::default(),
     594           78253 :         }
     595           78253 :     }
     596                 : 
     597            5005 :     fn delete_on_drop(&self) { // sets `wanted_deleted`, which `Drop for LayerInner` checks before deleting
     598            5005 :         let res =
     599            5005 :             self.wanted_deleted
     600            5005 :                 .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
     601            5005 : 
     602            5005 :         if res.is_ok() { // only the call that flips false -> true is counted as a started delete
     603            5005 :             LAYER_IMPL_METRICS.inc_started_deletes();
     604            5005 :         }
     605            5005 :     }
     606                 : 
     607                 :     /// Requests eviction and waits for the next broadcast `Status`. Cancellation safe, however
     608                 :     /// dropping the future and calling this method again might result in a new attempt to evict OR join the previously started attempt.
     609            2539 :     pub(crate) async fn evict_and_wait(
     610            2539 :         &self,
     611            2539 :         _: &RemoteTimelineClient,
     612            2539 :     ) -> Result<(), EvictionError> {
     613            2539 :         use tokio::sync::broadcast::error::RecvError;
     614            2539 : 
     615            2539 :         assert!(self.have_remote_client); // panics if the layer was created for a timeline without a remote client
     616                 : 
     617            2539 :         let mut rx = self.status.subscribe(); // subscribe first: a receiver only sees values sent after it subscribed
     618                 : 
     619            2538 :         let strong = {
     620            2539 :             match self.inner.get() {
     621            2538 :                 Some(mut either) => {
     622            2538 :                     self.wanted_evicted.store(true, Ordering::Relaxed);
     623            2538 :                     either.downgrade()
     624                 :                 }
     625               1 :                 None => return Err(EvictionError::NotFound),
     626                 :             }
     627                 :         };
     628                 : 
     629            2538 :         if strong.is_some() {
     630            2538 :             // drop the DownloadedLayer outside of holding the guard
     631            2538 :             drop(strong);
     632            2538 :             LAYER_IMPL_METRICS.inc_started_evictions();
     633            2538 :         }
     634                 : 
     635            2538 :         match rx.recv().await {
     636            2538 :             Ok(Status::Evicted) => Ok(()),
     637 UBC           0 :             Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
     638                 :             Err(RecvError::Closed) => {
     639               0 :                 unreachable!("sender cannot be dropped while we are in &self method")
     640                 :             }
     641                 :             Err(RecvError::Lagged(_)) => {
     642                 :                 // this is quite unlikely, but we are blocking a lot in the async context, so
     643                 :                 // we might be missing this because we are stuck on a LIFO slot on a thread
     644                 :                 // which is busy blocking for a 1TB database create_image_layers.
     645                 :                 //
     646                 :                 // use the current state, however late (compared to the initial expressing of
     647                 :                 // wanted), as the "outcome" now
     648               0 :                 LAYER_IMPL_METRICS.inc_broadcast_lagged();
     649               0 :                 match self.inner.get() {
     650               0 :                     Some(_) => Err(EvictionError::Downloaded),
     651               0 :                     None => Ok(()),
     652                 :                 }
     653                 :             }
     654                 :         }
     655 CBC        2539 :     }
     656                 : 
     657                 :     /// Returns the resident `DownloadedLayer`, initializing it (and downloading when `allow_download`) if needed. Cancellation safe.
     658        15463374 :     async fn get_or_maybe_download(
     659        15463374 :         self: &Arc<Self>,
     660        15463374 :         allow_download: bool,
     661        15463374 :         ctx: Option<&RequestContext>,
     662        15463394 :     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
     663        15463394 :         let mut init_permit = None;
     664                 : 
     665                 :         loop {
     666        15463394 :             let download = move |permit| {
     667           10035 :                 async move {
     668           10035 :                     // bump the version to disable any scheduled but not yet running eviction deletions
     669           10035 :                     let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
     670           10035 : 
     671           10035 :                     // count cancellations, which currently remain largely unexpected
     672           10035 :                     let init_cancelled =
     673           10035 :                         scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
     674           10035 : 
     675           10035 :                     // no need to make the evict_and_wait wait for the actual download to complete
     676           10035 :                     drop(self.status.send(Status::Downloaded));
     677                 : 
     678           10035 :                     let timeline = self
     679           10035 :                         .timeline
     680           10035 :                         .upgrade()
     681           10035 :                         .ok_or_else(|| DownloadError::TimelineShutdown)?;
     682                 : 
     683                 :                     // FIXME: grab a gate
     684                 : 
     685           10035 :                     let can_ever_evict = timeline.remote_client.as_ref().is_some();
     686                 : 
     687                 :                     // check if we really need to be downloaded; could have been already downloaded by a
     688                 :                     // cancelled previous attempt.
     689           10035 :                     let needs_download = self
     690           10035 :                         .needs_download()
     691            9589 :                         .await
     692           10035 :                         .map_err(DownloadError::PreStatFailed)?;
     693                 : 
     694           10035 :                     let permit = if let Some(reason) = needs_download {
     695           10035 :                         if let NeedsDownload::NotFile(ft) = reason {
     696 UBC           0 :                             return Err(DownloadError::NotFile(ft));
     697 CBC       10035 :                         }
     698           10035 : 
     699           10035 :                         // only reset this after we've decided we really need to download. otherwise it'd
     700           10035 :                         // be impossible to mark cancelled downloads for eviction, like one could imagine
     701           10035 :                         // we would like to do for prefetching which was not needed.
     702           10035 :                         self.wanted_evicted.store(false, Ordering::Release);
     703           10035 : 
     704           10035 :                         if !can_ever_evict {
     705 UBC           0 :                             return Err(DownloadError::NoRemoteStorage);
     706 CBC       10035 :                         }
     707                 : 
     708           10035 :                         if let Some(ctx) = ctx {
     709            9396 :                             self.check_expected_download(ctx)?;
     710             639 :                         }
     711                 : 
     712           10035 :                         if !allow_download {
     713                 :                             // this does look weird, but for LayerInner the "downloading" means also changing
     714                 :                             // internal once related state ...
     715             624 :                             return Err(DownloadError::DownloadRequired);
     716            9411 :                         }
     717            9411 : 
     718            9411 :                         tracing::info!(%reason, "downloading on-demand");
     719                 : 
     720           19664 :                         self.spawn_download_and_wait(timeline, permit).await?
     721                 :                     } else {
     722                 :                         // the file is present locally, probably by a previous but cancelled call to
     723                 :                         // get_or_maybe_download. alternatively we might be running without remote storage.
     724 UBC           0 :                         LAYER_IMPL_METRICS.inc_init_needed_no_download();
     725               0 : 
     726               0 :                         permit
     727                 :                     };
     728                 : 
     729 CBC        9396 :                     let since_last_eviction =
     730            9396 :                         self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
     731            9396 :                     if let Some(since_last_eviction) = since_last_eviction {
     732             105 :                         // FIXME: this will not always be recorded correctly until #6028 (the no
     733             105 :                         // download needed branch above)
     734             105 :                         LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
     735            9291 :                     }
     736                 : 
     737            9396 :                     let res = Arc::new(DownloadedLayer {
     738            9396 :                         owner: Arc::downgrade(self),
     739            9396 :                         kind: tokio::sync::OnceCell::default(),
     740            9396 :                         version: next_version,
     741            9396 :                     });
     742            9396 : 
     743            9396 :                     self.access_stats.record_residence_event(
     744            9396 :                         LayerResidenceStatus::Resident,
     745            9396 :                         LayerResidenceEventReason::ResidenceChange,
     746            9396 :                     );
     747            9396 : 
     748            9396 :                     let waiters = self.inner.initializer_count();
     749            9396 :                     if waiters > 0 {
     750             318 :                         tracing::info!(
     751             318 :                             waiters,
     752             318 :                             "completing the on-demand download for other tasks"
     753             318 :                         );
     754            9078 :                     }
     755                 : 
     756            9396 :                     scopeguard::ScopeGuard::into_inner(init_cancelled);
     757            9396 : 
     758            9396 :                     Ok((ResidentOrWantedEvicted::Resident(res), permit))
     759           10026 :                 }
     760           10035 :                 .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
     761           10035 :             };
     762                 : 
     763        15463394 :             if let Some(init_permit) = init_permit.take() {
     764                 :                 // use the already held initialization permit: with it the paths below can no
     765                 :                 // longer be hit, which essentially limits the max loop iterations to 2.
     766 UBC           0 :                 let (value, init_permit) = download(init_permit).await?;
     767               0 :                 let mut guard = self.inner.set(value, init_permit);
     768               0 :                 let (strong, _upgraded) = guard
     769               0 :                     .get_and_upgrade()
     770               0 :                     .expect("init creates strong reference, we held the init permit");
     771               0 :                 return Ok(strong);
     772 CBC    15463394 :             }
     773                 : 
     774 UBC           0 :             let (weak, permit) = {
     775 CBC    15463394 :                 let mut locked = self.inner.get_or_init(download).await?;
     776                 : 
     777        15462755 :                 if let Some((strong, upgraded)) = locked.get_and_upgrade() {
     778        15462755 :                     if upgraded {
     779 UBC           0 :                         // when upgraded back, the Arc<DownloadedLayer> is still available, but
     780               0 :                         // previously an `evict_and_wait` request was received.
     781               0 :                         self.wanted_evicted.store(false, Ordering::Relaxed);
     782               0 : 
     783               0 :                         // error out any `evict_and_wait`
     784               0 :                         drop(self.status.send(Status::Downloaded));
     785               0 :                         LAYER_IMPL_METRICS
     786               0 :                             .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
     787 CBC    15462755 :                     }
     788                 : 
     789        15462755 :                     return Ok(strong);
     790                 :                 } else {
     791                 :                     // path to here: the evict_blocking is stuck on spawn_blocking queue.
     792                 :                     //
     793                 :                     // reset the contents, deactivating the eviction and causing a
     794                 :                     // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
     795 UBC           0 :                     locked.take_and_deinit()
     796               0 :                 }
     797               0 :             };
     798               0 : 
     799               0 :             // unlock first, then drop the weak, but because upgrade failed, we
     800               0 :             // know it cannot be a problem.
     801               0 : 
     802               0 :             assert!(
     803               0 :                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
     804               0 :                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
     805                 :             );
     806                 : 
     807               0 :             init_permit = Some(permit);
     808               0 : 
     809               0 :             LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
     810                 :         }
     811 CBC    15463385 :     }
     812                 : 
     813                 :     /// Nag or fail per the `RequestContext` download behavior: `Download` passes, `Warn`/`Error` log and count, and `Error` fails unless the config treats errors as warnings.
     814            9396 :     fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
     815            9396 :         use crate::context::DownloadBehavior::*;
     816            9396 :         let b = ctx.download_behavior();
     817            9396 :         match b {
     818            9396 :             Download => Ok(()),
     819                 :             Warn | Error => {
     820 UBC           0 :                 tracing::info!(
     821               0 :                     "unexpectedly on-demand downloading for task kind {:?}",
     822               0 :                     ctx.task_kind()
     823               0 :                 );
     824               0 :                 crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
     825                 : 
     826               0 :                 let really_error =
     827               0 :                     matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
     828                 : 
     829               0 :                 if really_error {
     830                 :                     // this check is only probabilistic, seems like a flakiness footgun
     831               0 :                     Err(DownloadError::ContextAndConfigReallyDeniesDownloads)
     832                 :                 } else {
     833               0 :                     Ok(())
     834                 :                 }
     835                 :             }
     836                 :         }
     837 CBC        9396 :     }
     838                 : 
     839                 :     /// Actual download, at most one is executed at a time. Spawns a `RemoteDownloadTask`, waits for its result and hands the init permit back on success.
     840            9411 :     async fn spawn_download_and_wait(
     841            9411 :         self: &Arc<Self>,
     842            9411 :         timeline: Arc<Timeline>,
     843            9411 :         permit: heavier_once_cell::InitPermit,
     844            9411 :     ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
     845            9411 :         let task_name = format!("download layer {}", self);
     846            9411 : 
     847            9411 :         let (tx, rx) = tokio::sync::oneshot::channel();
     848            9411 : 
     849            9411 :         // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
     850            9411 :         // block tenant::mgr::remove_tenant_from_memory.
     851            9411 : 
     852            9411 :         let this: Arc<Self> = self.clone();
     853            9411 : 
     854            9411 :         crate::task_mgr::spawn(
     855            9411 :             &tokio::runtime::Handle::current(),
     856            9411 :             crate::task_mgr::TaskKind::RemoteDownloadTask,
     857            9411 :             Some(self.desc.tenant_shard_id),
     858            9411 :             Some(self.desc.timeline_id),
     859            9411 :             &task_name,
     860            9411 :             false,
     861            9411 :             async move {
     862            9411 : 
     863            9411 :                 let client = timeline
     864            9411 :                     .remote_client
     865            9411 :                     .as_ref()
     866            9411 :                     .expect("checked above with have_remote_client");
     867                 : 
     868            9411 :                 let result = client.download_layer_file(
     869            9411 :                     &this.desc.filename(),
     870            9411 :                     &this.metadata(),
     871            9411 :                     &crate::task_mgr::shutdown_token()
     872            9411 :                 )
     873          376826 :                 .await;
     874                 : 
     875            9408 :                 let result = match result {
     876            9397 :                     Ok(size) => {
     877            9397 :                         timeline.metrics.resident_physical_size_add(size);
     878            9397 :                         Ok(())
     879                 :                     }
     880              11 :                     Err(e) => {
     881              11 :                         let consecutive_failures =
     882              11 :                             this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
     883              11 : 
     884              11 :                         let backoff = utils::backoff::exponential_backoff_duration_seconds(
     885              11 :                             consecutive_failures.min(u32::MAX as usize) as u32,
     886              11 :                             1.5,
     887              11 :                             60.0,
     888              11 :                         );
     889              11 : 
     890              11 :                         let backoff = std::time::Duration::from_secs_f64(backoff);
     891              11 : 
     892              17 :                         tokio::select! {
     893              17 :                             _ = tokio::time::sleep(backoff) => {},
     894              17 :                             _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
     895              17 :                             _ = timeline.cancel.cancelled() => {},
     896              17 :                         };
     897                 : 
     898              11 :                         Err(e)
     899                 :                     }
     900                 :                 };
     901                 : 
     902            9408 :                 if let Err(res) = tx.send((result, permit)) {
     903               6 :                     match res {
     904               1 :                         (Ok(()), _) => {
     905               1 :                             // our caller is cancellation safe so this is fine; if someone
     906               1 :                             // else requests the layer, they'll find it already downloaded.
     907               1 :                             //
     908               1 :                             // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
     909               1 :                             //
     910               1 :                             // FIXME(#6028): however, could be that we should consider marking the
     911               1 :                             // layer for eviction? alas, cannot: because only DownloadedLayer will
     912               1 :                             // handle that.
     913               1 :                         },
     914               5 :                         (Err(e), _) => {
     915               5 :                             // our caller is cancellation safe, but we might be racing with
     916               5 :                             // another attempt to initialize. before we have cancellation
     917               5 :                             // token support: these attempts should converge regardless of
     918               5 :                             // their completion order.
     919               5 :                             tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}");
     920               5 :                             LAYER_IMPL_METRICS.inc_download_failed_without_requester();
     921                 :                         }
     922                 :                     }
     923            9402 :                 }
     924                 : 
     925            9408 :                 Ok(())
     926            9411 :             }
     927            9411 :             .in_current_span(),
     928            9411 :         );
     929           10713 :         match rx.await {
     930            9396 :             Ok((Ok(()), permit)) => {
     931            9396 :                 if let Some(reason) = self
     932            9396 :                     .needs_download()
     933            8951 :                     .await
     934            9396 :                     .map_err(DownloadError::PostStatFailed)?
     935                 :                 {
     936                 :                     // this is really a bug in needs_download or the remote timeline client
     937 UBC           0 :                     panic!("post-condition failed: needs_download returned {reason:?}");
     938 CBC        9396 :                 }
     939            9396 : 
     940            9396 :                 self.consecutive_failures.store(0, Ordering::Relaxed);
     941            9396 :                 tracing::info!("on-demand download successful");
     942                 : 
     943            9396 :                 Ok(permit)
     944                 :             }
     945               6 :             Ok((Err(e), _permit)) => {
     946               6 :                 // the backoff sleep already happened in the spawned task, if it was not cancelled
     947               6 :                 let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
     948               6 :                 tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
     949               6 :                 Err(DownloadError::DownloadFailed)
     950                 :             }
     951 UBC           0 :             Err(_gone) => Err(DownloadError::DownloadCancelled),
     952                 :         }
     953 CBC        9402 :     }
     954                 : 
     955           19431 :     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> { // `Ok(None)`: a regular file of the expected size exists at `self.path`
     956           19431 :         match tokio::fs::metadata(&self.path).await {
     957            9396 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     958           10035 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)), // a missing file is a reason to download, not an error
     959 UBC           0 :             Err(e) => Err(e),
     960                 :         }
     961 CBC       19431 :     }
     962                 : 
     963           57805 :     fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> { // non-async counterpart of `needs_download`, same result mapping
     964           57805 :         match self.path.metadata() {
     965           14085 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
     966           43720 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)), // a missing file is a reason to download, not an error
     967 UBC           0 :             Err(e) => Err(e),
     968                 :         }
     969 CBC       57805 :     }
     970                 : 
     971           23481 :     fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
     972           23481 :         // in future, this should include sha2-256 validation of the file.
     973           23481 :         if !m.is_file() {
     974 UBC           0 :             Err(NeedsDownload::NotFile(m.file_type()))
     975 CBC       23481 :         } else if m.len() != self.desc.file_size {
     976 UBC           0 :             Err(NeedsDownload::WrongSize {
     977               0 :                 actual: m.len(),
     978               0 :                 expected: self.desc.file_size,
     979               0 :             })
     980                 :         } else {
     981 CBC       23481 :             Ok(())
     982                 :         }
     983           23481 :     }
     984                 : 
     985            2974 :     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     986            2974 :         let layer_file_name = self.desc.filename().file_name();
     987            2974 : 
     988            2974 :         // this is not accurate: we could have the file locally but there was a cancellation
     989            2974 :         // and now we are not in sync, or we are currently downloading it.
     990            2974 :         let remote = self.inner.get().is_none();
     991            2974 : 
     992            2974 :         let access_stats = self.access_stats.as_api_model(reset);
     993            2974 : 
     994            2974 :         if self.desc.is_delta {
     995            2393 :             let lsn_range = &self.desc.lsn_range;
     996            2393 : 
     997            2393 :             HistoricLayerInfo::Delta {
     998            2393 :                 layer_file_name,
     999            2393 :                 layer_file_size: self.desc.file_size,
    1000            2393 :                 lsn_start: lsn_range.start,
    1001            2393 :                 lsn_end: lsn_range.end,
    1002            2393 :                 remote,
    1003            2393 :                 access_stats,
    1004            2393 :             }
    1005                 :         } else {
    1006             581 :             let lsn = self.desc.image_layer_lsn();
    1007             581 : 
    1008             581 :             HistoricLayerInfo::Image {
    1009             581 :                 layer_file_name,
    1010             581 :                 layer_file_size: self.desc.file_size,
    1011             581 :                 lsn_start: lsn,
    1012             581 :                 remote,
    1013             581 :                 access_stats,
    1014             581 :             }
    1015                 :         }
    1016            2974 :     }
    1017                 : 
    1018                 :     /// `DownloadedLayer` is being dropped, so it calls this method.
    1019            2538 :     fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
    1020            2538 :         let delete = self.wanted_deleted.load(Ordering::Acquire);
    1021            2538 :         let evict = self.wanted_evicted.load(Ordering::Acquire);
    1022            2538 :         let can_evict = self.have_remote_client;
    1023            2538 : 
    1024            2538 :         if delete {
    1025 UBC           0 :             // do nothing now, only in LayerInner::drop -- this was originally implemented because
    1026               0 :             // we could had already scheduled the deletion at the time.
    1027               0 :             //
    1028               0 :             // FIXME: this is not true anymore, we can safely evict wanted deleted files.
    1029 CBC        2538 :         } else if can_evict && evict {
    1030            2538 :             let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
    1031                 : 
    1032                 :             // downgrade for queueing, in case there's a tear down already ongoing we should not
    1033                 :             // hold it alive.
    1034            2538 :             let this = Arc::downgrade(&self);
    1035            2538 :             drop(self);
    1036            2538 : 
    1037            2538 :             // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
    1038            2538 :             // drop while the `self.inner` is being locked, leading to a deadlock.
    1039            2538 : 
    1040            2538 :             crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
    1041            2538 :                 let _g = span.entered();
    1042                 : 
    1043                 :                 // if LayerInner is already dropped here, do nothing because the delete on drop
    1044                 :                 // has already ran while we were in queue
    1045            2538 :                 let Some(this) = this.upgrade() else {
    1046 UBC           0 :                     LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
    1047               0 :                     return;
    1048                 :                 };
    1049 CBC        2538 :                 match this.evict_blocking(version) {
    1050            2538 :                     Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
    1051 UBC           0 :                     Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
    1052                 :                 }
    1053 CBC        2538 :             });
    1054 UBC           0 :         }
    1055 CBC        2538 :     }
    1056                 : 
    1057            2538 :     fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
    1058                 :         // deleted or detached timeline, don't do anything.
    1059            2538 :         let Some(timeline) = self.timeline.upgrade() else {
    1060 UBC           0 :             return Err(EvictionCancelled::TimelineGone);
    1061                 :         };
    1062                 : 
    1063                 :         // to avoid starting a new download while we evict, keep holding on to the
    1064                 :         // permit.
    1065 CBC        2538 :         let _permit = {
    1066            2538 :             let maybe_downloaded = self.inner.get();
    1067                 : 
    1068            2538 :             let (_weak, permit) = match maybe_downloaded {
    1069            2538 :                 Some(mut guard) => {
    1070            2538 :                     if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
    1071            2538 :                         if *version == only_version {
    1072            2538 :                             guard.take_and_deinit()
    1073                 :                         } else {
    1074                 :                             // this was not for us; maybe there's another eviction job
    1075                 :                             // TODO: does it make any sense to stall here? unique versions do not
    1076                 :                             // matter, we only want to make sure not to evict a resident, which we
    1077                 :                             // are not doing.
    1078 UBC           0 :                             return Err(EvictionCancelled::VersionCheckFailed);
    1079                 :                         }
    1080                 :                     } else {
    1081               0 :                         return Err(EvictionCancelled::AlreadyReinitialized);
    1082                 :                     }
    1083                 :                 }
    1084                 :                 None => {
    1085                 :                     // already deinitialized, perhaps get_or_maybe_download did this and is
    1086                 :                     // currently waiting to reinitialize it
    1087               0 :                     return Err(EvictionCancelled::LostToDownload);
    1088                 :                 }
    1089                 :             };
    1090                 : 
    1091 CBC        2538 :             permit
    1092            2538 :         };
    1093            2538 : 
    1094            2538 :         // now accesses to inner.get_or_init wait on the semaphore or the `_permit`
    1095            2538 : 
    1096            2538 :         self.access_stats.record_residence_event(
    1097            2538 :             LayerResidenceStatus::Evicted,
    1098            2538 :             LayerResidenceEventReason::ResidenceChange,
    1099            2538 :         );
    1100                 : 
    1101            2538 :         let res = match capture_mtime_and_remove(&self.path) {
    1102            2538 :             Ok(local_layer_mtime) => {
    1103            2538 :                 let duration = SystemTime::now().duration_since(local_layer_mtime);
    1104            2538 :                 match duration {
    1105            2538 :                     Ok(elapsed) => {
    1106            2538 :                         timeline
    1107            2538 :                             .metrics
    1108            2538 :                             .evictions_with_low_residence_duration
    1109            2538 :                             .read()
    1110            2538 :                             .unwrap()
    1111            2538 :                             .observe(elapsed);
    1112            2538 :                         tracing::info!(
    1113            2538 :                             residence_millis = elapsed.as_millis(),
    1114            2538 :                             "evicted layer after known residence period"
    1115            2538 :                         );
    1116                 :                     }
    1117                 :                     Err(_) => {
    1118 UBC           0 :                         tracing::info!("evicted layer after unknown residence period");
    1119                 :                     }
    1120                 :                 }
    1121 CBC        2538 :                 timeline.metrics.evictions.inc();
    1122            2538 :                 timeline
    1123            2538 :                     .metrics
    1124            2538 :                     .resident_physical_size_sub(self.desc.file_size);
    1125            2538 : 
    1126            2538 :                 Ok(())
    1127                 :             }
    1128 UBC           0 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
    1129               0 :                 tracing::error!(
    1130               0 :                     layer_size = %self.desc.file_size,
    1131               0 :                     "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
    1132               0 :                 );
    1133               0 :                 Err(EvictionCancelled::FileNotFound)
    1134                 :             }
    1135               0 :             Err(e) => {
    1136               0 :                 tracing::error!("failed to evict file from disk: {e:#}");
    1137               0 :                 Err(EvictionCancelled::RemoveFailed)
    1138                 :             }
    1139                 :         };
    1140                 : 
    1141                 :         // we are still holding the permit, so no new spawn_download_and_wait can happen
    1142 CBC        2538 :         drop(self.status.send(Status::Evicted));
    1143            2538 : 
    1144            2538 :         *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
    1145            2538 : 
    1146            2538 :         res
    1147            2538 :     }
    1148                 : 
    1149           34179 :     fn metadata(&self) -> LayerFileMetadata {
    1150           34179 :         LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
    1151           34179 :     }
    1152                 : }
    1153                 : 
    1154            2538 : fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
    1155            2538 :     let m = path.metadata()?;
    1156            2538 :     let local_layer_mtime = m.modified()?;
    1157            2538 :     std::fs::remove_file(path)?;
    1158            2538 :     Ok(local_layer_mtime)
    1159            2538 : }
    1160                 : 
    1161 UBC           0 : #[derive(Debug, thiserror::Error)]
    1162                 : pub(crate) enum EvictionError {
    1163                 :     #[error("layer was already evicted")]
    1164                 :     NotFound,
    1165                 : 
    1166                 :     /// Evictions must always lose to downloads in races, and this time it happened.
    1167                 :     #[error("layer was downloaded instead")]
    1168                 :     Downloaded,
    1169                 : }
    1170                 : 
    1171                 : /// Error internal to the [`LayerInner::get_or_maybe_download`]
    1172 CBC          13 : #[derive(Debug, thiserror::Error)]
    1173                 : enum DownloadError {
    1174                 :     #[error("timeline has already shutdown")]
    1175                 :     TimelineShutdown,
    1176                 :     #[error("no remote storage configured")]
    1177                 :     NoRemoteStorage,
    1178                 :     #[error("context denies downloading")]
    1179                 :     ContextAndConfigReallyDeniesDownloads,
    1180                 :     #[error("downloading is really required but not allowed by this method")]
    1181                 :     DownloadRequired,
    1182                 :     #[error("layer path exists, but it is not a file: {0:?}")]
    1183                 :     NotFile(std::fs::FileType),
    1184                 :     /// Why no error here? Because it will be reported by page_service. We should had also done
    1185                 :     /// retries already.
    1186                 :     #[error("downloading evicted layer file failed")]
    1187                 :     DownloadFailed,
    1188                 :     #[error("downloading failed, possibly for shutdown")]
    1189                 :     DownloadCancelled,
    1190                 :     #[error("pre-condition: stat before download failed")]
    1191                 :     PreStatFailed(#[source] std::io::Error),
    1192                 :     #[error("post-condition: stat after download failed")]
    1193                 :     PostStatFailed(#[source] std::io::Error),
    1194                 : }
    1195                 : 
    1196 UBC           0 : #[derive(Debug, PartialEq)]
    1197                 : pub(crate) enum NeedsDownload {
    1198                 :     NotFound,
    1199                 :     NotFile(std::fs::FileType),
    1200                 :     WrongSize { actual: u64, expected: u64 },
    1201                 : }
    1202                 : 
    1203                 : impl std::fmt::Display for NeedsDownload {
    1204 CBC        9411 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1205            9411 :         match self {
    1206            9411 :             NeedsDownload::NotFound => write!(f, "file was not found"),
    1207 UBC           0 :             NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
    1208               0 :             NeedsDownload::WrongSize { actual, expected } => {
    1209               0 :                 write!(f, "file size mismatch {actual} vs. {expected}")
    1210                 :             }
    1211                 :         }
    1212 CBC        9411 :     }
    1213                 : }
    1214                 : 
    1215                 : /// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
    1216                 : pub(crate) struct DownloadedLayer {
    1217                 :     owner: Weak<LayerInner>,
    1218                 :     // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
    1219                 :     // DownloadedLayer
    1220                 :     kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    1221                 :     version: usize,
    1222                 : }
    1223                 : 
    1224                 : impl std::fmt::Debug for DownloadedLayer {
    1225 UBC           0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1226               0 :         f.debug_struct("DownloadedLayer")
    1227               0 :             // owner omitted because it is always "Weak"
    1228               0 :             .field("kind", &self.kind)
    1229               0 :             .field("version", &self.version)
    1230               0 :             .finish()
    1231               0 :     }
    1232                 : }
    1233                 : 
    1234                 : impl Drop for DownloadedLayer {
    1235 CBC       24124 :     fn drop(&mut self) {
    1236           24124 :         if let Some(owner) = self.owner.upgrade() {
    1237            2538 :             owner.on_downloaded_layer_drop(self.version);
    1238           21586 :         } else {
    1239           21586 :             // no need to do anything, we are shutting down
    1240           21586 :         }
    1241           24124 :     }
    1242                 : }
    1243                 : 
    1244                 : impl DownloadedLayer {
    1245                 :     /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
    1246                 :     /// initialize it permanently.
    1247                 :     ///
    1248                 :     /// `owner` parameter is a strong reference at the same `LayerInner` as the
    1249                 :     /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
    1250                 :     /// we will always have the LayerInner on the callstack, so we can just use it.
    1251        15458517 :     async fn get<'a>(
    1252        15458517 :         &'a self,
    1253        15458517 :         owner: &Arc<LayerInner>,
    1254        15458517 :         ctx: &RequestContext,
    1255        15458537 :     ) -> anyhow::Result<&'a LayerKind> {
    1256        15458537 :         let init = || async {
    1257           33286 :             assert_eq!(
    1258           33286 :                 Weak::as_ptr(&self.owner),
    1259           33286 :                 Arc::as_ptr(owner),
    1260        15458537 :                 "these are the same, just avoiding the upgrade"
    1261        15458537 :             );
    1262        15458537 : 
    1263        15458537 :             let res = if owner.desc.is_delta {
    1264        15458537 :                 let summary = Some(delta_layer::Summary::expected(
    1265           11055 :                     owner.desc.tenant_shard_id.tenant_id,
    1266           11055 :                     owner.desc.timeline_id,
    1267           11055 :                     owner.desc.key_range.clone(),
    1268           11055 :                     owner.desc.lsn_range.clone(),
    1269           11055 :                 ));
    1270           11055 :                 delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
    1271        15458537 :                     .await
    1272        15458537 :                     .map(|res| res.map(LayerKind::Delta))
    1273        15458537 :             } else {
    1274        15458537 :                 let lsn = owner.desc.image_layer_lsn();
    1275           22231 :                 let summary = Some(image_layer::Summary::expected(
    1276           22231 :                     owner.desc.tenant_shard_id.tenant_id,
    1277           22231 :                     owner.desc.timeline_id,
    1278           22231 :                     owner.desc.key_range.clone(),
    1279           22231 :                     lsn,
    1280           22231 :                 ));
    1281           22231 :                 image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
    1282        15458537 :                     .await
    1283        15458537 :                     .map(|res| res.map(LayerKind::Image))
    1284        15458537 :             };
    1285        15458537 : 
    1286        15458537 :             match res {
    1287        15458537 :                 Ok(Ok(layer)) => Ok(Ok(layer)),
    1288        15458537 :                 Ok(Err(transient)) => Err(transient),
    1289        15458537 :                 Err(permanent) => {
    1290               1 :                     LAYER_IMPL_METRICS.inc_permanent_loading_failures();
    1291               1 :                     // TODO(#5815): we are not logging all errors, so temporarily log them **once**
    1292               1 :                     // here as well
    1293               1 :                     let permanent = permanent.context("load layer");
    1294               1 :                     tracing::error!("layer loading failed permanently: {permanent:#}");
    1295        15458537 :                     Ok(Err(permanent))
    1296        15458537 :                 }
    1297        15458537 :             }
    1298        15458537 :         };
    1299        15458537 :         self.kind
    1300        15458537 :             .get_or_try_init(init)
    1301                 :             // return transient errors using `?`
    1302            1094 :             .await?
    1303        15458537 :             .as_ref()
    1304        15458537 :             .map_err(|e| {
    1305               9 :                 // errors are not clonabled, cannot but stringify
    1306               9 :                 // test_broken_timeline matches this string
    1307               9 :                 anyhow::anyhow!("layer loading failed: {e:#}")
    1308        15458537 :             })
    1309        15458537 :     }
    1310                 : 
    1311        15454845 :     async fn get_value_reconstruct_data(
    1312        15454845 :         &self,
    1313        15454845 :         key: Key,
    1314        15454845 :         lsn_range: Range<Lsn>,
    1315        15454845 :         reconstruct_data: &mut ValueReconstructState,
    1316        15454845 :         owner: &Arc<LayerInner>,
    1317        15454845 :         ctx: &RequestContext,
    1318        15454865 :     ) -> anyhow::Result<ValueReconstructResult> {
    1319        15454865 :         use LayerKind::*;
    1320        15454865 : 
    1321        15454865 :         match self.get(owner, ctx).await? {
    1322        15002756 :             Delta(d) => {
    1323        15002756 :                 d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
    1324          850598 :                     .await
    1325                 :             }
    1326          452100 :             Image(i) => {
    1327          452100 :                 i.get_value_reconstruct_data(key, reconstruct_data, ctx)
    1328           10753 :                     .await
    1329                 :             }
    1330                 :         }
    1331        15454863 :     }
    1332                 : 
    1333               2 :     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
    1334               2 :         use LayerKind::*;
    1335               2 :         match self.get(owner, ctx).await? {
    1336               2 :             Delta(d) => d.dump(ctx).await?,
    1337 UBC           0 :             Image(i) => i.dump(ctx).await?,
    1338                 :         }
    1339                 : 
    1340 CBC           2 :         Ok(())
    1341               2 :     }
    1342                 : }
    1343                 : 
    1344                 : /// Wrapper around an actual layer implementation.
    1345 UBC           0 : #[derive(Debug)]
    1346                 : enum LayerKind {
    1347                 :     Delta(delta_layer::DeltaLayerInner),
    1348                 :     Image(image_layer::ImageLayerInner),
    1349                 : }
    1350                 : 
    1351                 : /// Guard for forcing a layer be resident while it exists.
    1352 CBC       25211 : #[derive(Clone)]
    1353                 : pub(crate) struct ResidentLayer {
    1354                 :     owner: Layer,
    1355                 :     downloaded: Arc<DownloadedLayer>,
    1356                 : }
    1357                 : 
    1358                 : impl std::fmt::Display for ResidentLayer {
    1359           31267 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1360           31267 :         write!(f, "{}", self.owner)
    1361           31267 :     }
    1362                 : }
    1363                 : 
    1364                 : impl std::fmt::Debug for ResidentLayer {
    1365 UBC           0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1366               0 :         write!(f, "{}", self.owner)
    1367               0 :     }
    1368                 : }
    1369                 : 
    1370                 : impl ResidentLayer {
    1371                 :     /// Release the eviction guard, converting back into a plain [`Layer`].
    1372                 :     ///
    1373                 :     /// You can access the [`Layer`] also by using `as_ref`.
    1374 CBC       21615 :     pub(crate) fn drop_eviction_guard(self) -> Layer {
    1375           21615 :         self.into()
    1376           21615 :     }
    1377                 : 
    1378                 :     /// Loads all keys stored in the layer. Returns key, lsn and value size.
    1379 UBC           0 :     #[tracing::instrument(skip_all, fields(layer=%self))]
    1380                 :     pub(crate) async fn load_keys<'a>(
    1381                 :         &'a self,
    1382                 :         ctx: &RequestContext,
    1383                 :     ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
    1384                 :         use LayerKind::*;
    1385                 : 
    1386                 :         let owner = &self.owner.0;
    1387                 : 
    1388                 :         match self.downloaded.get(owner, ctx).await? {
    1389                 :             Delta(ref d) => {
    1390                 :                 owner
    1391                 :                     .access_stats
    1392                 :                     .record_access(LayerAccessKind::KeyIter, ctx);
    1393                 : 
    1394                 :                 // this is valid because the DownloadedLayer::kind is a OnceCell, not a
    1395                 :                 // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
    1396                 :                 // while it's being held.
    1397                 :                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
    1398                 :                     .await
    1399                 :                     .context("Layer index is corrupted")
    1400                 :             }
    1401                 :             Image(_) => anyhow::bail!("cannot load_keys on a image layer"),
    1402                 :         }
    1403                 :     }
    1404                 : 
    1405 CBC       41919 :     pub(crate) fn local_path(&self) -> &Utf8Path {
    1406           41919 :         &self.owner.0.path
    1407           41919 :     }
    1408                 : 
    1409            2893 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
    1410            2893 :         self.owner.access_stats()
    1411            2893 :     }
    1412                 : 
    1413           20463 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
    1414           20463 :         self.owner.metadata()
    1415           20463 :     }
    1416                 : }
    1417                 : 
    1418                 : impl AsLayerDesc for ResidentLayer {
    1419           72650 :     fn layer_desc(&self) -> &PersistentLayerDesc {
    1420           72650 :         self.owner.layer_desc()
    1421           72650 :     }
    1422                 : }
    1423                 : 
    1424                 : impl AsRef<Layer> for ResidentLayer {
    1425           30844 :     fn as_ref(&self) -> &Layer {
    1426           30844 :         &self.owner
    1427           30844 :     }
    1428                 : }
    1429                 : 
    1430                 : /// Drop the eviction guard.
    1431                 : impl From<ResidentLayer> for Layer {
    1432           21615 :     fn from(value: ResidentLayer) -> Self {
    1433           21615 :         value.owner
    1434           21615 :     }
    1435                 : }
    1436                 : 
    1437                 : use metrics::IntCounter;
    1438                 : 
    1439                 : pub(crate) struct LayerImplMetrics {
    1440                 :     started_evictions: IntCounter,
    1441                 :     completed_evictions: IntCounter,
    1442                 :     cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
    1443                 : 
    1444                 :     started_deletes: IntCounter,
    1445                 :     completed_deletes: IntCounter,
    1446                 :     failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
    1447                 : 
    1448                 :     rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
    1449                 :     inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
    1450                 :     redownload_after: metrics::Histogram,
    1451                 : }
    1452                 : 
    1453                 : impl Default for LayerImplMetrics {
    1454             560 :     fn default() -> Self {
    1455             560 :         use enum_map::Enum;
    1456             560 : 
    1457             560 :         // reminder: these will be pageserver_layer_* with "_total" suffix
    1458             560 : 
    1459             560 :         let started_evictions = metrics::register_int_counter!(
    1460             560 :             "pageserver_layer_started_evictions",
    1461             560 :             "Evictions started in the Layer implementation"
    1462             560 :         )
    1463             560 :         .unwrap();
    1464             560 :         let completed_evictions = metrics::register_int_counter!(
    1465             560 :             "pageserver_layer_completed_evictions",
    1466             560 :             "Evictions completed in the Layer implementation"
    1467             560 :         )
    1468             560 :         .unwrap();
    1469             560 : 
    1470             560 :         let cancelled_evictions = metrics::register_int_counter_vec!(
    1471             560 :             "pageserver_layer_cancelled_evictions_count",
    1472             560 :             "Different reasons for evictions to have been cancelled or failed",
    1473             560 :             &["reason"]
    1474             560 :         )
    1475             560 :         .unwrap();
    1476             560 : 
    1477            4480 :         let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1478            4480 :             let reason = EvictionCancelled::from_usize(i);
    1479            4480 :             let s = reason.as_str();
    1480            4480 :             cancelled_evictions.with_label_values(&[s])
    1481            4480 :         }));
    1482             560 : 
    1483             560 :         let started_deletes = metrics::register_int_counter!(
    1484             560 :             "pageserver_layer_started_deletes",
    1485             560 :             "Deletions on drop pending in the Layer implementation"
    1486             560 :         )
    1487             560 :         .unwrap();
    1488             560 :         let completed_deletes = metrics::register_int_counter!(
    1489             560 :             "pageserver_layer_completed_deletes",
    1490             560 :             "Deletions on drop completed in the Layer implementation"
    1491             560 :         )
    1492             560 :         .unwrap();
    1493             560 : 
    1494             560 :         let failed_deletes = metrics::register_int_counter_vec!(
    1495             560 :             "pageserver_layer_failed_deletes_count",
    1496             560 :             "Different reasons for deletions on drop to have failed",
    1497             560 :             &["reason"]
    1498             560 :         )
    1499             560 :         .unwrap();
    1500             560 : 
    1501            1120 :         let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1502            1120 :             let reason = DeleteFailed::from_usize(i);
    1503            1120 :             let s = reason.as_str();
    1504            1120 :             failed_deletes.with_label_values(&[s])
    1505            1120 :         }));
    1506             560 : 
    1507             560 :         let rare_counters = metrics::register_int_counter_vec!(
    1508             560 :             "pageserver_layer_assumed_rare_count",
    1509             560 :             "Times unexpected or assumed rare event happened",
    1510             560 :             &["event"]
    1511             560 :         )
    1512             560 :         .unwrap();
    1513             560 : 
    1514            3920 :         let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    1515            3920 :             let event = RareEvent::from_usize(i);
    1516            3920 :             let s = event.as_str();
    1517            3920 :             rare_counters.with_label_values(&[s])
    1518            3920 :         }));
    1519             560 : 
    1520             560 :         let inits_cancelled = metrics::register_int_counter!(
    1521             560 :             "pageserver_layer_inits_cancelled_count",
    1522             560 :             "Times Layer initialization was cancelled",
    1523             560 :         )
    1524             560 :         .unwrap();
    1525             560 : 
    1526             560 :         let redownload_after = {
    1527             560 :             let minute = 60.0;
    1528             560 :             let hour = 60.0 * minute;
    1529             560 :             metrics::register_histogram!(
    1530             560 :                 "pageserver_layer_redownloaded_after",
    1531             560 :                 "Time between evicting and re-downloading.",
    1532             560 :                 vec![
    1533             560 :                     10.0,
    1534             560 :                     30.0,
    1535             560 :                     minute,
    1536             560 :                     5.0 * minute,
    1537             560 :                     15.0 * minute,
    1538             560 :                     30.0 * minute,
    1539             560 :                     hour,
    1540             560 :                     12.0 * hour,
    1541             560 :                 ]
    1542             560 :             )
    1543             560 :             .unwrap()
    1544             560 :         };
    1545             560 : 
    1546             560 :         Self {
    1547             560 :             started_evictions,
    1548             560 :             completed_evictions,
    1549             560 :             cancelled_evictions,
    1550             560 : 
    1551             560 :             started_deletes,
    1552             560 :             completed_deletes,
    1553             560 :             failed_deletes,
    1554             560 : 
    1555             560 :             rare_counters,
    1556             560 :             inits_cancelled,
    1557             560 :             redownload_after,
    1558             560 :         }
    1559             560 :     }
    1560                 : }
    1561                 : 
    1562                 : impl LayerImplMetrics {
    1563            2538 :     fn inc_started_evictions(&self) {
    1564            2538 :         self.started_evictions.inc();
    1565            2538 :     }
    1566            2538 :     fn inc_completed_evictions(&self) {
    1567            2538 :         self.completed_evictions.inc();
    1568            2538 :     }
    1569 UBC           0 :     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
    1570               0 :         self.cancelled_evictions[reason].inc()
    1571               0 :     }
    1572                 : 
    1573 CBC        5005 :     fn inc_started_deletes(&self) {
    1574            5005 :         self.started_deletes.inc();
    1575            5005 :     }
    1576            4298 :     fn inc_completed_deletes(&self) {
    1577            4298 :         self.completed_deletes.inc();
    1578            4298 :     }
    1579               7 :     fn inc_deletes_failed(&self, reason: DeleteFailed) {
    1580               7 :         self.failed_deletes[reason].inc();
    1581               7 :     }
    1582                 : 
    1583                 :     /// Counted separately from failed layer deletes because we will complete the layer deletion
    1584                 :     /// attempt regardless of failure to delete local file.
    1585 UBC           0 :     fn inc_delete_removes_failed(&self) {
    1586               0 :         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
    1587               0 :     }
    1588                 : 
    1589                 :     /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
    1590               0 :     fn inc_retried_get_or_maybe_download(&self) {
    1591               0 :         self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
    1592               0 :     }
    1593                 : 
    1594                 :     /// Expected rare because cancellations are unexpected, and failures are unexpected
    1595 CBC           5 :     fn inc_download_failed_without_requester(&self) {
    1596               5 :         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
    1597               5 :     }
    1598                 : 
    1599                 :     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
    1600                 :     ///
    1601                 :     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    1602                 :     /// Option.
    1603 UBC           0 :     fn inc_raced_wanted_evicted_accesses(&self) {
    1604               0 :         self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
    1605               0 :     }
    1606                 : 
    1607                 :     /// These are only expected in the same amount as [`Self::inc_init_cancelled`] when
    1608                 :     /// running with remote storage.
    1609               0 :     fn inc_init_needed_no_download(&self) {
    1610               0 :         self.rare_counters[RareEvent::InitWithoutDownload].inc();
    1611               0 :     }
    1612                 : 
    1613                 :     /// Expected rare because all layer files should be readable and good
    1614 CBC           1 :     fn inc_permanent_loading_failures(&self) {
    1615               1 :         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
    1616               1 :     }
    1617                 : 
    1618 UBC           0 :     fn inc_broadcast_lagged(&self) {
    1619               0 :         self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
    1620               0 :     }
    1621                 : 
    1622 CBC         636 :     fn inc_init_cancelled(&self) {
    1623             636 :         self.inits_cancelled.inc()
    1624             636 :     }
    1625                 : 
    1626             105 :     fn record_redownloaded_after(&self, duration: std::time::Duration) {
    1627             105 :         self.redownload_after.observe(duration.as_secs_f64())
    1628             105 :     }
    1629                 : }
    1630                 : 
    1631            4480 : #[derive(enum_map::Enum)]
    1632                 : enum EvictionCancelled {
    1633                 :     LayerGone,
    1634                 :     TimelineGone,
    1635                 :     VersionCheckFailed,
    1636                 :     FileNotFound,
    1637                 :     RemoveFailed,
    1638                 :     AlreadyReinitialized,
    1639                 :     /// Not evicted because of a pending reinitialization
    1640                 :     LostToDownload,
    1641                 :     /// After eviction, there was a new layer access which cancelled the eviction.
    1642                 :     UpgradedBackOnAccess,
    1643                 : }
    1644                 : 
    1645                 : impl EvictionCancelled {
    1646            4480 :     fn as_str(&self) -> &'static str {
    1647            4480 :         match self {
    1648             560 :             EvictionCancelled::LayerGone => "layer_gone",
    1649             560 :             EvictionCancelled::TimelineGone => "timeline_gone",
    1650             560 :             EvictionCancelled::VersionCheckFailed => "version_check_fail",
    1651             560 :             EvictionCancelled::FileNotFound => "file_not_found",
    1652             560 :             EvictionCancelled::RemoveFailed => "remove_failed",
    1653             560 :             EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
    1654             560 :             EvictionCancelled::LostToDownload => "lost_to_download",
    1655             560 :             EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
    1656                 :         }
    1657            4480 :     }
    1658                 : }
    1659                 : 
    1660            1127 : #[derive(enum_map::Enum)]
    1661                 : enum DeleteFailed {
    1662                 :     TimelineGone,
    1663                 :     DeleteSchedulingFailed,
    1664                 : }
    1665                 : 
    1666                 : impl DeleteFailed {
    1667            1120 :     fn as_str(&self) -> &'static str {
    1668            1120 :         match self {
    1669             560 :             DeleteFailed::TimelineGone => "timeline_gone",
    1670             560 :             DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
    1671                 :         }
    1672            1120 :     }
    1673                 : }
    1674                 : 
    1675            3926 : #[derive(enum_map::Enum)]
    1676                 : enum RareEvent {
    1677                 :     RemoveOnDropFailed,
    1678                 :     RetriedGetOrMaybeDownload,
    1679                 :     DownloadFailedWithoutRequester,
    1680                 :     UpgradedWantedEvicted,
    1681                 :     InitWithoutDownload,
    1682                 :     PermanentLoadingFailure,
    1683                 :     EvictAndWaitLagged,
    1684                 : }
    1685                 : 
    1686                 : impl RareEvent {
    1687            3920 :     fn as_str(&self) -> &'static str {
    1688            3920 :         use RareEvent::*;
    1689            3920 : 
    1690            3920 :         match self {
    1691             560 :             RemoveOnDropFailed => "remove_on_drop_failed",
    1692             560 :             RetriedGetOrMaybeDownload => "retried_gomd",
    1693             560 :             DownloadFailedWithoutRequester => "download_failed_without",
    1694             560 :             UpgradedWantedEvicted => "raced_wanted_evicted",
    1695             560 :             InitWithoutDownload => "init_needed_no_download",
    1696             560 :             PermanentLoadingFailure => "permanent_loading_failure",
    1697             560 :             EvictAndWaitLagged => "broadcast_lagged",
    1698                 :         }
    1699            3920 :     }
    1700                 : }
    1701                 : 
    1702                 : pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    1703                 :     once_cell::sync::Lazy::new(LayerImplMetrics::default);
        

Generated by: LCOV version 2.1-beta