LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - layer.rs (source / functions) Coverage Total Hit
Test: 2620485e474b48c32427149a5d91ef8fc2cd649e.info Lines: 77.7 % 1366 1061
Test Date: 2025-05-01 22:50:11 Functions: 76.2 % 151 115

            Line data    Source code
       1              : use std::ops::Range;
       2              : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
       3              : use std::sync::{Arc, Weak};
       4              : use std::time::{Duration, SystemTime};
       5              : 
       6              : use crate::PERF_TRACE_TARGET;
       7              : use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT};
       8              : use anyhow::Context;
       9              : use camino::{Utf8Path, Utf8PathBuf};
      10              : use pageserver_api::keyspace::KeySpace;
      11              : use pageserver_api::models::HistoricLayerInfo;
      12              : use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
      13              : use tracing::{Instrument, info_span};
      14              : use utils::generation::Generation;
      15              : use utils::id::TimelineId;
      16              : use utils::lsn::Lsn;
      17              : use utils::sync::{gate, heavier_once_cell};
      18              : 
      19              : use super::delta_layer::{self};
      20              : use super::image_layer::{self};
      21              : use super::{
      22              :     AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
      23              :     LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState,
      24              : };
      25              : use crate::config::PageServerConf;
      26              : use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
      27              : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
      28              : use crate::task_mgr::TaskKind;
      29              : use crate::tenant::Timeline;
      30              : use crate::tenant::remote_timeline_client::LayerFileMetadata;
      31              : use crate::tenant::timeline::{CompactionError, GetVectoredError};
      32              : 
      33              : #[cfg(test)]
      34              : mod tests;
      35              : 
      36              : #[cfg(test)]
      37              : mod failpoints;
      38              : 
      39              : pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000; // NOTE(review): presumably a byte count (4.5 * 10^9) capping a single S3 upload; confirm the unit and where it is enforced.
      40              : 
      41              : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
      42              : /// range of LSNs.
      43              : ///
      44              : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
      45              : /// layers are used to ingest incoming WAL, and provide fast access to the
      46              : /// recent page versions. On-disk layers are stored as files on disk, and are
      47              : /// immutable. This type represents the on-disk kind, while the in-memory kind is represented by
      48              : /// [`InMemoryLayer`].
      49              : ///
      50              : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
      51              : /// A delta layer contains all modifications within a range of LSNs and keys.
      52              : /// An image layer is a snapshot of all the data in a key-range, at a single
      53              : /// LSN.
      54              : ///
      55              : /// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a
      56              : /// general goal, read accesses should always win eviction and eviction should not wait for
      57              : /// download.
      58              : ///
      59              : /// ### State transitions
      60              : ///
      61              : /// The internal state of `Layer` is composed of most importantly the on-filesystem state and the
      62              : /// [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully downloaded,
      63              : /// right size) or deleted.
      64              : ///
      65              : /// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the
      66              : /// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the
      67              : /// `heavier_once_cell::InitPermit` has been acquired, any read request
      68              : /// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus
      69              : /// cancelling the eviction.
      70              : ///
      71              : /// ```text
      72              : ///  +-----------------+   get_or_maybe_download    +--------------------------------+
      73              : ///  | not initialized |--------------------------->| Resident(Arc<DownloadedLayer>) |
      74              : ///  |     ENOENT      |                         /->|                                |
      75              : ///  +-----------------+                         |  +--------------------------------+
      76              : ///                  ^                           |                         |       ^
      77              : ///                  |    get_or_maybe_download  |                         |       | get_or_maybe_download, either:
      78              : ///   evict_blocking | /-------------------------/                         |       | - upgrade weak to strong
      79              : ///                  | |                                                   |       | - re-initialize without download
      80              : ///                  | |                                    evict_and_wait |       |
      81              : ///  +-----------------+                                                   v       |
      82              : ///  | not initialized |  on_downloaded_layer_drop  +--------------------------------------+
      83              : ///  | file is present |<---------------------------| WantedEvicted(Weak<DownloadedLayer>) |
      84              : ///  +-----------------+                            +--------------------------------------+
      85              : /// ```
      86              : ///
      87              : /// ### Unsupported
      88              : ///
      89              : /// - Evicting by the operator deleting files from the filesystem
      90              : ///
      91              : /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
      92              : #[derive(Clone)]
                        // Newtype over `Arc<LayerInner>`: the derived `Clone` clones the `Arc`, so every clone refers to the same `LayerInner`.
      93              : pub(crate) struct Layer(Arc<LayerInner>);
      94              : 
      95              : impl std::fmt::Display for Layer {
      96        13080 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      97        13080 :         write!(
      98        13080 :             f,
      99        13080 :             "{}{}",
     100        13080 :             self.layer_desc().short_id(),
     101        13080 :             self.0.generation.get_suffix()
     102        13080 :         )
     103        13080 :     }
     104              : }
     105              : 
     106              : impl std::fmt::Debug for Layer {
     107           24 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     108           24 :         write!(f, "{}", self)
     109           24 :     }
     110              : }
     111              : 
     112              : impl AsLayerDesc for Layer {
     113     12126033 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     114     12126033 :         self.0.layer_desc()
     115     12126033 :     }
     116              : }
     117              : 
     118              : impl PartialEq for Layer {
     119           21 :     fn eq(&self, other: &Self) -> bool {
     120           21 :         Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
     121           21 :     }
     122              : }
     123              : 
     124        11628 : pub(crate) fn local_layer_path(
     125        11628 :     conf: &PageServerConf,
     126        11628 :     tenant_shard_id: &TenantShardId,
     127        11628 :     timeline_id: &TimelineId,
     128        11628 :     layer_file_name: &LayerName,
     129        11628 :     generation: &Generation,
     130        11628 : ) -> Utf8PathBuf {
     131        11628 :     let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
     132        11628 : 
     133        11628 :     if generation.is_none() {
     134              :         // Without a generation, we may only use legacy path style
     135            0 :         timeline_path.join(layer_file_name.to_string())
     136              :     } else {
     137        11628 :         timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
     138              :     }
     139        11628 : }
     140              : 
     141              : pub(crate) enum LastEviction {
     142              :     Never,
     143              :     At(std::time::Instant),
     144              :     Evicting,
     145              : }
     146              : 
     147              : impl LastEviction {
     148          156 :     pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
     149          156 :         match self {
     150            0 :             LastEviction::Never => false,
     151          156 :             LastEviction::At(evicted_at) => evicted_at > &timepoint,
     152            0 :             LastEviction::Evicting => true,
     153              :         }
     154          156 :     }
     155              : }
     156              : 
     157              : impl Layer {
     158              :     /// Creates a layer value for a file we know to not be resident.
     159            0 :     pub(crate) fn for_evicted(
     160            0 :         conf: &'static PageServerConf,
     161            0 :         timeline: &Arc<Timeline>,
     162            0 :         file_name: LayerName,
     163            0 :         metadata: LayerFileMetadata,
     164            0 :     ) -> Self {
     165            0 :         let local_path = local_layer_path(
     166            0 :             conf,
     167            0 :             &timeline.tenant_shard_id,
     168            0 :             &timeline.timeline_id,
     169            0 :             &file_name,
     170            0 :             &metadata.generation,
     171            0 :         );
     172            0 : 
     173            0 :         let desc = PersistentLayerDesc::from_filename(
     174            0 :             timeline.tenant_shard_id,
     175            0 :             timeline.timeline_id,
     176            0 :             file_name,
     177            0 :             metadata.file_size,
     178            0 :         );
     179            0 : 
     180            0 :         let owner = Layer(Arc::new(LayerInner::new(
     181            0 :             conf,
     182            0 :             timeline,
     183            0 :             local_path,
     184            0 :             desc,
     185            0 :             None,
     186            0 :             metadata.generation,
     187            0 :             metadata.shard,
     188            0 :         )));
     189            0 : 
     190            0 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
     191              : 
     192            0 :         owner
     193            0 :     }
     194              : 
     195              :     /// Creates a Layer value for a file we know to be resident in timeline directory.
     196          732 :     pub(crate) fn for_resident(
     197          732 :         conf: &'static PageServerConf,
     198          732 :         timeline: &Arc<Timeline>,
     199          732 :         local_path: Utf8PathBuf,
     200          732 :         file_name: LayerName,
     201          732 :         metadata: LayerFileMetadata,
     202          732 :     ) -> ResidentLayer {
     203          732 :         let desc = PersistentLayerDesc::from_filename(
     204          732 :             timeline.tenant_shard_id,
     205          732 :             timeline.timeline_id,
     206          732 :             file_name,
     207          732 :             metadata.file_size,
     208          732 :         );
     209          732 : 
     210          732 :         let mut resident = None;
     211          732 : 
     212          732 :         let owner = Layer(Arc::new_cyclic(|owner| {
     213          732 :             let inner = Arc::new(DownloadedLayer {
     214          732 :                 owner: owner.clone(),
     215          732 :                 kind: tokio::sync::OnceCell::default(),
     216          732 :                 version: 0,
     217          732 :             });
     218          732 :             resident = Some(inner.clone());
     219          732 : 
     220          732 :             LayerInner::new(
     221          732 :                 conf,
     222          732 :                 timeline,
     223          732 :                 local_path,
     224          732 :                 desc,
     225          732 :                 Some(inner),
     226          732 :                 metadata.generation,
     227          732 :                 metadata.shard,
     228          732 :             )
     229          732 :         }));
     230          732 : 
     231          732 :         let downloaded = resident.expect("just initialized");
     232          732 : 
     233          732 :         debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
     234              : 
     235          732 :         timeline
     236          732 :             .metrics
     237          732 :             .resident_physical_size_add(metadata.file_size);
     238          732 : 
     239          732 :         ResidentLayer { downloaded, owner }
     240          732 :     }
     241              : 
     242              :     /// Creates a Layer value for freshly written out new layer file by renaming it from a
     243              :     /// temporary path.
     244        10992 :     pub(crate) fn finish_creating(
     245        10992 :         conf: &'static PageServerConf,
     246        10992 :         timeline: &Arc<Timeline>,
     247        10992 :         desc: PersistentLayerDesc,
     248        10992 :         temp_path: &Utf8Path,
     249        10992 :     ) -> anyhow::Result<ResidentLayer> {
     250        10992 :         let mut resident = None;
     251        10992 : 
     252        10992 :         let owner = Layer(Arc::new_cyclic(|owner| {
     253        10992 :             let inner = Arc::new(DownloadedLayer {
     254        10992 :                 owner: owner.clone(),
     255        10992 :                 kind: tokio::sync::OnceCell::default(),
     256        10992 :                 version: 0,
     257        10992 :             });
     258        10992 :             resident = Some(inner.clone());
     259        10992 : 
     260        10992 :             let local_path = local_layer_path(
     261        10992 :                 conf,
     262        10992 :                 &timeline.tenant_shard_id,
     263        10992 :                 &timeline.timeline_id,
     264        10992 :                 &desc.layer_name(),
     265        10992 :                 &timeline.generation,
     266        10992 :             );
     267        10992 : 
     268        10992 :             LayerInner::new(
     269        10992 :                 conf,
     270        10992 :                 timeline,
     271        10992 :                 local_path,
     272        10992 :                 desc,
     273        10992 :                 Some(inner),
     274        10992 :                 timeline.generation,
     275        10992 :                 timeline.get_shard_index(),
     276        10992 :             )
     277        10992 :         }));
     278        10992 : 
     279        10992 :         let downloaded = resident.expect("just initialized");
     280        10992 : 
     281        10992 :         // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
     282        10992 :         // TODO: this leaves the temp file in place if the rename fails, risking us running
     283        10992 :         // out of space. Should we clean it up here or does the calling context deal with this?
     284        10992 :         utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
     285        10992 :             .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
     286              : 
     287        10992 :         Ok(ResidentLayer { downloaded, owner })
     288        10992 :     }
     289              : 
     290              :     /// Requests the layer to be evicted and waits for this to be done.
     291              :     ///
     292              :     /// If the file is not resident, an [`EvictionError::NotFound`] is returned.
     293              :     ///
     294              :     /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is
     295              :     /// re-downloaded, [`EvictionError::Downloaded`] is returned.
     296              :     ///
     297              :     /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
     298              :     /// will happen regardless the future returned by this method completing unless there is a
     299              :     /// read access before eviction gets to complete.
     300              :     ///
     301              :     /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
     302              :     /// of download-evict cycle on retry.
     303          312 :     pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
     304          312 :         self.0.evict_and_wait(timeout).await
     305          288 :     }
     306              : 
     307              :     /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
     308              :     /// then.
     309              :     ///
     310              :     /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
     311              :     /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
     312              :     /// the value this is called on gets dropped.
     313              :     ///
     314              :     /// This is ensured by both of those methods accepting references to Layer.
     315              :     ///
     316              :     /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
     317              :     /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
     318         3120 :     pub(crate) fn delete_on_drop(&self) {
     319         3120 :         self.0.delete_on_drop();
     320         3120 :     }
     321              : 
     322      1596019 :     pub(crate) async fn get_values_reconstruct_data(
     323      1596019 :         &self,
     324      1596019 :         keyspace: KeySpace,
     325      1596019 :         lsn_range: Range<Lsn>,
     326      1596019 :         reconstruct_data: &mut ValuesReconstructState,
     327      1596019 :         ctx: &RequestContext,
     328      1596019 :     ) -> Result<(), GetVectoredError> {
     329      1596019 :         let downloaded = {
     330      1596019 :             let ctx = RequestContextBuilder::from(ctx)
     331      1596019 :                 .perf_span(|crnt_perf_span| {
     332            0 :                     info_span!(
     333              :                         target: PERF_TRACE_TARGET,
     334            0 :                         parent: crnt_perf_span,
     335              :                         "GET_LAYER",
     336              :                     )
     337      1596019 :                 })
     338      1596019 :                 .attached_child();
     339      1596019 : 
     340      1596019 :             self.0
     341      1596019 :                 .get_or_maybe_download(true, &ctx)
     342      1596019 :                 .maybe_perf_instrument(&ctx, |crnt_perf_context| crnt_perf_context.clone())
     343      1596019 :                 .await
     344      1596019 :                 .map_err(|err| match err {
     345              :                     DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
     346            0 :                         GetVectoredError::Cancelled
     347              :                     }
     348            0 :                     other => GetVectoredError::Other(anyhow::anyhow!(other)),
     349      1596019 :                 })?
     350              :         };
     351              : 
     352      1596019 :         let this = ResidentLayer {
     353      1596019 :             downloaded: downloaded.clone(),
     354      1596019 :             owner: self.clone(),
     355      1596019 :         };
     356      1596019 : 
     357      1596019 :         self.record_access(ctx);
     358      1596019 : 
     359      1596019 :         let ctx = RequestContextBuilder::from(ctx)
     360      1596019 :             .perf_span(|crnt_perf_span| {
     361            0 :                 info_span!(
     362              :                     target: PERF_TRACE_TARGET,
     363            0 :                     parent: crnt_perf_span,
     364              :                     "VISIT_LAYER",
     365              :                 )
     366      1596019 :             })
     367      1596019 :             .attached_child();
     368      1596019 : 
     369      1596019 :         downloaded
     370      1596019 :             .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, &ctx)
     371      1596019 :             .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
     372      1596019 :             .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
     373      1596019 :             .await
     374      1596019 :             .map_err(|err| match err {
     375            0 :                 GetVectoredError::Other(err) => GetVectoredError::Other(
     376            0 :                     err.context(format!("get_values_reconstruct_data for layer {self}")),
     377            0 :                 ),
     378            0 :                 err => err,
     379      1596019 :             })
     380      1596019 :     }
     381              : 
     382              :     /// Download the layer if evicted.
     383              :     ///
     384              :     /// Will not error when the layer is already downloaded.
     385            0 :     pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
     386            0 :         self.0.get_or_maybe_download(true, ctx).await?;
     387            0 :         Ok(())
     388            0 :     }
     389              : 
     390         1896 :     pub(crate) async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
     391         1896 :         self.0.needs_download().await
     392         1896 :     }
     393              : 
     394              :     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     395              :     /// while the guard exists.
     396              :     ///
     397              :     /// Returns None if the layer is currently evicted or becoming evicted.
     398          120 :     pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
     399          120 :         let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;
     400              : 
     401           84 :         Some(ResidentLayer {
     402           84 :             downloaded,
     403           84 :             owner: self.clone(),
     404           84 :         })
     405          120 :     }
     406              : 
     407              :     /// Weak indicator of is the layer resident or not. Good enough for eviction, which can deal
     408              :     /// with `EvictionError::NotFound`.
     409              :     ///
     410              :     /// Returns `true` if this layer might be resident, or `false`, if it most likely evicted or
     411              :     /// will be unless a read happens soon.
     412         1077 :     pub(crate) fn is_likely_resident(&self) -> bool {
     413         1077 :         self.0
     414         1077 :             .inner
     415         1077 :             .get()
     416         1077 :             .map(|rowe| rowe.is_likely_resident())
     417         1077 :             .unwrap_or(false)
     418         1077 :     }
     419              : 
     420              :     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
     421         3480 :     pub(crate) async fn download_and_keep_resident(
     422         3480 :         &self,
     423         3480 :         ctx: &RequestContext,
     424         3480 :     ) -> Result<ResidentLayer, DownloadError> {
     425         3480 :         let downloaded = self.0.get_or_maybe_download(true, ctx).await?;
     426              : 
     427         3480 :         Ok(ResidentLayer {
     428         3480 :             downloaded,
     429         3480 :             owner: self.clone(),
     430         3480 :         })
     431         3480 :     }
     432              : 
     433            0 :     pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     434            0 :         self.0.info(reset)
     435            0 :     }
     436              : 
     437          204 :     pub(crate) fn latest_activity(&self) -> SystemTime {
     438          204 :         self.0.access_stats.latest_activity()
     439          204 :     }
     440              : 
     441          552 :     pub(crate) fn visibility(&self) -> LayerVisibilityHint {
     442          552 :         self.0.access_stats.visibility()
     443          552 :     }
     444              : 
     445        11004 :     pub(crate) fn local_path(&self) -> &Utf8Path {
     446        11004 :         &self.0.path
     447        11004 :     }
     448              : 
     449        15984 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
     450        15984 :         self.0.metadata()
     451        15984 :     }
     452              : 
     453          156 :     pub(crate) fn last_evicted_at(&self) -> LastEviction {
     454          156 :         match self.0.last_evicted_at.try_lock() {
     455          156 :             Ok(lock) => match *lock {
     456            0 :                 None => LastEviction::Never,
     457          156 :                 Some(at) => LastEviction::At(at),
     458              :             },
     459            0 :             Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
     460            0 :             Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
     461              :         }
     462          156 :     }
     463              : 
     464            0 :     pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
     465            0 :         self.0
     466            0 :             .timeline
     467            0 :             .upgrade()
     468            0 :             .map(|timeline| timeline.timeline_id)
     469            0 :     }
     470              : 
     471              :     /// Traditional debug dumping facility
     472              :     #[allow(unused)]
     473           24 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
     474           24 :         self.0.desc.dump();
     475           24 : 
     476           24 :         if verbose {
     477              :             // for now, unconditionally download everything, even if that might not be wanted.
     478           24 :             let l = self.0.get_or_maybe_download(true, ctx).await?;
     479           24 :             l.dump(&self.0, ctx).await?
     480            0 :         }
     481              : 
     482           24 :         Ok(())
     483           24 :     }
     484              : 
     485              :     /// Waits until this layer has been dropped (and if needed, local file deletion and remote
     486              :     /// deletion scheduling has completed).
     487              :     ///
     488              :     /// Does not start local deletion, use [`Self::delete_on_drop`] for that
     489              :     /// separatedly.
     490              :     #[cfg(any(feature = "testing", test))]
     491           12 :     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
     492           12 :         let mut rx = self.0.status.as_ref().unwrap().subscribe();
     493              : 
     494           12 :         async move {
     495              :             loop {
     496           12 :                 if rx.changed().await.is_err() {
     497           12 :                     break;
     498            0 :                 }
     499              :             }
     500           12 :         }
     501           12 :     }
     502              : 
     503      1596019 :     fn record_access(&self, ctx: &RequestContext) {
     504      1596019 :         if self.0.access_stats.record_access(ctx) {
     505              :             // Visibility was modified to Visible: maybe log about this
     506            0 :             match ctx.task_kind() {
     507              :                 TaskKind::CalculateSyntheticSize
     508              :                 | TaskKind::OndemandLogicalSizeCalculation
     509              :                 | TaskKind::GarbageCollector
     510            0 :                 | TaskKind::MgmtRequest => {
     511            0 :                     // This situation is expected in code paths do binary searches of the LSN space to resolve
     512            0 :                     // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
     513            0 :                     // and on-demand for certain HTTP API requests. On-demand logical size calculation is also included
     514            0 :                     // because it is run as a sub-task of synthetic size.
     515            0 :                 }
     516              :                 _ => {
     517              :                     // In all other contexts, it is unusual to do I/O involving layers which are not visible at
     518              :                     // some branch tip, so we log the fact that we are accessing something that the visibility
     519              :                     // calculation thought should not be visible.
     520              :                     //
     521              :                     // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
     522              :                     // which was covered by a concurrent compaction.
     523            0 :                     tracing::info!(
     524              :                         layer=%self,
     525            0 :                         "became visible as a result of access",
     526              :                     );
     527              :                 }
     528              :             }
     529              : 
     530              :             // Update the timeline's visible bytes count
     531            0 :             if let Some(tl) = self.0.timeline.upgrade() {
     532            0 :                 tl.metrics
     533            0 :                     .visible_physical_size_gauge
     534            0 :                     .add(self.0.desc.file_size)
     535            0 :             }
     536      1596019 :         }
     537      1596019 :     }
     538              : 
     539         2196 :     pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
     540         2196 :         let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
     541              :         use LayerVisibilityHint::*;
     542         2196 :         match (old_visibility, visibility) {
     543              :             (Visible, Covered) => {
     544              :                 // Subtract this layer's contribution to the visible size metric
     545          216 :                 if let Some(tl) = self.0.timeline.upgrade() {
     546          216 :                     debug_assert!(
     547          216 :                         tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
     548              :                     );
     549          216 :                     tl.metrics
     550          216 :                         .visible_physical_size_gauge
     551          216 :                         .sub(self.0.desc.file_size)
     552            0 :                 }
     553              :             }
     554              :             (Covered, Visible) => {
     555              :                 // Add this layer's contribution to the visible size metric
     556            0 :                 if let Some(tl) = self.0.timeline.upgrade() {
     557            0 :                     tl.metrics
     558            0 :                         .visible_physical_size_gauge
     559            0 :                         .add(self.0.desc.file_size)
     560            0 :                 }
     561              :             }
     562         1980 :             (Covered, Covered) | (Visible, Visible) => {
     563         1980 :                 // no change
     564         1980 :             }
     565              :         }
     566         2196 :     }
     567              : }
     568              : 
     569              : /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
     570              : ///
     571              : /// However when we want something evicted, we cannot evict it right away as there might be current
     572              : /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
     573              : /// read with [`Layer::get_values_reconstruct_data`].
     574              : ///
     575              : /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
     576              : #[derive(Debug)]
     577              : enum ResidentOrWantedEvicted {
     578              :     Resident(Arc<DownloadedLayer>),
     579              :     WantedEvicted(Weak<DownloadedLayer>, usize),
     580              : }
     581              : 
     582              : impl ResidentOrWantedEvicted {
     583              :     /// Non-mutating access to the a DownloadedLayer, if possible.
     584              :     ///
     585              :     /// This is not used on the read path (anything that calls
     586              :     /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
     587              :     /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
     588           84 :     fn get(&self) -> Option<Arc<DownloadedLayer>> {
     589           84 :         match self {
     590           84 :             ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
     591            0 :             ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(),
     592              :         }
     593           84 :     }
     594              : 
     595              :     /// Best-effort query for residency right now, not as strong guarantee as receiving a strong
     596              :     /// reference from `ResidentOrWantedEvicted::get`.
     597          669 :     fn is_likely_resident(&self) -> bool {
     598          669 :         match self {
     599          633 :             ResidentOrWantedEvicted::Resident(_) => true,
     600           36 :             ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0,
     601              :         }
     602          669 :     }
     603              : 
     604              :     /// Upgrades any weak to strong if possible.
     605              :     ///
     606              :     /// Returns a strong reference if possible, along with a boolean telling if an upgrade
     607              :     /// happened.
     608      1599499 :     fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
     609      1599499 :         match self {
     610      1599451 :             ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
     611           48 :             ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
     612            0 :                 Some(strong) => {
     613            0 :                     LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
     614            0 : 
     615            0 :                     *self = ResidentOrWantedEvicted::Resident(strong.clone());
     616            0 : 
     617            0 :                     Some((strong, true))
     618              :                 }
     619           48 :                 None => None,
     620              :             },
     621              :         }
     622      1599499 :     }
     623              : 
     624              :     /// When eviction is first requested, drop down to holding a [`Weak`].
     625              :     ///
     626              :     /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
     627              :     /// drop the possibly last strong reference outside of the mutex of
     628              :     /// [`heavier_once_cell::OnceCell`].
     629          276 :     fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
     630          276 :         match self {
     631          252 :             ResidentOrWantedEvicted::Resident(strong) => {
     632          252 :                 let weak = Arc::downgrade(strong);
     633          252 :                 let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
     634          252 :                 std::mem::swap(self, &mut temp);
     635          252 :                 match temp {
     636          252 :                     ResidentOrWantedEvicted::Resident(strong) => Some(strong),
     637            0 :                     ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
     638              :                 }
     639              :             }
     640           24 :             ResidentOrWantedEvicted::WantedEvicted(..) => None,
     641              :         }
     642          276 :     }
     643              : }
     644              : 
     645              : struct LayerInner {
     646              :     /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
     647              :     /// [`Self::path`].
     648              :     conf: &'static PageServerConf,
     649              : 
     650              :     /// Full path to the file; unclear if this should exist anymore.
     651              :     path: Utf8PathBuf,
     652              : 
     653              :     desc: PersistentLayerDesc,
     654              : 
     655              :     /// Timeline access is needed for remote timeline client and metrics.
     656              :     ///
     657              :     /// There should not be an access to timeline for any reason without entering the
     658              :     /// [`Timeline::gate`] at the same time.
     659              :     timeline: Weak<Timeline>,
     660              : 
     661              :     access_stats: LayerAccessStats,
     662              : 
     663              :     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
     664              :     ///
     665              :     /// Filesystem changes (download, evict) are only done while holding a permit which the
     666              :     /// `heavier_once_cell` provides.
     667              :     ///
     668              :     /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but
     669              :     /// possibly read while not holding it.
     670              :     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
     671              : 
     672              :     /// Do we want to delete locally and remotely this when `LayerInner` is dropped
     673              :     wanted_deleted: AtomicBool,
     674              : 
     675              :     /// Version is to make sure we will only evict a specific initialization of the downloaded file.
     676              :     ///
     677              :     /// Incremented for each initialization, stored in `DownloadedLayer::version` or
     678              :     /// `ResidentOrWantedEvicted::WantedEvicted`.
     679              :     version: AtomicUsize,
     680              : 
     681              :     /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download
     682              :     /// starts, or completes.
     683              :     ///
     684              :     /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard.
     685              :     /// Holding the InitPermit is the only time we can do state transitions, but we also need to
     686              :     /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to
     687              :     /// [`ResidentOrWantedEvicted::Resident`] on access.
     688              :     ///
     689              :     /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`].
     690              :     status: Option<tokio::sync::watch::Sender<Status>>,
     691              : 
     692              :     /// Counter for exponential backoff with the download.
     693              :     ///
     694              :     /// This is atomic only for the purposes of having additional data only accessed while holding
     695              :     /// the InitPermit.
     696              :     consecutive_failures: AtomicUsize,
     697              : 
     698              :     /// The generation of this Layer.
     699              :     ///
     700              :     /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
     701              :     /// for created layers from [`Timeline::generation`].
     702              :     generation: Generation,
     703              : 
     704              :     /// The shard of this Layer.
     705              :     ///
     706              :     /// For layers created in this process, this will always be the [`ShardIndex`] of the
     707              :     /// current `ShardIdentity`` (TODO: add link once it's introduced).
     708              :     ///
     709              :     /// For loaded layers, this may be some other value if the tenant has undergone
     710              :     /// a shard split since the layer was originally written.
     711              :     shard: ShardIndex,
     712              : 
     713              :     /// When the Layer was last evicted but has not been downloaded since.
     714              :     ///
     715              :     /// This is used for skipping evicted layers from the previous heatmap (see
     716              :     /// `[Timeline::generate_heatmap]`) and for updating metrics
     717              :     /// (see [`LayerImplMetrics::redownload_after`]).
     718              :     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
     719              : 
     720              :     #[cfg(test)]
     721              :     failpoints: std::sync::Mutex<Vec<failpoints::Failpoint>>,
     722              : }
     723              : 
     724              : impl std::fmt::Display for LayerInner {
     725          324 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     726          324 :         write!(f, "{}", self.layer_desc().short_id())
     727          324 :     }
     728              : }
     729              : 
     730              : impl AsLayerDesc for LayerInner {
     731     12141722 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     732     12141722 :         &self.desc
     733     12141722 :     }
     734              : }
     735              : 
     736              : #[derive(Debug, Clone, Copy)]
     737              : enum Status {
     738              :     Resident,
     739              :     Evicted,
     740              :     Downloading,
     741              : }
     742              : 
     743              : impl Drop for LayerInner {
     744         4249 :     fn drop(&mut self) {
     745         4249 :         // if there was a pending eviction, mark it cancelled here to balance metrics
     746         4249 :         if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
     747           12 :         {
     748           12 :             // eviction has already been started
     749           12 :             LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
     750           12 : 
     751           12 :             // eviction request is intentionally not honored as no one is present to wait for it
     752           12 :             // and we could be delaying shutdown for nothing.
     753         4237 :         }
     754              : 
     755         4249 :         let timeline = self.timeline.upgrade();
     756              : 
     757         4249 :         if let Some(timeline) = timeline.as_ref() {
     758              :             // Only need to decrement metrics if the timeline still exists: otherwise
     759              :             // it will have already de-registered these metrics via TimelineMetrics::shutdown
     760         4153 :             timeline.metrics.dec_layer(&self.desc);
     761              : 
     762         4153 :             if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
     763         4153 :                 debug_assert!(
     764         4153 :                     timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
     765              :                 );
     766         4153 :                 timeline
     767         4153 :                     .metrics
     768         4153 :                     .visible_physical_size_gauge
     769         4153 :                     .sub(self.desc.file_size);
     770            0 :             }
     771           96 :         }
     772              : 
     773         4249 :         if !*self.wanted_deleted.get_mut() {
     774         1176 :             return;
     775         3073 :         }
     776              : 
     777         3073 :         let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
     778              : 
     779         3073 :         let path = std::mem::take(&mut self.path);
     780         3073 :         let file_name = self.layer_desc().layer_name();
     781         3073 :         let file_size = self.layer_desc().file_size;
     782         3073 :         let meta = self.metadata();
     783         3073 :         let status = self.status.take();
     784         3073 : 
     785         3073 :         Self::spawn_blocking(move || {
     786         3068 :             let _g = span.entered();
     787         3068 : 
     788         3068 :             // carry this until we are finished for [`Layer::wait_drop`] support
     789         3068 :             let _status = status;
     790              : 
     791         3068 :             let Some(timeline) = timeline else {
     792              :                 // no need to nag that timeline is gone: under normal situation on
     793              :                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
     794            0 :                 LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
     795            0 :                 return;
     796              :             };
     797              : 
     798         3068 :             let Ok(_guard) = timeline.gate.enter() else {
     799            0 :                 LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
     800            0 :                 return;
     801              :             };
     802              : 
     803         3068 :             let removed = match std::fs::remove_file(path) {
     804         3056 :                 Ok(()) => true,
     805           12 :                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
     806           12 :                     // until we no longer do detaches by removing all local files before removing the
     807           12 :                     // tenant from the global map, we will always get these errors even if we knew what
     808           12 :                     // is the latest state.
     809           12 :                     //
     810           12 :                     // we currently do not track the latest state, so we'll also end up here on evicted
     811           12 :                     // layers.
     812           12 :                     false
     813              :                 }
     814            0 :                 Err(e) => {
     815            0 :                     tracing::error!("failed to remove wanted deleted layer: {e}");
     816            0 :                     LAYER_IMPL_METRICS.inc_delete_removes_failed();
     817            0 :                     false
     818              :                 }
     819              :             };
     820              : 
     821         3068 :             if removed {
     822         3056 :                 timeline.metrics.resident_physical_size_sub(file_size);
     823         3056 :             }
     824         3068 :             let res = timeline
     825         3068 :                 .remote_client
     826         3068 :                 .schedule_deletion_of_unlinked(vec![(file_name, meta)]);
     827              : 
     828         3068 :             if let Err(e) = res {
     829              :                 // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
     830              :                 // demonstrating this deadlock (without spawn_blocking): stop will drop
     831              :                 // queued items, which will have ResidentLayer's, and those drops would try
     832              :                 // to re-entrantly lock the RemoteTimelineClient inner state.
     833           12 :                 if !timeline.is_active() {
     834           12 :                     tracing::info!("scheduling deletion on drop failed: {e:#}");
     835              :                 } else {
     836            0 :                     tracing::warn!("scheduling deletion on drop failed: {e:#}");
     837              :                 }
     838           12 :                 LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
     839         3056 :             } else {
     840         3056 :                 LAYER_IMPL_METRICS.inc_completed_deletes();
     841         3056 :             }
     842         3073 :         });
     843         4249 :     }
     844              : }
     845              : 
     846              : impl LayerInner {
     847              :     #[allow(clippy::too_many_arguments)]
     848        11724 :     fn new(
     849        11724 :         conf: &'static PageServerConf,
     850        11724 :         timeline: &Arc<Timeline>,
     851        11724 :         local_path: Utf8PathBuf,
     852        11724 :         desc: PersistentLayerDesc,
     853        11724 :         downloaded: Option<Arc<DownloadedLayer>>,
     854        11724 :         generation: Generation,
     855        11724 :         shard: ShardIndex,
     856        11724 :     ) -> Self {
     857        11724 :         let (inner, version, init_status) = if let Some(inner) = downloaded {
     858        11724 :             let version = inner.version;
     859        11724 :             let resident = ResidentOrWantedEvicted::Resident(inner);
     860        11724 :             (
     861        11724 :                 heavier_once_cell::OnceCell::new(resident),
     862        11724 :                 version,
     863        11724 :                 Status::Resident,
     864        11724 :             )
     865              :         } else {
     866            0 :             (heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
     867              :         };
     868              : 
     869              :         // This object acts as a RAII guard on these metrics: increment on construction
     870        11724 :         timeline.metrics.inc_layer(&desc);
     871        11724 : 
     872        11724 :         // New layers are visible by default. This metric is later updated on drop or in set_visibility
     873        11724 :         timeline
     874        11724 :             .metrics
     875        11724 :             .visible_physical_size_gauge
     876        11724 :             .add(desc.file_size);
     877        11724 : 
     878        11724 :         LayerInner {
     879        11724 :             conf,
     880        11724 :             path: local_path,
     881        11724 :             desc,
     882        11724 :             timeline: Arc::downgrade(timeline),
     883        11724 :             access_stats: Default::default(),
     884        11724 :             wanted_deleted: AtomicBool::new(false),
     885        11724 :             inner,
     886        11724 :             version: AtomicUsize::new(version),
     887        11724 :             status: Some(tokio::sync::watch::channel(init_status).0),
     888        11724 :             consecutive_failures: AtomicUsize::new(0),
     889        11724 :             generation,
     890        11724 :             shard,
     891        11724 :             last_evicted_at: std::sync::Mutex::default(),
     892        11724 :             #[cfg(test)]
     893        11724 :             failpoints: Default::default(),
     894        11724 :         }
     895        11724 :     }
     896              : 
     897         3120 :     fn delete_on_drop(&self) {
     898         3120 :         let res =
     899         3120 :             self.wanted_deleted
     900         3120 :                 .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
     901         3120 : 
     902         3120 :         if res.is_ok() {
     903         3096 :             LAYER_IMPL_METRICS.inc_started_deletes();
     904         3096 :         }
     905         3120 :     }
     906              : 
     907              :     /// Cancellation safe, however dropping the future and calling this method again might result
     908              :     /// in a new attempt to evict OR join the previously started attempt.
     909          312 :     #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
     910              :     pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
     911              :         let mut rx = self.status.as_ref().unwrap().subscribe();
     912              : 
     913              :         {
     914              :             let current = rx.borrow_and_update();
     915              :             match &*current {
     916              :                 Status::Resident => {
     917              :                     // we might get lucky and evict this; continue
     918              :                 }
     919              :                 Status::Evicted | Status::Downloading => {
     920              :                     // it is already evicted
     921              :                     return Err(EvictionError::NotFound);
     922              :                 }
     923              :             }
     924              :         }
     925              : 
     926              :         let strong = {
     927              :             match self.inner.get() {
     928              :                 Some(mut either) => either.downgrade(),
     929              :                 None => {
     930              :                     // we already have a scheduled eviction, which just has not gotten to run yet.
     931              :                     // it might still race with a read access, but that could also get cancelled,
     932              :                     // so let's say this is not evictable.
     933              :                     return Err(EvictionError::NotFound);
     934              :                 }
     935              :             }
     936              :         };
     937              : 
     938              :         if strong.is_some() {
     939              :             // drop the DownloadedLayer outside of the holding the guard
     940              :             drop(strong);
     941              : 
     942              :             // idea here is that only one evicter should ever get to witness a strong reference,
     943              :             // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
     944              :             // cancelled eviction and signal us, like it currently does.
     945              :             //
     946              :             // a second concurrent evict_and_wait will not see a strong reference.
     947              :             LAYER_IMPL_METRICS.inc_started_evictions();
     948              :         }
     949              : 
     950              :         let changed = rx.changed();
     951              :         let changed = tokio::time::timeout(timeout, changed).await;
     952              : 
     953              :         let Ok(changed) = changed else {
     954              :             return Err(EvictionError::Timeout);
     955              :         };
     956              : 
     957              :         let _: () = changed.expect("cannot be closed, because we are holding a strong reference");
     958              : 
     959              :         let current = rx.borrow_and_update();
     960              : 
     961              :         match &*current {
     962              :             // the easiest case
     963              :             Status::Evicted => Ok(()),
     964              :             // it surely was evicted in between, but then there was a new access now; we can't know
     965              :             // if it'll succeed so lets just call it evicted
     966              :             Status::Downloading => Ok(()),
     967              :             // either the download which was started after eviction completed already, or it was
     968              :             // never evicted
     969              :             Status::Resident => Err(EvictionError::Downloaded),
     970              :         }
     971              :     }
     972              : 
     973              :     /// Cancellation safe.
     974      1599595 :     async fn get_or_maybe_download(
     975      1599595 :         self: &Arc<Self>,
     976      1599595 :         allow_download: bool,
     977      1599595 :         ctx: &RequestContext,
     978      1599595 :     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
     979      1599595 :         let mut wait_for_download_recorder =
     980      1599595 :             scopeguard::guard(utils::elapsed_accum::ElapsedAccum::default(), |accum| {
     981      1599595 :                 ctx.ondemand_download_wait_observe(accum.get());
     982      1599595 :             });
     983          144 :         let (weak, permit) = {
     984              :             // get_or_init_detached can:
     985              :             // - be fast (mutex lock) OR uncontested semaphore permit acquire
     986              :             // - be slow (wait for semaphore permit or closing)
     987      1599595 :             let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
     988              : 
     989      1599595 :             let locked = self
     990      1599595 :                 .inner
     991      1599595 :                 .get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
     992      1599595 :                 .await
     993      1599595 :                 .map(|mut guard| guard.get_and_upgrade().ok_or(guard));
     994      1599595 : 
     995      1599595 :             scopeguard::ScopeGuard::into_inner(init_cancelled);
     996              : 
     997      1599451 :             match locked {
     998              :                 // this path could had been a RwLock::read
     999      1599451 :                 Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong),
    1000            0 :                 Ok(Ok((strong, _))) => {
    1001            0 :                     // when upgraded back, the Arc<DownloadedLayer> is still available, but
    1002            0 :                     // previously a `evict_and_wait` was received. this is the only place when we
    1003            0 :                     // send out an update without holding the InitPermit.
    1004            0 :                     //
    1005            0 :                     // note that we also have dropped the Guard; this is fine, because we just made
    1006            0 :                     // a state change and are holding a strong reference to be returned.
    1007            0 :                     self.status.as_ref().unwrap().send_replace(Status::Resident);
    1008            0 :                     LAYER_IMPL_METRICS
    1009            0 :                         .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
    1010            0 : 
    1011            0 :                     return Ok(strong);
    1012              :                 }
    1013           48 :                 Ok(Err(guard)) => {
    1014           48 :                     // path to here: we won the eviction, the file should still be on the disk.
    1015           48 :                     let (weak, permit) = guard.take_and_deinit();
    1016           48 :                     (Some(weak), permit)
    1017              :                 }
    1018           96 :                 Err(permit) => (None, permit),
    1019              :             }
    1020              :         };
    1021          144 :         let _guard = wait_for_download_recorder.guard();
    1022              : 
    1023          144 :         if let Some(weak) = weak {
    1024              :             // only drop the weak after dropping the heavier_once_cell guard
    1025           48 :             assert!(
    1026           48 :                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
    1027            0 :                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
    1028              :             );
    1029           96 :         }
    1030              : 
    1031          144 :         let timeline = self
    1032          144 :             .timeline
    1033          144 :             .upgrade()
    1034          144 :             .ok_or(DownloadError::TimelineShutdown)?;
    1035              : 
    1036              :         // count cancellations, which currently remain largely unexpected
    1037          144 :         let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
    1038              : 
    1039              :         // check if we really need to be downloaded: this can happen if a read access won the
    1040              :         // semaphore before eviction.
    1041              :         //
    1042              :         // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a
    1043              :         // pending eviction will try to evict even upon finding an uninitialized `self.inner`.
    1044          144 :         let needs_download = self
    1045          144 :             .needs_download()
    1046          144 :             .await
    1047          144 :             .map_err(DownloadError::PreStatFailed);
    1048          144 : 
    1049          144 :         scopeguard::ScopeGuard::into_inner(init_cancelled);
    1050              : 
    1051          144 :         let needs_download = needs_download?;
    1052              : 
    1053          144 :         let Some(reason) = needs_download else {
    1054              :             // the file is present locally because eviction has not had a chance to run yet
    1055              : 
    1056              :             #[cfg(test)]
    1057           48 :             self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
    1058           48 :                 .await?;
    1059              : 
    1060           36 :             LAYER_IMPL_METRICS.inc_init_needed_no_download();
    1061           36 : 
    1062           36 :             return Ok(self.initialize_after_layer_is_on_disk(permit));
    1063              :         };
    1064              : 
    1065              :         // we must download; getting cancelled before spawning the download is not an issue as
    1066              :         // any still running eviction would not find anything to evict.
    1067              : 
    1068           96 :         if let NeedsDownload::NotFile(ft) = reason {
    1069            0 :             return Err(DownloadError::NotFile(ft));
    1070           96 :         }
    1071           96 : 
    1072           96 :         self.check_expected_download(ctx)?;
    1073              : 
    1074           96 :         if !allow_download {
    1075              :             // this is only used from tests, but it is hard to test without the boolean
    1076           12 :             return Err(DownloadError::DownloadRequired);
    1077           84 :         }
    1078              : 
    1079           84 :         let ctx = if ctx.has_perf_span() {
    1080            0 :             let dl_ctx = RequestContextBuilder::from(ctx)
    1081            0 :                 .task_kind(TaskKind::LayerDownload)
    1082            0 :                 .download_behavior(DownloadBehavior::Download)
    1083            0 :                 .root_perf_span(|| {
    1084            0 :                     info_span!(
    1085            0 :                         target: PERF_TRACE_TARGET,
    1086            0 :                         "DOWNLOAD_LAYER",
    1087            0 :                         layer = %self,
    1088            0 :                         reason = %reason
    1089            0 :                     )
    1090            0 :                 })
    1091            0 :                 .detached_child();
    1092            0 :             ctx.perf_follows_from(&dl_ctx);
    1093            0 :             dl_ctx
    1094              :         } else {
    1095           84 :             ctx.attached_child()
    1096              :         };
    1097              : 
    1098           84 :         async move {
    1099           84 :             tracing::info!(%reason, "downloading on-demand");
    1100              : 
    1101           84 :             let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
    1102           84 :             let res = self
    1103           84 :                 .download_init_and_wait(timeline, permit, ctx.attached_child())
    1104           84 :                 .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
    1105           84 :                 .await?;
    1106              : 
    1107           84 :             scopeguard::ScopeGuard::into_inner(init_cancelled);
    1108           84 :             Ok(res)
    1109           84 :         }
    1110           84 :         .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
    1111           84 :         .await
    1112      1599595 :     }
    1113              : 
    1114              :     /// Nag or fail per RequestContext policy
                            ///
                            /// The policy is read from `ctx.download_behavior()`:
                            /// - `Download`: the download is expected; nothing is logged or counted.
                            /// - `Warn` / `Error`: an info-level message naming the task kind is logged and
                            ///   `UNEXPECTED_ONDEMAND_DOWNLOADS` is incremented.
                            ///
                            /// # Errors
                            ///
                            /// Returns `DownloadError::ContextAndConfigReallyDeniesDownloads` only when the behavior is
                            /// `Error` and `conf.ondemand_download_behavior_treat_error_as_warn` is false.
    1115           96 :     fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
    1116              :         use crate::context::DownloadBehavior::*;
    1117           96 :         let b = ctx.download_behavior();
    1118           96 :         match b {
    1119           96 :             Download => Ok(()),
    1120              :             Warn | Error => {
    1121            0 :                 tracing::info!(
    1122            0 :                     "unexpectedly on-demand downloading for task kind {:?}",
    1123            0 :                     ctx.task_kind()
    1124              :                 );
    1125            0 :                 crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
    1126              : 
                                        // the config flag can downgrade `Error` to the same outcome as `Warn`
    1127            0 :                 let really_error =
    1128            0 :                     matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
    1129              : 
    1130            0 :                 if really_error {
    1131              :                     // this check is only probabilistic, seems like a flakiness footgun
    1132            0 :                     Err(DownloadError::ContextAndConfigReallyDeniesDownloads)
    1133              :                 } else {
    1134            0 :                     Ok(())
    1135              :                 }
    1136              :             }
    1137              :         }
    1138           96 :     }
    1139              : 
    1140              :     /// Actual download, at most one is executed at the time.
                            ///
                            /// The download itself (`Self::download_and_init`) runs in a task started with `Self::spawn`;
                            /// this method only waits for that task's result on a oneshot channel. If the waiting caller
                            /// goes away, the spawned task still runs to completion and records the outcome in
                            /// `LAYER_IMPL_METRICS` instead of delivering it.
                            ///
                            /// # Errors
                            ///
                            /// - `DownloadError::DownloadCancelled` when the timeline gate cannot be entered, when the
                            ///   download reports `remote_storage::DownloadError::Cancelled`, or when the sending side
                            ///   of the channel is gone before a result arrives.
                            /// - `DownloadError::DownloadFailed` for any other download error.
    1141           84 :     async fn download_init_and_wait(
    1142           84 :         self: &Arc<Self>,
    1143           84 :         timeline: Arc<Timeline>,
    1144           84 :         permit: heavier_once_cell::InitPermit,
    1145           84 :         ctx: RequestContext,
    1146           84 :     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
    1147           84 :         debug_assert_current_span_has_tenant_and_timeline_id();
    1148           84 : 
    1149           84 :         let (tx, rx) = tokio::sync::oneshot::channel();
    1150           84 : 
    1151           84 :         let this: Arc<Self> = self.clone();
    1152              : 
                                // entered here, before spawning, so that a failure to enter is reported to the caller;
                                // the guard is then moved into the spawned task and held until that task ends
    1153           84 :         let guard = timeline
    1154           84 :             .gate
    1155           84 :             .enter()
    1156           84 :             .map_err(|_| DownloadError::DownloadCancelled)?;
    1157              : 
    1158           84 :         Self::spawn(
    1159           84 :             async move {
    1160            0 :                 let _guard = guard;
    1161            0 : 
    1162            0 :                 // now that we have committed to downloading, send out an update to:
    1163            0 :                 // - unhang any pending eviction
    1164            0 :                 // - break out of evict_and_wait
    1165            0 :                 this.status
    1166            0 :                     .as_ref()
    1167            0 :                     .unwrap()
    1168            0 :                     .send_replace(Status::Downloading);
    1169           84 : 
    1170           84 :                 #[cfg(test)]
    1171           84 :                 this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading)
    1172           84 :                     .await
    1173           84 :                     .unwrap();
    1174              : 
    1175           84 :                 let res = this.download_and_init(timeline, permit, &ctx).await;
    1176              : 
                                        // `send` hands the value back as `Err` when the receiving half is gone, i.e. nobody
                                        // is waiting for the result anymore; the outcome is then only logged and counted
    1177           84 :                 if let Err(res) = tx.send(res) {
    1178            0 :                     match res {
    1179            0 :                         Ok(_res) => {
    1180            0 :                             tracing::debug!("layer initialized, but caller has been cancelled");
    1181            0 :                             LAYER_IMPL_METRICS.inc_init_completed_without_requester();
    1182              :                         }
    1183            0 :                         Err(e) => {
    1184            0 :                             tracing::info!(
    1185            0 :                                 "layer file download failed, and caller has been cancelled: {e:?}"
    1186              :                             );
    1187            0 :                             LAYER_IMPL_METRICS.inc_download_failed_without_requester();
    1188              :                         }
    1189              :                     }
    1190           84 :                 }
    1191           84 :             }
    1192           84 :             .in_current_span(),
    1193           84 :         );
    1194           84 : 
                                // outer `Result`: whether the spawned task sent anything at all;
                                // inner `Result`: the outcome of the download itself
    1195           84 :         match rx.await {
    1196           84 :             Ok(Ok(res)) => Ok(res),
    1197              :             Ok(Err(remote_storage::DownloadError::Cancelled)) => {
    1198            0 :                 Err(DownloadError::DownloadCancelled)
    1199              :             }
    1200            0 :             Ok(Err(_)) => Err(DownloadError::DownloadFailed),
    1201            0 :             Err(_gone) => Err(DownloadError::DownloadCancelled),
    1202              :         }
    1203           84 :     }
    1204              : 
    1205           84 :     async fn download_and_init(
    1206           84 :         self: &Arc<LayerInner>,
    1207           84 :         timeline: Arc<Timeline>,
    1208           84 :         permit: heavier_once_cell::InitPermit,
    1209           84 :         ctx: &RequestContext,
    1210           84 :     ) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
                                // Downloads the layer file with `timeline.remote_client.download_layer_file` and, on
                                // success, initializes `self.inner` through `initialize_after_layer_is_on_disk`.
                                //
                                // On success: the downloaded size is asserted against `desc.file_size`, the file is
                                // re-checked with `needs_download`, size/latency/download metrics are recorded and the
                                // failure streak is reset.
                                //
                                // On failure: the failure streak is incremented and the error is returned. Unless the
                                // timeline is already cancelled, the error is logged and the return is delayed by a
                                // backoff derived from the streak; the backoff sleep ends early on cancellation.
                                //
                                // Panics if the size assertion or the post-download `needs_download` check fails.
    1211           84 :         let start = std::time::Instant::now();
    1212           84 :         let result = timeline
    1213           84 :             .remote_client
    1214           84 :             .download_layer_file(
    1215           84 :                 &self.desc.layer_name(),
    1216           84 :                 &self.metadata(),
    1217           84 :                 &self.path,
    1218           84 :                 &timeline.gate,
    1219           84 :                 &timeline.cancel,
    1220           84 :                 ctx,
    1221           84 :             )
    1222           84 :             .await;
                                // latency is measured for both outcomes and reported in either log line below
    1223           84 :         let latency = start.elapsed();
    1224           84 :         let latency_millis = u64::try_from(latency.as_millis()).unwrap();
    1225           84 :         match result {
    1226           84 :             Ok(size) => {
    1227           84 :                 assert_eq!(size, self.desc.file_size);
    1228              : 
    1229           84 :                 match self.needs_download().await {
    1230            0 :                     Ok(Some(reason)) => {
    1231            0 :                         // this is really a bug in needs_download or remote timeline client
    1232            0 :                         panic!("post-condition failed: needs_download returned {reason:?}");
    1233              :                     }
    1234           84 :                     Ok(None) => {
    1235           84 :                         // as expected
    1236           84 :                     }
    1237            0 :                     Err(e) => {
    1238            0 :                         panic!("post-condition failed: needs_download errored: {e:?}");
    1239              :                     }
    1240              :                 };
    1241           84 :                 tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
    1242           84 :                 timeline
    1243           84 :                     .metrics
    1244           84 :                     .resident_physical_size_add(self.desc.file_size);
                                        // a success ends the failure streak that drives the backoff in the error branch
    1245           84 :                 self.consecutive_failures.store(0, Ordering::Relaxed);
    1246           84 : 
                                        // `take()` clears the stored timestamp, so a given eviction is reported at most once
    1247           84 :                 let since_last_eviction = self
    1248           84 :                     .last_evicted_at
    1249           84 :                     .lock()
    1250           84 :                     .unwrap()
    1251           84 :                     .take()
    1252           84 :                     .map(|ts| ts.elapsed());
    1253           84 :                 if let Some(since_last_eviction) = since_last_eviction {
    1254           84 :                     LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
    1255           84 :                 }
    1256              : 
    1257           84 :                 self.access_stats.record_residence_event();
    1258           84 : 
                                        // on-demand download metrics are labelled by the requesting task kind
    1259           84 :                 let task_kind: &'static str = ctx.task_kind().into();
    1260           84 :                 ONDEMAND_DOWNLOAD_BYTES
    1261           84 :                     .with_label_values(&[task_kind])
    1262           84 :                     .inc_by(self.desc.file_size);
    1263           84 :                 ONDEMAND_DOWNLOAD_COUNT
    1264           84 :                     .with_label_values(&[task_kind])
    1265           84 :                     .inc();
    1266           84 : 
    1267           84 :                 Ok(self.initialize_after_layer_is_on_disk(permit))
    1268              :             }
    1269            0 :             Err(e) => {
                                        // `fetch_add` returns the previous value, hence the `1 +` to get the new streak
    1270            0 :                 let consecutive_failures =
    1271            0 :                     1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
    1272            0 : 
    1273            0 :                 if timeline.cancel.is_cancelled() {
    1274              :                     // If we're shutting down, drop out before logging the error
    1275            0 :                     return Err(e);
    1276            0 :                 }
    1277            0 : 
    1278            0 :                 tracing::error!(consecutive_failures, %latency_millis, "layer file download failed: {e:#}");
    1279              : 
                                        // the streak is clamped to `u32::MAX` before the narrowing cast so it cannot wrap.
                                        // NOTE(review): 1.5 and 60.0 look like the backoff base and the cap in seconds --
                                        // confirm against `utils::backoff::exponential_backoff_duration_seconds`
    1280            0 :                 let backoff = utils::backoff::exponential_backoff_duration_seconds(
    1281            0 :                     consecutive_failures.min(u32::MAX as usize) as u32,
    1282            0 :                     1.5,
    1283            0 :                     60.0,
    1284            0 :                 );
    1285            0 : 
    1286            0 :                 let backoff = std::time::Duration::from_secs_f64(backoff);
    1287            0 : 
                                        // delay returning the error by the backoff, but stop waiting as soon as the
                                        // timeline is cancelled
    1288            0 :                 tokio::select! {
    1289            0 :                     _ = tokio::time::sleep(backoff) => {},
    1290            0 :                     _ = timeline.cancel.cancelled() => {},
    1291              :                 };
    1292              : 
    1293            0 :                 Err(e)
    1294              :             }
    1295              :         }
    1296           84 :     }
    1297              : 
    1298              :     /// Initializes the `Self::inner` to a "resident" state.
    1299              :     ///
    1300              :     /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download`
    1301              :     /// before calling this method.
    1302              :     ///
    1303              :     /// If this method is ever made async, it needs to be cancellation safe so that no state
    1304              :     /// changes are made before we can write to the OnceCell in non-cancellable fashion.
                            ///
                            /// Consumes `permit` to store `ResidentOrWantedEvicted::Resident` in `Self::inner` and
                            /// returns a strong reference to the same `DownloadedLayer` that was stored. The returned
                            /// layer carries a freshly incremented version number.
    1305          120 :     fn initialize_after_layer_is_on_disk(
    1306          120 :         self: &Arc<LayerInner>,
    1307          120 :         permit: heavier_once_cell::InitPermit,
    1308          120 :     ) -> Arc<DownloadedLayer> {
    1309          120 :         debug_assert_current_span_has_tenant_and_timeline_id();
    1310          120 : 
    1311          120 :         // disable any scheduled but not yet running eviction deletions for this initialization
                                // (`fetch_add` returns the previous value, so `1 +` yields the version now stored)
    1312          120 :         let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
    1313          120 :         self.status.as_ref().unwrap().send_replace(Status::Resident);
    1314          120 : 
                                // the layer only holds a weak reference back to its owner; `kind` starts out as an
                                // empty `OnceCell`
    1315          120 :         let res = Arc::new(DownloadedLayer {
    1316          120 :             owner: Arc::downgrade(self),
    1317          120 :             kind: tokio::sync::OnceCell::default(),
    1318          120 :             version: next_version,
    1319          120 :         });
    1320          120 : 
                                // informational only: the count is read for the log line and not used otherwise
    1321          120 :         let waiters = self.inner.initializer_count();
    1322          120 :         if waiters > 0 {
    1323            0 :             tracing::info!(waiters, "completing layer init for other tasks");
    1324          120 :         }
    1325              : 
    1326          120 :         let value = ResidentOrWantedEvicted::Resident(res.clone());
    1327          120 : 
    1328          120 :         self.inner.set(value, permit);
    1329          120 : 
    1330          120 :         res
    1331          120 :     }
    1332              : 
    1333         2136 :     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
                                // Checks the file at `self.path` using `tokio::fs::metadata`.
                                //
                                // - `Ok(None)`: the file exists and passes `is_file_present_and_good_size`.
                                // - `Ok(Some(reason))`: a download is needed; `reason` is `NeedsDownload::NotFound` for
                                //   a missing file, otherwise whatever `is_file_present_and_good_size` rejected.
                                // - `Err(e)`: any I/O error other than `NotFound`.
    1334         2136 :         match tokio::fs::metadata(&self.path).await {
    1335         2040 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
    1336           96 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
    1337            0 :             Err(e) => Err(e),
    1338              :         }
    1339         2136 :     }
    1340              : 
    1341          732 :     fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
                                // Non-async counterpart of `needs_download`: the metadata lookup on `self.path` is done
                                // with a direct `metadata()` call instead of `tokio::fs`.
                                //
                                // - `Ok(None)`: the file exists and passes `is_file_present_and_good_size`.
                                // - `Ok(Some(reason))`: a download is needed (`NeedsDownload::NotFound` for a missing
                                //   file, otherwise the reason from `is_file_present_and_good_size`).
                                // - `Err(e)`: any I/O error other than `NotFound`.
    1342          732 :         match self.path.metadata() {
    1343          732 :             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
    1344            0 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
    1345            0 :             Err(e) => Err(e),
    1346              :         }
    1347          732 :     }
    1348              : 
    1349         2772 :     fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
    1350         2772 :         // in future, this should include sha2-256 validation of the file.
                                //
                                // Validates already-fetched metadata `m` against the layer descriptor:
                                // - not a regular file            -> `NeedsDownload::NotFile` with the actual file type
                                // - length != `desc.file_size`    -> `NeedsDownload::WrongSize { actual, expected }`
                                // - otherwise                     -> `Ok(())`
                                // The file-type check comes first, so a non-file is never reported as a wrong size.
    1351         2772 :         if !m.is_file() {
    1352            0 :             Err(NeedsDownload::NotFile(m.file_type()))
    1353         2772 :         } else if m.len() != self.desc.file_size {
    1354            0 :             Err(NeedsDownload::WrongSize {
    1355            0 :                 actual: m.len(),
    1356            0 :                 expected: self.desc.file_size,
    1357            0 :             })
    1358              :         } else {
    1359         2772 :             Ok(())
    1360              :         }
    1361         2772 :     }
    1362              : 
    1363            0 :     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
                                // Builds the `HistoricLayerInfo` API model for this layer: the `Delta` variant when
                                // `desc.is_delta` is set, the `Image` variant otherwise.
                                //
                                // `reset` is forwarded to `access_stats.as_api_model`.
    1364            0 :         let layer_name = self.desc.layer_name().to_string();
    1365            0 : 
                                // an uninitialized `self.inner` counts as not resident; note that the check used here is
                                // `is_likely_resident`, so the answer is a best-effort one
    1366            0 :         let resident = self
    1367            0 :             .inner
    1368            0 :             .get()
    1369            0 :             .map(|rowe| rowe.is_likely_resident())
    1370            0 :             .unwrap_or(false);
    1371            0 : 
    1372            0 :         let access_stats = self.access_stats.as_api_model(reset);
    1373            0 : 
    1374            0 :         if self.desc.is_delta {
    1375            0 :             let lsn_range = &self.desc.lsn_range;
    1376            0 : 
    1377            0 :             HistoricLayerInfo::Delta {
    1378            0 :                 layer_file_name: layer_name,
    1379            0 :                 layer_file_size: self.desc.file_size,
    1380            0 :                 lsn_start: lsn_range.start,
    1381            0 :                 lsn_end: lsn_range.end,
                                        // `remote` is reported as the negation of the residency check above
    1382            0 :                 remote: !resident,
    1383            0 :                 access_stats,
    1384            0 :                 l0: crate::tenant::layer_map::LayerMap::is_l0(
    1385            0 :                     &self.layer_desc().key_range,
    1386            0 :                     self.layer_desc().is_delta,
    1387            0 :                 ),
    1388            0 :             }
    1389              :         } else {
                                    // image layers report a single LSN instead of a range
    1390            0 :             let lsn = self.desc.image_layer_lsn();
    1391            0 : 
    1392            0 :             HistoricLayerInfo::Image {
    1393            0 :                 layer_file_name: layer_name,
    1394            0 :                 layer_file_size: self.desc.file_size,
    1395            0 :                 lsn_start: lsn,
    1396            0 :                 remote: !resident,
    1397            0 :                 access_stats,
    1398            0 :             }
    1399              :         }
    1400            0 :     }
    1401              : 
    1402              :     /// `DownloadedLayer` is being dropped, so it calls this method.
                            ///
                            /// Does not evict anything itself: it spawns a task (via `Self::spawn`) that runs
                            /// `wait_for_turn_and_evict(only_version)` and then returns immediately. `only_version` is
                            /// passed through unchanged and is also recorded on the task's span.
    1403          240 :     fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
    1404              :         // we cannot know without inspecting LayerInner::inner if we should evict or not, even
    1405              :         // though here it is very likely
                                //
                                // `parent: None` makes this a root span, so the eviction is not attributed to whatever
                                // span happens to be current where the `DownloadedLayer` is dropped
    1406          240 :         let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
    1407              : 
    1408              :         // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
    1409              :         // drop while the `self.inner` is being locked, leading to a deadlock.
    1410              : 
    1411          240 :         let start_evicting = async move {
    1412          240 :             #[cfg(test)]
    1413          240 :             self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting)
    1414          240 :                 .await
    1415          240 :                 .expect("failpoint should not have errored");
    1416          240 : 
    1417          240 :             tracing::debug!("eviction started");
    1418              : 
    1419          240 :             let res = self.wait_for_turn_and_evict(only_version).await;
    1420              :             // metrics: ignore the Ok branch, it is not done yet
                                    // an `Err` means the eviction did not go ahead; the reason is counted per variant
    1421          240 :             if let Err(e) = res {
    1422           36 :                 tracing::debug!(res=?Err::<(), _>(&e), "eviction completed");
    1423           36 :                 LAYER_IMPL_METRICS.inc_eviction_cancelled(e);
    1424          204 :             }
    1425          240 :         };
    1426              : 
    1427          240 :         Self::spawn(start_evicting.instrument(span));
    1428          240 :     }
    1429              : 
    1430          240 :     async fn wait_for_turn_and_evict(
    1431          240 :         self: Arc<LayerInner>,
    1432          240 :         only_version: usize,
    1433          240 :     ) -> Result<(), EvictionCancelled> {
    1434          468 :         fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> {
    1435              :             use Status::*;
    1436          468 :             match status {
    1437          456 :                 Resident => Ok(()),
    1438           12 :                 Evicted => Err(EvictionCancelled::UnexpectedEvictedState),
    1439            0 :                 Downloading => Err(EvictionCancelled::LostToDownload),
    1440              :             }
    1441          468 :         }
    1442              : 
    1443          240 :         let timeline = self
    1444          240 :             .timeline
    1445          240 :             .upgrade()
    1446          240 :             .ok_or(EvictionCancelled::TimelineGone)?;
    1447              : 
    1448          240 :         let mut rx = self
    1449          240 :             .status
    1450          240 :             .as_ref()
    1451          240 :             .expect("LayerInner cannot be dropped, holding strong ref")
    1452          240 :             .subscribe();
    1453          240 : 
    1454          240 :         is_good_to_continue(&rx.borrow_and_update())?;
    1455              : 
    1456          228 :         let Ok(gate) = timeline.gate.enter() else {
    1457            0 :             return Err(EvictionCancelled::TimelineGone);
    1458              :         };
    1459              : 
    1460          204 :         let permit = {
    1461              :             // we cannot just `std::fs::remove_file` because there might already be an
    1462              :             // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem
    1463              :             // operations must be done while holding the heavier_once_cell::InitPermit
    1464          228 :             let mut wait = std::pin::pin!(self.inner.get_or_init_detached());
    1465              : 
    1466          228 :             let waited = loop {
    1467              :                 // we must race to the Downloading starting, otherwise we would have to wait until the
    1468              :                 // completion of the download. waiting for download could be long and hinder our
    1469              :                 // efforts to alert on "hanging" evictions.
    1470          228 :                 tokio::select! {
    1471          228 :                     res = &mut wait => break res,
    1472          228 :                     _ = rx.changed() => {
    1473            0 :                         is_good_to_continue(&rx.borrow_and_update())?;
    1474              :                         // two possibilities for Status::Resident:
    1475              :                         // - the layer was found locally from disk by a read
    1476              :                         // - we missed a bunch of updates and now the layer is
    1477              :                         // again downloaded -- assume we'll fail later on with
    1478              :                         // version check or AlreadyReinitialized
    1479              :                     }
    1480              :                 }
    1481              :             };
    1482              : 
    1483              :             // re-check now that we have the guard or permit; all updates should have happened
    1484              :             // while holding the permit.
    1485          228 :             is_good_to_continue(&rx.borrow_and_update())?;
    1486              : 
    1487              :             // the term deinitialize is used here, because we clearing out the Weak will eventually
    1488              :             // lead to deallocating the reference counted value, and the value we
    1489              :             // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned.
    1490          228 :             let (_weak, permit) = match waited {
    1491          216 :                 Ok(guard) => {
    1492          216 :                     match &*guard {
    1493          204 :                         ResidentOrWantedEvicted::WantedEvicted(_weak, version)
    1494          204 :                             if *version == only_version =>
    1495          192 :                         {
    1496          192 :                             tracing::debug!(version, "deinitializing matching WantedEvicted");
    1497          192 :                             let (weak, permit) = guard.take_and_deinit();
    1498          192 :                             (Some(weak), permit)
    1499              :                         }
    1500           12 :                         ResidentOrWantedEvicted::WantedEvicted(_, version) => {
    1501           12 :                             // if we were not doing the version check, we would need to try to
    1502           12 :                             // upgrade the weak here to see if it really is dropped. version check
    1503           12 :                             // is done instead assuming that it is cheaper.
    1504           12 :                             tracing::debug!(
    1505              :                                 version,
    1506              :                                 only_version,
    1507            0 :                                 "version mismatch, not deinitializing"
    1508              :                             );
    1509           12 :                             return Err(EvictionCancelled::VersionCheckFailed);
    1510              :                         }
    1511              :                         ResidentOrWantedEvicted::Resident(_) => {
    1512           12 :                             return Err(EvictionCancelled::AlreadyReinitialized);
    1513              :                         }
    1514              :                     }
    1515              :                 }
    1516           12 :                 Err(permit) => {
    1517           12 :                     tracing::debug!("continuing after cancelled get_or_maybe_download or eviction");
    1518           12 :                     (None, permit)
    1519              :                 }
    1520              :             };
    1521              : 
    1522          204 :             permit
    1523          204 :         };
    1524          204 : 
    1525          204 :         let span = tracing::Span::current();
    1526          204 : 
    1527          204 :         let spawned_at = std::time::Instant::now();
    1528          204 : 
    1529          204 :         // this is on purpose a detached spawn; we don't need to wait for it
    1530          204 :         //
    1531          204 :         // eviction completion reporting is the only thing hinging on this, and it can be just as
    1532          204 :         // well from a spawn_blocking thread.
    1533          204 :         //
    1534          204 :         // important to note that now that we've acquired the permit we have made sure the evicted
    1535          204 :         // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case
    1536          204 :         // there are multiple evictions. The rest is not cancellable, and we've now committed to
    1537          204 :         // evicting.
    1538          204 :         //
    1539          204 :         // If spawn_blocking has a queue and maximum number of threads are in use, we could stall
    1540          204 :         // reads. We will need to add cancellation for that if necessary.
    1541          204 :         Self::spawn_blocking(move || {
    1542          204 :             let _span = span.entered();
    1543          204 : 
    1544          204 :             let res = self.evict_blocking(&timeline, &gate, &permit);
    1545          204 : 
    1546          204 :             let waiters = self.inner.initializer_count();
    1547          204 : 
    1548          204 :             if waiters > 0 {
    1549            0 :                 LAYER_IMPL_METRICS.inc_evicted_with_waiters();
    1550          204 :             }
    1551              : 
    1552          204 :             let completed_in = spawned_at.elapsed();
    1553          204 :             LAYER_IMPL_METRICS.record_time_to_evict(completed_in);
    1554          204 : 
    1555          204 :             match res {
    1556          204 :                 Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
    1557            0 :                 Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
    1558              :             }
    1559              : 
    1560          204 :             tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
    1561          204 :         });
    1562          204 : 
    1563          204 :         Ok(())
    1564          240 :     }
    1565              : 
    1566              :     /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
    1567          204 :     fn evict_blocking(
    1568          204 :         &self,
    1569          204 :         timeline: &Timeline,
    1570          204 :         _gate: &gate::GateGuard,
    1571          204 :         _permit: &heavier_once_cell::InitPermit,
    1572          204 :     ) -> Result<(), EvictionCancelled> {
    1573          204 :         // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
    1574          204 : 
    1575          204 :         match capture_mtime_and_remove(&self.path) {
    1576          204 :             Ok(local_layer_mtime) => {
    1577          204 :                 let duration = SystemTime::now().duration_since(local_layer_mtime);
    1578          204 :                 match duration {
    1579          204 :                     Ok(elapsed) => {
    1580          204 :                         let accessed_and_visible = self.access_stats.accessed()
    1581           24 :                             && self.access_stats.visibility() == LayerVisibilityHint::Visible;
    1582          204 :                         if accessed_and_visible {
    1583           24 :                             // Only layers used for reads contribute to our "low residence" metric that is used
    1584           24 :                             // to detect thrashing.  Layers promoted for other reasons (e.g. compaction) are allowed
    1585           24 :                             // to be rapidly evicted without contributing to this metric.
    1586           24 :                             timeline
    1587           24 :                                 .metrics
    1588           24 :                                 .evictions_with_low_residence_duration
    1589           24 :                                 .read()
    1590           24 :                                 .unwrap()
    1591           24 :                                 .observe(elapsed);
    1592          180 :                         }
    1593              : 
    1594          204 :                         tracing::info!(
    1595            0 :                             residence_millis = elapsed.as_millis(),
    1596            0 :                             accessed_and_visible,
    1597            0 :                             "evicted layer after known residence period"
    1598              :                         );
    1599              :                     }
    1600              :                     Err(_) => {
    1601            0 :                         tracing::info!("evicted layer after unknown residence period");
    1602              :                     }
    1603              :                 }
    1604          204 :                 timeline.metrics.evictions.inc();
    1605          204 :                 timeline
    1606          204 :                     .metrics
    1607          204 :                     .resident_physical_size_sub(self.desc.file_size);
    1608              :             }
    1609            0 :             Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
    1610            0 :                 tracing::error!(
    1611              :                     layer_size = %self.desc.file_size,
    1612            0 :                     "failed to evict layer from disk, it was already gone"
    1613              :                 );
    1614            0 :                 return Err(EvictionCancelled::FileNotFound);
    1615              :             }
    1616            0 :             Err(e) => {
    1617            0 :                 // FIXME: this should probably be an abort
    1618            0 :                 tracing::error!("failed to evict file from disk: {e:#}");
    1619            0 :                 return Err(EvictionCancelled::RemoveFailed);
    1620              :             }
    1621              :         }
    1622              : 
    1623          204 :         self.access_stats.record_residence_event();
    1624          204 : 
    1625          204 :         *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
    1626          204 : 
    1627          204 :         self.status.as_ref().unwrap().send_replace(Status::Evicted);
    1628          204 : 
    1629          204 :         Ok(())
    1630          204 :     }
    1631              : 
    1632        19141 :     fn metadata(&self) -> LayerFileMetadata {
    1633        19141 :         LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
    1634        19141 :     }
    1635              : 
    1636              :     /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
    1637              :     ///
    1638              :     /// Synchronizing with spawned tasks is very complicated otherwise.
    1639          324 :     fn spawn<F>(fut: F)
    1640          324 :     where
    1641          324 :         F: std::future::Future<Output = ()> + Send + 'static,
    1642          324 :     {
    1643          324 :         #[cfg(test)]
    1644          324 :         tokio::task::spawn(fut);
    1645          324 :         #[cfg(not(test))]
    1646          324 :         crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut);
    1647          324 :     }
    1648              : 
    1649              :     /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
    1650         3277 :     fn spawn_blocking<F>(f: F)
    1651         3277 :     where
    1652         3277 :         F: FnOnce() + Send + 'static,
    1653         3277 :     {
    1654         3277 :         #[cfg(test)]
    1655         3277 :         tokio::task::spawn_blocking(f);
    1656         3277 :         #[cfg(not(test))]
    1657         3277 :         crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f);
    1658         3277 :     }
    1659              : }
    1660              : 
    1661          204 : fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
    1662          204 :     let m = path.metadata()?;
    1663          204 :     let local_layer_mtime = m.modified()?;
    1664          204 :     std::fs::remove_file(path)?;
    1665          204 :     Ok(local_layer_mtime)
    1666          204 : }
    1667              : 
    1668              : #[derive(Debug, thiserror::Error)]
    1669              : pub(crate) enum EvictionError {
    1670              :     #[error("layer was already evicted")]
    1671              :     NotFound,
    1672              : 
    1673              :     /// Evictions must always lose to downloads in races, and this time it happened.
    1674              :     #[error("layer was downloaded instead")]
    1675              :     Downloaded,
    1676              : 
    1677              :     #[error("eviction did not happen within timeout")]
    1678              :     Timeout,
    1679              : }
    1680              : 
    1681              : /// Error internal to the [`LayerInner::get_or_maybe_download`]
    1682              : #[derive(Debug, thiserror::Error)]
    1683              : pub(crate) enum DownloadError {
    1684              :     #[error("timeline has already shutdown")]
    1685              :     TimelineShutdown,
    1686              :     #[error("context denies downloading")]
    1687              :     ContextAndConfigReallyDeniesDownloads,
    1688              :     #[error("downloading is really required but not allowed by this method")]
    1689              :     DownloadRequired,
    1690              :     #[error("layer path exists, but it is not a file: {0:?}")]
    1691              :     NotFile(std::fs::FileType),
    1692              :     /// Why no error here? Because it will be reported by page_service. We should have also done
    1693              :     /// retries already.
    1694              :     #[error("downloading evicted layer file failed")]
    1695              :     DownloadFailed,
    1696              :     #[error("downloading failed, possibly for shutdown")]
    1697              :     DownloadCancelled,
    1698              :     #[error("pre-condition: stat before download failed")]
    1699              :     PreStatFailed(#[source] std::io::Error),
    1700              : 
    1701              :     #[cfg(test)]
    1702              :     #[error("failpoint: {0:?}")]
    1703              :     Failpoint(failpoints::FailpointKind),
    1704              : }
    1705              : 
    1706              : impl DownloadError {
    1707            0 :     pub(crate) fn is_cancelled(&self) -> bool {
    1708            0 :         matches!(self, DownloadError::DownloadCancelled)
    1709            0 :     }
    1710              : }
    1711              : 
    1712              : #[derive(Debug, PartialEq)]
    1713              : pub(crate) enum NeedsDownload {
    1714              :     NotFound,
    1715              :     NotFile(std::fs::FileType),
    1716              :     WrongSize { actual: u64, expected: u64 },
    1717              : }
    1718              : 
    1719              : impl std::fmt::Display for NeedsDownload {
    1720           84 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1721           84 :         match self {
    1722           84 :             NeedsDownload::NotFound => write!(f, "file was not found"),
    1723            0 :             NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
    1724            0 :             NeedsDownload::WrongSize { actual, expected } => {
    1725            0 :                 write!(f, "file size mismatch {actual} vs. {expected}")
    1726              :             }
    1727              :         }
    1728           84 :     }
    1729              : }
    1730              : 
    1731              : /// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
    1732              : pub(crate) struct DownloadedLayer {
    1733              :     owner: Weak<LayerInner>,
    1734              :     // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
    1735              :     // DownloadedLayer
    1736              :     kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    1737              :     version: usize,
    1738              : }
    1739              : 
    1740              : impl std::fmt::Debug for DownloadedLayer {
    1741            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1742            0 :         f.debug_struct("DownloadedLayer")
    1743            0 :             // owner omitted because it is always "Weak"
    1744            0 :             .field("kind", &self.kind)
    1745            0 :             .field("version", &self.version)
    1746            0 :             .finish()
    1747            0 :     }
    1748              : }
    1749              : 
    1750              : impl Drop for DownloadedLayer {
    1751         4477 :     fn drop(&mut self) {
    1752         4477 :         if let Some(owner) = self.owner.upgrade() {
    1753          240 :             owner.on_downloaded_layer_drop(self.version);
    1754         4237 :         } else {
    1755         4237 :             // Layer::drop will handle cancelling the eviction; because of drop order and
    1756         4237 :             // `DownloadedLayer` never leaking, we cannot know here if eviction was requested.
    1757         4237 :         }
    1758         4477 :     }
    1759              : }
    1760              : 
    1761              : impl DownloadedLayer {
    1762              :     /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
    1763              :     /// Failure to load the layer is sticky, i.e., future `get()` calls will return
    1764              :     /// the initial load failure immediately.
    1765              :     ///
    1766              :     /// `owner` parameter is a strong reference to the same `LayerInner` as the
    1767              :     /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
    1768              :     /// we will always have the LayerInner on the callstack, so we can just use it.
    1769      1602319 :     async fn get<'a>(
    1770      1602319 :         &'a self,
    1771      1602319 :         owner: &Arc<LayerInner>,
    1772      1602319 :         ctx: &RequestContext,
    1773      1602319 :     ) -> anyhow::Result<&'a LayerKind> {
    1774      1602319 :         let init = || async {
    1775         7512 :             assert_eq!(
    1776         7512 :                 Weak::as_ptr(&self.owner),
    1777         7512 :                 Arc::as_ptr(owner),
    1778            0 :                 "these are the same, just avoiding the upgrade"
    1779              :             );
    1780              : 
    1781         7512 :             let res = if owner.desc.is_delta {
    1782         6612 :                 let ctx = RequestContextBuilder::from(ctx)
    1783         6612 :                     .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary)
    1784         6612 :                     .attached_child();
    1785         6612 :                 let summary = Some(delta_layer::Summary::expected(
    1786         6612 :                     owner.desc.tenant_shard_id.tenant_id,
    1787         6612 :                     owner.desc.timeline_id,
    1788         6612 :                     owner.desc.key_range.clone(),
    1789         6612 :                     owner.desc.lsn_range.clone(),
    1790         6612 :                 ));
    1791         6612 :                 delta_layer::DeltaLayerInner::load(
    1792         6612 :                     &owner.path,
    1793         6612 :                     summary,
    1794         6612 :                     Some(owner.conf.max_vectored_read_bytes),
    1795         6612 :                     &ctx,
    1796         6612 :                 )
    1797         6612 :                 .await
    1798         6612 :                 .map(LayerKind::Delta)
    1799              :             } else {
    1800          900 :                 let ctx = RequestContextBuilder::from(ctx)
    1801          900 :                     .page_content_kind(crate::context::PageContentKind::ImageLayerSummary)
    1802          900 :                     .attached_child();
    1803          900 :                 let lsn = owner.desc.image_layer_lsn();
    1804          900 :                 let summary = Some(image_layer::Summary::expected(
    1805          900 :                     owner.desc.tenant_shard_id.tenant_id,
    1806          900 :                     owner.desc.timeline_id,
    1807          900 :                     owner.desc.key_range.clone(),
    1808          900 :                     lsn,
    1809          900 :                 ));
    1810          900 :                 image_layer::ImageLayerInner::load(
    1811          900 :                     &owner.path,
    1812          900 :                     lsn,
    1813          900 :                     summary,
    1814          900 :                     Some(owner.conf.max_vectored_read_bytes),
    1815          900 :                     &ctx,
    1816          900 :                 )
    1817          900 :                 .await
    1818          900 :                 .map(LayerKind::Image)
    1819              :             };
    1820              : 
    1821         7512 :             match res {
    1822         7512 :                 Ok(layer) => Ok(layer),
    1823            0 :                 Err(err) => {
    1824            0 :                     LAYER_IMPL_METRICS.inc_permanent_loading_failures();
    1825            0 :                     // We log this message once over the lifetime of `Self`
    1826            0 :                     // => Ok and good to log backtrace and path here.
    1827            0 :                     tracing::error!(
    1828            0 :                         "layer load failed, assuming permanent failure: {}: {err:?}",
    1829            0 :                         owner.path
    1830              :                     );
    1831            0 :                     Err(err)
    1832              :                 }
    1833              :             }
    1834        15024 :         };
    1835      1602319 :         self.kind
    1836      1602319 :             .get_or_init(init)
    1837      1602319 :             .await
    1838      1602319 :             .as_ref()
    1839      1602319 :             // We already logged the full backtrace above, once. Don't repeat that here.
    1840      1602319 :             .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
    1841      1602319 :     }
    1842              : 
    1843      1596019 :     async fn get_values_reconstruct_data(
    1844      1596019 :         &self,
    1845      1596019 :         this: ResidentLayer,
    1846      1596019 :         keyspace: KeySpace,
    1847      1596019 :         lsn_range: Range<Lsn>,
    1848      1596019 :         reconstruct_data: &mut ValuesReconstructState,
    1849      1596019 :         ctx: &RequestContext,
    1850      1596019 :     ) -> Result<(), GetVectoredError> {
    1851              :         use LayerKind::*;
    1852              : 
    1853      1596019 :         match self
    1854      1596019 :             .get(&this.owner.0, ctx)
    1855      1596019 :             .await
    1856      1596019 :             .map_err(GetVectoredError::Other)?
    1857              :         {
    1858      1414507 :             Delta(d) => {
    1859      1414507 :                 d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
    1860      1414507 :                     .await
    1861              :             }
    1862       181512 :             Image(i) => {
    1863       181512 :                 i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx)
    1864       181512 :                     .await
    1865              :             }
    1866              :         }
    1867      1596019 :     }
    1868              : 
    1869           24 :     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
    1870              :         use LayerKind::*;
    1871           24 :         match self.get(owner, ctx).await? {
    1872           24 :             Delta(d) => d.dump(ctx).await?,
    1873            0 :             Image(i) => i.dump(ctx).await?,
    1874              :         }
    1875              : 
    1876           24 :         Ok(())
    1877           24 :     }
    1878              : }
    1879              : 
    1880              : /// Wrapper around an actual layer implementation.
    1881              : #[derive(Debug)]
    1882              : enum LayerKind {
    1883              :     Delta(delta_layer::DeltaLayerInner),
    1884              :     Image(image_layer::ImageLayerInner),
    1885              : }
    1886              : 
    1887              : /// Guard for forcing a layer to be resident while it exists.
    1888              : #[derive(Clone)]
    1889              : pub struct ResidentLayer {
    1890              :     owner: Layer,
    1891              :     downloaded: Arc<DownloadedLayer>,
    1892              : }
    1893              : 
    1894              : impl std::fmt::Display for ResidentLayer {
    1895        13008 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1896        13008 :         write!(f, "{}", self.owner)
    1897        13008 :     }
    1898              : }
    1899              : 
    1900              : impl std::fmt::Debug for ResidentLayer {
    1901            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1902            0 :         write!(f, "{}", self.owner)
    1903            0 :     }
    1904              : }
    1905              : 
    1906              : impl ResidentLayer {
    1907              :     /// Release the eviction guard, converting back into a plain [`Layer`].
    1908              :     ///
    1909              :     /// You can access the [`Layer`] also by using `as_ref`.
    1910         2520 :     pub(crate) fn drop_eviction_guard(self) -> Layer {
    1911         2520 :         self.into()
    1912         2520 :     }
    1913              : 
    1914              :     /// Loads all keys stored in the layer. Returns key, lsn and value size.
    1915            0 :     #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
    1916              :     pub(crate) async fn load_keys<'a>(
    1917              :         &'a self,
    1918              :         ctx: &RequestContext,
    1919              :     ) -> anyhow::Result<Vec<pageserver_api::key::Key>> {
    1920              :         use LayerKind::*;
    1921              : 
    1922              :         let owner = &self.owner.0;
    1923              :         let inner = self.downloaded.get(owner, ctx).await?;
    1924              : 
    1925              :         // this is valid because the DownloadedLayer::kind is a OnceCell, not a
    1926              :         // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
    1927              :         // while it's being held.
    1928              :         self.owner.record_access(ctx);
    1929              : 
    1930              :         let res = match inner {
    1931              :             Delta(d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
    1932              :             Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
    1933              :         };
    1934            0 :         res.with_context(|| format!("Layer index is corrupted for {self}"))
    1935              :     }
    1936              : 
    1937              :     /// Read all the keys in this layer which match the ShardIdentity, and write them all to
    1938              :     /// the provided writer.  Return the number of keys written.
    1939           48 :     #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
    1940              :     pub(crate) async fn filter(
    1941              :         &self,
    1942              :         shard_identity: &ShardIdentity,
    1943              :         writer: &mut ImageLayerWriter,
    1944              :         ctx: &RequestContext,
    1945              :     ) -> Result<usize, CompactionError> {
    1946              :         use LayerKind::*;
    1947              : 
    1948              :         match self
    1949              :             .downloaded
    1950              :             .get(&self.owner.0, ctx)
    1951              :             .await
    1952              :             .map_err(CompactionError::Other)?
    1953              :         {
    1954              :             Delta(_) => {
    1955              :                 return Err(CompactionError::Other(anyhow::anyhow!(format!(
    1956              :                     "cannot filter() on a delta layer {self}"
    1957              :                 ))));
    1958              :             }
    1959              :             Image(i) => i
    1960              :                 .filter(shard_identity, writer, ctx)
    1961              :                 .await
    1962              :                 .map_err(CompactionError::Other),
    1963              :         }
    1964              :     }
    1965              : 
    1966              :     /// Returns the amount of keys and values written to the writer.
    1967           60 :     pub(crate) async fn copy_delta_prefix(
    1968           60 :         &self,
    1969           60 :         writer: &mut super::delta_layer::DeltaLayerWriter,
    1970           60 :         until: Lsn,
    1971           60 :         ctx: &RequestContext,
    1972           60 :     ) -> anyhow::Result<usize> {
    1973              :         use LayerKind::*;
    1974              : 
    1975           60 :         let owner = &self.owner.0;
    1976           60 : 
    1977           60 :         match self.downloaded.get(owner, ctx).await? {
    1978           60 :             Delta(d) => d
    1979           60 :                 .copy_prefix(writer, until, ctx)
    1980           60 :                 .await
    1981           60 :                 .with_context(|| format!("copy_delta_prefix until {until} of {self}")),
    1982            0 :             Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
    1983              :         }
    1984           60 :     }
    1985              : 
    1986        11264 :     pub(crate) fn local_path(&self) -> &Utf8Path {
    1987        11264 :         &self.owner.0.path
    1988        11264 :     }
    1989              : 
    1990        13392 :     pub(crate) fn metadata(&self) -> LayerFileMetadata {
    1991        13392 :         self.owner.metadata()
    1992        13392 :     }
    1993              : 
    1994              :     /// Cast the layer to a delta, return an error if it is an image layer.
    1995         5736 :     pub(crate) async fn get_as_delta(
    1996         5736 :         &self,
    1997         5736 :         ctx: &RequestContext,
    1998         5736 :     ) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
    1999              :         use LayerKind::*;
    2000         5736 :         match self.downloaded.get(&self.owner.0, ctx).await? {
    2001         5736 :             Delta(d) => Ok(d),
    2002            0 :             Image(_) => Err(anyhow::anyhow!("image layer")),
    2003              :         }
    2004         5736 :     }
    2005              : 
    2006              :     /// Cast the layer to an image, return an error if it is a delta layer.
    2007          432 :     pub(crate) async fn get_as_image(
    2008          432 :         &self,
    2009          432 :         ctx: &RequestContext,
    2010          432 :     ) -> anyhow::Result<&image_layer::ImageLayerInner> {
    2011              :         use LayerKind::*;
    2012          432 :         match self.downloaded.get(&self.owner.0, ctx).await? {
    2013          432 :             Image(d) => Ok(d),
    2014            0 :             Delta(_) => Err(anyhow::anyhow!("delta layer")),
    2015              :         }
    2016          432 :     }
    2017              : }
    2018              : 
    2019              : impl AsLayerDesc for ResidentLayer {
    2020      7145724 :     fn layer_desc(&self) -> &PersistentLayerDesc {
    2021      7145724 :         self.owner.layer_desc()
    2022      7145724 :     }
    2023              : }
    2024              : 
    2025              : impl AsRef<Layer> for ResidentLayer {
    2026        12396 :     fn as_ref(&self) -> &Layer {
    2027        12396 :         &self.owner
    2028        12396 :     }
    2029              : }
    2030              : 
    2031              : /// Drop the eviction guard.
    2032              : impl From<ResidentLayer> for Layer {
    2033         2520 :     fn from(value: ResidentLayer) -> Self {
    2034         2520 :         value.owner
    2035         2520 :     }
    2036              : }
    2037              : 
    2038              : use metrics::IntCounter;
    2039              : 
    2040              : pub(crate) struct LayerImplMetrics {
    2041              :     started_evictions: IntCounter,
    2042              :     completed_evictions: IntCounter,
    2043              :     cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
    2044              : 
    2045              :     started_deletes: IntCounter,
    2046              :     completed_deletes: IntCounter,
    2047              :     failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
    2048              : 
    2049              :     rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
    2050              :     inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
    2051              :     redownload_after: metrics::Histogram,
    2052              :     time_to_evict: metrics::Histogram,
    2053              : }
    2054              : 
    2055              : impl Default for LayerImplMetrics {
    2056          324 :     fn default() -> Self {
    2057              :         use enum_map::Enum;
    2058              : 
    2059              :         // reminder: these will be pageserver_layer_* with "_total" suffix
    2060              : 
    2061          324 :         let started_evictions = metrics::register_int_counter!(
    2062          324 :             "pageserver_layer_started_evictions",
    2063          324 :             "Evictions started in the Layer implementation"
    2064          324 :         )
    2065          324 :         .unwrap();
    2066          324 :         let completed_evictions = metrics::register_int_counter!(
    2067          324 :             "pageserver_layer_completed_evictions",
    2068          324 :             "Evictions completed in the Layer implementation"
    2069          324 :         )
    2070          324 :         .unwrap();
    2071          324 : 
    2072          324 :         let cancelled_evictions = metrics::register_int_counter_vec!(
    2073          324 :             "pageserver_layer_cancelled_evictions_count",
    2074          324 :             "Different reasons for evictions to have been cancelled or failed",
    2075          324 :             &["reason"]
    2076          324 :         )
    2077          324 :         .unwrap();
    2078          324 : 
    2079         2916 :         let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    2080         2916 :             let reason = EvictionCancelled::from_usize(i);
    2081         2916 :             let s = reason.as_str();
    2082         2916 :             cancelled_evictions.with_label_values(&[s])
    2083         2916 :         }));
    2084          324 : 
    2085          324 :         let started_deletes = metrics::register_int_counter!(
    2086          324 :             "pageserver_layer_started_deletes",
    2087          324 :             "Deletions on drop pending in the Layer implementation"
    2088          324 :         )
    2089          324 :         .unwrap();
    2090          324 :         let completed_deletes = metrics::register_int_counter!(
    2091          324 :             "pageserver_layer_completed_deletes",
    2092          324 :             "Deletions on drop completed in the Layer implementation"
    2093          324 :         )
    2094          324 :         .unwrap();
    2095          324 : 
    2096          324 :         let failed_deletes = metrics::register_int_counter_vec!(
    2097          324 :             "pageserver_layer_failed_deletes_count",
    2098          324 :             "Different reasons for deletions on drop to have failed",
    2099          324 :             &["reason"]
    2100          324 :         )
    2101          324 :         .unwrap();
    2102          324 : 
    2103          648 :         let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    2104          648 :             let reason = DeleteFailed::from_usize(i);
    2105          648 :             let s = reason.as_str();
    2106          648 :             failed_deletes.with_label_values(&[s])
    2107          648 :         }));
    2108          324 : 
    2109          324 :         let rare_counters = metrics::register_int_counter_vec!(
    2110          324 :             "pageserver_layer_assumed_rare_count",
    2111          324 :             "Times unexpected or assumed rare event happened",
    2112          324 :             &["event"]
    2113          324 :         )
    2114          324 :         .unwrap();
    2115          324 : 
    2116         2268 :         let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
    2117         2268 :             let event = RareEvent::from_usize(i);
    2118         2268 :             let s = event.as_str();
    2119         2268 :             rare_counters.with_label_values(&[s])
    2120         2268 :         }));
    2121          324 : 
    2122          324 :         let inits_cancelled = metrics::register_int_counter!(
    2123          324 :             "pageserver_layer_inits_cancelled_count",
    2124          324 :             "Times Layer initialization was cancelled",
    2125          324 :         )
    2126          324 :         .unwrap();
    2127          324 : 
    2128          324 :         let redownload_after = {
    2129          324 :             let minute = 60.0;
    2130          324 :             let hour = 60.0 * minute;
    2131          324 :             metrics::register_histogram!(
    2132          324 :                 "pageserver_layer_redownloaded_after",
    2133          324 :                 "Time between evicting and re-downloading.",
    2134          324 :                 vec![
    2135          324 :                     10.0,
    2136          324 :                     30.0,
    2137          324 :                     minute,
    2138          324 :                     5.0 * minute,
    2139          324 :                     15.0 * minute,
    2140          324 :                     30.0 * minute,
    2141          324 :                     hour,
    2142          324 :                     12.0 * hour,
    2143          324 :                 ]
    2144          324 :             )
    2145          324 :             .unwrap()
    2146          324 :         };
    2147          324 : 
    2148          324 :         let time_to_evict = metrics::register_histogram!(
    2149          324 :             "pageserver_layer_eviction_held_permit_seconds",
    2150          324 :             "Time eviction held the permit.",
    2151          324 :             vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
    2152          324 :         )
    2153          324 :         .unwrap();
    2154          324 : 
    2155          324 :         Self {
    2156          324 :             started_evictions,
    2157          324 :             completed_evictions,
    2158          324 :             cancelled_evictions,
    2159          324 : 
    2160          324 :             started_deletes,
    2161          324 :             completed_deletes,
    2162          324 :             failed_deletes,
    2163          324 : 
    2164          324 :             rare_counters,
    2165          324 :             inits_cancelled,
    2166          324 :             redownload_after,
    2167          324 :             time_to_evict,
    2168          324 :         }
    2169          324 :     }
    2170              : }
    2171              : 
    2172              : impl LayerImplMetrics {
    2173          252 :     fn inc_started_evictions(&self) {
    2174          252 :         self.started_evictions.inc();
    2175          252 :     }
    2176          204 :     fn inc_completed_evictions(&self) {
    2177          204 :         self.completed_evictions.inc();
    2178          204 :     }
    2179           48 :     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
    2180           48 :         self.cancelled_evictions[reason].inc()
    2181           48 :     }
    2182              : 
    2183         3096 :     fn inc_started_deletes(&self) {
    2184         3096 :         self.started_deletes.inc();
    2185         3096 :     }
    2186         3056 :     fn inc_completed_deletes(&self) {
    2187         3056 :         self.completed_deletes.inc();
    2188         3056 :     }
    2189            0 :     fn inc_deletes_failed(&self, reason: DeleteFailed) {
    2190            0 :         self.failed_deletes[reason].inc();
    2191            0 :     }
    2192              : 
    2193              :     /// Counted separatedly from failed layer deletes because we will complete the layer deletion
    2194              :     /// attempt regardless of failure to delete local file.
    2195            0 :     fn inc_delete_removes_failed(&self) {
    2196            0 :         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
    2197            0 :     }
    2198              : 
    2199              :     /// Expected rare just as cancellations are rare, but we could have cancellations separate from
    2200              :     /// the single caller which can start the download, so use this counter to separte them.
    2201            0 :     fn inc_init_completed_without_requester(&self) {
    2202            0 :         self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
    2203            0 :     }
    2204              : 
    2205              :     /// Expected rare because cancellations are unexpected, and failures are unexpected
    2206            0 :     fn inc_download_failed_without_requester(&self) {
    2207            0 :         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
    2208            0 :     }
    2209              : 
    2210              :     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
    2211              :     ///
    2212              :     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    2213              :     /// Option.
    2214            0 :     fn inc_raced_wanted_evicted_accesses(&self) {
    2215            0 :         self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
    2216            0 :     }
    2217              : 
    2218              :     /// These are only expected for [`Self::inc_init_cancelled`] amount when
    2219              :     /// running with remote storage.
    2220           36 :     fn inc_init_needed_no_download(&self) {
    2221           36 :         self.rare_counters[RareEvent::InitWithoutDownload].inc();
    2222           36 :     }
    2223              : 
    2224              :     /// Expected rare because all layer files should be readable and good
    2225            0 :     fn inc_permanent_loading_failures(&self) {
    2226            0 :         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
    2227            0 :     }
    2228              : 
    2229            0 :     fn inc_init_cancelled(&self) {
    2230            0 :         self.inits_cancelled.inc()
    2231            0 :     }
    2232              : 
    2233           84 :     fn record_redownloaded_after(&self, duration: std::time::Duration) {
    2234           84 :         self.redownload_after.observe(duration.as_secs_f64())
    2235           84 :     }
    2236              : 
    2237              :     /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
    2238              :     /// instead cancel eviction if we would have read waiters. We cannot however separate reads
    2239              :     /// from other evictions, so this could have noise as well.
    2240            0 :     fn inc_evicted_with_waiters(&self) {
    2241            0 :         self.rare_counters[RareEvent::EvictedWithWaiters].inc();
    2242            0 :     }
    2243              : 
    2244              :     /// Recorded at least initially as the permit is now acquired in async context before
    2245              :     /// spawn_blocking action.
    2246          204 :     fn record_time_to_evict(&self, duration: std::time::Duration) {
    2247          204 :         self.time_to_evict.observe(duration.as_secs_f64())
    2248          204 :     }
    2249              : }
    2250              : 
    2251              : #[derive(Debug, Clone, Copy, enum_map::Enum)]
    2252              : enum EvictionCancelled {
    2253              :     LayerGone,
    2254              :     TimelineGone,
    2255              :     VersionCheckFailed,
    2256              :     FileNotFound,
    2257              :     RemoveFailed,
    2258              :     AlreadyReinitialized,
    2259              :     /// Not evicted because of a pending reinitialization
    2260              :     LostToDownload,
    2261              :     /// After eviction, there was a new layer access which cancelled the eviction.
    2262              :     UpgradedBackOnAccess,
    2263              :     UnexpectedEvictedState,
    2264              : }
    2265              : 
    2266              : impl EvictionCancelled {
    2267         2916 :     fn as_str(&self) -> &'static str {
    2268         2916 :         match self {
    2269          324 :             EvictionCancelled::LayerGone => "layer_gone",
    2270          324 :             EvictionCancelled::TimelineGone => "timeline_gone",
    2271          324 :             EvictionCancelled::VersionCheckFailed => "version_check_fail",
    2272          324 :             EvictionCancelled::FileNotFound => "file_not_found",
    2273          324 :             EvictionCancelled::RemoveFailed => "remove_failed",
    2274          324 :             EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
    2275          324 :             EvictionCancelled::LostToDownload => "lost_to_download",
    2276          324 :             EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
    2277          324 :             EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
    2278              :         }
    2279         2916 :     }
    2280              : }
    2281              : 
    2282              : #[derive(enum_map::Enum)]
    2283              : enum DeleteFailed {
    2284              :     TimelineGone,
    2285              :     DeleteSchedulingFailed,
    2286              : }
    2287              : 
    2288              : impl DeleteFailed {
    2289          648 :     fn as_str(&self) -> &'static str {
    2290          648 :         match self {
    2291          324 :             DeleteFailed::TimelineGone => "timeline_gone",
    2292          324 :             DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
    2293              :         }
    2294          648 :     }
    2295              : }
    2296              : 
    2297              : #[derive(enum_map::Enum)]
    2298              : enum RareEvent {
    2299              :     RemoveOnDropFailed,
    2300              :     InitCompletedWithoutRequester,
    2301              :     DownloadFailedWithoutRequester,
    2302              :     UpgradedWantedEvicted,
    2303              :     InitWithoutDownload,
    2304              :     PermanentLoadingFailure,
    2305              :     EvictedWithWaiters,
    2306              : }
    2307              : 
    2308              : impl RareEvent {
    2309         2268 :     fn as_str(&self) -> &'static str {
    2310              :         use RareEvent::*;
    2311              : 
    2312         2268 :         match self {
    2313          324 :             RemoveOnDropFailed => "remove_on_drop_failed",
    2314          324 :             InitCompletedWithoutRequester => "init_completed_without",
    2315          324 :             DownloadFailedWithoutRequester => "download_failed_without",
    2316          324 :             UpgradedWantedEvicted => "raced_wanted_evicted",
    2317          324 :             InitWithoutDownload => "init_needed_no_download",
    2318          324 :             PermanentLoadingFailure => "permanent_loading_failure",
    2319          324 :             EvictedWithWaiters => "evicted_with_waiters",
    2320              :         }
    2321         2268 :     }
    2322              : }
    2323              : 
    2324              : pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    2325              :     once_cell::sync::Lazy::new(LayerImplMetrics::default);
        

Generated by: LCOV version 2.1-beta