Line data Source code
1 : use std::ops::Range;
2 : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
3 : use std::sync::{Arc, Weak};
4 : use std::time::{Duration, SystemTime};
5 :
6 : use crate::PERF_TRACE_TARGET;
7 : use anyhow::Context;
8 : use camino::{Utf8Path, Utf8PathBuf};
9 : use pageserver_api::keyspace::KeySpace;
10 : use pageserver_api::models::HistoricLayerInfo;
11 : use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
12 : use tracing::{Instrument, info_span};
13 : use utils::generation::Generation;
14 : use utils::id::TimelineId;
15 : use utils::lsn::Lsn;
16 : use utils::sync::{gate, heavier_once_cell};
17 :
18 : use super::delta_layer::{self};
19 : use super::image_layer::{self};
20 : use super::{
21 : AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
22 : LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState,
23 : };
24 : use crate::config::PageServerConf;
25 : use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
26 : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
27 : use crate::task_mgr::TaskKind;
28 : use crate::tenant::Timeline;
29 : use crate::tenant::remote_timeline_client::LayerFileMetadata;
30 : use crate::tenant::timeline::{CompactionError, GetVectoredError};
31 :
32 : #[cfg(test)]
33 : mod tests;
34 :
35 : #[cfg(test)]
36 : mod failpoints;
37 :
/// NOTE(review): presumably an upper bound, in bytes, on the size of a single object uploaded to
/// S3 — no use of this constant is visible in this file, so confirm against its callers.
38 : pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
39 :
40 : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
41 : /// range of LSNs.
42 : ///
43 : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
44 : /// layers are used to ingest incoming WAL, and provide fast access to the
45 : /// recent page versions. On-disk layers are stored as files on disk, and are
46 : /// immutable. This type represents the on-disk kind, while the in-memory kind is represented by
47 : /// [`InMemoryLayer`].
48 : ///
49 : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
50 : /// A delta layer contains all modifications within a range of LSNs and keys.
51 : /// An image layer is a snapshot of all the data in a key-range, at a single
52 : /// LSN.
53 : ///
54 : /// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a
55 : /// general goal, read accesses should always win eviction and eviction should not wait for
56 : /// download.
57 : ///
58 : /// ### State transitions
59 : ///
60 : /// The internal state of `Layer` is composed of most importantly the on-filesystem state and the
61 : /// [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully downloaded,
62 : /// right size) or deleted.
63 : ///
64 : /// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the
65 : /// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the
66 : /// `heavier_once_cell::InitPermit` has been acquired, any read request
67 : /// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus
68 : /// cancelling the eviction.
69 : ///
70 : /// ```text
71 : /// +-----------------+ get_or_maybe_download +--------------------------------+
72 : /// | not initialized |--------------------------->| Resident(Arc<DownloadedLayer>) |
73 : /// | ENOENT | /->| |
74 : /// +-----------------+ | +--------------------------------+
75 : /// ^ | | ^
76 : /// | get_or_maybe_download | | | get_or_maybe_download, either:
77 : /// evict_blocking | /-------------------------/ | | - upgrade weak to strong
78 : /// | | | | - re-initialize without download
79 : /// | | evict_and_wait | |
80 : /// +-----------------+ v |
81 : /// | not initialized | on_downloaded_layer_drop +--------------------------------------+
82 : /// | file is present |<---------------------------| WantedEvicted(Weak<DownloadedLayer>) |
83 : /// +-----------------+ +--------------------------------------+
84 : /// ```
85 : ///
86 : /// ### Unsupported
87 : ///
88 : /// - Evicting by the operator deleting files from the filesystem
89 : ///
90 : /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
// Cloning copies only the `Arc` handle, so every clone refers to the same `LayerInner`; the
// `PartialEq` impl for `Layer` compares exactly these `Arc` pointers.
91 : #[derive(Clone)]
92 : pub(crate) struct Layer(Arc<LayerInner>);
93 :
94 : impl std::fmt::Display for Layer {
95 4328 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96 4328 : write!(
97 4328 : f,
98 4328 : "{}{}",
99 4328 : self.layer_desc().short_id(),
100 4328 : self.0.generation.get_suffix()
101 4328 : )
102 4328 : }
103 : }
104 :
105 : impl std::fmt::Debug for Layer {
106 8 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
107 8 : write!(f, "{}", self)
108 8 : }
109 : }
110 :
111 : impl AsLayerDesc for Layer {
112 3845632 : fn layer_desc(&self) -> &PersistentLayerDesc {
113 3845632 : self.0.layer_desc()
114 3845632 : }
115 : }
116 :
117 : impl PartialEq for Layer {
118 7 : fn eq(&self, other: &Self) -> bool {
119 7 : Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
120 7 : }
121 : }
122 :
123 3844 : pub(crate) fn local_layer_path(
124 3844 : conf: &PageServerConf,
125 3844 : tenant_shard_id: &TenantShardId,
126 3844 : timeline_id: &TimelineId,
127 3844 : layer_file_name: &LayerName,
128 3844 : generation: &Generation,
129 3844 : ) -> Utf8PathBuf {
130 3844 : let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
131 3844 :
132 3844 : if generation.is_none() {
133 : // Without a generation, we may only use legacy path style
134 0 : timeline_path.join(layer_file_name.to_string())
135 : } else {
136 3844 : timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
137 : }
138 3844 : }
139 :
/// Answer to "when was this layer last evicted?", as produced by `Layer::last_evicted_at`.
pub(crate) enum LastEviction {
    /// No eviction timestamp has been recorded.
    Never,
    /// An eviction was recorded at this instant.
    At(std::time::Instant),
    /// The eviction timestamp could not be read because its lock was held at query time
    /// (`Layer::last_evicted_at` maps `TryLockError::WouldBlock` to this variant).
    Evicting,
}

impl LastEviction {
    /// Tells whether the last eviction is strictly later than `timepoint`.
    ///
    /// `Never` is never later and `Evicting` always counts as later. An eviction recorded
    /// exactly at `timepoint` does not count.
    pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
        match *self {
            LastEviction::Never => false,
            LastEviction::Evicting => true,
            LastEviction::At(evicted_at) => timepoint < evicted_at,
        }
    }
}
155 :
156 : impl Layer {
157 : /// Creates a layer value for a file we know to not be resident.
158 0 : pub(crate) fn for_evicted(
159 0 : conf: &'static PageServerConf,
160 0 : timeline: &Arc<Timeline>,
161 0 : file_name: LayerName,
162 0 : metadata: LayerFileMetadata,
163 0 : ) -> Self {
164 0 : let local_path = local_layer_path(
165 0 : conf,
166 0 : &timeline.tenant_shard_id,
167 0 : &timeline.timeline_id,
168 0 : &file_name,
169 0 : &metadata.generation,
170 0 : );
171 0 :
172 0 : let desc = PersistentLayerDesc::from_filename(
173 0 : timeline.tenant_shard_id,
174 0 : timeline.timeline_id,
175 0 : file_name,
176 0 : metadata.file_size,
177 0 : );
178 0 :
179 0 : let owner = Layer(Arc::new(LayerInner::new(
180 0 : conf,
181 0 : timeline,
182 0 : local_path,
183 0 : desc,
184 0 : None,
185 0 : metadata.generation,
186 0 : metadata.shard,
187 0 : )));
188 0 :
189 0 : debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
190 :
191 0 : owner
192 0 : }
193 :
194 : /// Creates a Layer value for a file we know to be resident in timeline directory.
195 244 : pub(crate) fn for_resident(
196 244 : conf: &'static PageServerConf,
197 244 : timeline: &Arc<Timeline>,
198 244 : local_path: Utf8PathBuf,
199 244 : file_name: LayerName,
200 244 : metadata: LayerFileMetadata,
201 244 : ) -> ResidentLayer {
202 244 : let desc = PersistentLayerDesc::from_filename(
203 244 : timeline.tenant_shard_id,
204 244 : timeline.timeline_id,
205 244 : file_name,
206 244 : metadata.file_size,
207 244 : );
208 244 :
209 244 : let mut resident = None;
210 244 :
211 244 : let owner = Layer(Arc::new_cyclic(|owner| {
212 244 : let inner = Arc::new(DownloadedLayer {
213 244 : owner: owner.clone(),
214 244 : kind: tokio::sync::OnceCell::default(),
215 244 : version: 0,
216 244 : });
217 244 : resident = Some(inner.clone());
218 244 :
219 244 : LayerInner::new(
220 244 : conf,
221 244 : timeline,
222 244 : local_path,
223 244 : desc,
224 244 : Some(inner),
225 244 : metadata.generation,
226 244 : metadata.shard,
227 244 : )
228 244 : }));
229 244 :
230 244 : let downloaded = resident.expect("just initialized");
231 244 :
232 244 : debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
233 :
234 244 : timeline
235 244 : .metrics
236 244 : .resident_physical_size_add(metadata.file_size);
237 244 :
238 244 : ResidentLayer { downloaded, owner }
239 244 : }
240 :
241 : /// Creates a Layer value for freshly written out new layer file by renaming it from a
242 : /// temporary path.
243 3632 : pub(crate) fn finish_creating(
244 3632 : conf: &'static PageServerConf,
245 3632 : timeline: &Arc<Timeline>,
246 3632 : desc: PersistentLayerDesc,
247 3632 : temp_path: &Utf8Path,
248 3632 : ) -> anyhow::Result<ResidentLayer> {
249 3632 : let mut resident = None;
250 3632 :
251 3632 : let owner = Layer(Arc::new_cyclic(|owner| {
252 3632 : let inner = Arc::new(DownloadedLayer {
253 3632 : owner: owner.clone(),
254 3632 : kind: tokio::sync::OnceCell::default(),
255 3632 : version: 0,
256 3632 : });
257 3632 : resident = Some(inner.clone());
258 3632 :
259 3632 : let local_path = local_layer_path(
260 3632 : conf,
261 3632 : &timeline.tenant_shard_id,
262 3632 : &timeline.timeline_id,
263 3632 : &desc.layer_name(),
264 3632 : &timeline.generation,
265 3632 : );
266 3632 :
267 3632 : LayerInner::new(
268 3632 : conf,
269 3632 : timeline,
270 3632 : local_path,
271 3632 : desc,
272 3632 : Some(inner),
273 3632 : timeline.generation,
274 3632 : timeline.get_shard_index(),
275 3632 : )
276 3632 : }));
277 3632 :
278 3632 : let downloaded = resident.expect("just initialized");
279 3632 :
280 3632 : // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
281 3632 : // TODO: this leaves the temp file in place if the rename fails, risking us running
282 3632 : // out of space. Should we clean it up here or does the calling context deal with this?
283 3632 : utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
284 3632 : .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
285 :
286 3632 : Ok(ResidentLayer { downloaded, owner })
287 3632 : }
288 :
289 : /// Requests the layer to be evicted and waits for this to be done.
290 : ///
291 : /// If the file is not resident, an [`EvictionError::NotFound`] is returned.
292 : ///
293 : /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is
294 : /// re-downloaded, [`EvictionError::Downloaded`] is returned.
295 : ///
296 : /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
297 : /// will happen regardless the future returned by this method completing unless there is a
298 : /// read access before eviction gets to complete.
299 : ///
300 : /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
301 : /// of download-evict cycle on retry.
302 104 : pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
303 104 : self.0.evict_and_wait(timeout).await
304 96 : }
305 :
306 : /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
307 : /// then.
308 : ///
309 : /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
310 : /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
311 : /// the value this is called on gets dropped.
312 : ///
313 : /// This is ensured by both of those methods accepting references to Layer.
314 : ///
315 : /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
316 : /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
317 1040 : pub(crate) fn delete_on_drop(&self) {
318 1040 : self.0.delete_on_drop();
319 1040 : }
320 :
321 480385 : pub(crate) async fn get_values_reconstruct_data(
322 480385 : &self,
323 480385 : keyspace: KeySpace,
324 480385 : lsn_range: Range<Lsn>,
325 480385 : reconstruct_data: &mut ValuesReconstructState,
326 480385 : ctx: &RequestContext,
327 480385 : ) -> Result<(), GetVectoredError> {
328 480385 : let downloaded = {
329 480385 : let ctx = RequestContextBuilder::from(ctx)
330 480385 : .perf_span(|crnt_perf_span| {
331 0 : info_span!(
332 : target: PERF_TRACE_TARGET,
333 0 : parent: crnt_perf_span,
334 : "GET_LAYER",
335 : )
336 480385 : })
337 480385 : .attached_child();
338 480385 :
339 480385 : self.0
340 480385 : .get_or_maybe_download(true, &ctx)
341 480385 : .maybe_perf_instrument(&ctx, |crnt_perf_context| crnt_perf_context.clone())
342 480385 : .await
343 480385 : .map_err(|err| match err {
344 : DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
345 0 : GetVectoredError::Cancelled
346 : }
347 0 : other => GetVectoredError::Other(anyhow::anyhow!(other)),
348 480385 : })?
349 : };
350 :
351 480385 : let this = ResidentLayer {
352 480385 : downloaded: downloaded.clone(),
353 480385 : owner: self.clone(),
354 480385 : };
355 480385 :
356 480385 : self.record_access(ctx);
357 480385 :
358 480385 : let ctx = RequestContextBuilder::from(ctx)
359 480385 : .perf_span(|crnt_perf_span| {
360 0 : info_span!(
361 : target: PERF_TRACE_TARGET,
362 0 : parent: crnt_perf_span,
363 : "VISIT_LAYER",
364 : )
365 480385 : })
366 480385 : .attached_child();
367 480385 :
368 480385 : downloaded
369 480385 : .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, &ctx)
370 480385 : .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
371 480385 : .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
372 480385 : .await
373 480385 : .map_err(|err| match err {
374 0 : GetVectoredError::Other(err) => GetVectoredError::Other(
375 0 : err.context(format!("get_values_reconstruct_data for layer {self}")),
376 0 : ),
377 0 : err => err,
378 480385 : })
379 480385 : }
380 :
381 : /// Download the layer if evicted.
382 : ///
383 : /// Will not error when the layer is already downloaded.
384 0 : pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
385 0 : self.0.get_or_maybe_download(true, ctx).await?;
386 0 : Ok(())
387 0 : }
388 :
389 632 : pub(crate) async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
390 632 : self.0.needs_download().await
391 632 : }
392 :
393 : /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
394 : /// while the guard exists.
395 : ///
396 : /// Returns None if the layer is currently evicted or becoming evicted.
397 40 : pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
398 40 : let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;
399 :
400 28 : Some(ResidentLayer {
401 28 : downloaded,
402 28 : owner: self.clone(),
403 28 : })
404 40 : }
405 :
406 : /// Weak indicator of is the layer resident or not. Good enough for eviction, which can deal
407 : /// with `EvictionError::NotFound`.
408 : ///
409 : /// Returns `true` if this layer might be resident, or `false`, if it most likely evicted or
410 : /// will be unless a read happens soon.
411 359 : pub(crate) fn is_likely_resident(&self) -> bool {
412 359 : self.0
413 359 : .inner
414 359 : .get()
415 359 : .map(|rowe| rowe.is_likely_resident())
416 359 : .unwrap_or(false)
417 359 : }
418 :
419 : /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
420 1160 : pub(crate) async fn download_and_keep_resident(
421 1160 : &self,
422 1160 : ctx: &RequestContext,
423 1160 : ) -> Result<ResidentLayer, DownloadError> {
424 1160 : let downloaded = self.0.get_or_maybe_download(true, ctx).await?;
425 :
426 1160 : Ok(ResidentLayer {
427 1160 : downloaded,
428 1160 : owner: self.clone(),
429 1160 : })
430 1160 : }
431 :
432 0 : pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
433 0 : self.0.info(reset)
434 0 : }
435 :
436 68 : pub(crate) fn latest_activity(&self) -> SystemTime {
437 68 : self.0.access_stats.latest_activity()
438 68 : }
439 :
440 184 : pub(crate) fn visibility(&self) -> LayerVisibilityHint {
441 184 : self.0.access_stats.visibility()
442 184 : }
443 :
444 3636 : pub(crate) fn local_path(&self) -> &Utf8Path {
445 3636 : &self.0.path
446 3636 : }
447 :
448 5288 : pub(crate) fn metadata(&self) -> LayerFileMetadata {
449 5288 : self.0.metadata()
450 5288 : }
451 :
452 52 : pub(crate) fn last_evicted_at(&self) -> LastEviction {
453 52 : match self.0.last_evicted_at.try_lock() {
454 52 : Ok(lock) => match *lock {
455 0 : None => LastEviction::Never,
456 52 : Some(at) => LastEviction::At(at),
457 : },
458 0 : Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
459 0 : Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
460 : }
461 52 : }
462 :
463 0 : pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
464 0 : self.0
465 0 : .timeline
466 0 : .upgrade()
467 0 : .map(|timeline| timeline.timeline_id)
468 0 : }
469 :
470 : /// Traditional debug dumping facility
471 : #[allow(unused)]
472 8 : pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
473 8 : self.0.desc.dump();
474 8 :
475 8 : if verbose {
476 : // for now, unconditionally download everything, even if that might not be wanted.
477 8 : let l = self.0.get_or_maybe_download(true, ctx).await?;
478 8 : l.dump(&self.0, ctx).await?
479 0 : }
480 :
481 8 : Ok(())
482 8 : }
483 :
/// Waits until this layer has been dropped (and if needed, local file deletion and remote
/// deletion scheduling has completed).
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// separately.
#[cfg(any(feature = "testing", test))]
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
    let mut status_rx = self.0.status.as_ref().unwrap().subscribe();

    async move {
        // Keep consuming status changes; the wait ends on the first error from `changed()`.
        while status_rx.changed().await.is_ok() {}
    }
}
501 :
502 480385 : fn record_access(&self, ctx: &RequestContext) {
503 480385 : if self.0.access_stats.record_access(ctx) {
504 : // Visibility was modified to Visible: maybe log about this
505 0 : match ctx.task_kind() {
506 : TaskKind::CalculateSyntheticSize
507 : | TaskKind::OndemandLogicalSizeCalculation
508 : | TaskKind::GarbageCollector
509 0 : | TaskKind::MgmtRequest => {
510 0 : // This situation is expected in code paths do binary searches of the LSN space to resolve
511 0 : // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
512 0 : // and on-demand for certain HTTP API requests. On-demand logical size calculation is also included
513 0 : // because it is run as a sub-task of synthetic size.
514 0 : }
515 : _ => {
516 : // In all other contexts, it is unusual to do I/O involving layers which are not visible at
517 : // some branch tip, so we log the fact that we are accessing something that the visibility
518 : // calculation thought should not be visible.
519 : //
520 : // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
521 : // which was covered by a concurrent compaction.
522 0 : tracing::info!(
523 : layer=%self,
524 0 : "became visible as a result of access",
525 : );
526 : }
527 : }
528 :
529 : // Update the timeline's visible bytes count
530 0 : if let Some(tl) = self.0.timeline.upgrade() {
531 0 : tl.metrics
532 0 : .visible_physical_size_gauge
533 0 : .add(self.0.desc.file_size)
534 0 : }
535 480385 : }
536 480385 : }
537 :
538 724 : pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
539 724 : let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
540 : use LayerVisibilityHint::*;
541 724 : match (old_visibility, visibility) {
542 : (Visible, Covered) => {
543 : // Subtract this layer's contribution to the visible size metric
544 72 : if let Some(tl) = self.0.timeline.upgrade() {
545 72 : debug_assert!(
546 72 : tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
547 : );
548 72 : tl.metrics
549 72 : .visible_physical_size_gauge
550 72 : .sub(self.0.desc.file_size)
551 0 : }
552 : }
553 : (Covered, Visible) => {
554 : // Add this layer's contribution to the visible size metric
555 0 : if let Some(tl) = self.0.timeline.upgrade() {
556 0 : tl.metrics
557 0 : .visible_physical_size_gauge
558 0 : .add(self.0.desc.file_size)
559 0 : }
560 : }
561 652 : (Covered, Covered) | (Visible, Visible) => {
562 652 : // no change
563 652 : }
564 : }
565 724 : }
566 : }
567 :
568 : /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
569 : ///
570 : /// However when we want something evicted, we cannot evict it right away as there might be current
571 : /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
572 : /// read with [`Layer::get_values_reconstruct_data`].
573 : ///
574 : /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
575 : #[derive(Debug)]
576 : enum ResidentOrWantedEvicted {
/// Holds a strong reference to the downloaded layer.
577 : Resident(Arc<DownloadedLayer>),
/// Eviction has been requested: only a weak reference is kept, together with the
/// `DownloadedLayer::version` that was current when `downgrade` replaced the strong reference.
578 : WantedEvicted(Weak<DownloadedLayer>, usize),
579 : }
580 :
581 : impl ResidentOrWantedEvicted {
582 : /// Non-mutating access to the a DownloadedLayer, if possible.
583 : ///
584 : /// This is not used on the read path (anything that calls
585 : /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
586 : /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
587 28 : fn get(&self) -> Option<Arc<DownloadedLayer>> {
588 28 : match self {
589 28 : ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
590 0 : ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(),
591 : }
592 28 : }
593 :
594 : /// Best-effort query for residency right now, not as strong guarantee as receiving a strong
595 : /// reference from `ResidentOrWantedEvicted::get`.
596 223 : fn is_likely_resident(&self) -> bool {
597 223 : match self {
598 211 : ResidentOrWantedEvicted::Resident(_) => true,
599 12 : ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0,
600 : }
601 223 : }
602 :
603 : /// Upgrades any weak to strong if possible.
604 : ///
605 : /// Returns a strong reference if possible, along with a boolean telling if an upgrade
606 : /// happened.
607 481545 : fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
608 481545 : match self {
609 481529 : ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
610 16 : ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
611 0 : Some(strong) => {
612 0 : LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
613 0 :
614 0 : *self = ResidentOrWantedEvicted::Resident(strong.clone());
615 0 :
616 0 : Some((strong, true))
617 : }
618 16 : None => None,
619 : },
620 : }
621 481545 : }
622 :
623 : /// When eviction is first requested, drop down to holding a [`Weak`].
624 : ///
625 : /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
626 : /// drop the possibly last strong reference outside of the mutex of
627 : /// [`heavier_once_cell::OnceCell`].
628 92 : fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
629 92 : match self {
630 84 : ResidentOrWantedEvicted::Resident(strong) => {
631 84 : let weak = Arc::downgrade(strong);
632 84 : let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
633 84 : std::mem::swap(self, &mut temp);
634 84 : match temp {
635 84 : ResidentOrWantedEvicted::Resident(strong) => Some(strong),
636 0 : ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
637 : }
638 : }
639 8 : ResidentOrWantedEvicted::WantedEvicted(..) => None,
640 : }
641 92 : }
642 : }
643 :
/// State shared by all clones of a `Layer`, which wraps it in an `Arc`.
644 : struct LayerInner {
645 : /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
646 : /// [`Self::path`].
647 : conf: &'static PageServerConf,
648 :
649 : /// Full path to the file; unclear if this should exist anymore.
650 : path: Utf8PathBuf,
651 :
/// Descriptor of this layer; this is what `AsLayerDesc::layer_desc` returns.
652 : desc: PersistentLayerDesc,
653 :
654 : /// Timeline access is needed for remote timeline client and metrics.
655 : ///
656 : /// There should not be an access to timeline for any reason without entering the
657 : /// [`Timeline::gate`] at the same time.
658 : timeline: Weak<Timeline>,
659 :
/// Access statistics of this layer, including its visibility hint.
660 : access_stats: LayerAccessStats,
661 :
662 : /// This custom OnceCell is backed by std mutex, but only held for short time periods.
663 : ///
664 : /// Filesystem changes (download, evict) are only done while holding a permit which the
665 : /// `heavier_once_cell` provides.
666 : ///
667 : /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but
668 : /// possibly read while not holding it.
669 : inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
670 :
671 : /// Whether this layer should be deleted, both locally and remotely, when `LayerInner` is dropped.
672 : wanted_deleted: AtomicBool,
673 :
674 : /// Version is to make sure we will only evict a specific initialization of the downloaded file.
675 : ///
676 : /// Incremented for each initialization, stored in `DownloadedLayer::version` or
677 : /// `ResidentOrWantedEvicted::WantedEvicted`.
678 : version: AtomicUsize,
679 :
680 : /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download
681 : /// starts, or completes.
682 : ///
683 : /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard.
684 : /// Holding the InitPermit is the only time we can do state transitions, but we also need to
685 : /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to
686 : /// [`ResidentOrWantedEvicted::Resident`] on access.
687 : ///
688 : /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`].
689 : status: Option<tokio::sync::watch::Sender<Status>>,
690 :
691 : /// Counter for exponential backoff with the download.
692 : ///
693 : /// This is atomic only for the purposes of having additional data only accessed while holding
694 : /// the InitPermit.
695 : consecutive_failures: AtomicUsize,
696 :
697 : /// The generation of this Layer.
698 : ///
699 : /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
700 : /// for created layers from [`Timeline::generation`].
701 : generation: Generation,
702 :
703 : /// The shard of this Layer.
704 : ///
705 : /// For layers created in this process, this will always be the [`ShardIndex`] of the
706 : /// current `ShardIdentity` (TODO: add link once it's introduced).
707 : ///
708 : /// For loaded layers, this may be some other value if the tenant has undergone
709 : /// a shard split since the layer was originally written.
710 : shard: ShardIndex,
711 :
712 : /// When the Layer was last evicted but has not been downloaded since.
713 : ///
714 : /// This is used for skipping evicted layers from the previous heatmap (see
715 : /// `Timeline::generate_heatmap`) and for updating metrics
716 : /// (see [`LayerImplMetrics::redownload_after`]).
717 : last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
718 :
// Only compiled for tests.
719 : #[cfg(test)]
720 : failpoints: std::sync::Mutex<Vec<failpoints::Failpoint>>,
721 : }
722 :
723 : impl std::fmt::Display for LayerInner {
724 108 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
725 108 : write!(f, "{}", self.layer_desc().short_id())
726 108 : }
727 : }
728 :
729 : impl AsLayerDesc for LayerInner {
730 3850840 : fn layer_desc(&self) -> &PersistentLayerDesc {
731 3850840 : &self.desc
732 3850840 : }
733 : }
734 :
/// Value carried by the `LayerInner::status` watch channel.
735 : #[derive(Debug, Clone, Copy)]
736 : enum Status {
/// Initial value when `LayerInner::new` is given a downloaded layer.
737 : Resident,
/// Initial value when `LayerInner::new` is given no downloaded layer.
738 : Evicted,
/// NOTE(review): presumably posted when a download starts, going by the `LayerInner::status`
/// field docs — the posting site is not visible in this part of the file; confirm.
739 : Downloading,
740 : }
741 :
742 : impl Drop for LayerInner {
743 1412 : fn drop(&mut self) {
744 1412 : // if there was a pending eviction, mark it cancelled here to balance metrics
745 1412 : if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
746 4 : {
747 4 : // eviction has already been started
748 4 : LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
749 4 :
750 4 : // eviction request is intentionally not honored as no one is present to wait for it
751 4 : // and we could be delaying shutdown for nothing.
752 1408 : }
753 :
754 1412 : let timeline = self.timeline.upgrade();
755 :
756 1412 : if let Some(timeline) = timeline.as_ref() {
757 : // Only need to decrement metrics if the timeline still exists: otherwise
758 : // it will have already de-registered these metrics via TimelineMetrics::shutdown
759 1380 : timeline.metrics.dec_layer(&self.desc);
760 :
761 1380 : if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
762 1380 : debug_assert!(
763 1380 : timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
764 : );
765 1380 : timeline
766 1380 : .metrics
767 1380 : .visible_physical_size_gauge
768 1380 : .sub(self.desc.file_size);
769 0 : }
770 32 : }
771 :
772 1412 : if !*self.wanted_deleted.get_mut() {
773 392 : return;
774 1020 : }
775 :
776 1020 : let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
777 :
778 1020 : let path = std::mem::take(&mut self.path);
779 1020 : let file_name = self.layer_desc().layer_name();
780 1020 : let file_size = self.layer_desc().file_size;
781 1020 : let meta = self.metadata();
782 1020 : let status = self.status.take();
783 1020 :
784 1020 : Self::spawn_blocking(move || {
785 1020 : let _g = span.entered();
786 1020 :
787 1020 : // carry this until we are finished for [`Layer::wait_drop`] support
788 1020 : let _status = status;
789 :
790 1020 : let Some(timeline) = timeline else {
791 : // no need to nag that timeline is gone: under normal situation on
792 : // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
793 0 : LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
794 0 : return;
795 : };
796 :
797 1020 : let Ok(_guard) = timeline.gate.enter() else {
798 0 : LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
799 0 : return;
800 : };
801 :
802 1020 : let removed = match std::fs::remove_file(path) {
803 1016 : Ok(()) => true,
804 4 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
805 4 : // until we no longer do detaches by removing all local files before removing the
806 4 : // tenant from the global map, we will always get these errors even if we knew what
807 4 : // is the latest state.
808 4 : //
809 4 : // we currently do not track the latest state, so we'll also end up here on evicted
810 4 : // layers.
811 4 : false
812 : }
813 0 : Err(e) => {
814 0 : tracing::error!("failed to remove wanted deleted layer: {e}");
815 0 : LAYER_IMPL_METRICS.inc_delete_removes_failed();
816 0 : false
817 : }
818 : };
819 :
820 1020 : if removed {
821 1016 : timeline.metrics.resident_physical_size_sub(file_size);
822 1016 : }
823 1020 : let res = timeline
824 1020 : .remote_client
825 1020 : .schedule_deletion_of_unlinked(vec![(file_name, meta)]);
826 :
827 1020 : if let Err(e) = res {
828 : // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
829 : // demonstrating this deadlock (without spawn_blocking): stop will drop
830 : // queued items, which will have ResidentLayer's, and those drops would try
831 : // to re-entrantly lock the RemoteTimelineClient inner state.
832 4 : if !timeline.is_active() {
833 4 : tracing::info!("scheduling deletion on drop failed: {e:#}");
834 : } else {
835 0 : tracing::warn!("scheduling deletion on drop failed: {e:#}");
836 : }
837 4 : LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
838 1016 : } else {
839 1016 : LAYER_IMPL_METRICS.inc_completed_deletes();
840 1016 : }
841 1020 : });
842 1412 : }
843 : }
844 :
845 : impl LayerInner {
846 : #[allow(clippy::too_many_arguments)]
847 3876 : fn new(
848 3876 : conf: &'static PageServerConf,
849 3876 : timeline: &Arc<Timeline>,
850 3876 : local_path: Utf8PathBuf,
851 3876 : desc: PersistentLayerDesc,
852 3876 : downloaded: Option<Arc<DownloadedLayer>>,
853 3876 : generation: Generation,
854 3876 : shard: ShardIndex,
855 3876 : ) -> Self {
856 3876 : let (inner, version, init_status) = if let Some(inner) = downloaded {
857 3876 : let version = inner.version;
858 3876 : let resident = ResidentOrWantedEvicted::Resident(inner);
859 3876 : (
860 3876 : heavier_once_cell::OnceCell::new(resident),
861 3876 : version,
862 3876 : Status::Resident,
863 3876 : )
864 : } else {
865 0 : (heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
866 : };
867 :
868 : // This object acts as a RAII guard on these metrics: increment on construction
869 3876 : timeline.metrics.inc_layer(&desc);
870 3876 :
871 3876 : // New layers are visible by default. This metric is later updated on drop or in set_visibility
872 3876 : timeline
873 3876 : .metrics
874 3876 : .visible_physical_size_gauge
875 3876 : .add(desc.file_size);
876 3876 :
877 3876 : LayerInner {
878 3876 : conf,
879 3876 : path: local_path,
880 3876 : desc,
881 3876 : timeline: Arc::downgrade(timeline),
882 3876 : access_stats: Default::default(),
883 3876 : wanted_deleted: AtomicBool::new(false),
884 3876 : inner,
885 3876 : version: AtomicUsize::new(version),
886 3876 : status: Some(tokio::sync::watch::channel(init_status).0),
887 3876 : consecutive_failures: AtomicUsize::new(0),
888 3876 : generation,
889 3876 : shard,
890 3876 : last_evicted_at: std::sync::Mutex::default(),
891 3876 : #[cfg(test)]
892 3876 : failpoints: Default::default(),
893 3876 : }
894 3876 : }
895 :
896 1040 : fn delete_on_drop(&self) {
897 1040 : let res =
898 1040 : self.wanted_deleted
899 1040 : .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
900 1040 :
901 1040 : if res.is_ok() {
902 1032 : LAYER_IMPL_METRICS.inc_started_deletes();
903 1032 : }
904 1040 : }
905 :
906 : /// Cancellation safe, however dropping the future and calling this method again might result
907 : /// in a new attempt to evict OR join the previously started attempt.
908 104 : #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
909 : pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
910 : let mut rx = self.status.as_ref().unwrap().subscribe();
911 :
912 : {
913 : let current = rx.borrow_and_update();
914 : match &*current {
915 : Status::Resident => {
916 : // we might get lucky and evict this; continue
917 : }
918 : Status::Evicted | Status::Downloading => {
919 : // it is already evicted
920 : return Err(EvictionError::NotFound);
921 : }
922 : }
923 : }
924 :
925 : let strong = {
926 : match self.inner.get() {
927 : Some(mut either) => either.downgrade(),
928 : None => {
929 : // we already have a scheduled eviction, which just has not gotten to run yet.
930 : // it might still race with a read access, but that could also get cancelled,
931 : // so let's say this is not evictable.
932 : return Err(EvictionError::NotFound);
933 : }
934 : }
935 : };
936 :
937 : if strong.is_some() {
938 : // drop the DownloadedLayer outside of the holding the guard
939 : drop(strong);
940 :
941 : // idea here is that only one evicter should ever get to witness a strong reference,
942 : // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
943 : // cancelled eviction and signal us, like it currently does.
944 : //
945 : // a second concurrent evict_and_wait will not see a strong reference.
946 : LAYER_IMPL_METRICS.inc_started_evictions();
947 : }
948 :
949 : let changed = rx.changed();
950 : let changed = tokio::time::timeout(timeout, changed).await;
951 :
952 : let Ok(changed) = changed else {
953 : return Err(EvictionError::Timeout);
954 : };
955 :
956 : let _: () = changed.expect("cannot be closed, because we are holding a strong reference");
957 :
958 : let current = rx.borrow_and_update();
959 :
960 : match &*current {
961 : // the easiest case
962 : Status::Evicted => Ok(()),
963 : // it surely was evicted in between, but then there was a new access now; we can't know
964 : // if it'll succeed so lets just call it evicted
965 : Status::Downloading => Ok(()),
966 : // either the download which was started after eviction completed already, or it was
967 : // never evicted
968 : Status::Resident => Err(EvictionError::Downloaded),
969 : }
970 : }
971 :
/// Returns the `DownloadedLayer` for this layer, initializing `self.inner` if needed.
///
/// Fast path: when `self.inner` already holds a reference that can be returned (possibly by
/// upgrading one a pending eviction had downgraded), it is returned without touching the
/// filesystem. Otherwise the init permit is taken and the local file is checked with
/// `needs_download`: if the file is fine the layer is re-initialized from disk, else a download
/// is started, but only when `allow_download` is set and `check_expected_download` accepts `ctx`.
///
/// Time spent waiting is accumulated in an `ElapsedAccum` and handed to
/// `ctx.ondemand_download_wait_observe` when the recorder guard is dropped.
///
972 : /// Cancellation safe.
973 481577 : async fn get_or_maybe_download(
974 481577 : self: &Arc<Self>,
975 481577 : allow_download: bool,
976 481577 : ctx: &RequestContext,
977 481577 : ) -> Result<Arc<DownloadedLayer>, DownloadError> {
978 481577 : let mut wait_for_download_recorder =
979 481577 : scopeguard::guard(utils::elapsed_accum::ElapsedAccum::default(), |accum| {
980 481577 : ctx.ondemand_download_wait_observe(accum.get());
981 481577 : });
982 48 : let (weak, permit) = {
983 : // get_or_init_detached can:
984 : // - be fast (mutex lock) OR uncontested semaphore permit acquire
985 : // - be slow (wait for semaphore permit or closing)
// the guard below counts a cancellation if this future is dropped while waiting; it is
// defused with `into_inner` right after the await completes
986 481577 : let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
987 :
988 481577 : let locked = self
989 481577 : .inner
990 481577 : .get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
991 481577 : .await
992 481577 : .map(|mut guard| guard.get_and_upgrade().ok_or(guard));
993 481577 :
994 481577 : scopeguard::ScopeGuard::into_inner(init_cancelled);
995 :
996 481529 : match locked {
997 : // this path could have been a RwLock::read
998 481529 : Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong),
999 0 : Ok(Ok((strong, _))) => {
1000 0 : // when upgraded back, the Arc<DownloadedLayer> is still available, but
1001 0 : // previously a `evict_and_wait` was received. this is the only place when we
1002 0 : // send out an update without holding the InitPermit.
1003 0 : //
1004 0 : // note that we also have dropped the Guard; this is fine, because we just made
1005 0 : // a state change and are holding a strong reference to be returned.
1006 0 : self.status.as_ref().unwrap().send_replace(Status::Resident);
1007 0 : LAYER_IMPL_METRICS
1008 0 : .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
1009 0 :
1010 0 : return Ok(strong);
1011 : }
1012 16 : Ok(Err(guard)) => {
1013 16 : // path to here: we won the eviction, the file should still be on the disk.
1014 16 : let (weak, permit) = guard.take_and_deinit();
1015 16 : (Some(weak), permit)
1016 : }
// the cell was uninitialized: we only got the permit to initialize it
1017 32 : Err(permit) => (None, permit),
1018 : }
1019 : };
1020 48 : let _guard = wait_for_download_recorder.guard();
1021 :
1022 48 : if let Some(weak) = weak {
1023 : // only drop the weak after dropping the heavier_once_cell guard
1024 16 : assert!(
1025 16 : matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
1026 0 : "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
1027 : );
1028 32 : }
1029 :
1030 48 : let timeline = self
1031 48 : .timeline
1032 48 : .upgrade()
1033 48 : .ok_or(DownloadError::TimelineShutdown)?;
1034 :
1035 : // count cancellations, which currently remain largely unexpected
1036 48 : let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
1037 :
1038 : // check if we really need to be downloaded: this can happen if a read access won the
1039 : // semaphore before eviction.
1040 : //
1041 : // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a
1042 : // pending eviction will try to evict even upon finding an uninitialized `self.inner`.
1043 48 : let needs_download = self
1044 48 : .needs_download()
1045 48 : .await
1046 48 : .map_err(DownloadError::PreStatFailed);
1047 48 :
// defuse the cancellation counter before propagating a stat error, so that an error is not
// counted as a cancellation
1048 48 : scopeguard::ScopeGuard::into_inner(init_cancelled);
1049 :
1050 48 : let needs_download = needs_download?;
1051 :
1052 48 : let Some(reason) = needs_download else {
1053 : // the file is present locally because eviction has not had a chance to run yet
1054 :
1055 : #[cfg(test)]
1056 16 : self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
1057 16 : .await?;
1058 :
1059 12 : LAYER_IMPL_METRICS.inc_init_needed_no_download();
1060 12 :
1061 12 : return Ok(self.initialize_after_layer_is_on_disk(permit));
1062 : };
1063 :
1064 : // we must download; getting cancelled before spawning the download is not an issue as
1065 : // any still running eviction would not find anything to evict.
1066 :
1067 32 : if let NeedsDownload::NotFile(ft) = reason {
1068 0 : return Err(DownloadError::NotFile(ft));
1069 32 : }
1070 32 :
1071 32 : self.check_expected_download(ctx)?;
1072 :
1073 32 : if !allow_download {
1074 : // this is only used from tests, but it is hard to test without the boolean
1075 4 : return Err(DownloadError::DownloadRequired);
1076 28 : }
1077 :
// with a perf span on the request, the download gets its own root perf span in a detached
// child context which the request's span is marked as following from; otherwise a plain
// attached child context is used
1078 28 : let ctx = if ctx.has_perf_span() {
1079 0 : let dl_ctx = RequestContextBuilder::from(ctx)
1080 0 : .task_kind(TaskKind::LayerDownload)
1081 0 : .download_behavior(DownloadBehavior::Download)
1082 0 : .root_perf_span(|| {
1083 0 : info_span!(
1084 0 : target: PERF_TRACE_TARGET,
1085 0 : "DOWNLOAD_LAYER",
1086 0 : layer = %self,
1087 0 : reason = %reason
1088 0 : )
1089 0 : })
1090 0 : .detached_child();
1091 0 : ctx.perf_follows_from(&dl_ctx);
1092 0 : dl_ctx
1093 : } else {
1094 28 : ctx.attached_child()
1095 : };
1096 :
1097 28 : async move {
1098 28 : tracing::info!(%reason, "downloading on-demand");
1099 :
// note: on a download error the `?` below returns with the guard still armed, so a failed
// download also increments the init-cancelled counter
1100 28 : let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
1101 28 : let res = self
1102 28 : .download_init_and_wait(timeline, permit, ctx.attached_child())
1103 28 : .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
1104 28 : .await?;
1105 :
1106 28 : scopeguard::ScopeGuard::into_inner(init_cancelled);
1107 28 : Ok(res)
1108 28 : }
1109 28 : .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
1110 28 : .await
1111 481577 : }
1112 :
1113 : /// Nag or fail per RequestContext policy
1114 32 : fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
1115 : use crate::context::DownloadBehavior::*;
1116 32 : let b = ctx.download_behavior();
1117 32 : match b {
1118 32 : Download => Ok(()),
1119 : Warn | Error => {
1120 0 : tracing::info!(
1121 0 : "unexpectedly on-demand downloading for task kind {:?}",
1122 0 : ctx.task_kind()
1123 : );
1124 0 : crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
1125 :
1126 0 : let really_error =
1127 0 : matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
1128 :
1129 0 : if really_error {
1130 : // this check is only probablistic, seems like flakyness footgun
1131 0 : Err(DownloadError::ContextAndConfigReallyDeniesDownloads)
1132 : } else {
1133 0 : Ok(())
1134 : }
1135 : }
1136 : }
1137 32 : }
1138 :
1139 : /// Actual download, at most one is executed at the time.
1140 28 : async fn download_init_and_wait(
1141 28 : self: &Arc<Self>,
1142 28 : timeline: Arc<Timeline>,
1143 28 : permit: heavier_once_cell::InitPermit,
1144 28 : ctx: RequestContext,
1145 28 : ) -> Result<Arc<DownloadedLayer>, DownloadError> {
1146 28 : debug_assert_current_span_has_tenant_and_timeline_id();
1147 28 :
1148 28 : let (tx, rx) = tokio::sync::oneshot::channel();
1149 28 :
1150 28 : let this: Arc<Self> = self.clone();
1151 :
1152 28 : let guard = timeline
1153 28 : .gate
1154 28 : .enter()
1155 28 : .map_err(|_| DownloadError::DownloadCancelled)?;
1156 :
1157 28 : Self::spawn(
1158 28 : async move {
1159 0 : let _guard = guard;
1160 0 :
1161 0 : // now that we have commited to downloading, send out an update to:
1162 0 : // - unhang any pending eviction
1163 0 : // - break out of evict_and_wait
1164 0 : this.status
1165 0 : .as_ref()
1166 0 : .unwrap()
1167 0 : .send_replace(Status::Downloading);
1168 28 :
1169 28 : #[cfg(test)]
1170 28 : this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading)
1171 28 : .await
1172 28 : .unwrap();
1173 :
1174 28 : let res = this.download_and_init(timeline, permit, &ctx).await;
1175 :
1176 28 : if let Err(res) = tx.send(res) {
1177 0 : match res {
1178 0 : Ok(_res) => {
1179 0 : tracing::debug!("layer initialized, but caller has been cancelled");
1180 0 : LAYER_IMPL_METRICS.inc_init_completed_without_requester();
1181 : }
1182 0 : Err(e) => {
1183 0 : tracing::info!(
1184 0 : "layer file download failed, and caller has been cancelled: {e:?}"
1185 : );
1186 0 : LAYER_IMPL_METRICS.inc_download_failed_without_requester();
1187 : }
1188 : }
1189 28 : }
1190 28 : }
1191 28 : .in_current_span(),
1192 28 : );
1193 28 :
1194 28 : match rx.await {
1195 28 : Ok(Ok(res)) => Ok(res),
1196 : Ok(Err(remote_storage::DownloadError::Cancelled)) => {
1197 0 : Err(DownloadError::DownloadCancelled)
1198 : }
1199 0 : Ok(Err(_)) => Err(DownloadError::DownloadFailed),
1200 0 : Err(_gone) => Err(DownloadError::DownloadCancelled),
1201 : }
1202 28 : }
1203 :
1204 28 : async fn download_and_init(
1205 28 : self: &Arc<LayerInner>,
1206 28 : timeline: Arc<Timeline>,
1207 28 : permit: heavier_once_cell::InitPermit,
1208 28 : ctx: &RequestContext,
1209 28 : ) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
1210 28 : let start = std::time::Instant::now();
1211 28 : let result = timeline
1212 28 : .remote_client
1213 28 : .download_layer_file(
1214 28 : &self.desc.layer_name(),
1215 28 : &self.metadata(),
1216 28 : &self.path,
1217 28 : &timeline.gate,
1218 28 : &timeline.cancel,
1219 28 : ctx,
1220 28 : )
1221 28 : .await;
1222 28 : let latency = start.elapsed();
1223 28 : let latency_millis = u64::try_from(latency.as_millis()).unwrap();
1224 28 : match result {
1225 28 : Ok(size) => {
1226 28 : assert_eq!(size, self.desc.file_size);
1227 :
1228 28 : match self.needs_download().await {
1229 0 : Ok(Some(reason)) => {
1230 0 : // this is really a bug in needs_download or remote timeline client
1231 0 : panic!("post-condition failed: needs_download returned {reason:?}");
1232 : }
1233 28 : Ok(None) => {
1234 28 : // as expected
1235 28 : }
1236 0 : Err(e) => {
1237 0 : panic!("post-condition failed: needs_download errored: {e:?}");
1238 : }
1239 : };
1240 28 : tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
1241 28 : timeline
1242 28 : .metrics
1243 28 : .resident_physical_size_add(self.desc.file_size);
1244 28 : self.consecutive_failures.store(0, Ordering::Relaxed);
1245 28 :
1246 28 : let since_last_eviction = self
1247 28 : .last_evicted_at
1248 28 : .lock()
1249 28 : .unwrap()
1250 28 : .take()
1251 28 : .map(|ts| ts.elapsed());
1252 28 : if let Some(since_last_eviction) = since_last_eviction {
1253 28 : LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
1254 28 : }
1255 :
1256 28 : self.access_stats.record_residence_event();
1257 28 :
1258 28 : Ok(self.initialize_after_layer_is_on_disk(permit))
1259 : }
1260 0 : Err(e) => {
1261 0 : let consecutive_failures =
1262 0 : 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
1263 0 :
1264 0 : if timeline.cancel.is_cancelled() {
1265 : // If we're shutting down, drop out before logging the error
1266 0 : return Err(e);
1267 0 : }
1268 0 :
1269 0 : tracing::error!(consecutive_failures, %latency_millis, "layer file download failed: {e:#}");
1270 :
1271 0 : let backoff = utils::backoff::exponential_backoff_duration_seconds(
1272 0 : consecutive_failures.min(u32::MAX as usize) as u32,
1273 0 : 1.5,
1274 0 : 60.0,
1275 0 : );
1276 0 :
1277 0 : let backoff = std::time::Duration::from_secs_f64(backoff);
1278 0 :
1279 0 : tokio::select! {
1280 0 : _ = tokio::time::sleep(backoff) => {},
1281 0 : _ = timeline.cancel.cancelled() => {},
1282 : };
1283 :
1284 0 : Err(e)
1285 : }
1286 : }
1287 28 : }
1288 :
1289 : /// Initializes the `Self::inner` to a "resident" state.
1290 : ///
1291 : /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download`
1292 : /// before calling this method.
1293 : ///
1294 : /// If this method is ever made async, it needs to be cancellation safe so that no state
1295 : /// changes are made before we can write to the OnceCell in non-cancellable fashion.
1296 40 : fn initialize_after_layer_is_on_disk(
1297 40 : self: &Arc<LayerInner>,
1298 40 : permit: heavier_once_cell::InitPermit,
1299 40 : ) -> Arc<DownloadedLayer> {
1300 40 : debug_assert_current_span_has_tenant_and_timeline_id();
1301 40 :
1302 40 : // disable any scheduled but not yet running eviction deletions for this initialization
1303 40 : let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
1304 40 : self.status.as_ref().unwrap().send_replace(Status::Resident);
1305 40 :
1306 40 : let res = Arc::new(DownloadedLayer {
1307 40 : owner: Arc::downgrade(self),
1308 40 : kind: tokio::sync::OnceCell::default(),
1309 40 : version: next_version,
1310 40 : });
1311 40 :
1312 40 : let waiters = self.inner.initializer_count();
1313 40 : if waiters > 0 {
1314 0 : tracing::info!(waiters, "completing layer init for other tasks");
1315 40 : }
1316 :
1317 40 : let value = ResidentOrWantedEvicted::Resident(res.clone());
1318 40 :
1319 40 : self.inner.set(value, permit);
1320 40 :
1321 40 : res
1322 40 : }
1323 :
1324 712 : async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
1325 712 : match tokio::fs::metadata(&self.path).await {
1326 680 : Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
1327 32 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
1328 0 : Err(e) => Err(e),
1329 : }
1330 712 : }
1331 :
1332 244 : fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
1333 244 : match self.path.metadata() {
1334 244 : Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
1335 0 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
1336 0 : Err(e) => Err(e),
1337 : }
1338 244 : }
1339 :
1340 924 : fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
1341 924 : // in future, this should include sha2-256 validation of the file.
1342 924 : if !m.is_file() {
1343 0 : Err(NeedsDownload::NotFile(m.file_type()))
1344 924 : } else if m.len() != self.desc.file_size {
1345 0 : Err(NeedsDownload::WrongSize {
1346 0 : actual: m.len(),
1347 0 : expected: self.desc.file_size,
1348 0 : })
1349 : } else {
1350 924 : Ok(())
1351 : }
1352 924 : }
1353 :
1354 0 : fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
1355 0 : let layer_name = self.desc.layer_name().to_string();
1356 0 :
1357 0 : let resident = self
1358 0 : .inner
1359 0 : .get()
1360 0 : .map(|rowe| rowe.is_likely_resident())
1361 0 : .unwrap_or(false);
1362 0 :
1363 0 : let access_stats = self.access_stats.as_api_model(reset);
1364 0 :
1365 0 : if self.desc.is_delta {
1366 0 : let lsn_range = &self.desc.lsn_range;
1367 0 :
1368 0 : HistoricLayerInfo::Delta {
1369 0 : layer_file_name: layer_name,
1370 0 : layer_file_size: self.desc.file_size,
1371 0 : lsn_start: lsn_range.start,
1372 0 : lsn_end: lsn_range.end,
1373 0 : remote: !resident,
1374 0 : access_stats,
1375 0 : l0: crate::tenant::layer_map::LayerMap::is_l0(
1376 0 : &self.layer_desc().key_range,
1377 0 : self.layer_desc().is_delta,
1378 0 : ),
1379 0 : }
1380 : } else {
1381 0 : let lsn = self.desc.image_layer_lsn();
1382 0 :
1383 0 : HistoricLayerInfo::Image {
1384 0 : layer_file_name: layer_name,
1385 0 : layer_file_size: self.desc.file_size,
1386 0 : lsn_start: lsn,
1387 0 : remote: !resident,
1388 0 : access_stats,
1389 0 : }
1390 : }
1391 0 : }
1392 :
1393 : /// `DownloadedLayer` is being dropped, so it calls this method.
1394 80 : fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
1395 : // we cannot know without inspecting LayerInner::inner if we should evict or not, even
1396 : // though here it is very likely
1397 80 : let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
1398 :
1399 : // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
1400 : // drop while the `self.inner` is being locked, leading to a deadlock.
1401 :
1402 80 : let start_evicting = async move {
1403 80 : #[cfg(test)]
1404 80 : self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting)
1405 80 : .await
1406 80 : .expect("failpoint should not have errored");
1407 80 :
1408 80 : tracing::debug!("eviction started");
1409 :
1410 80 : let res = self.wait_for_turn_and_evict(only_version).await;
1411 : // metrics: ignore the Ok branch, it is not done yet
1412 80 : if let Err(e) = res {
1413 12 : tracing::debug!(res=?Err::<(), _>(&e), "eviction completed");
1414 12 : LAYER_IMPL_METRICS.inc_eviction_cancelled(e);
1415 68 : }
1416 80 : };
1417 :
1418 80 : Self::spawn(start_evicting.instrument(span));
1419 80 : }
1420 :
/// Waits for this eviction attempt's turn and, if it is still valid, hands the file removal
/// over to a blocking task.
///
/// The attempt is abandoned with an `EvictionCancelled` reason when the timeline is gone or
/// its gate is closed, when the status is no longer `Resident` at any of the checks, or when
/// `self.inner` holds something other than the `WantedEvicted` of version `only_version`.
///
/// `Ok(())` only means that the blocking eviction was spawned; its completion is recorded in
/// metrics by the spawned closure.
1421 80 : async fn wait_for_turn_and_evict(
1422 80 : self: Arc<LayerInner>,
1423 80 : only_version: usize,
1424 80 : ) -> Result<(), EvictionCancelled> {
// eviction may only proceed while the status is still `Resident`
1425 156 : fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> {
1426 : use Status::*;
1427 156 : match status {
1428 152 : Resident => Ok(()),
1429 4 : Evicted => Err(EvictionCancelled::UnexpectedEvictedState),
1430 0 : Downloading => Err(EvictionCancelled::LostToDownload),
1431 : }
1432 156 : }
1433 :
1434 80 : let timeline = self
1435 80 : .timeline
1436 80 : .upgrade()
1437 80 : .ok_or(EvictionCancelled::TimelineGone)?;
1438 :
1439 80 : let mut rx = self
1440 80 : .status
1441 80 : .as_ref()
1442 80 : .expect("LayerInner cannot be dropped, holding strong ref")
1443 80 : .subscribe();
1444 80 :
1445 80 : is_good_to_continue(&rx.borrow_and_update())?;
1446 :
// the gate guard is moved into the blocking closure below and passed to `evict_blocking`
1447 76 : let Ok(gate) = timeline.gate.enter() else {
1448 0 : return Err(EvictionCancelled::TimelineGone);
1449 : };
1450 :
1451 68 : let permit = {
1452 : // we cannot just `std::fs::remove_file` because there might already be an
1453 : // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem
1454 : // operations must be done while holding the heavier_once_cell::InitPermit
1455 76 : let mut wait = std::pin::pin!(self.inner.get_or_init_detached());
1456 :
1457 76 : let waited = loop {
1458 : // we must race to the Downloading starting, otherwise we would have to wait until the
1459 : // completion of the download. waiting for download could be long and hinder our
1460 : // efforts to alert on "hanging" evictions.
1461 76 : tokio::select! {
1462 76 : res = &mut wait => break res,
1463 76 : _ = rx.changed() => {
1464 0 : is_good_to_continue(&rx.borrow_and_update())?;
1465 : // two possibilities for Status::Resident:
1466 : // - the layer was found locally from disk by a read
1467 : // - we missed a bunch of updates and now the layer is
1468 : // again downloaded -- assume we'll fail later on with
1469 : // version check or AlreadyReinitialized
1470 : }
1471 : }
1472 : };
1473 :
1474 : // re-check now that we have the guard or permit; all updates should have happened
1475 : // while holding the permit.
1476 76 : is_good_to_continue(&rx.borrow_and_update())?;
1477 :
1478 : // the term deinitialize is used here, because clearing out the Weak will eventually
1479 : // lead to deallocating the reference counted value, and the value we
1480 : // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned.
1481 76 : let (_weak, permit) = match waited {
1482 72 : Ok(guard) => {
1483 72 : match &*guard {
1484 68 : ResidentOrWantedEvicted::WantedEvicted(_weak, version)
1485 68 : if *version == only_version =>
1486 64 : {
1487 64 : tracing::debug!(version, "deinitializing matching WantedEvicted");
1488 64 : let (weak, permit) = guard.take_and_deinit();
1489 64 : (Some(weak), permit)
1490 : }
1491 4 : ResidentOrWantedEvicted::WantedEvicted(_, version) => {
1492 4 : // if we were not doing the version check, we would need to try to
1493 4 : // upgrade the weak here to see if it really is dropped. version check
1494 4 : // is done instead assuming that it is cheaper.
1495 4 : tracing::debug!(
1496 : version,
1497 : only_version,
1498 0 : "version mismatch, not deinitializing"
1499 : );
1500 4 : return Err(EvictionCancelled::VersionCheckFailed);
1501 : }
1502 : ResidentOrWantedEvicted::Resident(_) => {
1503 4 : return Err(EvictionCancelled::AlreadyReinitialized);
1504 : }
1505 : }
1506 : }
// the cell was uninitialized: there is nothing to deinitialize, only the permit to carry on
1507 4 : Err(permit) => {
1508 4 : tracing::debug!("continuing after cancelled get_or_maybe_download or eviction");
1509 4 : (None, permit)
1510 : }
1511 : };
1512 :
1513 68 : permit
1514 68 : };
1515 68 :
// captured so that the blocking closure can enter the same span
1516 68 : let span = tracing::Span::current();
1517 68 :
1518 68 : let spawned_at = std::time::Instant::now();
1519 68 :
1520 68 : // this is on purpose a detached spawn; we don't need to wait for it
1521 68 : //
1522 68 : // eviction completion reporting is the only thing hinging on this, and it can be just as
1523 68 : // well from a spawn_blocking thread.
1524 68 : //
1525 68 : // important to note that now that we've acquired the permit we have made sure the evicted
1526 68 : // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case
1527 68 : // there are multiple evictions. The rest is not cancellable, and we've now committed to
1528 68 : // evicting.
1529 68 : //
1530 68 : // If spawn_blocking has a queue and maximum number of threads are in use, we could stall
1531 68 : // reads. We will need to add cancellation for that if necessary.
1532 68 : Self::spawn_blocking(move || {
1533 68 : let _span = span.entered();
1534 68 :
1535 68 : let res = self.evict_blocking(&timeline, &gate, &permit);
1536 68 :
1537 68 : let waiters = self.inner.initializer_count();
1538 68 :
1539 68 : if waiters > 0 {
1540 0 : LAYER_IMPL_METRICS.inc_evicted_with_waiters();
1541 68 : }
1542 :
1543 68 : let completed_in = spawned_at.elapsed();
1544 68 : LAYER_IMPL_METRICS.record_time_to_evict(completed_in);
1545 68 :
1546 68 : match res {
1547 68 : Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
1548 0 : Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
1549 : }
1550 :
1551 68 : tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
1552 68 : });
1553 68 :
1554 68 : Ok(())
1555 80 : }
1556 :
1557 : /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
1558 68 : fn evict_blocking(
1559 68 : &self,
1560 68 : timeline: &Timeline,
1561 68 : _gate: &gate::GateGuard,
1562 68 : _permit: &heavier_once_cell::InitPermit,
1563 68 : ) -> Result<(), EvictionCancelled> {
1564 68 : // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
1565 68 :
1566 68 : match capture_mtime_and_remove(&self.path) {
1567 68 : Ok(local_layer_mtime) => {
1568 68 : let duration = SystemTime::now().duration_since(local_layer_mtime);
1569 68 : match duration {
1570 68 : Ok(elapsed) => {
1571 68 : let accessed_and_visible = self.access_stats.accessed()
1572 8 : && self.access_stats.visibility() == LayerVisibilityHint::Visible;
1573 68 : if accessed_and_visible {
1574 8 : // Only layers used for reads contribute to our "low residence" metric that is used
1575 8 : // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
1576 8 : // to be rapidly evicted without contributing to this metric.
1577 8 : timeline
1578 8 : .metrics
1579 8 : .evictions_with_low_residence_duration
1580 8 : .read()
1581 8 : .unwrap()
1582 8 : .observe(elapsed);
1583 60 : }
1584 :
1585 68 : tracing::info!(
1586 0 : residence_millis = elapsed.as_millis(),
1587 0 : accessed_and_visible,
1588 0 : "evicted layer after known residence period"
1589 : );
1590 : }
1591 : Err(_) => {
1592 0 : tracing::info!("evicted layer after unknown residence period");
1593 : }
1594 : }
1595 68 : timeline.metrics.evictions.inc();
1596 68 : timeline
1597 68 : .metrics
1598 68 : .resident_physical_size_sub(self.desc.file_size);
1599 : }
1600 0 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
1601 0 : tracing::error!(
1602 : layer_size = %self.desc.file_size,
1603 0 : "failed to evict layer from disk, it was already gone"
1604 : );
1605 0 : return Err(EvictionCancelled::FileNotFound);
1606 : }
1607 0 : Err(e) => {
1608 0 : // FIXME: this should probably be an abort
1609 0 : tracing::error!("failed to evict file from disk: {e:#}");
1610 0 : return Err(EvictionCancelled::RemoveFailed);
1611 : }
1612 : }
1613 :
1614 68 : self.access_stats.record_residence_event();
1615 68 :
1616 68 : *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
1617 68 :
1618 68 : self.status.as_ref().unwrap().send_replace(Status::Evicted);
1619 68 :
1620 68 : Ok(())
1621 68 : }
1622 :
1623 6336 : fn metadata(&self) -> LayerFileMetadata {
1624 6336 : LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
1625 6336 : }
1626 :
1627 : /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
1628 : ///
1629 : /// Synchronizing with spawned tasks is very complicated otherwise.
1630 108 : fn spawn<F>(fut: F)
1631 108 : where
1632 108 : F: std::future::Future<Output = ()> + Send + 'static,
1633 108 : {
1634 108 : #[cfg(test)]
1635 108 : tokio::task::spawn(fut);
1636 108 : #[cfg(not(test))]
1637 108 : crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut);
1638 108 : }
1639 :
1640 : /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
1641 1088 : fn spawn_blocking<F>(f: F)
1642 1088 : where
1643 1088 : F: FnOnce() + Send + 'static,
1644 1088 : {
1645 1088 : #[cfg(test)]
1646 1088 : tokio::task::spawn_blocking(f);
1647 1088 : #[cfg(not(test))]
1648 1088 : crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f);
1649 1088 : }
1650 : }
1651 :
1652 68 : fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
1653 68 : let m = path.metadata()?;
1654 68 : let local_layer_mtime = m.modified()?;
1655 68 : std::fs::remove_file(path)?;
1656 68 : Ok(local_layer_mtime)
1657 68 : }
1658 :
1659 : #[derive(Debug, thiserror::Error)]
1660 : pub(crate) enum EvictionError {
1661 : #[error("layer was already evicted")]
1662 : NotFound,
1663 :
1664 : /// Evictions must always lose to downloads in races, and this time it happened.
1665 : #[error("layer was downloaded instead")]
1666 : Downloaded,
1667 :
1668 : #[error("eviction did not happen within timeout")]
1669 : Timeout,
1670 : }
1671 :
/// Error internal to the [`LayerInner::get_or_maybe_download`]
#[derive(Debug, thiserror::Error)]
pub(crate) enum DownloadError {
    /// The timeline has already shut down.
    #[error("timeline has already shutdown")]
    TimelineShutdown,
    /// Downloading is denied by the context (and, going by the variant name, the config).
    #[error("context denies downloading")]
    ContextAndConfigReallyDeniesDownloads,
    /// A download would be needed, but the calling method does not allow one.
    #[error("downloading is really required but not allowed by this method")]
    DownloadRequired,
    /// Something exists at the layer path, but it is not a regular file.
    #[error("layer path exists, but it is not a file: {0:?}")]
    NotFile(std::fs::FileType),
    /// Why no error here? Because it will be reported by page_service. We should have also
    /// done retries already.
    #[error("downloading evicted layer file failed")]
    DownloadFailed,
    /// The download did not complete, possibly because of shutdown.
    #[error("downloading failed, possibly for shutdown")]
    DownloadCancelled,
    /// The stat performed as a pre-condition before downloading failed.
    #[error("pre-condition: stat before download failed")]
    PreStatFailed(#[source] std::io::Error),

    /// Test-only failure injected through a failpoint.
    #[cfg(test)]
    #[error("failpoint: {0:?}")]
    Failpoint(failpoints::FailpointKind),
}
1696 :
1697 : impl DownloadError {
1698 0 : pub(crate) fn is_cancelled(&self) -> bool {
1699 0 : matches!(self, DownloadError::DownloadCancelled)
1700 0 : }
1701 : }
1702 :
/// Why the local layer file cannot be used as-is.
#[derive(Debug, PartialEq)]
pub(crate) enum NeedsDownload {
    /// No file was found.
    NotFound,
    /// The path exists but is not a regular file; carries the file type that was found.
    NotFile(std::fs::FileType),
    /// The file exists but its size differs from the expected one.
    WrongSize { actual: u64, expected: u64 },
}

impl std::fmt::Display for NeedsDownload {
    /// Writes a short human-readable description of the reason.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use NeedsDownload::{NotFile, NotFound, WrongSize};

        match self {
            NotFound => f.write_str("file was not found"),
            NotFile(file_type) => write!(f, "path is not a file; {file_type:?}"),
            WrongSize { actual, expected } => {
                write!(f, "file size mismatch {actual} vs. {expected}")
            }
        }
    }
}
1721 :
/// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
pub(crate) struct DownloadedLayer {
    /// Weak back-reference to the owning `LayerInner`; upgraded on drop to notify the owner.
    owner: Weak<LayerInner>,
    // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
    // DownloadedLayer
    /// Lazily loaded layer contents. The load *result* is stored, so a failure is remembered.
    kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    /// Passed to `LayerInner::on_downloaded_layer_drop` when this value is dropped.
    version: usize,
}
1730 :
1731 : impl std::fmt::Debug for DownloadedLayer {
1732 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1733 0 : f.debug_struct("DownloadedLayer")
1734 0 : // owner omitted because it is always "Weak"
1735 0 : .field("kind", &self.kind)
1736 0 : .field("version", &self.version)
1737 0 : .finish()
1738 0 : }
1739 : }
1740 :
1741 : impl Drop for DownloadedLayer {
1742 1488 : fn drop(&mut self) {
1743 1488 : if let Some(owner) = self.owner.upgrade() {
1744 80 : owner.on_downloaded_layer_drop(self.version);
1745 1408 : } else {
1746 1408 : // Layer::drop will handle cancelling the eviction; because of drop order and
1747 1408 : // `DownloadedLayer` never leaking, we cannot know here if eviction was requested.
1748 1408 : }
1749 1488 : }
1750 : }
1751 :
1752 : impl DownloadedLayer {
1753 : /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
1754 : /// Failure to load the layer is sticky, i.e., future `get()` calls will return
1755 : /// the initial load failure immediately.
1756 : ///
1757 : /// `owner` parameter is a strong reference at the same `LayerInner` as the
1758 : /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
1759 : /// we will always have the LayerInner on the callstack, so we can just use it.
1760 482485 : async fn get<'a>(
1761 482485 : &'a self,
1762 482485 : owner: &Arc<LayerInner>,
1763 482485 : ctx: &RequestContext,
1764 482485 : ) -> anyhow::Result<&'a LayerKind> {
1765 482485 : let init = || async {
1766 2480 : assert_eq!(
1767 2480 : Weak::as_ptr(&self.owner),
1768 2480 : Arc::as_ptr(owner),
1769 0 : "these are the same, just avoiding the upgrade"
1770 : );
1771 :
1772 2480 : let res = if owner.desc.is_delta {
1773 2192 : let ctx = RequestContextBuilder::from(ctx)
1774 2192 : .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary)
1775 2192 : .attached_child();
1776 2192 : let summary = Some(delta_layer::Summary::expected(
1777 2192 : owner.desc.tenant_shard_id.tenant_id,
1778 2192 : owner.desc.timeline_id,
1779 2192 : owner.desc.key_range.clone(),
1780 2192 : owner.desc.lsn_range.clone(),
1781 2192 : ));
1782 2192 : delta_layer::DeltaLayerInner::load(
1783 2192 : &owner.path,
1784 2192 : summary,
1785 2192 : Some(owner.conf.max_vectored_read_bytes),
1786 2192 : &ctx,
1787 2192 : )
1788 2192 : .await
1789 2192 : .map(LayerKind::Delta)
1790 : } else {
1791 288 : let ctx = RequestContextBuilder::from(ctx)
1792 288 : .page_content_kind(crate::context::PageContentKind::ImageLayerSummary)
1793 288 : .attached_child();
1794 288 : let lsn = owner.desc.image_layer_lsn();
1795 288 : let summary = Some(image_layer::Summary::expected(
1796 288 : owner.desc.tenant_shard_id.tenant_id,
1797 288 : owner.desc.timeline_id,
1798 288 : owner.desc.key_range.clone(),
1799 288 : lsn,
1800 288 : ));
1801 288 : image_layer::ImageLayerInner::load(
1802 288 : &owner.path,
1803 288 : lsn,
1804 288 : summary,
1805 288 : Some(owner.conf.max_vectored_read_bytes),
1806 288 : &ctx,
1807 288 : )
1808 288 : .await
1809 288 : .map(LayerKind::Image)
1810 : };
1811 :
1812 2480 : match res {
1813 2480 : Ok(layer) => Ok(layer),
1814 0 : Err(err) => {
1815 0 : LAYER_IMPL_METRICS.inc_permanent_loading_failures();
1816 0 : // We log this message once over the lifetime of `Self`
1817 0 : // => Ok and good to log backtrace and path here.
1818 0 : tracing::error!(
1819 0 : "layer load failed, assuming permanent failure: {}: {err:?}",
1820 0 : owner.path
1821 : );
1822 0 : Err(err)
1823 : }
1824 : }
1825 4960 : };
1826 482485 : self.kind
1827 482485 : .get_or_init(init)
1828 482485 : .await
1829 482485 : .as_ref()
1830 482485 : // We already logged the full backtrace above, once. Don't repeat that here.
1831 482485 : .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
1832 482485 : }
1833 :
1834 480385 : async fn get_values_reconstruct_data(
1835 480385 : &self,
1836 480385 : this: ResidentLayer,
1837 480385 : keyspace: KeySpace,
1838 480385 : lsn_range: Range<Lsn>,
1839 480385 : reconstruct_data: &mut ValuesReconstructState,
1840 480385 : ctx: &RequestContext,
1841 480385 : ) -> Result<(), GetVectoredError> {
1842 : use LayerKind::*;
1843 :
1844 480385 : match self
1845 480385 : .get(&this.owner.0, ctx)
1846 480385 : .await
1847 480385 : .map_err(GetVectoredError::Other)?
1848 : {
1849 435233 : Delta(d) => {
1850 435233 : d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
1851 435233 : .await
1852 : }
1853 45152 : Image(i) => {
1854 45152 : i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx)
1855 45152 : .await
1856 : }
1857 : }
1858 480385 : }
1859 :
1860 8 : async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
1861 : use LayerKind::*;
1862 8 : match self.get(owner, ctx).await? {
1863 8 : Delta(d) => d.dump(ctx).await?,
1864 0 : Image(i) => i.dump(ctx).await?,
1865 : }
1866 :
1867 8 : Ok(())
1868 8 : }
1869 : }
1870 :
/// Wrapper around an actual layer implementation.
#[derive(Debug)]
enum LayerKind {
    /// A loaded delta layer.
    Delta(delta_layer::DeltaLayerInner),
    /// A loaded image layer.
    Image(image_layer::ImageLayerInner),
}
1877 :
/// Guard for forcing a layer be resident while it exists.
#[derive(Clone)]
pub struct ResidentLayer {
    /// The layer this guard belongs to; returned when the guard is converted back to a
    /// [`Layer`].
    owner: Layer,
    /// Strong reference to the downloaded state.
    // NOTE(review): presumably holding this `Arc` is what prevents eviction — confirm against
    // `LayerInner`.
    downloaded: Arc<DownloadedLayer>,
}
1884 :
1885 : impl std::fmt::Display for ResidentLayer {
1886 4304 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1887 4304 : write!(f, "{}", self.owner)
1888 4304 : }
1889 : }
1890 :
1891 : impl std::fmt::Debug for ResidentLayer {
1892 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1893 0 : write!(f, "{}", self.owner)
1894 0 : }
1895 : }
1896 :
1897 : impl ResidentLayer {
1898 : /// Release the eviction guard, converting back into a plain [`Layer`].
1899 : ///
1900 : /// You can access the [`Layer`] also by using `as_ref`.
1901 840 : pub(crate) fn drop_eviction_guard(self) -> Layer {
1902 840 : self.into()
1903 840 : }
1904 :
1905 : /// Loads all keys stored in the layer. Returns key, lsn and value size.
1906 0 : #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
1907 : pub(crate) async fn load_keys<'a>(
1908 : &'a self,
1909 : ctx: &RequestContext,
1910 : ) -> anyhow::Result<Vec<pageserver_api::key::Key>> {
1911 : use LayerKind::*;
1912 :
1913 : let owner = &self.owner.0;
1914 : let inner = self.downloaded.get(owner, ctx).await?;
1915 :
1916 : // this is valid because the DownloadedLayer::kind is a OnceCell, not a
1917 : // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
1918 : // while it's being held.
1919 : self.owner.record_access(ctx);
1920 :
1921 : let res = match inner {
1922 : Delta(d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
1923 : Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
1924 : };
1925 0 : res.with_context(|| format!("Layer index is corrupted for {self}"))
1926 : }
1927 :
1928 : /// Read all they keys in this layer which match the ShardIdentity, and write them all to
1929 : /// the provided writer. Return the number of keys written.
1930 16 : #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
1931 : pub(crate) async fn filter(
1932 : &self,
1933 : shard_identity: &ShardIdentity,
1934 : writer: &mut ImageLayerWriter,
1935 : ctx: &RequestContext,
1936 : ) -> Result<usize, CompactionError> {
1937 : use LayerKind::*;
1938 :
1939 : match self
1940 : .downloaded
1941 : .get(&self.owner.0, ctx)
1942 : .await
1943 : .map_err(CompactionError::Other)?
1944 : {
1945 : Delta(_) => {
1946 : return Err(CompactionError::Other(anyhow::anyhow!(format!(
1947 : "cannot filter() on a delta layer {self}"
1948 : ))));
1949 : }
1950 : Image(i) => i
1951 : .filter(shard_identity, writer, ctx)
1952 : .await
1953 : .map_err(CompactionError::Other),
1954 : }
1955 : }
1956 :
1957 : /// Returns the amount of keys and values written to the writer.
1958 20 : pub(crate) async fn copy_delta_prefix(
1959 20 : &self,
1960 20 : writer: &mut super::delta_layer::DeltaLayerWriter,
1961 20 : until: Lsn,
1962 20 : ctx: &RequestContext,
1963 20 : ) -> anyhow::Result<usize> {
1964 : use LayerKind::*;
1965 :
1966 20 : let owner = &self.owner.0;
1967 20 :
1968 20 : match self.downloaded.get(owner, ctx).await? {
1969 20 : Delta(d) => d
1970 20 : .copy_prefix(writer, until, ctx)
1971 20 : .await
1972 20 : .with_context(|| format!("copy_delta_prefix until {until} of {self}")),
1973 0 : Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
1974 : }
1975 20 : }
1976 :
1977 3699 : pub(crate) fn local_path(&self) -> &Utf8Path {
1978 3699 : &self.owner.0.path
1979 3699 : }
1980 :
1981 4424 : pub(crate) fn metadata(&self) -> LayerFileMetadata {
1982 4424 : self.owner.metadata()
1983 4424 : }
1984 :
1985 : /// Cast the layer to a delta, return an error if it is an image layer.
1986 1912 : pub(crate) async fn get_as_delta(
1987 1912 : &self,
1988 1912 : ctx: &RequestContext,
1989 1912 : ) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
1990 : use LayerKind::*;
1991 1912 : match self.downloaded.get(&self.owner.0, ctx).await? {
1992 1912 : Delta(d) => Ok(d),
1993 0 : Image(_) => Err(anyhow::anyhow!("image layer")),
1994 : }
1995 1912 : }
1996 :
1997 : /// Cast the layer to an image, return an error if it is a delta layer.
1998 144 : pub(crate) async fn get_as_image(
1999 144 : &self,
2000 144 : ctx: &RequestContext,
2001 144 : ) -> anyhow::Result<&image_layer::ImageLayerInner> {
2002 : use LayerKind::*;
2003 144 : match self.downloaded.get(&self.owner.0, ctx).await? {
2004 144 : Image(d) => Ok(d),
2005 0 : Delta(_) => Err(anyhow::anyhow!("delta layer")),
2006 : }
2007 144 : }
2008 : }
2009 :
impl AsLayerDesc for ResidentLayer {
    /// Returns the descriptor of the owning [`Layer`].
    fn layer_desc(&self) -> &PersistentLayerDesc {
        self.owner.layer_desc()
    }
}
2015 :
2016 : impl AsRef<Layer> for ResidentLayer {
2017 4100 : fn as_ref(&self) -> &Layer {
2018 4100 : &self.owner
2019 4100 : }
2020 : }
2021 :
2022 : /// Drop the eviction guard.
2023 : impl From<ResidentLayer> for Layer {
2024 840 : fn from(value: ResidentLayer) -> Self {
2025 840 : value.owner
2026 840 : }
2027 : }
2028 :
2029 : use metrics::IntCounter;
2030 :
/// Metrics of the `Layer` implementation, registered by the `Default` impl.
pub(crate) struct LayerImplMetrics {
    /// `pageserver_layer_started_evictions`
    started_evictions: IntCounter,
    /// `pageserver_layer_completed_evictions`
    completed_evictions: IntCounter,
    /// `pageserver_layer_cancelled_evictions_count`, one counter per `reason` label.
    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,

    /// `pageserver_layer_started_deletes`
    started_deletes: IntCounter,
    /// `pageserver_layer_completed_deletes`
    completed_deletes: IntCounter,
    /// `pageserver_layer_failed_deletes_count`, one counter per `reason` label.
    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,

    /// `pageserver_layer_assumed_rare_count`, one counter per `event` label.
    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
    /// `pageserver_layer_inits_cancelled_count`
    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
    /// `pageserver_layer_redownloaded_after`, observed in seconds.
    redownload_after: metrics::Histogram,
    /// `pageserver_layer_eviction_held_permit_seconds`, observed in seconds.
    time_to_evict: metrics::Histogram,
}
2045 :
2046 : impl Default for LayerImplMetrics {
2047 108 : fn default() -> Self {
2048 : use enum_map::Enum;
2049 :
2050 : // reminder: these will be pageserver_layer_* with "_total" suffix
2051 :
2052 108 : let started_evictions = metrics::register_int_counter!(
2053 108 : "pageserver_layer_started_evictions",
2054 108 : "Evictions started in the Layer implementation"
2055 108 : )
2056 108 : .unwrap();
2057 108 : let completed_evictions = metrics::register_int_counter!(
2058 108 : "pageserver_layer_completed_evictions",
2059 108 : "Evictions completed in the Layer implementation"
2060 108 : )
2061 108 : .unwrap();
2062 108 :
2063 108 : let cancelled_evictions = metrics::register_int_counter_vec!(
2064 108 : "pageserver_layer_cancelled_evictions_count",
2065 108 : "Different reasons for evictions to have been cancelled or failed",
2066 108 : &["reason"]
2067 108 : )
2068 108 : .unwrap();
2069 108 :
2070 972 : let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
2071 972 : let reason = EvictionCancelled::from_usize(i);
2072 972 : let s = reason.as_str();
2073 972 : cancelled_evictions.with_label_values(&[s])
2074 972 : }));
2075 108 :
2076 108 : let started_deletes = metrics::register_int_counter!(
2077 108 : "pageserver_layer_started_deletes",
2078 108 : "Deletions on drop pending in the Layer implementation"
2079 108 : )
2080 108 : .unwrap();
2081 108 : let completed_deletes = metrics::register_int_counter!(
2082 108 : "pageserver_layer_completed_deletes",
2083 108 : "Deletions on drop completed in the Layer implementation"
2084 108 : )
2085 108 : .unwrap();
2086 108 :
2087 108 : let failed_deletes = metrics::register_int_counter_vec!(
2088 108 : "pageserver_layer_failed_deletes_count",
2089 108 : "Different reasons for deletions on drop to have failed",
2090 108 : &["reason"]
2091 108 : )
2092 108 : .unwrap();
2093 108 :
2094 216 : let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
2095 216 : let reason = DeleteFailed::from_usize(i);
2096 216 : let s = reason.as_str();
2097 216 : failed_deletes.with_label_values(&[s])
2098 216 : }));
2099 108 :
2100 108 : let rare_counters = metrics::register_int_counter_vec!(
2101 108 : "pageserver_layer_assumed_rare_count",
2102 108 : "Times unexpected or assumed rare event happened",
2103 108 : &["event"]
2104 108 : )
2105 108 : .unwrap();
2106 108 :
2107 756 : let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
2108 756 : let event = RareEvent::from_usize(i);
2109 756 : let s = event.as_str();
2110 756 : rare_counters.with_label_values(&[s])
2111 756 : }));
2112 108 :
2113 108 : let inits_cancelled = metrics::register_int_counter!(
2114 108 : "pageserver_layer_inits_cancelled_count",
2115 108 : "Times Layer initialization was cancelled",
2116 108 : )
2117 108 : .unwrap();
2118 108 :
2119 108 : let redownload_after = {
2120 108 : let minute = 60.0;
2121 108 : let hour = 60.0 * minute;
2122 108 : metrics::register_histogram!(
2123 108 : "pageserver_layer_redownloaded_after",
2124 108 : "Time between evicting and re-downloading.",
2125 108 : vec![
2126 108 : 10.0,
2127 108 : 30.0,
2128 108 : minute,
2129 108 : 5.0 * minute,
2130 108 : 15.0 * minute,
2131 108 : 30.0 * minute,
2132 108 : hour,
2133 108 : 12.0 * hour,
2134 108 : ]
2135 108 : )
2136 108 : .unwrap()
2137 108 : };
2138 108 :
2139 108 : let time_to_evict = metrics::register_histogram!(
2140 108 : "pageserver_layer_eviction_held_permit_seconds",
2141 108 : "Time eviction held the permit.",
2142 108 : vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
2143 108 : )
2144 108 : .unwrap();
2145 108 :
2146 108 : Self {
2147 108 : started_evictions,
2148 108 : completed_evictions,
2149 108 : cancelled_evictions,
2150 108 :
2151 108 : started_deletes,
2152 108 : completed_deletes,
2153 108 : failed_deletes,
2154 108 :
2155 108 : rare_counters,
2156 108 : inits_cancelled,
2157 108 : redownload_after,
2158 108 : time_to_evict,
2159 108 : }
2160 108 : }
2161 : }
2162 :
2163 : impl LayerImplMetrics {
2164 84 : fn inc_started_evictions(&self) {
2165 84 : self.started_evictions.inc();
2166 84 : }
2167 68 : fn inc_completed_evictions(&self) {
2168 68 : self.completed_evictions.inc();
2169 68 : }
2170 16 : fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
2171 16 : self.cancelled_evictions[reason].inc()
2172 16 : }
2173 :
2174 1032 : fn inc_started_deletes(&self) {
2175 1032 : self.started_deletes.inc();
2176 1032 : }
2177 1016 : fn inc_completed_deletes(&self) {
2178 1016 : self.completed_deletes.inc();
2179 1016 : }
2180 0 : fn inc_deletes_failed(&self, reason: DeleteFailed) {
2181 0 : self.failed_deletes[reason].inc();
2182 0 : }
2183 :
2184 : /// Counted separatedly from failed layer deletes because we will complete the layer deletion
2185 : /// attempt regardless of failure to delete local file.
2186 0 : fn inc_delete_removes_failed(&self) {
2187 0 : self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
2188 0 : }
2189 :
2190 : /// Expected rare just as cancellations are rare, but we could have cancellations separate from
2191 : /// the single caller which can start the download, so use this counter to separte them.
2192 0 : fn inc_init_completed_without_requester(&self) {
2193 0 : self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
2194 0 : }
2195 :
2196 : /// Expected rare because cancellations are unexpected, and failures are unexpected
2197 0 : fn inc_download_failed_without_requester(&self) {
2198 0 : self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
2199 0 : }
2200 :
2201 : /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
2202 : ///
2203 : /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
2204 : /// Option.
2205 0 : fn inc_raced_wanted_evicted_accesses(&self) {
2206 0 : self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
2207 0 : }
2208 :
2209 : /// These are only expected for [`Self::inc_init_cancelled`] amount when
2210 : /// running with remote storage.
2211 12 : fn inc_init_needed_no_download(&self) {
2212 12 : self.rare_counters[RareEvent::InitWithoutDownload].inc();
2213 12 : }
2214 :
2215 : /// Expected rare because all layer files should be readable and good
2216 0 : fn inc_permanent_loading_failures(&self) {
2217 0 : self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
2218 0 : }
2219 :
2220 0 : fn inc_init_cancelled(&self) {
2221 0 : self.inits_cancelled.inc()
2222 0 : }
2223 :
2224 28 : fn record_redownloaded_after(&self, duration: std::time::Duration) {
2225 28 : self.redownload_after.observe(duration.as_secs_f64())
2226 28 : }
2227 :
2228 : /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
2229 : /// instead cancel eviction if we would have read waiters. We cannot however separate reads
2230 : /// from other evictions, so this could have noise as well.
2231 0 : fn inc_evicted_with_waiters(&self) {
2232 0 : self.rare_counters[RareEvent::EvictedWithWaiters].inc();
2233 0 : }
2234 :
2235 : /// Recorded at least initially as the permit is now acquired in async context before
2236 : /// spawn_blocking action.
2237 68 : fn record_time_to_evict(&self, duration: std::time::Duration) {
2238 68 : self.time_to_evict.observe(duration.as_secs_f64())
2239 68 : }
2240 : }
2241 :
/// Reasons for an eviction to have been cancelled or failed.
///
/// Each variant is one value of the `reason` label of
/// `pageserver_layer_cancelled_evictions_count`; see `as_str` for the label text.
#[derive(Debug, Clone, Copy, enum_map::Enum)]
enum EvictionCancelled {
    LayerGone,
    TimelineGone,
    VersionCheckFailed,
    FileNotFound,
    RemoveFailed,
    AlreadyReinitialized,
    /// Not evicted because of a pending reinitialization
    LostToDownload,
    /// After eviction, there was a new layer access which cancelled the eviction.
    UpgradedBackOnAccess,
    UnexpectedEvictedState,
}
2256 :
2257 : impl EvictionCancelled {
2258 972 : fn as_str(&self) -> &'static str {
2259 972 : match self {
2260 108 : EvictionCancelled::LayerGone => "layer_gone",
2261 108 : EvictionCancelled::TimelineGone => "timeline_gone",
2262 108 : EvictionCancelled::VersionCheckFailed => "version_check_fail",
2263 108 : EvictionCancelled::FileNotFound => "file_not_found",
2264 108 : EvictionCancelled::RemoveFailed => "remove_failed",
2265 108 : EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
2266 108 : EvictionCancelled::LostToDownload => "lost_to_download",
2267 108 : EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
2268 108 : EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
2269 : }
2270 972 : }
2271 : }
2272 :
/// Reasons for a deletion on drop to have failed.
///
/// Each variant is one value of the `reason` label of `pageserver_layer_failed_deletes_count`.
#[derive(enum_map::Enum)]
enum DeleteFailed {
    TimelineGone,
    DeleteSchedulingFailed,
}
2278 :
2279 : impl DeleteFailed {
2280 216 : fn as_str(&self) -> &'static str {
2281 216 : match self {
2282 108 : DeleteFailed::TimelineGone => "timeline_gone",
2283 108 : DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
2284 : }
2285 216 : }
2286 : }
2287 :
/// Events which are unexpected or assumed to be rare.
///
/// Each variant is one value of the `event` label of `pageserver_layer_assumed_rare_count`.
#[derive(enum_map::Enum)]
enum RareEvent {
    RemoveOnDropFailed,
    InitCompletedWithoutRequester,
    DownloadFailedWithoutRequester,
    UpgradedWantedEvicted,
    InitWithoutDownload,
    PermanentLoadingFailure,
    EvictedWithWaiters,
}
2298 :
2299 : impl RareEvent {
2300 756 : fn as_str(&self) -> &'static str {
2301 : use RareEvent::*;
2302 :
2303 756 : match self {
2304 108 : RemoveOnDropFailed => "remove_on_drop_failed",
2305 108 : InitCompletedWithoutRequester => "init_completed_without",
2306 108 : DownloadFailedWithoutRequester => "download_failed_without",
2307 108 : UpgradedWantedEvicted => "raced_wanted_evicted",
2308 108 : InitWithoutDownload => "init_needed_no_download",
2309 108 : PermanentLoadingFailure => "permanent_loading_failure",
2310 108 : EvictedWithWaiters => "evicted_with_waiters",
2311 : }
2312 756 : }
2313 : }
2314 :
/// Shared [`LayerImplMetrics`] instance, created (and its metrics registered) by
/// [`LayerImplMetrics::default`] on first access.
pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    once_cell::sync::Lazy::new(LayerImplMetrics::default);
|