Line data Source code
1 : use std::ops::Range;
2 : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
3 : use std::sync::{Arc, Weak};
4 : use std::time::{Duration, SystemTime};
5 :
6 : use crate::PERF_TRACE_TARGET;
7 : use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT};
8 : use anyhow::Context;
9 : use camino::{Utf8Path, Utf8PathBuf};
10 : use pageserver_api::keyspace::KeySpace;
11 : use pageserver_api::models::HistoricLayerInfo;
12 : use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
13 : use tracing::{Instrument, info_span};
14 : use utils::generation::Generation;
15 : use utils::id::TimelineId;
16 : use utils::lsn::Lsn;
17 : use utils::sync::{gate, heavier_once_cell};
18 :
19 : use super::delta_layer::{self};
20 : use super::image_layer::{self};
21 : use super::{
22 : AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
23 : LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState,
24 : };
25 : use crate::config::PageServerConf;
26 : use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
27 : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
28 : use crate::task_mgr::TaskKind;
29 : use crate::tenant::Timeline;
30 : use crate::tenant::remote_timeline_client::LayerFileMetadata;
31 : use crate::tenant::timeline::{CompactionError, GetVectoredError};
32 :
33 : #[cfg(test)]
34 : mod tests;
35 :
36 : #[cfg(test)]
37 : mod failpoints;
38 :
/// 4.5 * 10^9, i.e. 4.5 GB when read as a byte count.
///
/// NOTE(review): going by the name this is the maximum size of an object uploaded to S3; the
/// users of this constant are not in this part of the file -- confirm against them.
pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
40 :
41 : /// A Layer contains all data in a "rectangle" consisting of a range of keys and
42 : /// range of LSNs.
43 : ///
44 : /// There are two kinds of layers, in-memory and on-disk layers. In-memory
45 : /// layers are used to ingest incoming WAL, and provide fast access to the
46 : /// recent page versions. On-disk layers are stored as files on disk, and are
47 : /// immutable. This type represents the on-disk kind while in-memory kind are represented by
48 : /// [`InMemoryLayer`].
49 : ///
50 : /// Furthermore, there are two kinds of on-disk layers: delta and image layers.
51 : /// A delta layer contains all modifications within a range of LSNs and keys.
52 : /// An image layer is a snapshot of all the data in a key-range, at a single
53 : /// LSN.
54 : ///
55 : /// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a
56 : /// general goal, read accesses should always win eviction and eviction should not wait for
57 : /// download.
58 : ///
59 : /// ### State transitions
60 : ///
61 : /// The internal state of `Layer` is composed of most importantly the on-filesystem state and the
62 : /// [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully downloaded,
63 : /// right size) or deleted.
64 : ///
65 : /// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the
66 : /// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the
67 : /// `heavier_once_cell::InitPermit` has been acquired, any read request
68 : /// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus
69 : /// cancelling the eviction.
70 : ///
71 : /// ```text
72 : /// +-----------------+ get_or_maybe_download +--------------------------------+
73 : /// | not initialized |--------------------------->| Resident(Arc<DownloadedLayer>) |
74 : /// | ENOENT | /->| |
75 : /// +-----------------+ | +--------------------------------+
76 : /// ^ | | ^
77 : /// | get_or_maybe_download | | | get_or_maybe_download, either:
78 : /// evict_blocking | /-------------------------/ | | - upgrade weak to strong
79 : /// | | | | - re-initialize without download
80 : /// | | evict_and_wait | |
81 : /// +-----------------+ v |
82 : /// | not initialized | on_downloaded_layer_drop +--------------------------------------+
83 : /// | file is present |<---------------------------| WantedEvicted(Weak<DownloadedLayer>) |
84 : /// +-----------------+ +--------------------------------------+
85 : /// ```
86 : ///
87 : /// ### Unsupported
88 : ///
89 : /// - Evicting by the operator deleting files from the filesystem
90 : ///
91 : /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
92 : #[derive(Clone)]
93 : pub(crate) struct Layer(Arc<LayerInner>);
94 :
95 : impl std::fmt::Display for Layer {
96 13080 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
97 13080 : write!(
98 13080 : f,
99 13080 : "{}{}",
100 13080 : self.layer_desc().short_id(),
101 13080 : self.0.generation.get_suffix()
102 13080 : )
103 13080 : }
104 : }
105 :
106 : impl std::fmt::Debug for Layer {
107 24 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
108 24 : write!(f, "{}", self)
109 24 : }
110 : }
111 :
112 : impl AsLayerDesc for Layer {
113 12126033 : fn layer_desc(&self) -> &PersistentLayerDesc {
114 12126033 : self.0.layer_desc()
115 12126033 : }
116 : }
117 :
118 : impl PartialEq for Layer {
119 21 : fn eq(&self, other: &Self) -> bool {
120 21 : Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
121 21 : }
122 : }
123 :
124 11628 : pub(crate) fn local_layer_path(
125 11628 : conf: &PageServerConf,
126 11628 : tenant_shard_id: &TenantShardId,
127 11628 : timeline_id: &TimelineId,
128 11628 : layer_file_name: &LayerName,
129 11628 : generation: &Generation,
130 11628 : ) -> Utf8PathBuf {
131 11628 : let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
132 11628 :
133 11628 : if generation.is_none() {
134 : // Without a generation, we may only use legacy path style
135 0 : timeline_path.join(layer_file_name.to_string())
136 : } else {
137 11628 : timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
138 : }
139 11628 : }
140 :
/// Answer to "when was this layer last evicted", as produced by [`Layer::last_evicted_at`].
///
/// The common traits are derived so that the value can be logged and compared in assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum LastEviction {
    /// No eviction time is recorded.
    Never,
    /// An eviction time is recorded; the payload is that instant.
    At(std::time::Instant),
    /// The eviction time could not be read because its lock was held at the time of the query.
    Evicting,
}

impl LastEviction {
    /// Returns `true` if the eviction is strictly later than `timepoint`.
    ///
    /// [`LastEviction::Never`] is never after anything, while [`LastEviction::Evicting`] is
    /// treated as being after every `timepoint`.
    pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
        match self {
            LastEviction::Never => false,
            LastEviction::At(evicted_at) => *evicted_at > timepoint,
            LastEviction::Evicting => true,
        }
    }
}
156 :
157 : impl Layer {
158 : /// Creates a layer value for a file we know to not be resident.
159 0 : pub(crate) fn for_evicted(
160 0 : conf: &'static PageServerConf,
161 0 : timeline: &Arc<Timeline>,
162 0 : file_name: LayerName,
163 0 : metadata: LayerFileMetadata,
164 0 : ) -> Self {
165 0 : let local_path = local_layer_path(
166 0 : conf,
167 0 : &timeline.tenant_shard_id,
168 0 : &timeline.timeline_id,
169 0 : &file_name,
170 0 : &metadata.generation,
171 0 : );
172 0 :
173 0 : let desc = PersistentLayerDesc::from_filename(
174 0 : timeline.tenant_shard_id,
175 0 : timeline.timeline_id,
176 0 : file_name,
177 0 : metadata.file_size,
178 0 : );
179 0 :
180 0 : let owner = Layer(Arc::new(LayerInner::new(
181 0 : conf,
182 0 : timeline,
183 0 : local_path,
184 0 : desc,
185 0 : None,
186 0 : metadata.generation,
187 0 : metadata.shard,
188 0 : )));
189 0 :
190 0 : debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
191 :
192 0 : owner
193 0 : }
194 :
195 : /// Creates a Layer value for a file we know to be resident in timeline directory.
196 732 : pub(crate) fn for_resident(
197 732 : conf: &'static PageServerConf,
198 732 : timeline: &Arc<Timeline>,
199 732 : local_path: Utf8PathBuf,
200 732 : file_name: LayerName,
201 732 : metadata: LayerFileMetadata,
202 732 : ) -> ResidentLayer {
203 732 : let desc = PersistentLayerDesc::from_filename(
204 732 : timeline.tenant_shard_id,
205 732 : timeline.timeline_id,
206 732 : file_name,
207 732 : metadata.file_size,
208 732 : );
209 732 :
210 732 : let mut resident = None;
211 732 :
212 732 : let owner = Layer(Arc::new_cyclic(|owner| {
213 732 : let inner = Arc::new(DownloadedLayer {
214 732 : owner: owner.clone(),
215 732 : kind: tokio::sync::OnceCell::default(),
216 732 : version: 0,
217 732 : });
218 732 : resident = Some(inner.clone());
219 732 :
220 732 : LayerInner::new(
221 732 : conf,
222 732 : timeline,
223 732 : local_path,
224 732 : desc,
225 732 : Some(inner),
226 732 : metadata.generation,
227 732 : metadata.shard,
228 732 : )
229 732 : }));
230 732 :
231 732 : let downloaded = resident.expect("just initialized");
232 732 :
233 732 : debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
234 :
235 732 : timeline
236 732 : .metrics
237 732 : .resident_physical_size_add(metadata.file_size);
238 732 :
239 732 : ResidentLayer { downloaded, owner }
240 732 : }
241 :
242 : /// Creates a Layer value for freshly written out new layer file by renaming it from a
243 : /// temporary path.
244 10992 : pub(crate) fn finish_creating(
245 10992 : conf: &'static PageServerConf,
246 10992 : timeline: &Arc<Timeline>,
247 10992 : desc: PersistentLayerDesc,
248 10992 : temp_path: &Utf8Path,
249 10992 : ) -> anyhow::Result<ResidentLayer> {
250 10992 : let mut resident = None;
251 10992 :
252 10992 : let owner = Layer(Arc::new_cyclic(|owner| {
253 10992 : let inner = Arc::new(DownloadedLayer {
254 10992 : owner: owner.clone(),
255 10992 : kind: tokio::sync::OnceCell::default(),
256 10992 : version: 0,
257 10992 : });
258 10992 : resident = Some(inner.clone());
259 10992 :
260 10992 : let local_path = local_layer_path(
261 10992 : conf,
262 10992 : &timeline.tenant_shard_id,
263 10992 : &timeline.timeline_id,
264 10992 : &desc.layer_name(),
265 10992 : &timeline.generation,
266 10992 : );
267 10992 :
268 10992 : LayerInner::new(
269 10992 : conf,
270 10992 : timeline,
271 10992 : local_path,
272 10992 : desc,
273 10992 : Some(inner),
274 10992 : timeline.generation,
275 10992 : timeline.get_shard_index(),
276 10992 : )
277 10992 : }));
278 10992 :
279 10992 : let downloaded = resident.expect("just initialized");
280 10992 :
281 10992 : // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
282 10992 : // TODO: this leaves the temp file in place if the rename fails, risking us running
283 10992 : // out of space. Should we clean it up here or does the calling context deal with this?
284 10992 : utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
285 10992 : .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
286 :
287 10992 : Ok(ResidentLayer { downloaded, owner })
288 10992 : }
289 :
290 : /// Requests the layer to be evicted and waits for this to be done.
291 : ///
292 : /// If the file is not resident, an [`EvictionError::NotFound`] is returned.
293 : ///
294 : /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is
295 : /// re-downloaded, [`EvictionError::Downloaded`] is returned.
296 : ///
297 : /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
298 : /// will happen regardless the future returned by this method completing unless there is a
299 : /// read access before eviction gets to complete.
300 : ///
301 : /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
302 : /// of download-evict cycle on retry.
303 312 : pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
304 312 : self.0.evict_and_wait(timeout).await
305 288 : }
306 :
307 : /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
308 : /// then.
309 : ///
310 : /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
311 : /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
312 : /// the value this is called on gets dropped.
313 : ///
314 : /// This is ensured by both of those methods accepting references to Layer.
315 : ///
316 : /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
317 : /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
318 3120 : pub(crate) fn delete_on_drop(&self) {
319 3120 : self.0.delete_on_drop();
320 3120 : }
321 :
322 1596019 : pub(crate) async fn get_values_reconstruct_data(
323 1596019 : &self,
324 1596019 : keyspace: KeySpace,
325 1596019 : lsn_range: Range<Lsn>,
326 1596019 : reconstruct_data: &mut ValuesReconstructState,
327 1596019 : ctx: &RequestContext,
328 1596019 : ) -> Result<(), GetVectoredError> {
329 1596019 : let downloaded = {
330 1596019 : let ctx = RequestContextBuilder::from(ctx)
331 1596019 : .perf_span(|crnt_perf_span| {
332 0 : info_span!(
333 : target: PERF_TRACE_TARGET,
334 0 : parent: crnt_perf_span,
335 : "GET_LAYER",
336 : )
337 1596019 : })
338 1596019 : .attached_child();
339 1596019 :
340 1596019 : self.0
341 1596019 : .get_or_maybe_download(true, &ctx)
342 1596019 : .maybe_perf_instrument(&ctx, |crnt_perf_context| crnt_perf_context.clone())
343 1596019 : .await
344 1596019 : .map_err(|err| match err {
345 : DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
346 0 : GetVectoredError::Cancelled
347 : }
348 0 : other => GetVectoredError::Other(anyhow::anyhow!(other)),
349 1596019 : })?
350 : };
351 :
352 1596019 : let this = ResidentLayer {
353 1596019 : downloaded: downloaded.clone(),
354 1596019 : owner: self.clone(),
355 1596019 : };
356 1596019 :
357 1596019 : self.record_access(ctx);
358 1596019 :
359 1596019 : let ctx = RequestContextBuilder::from(ctx)
360 1596019 : .perf_span(|crnt_perf_span| {
361 0 : info_span!(
362 : target: PERF_TRACE_TARGET,
363 0 : parent: crnt_perf_span,
364 : "VISIT_LAYER",
365 : )
366 1596019 : })
367 1596019 : .attached_child();
368 1596019 :
369 1596019 : downloaded
370 1596019 : .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, &ctx)
371 1596019 : .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
372 1596019 : .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
373 1596019 : .await
374 1596019 : .map_err(|err| match err {
375 0 : GetVectoredError::Other(err) => GetVectoredError::Other(
376 0 : err.context(format!("get_values_reconstruct_data for layer {self}")),
377 0 : ),
378 0 : err => err,
379 1596019 : })
380 1596019 : }
381 :
382 : /// Download the layer if evicted.
383 : ///
384 : /// Will not error when the layer is already downloaded.
385 0 : pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
386 0 : self.0.get_or_maybe_download(true, ctx).await?;
387 0 : Ok(())
388 0 : }
389 :
390 1896 : pub(crate) async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
391 1896 : self.0.needs_download().await
392 1896 : }
393 :
394 : /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
395 : /// while the guard exists.
396 : ///
397 : /// Returns None if the layer is currently evicted or becoming evicted.
398 120 : pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
399 120 : let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;
400 :
401 84 : Some(ResidentLayer {
402 84 : downloaded,
403 84 : owner: self.clone(),
404 84 : })
405 120 : }
406 :
407 : /// Weak indicator of is the layer resident or not. Good enough for eviction, which can deal
408 : /// with `EvictionError::NotFound`.
409 : ///
410 : /// Returns `true` if this layer might be resident, or `false`, if it most likely evicted or
411 : /// will be unless a read happens soon.
412 1077 : pub(crate) fn is_likely_resident(&self) -> bool {
413 1077 : self.0
414 1077 : .inner
415 1077 : .get()
416 1077 : .map(|rowe| rowe.is_likely_resident())
417 1077 : .unwrap_or(false)
418 1077 : }
419 :
420 : /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
421 3480 : pub(crate) async fn download_and_keep_resident(
422 3480 : &self,
423 3480 : ctx: &RequestContext,
424 3480 : ) -> Result<ResidentLayer, DownloadError> {
425 3480 : let downloaded = self.0.get_or_maybe_download(true, ctx).await?;
426 :
427 3480 : Ok(ResidentLayer {
428 3480 : downloaded,
429 3480 : owner: self.clone(),
430 3480 : })
431 3480 : }
432 :
433 0 : pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
434 0 : self.0.info(reset)
435 0 : }
436 :
437 204 : pub(crate) fn latest_activity(&self) -> SystemTime {
438 204 : self.0.access_stats.latest_activity()
439 204 : }
440 :
441 552 : pub(crate) fn visibility(&self) -> LayerVisibilityHint {
442 552 : self.0.access_stats.visibility()
443 552 : }
444 :
445 11004 : pub(crate) fn local_path(&self) -> &Utf8Path {
446 11004 : &self.0.path
447 11004 : }
448 :
449 15984 : pub(crate) fn metadata(&self) -> LayerFileMetadata {
450 15984 : self.0.metadata()
451 15984 : }
452 :
453 156 : pub(crate) fn last_evicted_at(&self) -> LastEviction {
454 156 : match self.0.last_evicted_at.try_lock() {
455 156 : Ok(lock) => match *lock {
456 0 : None => LastEviction::Never,
457 156 : Some(at) => LastEviction::At(at),
458 : },
459 0 : Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
460 0 : Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
461 : }
462 156 : }
463 :
464 0 : pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
465 0 : self.0
466 0 : .timeline
467 0 : .upgrade()
468 0 : .map(|timeline| timeline.timeline_id)
469 0 : }
470 :
471 : /// Traditional debug dumping facility
472 : #[allow(unused)]
473 24 : pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
474 24 : self.0.desc.dump();
475 24 :
476 24 : if verbose {
477 : // for now, unconditionally download everything, even if that might not be wanted.
478 24 : let l = self.0.get_or_maybe_download(true, ctx).await?;
479 24 : l.dump(&self.0, ctx).await?
480 0 : }
481 :
482 24 : Ok(())
483 24 : }
484 :
/// Waits until this layer has been dropped (and if needed, local file deletion and remote
/// deletion scheduling has completed).
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// separately.
///
/// The returned future does not borrow `self`: it only holds a receiver of the status channel
/// and completes once the sending side is gone.
#[cfg(any(feature = "testing", test))]
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
    let mut status_rx = self.0.status.as_ref().unwrap().subscribe();

    async move {
        // status updates themselves are not interesting, only the channel closing is
        while status_rx.changed().await.is_ok() {}
    }
}
502 :
503 1596019 : fn record_access(&self, ctx: &RequestContext) {
504 1596019 : if self.0.access_stats.record_access(ctx) {
505 : // Visibility was modified to Visible: maybe log about this
506 0 : match ctx.task_kind() {
507 : TaskKind::CalculateSyntheticSize
508 : | TaskKind::OndemandLogicalSizeCalculation
509 : | TaskKind::GarbageCollector
510 0 : | TaskKind::MgmtRequest => {
511 0 : // This situation is expected in code paths do binary searches of the LSN space to resolve
512 0 : // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
513 0 : // and on-demand for certain HTTP API requests. On-demand logical size calculation is also included
514 0 : // because it is run as a sub-task of synthetic size.
515 0 : }
516 : _ => {
517 : // In all other contexts, it is unusual to do I/O involving layers which are not visible at
518 : // some branch tip, so we log the fact that we are accessing something that the visibility
519 : // calculation thought should not be visible.
520 : //
521 : // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
522 : // which was covered by a concurrent compaction.
523 0 : tracing::info!(
524 : layer=%self,
525 0 : "became visible as a result of access",
526 : );
527 : }
528 : }
529 :
530 : // Update the timeline's visible bytes count
531 0 : if let Some(tl) = self.0.timeline.upgrade() {
532 0 : tl.metrics
533 0 : .visible_physical_size_gauge
534 0 : .add(self.0.desc.file_size)
535 0 : }
536 1596019 : }
537 1596019 : }
538 :
539 2196 : pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
540 2196 : let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
541 : use LayerVisibilityHint::*;
542 2196 : match (old_visibility, visibility) {
543 : (Visible, Covered) => {
544 : // Subtract this layer's contribution to the visible size metric
545 216 : if let Some(tl) = self.0.timeline.upgrade() {
546 216 : debug_assert!(
547 216 : tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
548 : );
549 216 : tl.metrics
550 216 : .visible_physical_size_gauge
551 216 : .sub(self.0.desc.file_size)
552 0 : }
553 : }
554 : (Covered, Visible) => {
555 : // Add this layer's contribution to the visible size metric
556 0 : if let Some(tl) = self.0.timeline.upgrade() {
557 0 : tl.metrics
558 0 : .visible_physical_size_gauge
559 0 : .add(self.0.desc.file_size)
560 0 : }
561 : }
562 1980 : (Covered, Covered) | (Visible, Visible) => {
563 1980 : // no change
564 1980 : }
565 : }
566 2196 : }
567 : }
568 :
569 : /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
570 : ///
571 : /// However when we want something evicted, we cannot evict it right away as there might be current
572 : /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
573 : /// read with [`Layer::get_values_reconstruct_data`].
574 : ///
575 : /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
576 : #[derive(Debug)]
577 : enum ResidentOrWantedEvicted {
578 : Resident(Arc<DownloadedLayer>),
579 : WantedEvicted(Weak<DownloadedLayer>, usize),
580 : }
581 :
582 : impl ResidentOrWantedEvicted {
583 : /// Non-mutating access to the a DownloadedLayer, if possible.
584 : ///
585 : /// This is not used on the read path (anything that calls
586 : /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
587 : /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
588 84 : fn get(&self) -> Option<Arc<DownloadedLayer>> {
589 84 : match self {
590 84 : ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
591 0 : ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(),
592 : }
593 84 : }
594 :
595 : /// Best-effort query for residency right now, not as strong guarantee as receiving a strong
596 : /// reference from `ResidentOrWantedEvicted::get`.
597 669 : fn is_likely_resident(&self) -> bool {
598 669 : match self {
599 633 : ResidentOrWantedEvicted::Resident(_) => true,
600 36 : ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0,
601 : }
602 669 : }
603 :
604 : /// Upgrades any weak to strong if possible.
605 : ///
606 : /// Returns a strong reference if possible, along with a boolean telling if an upgrade
607 : /// happened.
608 1599499 : fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
609 1599499 : match self {
610 1599451 : ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
611 48 : ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
612 0 : Some(strong) => {
613 0 : LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
614 0 :
615 0 : *self = ResidentOrWantedEvicted::Resident(strong.clone());
616 0 :
617 0 : Some((strong, true))
618 : }
619 48 : None => None,
620 : },
621 : }
622 1599499 : }
623 :
624 : /// When eviction is first requested, drop down to holding a [`Weak`].
625 : ///
626 : /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
627 : /// drop the possibly last strong reference outside of the mutex of
628 : /// [`heavier_once_cell::OnceCell`].
629 276 : fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
630 276 : match self {
631 252 : ResidentOrWantedEvicted::Resident(strong) => {
632 252 : let weak = Arc::downgrade(strong);
633 252 : let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
634 252 : std::mem::swap(self, &mut temp);
635 252 : match temp {
636 252 : ResidentOrWantedEvicted::Resident(strong) => Some(strong),
637 0 : ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
638 : }
639 : }
640 24 : ResidentOrWantedEvicted::WantedEvicted(..) => None,
641 : }
642 276 : }
643 : }
644 :
645 : struct LayerInner {
646 : /// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
647 : /// [`Self::path`].
648 : conf: &'static PageServerConf,
649 :
650 : /// Full path to the file; unclear if this should exist anymore.
651 : path: Utf8PathBuf,
652 :
653 : desc: PersistentLayerDesc,
654 :
655 : /// Timeline access is needed for remote timeline client and metrics.
656 : ///
657 : /// There should not be an access to timeline for any reason without entering the
658 : /// [`Timeline::gate`] at the same time.
659 : timeline: Weak<Timeline>,
660 :
661 : access_stats: LayerAccessStats,
662 :
663 : /// This custom OnceCell is backed by std mutex, but only held for short time periods.
664 : ///
665 : /// Filesystem changes (download, evict) are only done while holding a permit which the
666 : /// `heavier_once_cell` provides.
667 : ///
668 : /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but
669 : /// possibly read while not holding it.
670 : inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
671 :
672 : /// Do we want to delete locally and remotely this when `LayerInner` is dropped
673 : wanted_deleted: AtomicBool,
674 :
675 : /// Version is to make sure we will only evict a specific initialization of the downloaded file.
676 : ///
677 : /// Incremented for each initialization, stored in `DownloadedLayer::version` or
678 : /// `ResidentOrWantedEvicted::WantedEvicted`.
679 : version: AtomicUsize,
680 :
681 : /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download
682 : /// starts, or completes.
683 : ///
684 : /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard.
685 : /// Holding the InitPermit is the only time we can do state transitions, but we also need to
686 : /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to
687 : /// [`ResidentOrWantedEvicted::Resident`] on access.
688 : ///
689 : /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`].
690 : status: Option<tokio::sync::watch::Sender<Status>>,
691 :
692 : /// Counter for exponential backoff with the download.
693 : ///
694 : /// This is atomic only for the purposes of having additional data only accessed while holding
695 : /// the InitPermit.
696 : consecutive_failures: AtomicUsize,
697 :
698 : /// The generation of this Layer.
699 : ///
700 : /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
701 : /// for created layers from [`Timeline::generation`].
702 : generation: Generation,
703 :
704 : /// The shard of this Layer.
705 : ///
706 : /// For layers created in this process, this will always be the [`ShardIndex`] of the
707 : /// current `ShardIdentity`` (TODO: add link once it's introduced).
708 : ///
709 : /// For loaded layers, this may be some other value if the tenant has undergone
710 : /// a shard split since the layer was originally written.
711 : shard: ShardIndex,
712 :
713 : /// When the Layer was last evicted but has not been downloaded since.
714 : ///
715 : /// This is used for skipping evicted layers from the previous heatmap (see
716 : /// `[Timeline::generate_heatmap]`) and for updating metrics
717 : /// (see [`LayerImplMetrics::redownload_after`]).
718 : last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
719 :
720 : #[cfg(test)]
721 : failpoints: std::sync::Mutex<Vec<failpoints::Failpoint>>,
722 : }
723 :
724 : impl std::fmt::Display for LayerInner {
725 324 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
726 324 : write!(f, "{}", self.layer_desc().short_id())
727 324 : }
728 : }
729 :
730 : impl AsLayerDesc for LayerInner {
731 12141722 : fn layer_desc(&self) -> &PersistentLayerDesc {
732 12141722 : &self.desc
733 12141722 : }
734 : }
735 :
/// Value published on the `LayerInner::status` watch channel.
#[derive(Debug, Clone, Copy)]
enum Status {
    /// The layer is resident; also the initial status of a layer created with a downloaded file.
    Resident,
    /// The layer has been evicted; also the initial status of a layer created without a
    /// downloaded file.
    Evicted,
    /// A download of the layer has started.
    Downloading,
}
742 :
743 : impl Drop for LayerInner {
744 4249 : fn drop(&mut self) {
745 4249 : // if there was a pending eviction, mark it cancelled here to balance metrics
746 4249 : if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
747 12 : {
748 12 : // eviction has already been started
749 12 : LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
750 12 :
751 12 : // eviction request is intentionally not honored as no one is present to wait for it
752 12 : // and we could be delaying shutdown for nothing.
753 4237 : }
754 :
755 4249 : let timeline = self.timeline.upgrade();
756 :
757 4249 : if let Some(timeline) = timeline.as_ref() {
758 : // Only need to decrement metrics if the timeline still exists: otherwise
759 : // it will have already de-registered these metrics via TimelineMetrics::shutdown
760 4153 : timeline.metrics.dec_layer(&self.desc);
761 :
762 4153 : if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
763 4153 : debug_assert!(
764 4153 : timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
765 : );
766 4153 : timeline
767 4153 : .metrics
768 4153 : .visible_physical_size_gauge
769 4153 : .sub(self.desc.file_size);
770 0 : }
771 96 : }
772 :
773 4249 : if !*self.wanted_deleted.get_mut() {
774 1176 : return;
775 3073 : }
776 :
777 3073 : let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
778 :
779 3073 : let path = std::mem::take(&mut self.path);
780 3073 : let file_name = self.layer_desc().layer_name();
781 3073 : let file_size = self.layer_desc().file_size;
782 3073 : let meta = self.metadata();
783 3073 : let status = self.status.take();
784 3073 :
785 3073 : Self::spawn_blocking(move || {
786 3068 : let _g = span.entered();
787 3068 :
788 3068 : // carry this until we are finished for [`Layer::wait_drop`] support
789 3068 : let _status = status;
790 :
791 3068 : let Some(timeline) = timeline else {
792 : // no need to nag that timeline is gone: under normal situation on
793 : // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
794 0 : LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
795 0 : return;
796 : };
797 :
798 3068 : let Ok(_guard) = timeline.gate.enter() else {
799 0 : LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
800 0 : return;
801 : };
802 :
803 3068 : let removed = match std::fs::remove_file(path) {
804 3056 : Ok(()) => true,
805 12 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
806 12 : // until we no longer do detaches by removing all local files before removing the
807 12 : // tenant from the global map, we will always get these errors even if we knew what
808 12 : // is the latest state.
809 12 : //
810 12 : // we currently do not track the latest state, so we'll also end up here on evicted
811 12 : // layers.
812 12 : false
813 : }
814 0 : Err(e) => {
815 0 : tracing::error!("failed to remove wanted deleted layer: {e}");
816 0 : LAYER_IMPL_METRICS.inc_delete_removes_failed();
817 0 : false
818 : }
819 : };
820 :
821 3068 : if removed {
822 3056 : timeline.metrics.resident_physical_size_sub(file_size);
823 3056 : }
824 3068 : let res = timeline
825 3068 : .remote_client
826 3068 : .schedule_deletion_of_unlinked(vec![(file_name, meta)]);
827 :
828 3068 : if let Err(e) = res {
829 : // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
830 : // demonstrating this deadlock (without spawn_blocking): stop will drop
831 : // queued items, which will have ResidentLayer's, and those drops would try
832 : // to re-entrantly lock the RemoteTimelineClient inner state.
833 12 : if !timeline.is_active() {
834 12 : tracing::info!("scheduling deletion on drop failed: {e:#}");
835 : } else {
836 0 : tracing::warn!("scheduling deletion on drop failed: {e:#}");
837 : }
838 12 : LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
839 3056 : } else {
840 3056 : LAYER_IMPL_METRICS.inc_completed_deletes();
841 3056 : }
842 3073 : });
843 4249 : }
844 : }
845 :
846 : impl LayerInner {
847 : #[allow(clippy::too_many_arguments)]
848 11724 : fn new(
849 11724 : conf: &'static PageServerConf,
850 11724 : timeline: &Arc<Timeline>,
851 11724 : local_path: Utf8PathBuf,
852 11724 : desc: PersistentLayerDesc,
853 11724 : downloaded: Option<Arc<DownloadedLayer>>,
854 11724 : generation: Generation,
855 11724 : shard: ShardIndex,
856 11724 : ) -> Self {
857 11724 : let (inner, version, init_status) = if let Some(inner) = downloaded {
858 11724 : let version = inner.version;
859 11724 : let resident = ResidentOrWantedEvicted::Resident(inner);
860 11724 : (
861 11724 : heavier_once_cell::OnceCell::new(resident),
862 11724 : version,
863 11724 : Status::Resident,
864 11724 : )
865 : } else {
866 0 : (heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
867 : };
868 :
869 : // This object acts as a RAII guard on these metrics: increment on construction
870 11724 : timeline.metrics.inc_layer(&desc);
871 11724 :
872 11724 : // New layers are visible by default. This metric is later updated on drop or in set_visibility
873 11724 : timeline
874 11724 : .metrics
875 11724 : .visible_physical_size_gauge
876 11724 : .add(desc.file_size);
877 11724 :
878 11724 : LayerInner {
879 11724 : conf,
880 11724 : path: local_path,
881 11724 : desc,
882 11724 : timeline: Arc::downgrade(timeline),
883 11724 : access_stats: Default::default(),
884 11724 : wanted_deleted: AtomicBool::new(false),
885 11724 : inner,
886 11724 : version: AtomicUsize::new(version),
887 11724 : status: Some(tokio::sync::watch::channel(init_status).0),
888 11724 : consecutive_failures: AtomicUsize::new(0),
889 11724 : generation,
890 11724 : shard,
891 11724 : last_evicted_at: std::sync::Mutex::default(),
892 11724 : #[cfg(test)]
893 11724 : failpoints: Default::default(),
894 11724 : }
895 11724 : }
896 :
897 3120 : fn delete_on_drop(&self) {
898 3120 : let res =
899 3120 : self.wanted_deleted
900 3120 : .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
901 3120 :
902 3120 : if res.is_ok() {
903 3096 : LAYER_IMPL_METRICS.inc_started_deletes();
904 3096 : }
905 3120 : }
906 :
/// Requests eviction of this layer and waits up to `timeout` for the next status change.
///
/// Cancellation safe, however dropping the future and calling this method again might result
/// in a new attempt to evict OR join the previously started attempt.
///
/// # Errors
///
/// - [`EvictionError::NotFound`] if the status is not `Status::Resident` on entry, or if
///   `self.inner` is currently uninitialized.
/// - [`EvictionError::Timeout`] if no status change was observed within `timeout`.
/// - [`EvictionError::Downloaded`] if the status observed after the change is
///   `Status::Resident`.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
    // subscribe before inspecting anything so that later status changes are observable via `rx`
    let mut rx = self.status.as_ref().unwrap().subscribe();

    {
        let current = rx.borrow_and_update();
        match &*current {
            Status::Resident => {
                // we might get lucky and evict this; continue
            }
            Status::Evicted | Status::Downloading => {
                // not resident right now (already evicted, or being downloaded again), so
                // there is nothing for us to evict
                return Err(EvictionError::NotFound);
            }
        }
    }

    let strong = {
        match self.inner.get() {
            Some(mut either) => either.downgrade(),
            None => {
                // we already have a scheduled eviction, which just has not gotten to run yet.
                // it might still race with a read access, but that could also get cancelled,
                // so let's say this is not evictable.
                return Err(EvictionError::NotFound);
            }
        }
    };

    if strong.is_some() {
        // drop the DownloadedLayer outside of the scope that held the guard
        drop(strong);

        // idea here is that only one evicter should ever get to witness a strong reference,
        // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
        // cancelled eviction and signal us, like it currently does.
        //
        // a second concurrent evict_and_wait will not see a strong reference.
        LAYER_IMPL_METRICS.inc_started_evictions();
    }

    // wait for the next status update, bounded by `timeout`
    let changed = rx.changed();
    let changed = tokio::time::timeout(timeout, changed).await;

    let Ok(changed) = changed else {
        return Err(EvictionError::Timeout);
    };

    let _: () = changed.expect("cannot be closed, because we are holding a strong reference");

    let current = rx.borrow_and_update();

    match &*current {
        // the easiest case
        Status::Evicted => Ok(()),
        // it surely was evicted in between, but then there was a new access now; we can't know
        // if it'll succeed so lets just call it evicted
        Status::Downloading => Ok(()),
        // either the download which was started after eviction completed already, or it was
        // never evicted
        Status::Resident => Err(EvictionError::Downloaded),
    }
}
972 :
/// Returns the resident [`DownloadedLayer`], initializing or downloading it when necessary.
///
/// Cancellation safe.
///
/// The steps, in order:
/// 1. Lock or acquire the init permit of `self.inner`; if a strong reference can be obtained
///    it is returned right away.
/// 2. Otherwise stat the local file via `needs_download`; if the file is present and good,
///    reinitialize `self.inner` without downloading.
/// 3. Otherwise check the request context policy and `allow_download`, then download.
///
/// # Errors
///
/// - [`DownloadError::TimelineShutdown`] if the timeline can no longer be upgraded.
/// - [`DownloadError::PreStatFailed`] if the stat before the download failed.
/// - [`DownloadError::NotFile`] if the local path exists but is not a file.
/// - [`DownloadError::ContextAndConfigReallyDeniesDownloads`] per `check_expected_download`.
/// - [`DownloadError::DownloadRequired`] if a download is needed but `allow_download` is false.
/// - whatever `download_init_and_wait` returns for the download itself.
async fn get_or_maybe_download(
    self: &Arc<Self>,
    allow_download: bool,
    ctx: &RequestContext,
) -> Result<Arc<DownloadedLayer>, DownloadError> {
    // reports the accumulated wait time to the request context when this guard is dropped,
    // which covers every return path below
    let mut wait_for_download_recorder =
        scopeguard::guard(utils::elapsed_accum::ElapsedAccum::default(), |accum| {
            ctx.ondemand_download_wait_observe(accum.get());
        });
    let (weak, permit) = {
        // get_or_init_detached can:
        // - be fast (mutex lock) OR uncontested semaphore permit acquire
        // - be slow (wait for semaphore permit or closing)
        let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());

        let locked = self
            .inner
            .get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
            .await
            .map(|mut guard| guard.get_and_upgrade().ok_or(guard));

        // we were not cancelled during the await: defuse the cancellation counter
        scopeguard::ScopeGuard::into_inner(init_cancelled);

        match locked {
            // this path could have been a RwLock::read
            Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong),
            Ok(Ok((strong, _))) => {
                // when upgraded back, the Arc<DownloadedLayer> is still available, but
                // previously a `evict_and_wait` was received. this is the only place when we
                // send out an update without holding the InitPermit.
                //
                // note that we also have dropped the Guard; this is fine, because we just made
                // a state change and are holding a strong reference to be returned.
                self.status.as_ref().unwrap().send_replace(Status::Resident);
                LAYER_IMPL_METRICS
                    .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);

                return Ok(strong);
            }
            Ok(Err(guard)) => {
                // path to here: we won the eviction, the file should still be on the disk.
                let (weak, permit) = guard.take_and_deinit();
                (Some(weak), permit)
            }
            Err(permit) => (None, permit),
        }
    };
    let _guard = wait_for_download_recorder.guard();

    if let Some(weak) = weak {
        // only drop the weak after dropping the heavier_once_cell guard
        assert!(
            matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
            "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
        );
    }

    let timeline = self
        .timeline
        .upgrade()
        .ok_or(DownloadError::TimelineShutdown)?;

    // count cancellations, which currently remain largely unexpected
    let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());

    // check if we really need to be downloaded: this can happen if a read access won the
    // semaphore before eviction.
    //
    // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a
    // pending eviction will try to evict even upon finding an uninitialized `self.inner`.
    let needs_download = self
        .needs_download()
        .await
        .map_err(DownloadError::PreStatFailed);

    // the stat completed without cancellation: defuse the counter before propagating errors
    scopeguard::ScopeGuard::into_inner(init_cancelled);

    let needs_download = needs_download?;

    let Some(reason) = needs_download else {
        // the file is present locally because eviction has not had a chance to run yet

        #[cfg(test)]
        self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
            .await?;

        LAYER_IMPL_METRICS.inc_init_needed_no_download();

        return Ok(self.initialize_after_layer_is_on_disk(permit));
    };

    // we must download; getting cancelled before spawning the download is not an issue as
    // any still running eviction would not find anything to evict.

    if let NeedsDownload::NotFile(ft) = reason {
        return Err(DownloadError::NotFile(ft));
    }

    self.check_expected_download(ctx)?;

    if !allow_download {
        // this is only used from tests, but it is hard to test without the boolean
        return Err(DownloadError::DownloadRequired);
    }

    // derive the context used for the download: a detached child with its own root perf span
    // when perf tracing is active, otherwise a plain attached child
    let ctx = if ctx.has_perf_span() {
        let dl_ctx = RequestContextBuilder::from(ctx)
            .task_kind(TaskKind::LayerDownload)
            .download_behavior(DownloadBehavior::Download)
            .root_perf_span(|| {
                info_span!(
                    target: PERF_TRACE_TARGET,
                    "DOWNLOAD_LAYER",
                    layer = %self,
                    reason = %reason
                )
            })
            .detached_child();
        ctx.perf_follows_from(&dl_ctx);
        dl_ctx
    } else {
        ctx.attached_child()
    };

    async move {
        tracing::info!(%reason, "downloading on-demand");

        // counts a cancellation if this future is dropped while waiting for the download
        let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
        let res = self
            .download_init_and_wait(timeline, permit, ctx.attached_child())
            .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
            .await?;

        scopeguard::ScopeGuard::into_inner(init_cancelled);
        Ok(res)
    }
    .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
    .await
}
1113 :
1114 : /// Nag or fail per RequestContext policy
1115 96 : fn check_expected_download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
1116 : use crate::context::DownloadBehavior::*;
1117 96 : let b = ctx.download_behavior();
1118 96 : match b {
1119 96 : Download => Ok(()),
1120 : Warn | Error => {
1121 0 : tracing::info!(
1122 0 : "unexpectedly on-demand downloading for task kind {:?}",
1123 0 : ctx.task_kind()
1124 : );
1125 0 : crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
1126 :
1127 0 : let really_error =
1128 0 : matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
1129 :
1130 0 : if really_error {
1131 : // this check is only probablistic, seems like flakyness footgun
1132 0 : Err(DownloadError::ContextAndConfigReallyDeniesDownloads)
1133 : } else {
1134 0 : Ok(())
1135 : }
1136 : }
1137 : }
1138 96 : }
1139 :
/// Actual download, at most one is executed at the time.
///
/// The download runs in a task started with `Self::spawn`, and its result is sent back over a
/// oneshot channel. If this future is dropped, the task still runs to completion and records
/// a metric instead of delivering the result.
///
/// # Errors
///
/// - [`DownloadError::DownloadCancelled`] if the timeline gate cannot be entered, if the
///   remote storage reported cancellation, or if the task went away without sending a result.
/// - [`DownloadError::DownloadFailed`] for any other download error.
async fn download_init_and_wait(
    self: &Arc<Self>,
    timeline: Arc<Timeline>,
    permit: heavier_once_cell::InitPermit,
    ctx: RequestContext,
) -> Result<Arc<DownloadedLayer>, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    let (tx, rx) = tokio::sync::oneshot::channel();

    let this: Arc<Self> = self.clone();

    // enter the gate before spawning; the guard is moved into the task and held for its
    // whole duration
    let guard = timeline
        .gate
        .enter()
        .map_err(|_| DownloadError::DownloadCancelled)?;

    Self::spawn(
        async move {
            let _guard = guard;

            // now that we have committed to downloading, send out an update to:
            // - unhang any pending eviction
            // - break out of evict_and_wait
            this.status
                .as_ref()
                .unwrap()
                .send_replace(Status::Downloading);

            #[cfg(test)]
            this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading)
                .await
                .unwrap();

            let res = this.download_and_init(timeline, permit, &ctx).await;

            // a failed send hands the result back: the requester is gone, so only account
            // for the outcome
            if let Err(res) = tx.send(res) {
                match res {
                    Ok(_res) => {
                        tracing::debug!("layer initialized, but caller has been cancelled");
                        LAYER_IMPL_METRICS.inc_init_completed_without_requester();
                    }
                    Err(e) => {
                        tracing::info!(
                            "layer file download failed, and caller has been cancelled: {e:?}"
                        );
                        LAYER_IMPL_METRICS.inc_download_failed_without_requester();
                    }
                }
            }
        }
        .in_current_span(),
    );

    match rx.await {
        Ok(Ok(res)) => Ok(res),
        Ok(Err(remote_storage::DownloadError::Cancelled)) => {
            Err(DownloadError::DownloadCancelled)
        }
        Ok(Err(_)) => Err(DownloadError::DownloadFailed),
        // the sender was dropped without sending
        Err(_gone) => Err(DownloadError::DownloadCancelled),
    }
}
1204 :
1205 84 : async fn download_and_init(
1206 84 : self: &Arc<LayerInner>,
1207 84 : timeline: Arc<Timeline>,
1208 84 : permit: heavier_once_cell::InitPermit,
1209 84 : ctx: &RequestContext,
1210 84 : ) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
1211 84 : let start = std::time::Instant::now();
1212 84 : let result = timeline
1213 84 : .remote_client
1214 84 : .download_layer_file(
1215 84 : &self.desc.layer_name(),
1216 84 : &self.metadata(),
1217 84 : &self.path,
1218 84 : &timeline.gate,
1219 84 : &timeline.cancel,
1220 84 : ctx,
1221 84 : )
1222 84 : .await;
1223 84 : let latency = start.elapsed();
1224 84 : let latency_millis = u64::try_from(latency.as_millis()).unwrap();
1225 84 : match result {
1226 84 : Ok(size) => {
1227 84 : assert_eq!(size, self.desc.file_size);
1228 :
1229 84 : match self.needs_download().await {
1230 0 : Ok(Some(reason)) => {
1231 0 : // this is really a bug in needs_download or remote timeline client
1232 0 : panic!("post-condition failed: needs_download returned {reason:?}");
1233 : }
1234 84 : Ok(None) => {
1235 84 : // as expected
1236 84 : }
1237 0 : Err(e) => {
1238 0 : panic!("post-condition failed: needs_download errored: {e:?}");
1239 : }
1240 : };
1241 84 : tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
1242 84 : timeline
1243 84 : .metrics
1244 84 : .resident_physical_size_add(self.desc.file_size);
1245 84 : self.consecutive_failures.store(0, Ordering::Relaxed);
1246 84 :
1247 84 : let since_last_eviction = self
1248 84 : .last_evicted_at
1249 84 : .lock()
1250 84 : .unwrap()
1251 84 : .take()
1252 84 : .map(|ts| ts.elapsed());
1253 84 : if let Some(since_last_eviction) = since_last_eviction {
1254 84 : LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
1255 84 : }
1256 :
1257 84 : self.access_stats.record_residence_event();
1258 84 :
1259 84 : let task_kind: &'static str = ctx.task_kind().into();
1260 84 : ONDEMAND_DOWNLOAD_BYTES
1261 84 : .with_label_values(&[task_kind])
1262 84 : .inc_by(self.desc.file_size);
1263 84 : ONDEMAND_DOWNLOAD_COUNT
1264 84 : .with_label_values(&[task_kind])
1265 84 : .inc();
1266 84 :
1267 84 : Ok(self.initialize_after_layer_is_on_disk(permit))
1268 : }
1269 0 : Err(e) => {
1270 0 : let consecutive_failures =
1271 0 : 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
1272 0 :
1273 0 : if timeline.cancel.is_cancelled() {
1274 : // If we're shutting down, drop out before logging the error
1275 0 : return Err(e);
1276 0 : }
1277 0 :
1278 0 : tracing::error!(consecutive_failures, %latency_millis, "layer file download failed: {e:#}");
1279 :
1280 0 : let backoff = utils::backoff::exponential_backoff_duration_seconds(
1281 0 : consecutive_failures.min(u32::MAX as usize) as u32,
1282 0 : 1.5,
1283 0 : 60.0,
1284 0 : );
1285 0 :
1286 0 : let backoff = std::time::Duration::from_secs_f64(backoff);
1287 0 :
1288 0 : tokio::select! {
1289 0 : _ = tokio::time::sleep(backoff) => {},
1290 0 : _ = timeline.cancel.cancelled() => {},
1291 : };
1292 :
1293 0 : Err(e)
1294 : }
1295 : }
1296 84 : }
1297 :
/// Initializes the `Self::inner` to a "resident" state.
///
/// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download`
/// before calling this method.
///
/// If this method is ever made async, it needs to be cancellation safe so that no state
/// changes are made before we can write to the OnceCell in non-cancellable fashion.
///
/// Consumes `permit` to store the new value into `Self::inner`, and returns the freshly
/// created [`DownloadedLayer`], which carries the incremented version number.
fn initialize_after_layer_is_on_disk(
    self: &Arc<LayerInner>,
    permit: heavier_once_cell::InitPermit,
) -> Arc<DownloadedLayer> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    // disable any scheduled but not yet running eviction deletions for this initialization
    let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
    self.status.as_ref().unwrap().send_replace(Status::Resident);

    let res = Arc::new(DownloadedLayer {
        owner: Arc::downgrade(self),
        kind: tokio::sync::OnceCell::default(),
        version: next_version,
    });

    // purely informational: log when other initializers are counted on the cell
    let waiters = self.inner.initializer_count();
    if waiters > 0 {
        tracing::info!(waiters, "completing layer init for other tasks");
    }

    let value = ResidentOrWantedEvicted::Resident(res.clone());

    self.inner.set(value, permit);

    res
}
1332 :
1333 2136 : async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
1334 2136 : match tokio::fs::metadata(&self.path).await {
1335 2040 : Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
1336 96 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
1337 0 : Err(e) => Err(e),
1338 : }
1339 2136 : }
1340 :
1341 732 : fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
1342 732 : match self.path.metadata() {
1343 732 : Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
1344 0 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
1345 0 : Err(e) => Err(e),
1346 : }
1347 732 : }
1348 :
1349 2772 : fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
1350 2772 : // in future, this should include sha2-256 validation of the file.
1351 2772 : if !m.is_file() {
1352 0 : Err(NeedsDownload::NotFile(m.file_type()))
1353 2772 : } else if m.len() != self.desc.file_size {
1354 0 : Err(NeedsDownload::WrongSize {
1355 0 : actual: m.len(),
1356 0 : expected: self.desc.file_size,
1357 0 : })
1358 : } else {
1359 2772 : Ok(())
1360 : }
1361 2772 : }
1362 :
1363 0 : fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
1364 0 : let layer_name = self.desc.layer_name().to_string();
1365 0 :
1366 0 : let resident = self
1367 0 : .inner
1368 0 : .get()
1369 0 : .map(|rowe| rowe.is_likely_resident())
1370 0 : .unwrap_or(false);
1371 0 :
1372 0 : let access_stats = self.access_stats.as_api_model(reset);
1373 0 :
1374 0 : if self.desc.is_delta {
1375 0 : let lsn_range = &self.desc.lsn_range;
1376 0 :
1377 0 : HistoricLayerInfo::Delta {
1378 0 : layer_file_name: layer_name,
1379 0 : layer_file_size: self.desc.file_size,
1380 0 : lsn_start: lsn_range.start,
1381 0 : lsn_end: lsn_range.end,
1382 0 : remote: !resident,
1383 0 : access_stats,
1384 0 : l0: crate::tenant::layer_map::LayerMap::is_l0(
1385 0 : &self.layer_desc().key_range,
1386 0 : self.layer_desc().is_delta,
1387 0 : ),
1388 0 : }
1389 : } else {
1390 0 : let lsn = self.desc.image_layer_lsn();
1391 0 :
1392 0 : HistoricLayerInfo::Image {
1393 0 : layer_file_name: layer_name,
1394 0 : layer_file_size: self.desc.file_size,
1395 0 : lsn_start: lsn,
1396 0 : remote: !resident,
1397 0 : access_stats,
1398 0 : }
1399 : }
1400 0 : }
1401 :
1402 : /// `DownloadedLayer` is being dropped, so it calls this method.
1403 240 : fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
1404 : // we cannot know without inspecting LayerInner::inner if we should evict or not, even
1405 : // though here it is very likely
1406 240 : let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
1407 :
1408 : // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
1409 : // drop while the `self.inner` is being locked, leading to a deadlock.
1410 :
1411 240 : let start_evicting = async move {
1412 240 : #[cfg(test)]
1413 240 : self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting)
1414 240 : .await
1415 240 : .expect("failpoint should not have errored");
1416 240 :
1417 240 : tracing::debug!("eviction started");
1418 :
1419 240 : let res = self.wait_for_turn_and_evict(only_version).await;
1420 : // metrics: ignore the Ok branch, it is not done yet
1421 240 : if let Err(e) = res {
1422 36 : tracing::debug!(res=?Err::<(), _>(&e), "eviction completed");
1423 36 : LAYER_IMPL_METRICS.inc_eviction_cancelled(e);
1424 204 : }
1425 240 : };
1426 :
1427 240 : Self::spawn(start_evicting.instrument(span));
1428 240 : }
1429 :
/// Waits for exclusive access to `self.inner` and, if the layer is still the `only_version`
/// that was wanted evicted, hands the actual file removal to a detached blocking task.
///
/// `Ok(())` only means the blocking eviction was started; its completion is recorded in
/// metrics by the blocking task itself.
///
/// # Errors
///
/// Returns the [`EvictionCancelled`] reason when the eviction must not proceed: the timeline
/// is gone or its gate is closed, the status is no longer `Status::Resident`, the stored
/// version does not match `only_version`, or the layer has been reinitialized as resident.
async fn wait_for_turn_and_evict(
    self: Arc<LayerInner>,
    only_version: usize,
) -> Result<(), EvictionCancelled> {
    /// Eviction may only proceed while the published status is `Resident`.
    fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> {
        use Status::*;
        match status {
            Resident => Ok(()),
            Evicted => Err(EvictionCancelled::UnexpectedEvictedState),
            Downloading => Err(EvictionCancelled::LostToDownload),
        }
    }

    let timeline = self
        .timeline
        .upgrade()
        .ok_or(EvictionCancelled::TimelineGone)?;

    let mut rx = self
        .status
        .as_ref()
        .expect("LayerInner cannot be dropped, holding strong ref")
        .subscribe();

    is_good_to_continue(&rx.borrow_and_update())?;

    let Ok(gate) = timeline.gate.enter() else {
        return Err(EvictionCancelled::TimelineGone);
    };

    let permit = {
        // we cannot just `std::fs::remove_file` because there might already be a
        // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem
        // operations must be done while holding the heavier_once_cell::InitPermit
        let mut wait = std::pin::pin!(self.inner.get_or_init_detached());

        let waited = loop {
            // we must race to the Downloading starting, otherwise we would have to wait until the
            // completion of the download. waiting for download could be long and hinder our
            // efforts to alert on "hanging" evictions.
            tokio::select! {
                res = &mut wait => break res,
                _ = rx.changed() => {
                    is_good_to_continue(&rx.borrow_and_update())?;
                    // two possibilities for Status::Resident:
                    // - the layer was found locally from disk by a read
                    // - we missed a bunch of updates and now the layer is
                    // again downloaded -- assume we'll fail later on with
                    // version check or AlreadyReinitialized
                }
            }
        };

        // re-check now that we have the guard or permit; all updates should have happened
        // while holding the permit.
        is_good_to_continue(&rx.borrow_and_update())?;

        // the term deinitialize is used here, because clearing out the Weak will eventually
        // lead to deallocating the reference counted value, and the value we
        // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned.
        let (_weak, permit) = match waited {
            Ok(guard) => {
                match &*guard {
                    ResidentOrWantedEvicted::WantedEvicted(_weak, version)
                        if *version == only_version =>
                    {
                        tracing::debug!(version, "deinitializing matching WantedEvicted");
                        let (weak, permit) = guard.take_and_deinit();
                        (Some(weak), permit)
                    }
                    ResidentOrWantedEvicted::WantedEvicted(_, version) => {
                        // if we were not doing the version check, we would need to try to
                        // upgrade the weak here to see if it really is dropped. version check
                        // is done instead assuming that it is cheaper.
                        tracing::debug!(
                            version,
                            only_version,
                            "version mismatch, not deinitializing"
                        );
                        return Err(EvictionCancelled::VersionCheckFailed);
                    }
                    ResidentOrWantedEvicted::Resident(_) => {
                        return Err(EvictionCancelled::AlreadyReinitialized);
                    }
                }
            }
            Err(permit) => {
                tracing::debug!("continuing after cancelled get_or_maybe_download or eviction");
                (None, permit)
            }
        };

        permit
    };

    let span = tracing::Span::current();

    let spawned_at = std::time::Instant::now();

    // this is on purpose a detached spawn; we don't need to wait for it
    //
    // eviction completion reporting is the only thing hinging on this, and it can be just as
    // well from a spawn_blocking thread.
    //
    // important to note that now that we've acquired the permit we have made sure the evicted
    // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case
    // there are multiple evictions. The rest is not cancellable, and we've now committed to
    // evicting.
    //
    // If spawn_blocking has a queue and maximum number of threads are in use, we could stall
    // reads. We will need to add cancellation for that if necessary.
    Self::spawn_blocking(move || {
        let _span = span.entered();

        // `gate` and `permit` were moved into this closure and are only lent to
        // `evict_blocking`, so they are held until the closure finishes
        let res = self.evict_blocking(&timeline, &gate, &permit);

        let waiters = self.inner.initializer_count();

        if waiters > 0 {
            LAYER_IMPL_METRICS.inc_evicted_with_waiters();
        }

        let completed_in = spawned_at.elapsed();
        LAYER_IMPL_METRICS.record_time_to_evict(completed_in);

        match res {
            Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
            Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
        }

        tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
    });

    Ok(())
}
1565 :
1566 : /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
1567 204 : fn evict_blocking(
1568 204 : &self,
1569 204 : timeline: &Timeline,
1570 204 : _gate: &gate::GateGuard,
1571 204 : _permit: &heavier_once_cell::InitPermit,
1572 204 : ) -> Result<(), EvictionCancelled> {
1573 204 : // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
1574 204 :
1575 204 : match capture_mtime_and_remove(&self.path) {
1576 204 : Ok(local_layer_mtime) => {
1577 204 : let duration = SystemTime::now().duration_since(local_layer_mtime);
1578 204 : match duration {
1579 204 : Ok(elapsed) => {
1580 204 : let accessed_and_visible = self.access_stats.accessed()
1581 24 : && self.access_stats.visibility() == LayerVisibilityHint::Visible;
1582 204 : if accessed_and_visible {
1583 24 : // Only layers used for reads contribute to our "low residence" metric that is used
1584 24 : // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
1585 24 : // to be rapidly evicted without contributing to this metric.
1586 24 : timeline
1587 24 : .metrics
1588 24 : .evictions_with_low_residence_duration
1589 24 : .read()
1590 24 : .unwrap()
1591 24 : .observe(elapsed);
1592 180 : }
1593 :
1594 204 : tracing::info!(
1595 0 : residence_millis = elapsed.as_millis(),
1596 0 : accessed_and_visible,
1597 0 : "evicted layer after known residence period"
1598 : );
1599 : }
1600 : Err(_) => {
1601 0 : tracing::info!("evicted layer after unknown residence period");
1602 : }
1603 : }
1604 204 : timeline.metrics.evictions.inc();
1605 204 : timeline
1606 204 : .metrics
1607 204 : .resident_physical_size_sub(self.desc.file_size);
1608 : }
1609 0 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
1610 0 : tracing::error!(
1611 : layer_size = %self.desc.file_size,
1612 0 : "failed to evict layer from disk, it was already gone"
1613 : );
1614 0 : return Err(EvictionCancelled::FileNotFound);
1615 : }
1616 0 : Err(e) => {
1617 0 : // FIXME: this should probably be an abort
1618 0 : tracing::error!("failed to evict file from disk: {e:#}");
1619 0 : return Err(EvictionCancelled::RemoveFailed);
1620 : }
1621 : }
1622 :
1623 204 : self.access_stats.record_residence_event();
1624 204 :
1625 204 : *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
1626 204 :
1627 204 : self.status.as_ref().unwrap().send_replace(Status::Evicted);
1628 204 :
1629 204 : Ok(())
1630 204 : }
1631 :
1632 19141 : fn metadata(&self) -> LayerFileMetadata {
1633 19141 : LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
1634 19141 : }
1635 :
1636 : /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
1637 : ///
1638 : /// Synchronizing with spawned tasks is very complicated otherwise.
1639 324 : fn spawn<F>(fut: F)
1640 324 : where
1641 324 : F: std::future::Future<Output = ()> + Send + 'static,
1642 324 : {
1643 324 : #[cfg(test)]
1644 324 : tokio::task::spawn(fut);
1645 324 : #[cfg(not(test))]
1646 324 : crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut);
1647 324 : }
1648 :
1649 : /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
1650 3277 : fn spawn_blocking<F>(f: F)
1651 3277 : where
1652 3277 : F: FnOnce() + Send + 'static,
1653 3277 : {
1654 3277 : #[cfg(test)]
1655 3277 : tokio::task::spawn_blocking(f);
1656 3277 : #[cfg(not(test))]
1657 3277 : crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f);
1658 3277 : }
1659 : }
1660 :
1661 204 : fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
1662 204 : let m = path.metadata()?;
1663 204 : let local_layer_mtime = m.modified()?;
1664 204 : std::fs::remove_file(path)?;
1665 204 : Ok(local_layer_mtime)
1666 204 : }
1667 :
/// Reasons why [`LayerInner::evict_and_wait`] did not evict the layer.
#[derive(Debug, thiserror::Error)]
pub(crate) enum EvictionError {
    /// The layer was not resident when eviction was requested, or its inner state was
    /// uninitialized.
    #[error("layer was already evicted")]
    NotFound,

    /// Evictions must always lose to downloads in races, and this time it happened.
    #[error("layer was downloaded instead")]
    Downloaded,

    /// No status change was observed within the caller-provided timeout.
    #[error("eviction did not happen within timeout")]
    Timeout,
}
1680 :
/// Error internal to the [`LayerInner::get_or_maybe_download`]
#[derive(Debug, thiserror::Error)]
pub(crate) enum DownloadError {
    /// The owning timeline could no longer be upgraded from its weak reference.
    #[error("timeline has already shutdown")]
    TimelineShutdown,
    /// The request context asked for errors on unexpected downloads, and the configuration
    /// does not downgrade that to a warning.
    #[error("context denies downloading")]
    ContextAndConfigReallyDeniesDownloads,
    /// A download is needed, but the caller did not allow downloading.
    #[error("downloading is really required but not allowed by this method")]
    DownloadRequired,
    /// The local layer path exists but is something other than a regular file.
    #[error("layer path exists, but it is not a file: {0:?}")]
    NotFile(std::fs::FileType),
    /// Why no error here? Because it will be reported by page_service. We should have also
    /// done retries already.
    #[error("downloading evicted layer file failed")]
    DownloadFailed,
    /// The download did not complete: the timeline gate was closed, the remote storage
    /// reported cancellation, or the download task went away without reporting a result.
    #[error("downloading failed, possibly for shutdown")]
    DownloadCancelled,
    /// The stat of the local file before deciding on a download failed.
    #[error("pre-condition: stat before download failed")]
    PreStatFailed(#[source] std::io::Error),

    /// Raised by a test failpoint; only exists in test builds.
    #[cfg(test)]
    #[error("failpoint: {0:?}")]
    Failpoint(failpoints::FailpointKind),
}
1705 :
1706 : impl DownloadError {
1707 0 : pub(crate) fn is_cancelled(&self) -> bool {
1708 0 : matches!(self, DownloadError::DownloadCancelled)
1709 0 : }
1710 : }
1711 :
/// Describes why a local layer file cannot be used as-is.
#[derive(Debug, PartialEq)]
pub(crate) enum NeedsDownload {
    /// No file was found.
    NotFound,
    /// The path exists but is something other than a file.
    NotFile(std::fs::FileType),
    /// A file exists, but its size (`actual`) differs from the `expected` one.
    WrongSize { actual: u64, expected: u64 },
}

impl std::fmt::Display for NeedsDownload {
    /// Writes a short human-readable description of the reason.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound => f.write_str("file was not found"),
            Self::NotFile(file_type) => write!(f, "path is not a file; {file_type:?}"),
            Self::WrongSize { actual, expected } => {
                write!(f, "file size mismatch {actual} vs. {expected}")
            }
        }
    }
}
1730 :
1731 : /// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
1732 : pub(crate) struct DownloadedLayer {
1733 : owner: Weak<LayerInner>,
1734 : // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
1735 : // DownloadedLayer
1736 : kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
1737 : version: usize,
1738 : }
1739 :
1740 : impl std::fmt::Debug for DownloadedLayer {
1741 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1742 0 : f.debug_struct("DownloadedLayer")
1743 0 : // owner omitted because it is always "Weak"
1744 0 : .field("kind", &self.kind)
1745 0 : .field("version", &self.version)
1746 0 : .finish()
1747 0 : }
1748 : }
1749 :
1750 : impl Drop for DownloadedLayer {
1751 4477 : fn drop(&mut self) {
1752 4477 : if let Some(owner) = self.owner.upgrade() {
1753 240 : owner.on_downloaded_layer_drop(self.version);
1754 4237 : } else {
1755 4237 : // Layer::drop will handle cancelling the eviction; because of drop order and
1756 4237 : // `DownloadedLayer` never leaking, we cannot know here if eviction was requested.
1757 4237 : }
1758 4477 : }
1759 : }
1760 :
1761 : impl DownloadedLayer {
1762 : /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
1763 : /// Failure to load the layer is sticky, i.e., future `get()` calls will return
1764 : /// the initial load failure immediately.
1765 : ///
1766 : /// `owner` parameter is a strong reference at the same `LayerInner` as the
1767 : /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
1768 : /// we will always have the LayerInner on the callstack, so we can just use it.
1769 1602319 : async fn get<'a>(
1770 1602319 : &'a self,
1771 1602319 : owner: &Arc<LayerInner>,
1772 1602319 : ctx: &RequestContext,
1773 1602319 : ) -> anyhow::Result<&'a LayerKind> {
1774 1602319 : let init = || async {
1775 7512 : assert_eq!(
1776 7512 : Weak::as_ptr(&self.owner),
1777 7512 : Arc::as_ptr(owner),
1778 0 : "these are the same, just avoiding the upgrade"
1779 : );
1780 :
1781 7512 : let res = if owner.desc.is_delta {
1782 6612 : let ctx = RequestContextBuilder::from(ctx)
1783 6612 : .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary)
1784 6612 : .attached_child();
1785 6612 : let summary = Some(delta_layer::Summary::expected(
1786 6612 : owner.desc.tenant_shard_id.tenant_id,
1787 6612 : owner.desc.timeline_id,
1788 6612 : owner.desc.key_range.clone(),
1789 6612 : owner.desc.lsn_range.clone(),
1790 6612 : ));
1791 6612 : delta_layer::DeltaLayerInner::load(
1792 6612 : &owner.path,
1793 6612 : summary,
1794 6612 : Some(owner.conf.max_vectored_read_bytes),
1795 6612 : &ctx,
1796 6612 : )
1797 6612 : .await
1798 6612 : .map(LayerKind::Delta)
1799 : } else {
1800 900 : let ctx = RequestContextBuilder::from(ctx)
1801 900 : .page_content_kind(crate::context::PageContentKind::ImageLayerSummary)
1802 900 : .attached_child();
1803 900 : let lsn = owner.desc.image_layer_lsn();
1804 900 : let summary = Some(image_layer::Summary::expected(
1805 900 : owner.desc.tenant_shard_id.tenant_id,
1806 900 : owner.desc.timeline_id,
1807 900 : owner.desc.key_range.clone(),
1808 900 : lsn,
1809 900 : ));
1810 900 : image_layer::ImageLayerInner::load(
1811 900 : &owner.path,
1812 900 : lsn,
1813 900 : summary,
1814 900 : Some(owner.conf.max_vectored_read_bytes),
1815 900 : &ctx,
1816 900 : )
1817 900 : .await
1818 900 : .map(LayerKind::Image)
1819 : };
1820 :
1821 7512 : match res {
1822 7512 : Ok(layer) => Ok(layer),
1823 0 : Err(err) => {
1824 0 : LAYER_IMPL_METRICS.inc_permanent_loading_failures();
1825 0 : // We log this message once over the lifetime of `Self`
1826 0 : // => Ok and good to log backtrace and path here.
1827 0 : tracing::error!(
1828 0 : "layer load failed, assuming permanent failure: {}: {err:?}",
1829 0 : owner.path
1830 : );
1831 0 : Err(err)
1832 : }
1833 : }
1834 15024 : };
1835 1602319 : self.kind
1836 1602319 : .get_or_init(init)
1837 1602319 : .await
1838 1602319 : .as_ref()
1839 1602319 : // We already logged the full backtrace above, once. Don't repeat that here.
1840 1602319 : .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
1841 1602319 : }
1842 :
1843 1596019 : async fn get_values_reconstruct_data(
1844 1596019 : &self,
1845 1596019 : this: ResidentLayer,
1846 1596019 : keyspace: KeySpace,
1847 1596019 : lsn_range: Range<Lsn>,
1848 1596019 : reconstruct_data: &mut ValuesReconstructState,
1849 1596019 : ctx: &RequestContext,
1850 1596019 : ) -> Result<(), GetVectoredError> {
1851 : use LayerKind::*;
1852 :
1853 1596019 : match self
1854 1596019 : .get(&this.owner.0, ctx)
1855 1596019 : .await
1856 1596019 : .map_err(GetVectoredError::Other)?
1857 : {
1858 1414507 : Delta(d) => {
1859 1414507 : d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
1860 1414507 : .await
1861 : }
1862 181512 : Image(i) => {
1863 181512 : i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx)
1864 181512 : .await
1865 : }
1866 : }
1867 1596019 : }
1868 :
1869 24 : async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
1870 : use LayerKind::*;
1871 24 : match self.get(owner, ctx).await? {
1872 24 : Delta(d) => d.dump(ctx).await?,
1873 0 : Image(i) => i.dump(ctx).await?,
1874 : }
1875 :
1876 24 : Ok(())
1877 24 : }
1878 : }
1879 :
1880 : /// Wrapper around an actual layer implementation.
1881 : #[derive(Debug)]
1882 : enum LayerKind {
1883 : Delta(delta_layer::DeltaLayerInner),
1884 : Image(image_layer::ImageLayerInner),
1885 : }
1886 :
1887 : /// Guard for forcing a layer be resident while it exists.
1888 : #[derive(Clone)]
1889 : pub struct ResidentLayer {
1890 : owner: Layer,
1891 : downloaded: Arc<DownloadedLayer>,
1892 : }
1893 :
1894 : impl std::fmt::Display for ResidentLayer {
1895 13008 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1896 13008 : write!(f, "{}", self.owner)
1897 13008 : }
1898 : }
1899 :
1900 : impl std::fmt::Debug for ResidentLayer {
1901 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1902 0 : write!(f, "{}", self.owner)
1903 0 : }
1904 : }
1905 :
1906 : impl ResidentLayer {
1907 : /// Release the eviction guard, converting back into a plain [`Layer`].
1908 : ///
1909 : /// You can access the [`Layer`] also by using `as_ref`.
1910 2520 : pub(crate) fn drop_eviction_guard(self) -> Layer {
1911 2520 : self.into()
1912 2520 : }
1913 :
1914 : /// Loads all keys stored in the layer. Returns key, lsn and value size.
1915 0 : #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
1916 : pub(crate) async fn load_keys<'a>(
1917 : &'a self,
1918 : ctx: &RequestContext,
1919 : ) -> anyhow::Result<Vec<pageserver_api::key::Key>> {
1920 : use LayerKind::*;
1921 :
1922 : let owner = &self.owner.0;
1923 : let inner = self.downloaded.get(owner, ctx).await?;
1924 :
1925 : // this is valid because the DownloadedLayer::kind is a OnceCell, not a
1926 : // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
1927 : // while it's being held.
1928 : self.owner.record_access(ctx);
1929 :
1930 : let res = match inner {
1931 : Delta(d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
1932 : Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
1933 : };
1934 0 : res.with_context(|| format!("Layer index is corrupted for {self}"))
1935 : }
1936 :
1937 : /// Read all they keys in this layer which match the ShardIdentity, and write them all to
1938 : /// the provided writer. Return the number of keys written.
1939 48 : #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
1940 : pub(crate) async fn filter(
1941 : &self,
1942 : shard_identity: &ShardIdentity,
1943 : writer: &mut ImageLayerWriter,
1944 : ctx: &RequestContext,
1945 : ) -> Result<usize, CompactionError> {
1946 : use LayerKind::*;
1947 :
1948 : match self
1949 : .downloaded
1950 : .get(&self.owner.0, ctx)
1951 : .await
1952 : .map_err(CompactionError::Other)?
1953 : {
1954 : Delta(_) => {
1955 : return Err(CompactionError::Other(anyhow::anyhow!(format!(
1956 : "cannot filter() on a delta layer {self}"
1957 : ))));
1958 : }
1959 : Image(i) => i
1960 : .filter(shard_identity, writer, ctx)
1961 : .await
1962 : .map_err(CompactionError::Other),
1963 : }
1964 : }
1965 :
1966 : /// Returns the amount of keys and values written to the writer.
1967 60 : pub(crate) async fn copy_delta_prefix(
1968 60 : &self,
1969 60 : writer: &mut super::delta_layer::DeltaLayerWriter,
1970 60 : until: Lsn,
1971 60 : ctx: &RequestContext,
1972 60 : ) -> anyhow::Result<usize> {
1973 : use LayerKind::*;
1974 :
1975 60 : let owner = &self.owner.0;
1976 60 :
1977 60 : match self.downloaded.get(owner, ctx).await? {
1978 60 : Delta(d) => d
1979 60 : .copy_prefix(writer, until, ctx)
1980 60 : .await
1981 60 : .with_context(|| format!("copy_delta_prefix until {until} of {self}")),
1982 0 : Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
1983 : }
1984 60 : }
1985 :
1986 11264 : pub(crate) fn local_path(&self) -> &Utf8Path {
1987 11264 : &self.owner.0.path
1988 11264 : }
1989 :
1990 13392 : pub(crate) fn metadata(&self) -> LayerFileMetadata {
1991 13392 : self.owner.metadata()
1992 13392 : }
1993 :
1994 : /// Cast the layer to a delta, return an error if it is an image layer.
1995 5736 : pub(crate) async fn get_as_delta(
1996 5736 : &self,
1997 5736 : ctx: &RequestContext,
1998 5736 : ) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
1999 : use LayerKind::*;
2000 5736 : match self.downloaded.get(&self.owner.0, ctx).await? {
2001 5736 : Delta(d) => Ok(d),
2002 0 : Image(_) => Err(anyhow::anyhow!("image layer")),
2003 : }
2004 5736 : }
2005 :
2006 : /// Cast the layer to an image, return an error if it is a delta layer.
2007 432 : pub(crate) async fn get_as_image(
2008 432 : &self,
2009 432 : ctx: &RequestContext,
2010 432 : ) -> anyhow::Result<&image_layer::ImageLayerInner> {
2011 : use LayerKind::*;
2012 432 : match self.downloaded.get(&self.owner.0, ctx).await? {
2013 432 : Image(d) => Ok(d),
2014 0 : Delta(_) => Err(anyhow::anyhow!("delta layer")),
2015 : }
2016 432 : }
2017 : }
2018 :
2019 : impl AsLayerDesc for ResidentLayer {
2020 7145724 : fn layer_desc(&self) -> &PersistentLayerDesc {
2021 7145724 : self.owner.layer_desc()
2022 7145724 : }
2023 : }
2024 :
2025 : impl AsRef<Layer> for ResidentLayer {
2026 12396 : fn as_ref(&self) -> &Layer {
2027 12396 : &self.owner
2028 12396 : }
2029 : }
2030 :
2031 : /// Drop the eviction guard.
2032 : impl From<ResidentLayer> for Layer {
2033 2520 : fn from(value: ResidentLayer) -> Self {
2034 2520 : value.owner
2035 2520 : }
2036 : }
2037 :
2038 : use metrics::IntCounter;
2039 :
2040 : pub(crate) struct LayerImplMetrics {
2041 : started_evictions: IntCounter,
2042 : completed_evictions: IntCounter,
2043 : cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
2044 :
2045 : started_deletes: IntCounter,
2046 : completed_deletes: IntCounter,
2047 : failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
2048 :
2049 : rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
2050 : inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
2051 : redownload_after: metrics::Histogram,
2052 : time_to_evict: metrics::Histogram,
2053 : }
2054 :
2055 : impl Default for LayerImplMetrics {
2056 324 : fn default() -> Self {
2057 : use enum_map::Enum;
2058 :
2059 : // reminder: these will be pageserver_layer_* with "_total" suffix
2060 :
2061 324 : let started_evictions = metrics::register_int_counter!(
2062 324 : "pageserver_layer_started_evictions",
2063 324 : "Evictions started in the Layer implementation"
2064 324 : )
2065 324 : .unwrap();
2066 324 : let completed_evictions = metrics::register_int_counter!(
2067 324 : "pageserver_layer_completed_evictions",
2068 324 : "Evictions completed in the Layer implementation"
2069 324 : )
2070 324 : .unwrap();
2071 324 :
2072 324 : let cancelled_evictions = metrics::register_int_counter_vec!(
2073 324 : "pageserver_layer_cancelled_evictions_count",
2074 324 : "Different reasons for evictions to have been cancelled or failed",
2075 324 : &["reason"]
2076 324 : )
2077 324 : .unwrap();
2078 324 :
2079 2916 : let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
2080 2916 : let reason = EvictionCancelled::from_usize(i);
2081 2916 : let s = reason.as_str();
2082 2916 : cancelled_evictions.with_label_values(&[s])
2083 2916 : }));
2084 324 :
2085 324 : let started_deletes = metrics::register_int_counter!(
2086 324 : "pageserver_layer_started_deletes",
2087 324 : "Deletions on drop pending in the Layer implementation"
2088 324 : )
2089 324 : .unwrap();
2090 324 : let completed_deletes = metrics::register_int_counter!(
2091 324 : "pageserver_layer_completed_deletes",
2092 324 : "Deletions on drop completed in the Layer implementation"
2093 324 : )
2094 324 : .unwrap();
2095 324 :
2096 324 : let failed_deletes = metrics::register_int_counter_vec!(
2097 324 : "pageserver_layer_failed_deletes_count",
2098 324 : "Different reasons for deletions on drop to have failed",
2099 324 : &["reason"]
2100 324 : )
2101 324 : .unwrap();
2102 324 :
2103 648 : let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
2104 648 : let reason = DeleteFailed::from_usize(i);
2105 648 : let s = reason.as_str();
2106 648 : failed_deletes.with_label_values(&[s])
2107 648 : }));
2108 324 :
2109 324 : let rare_counters = metrics::register_int_counter_vec!(
2110 324 : "pageserver_layer_assumed_rare_count",
2111 324 : "Times unexpected or assumed rare event happened",
2112 324 : &["event"]
2113 324 : )
2114 324 : .unwrap();
2115 324 :
2116 2268 : let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
2117 2268 : let event = RareEvent::from_usize(i);
2118 2268 : let s = event.as_str();
2119 2268 : rare_counters.with_label_values(&[s])
2120 2268 : }));
2121 324 :
2122 324 : let inits_cancelled = metrics::register_int_counter!(
2123 324 : "pageserver_layer_inits_cancelled_count",
2124 324 : "Times Layer initialization was cancelled",
2125 324 : )
2126 324 : .unwrap();
2127 324 :
2128 324 : let redownload_after = {
2129 324 : let minute = 60.0;
2130 324 : let hour = 60.0 * minute;
2131 324 : metrics::register_histogram!(
2132 324 : "pageserver_layer_redownloaded_after",
2133 324 : "Time between evicting and re-downloading.",
2134 324 : vec![
2135 324 : 10.0,
2136 324 : 30.0,
2137 324 : minute,
2138 324 : 5.0 * minute,
2139 324 : 15.0 * minute,
2140 324 : 30.0 * minute,
2141 324 : hour,
2142 324 : 12.0 * hour,
2143 324 : ]
2144 324 : )
2145 324 : .unwrap()
2146 324 : };
2147 324 :
2148 324 : let time_to_evict = metrics::register_histogram!(
2149 324 : "pageserver_layer_eviction_held_permit_seconds",
2150 324 : "Time eviction held the permit.",
2151 324 : vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
2152 324 : )
2153 324 : .unwrap();
2154 324 :
2155 324 : Self {
2156 324 : started_evictions,
2157 324 : completed_evictions,
2158 324 : cancelled_evictions,
2159 324 :
2160 324 : started_deletes,
2161 324 : completed_deletes,
2162 324 : failed_deletes,
2163 324 :
2164 324 : rare_counters,
2165 324 : inits_cancelled,
2166 324 : redownload_after,
2167 324 : time_to_evict,
2168 324 : }
2169 324 : }
2170 : }
2171 :
2172 : impl LayerImplMetrics {
2173 252 : fn inc_started_evictions(&self) {
2174 252 : self.started_evictions.inc();
2175 252 : }
2176 204 : fn inc_completed_evictions(&self) {
2177 204 : self.completed_evictions.inc();
2178 204 : }
2179 48 : fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
2180 48 : self.cancelled_evictions[reason].inc()
2181 48 : }
2182 :
2183 3096 : fn inc_started_deletes(&self) {
2184 3096 : self.started_deletes.inc();
2185 3096 : }
2186 3056 : fn inc_completed_deletes(&self) {
2187 3056 : self.completed_deletes.inc();
2188 3056 : }
2189 0 : fn inc_deletes_failed(&self, reason: DeleteFailed) {
2190 0 : self.failed_deletes[reason].inc();
2191 0 : }
2192 :
2193 : /// Counted separatedly from failed layer deletes because we will complete the layer deletion
2194 : /// attempt regardless of failure to delete local file.
2195 0 : fn inc_delete_removes_failed(&self) {
2196 0 : self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
2197 0 : }
2198 :
2199 : /// Expected rare just as cancellations are rare, but we could have cancellations separate from
2200 : /// the single caller which can start the download, so use this counter to separte them.
2201 0 : fn inc_init_completed_without_requester(&self) {
2202 0 : self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
2203 0 : }
2204 :
2205 : /// Expected rare because cancellations are unexpected, and failures are unexpected
2206 0 : fn inc_download_failed_without_requester(&self) {
2207 0 : self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
2208 0 : }
2209 :
2210 : /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
2211 : ///
2212 : /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
2213 : /// Option.
2214 0 : fn inc_raced_wanted_evicted_accesses(&self) {
2215 0 : self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
2216 0 : }
2217 :
2218 : /// These are only expected for [`Self::inc_init_cancelled`] amount when
2219 : /// running with remote storage.
2220 36 : fn inc_init_needed_no_download(&self) {
2221 36 : self.rare_counters[RareEvent::InitWithoutDownload].inc();
2222 36 : }
2223 :
2224 : /// Expected rare because all layer files should be readable and good
2225 0 : fn inc_permanent_loading_failures(&self) {
2226 0 : self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
2227 0 : }
2228 :
2229 0 : fn inc_init_cancelled(&self) {
2230 0 : self.inits_cancelled.inc()
2231 0 : }
2232 :
2233 84 : fn record_redownloaded_after(&self, duration: std::time::Duration) {
2234 84 : self.redownload_after.observe(duration.as_secs_f64())
2235 84 : }
2236 :
2237 : /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
2238 : /// instead cancel eviction if we would have read waiters. We cannot however separate reads
2239 : /// from other evictions, so this could have noise as well.
2240 0 : fn inc_evicted_with_waiters(&self) {
2241 0 : self.rare_counters[RareEvent::EvictedWithWaiters].inc();
2242 0 : }
2243 :
2244 : /// Recorded at least initially as the permit is now acquired in async context before
2245 : /// spawn_blocking action.
2246 204 : fn record_time_to_evict(&self, duration: std::time::Duration) {
2247 204 : self.time_to_evict.observe(duration.as_secs_f64())
2248 204 : }
2249 : }
2250 :
2251 : #[derive(Debug, Clone, Copy, enum_map::Enum)]
2252 : enum EvictionCancelled {
2253 : LayerGone,
2254 : TimelineGone,
2255 : VersionCheckFailed,
2256 : FileNotFound,
2257 : RemoveFailed,
2258 : AlreadyReinitialized,
2259 : /// Not evicted because of a pending reinitialization
2260 : LostToDownload,
2261 : /// After eviction, there was a new layer access which cancelled the eviction.
2262 : UpgradedBackOnAccess,
2263 : UnexpectedEvictedState,
2264 : }
2265 :
2266 : impl EvictionCancelled {
2267 2916 : fn as_str(&self) -> &'static str {
2268 2916 : match self {
2269 324 : EvictionCancelled::LayerGone => "layer_gone",
2270 324 : EvictionCancelled::TimelineGone => "timeline_gone",
2271 324 : EvictionCancelled::VersionCheckFailed => "version_check_fail",
2272 324 : EvictionCancelled::FileNotFound => "file_not_found",
2273 324 : EvictionCancelled::RemoveFailed => "remove_failed",
2274 324 : EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
2275 324 : EvictionCancelled::LostToDownload => "lost_to_download",
2276 324 : EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
2277 324 : EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
2278 : }
2279 2916 : }
2280 : }
2281 :
2282 : #[derive(enum_map::Enum)]
2283 : enum DeleteFailed {
2284 : TimelineGone,
2285 : DeleteSchedulingFailed,
2286 : }
2287 :
2288 : impl DeleteFailed {
2289 648 : fn as_str(&self) -> &'static str {
2290 648 : match self {
2291 324 : DeleteFailed::TimelineGone => "timeline_gone",
2292 324 : DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
2293 : }
2294 648 : }
2295 : }
2296 :
2297 : #[derive(enum_map::Enum)]
2298 : enum RareEvent {
2299 : RemoveOnDropFailed,
2300 : InitCompletedWithoutRequester,
2301 : DownloadFailedWithoutRequester,
2302 : UpgradedWantedEvicted,
2303 : InitWithoutDownload,
2304 : PermanentLoadingFailure,
2305 : EvictedWithWaiters,
2306 : }
2307 :
2308 : impl RareEvent {
2309 2268 : fn as_str(&self) -> &'static str {
2310 : use RareEvent::*;
2311 :
2312 2268 : match self {
2313 324 : RemoveOnDropFailed => "remove_on_drop_failed",
2314 324 : InitCompletedWithoutRequester => "init_completed_without",
2315 324 : DownloadFailedWithoutRequester => "download_failed_without",
2316 324 : UpgradedWantedEvicted => "raced_wanted_evicted",
2317 324 : InitWithoutDownload => "init_needed_no_download",
2318 324 : PermanentLoadingFailure => "permanent_loading_failure",
2319 324 : EvictedWithWaiters => "evicted_with_waiters",
2320 : }
2321 2268 : }
2322 : }
2323 :
2324 : pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
2325 : once_cell::sync::Lazy::new(LayerImplMetrics::default);
|