Line data Source code
1 : //! This module implements the pageserver-global disk-usage-based layer eviction task.
2 : //!
3 : //! # Mechanics
4 : //!
5 : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
6 : //! loop that evicts layers in response to a shortage of available bytes
7 : //! in the $repo/tenants directory's filesystem.
8 : //!
9 : //! The loop runs periodically at a configurable `period`.
10 : //!
11 : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
12 : //! It compares the returned usage data against two different types of thresholds.
13 : //! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
14 : //! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
15 : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
16 : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
17 : //!
18 : //! # Eviction Policy
19 : //!
20 : //! There are two thresholds:
21 : //! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space.
22 : //! If the actual usage is higher, the threshold is exceeded.
23 : //! `min_avail_bytes` is the absolute available space in bytes.
24 : //! If the actual usage is lower, the threshold is exceeded.
25 : //! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
26 : //! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
27 : //! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
28 : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
29 : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
30 : //! start evicting layers that are part of the reservation, LRU first.
31 : //!
32 : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
33 : //! throughout the code, but, no actual variable carries that name.
34 : //! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
35 : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
36 : //! during page reconstruction.
37 : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
38 : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
39 :
40 : // Implementation notes:
41 : // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
42 : // reading these fields. We use the Debug impl for semi-structured logging, though.
43 :
44 : use std::sync::Arc;
45 : use std::time::SystemTime;
46 :
47 : use anyhow::Context;
48 : use pageserver_api::config::DiskUsageEvictionTaskConfig;
49 : use pageserver_api::shard::TenantShardId;
50 : use remote_storage::GenericRemoteStorage;
51 : use serde::Serialize;
52 : use tokio::time::Instant;
53 : use tokio_util::sync::CancellationToken;
54 : use tracing::{Instrument, debug, error, info, instrument, warn};
55 : use utils::completion;
56 : use utils::id::TimelineId;
57 :
58 : use crate::config::PageServerConf;
59 : use crate::metrics::disk_usage_based_eviction::METRICS;
60 : use crate::task_mgr::{self, BACKGROUND_RUNTIME};
61 : use crate::tenant::mgr::TenantManager;
62 : use crate::tenant::remote_timeline_client::LayerFileMetadata;
63 : use crate::tenant::secondary::SecondaryTenant;
64 : use crate::tenant::storage_layer::{
65 : AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint,
66 : };
67 : use crate::tenant::tasks::sleep_random;
68 : use crate::{CancellableTask, DiskUsageEvictionTask};
69 :
70 : /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
71 : /// partitioning.
72 : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
73 : pub enum EvictionOrder {
74 : /// Order the layers to be evicted by how recently they have been accessed relatively within
75 : /// the set of resident layers of a tenant.
76 : RelativeAccessed {
77 : /// Determines if the tenant with most layers should lose first.
78 : ///
79 : /// Having this enabled is currently the only reasonable option, because the order in which
80 : /// we read tenants is deterministic. If we find the need to use this as `false`, we need
81 : /// to ensure nondeterminism by adding in a random number to break the
82 : /// `relative_last_activity==0.0` ties.
83 : highest_layer_count_loses_first: bool,
84 : },
85 : }
86 :
87 : impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
88 0 : fn from(value: pageserver_api::config::EvictionOrder) -> Self {
89 0 : match value {
90 : pageserver_api::config::EvictionOrder::RelativeAccessed {
91 0 : highest_layer_count_loses_first,
92 0 : } => Self::RelativeAccessed {
93 0 : highest_layer_count_loses_first,
94 0 : },
95 : }
96 0 : }
97 : }
98 :
99 : impl EvictionOrder {
100 0 : fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
101 : use EvictionOrder::*;
102 :
103 0 : match self {
104 0 : RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
105 0 : (*partition, candidate.relative_last_activity)
106 0 : }),
107 : }
108 0 : }
109 :
110 : /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
111 : /// layers in **most** recently used order.
112 20 : fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
113 : use EvictionOrder::*;
114 :
115 20 : match self {
116 : RelativeAccessed {
117 20 : highest_layer_count_loses_first,
118 : } => {
119 : // keeping the -1 or not decides if every tenant should lose their least recently accessed
120 : // layer OR if this should happen in the order of having highest layer count:
121 20 : let fudge = if *highest_layer_count_loses_first {
122 : // relative_last_activity vs. tenant layer count:
123 : // - 0.1..=1.0 (10 layers)
124 : // - 0.01..=1.0 (100 layers)
125 : // - 0.001..=1.0 (1000 layers)
126 : //
127 : // leading to evicting less of the smallest tenants.
128 10 : 0
129 : } else {
130 : // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
131 : // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
132 : // be that less than 10k layer evictions is enough, so we would not need to evict from
133 : // all tenants.
134 : //
135 : // as the tenant ordering is now deterministic this could hit the same tenants
136 : // disproportionetly on multiple invocations. alternative could be to remember how many
137 : // layers did we evict last time from this tenant, and inject that as an additional
138 : // fudge here.
139 10 : 1
140 : };
141 :
142 20 : let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
143 20 : let divider = total as f32;
144 :
145 : // most recently used is always (total - 0) / divider == 1.0
146 : // least recently used depends on the fudge:
147 : // - (total - 1) - (total - 1) / total => 0 / total
148 : // - total - (total - 1) / total => 1 / total
149 20 : let distance = (total - index) as f32;
150 :
151 20 : finite_f32::FiniteF32::try_from_normalized(distance / divider)
152 20 : .unwrap_or_else(|val| {
153 0 : tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
154 0 : finite_f32::FiniteF32::ZERO
155 0 : })
156 : }
157 : }
158 20 : }
159 : }
160 :
161 : #[derive(Default)]
162 : pub struct State {
163 : /// Exclude http requests and background task from running at the same time.
164 : mutex: tokio::sync::Mutex<()>,
165 : }
166 :
167 0 : pub fn launch_disk_usage_global_eviction_task(
168 0 : conf: &'static PageServerConf,
169 0 : storage: GenericRemoteStorage,
170 0 : state: Arc<State>,
171 0 : tenant_manager: Arc<TenantManager>,
172 0 : background_jobs_barrier: completion::Barrier,
173 0 : ) -> Option<DiskUsageEvictionTask> {
174 0 : let task_config = &conf.disk_usage_based_eviction;
175 0 : if !task_config.enabled {
176 0 : info!("disk usage based eviction task not configured");
177 0 : return None;
178 0 : };
179 :
180 0 : info!("launching disk usage based eviction task");
181 :
182 0 : let cancel = CancellationToken::new();
183 0 : let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
184 : "disk usage based eviction",
185 : {
186 0 : let cancel = cancel.clone();
187 0 : async move {
188 : // wait until initial load is complete, because we cannot evict from loading tenants.
189 0 : tokio::select! {
190 0 : _ = cancel.cancelled() => { return anyhow::Ok(()); },
191 0 : _ = background_jobs_barrier.wait() => { }
192 : };
193 :
194 0 : disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel)
195 0 : .await;
196 0 : anyhow::Ok(())
197 0 : }
198 : },
199 : ));
200 :
201 0 : Some(DiskUsageEvictionTask(CancellableTask { cancel, task }))
202 0 : }
203 :
204 : #[instrument(skip_all)]
205 : async fn disk_usage_eviction_task(
206 : state: &State,
207 : task_config: &DiskUsageEvictionTaskConfig,
208 : storage: &GenericRemoteStorage,
209 : tenant_manager: Arc<TenantManager>,
210 : cancel: CancellationToken,
211 : ) {
212 : scopeguard::defer! {
213 : info!("disk usage based eviction task finishing");
214 : };
215 :
216 : if sleep_random(task_config.period, &cancel).await.is_err() {
217 : return;
218 : }
219 :
220 : let mut iteration_no = 0;
221 : loop {
222 : iteration_no += 1;
223 : let start = Instant::now();
224 :
225 0 : async {
226 0 : let res = disk_usage_eviction_task_iteration(
227 0 : state,
228 0 : task_config,
229 0 : storage,
230 0 : &tenant_manager,
231 0 : &cancel,
232 0 : )
233 0 : .await;
234 :
235 0 : match res {
236 0 : Ok(()) => {}
237 0 : Err(e) => {
238 : // these stat failures are expected to be very rare
239 0 : warn!("iteration failed, unexpected error: {e:#}");
240 : }
241 : }
242 0 : }
243 : .instrument(tracing::info_span!("iteration", iteration_no))
244 : .await;
245 :
246 : let sleep_until = start + task_config.period;
247 : if tokio::time::timeout_at(sleep_until, cancel.cancelled())
248 : .await
249 : .is_ok()
250 : {
251 : break;
252 : }
253 : }
254 : }
255 :
256 : pub trait Usage: Clone + Copy + std::fmt::Debug {
257 : fn has_pressure(&self) -> bool;
258 : fn add_available_bytes(&mut self, bytes: u64);
259 : }
260 :
261 0 : async fn disk_usage_eviction_task_iteration(
262 0 : state: &State,
263 0 : task_config: &DiskUsageEvictionTaskConfig,
264 0 : storage: &GenericRemoteStorage,
265 0 : tenant_manager: &Arc<TenantManager>,
266 0 : cancel: &CancellationToken,
267 0 : ) -> anyhow::Result<()> {
268 0 : let tenants_dir = tenant_manager.get_conf().tenants_path();
269 0 : let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
270 0 : .context("get filesystem-level disk usage before evictions")?;
271 0 : let res = disk_usage_eviction_task_iteration_impl(
272 0 : state,
273 0 : storage,
274 0 : usage_pre,
275 0 : tenant_manager,
276 0 : task_config.eviction_order.into(),
277 0 : cancel,
278 0 : )
279 0 : .await;
280 0 : match res {
281 0 : Ok(outcome) => {
282 0 : debug!(?outcome, "disk_usage_eviction_iteration finished");
283 0 : match outcome {
284 0 : IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
285 0 : // nothing to do, select statement below will handle things
286 0 : }
287 0 : IterationOutcome::Finished(outcome) => {
288 : // Verify with statvfs whether we made any real progress
289 0 : let after = filesystem_level_usage::get(&tenants_dir, task_config)
290 : // It's quite unlikely to hit the error here. Keep the code simple and bail out.
291 0 : .context("get filesystem-level disk usage after evictions")?;
292 :
293 0 : debug!(?after, "disk usage");
294 :
295 0 : if after.has_pressure() {
296 : // Don't bother doing an out-of-order iteration here now.
297 : // In practice, the task period is set to a value in the tens-of-seconds range,
298 : // which will cause another iteration to happen soon enough.
299 : // TODO: deltas between the three different usages would be helpful,
300 : // consider MiB, GiB, TiB
301 0 : warn!(?outcome, ?after, "disk usage still high");
302 : } else {
303 0 : info!(?outcome, ?after, "disk usage pressure relieved");
304 : }
305 : }
306 : }
307 : }
308 0 : Err(e) => {
309 0 : error!("disk_usage_eviction_iteration failed: {:#}", e);
310 : }
311 : }
312 :
313 0 : Ok(())
314 0 : }
315 :
316 : #[derive(Debug, Serialize)]
317 : #[allow(clippy::large_enum_variant)]
318 : pub enum IterationOutcome<U> {
319 : NoPressure,
320 : Cancelled,
321 : Finished(IterationOutcomeFinished<U>),
322 : }
323 :
324 : #[derive(Debug, Serialize)]
325 : pub struct IterationOutcomeFinished<U> {
326 : /// The actual usage observed before we started the iteration.
327 : before: U,
328 : /// The expected value for `after`, according to internal accounting, after phase 1.
329 : planned: PlannedUsage<U>,
330 : /// The outcome of phase 2, where we actually do the evictions.
331 : ///
332 : /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
333 : /// be the same as `planned`.
334 : assumed: AssumedUsage<U>,
335 : }
336 :
337 : #[derive(Debug, Serialize)]
338 : struct AssumedUsage<U> {
339 : /// The expected value for `after`, after phase 2.
340 : projected_after: U,
341 : /// The layers we failed to evict during phase 2.
342 : failed: LayerCount,
343 : }
344 :
345 : #[derive(Debug, Serialize)]
346 : struct PlannedUsage<U> {
347 : respecting_tenant_min_resident_size: U,
348 : fallback_to_global_lru: Option<U>,
349 : }
350 :
351 : #[derive(Debug, Default, Serialize)]
352 : struct LayerCount {
353 : file_sizes: u64,
354 : count: usize,
355 : }
356 :
357 0 : pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
358 0 : state: &State,
359 0 : _storage: &GenericRemoteStorage,
360 0 : usage_pre: U,
361 0 : tenant_manager: &Arc<TenantManager>,
362 0 : eviction_order: EvictionOrder,
363 0 : cancel: &CancellationToken,
364 0 : ) -> anyhow::Result<IterationOutcome<U>> {
365 : // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
366 0 : let _g = state
367 0 : .mutex
368 0 : .try_lock()
369 0 : .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
370 :
371 0 : debug!(?usage_pre, "disk usage");
372 :
373 0 : if !usage_pre.has_pressure() {
374 0 : return Ok(IterationOutcome::NoPressure);
375 0 : }
376 :
377 0 : warn!(
378 : ?usage_pre,
379 0 : "running disk usage based eviction due to pressure"
380 : );
381 :
382 0 : let (candidates, collection_time) = {
383 0 : let started_at = std::time::Instant::now();
384 0 : match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
385 : EvictionCandidates::Cancelled => {
386 0 : return Ok(IterationOutcome::Cancelled);
387 : }
388 0 : EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
389 : }
390 : };
391 :
392 0 : METRICS.layers_collected.inc_by(candidates.len() as u64);
393 :
394 0 : tracing::info!(
395 0 : elapsed_ms = collection_time.as_millis(),
396 0 : total_layers = candidates.len(),
397 0 : "collection completed"
398 : );
399 :
400 : // Debug-log the list of candidates
401 0 : let now = SystemTime::now();
402 0 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
403 0 : let nth = i + 1;
404 0 : let total_candidates = candidates.len();
405 0 : let size = candidate.layer.get_file_size();
406 0 : let rel = candidate.relative_last_activity;
407 0 : debug!(
408 0 : "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
409 0 : now.duration_since(candidate.last_activity_ts)
410 0 : .unwrap()
411 0 : .as_micros(),
412 0 : candidate.layer.get_tenant_shard_id(),
413 0 : candidate.layer.get_timeline_id(),
414 0 : candidate.layer.get_name(),
415 : );
416 : }
417 :
418 : // phase1: select victims to relieve pressure
419 : //
420 : // Walk through the list of candidates, until we have accumulated enough layers to get
421 : // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
422 : // how much disk space would be used after evicting all the layers up to the current
423 : // point in the list.
424 : //
425 : // If we get far enough in the list that we start to evict layers that are below
426 : // the tenant's min-resident-size threshold, print a warning, and memorize the disk
427 : // usage at that point, in 'usage_planned_min_resident_size_respecting'.
428 :
429 0 : let (evicted_amount, usage_planned) =
430 0 : select_victims(&candidates, usage_pre).into_amount_and_planned();
431 :
432 0 : METRICS.layers_selected.inc_by(evicted_amount as u64);
433 :
434 : // phase2: evict layers
435 :
436 0 : let mut js = tokio::task::JoinSet::new();
437 0 : let limit = 1000;
438 :
439 0 : let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
440 0 : let mut consumed_all = false;
441 :
442 : // After the evictions, `usage_assumed` is the post-eviction usage,
443 : // according to internal accounting.
444 0 : let mut usage_assumed = usage_pre;
445 0 : let mut evictions_failed = LayerCount::default();
446 :
447 0 : let evict_layers = async move {
448 : loop {
449 0 : let next = if js.len() >= limit || consumed_all {
450 0 : js.join_next().await
451 0 : } else if !js.is_empty() {
452 : // opportunistically consume ready result, one per each new evicted
453 0 : futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
454 : } else {
455 0 : None
456 : };
457 :
458 0 : if let Some(next) = next {
459 0 : match next {
460 0 : Ok(Ok(file_size)) => {
461 0 : METRICS.layers_evicted.inc();
462 0 : usage_assumed.add_available_bytes(file_size);
463 0 : }
464 : Ok(Err((
465 0 : file_size,
466 : EvictionError::NotFound
467 : | EvictionError::Downloaded
468 : | EvictionError::Timeout,
469 0 : ))) => {
470 0 : evictions_failed.file_sizes += file_size;
471 0 : evictions_failed.count += 1;
472 0 : }
473 0 : Err(je) if je.is_cancelled() => unreachable!("not used"),
474 0 : Err(je) if je.is_panic() => { /* already logged */ }
475 0 : Err(je) => tracing::error!("unknown JoinError: {je:?}"),
476 : }
477 0 : }
478 :
479 0 : if consumed_all && js.is_empty() {
480 0 : break;
481 0 : }
482 :
483 : // calling again when consumed_all is fine as evicted is fused.
484 0 : let Some((_partition, candidate)) = evicted.next() else {
485 0 : if !consumed_all {
486 0 : tracing::info!("all evictions started, waiting");
487 0 : consumed_all = true;
488 0 : }
489 0 : continue;
490 : };
491 :
492 0 : match candidate.layer {
493 0 : EvictionLayer::Attached(layer) => {
494 0 : let file_size = layer.layer_desc().file_size;
495 0 : js.spawn(async move {
496 : // have a low eviction waiting timeout because our LRU calculations go stale fast;
497 : // also individual layer evictions could hang because of bugs and we do not want to
498 : // pause disk_usage_based_eviction for such.
499 0 : let timeout = std::time::Duration::from_secs(5);
500 :
501 0 : match layer.evict_and_wait(timeout).await {
502 0 : Ok(()) => Ok(file_size),
503 0 : Err(e) => Err((file_size, e)),
504 : }
505 0 : });
506 : }
507 0 : EvictionLayer::Secondary(layer) => {
508 0 : let file_size = layer.metadata.file_size;
509 :
510 0 : js.spawn(async move {
511 0 : layer
512 0 : .secondary_tenant
513 0 : .evict_layer(layer.timeline_id, layer.name)
514 0 : .await;
515 0 : Ok(file_size)
516 0 : });
517 : }
518 : }
519 0 : tokio::task::yield_now().await;
520 : }
521 :
522 0 : (usage_assumed, evictions_failed)
523 0 : };
524 :
525 0 : let started_at = std::time::Instant::now();
526 :
527 0 : let evict_layers = async move {
528 0 : let mut evict_layers = std::pin::pin!(evict_layers);
529 :
530 0 : let maximum_expected = std::time::Duration::from_secs(10);
531 :
532 0 : let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
533 0 : let tuple = if let Ok(tuple) = res {
534 0 : tuple
535 : } else {
536 0 : let elapsed = started_at.elapsed();
537 0 : tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
538 0 : evict_layers.await
539 : };
540 :
541 0 : let elapsed = started_at.elapsed();
542 0 : tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
543 0 : tuple
544 0 : };
545 :
546 0 : let evict_layers =
547 0 : evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
548 :
549 0 : let (usage_assumed, evictions_failed) = tokio::select! {
550 0 : tuple = evict_layers => { tuple },
551 0 : _ = cancel.cancelled() => {
552 : // dropping joinset will abort all pending evict_and_waits and that is fine, our
553 : // requests will still stand
554 0 : return Ok(IterationOutcome::Cancelled);
555 : }
556 : };
557 :
558 0 : Ok(IterationOutcome::Finished(IterationOutcomeFinished {
559 0 : before: usage_pre,
560 0 : planned: usage_planned,
561 0 : assumed: AssumedUsage {
562 0 : projected_after: usage_assumed,
563 0 : failed: evictions_failed,
564 0 : },
565 0 : }))
566 0 : }
567 :
568 : #[derive(Clone)]
569 : pub(crate) struct EvictionSecondaryLayer {
570 : pub(crate) secondary_tenant: Arc<SecondaryTenant>,
571 : pub(crate) timeline_id: TimelineId,
572 : pub(crate) name: LayerName,
573 : pub(crate) metadata: LayerFileMetadata,
574 : }
575 :
576 : /// Full [`Layer`] objects are specific to tenants in attached mode. This type is a layer
577 : /// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
578 : #[derive(Clone)]
579 : pub(crate) enum EvictionLayer {
580 : Attached(Layer),
581 : Secondary(EvictionSecondaryLayer),
582 : }
583 :
584 : impl From<Layer> for EvictionLayer {
585 0 : fn from(value: Layer) -> Self {
586 0 : Self::Attached(value)
587 0 : }
588 : }
589 :
590 : impl EvictionLayer {
591 0 : pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
592 0 : match self {
593 0 : Self::Attached(l) => &l.layer_desc().tenant_shard_id,
594 0 : Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
595 : }
596 0 : }
597 :
598 0 : pub(crate) fn get_timeline_id(&self) -> &TimelineId {
599 0 : match self {
600 0 : Self::Attached(l) => &l.layer_desc().timeline_id,
601 0 : Self::Secondary(sl) => &sl.timeline_id,
602 : }
603 0 : }
604 :
605 0 : pub(crate) fn get_name(&self) -> LayerName {
606 0 : match self {
607 0 : Self::Attached(l) => l.layer_desc().layer_name(),
608 0 : Self::Secondary(sl) => sl.name.clone(),
609 : }
610 0 : }
611 :
612 0 : pub(crate) fn get_file_size(&self) -> u64 {
613 0 : match self {
614 0 : Self::Attached(l) => l.layer_desc().file_size,
615 0 : Self::Secondary(sl) => sl.metadata.file_size,
616 : }
617 0 : }
618 : }
619 :
620 : #[derive(Clone)]
621 : pub(crate) struct EvictionCandidate {
622 : pub(crate) layer: EvictionLayer,
623 : pub(crate) last_activity_ts: SystemTime,
624 : pub(crate) relative_last_activity: finite_f32::FiniteF32,
625 : pub(crate) visibility: LayerVisibilityHint,
626 : }
627 :
628 : impl std::fmt::Display for EvictionLayer {
629 0 : fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
630 0 : match self {
631 0 : Self::Attached(l) => l.fmt(f),
632 0 : Self::Secondary(sl) => {
633 0 : write!(f, "{}/{}", sl.timeline_id, sl.name)
634 : }
635 : }
636 0 : }
637 : }
638 :
639 : #[derive(Default)]
640 : pub(crate) struct DiskUsageEvictionInfo {
641 : /// Timeline's largest layer (remote or resident)
642 : pub max_layer_size: Option<u64>,
643 : /// Timeline's resident layers
644 : pub resident_layers: Vec<EvictionCandidate>,
645 : }
646 :
647 : impl std::fmt::Debug for EvictionCandidate {
648 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
649 : // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
650 : // having to allocate a string to this is bad, but it will rarely be formatted
651 0 : let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
652 0 : let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
653 : struct DisplayIsDebug<'a, T>(&'a T);
654 : impl<T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'_, T> {
655 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
656 0 : write!(f, "{}", self.0)
657 0 : }
658 : }
659 0 : f.debug_struct("LocalLayerInfoForDiskUsageEviction")
660 0 : .field("layer", &DisplayIsDebug(&self.layer))
661 0 : .field("last_activity", &ts)
662 0 : .finish()
663 0 : }
664 : }
665 :
666 : #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
667 : enum EvictionPartition {
668 : // A layer that is un-wanted by the tenant: evict all these first, before considering
669 : // any other layers
670 : EvictNow,
671 :
672 : // Above the minimum size threshold: this layer is a candidate for eviction.
673 : Above,
674 :
675 : // Below the minimum size threshold: this layer should only be evicted if all the
676 : // tenants' layers above the minimum size threshold have already been considered.
677 : Below,
678 : }
679 :
680 : enum EvictionCandidates {
681 : Cancelled,
682 : Finished(Vec<(EvictionPartition, EvictionCandidate)>),
683 : }
684 :
685 : /// Gather the eviction candidates.
686 : ///
687 : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
688 : /// order. A caller that evicts in that order, until pressure is relieved, implements
689 : /// the eviction policy outlined in the module comment.
690 : ///
691 : /// # Example with EvictionOrder::AbsoluteAccessed
692 : ///
693 : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
694 : /// Each layer has size 100, and both tenant's min_resident_size is 150.
695 : /// The eviction order would be
696 : ///
697 : /// ```text
698 : /// partition last_activity_ts tenant/layer
699 : /// Above 18:30 A/c
700 : /// Above 19:00 A/b
701 : /// Above 18:29 B/c
702 : /// Above 19:05 B/b
703 : /// Above 20:00 B/a
704 : /// Above 20:03 A/a
705 : /// Below 20:30 A/d
706 : /// Below 20:40 B/d
707 : /// Below 20:45 B/e
708 : /// Below 20:58 A/e
709 : /// ```
710 : ///
711 : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
712 : /// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
713 : ///
714 : /// But, if we need to evict 900 bytes to relieve pressure, we'd evict
715 : /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
716 : /// after exhauting the `Above` partition.
717 : /// So, we did not respect each tenant's min_resident_size.
718 : ///
719 : /// # Example with EvictionOrder::RelativeAccessed
720 : ///
721 : /// ```text
722 : /// partition relative_age last_activity_ts tenant/layer
723 : /// Above 0/4 18:30 A/c
724 : /// Above 0/4 18:29 B/c
725 : /// Above 1/4 19:00 A/b
726 : /// Above 1/4 19:05 B/b
727 : /// Above 2/4 20:00 B/a
728 : /// Above 2/4 20:03 A/a
729 : /// Below 3/4 20:30 A/d
730 : /// Below 3/4 20:40 B/d
731 : /// Below 4/4 20:45 B/e
732 : /// Below 4/4 20:58 A/e
733 : /// ```
734 : ///
735 : /// With tenants having the same number of layers the picture does not change much. The same with
736 : /// A having many more layers **resident** (not all of them listed):
737 : ///
738 : /// ```text
739 : /// Above 0/100 18:30 A/c
740 : /// Above 0/4 18:29 B/c
741 : /// Above 1/100 19:00 A/b
742 : /// Above 2/100 20:03 A/a
743 : /// Above 3/100 20:03 A/nth_3
744 : /// Above 4/100 20:03 A/nth_4
745 : /// ...
746 : /// Above 1/4 19:05 B/b
747 : /// Above 25/100 20:04 A/nth_25
748 : /// ...
749 : /// Above 2/4 20:00 B/a
750 : /// Above 50/100 20:10 A/nth_50
751 : /// ...
752 : /// Below 3/4 20:40 B/d
753 : /// Below 99/100 20:30 A/nth_99
754 : /// Below 4/4 20:45 B/e
755 : /// Below 100/100 20:58 A/nth_100
756 : /// ```
757 : ///
758 : /// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
759 : /// difficult to see is what happens on the next round assuming the evicting 23 from the above list
760 : /// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
761 : /// appeared:
762 : ///
763 : /// ```text
764 : /// Above 0/87 20:04 A/nth_23
765 : /// Above 0/3 19:05 B/b
766 : /// Above 0/50 20:59 C/nth_0
767 : /// Above 1/87 20:04 A/nth_24
768 : /// Above 1/50 21:00 C/nth_1
769 : /// Above 2/87 20:04 A/nth_25
770 : /// ...
771 : /// Above 16/50 21:02 C/nth_16
772 : /// Above 1/3 20:00 B/a
773 : /// Above 27/87 20:10 A/nth_50
774 : /// ...
775 : /// Below 2/3 20:40 B/d
776 : /// Below 49/50 21:05 C/nth_49
777 : /// Below 86/87 20:30 A/nth_99
778 : /// Below 3/3 20:45 B/e
779 : /// Below 50/50 21:05 C/nth_50
780 : /// Below 87/87 20:58 A/nth_100
781 : /// ```
782 : ///
783 : /// Now relieving pressure with 23 layers would cost:
784 : /// - tenant A 14 layers
785 : /// - tenant B 1 layer
786 : /// - tenant C 8 layers
787 0 : async fn collect_eviction_candidates(
788 0 : tenant_manager: &Arc<TenantManager>,
789 0 : eviction_order: EvictionOrder,
790 0 : cancel: &CancellationToken,
791 0 : ) -> anyhow::Result<EvictionCandidates> {
792 : const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
793 :
794 : // get a snapshot of the list of tenants
795 0 : let tenants = tenant_manager
796 0 : .list_tenants()
797 0 : .context("get list of tenants")?;
798 :
799 : // TODO: avoid listing every layer in every tenant: this loop can block the executor,
800 : // and the resulting data structure can be huge.
801 : // (https://github.com/neondatabase/neon/issues/6224)
802 0 : let mut candidates = Vec::new();
803 :
804 0 : for (tenant_id, _state, _gen) in tenants {
805 0 : if cancel.is_cancelled() {
806 0 : return Ok(EvictionCandidates::Cancelled);
807 0 : }
808 0 : let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
809 0 : Ok(tenant) if tenant.is_active() => tenant,
810 : Ok(_) => {
811 0 : debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
812 0 : continue;
813 : }
814 0 : Err(e) => {
815 : // this can happen if tenant has lifecycle transition after we fetched it
816 0 : debug!("failed to get tenant: {e:#}");
817 0 : continue;
818 : }
819 : };
820 :
821 0 : if tenant.cancel.is_cancelled() {
822 0 : info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
823 0 : continue;
824 0 : }
825 :
826 0 : let started_at = std::time::Instant::now();
827 :
828 : // collect layers from all timelines in this tenant
829 : //
830 : // If one of the timelines becomes `!is_active()` during the iteration,
831 : // for example because we're shutting down, then `max_layer_size` can be too small.
832 : // That's OK. This code only runs under a disk pressure situation, and being
833 : // a little unfair to tenants during shutdown in such a situation is tolerable.
834 0 : let mut tenant_candidates = Vec::new();
835 0 : let mut max_layer_size = 0;
836 0 : for tl in tenant.list_timelines() {
837 0 : if !tl.is_active() {
838 0 : continue;
839 0 : }
840 0 : let info = tl.get_local_layers_for_disk_usage_eviction().await;
841 0 : debug!(
842 0 : tenant_id=%tl.tenant_shard_id.tenant_id,
843 0 : shard_id=%tl.tenant_shard_id.shard_slug(),
844 0 : timeline_id=%tl.timeline_id,
845 0 : "timeline resident layers count: {}", info.resident_layers.len()
846 : );
847 :
848 0 : tenant_candidates.extend(info.resident_layers.into_iter());
849 0 : max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
850 :
851 0 : if cancel.is_cancelled() {
852 0 : return Ok(EvictionCandidates::Cancelled);
853 0 : }
854 : }
855 :
856 : // Also consider layers of timelines being imported for eviction
857 0 : for tl in tenant.list_importing_timelines() {
858 0 : let info = tl.timeline.get_local_layers_for_disk_usage_eviction().await;
859 0 : debug!(
860 0 : tenant_id=%tl.timeline.tenant_shard_id.tenant_id,
861 0 : shard_id=%tl.timeline.tenant_shard_id.shard_slug(),
862 0 : timeline_id=%tl.timeline.timeline_id,
863 0 : "timeline resident layers count: {}", info.resident_layers.len()
864 : );
865 :
866 0 : tenant_candidates.extend(info.resident_layers.into_iter());
867 0 : max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
868 :
869 0 : if cancel.is_cancelled() {
870 0 : return Ok(EvictionCandidates::Cancelled);
871 0 : }
872 : }
873 :
874 : // `min_resident_size` defaults to maximum layer file size of the tenant.
875 : // This ensures that each tenant can have at least one layer resident at a given time,
876 : // ensuring forward progress for a single Timeline::get in that tenant.
877 : // It's a questionable heuristic since, usually, there are many Timeline::get
878 : // requests going on for a tenant, and, at least in Neon prod, the median
879 : // layer file size is much smaller than the compaction target size.
880 : // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
881 : // That's what's typically used by the various background loops.
882 : //
883 : // The default can be overridden with a fixed value in the tenant conf.
884 : // A default override can be put in the default tenant conf in the pageserver.toml.
885 0 : let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
886 0 : debug!(
887 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
888 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
889 : overridden_size=s,
890 0 : "using overridden min resident size for tenant"
891 : );
892 0 : s
893 : } else {
894 0 : debug!(
895 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
896 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
897 : max_layer_size,
898 0 : "using max layer size as min_resident_size for tenant",
899 : );
900 0 : max_layer_size
901 : };
902 :
903 : // Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer,
904 : // where the inputs are:
905 : // - whether the layer is visible
906 : // - whether the layer is above/below the min_resident_size cutline
907 0 : tenant_candidates
908 0 : .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
909 0 : let mut cumsum: i128 = 0;
910 :
911 0 : let total = tenant_candidates.len();
912 :
913 0 : let tenant_candidates =
914 0 : tenant_candidates
915 0 : .into_iter()
916 0 : .enumerate()
917 0 : .map(|(i, mut candidate)| {
918 : // as we iterate this reverse sorted list, the most recently accessed layer will always
919 : // be 1.0; this is for us to evict it last.
920 0 : candidate.relative_last_activity =
921 0 : eviction_order.relative_last_activity(total, i);
922 :
923 0 : let partition = match candidate.visibility {
924 : LayerVisibilityHint::Covered => {
925 : // Covered layers are evicted first
926 0 : EvictionPartition::EvictNow
927 : }
928 : LayerVisibilityHint::Visible => {
929 0 : cumsum += i128::from(candidate.layer.get_file_size());
930 :
931 0 : if cumsum > min_resident_size as i128 {
932 0 : EvictionPartition::Above
933 : } else {
934 : // The most recent layers below the min_resident_size threshold
935 : // are the last to be evicted.
936 0 : EvictionPartition::Below
937 : }
938 : }
939 : };
940 :
941 0 : (partition, candidate)
942 0 : });
943 :
944 0 : METRICS
945 0 : .tenant_layer_count
946 0 : .observe(tenant_candidates.len() as f64);
947 :
948 0 : candidates.extend(tenant_candidates);
949 :
950 0 : let elapsed = started_at.elapsed();
951 0 : METRICS
952 0 : .tenant_collection_time
953 0 : .observe(elapsed.as_secs_f64());
954 :
955 0 : if elapsed > LOG_DURATION_THRESHOLD {
956 0 : tracing::info!(
957 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
958 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
959 0 : elapsed_ms = elapsed.as_millis(),
960 0 : "collection took longer than threshold"
961 : );
962 0 : }
963 : }
964 :
965 : // Note: the same tenant ID might be hit twice, if it transitions from attached to
966 : // secondary while we run. That is okay: when we eventually try and run the eviction,
967 : // the `Gate` on the object will ensure that whichever one has already been shut down
968 : // will not delete anything.
969 :
970 0 : let mut secondary_tenants = Vec::new();
971 0 : tenant_manager.foreach_secondary_tenants(
972 0 : |_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
973 0 : secondary_tenants.push(state.clone());
974 0 : },
975 : );
976 :
977 0 : for tenant in secondary_tenants {
978 : // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
979 : // to prevent repeated disk usage based evictions from completely draining less often
980 : // updating secondaries.
981 0 : let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
982 :
983 0 : debug_assert!(
984 0 : total_layers >= layer_info.resident_layers.len(),
985 0 : "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
986 0 : layer_info.resident_layers.len()
987 : );
988 :
989 0 : let started_at = std::time::Instant::now();
990 :
991 0 : layer_info
992 0 : .resident_layers
993 0 : .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
994 :
995 0 : let tenant_candidates =
996 0 : layer_info
997 0 : .resident_layers
998 0 : .into_iter()
999 0 : .enumerate()
1000 0 : .map(|(i, mut candidate)| {
1001 0 : candidate.relative_last_activity =
1002 0 : eviction_order.relative_last_activity(total_layers, i);
1003 0 : (
1004 0 : // Secondary locations' layers are always considered above the min resident size,
1005 0 : // i.e. secondary locations are permitted to be trimmed to zero layers if all
1006 0 : // the layers have sufficiently old access times.
1007 0 : EvictionPartition::Above,
1008 0 : candidate,
1009 0 : )
1010 0 : });
1011 :
1012 0 : METRICS
1013 0 : .tenant_layer_count
1014 0 : .observe(tenant_candidates.len() as f64);
1015 0 : candidates.extend(tenant_candidates);
1016 :
1017 0 : tokio::task::yield_now().await;
1018 :
1019 0 : let elapsed = started_at.elapsed();
1020 :
1021 0 : METRICS
1022 0 : .tenant_collection_time
1023 0 : .observe(elapsed.as_secs_f64());
1024 :
1025 0 : if elapsed > LOG_DURATION_THRESHOLD {
1026 0 : tracing::info!(
1027 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
1028 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
1029 0 : elapsed_ms = elapsed.as_millis(),
1030 0 : "collection took longer than threshold"
1031 : );
1032 0 : }
1033 : }
1034 :
1035 0 : debug_assert!(
1036 0 : EvictionPartition::Above < EvictionPartition::Below,
1037 0 : "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"
1038 : );
1039 0 : debug_assert!(
1040 0 : EvictionPartition::EvictNow < EvictionPartition::Above,
1041 0 : "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"
1042 : );
1043 :
1044 0 : eviction_order.sort(&mut candidates);
1045 :
1046 0 : Ok(EvictionCandidates::Finished(candidates))
1047 0 : }
1048 :
1049 : /// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
1050 : /// relieve pressure.
1051 : ///
1052 : /// Returns the amount of candidates selected, with the planned usage.
1053 0 : fn select_victims<U: Usage>(
1054 0 : candidates: &[(EvictionPartition, EvictionCandidate)],
1055 0 : usage_pre: U,
1056 0 : ) -> VictimSelection<U> {
1057 0 : let mut usage_when_switched = None;
1058 0 : let mut usage_planned = usage_pre;
1059 0 : let mut evicted_amount = 0;
1060 :
1061 0 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
1062 0 : if !usage_planned.has_pressure() {
1063 0 : break;
1064 0 : }
1065 :
1066 0 : if partition == &EvictionPartition::Below && usage_when_switched.is_none() {
1067 0 : usage_when_switched = Some((usage_planned, i));
1068 0 : }
1069 :
1070 0 : usage_planned.add_available_bytes(candidate.layer.get_file_size());
1071 0 : evicted_amount += 1;
1072 : }
1073 :
1074 0 : VictimSelection {
1075 0 : amount: evicted_amount,
1076 0 : usage_pre,
1077 0 : usage_when_switched,
1078 0 : usage_planned,
1079 0 : }
1080 0 : }
1081 :
1082 : struct VictimSelection<U> {
1083 : amount: usize,
1084 : usage_pre: U,
1085 : usage_when_switched: Option<(U, usize)>,
1086 : usage_planned: U,
1087 : }
1088 :
1089 : impl<U: Usage> VictimSelection<U> {
1090 0 : fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
1091 0 : debug!(
1092 : evicted_amount=%self.amount,
1093 0 : "took enough candidates for pressure to be relieved"
1094 : );
1095 :
1096 0 : if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
1097 0 : warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
1098 0 : }
1099 :
1100 0 : let planned = match self.usage_when_switched {
1101 0 : Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
1102 0 : respecting_tenant_min_resident_size,
1103 0 : fallback_to_global_lru: Some(self.usage_planned),
1104 0 : },
1105 0 : None => PlannedUsage {
1106 0 : respecting_tenant_min_resident_size: self.usage_planned,
1107 0 : fallback_to_global_lru: None,
1108 0 : },
1109 : };
1110 :
1111 0 : (self.amount, planned)
1112 0 : }
1113 : }
1114 :
1115 : /// A totally ordered f32 subset we can use with sorting functions.
1116 : pub(crate) mod finite_f32 {
1117 :
1118 : /// A totally ordered f32 subset we can use with sorting functions.
1119 : #[derive(Clone, Copy, PartialEq)]
1120 : pub struct FiniteF32(f32);
1121 :
1122 : impl std::fmt::Debug for FiniteF32 {
1123 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1124 0 : std::fmt::Debug::fmt(&self.0, f)
1125 0 : }
1126 : }
1127 :
1128 : impl std::fmt::Display for FiniteF32 {
1129 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1130 0 : std::fmt::Display::fmt(&self.0, f)
1131 0 : }
1132 : }
1133 :
1134 : impl std::cmp::Eq for FiniteF32 {}
1135 :
1136 : impl std::cmp::PartialOrd for FiniteF32 {
1137 0 : fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
1138 0 : Some(self.cmp(other))
1139 0 : }
1140 : }
1141 :
1142 : impl std::cmp::Ord for FiniteF32 {
1143 0 : fn cmp(&self, other: &Self) -> std::cmp::Ordering {
1144 0 : self.0.total_cmp(&other.0)
1145 0 : }
1146 : }
1147 :
1148 : impl TryFrom<f32> for FiniteF32 {
1149 : type Error = f32;
1150 :
1151 0 : fn try_from(value: f32) -> Result<Self, Self::Error> {
1152 0 : if value.is_finite() {
1153 0 : Ok(FiniteF32(value))
1154 : } else {
1155 0 : Err(value)
1156 : }
1157 0 : }
1158 : }
1159 :
1160 : impl From<FiniteF32> for f32 {
1161 20 : fn from(value: FiniteF32) -> f32 {
1162 20 : value.0
1163 20 : }
1164 : }
1165 :
1166 : impl FiniteF32 {
1167 : pub const ZERO: FiniteF32 = FiniteF32(0.0);
1168 :
1169 20 : pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
1170 20 : if (0.0..=1.0).contains(&value) {
1171 : // -0.0 is within the range, make sure it is assumed 0.0..=1.0
1172 20 : let value = value.abs();
1173 20 : Ok(FiniteF32(value))
1174 : } else {
1175 0 : Err(value)
1176 : }
1177 20 : }
1178 :
1179 20 : pub fn into_inner(self) -> f32 {
1180 20 : self.into()
1181 20 : }
1182 : }
1183 : }
1184 :
1185 : mod filesystem_level_usage {
1186 : use anyhow::Context;
1187 : use camino::Utf8Path;
1188 :
1189 : use super::DiskUsageEvictionTaskConfig;
1190 : use crate::statvfs::Statvfs;
1191 :
1192 : #[derive(Debug, Clone, Copy)]
1193 : pub struct Usage<'a> {
1194 : config: &'a DiskUsageEvictionTaskConfig,
1195 :
1196 : /// Filesystem capacity
1197 : total_bytes: u64,
1198 : /// Free filesystem space
1199 : avail_bytes: u64,
1200 : }
1201 :
1202 : impl super::Usage for Usage<'_> {
1203 7 : fn has_pressure(&self) -> bool {
1204 7 : let usage_pct =
1205 7 : (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
1206 :
1207 7 : let pressures = [
1208 7 : (
1209 7 : "min_avail_bytes",
1210 7 : self.avail_bytes < self.config.min_avail_bytes,
1211 7 : ),
1212 7 : (
1213 7 : "max_usage_pct",
1214 7 : usage_pct >= self.config.max_usage_pct.get() as u64,
1215 7 : ),
1216 7 : ];
1217 :
1218 7 : pressures.into_iter().any(|(_, has_pressure)| has_pressure)
1219 7 : }
1220 :
1221 6 : fn add_available_bytes(&mut self, bytes: u64) {
1222 6 : self.avail_bytes += bytes;
1223 6 : }
1224 : }
1225 :
1226 0 : pub fn get<'a>(
1227 0 : tenants_dir: &Utf8Path,
1228 0 : config: &'a DiskUsageEvictionTaskConfig,
1229 0 : ) -> anyhow::Result<Usage<'a>> {
1230 0 : let mock_config = {
1231 : #[cfg(feature = "testing")]
1232 : {
1233 0 : config.mock_statvfs.as_ref()
1234 : }
1235 : #[cfg(not(feature = "testing"))]
1236 : {
1237 : None
1238 : }
1239 : };
1240 :
1241 0 : let stat = Statvfs::get(tenants_dir, mock_config)
1242 0 : .context("statvfs failed, presumably directory got unlinked")?;
1243 :
1244 0 : let (avail_bytes, total_bytes) = stat.get_avail_total_bytes();
1245 :
1246 0 : Ok(Usage {
1247 0 : config,
1248 0 : total_bytes,
1249 0 : avail_bytes,
1250 0 : })
1251 0 : }
1252 :
1253 : #[test]
1254 1 : fn max_usage_pct_pressure() {
1255 : use std::time::Duration;
1256 :
1257 : use utils::serde_percent::Percent;
1258 :
1259 : use super::Usage as _;
1260 :
1261 1 : let mut usage = Usage {
1262 1 : config: &DiskUsageEvictionTaskConfig {
1263 1 : max_usage_pct: Percent::new(85).unwrap(),
1264 1 : min_avail_bytes: 0,
1265 1 : period: Duration::MAX,
1266 1 : #[cfg(feature = "testing")]
1267 1 : mock_statvfs: None,
1268 1 : eviction_order: pageserver_api::config::EvictionOrder::default(),
1269 1 : enabled: true,
1270 1 : },
1271 1 : total_bytes: 100_000,
1272 1 : avail_bytes: 0,
1273 1 : };
1274 :
1275 1 : assert!(usage.has_pressure(), "expected pressure at 100%");
1276 :
1277 1 : usage.add_available_bytes(14_000);
1278 1 : assert!(usage.has_pressure(), "expected pressure at 86%");
1279 :
1280 1 : usage.add_available_bytes(999);
1281 1 : assert!(usage.has_pressure(), "expected pressure at 85.001%");
1282 :
1283 1 : usage.add_available_bytes(1);
1284 1 : assert!(usage.has_pressure(), "expected pressure at precisely 85%");
1285 :
1286 1 : usage.add_available_bytes(1);
1287 1 : assert!(!usage.has_pressure(), "no pressure at 84.999%");
1288 :
1289 1 : usage.add_available_bytes(999);
1290 1 : assert!(!usage.has_pressure(), "no pressure at 84%");
1291 :
1292 1 : usage.add_available_bytes(16_000);
1293 1 : assert!(!usage.has_pressure());
1294 1 : }
1295 : }
1296 :
1297 : #[cfg(test)]
1298 : mod tests {
1299 : use super::*;
1300 :
1301 : #[test]
1302 1 : fn relative_equal_bounds() {
1303 1 : let order = EvictionOrder::RelativeAccessed {
1304 1 : highest_layer_count_loses_first: false,
1305 1 : };
1306 :
1307 1 : let len = 10;
1308 1 : let v = (0..len)
1309 10 : .map(|i| order.relative_last_activity(len, i).into_inner())
1310 1 : .collect::<Vec<_>>();
1311 :
1312 1 : assert_eq!(v.first(), Some(&1.0));
1313 1 : assert_eq!(v.last(), Some(&0.0));
1314 9 : assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
1315 1 : }
1316 :
1317 : #[test]
1318 1 : fn relative_spare_bounds() {
1319 1 : let order = EvictionOrder::RelativeAccessed {
1320 1 : highest_layer_count_loses_first: true,
1321 1 : };
1322 :
1323 1 : let len = 10;
1324 1 : let v = (0..len)
1325 10 : .map(|i| order.relative_last_activity(len, i).into_inner())
1326 1 : .collect::<Vec<_>>();
1327 :
1328 1 : assert_eq!(v.first(), Some(&1.0));
1329 1 : assert_eq!(v.last(), Some(&0.1));
1330 9 : assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
1331 1 : }
1332 : }
|