LCOV - 1e20c4f2b28aa592527961bb32170ebbd2c9172f.info - pageserver/src/disk_usage_eviction

LCOV - code coverage report

Current view:	top level - pageserver/src - disk_usage_eviction_task.rs (source / functions)		Coverage	Total	Hit
Test:	1e20c4f2b28aa592527961bb32170ebbd2c9172f.info	Lines:	15.8 %	588	93
Test Date:	2025-07-16 12:29:03	Functions:	22.6 %	62	14

            Line data    Source code

       1              : //! This module implements the pageserver-global disk-usage-based layer eviction task.
       2              : //!
       3              : //! # Mechanics
       4              : //!
       5              : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
       6              : //! loop that evicts layers in response to a shortage of available bytes
       7              : //! in the $repo/tenants directory's filesystem.
       8              : //!
       9              : //! The loop runs periodically at a configurable `period`.
      10              : //!
      11              : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
      12              : //! It compares the returned usage data against two different types of thresholds.
      13              : //! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
      14              : //! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
      15              : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
      16              : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
      17              : //!
      18              : //! # Eviction Policy
      19              : //!
      20              : //! There are two thresholds:
      21              : //! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space.
      22              : //! If the actual usage is higher, the threshold is exceeded.
      23              : //! `min_avail_bytes` is the absolute available space in bytes.
      24              : //! If the actual usage is lower, the threshold is exceeded.
      25              : //! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
      26              : //! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
      27              : //! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
      28              : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
      29              : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
      30              : //! start evicting layers that are part of the reservation, LRU first.
      31              : //!
      32              : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
      33              : //! throughout the code, but, no actual variable carries that name.
      34              : //! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
      35              : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
      36              : //! during page reconstruction.
      37              : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
      38              : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
      39              : 
      40              : // Implementation notes:
      41              : // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
      42              : //   reading these fields. We use the Debug impl for semi-structured logging, though.
      43              : 
      44              : use std::sync::Arc;
      45              : use std::time::SystemTime;
      46              : 
      47              : use anyhow::Context;
      48              : use pageserver_api::config::DiskUsageEvictionTaskConfig;
      49              : use pageserver_api::shard::TenantShardId;
      50              : use remote_storage::GenericRemoteStorage;
      51              : use serde::Serialize;
      52              : use tokio::time::Instant;
      53              : use tokio_util::sync::CancellationToken;
      54              : use tracing::{Instrument, debug, error, info, instrument, warn};
      55              : use utils::completion;
      56              : use utils::id::TimelineId;
      57              : 
      58              : use crate::config::PageServerConf;
      59              : use crate::metrics::disk_usage_based_eviction::METRICS;
      60              : use crate::task_mgr::{self, BACKGROUND_RUNTIME};
      61              : use crate::tenant::mgr::TenantManager;
      62              : use crate::tenant::remote_timeline_client::LayerFileMetadata;
      63              : use crate::tenant::secondary::SecondaryTenant;
      64              : use crate::tenant::storage_layer::{
      65              :     AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint,
      66              : };
      67              : use crate::tenant::tasks::sleep_random;
      68              : use crate::{CancellableTask, DiskUsageEvictionTask};
      69              : 
      70              : /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
      71              : /// partitioning.
      72              : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
      73              : pub enum EvictionOrder {
      74              :     /// Order the layers to be evicted by how recently they have been accessed relatively within
      75              :     /// the set of resident layers of a tenant.
      76              :     RelativeAccessed {
      77              :         /// Determines if the tenant with most layers should lose first.
      78              :         ///
      79              :         /// Having this enabled is currently the only reasonable option, because the order in which
      80              :         /// we read tenants is deterministic. If we find the need to use this as `false`, we need
      81              :         /// to ensure nondeterminism by adding in a random number to break the
      82              :         /// `relative_last_activity==0.0` ties.
      83              :         highest_layer_count_loses_first: bool,
      84              :     },
      85              : }
      86              : 
      87              : impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
      88            0 :     fn from(value: pageserver_api::config::EvictionOrder) -> Self {
      89            0 :         match value {
      90              :             pageserver_api::config::EvictionOrder::RelativeAccessed {
      91            0 :                 highest_layer_count_loses_first,
      92            0 :             } => Self::RelativeAccessed {
      93            0 :                 highest_layer_count_loses_first,
      94            0 :             },
      95              :         }
      96            0 :     }
      97              : }
      98              : 
      99              : impl EvictionOrder {
     100            0 :     fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
     101              :         use EvictionOrder::*;
     102              : 
     103            0 :         match self {
     104            0 :             RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
     105            0 :                 (*partition, candidate.relative_last_activity)
     106            0 :             }),
     107              :         }
     108            0 :     }
     109              : 
     110              :     /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
     111              :     /// layers in **most** recently used order.
     112           20 :     fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
     113              :         use EvictionOrder::*;
     114              : 
     115           20 :         match self {
     116              :             RelativeAccessed {
     117           20 :                 highest_layer_count_loses_first,
     118              :             } => {
     119              :                 // keeping the -1 or not decides if every tenant should lose their least recently accessed
     120              :                 // layer OR if this should happen in the order of having highest layer count:
     121           20 :                 let fudge = if *highest_layer_count_loses_first {
     122              :                     // relative_last_activity vs. tenant layer count:
     123              :                     // - 0.1..=1.0 (10 layers)
     124              :                     // - 0.01..=1.0 (100 layers)
     125              :                     // - 0.001..=1.0 (1000 layers)
     126              :                     //
     127              :                     // leading to evicting less of the smallest tenants.
     128           10 :                     0
     129              :                 } else {
     130              :                     // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
     131              :                     // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
     132              :                     // be that less than 10k layer evictions is enough, so we would not need to evict from
     133              :                     // all tenants.
     134              :                     //
     135              :                     // as the tenant ordering is now deterministic this could hit the same tenants
     136              :                     // disproportionetly on multiple invocations. alternative could be to remember how many
     137              :                     // layers did we evict last time from this tenant, and inject that as an additional
     138              :                     // fudge here.
     139           10 :                     1
     140              :                 };
     141              : 
     142           20 :                 let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
     143           20 :                 let divider = total as f32;
     144              : 
     145              :                 // most recently used is always (total - 0) / divider == 1.0
     146              :                 // least recently used depends on the fudge:
     147              :                 // -       (total - 1) - (total - 1) / total => 0 / total
     148              :                 // -             total - (total - 1) / total => 1 / total
     149           20 :                 let distance = (total - index) as f32;
     150              : 
     151           20 :                 finite_f32::FiniteF32::try_from_normalized(distance / divider)
     152           20 :                     .unwrap_or_else(|val| {
     153            0 :                         tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
     154            0 :                         finite_f32::FiniteF32::ZERO
     155            0 :                     })
     156              :             }
     157              :         }
     158           20 :     }
     159              : }
     160              : 
     161              : #[derive(Default)]
     162              : pub struct State {
     163              :     /// Exclude http requests and background task from running at the same time.
     164              :     mutex: tokio::sync::Mutex<()>,
     165              : }
     166              : 
     167            0 : pub fn launch_disk_usage_global_eviction_task(
     168            0 :     conf: &'static PageServerConf,
     169            0 :     storage: GenericRemoteStorage,
     170            0 :     state: Arc<State>,
     171            0 :     tenant_manager: Arc<TenantManager>,
     172            0 :     background_jobs_barrier: completion::Barrier,
     173            0 : ) -> Option<DiskUsageEvictionTask> {
     174            0 :     let task_config = &conf.disk_usage_based_eviction;
     175            0 :     if !task_config.enabled {
     176            0 :         info!("disk usage based eviction task not configured");
     177            0 :         return None;
     178            0 :     };
     179              : 
     180            0 :     info!("launching disk usage based eviction task");
     181              : 
     182            0 :     let cancel = CancellationToken::new();
     183            0 :     let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
     184              :         "disk usage based eviction",
     185              :         {
     186            0 :             let cancel = cancel.clone();
     187            0 :             async move {
     188              :                 // wait until initial load is complete, because we cannot evict from loading tenants.
     189            0 :                 tokio::select! {
     190            0 :                     _ = cancel.cancelled() => { return anyhow::Ok(()); },
     191            0 :                     _ = background_jobs_barrier.wait() => { }
     192              :                 };
     193              : 
     194            0 :                 disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel)
     195            0 :                     .await;
     196            0 :                 anyhow::Ok(())
     197            0 :             }
     198              :         },
     199              :     ));
     200              : 
     201            0 :     Some(DiskUsageEvictionTask(CancellableTask { cancel, task }))
     202            0 : }
     203              : 
     204              : #[instrument(skip_all)]
     205              : async fn disk_usage_eviction_task(
     206              :     state: &State,
     207              :     task_config: &DiskUsageEvictionTaskConfig,
     208              :     storage: &GenericRemoteStorage,
     209              :     tenant_manager: Arc<TenantManager>,
     210              :     cancel: CancellationToken,
     211              : ) {
     212              :     scopeguard::defer! {
     213              :         info!("disk usage based eviction task finishing");
     214              :     };
     215              : 
     216              :     if sleep_random(task_config.period, &cancel).await.is_err() {
     217              :         return;
     218              :     }
     219              : 
     220              :     let mut iteration_no = 0;
     221              :     loop {
     222              :         iteration_no += 1;
     223              :         let start = Instant::now();
     224              : 
     225            0 :         async {
     226            0 :             let res = disk_usage_eviction_task_iteration(
     227            0 :                 state,
     228            0 :                 task_config,
     229            0 :                 storage,
     230            0 :                 &tenant_manager,
     231            0 :                 &cancel,
     232            0 :             )
     233            0 :             .await;
     234              : 
     235            0 :             match res {
     236            0 :                 Ok(()) => {}
     237            0 :                 Err(e) => {
     238              :                     // these stat failures are expected to be very rare
     239            0 :                     warn!("iteration failed, unexpected error: {e:#}");
     240              :                 }
     241              :             }
     242            0 :         }
     243              :         .instrument(tracing::info_span!("iteration", iteration_no))
     244              :         .await;
     245              : 
     246              :         let sleep_until = start + task_config.period;
     247              :         if tokio::time::timeout_at(sleep_until, cancel.cancelled())
     248              :             .await
     249              :             .is_ok()
     250              :         {
     251              :             break;
     252              :         }
     253              :     }
     254              : }
     255              : 
     256              : pub trait Usage: Clone + Copy + std::fmt::Debug {
     257              :     fn has_pressure(&self) -> bool;
     258              :     fn add_available_bytes(&mut self, bytes: u64);
     259              : }
     260              : 
     261            0 : async fn disk_usage_eviction_task_iteration(
     262            0 :     state: &State,
     263            0 :     task_config: &DiskUsageEvictionTaskConfig,
     264            0 :     storage: &GenericRemoteStorage,
     265            0 :     tenant_manager: &Arc<TenantManager>,
     266            0 :     cancel: &CancellationToken,
     267            0 : ) -> anyhow::Result<()> {
     268            0 :     let tenants_dir = tenant_manager.get_conf().tenants_path();
     269            0 :     let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
     270            0 :         .context("get filesystem-level disk usage before evictions")?;
     271            0 :     let res = disk_usage_eviction_task_iteration_impl(
     272            0 :         state,
     273            0 :         storage,
     274            0 :         usage_pre,
     275            0 :         tenant_manager,
     276            0 :         task_config.eviction_order.into(),
     277            0 :         cancel,
     278            0 :     )
     279            0 :     .await;
     280            0 :     match res {
     281            0 :         Ok(outcome) => {
     282            0 :             debug!(?outcome, "disk_usage_eviction_iteration finished");
     283            0 :             match outcome {
     284            0 :                 IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
     285            0 :                     // nothing to do, select statement below will handle things
     286            0 :                 }
     287            0 :                 IterationOutcome::Finished(outcome) => {
     288              :                     // Verify with statvfs whether we made any real progress
     289            0 :                     let after = filesystem_level_usage::get(&tenants_dir, task_config)
     290              :                         // It's quite unlikely to hit the error here. Keep the code simple and bail out.
     291            0 :                         .context("get filesystem-level disk usage after evictions")?;
     292              : 
     293            0 :                     debug!(?after, "disk usage");
     294              : 
     295            0 :                     if after.has_pressure() {
     296              :                         // Don't bother doing an out-of-order iteration here now.
     297              :                         // In practice, the task period is set to a value in the tens-of-seconds range,
     298              :                         // which will cause another iteration to happen soon enough.
     299              :                         // TODO: deltas between the three different usages would be helpful,
     300              :                         // consider MiB, GiB, TiB
     301            0 :                         warn!(?outcome, ?after, "disk usage still high");
     302              :                     } else {
     303            0 :                         info!(?outcome, ?after, "disk usage pressure relieved");
     304              :                     }
     305              :                 }
     306              :             }
     307              :         }
     308            0 :         Err(e) => {
     309            0 :             error!("disk_usage_eviction_iteration failed: {:#}", e);
     310              :         }
     311              :     }
     312              : 
     313            0 :     Ok(())
     314            0 : }
     315              : 
     316              : #[derive(Debug, Serialize)]
     317              : #[allow(clippy::large_enum_variant)]
     318              : pub enum IterationOutcome<U> {
     319              :     NoPressure,
     320              :     Cancelled,
     321              :     Finished(IterationOutcomeFinished<U>),
     322              : }
     323              : 
     324              : #[derive(Debug, Serialize)]
     325              : pub struct IterationOutcomeFinished<U> {
     326              :     /// The actual usage observed before we started the iteration.
     327              :     before: U,
     328              :     /// The expected value for `after`, according to internal accounting, after phase 1.
     329              :     planned: PlannedUsage<U>,
     330              :     /// The outcome of phase 2, where we actually do the evictions.
     331              :     ///
     332              :     /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
     333              :     /// be the same as `planned`.
     334              :     assumed: AssumedUsage<U>,
     335              : }
     336              : 
     337              : #[derive(Debug, Serialize)]
     338              : struct AssumedUsage<U> {
     339              :     /// The expected value for `after`, after phase 2.
     340              :     projected_after: U,
     341              :     /// The layers we failed to evict during phase 2.
     342              :     failed: LayerCount,
     343              : }
     344              : 
     345              : #[derive(Debug, Serialize)]
     346              : struct PlannedUsage<U> {
     347              :     respecting_tenant_min_resident_size: U,
     348              :     fallback_to_global_lru: Option<U>,
     349              : }
     350              : 
     351              : #[derive(Debug, Default, Serialize)]
     352              : struct LayerCount {
     353              :     file_sizes: u64,
     354              :     count: usize,
     355              : }
     356              : 
     357            0 : pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     358            0 :     state: &State,
     359            0 :     _storage: &GenericRemoteStorage,
     360            0 :     usage_pre: U,
     361            0 :     tenant_manager: &Arc<TenantManager>,
     362            0 :     eviction_order: EvictionOrder,
     363            0 :     cancel: &CancellationToken,
     364            0 : ) -> anyhow::Result<IterationOutcome<U>> {
     365              :     // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
     366            0 :     let _g = state
     367            0 :         .mutex
     368            0 :         .try_lock()
     369            0 :         .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
     370              : 
     371            0 :     debug!(?usage_pre, "disk usage");
     372              : 
     373            0 :     if !usage_pre.has_pressure() {
     374            0 :         return Ok(IterationOutcome::NoPressure);
     375            0 :     }
     376              : 
     377            0 :     warn!(
     378              :         ?usage_pre,
     379            0 :         "running disk usage based eviction due to pressure"
     380              :     );
     381              : 
     382            0 :     let (candidates, collection_time) = {
     383            0 :         let started_at = std::time::Instant::now();
     384            0 :         match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
     385              :             EvictionCandidates::Cancelled => {
     386            0 :                 return Ok(IterationOutcome::Cancelled);
     387              :             }
     388            0 :             EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
     389              :         }
     390              :     };
     391              : 
     392            0 :     METRICS.layers_collected.inc_by(candidates.len() as u64);
     393              : 
     394            0 :     tracing::info!(
     395            0 :         elapsed_ms = collection_time.as_millis(),
     396            0 :         total_layers = candidates.len(),
     397            0 :         "collection completed"
     398              :     );
     399              : 
     400              :     // Debug-log the list of candidates
     401            0 :     let now = SystemTime::now();
     402            0 :     for (i, (partition, candidate)) in candidates.iter().enumerate() {
     403            0 :         let nth = i + 1;
     404            0 :         let total_candidates = candidates.len();
     405            0 :         let size = candidate.layer.get_file_size();
     406            0 :         let rel = candidate.relative_last_activity;
     407            0 :         debug!(
     408            0 :             "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
     409            0 :             now.duration_since(candidate.last_activity_ts)
     410            0 :                 .unwrap()
     411            0 :                 .as_micros(),
     412            0 :             candidate.layer.get_tenant_shard_id(),
     413            0 :             candidate.layer.get_timeline_id(),
     414            0 :             candidate.layer.get_name(),
     415              :         );
     416              :     }
     417              : 
     418              :     // phase1: select victims to relieve pressure
     419              :     //
     420              :     // Walk through the list of candidates, until we have accumulated enough layers to get
     421              :     // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
     422              :     // how much disk space would be used after evicting all the layers up to the current
     423              :     // point in the list.
     424              :     //
     425              :     // If we get far enough in the list that we start to evict layers that are below
     426              :     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
     427              :     // usage at that point, in 'usage_planned_min_resident_size_respecting'.
     428              : 
     429            0 :     let (evicted_amount, usage_planned) =
     430            0 :         select_victims(&candidates, usage_pre).into_amount_and_planned();
     431              : 
     432            0 :     METRICS.layers_selected.inc_by(evicted_amount as u64);
     433              : 
     434              :     // phase2: evict layers
     435              : 
     436            0 :     let mut js = tokio::task::JoinSet::new();
     437            0 :     let limit = 1000;
     438              : 
     439            0 :     let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
     440            0 :     let mut consumed_all = false;
     441              : 
     442              :     // After the evictions, `usage_assumed` is the post-eviction usage,
     443              :     // according to internal accounting.
     444            0 :     let mut usage_assumed = usage_pre;
     445            0 :     let mut evictions_failed = LayerCount::default();
     446              : 
     447            0 :     let evict_layers = async move {
     448              :         loop {
     449            0 :             let next = if js.len() >= limit || consumed_all {
     450            0 :                 js.join_next().await
     451            0 :             } else if !js.is_empty() {
     452              :                 // opportunistically consume ready result, one per each new evicted
     453            0 :                 futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
     454              :             } else {
     455            0 :                 None
     456              :             };
     457              : 
     458            0 :             if let Some(next) = next {
     459            0 :                 match next {
     460            0 :                     Ok(Ok(file_size)) => {
     461            0 :                         METRICS.layers_evicted.inc();
     462            0 :                         /*BEGIN_HADRON */
     463            0 :                         METRICS.bytes_evicted.inc_by(file_size);
     464            0 :                         /*END_HADRON */
     465            0 :                         usage_assumed.add_available_bytes(file_size);
     466            0 :                     }
     467              :                     Ok(Err((
     468            0 :                         file_size,
     469              :                         EvictionError::NotFound
     470              :                         | EvictionError::Downloaded
     471              :                         | EvictionError::Timeout,
     472            0 :                     ))) => {
     473            0 :                         evictions_failed.file_sizes += file_size;
     474            0 :                         evictions_failed.count += 1;
     475            0 :                     }
     476            0 :                     Err(je) if je.is_cancelled() => unreachable!("not used"),
     477            0 :                     Err(je) if je.is_panic() => { /* already logged */ }
     478            0 :                     Err(je) => tracing::error!("unknown JoinError: {je:?}"),
     479              :                 }
     480            0 :             }
     481              : 
     482            0 :             if consumed_all && js.is_empty() {
     483            0 :                 break;
     484            0 :             }
     485              : 
     486              :             // calling again when consumed_all is fine as evicted is fused.
     487            0 :             let Some((_partition, candidate)) = evicted.next() else {
     488            0 :                 if !consumed_all {
     489            0 :                     tracing::info!("all evictions started, waiting");
     490            0 :                     consumed_all = true;
     491            0 :                 }
     492            0 :                 continue;
     493              :             };
     494              : 
     495            0 :             match candidate.layer {
     496            0 :                 EvictionLayer::Attached(layer) => {
     497            0 :                     let file_size = layer.layer_desc().file_size;
     498            0 :                     js.spawn(async move {
     499              :                         // have a low eviction waiting timeout because our LRU calculations go stale fast;
     500              :                         // also individual layer evictions could hang because of bugs and we do not want to
     501              :                         // pause disk_usage_based_eviction for such.
     502            0 :                         let timeout = std::time::Duration::from_secs(5);
     503              : 
     504            0 :                         match layer.evict_and_wait(timeout).await {
     505            0 :                             Ok(()) => Ok(file_size),
     506            0 :                             Err(e) => Err((file_size, e)),
     507              :                         }
     508            0 :                     });
     509              :                 }
     510            0 :                 EvictionLayer::Secondary(layer) => {
     511            0 :                     let file_size = layer.metadata.file_size;
     512              : 
     513            0 :                     js.spawn(async move {
     514            0 :                         layer
     515            0 :                             .secondary_tenant
     516            0 :                             .evict_layer(layer.timeline_id, layer.name)
     517            0 :                             .await;
     518            0 :                         Ok(file_size)
     519            0 :                     });
     520              :                 }
     521              :             }
     522            0 :             tokio::task::yield_now().await;
     523              :         }
     524              : 
     525            0 :         (usage_assumed, evictions_failed)
     526            0 :     };
     527              : 
     528            0 :     let started_at = std::time::Instant::now();
     529              : 
     530            0 :     let evict_layers = async move {
     531            0 :         let mut evict_layers = std::pin::pin!(evict_layers);
     532              : 
     533            0 :         let maximum_expected = std::time::Duration::from_secs(10);
     534              : 
     535            0 :         let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
     536            0 :         let tuple = if let Ok(tuple) = res {
     537            0 :             tuple
     538              :         } else {
     539            0 :             let elapsed = started_at.elapsed();
     540            0 :             tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
     541            0 :             evict_layers.await
     542              :         };
     543              : 
     544            0 :         let elapsed = started_at.elapsed();
     545            0 :         tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
     546            0 :         tuple
     547            0 :     };
     548              : 
     549            0 :     let evict_layers =
     550            0 :         evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
     551              : 
     552            0 :     let (usage_assumed, evictions_failed) = tokio::select! {
     553            0 :         tuple = evict_layers => { tuple },
     554            0 :         _ = cancel.cancelled() => {
     555              :             // dropping joinset will abort all pending evict_and_waits and that is fine, our
     556              :             // requests will still stand
     557            0 :             return Ok(IterationOutcome::Cancelled);
     558              :         }
     559              :     };
     560              : 
     561            0 :     Ok(IterationOutcome::Finished(IterationOutcomeFinished {
     562            0 :         before: usage_pre,
     563            0 :         planned: usage_planned,
     564            0 :         assumed: AssumedUsage {
     565            0 :             projected_after: usage_assumed,
     566            0 :             failed: evictions_failed,
     567            0 :         },
     568            0 :     }))
     569            0 : }
     570              : 
     571              : #[derive(Clone)]
     572              : pub(crate) struct EvictionSecondaryLayer {
     573              :     pub(crate) secondary_tenant: Arc<SecondaryTenant>,
     574              :     pub(crate) timeline_id: TimelineId,
     575              :     pub(crate) name: LayerName,
     576              :     pub(crate) metadata: LayerFileMetadata,
     577              : }
     578              : 
     579              : /// Full [`Layer`] objects are specific to tenants in attached mode.  This type is a layer
     580              : /// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
     581              : #[derive(Clone)]
     582              : pub(crate) enum EvictionLayer {
     583              :     Attached(Layer),
     584              :     Secondary(EvictionSecondaryLayer),
     585              : }
     586              : 
     587              : impl From<Layer> for EvictionLayer {
     588            0 :     fn from(value: Layer) -> Self {
     589            0 :         Self::Attached(value)
     590            0 :     }
     591              : }
     592              : 
     593              : impl EvictionLayer {
     594            0 :     pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
     595            0 :         match self {
     596            0 :             Self::Attached(l) => &l.layer_desc().tenant_shard_id,
     597            0 :             Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
     598              :         }
     599            0 :     }
     600              : 
     601            0 :     pub(crate) fn get_timeline_id(&self) -> &TimelineId {
     602            0 :         match self {
     603            0 :             Self::Attached(l) => &l.layer_desc().timeline_id,
     604            0 :             Self::Secondary(sl) => &sl.timeline_id,
     605              :         }
     606            0 :     }
     607              : 
     608            0 :     pub(crate) fn get_name(&self) -> LayerName {
     609            0 :         match self {
     610            0 :             Self::Attached(l) => l.layer_desc().layer_name(),
     611            0 :             Self::Secondary(sl) => sl.name.clone(),
     612              :         }
     613            0 :     }
     614              : 
     615            0 :     pub(crate) fn get_file_size(&self) -> u64 {
     616            0 :         match self {
     617            0 :             Self::Attached(l) => l.layer_desc().file_size,
     618            0 :             Self::Secondary(sl) => sl.metadata.file_size,
     619              :         }
     620            0 :     }
     621              : }
     622              : 
     623              : #[derive(Clone)]
     624              : pub(crate) struct EvictionCandidate {
     625              :     pub(crate) layer: EvictionLayer,
     626              :     pub(crate) last_activity_ts: SystemTime,
     627              :     pub(crate) relative_last_activity: finite_f32::FiniteF32,
     628              :     pub(crate) visibility: LayerVisibilityHint,
     629              : }
     630              : 
     631              : impl std::fmt::Display for EvictionLayer {
     632            0 :     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
     633            0 :         match self {
     634            0 :             Self::Attached(l) => l.fmt(f),
     635            0 :             Self::Secondary(sl) => {
     636            0 :                 write!(f, "{}/{}", sl.timeline_id, sl.name)
     637              :             }
     638              :         }
     639            0 :     }
     640              : }
     641              : 
     642              : #[derive(Default)]
     643              : pub(crate) struct DiskUsageEvictionInfo {
     644              :     /// Timeline's largest layer (remote or resident)
     645              :     pub max_layer_size: Option<u64>,
     646              :     /// Timeline's resident layers
     647              :     pub resident_layers: Vec<EvictionCandidate>,
     648              : }
     649              : 
     650              : impl std::fmt::Debug for EvictionCandidate {
     651            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     652              :         // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
     653              :         // having to allocate a string to this is bad, but it will rarely be formatted
     654            0 :         let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
     655            0 :         let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
     656              :         struct DisplayIsDebug<'a, T>(&'a T);
     657              :         impl<T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'_, T> {
     658            0 :             fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     659            0 :                 write!(f, "{}", self.0)
     660            0 :             }
     661              :         }
     662            0 :         f.debug_struct("LocalLayerInfoForDiskUsageEviction")
     663            0 :             .field("layer", &DisplayIsDebug(&self.layer))
     664            0 :             .field("last_activity", &ts)
     665            0 :             .finish()
     666            0 :     }
     667              : }
     668              : 
     669              : #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
     670              : enum EvictionPartition {
     671              :     // A layer that is un-wanted by the tenant: evict all these first, before considering
     672              :     // any other layers
     673              :     EvictNow,
     674              : 
     675              :     // Above the minimum size threshold: this layer is a candidate for eviction.
     676              :     Above,
     677              : 
     678              :     // Below the minimum size threshold: this layer should only be evicted if all the
     679              :     // tenants' layers above the minimum size threshold have already been considered.
     680              :     Below,
     681              : }
     682              : 
     683              : enum EvictionCandidates {
     684              :     Cancelled,
     685              :     Finished(Vec<(EvictionPartition, EvictionCandidate)>),
     686              : }
     687              : 
     688              : /// Gather the eviction candidates.
     689              : ///
     690              : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
     691              : /// order. A caller that evicts in that order, until pressure is relieved, implements
     692              : /// the eviction policy outlined in the module comment.
     693              : ///
     694              : /// # Example with EvictionOrder::AbsoluteAccessed
     695              : ///
     696              : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
     697              : /// Each layer has size 100, and both tenant's min_resident_size is 150.
     698              : /// The eviction order would be
     699              : ///
     700              : /// ```text
     701              : /// partition last_activity_ts tenant/layer
     702              : /// Above     18:30            A/c
     703              : /// Above     19:00            A/b
     704              : /// Above     18:29            B/c
     705              : /// Above     19:05            B/b
     706              : /// Above     20:00            B/a
     707              : /// Above     20:03            A/a
     708              : /// Below     20:30            A/d
     709              : /// Below     20:40            B/d
     710              : /// Below     20:45            B/e
     711              : /// Below     20:58            A/e
     712              : /// ```
     713              : ///
     714              : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
     715              : /// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
     716              : ///
     717              : /// But, if we need to evict 900 bytes to relieve pressure, we'd evict
     718              : /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
     719              : /// after exhauting the `Above` partition.
     720              : /// So, we did not respect each tenant's min_resident_size.
     721              : ///
     722              : /// # Example with EvictionOrder::RelativeAccessed
     723              : ///
     724              : /// ```text
     725              : /// partition relative_age last_activity_ts tenant/layer
     726              : /// Above     0/4          18:30            A/c
     727              : /// Above     0/4          18:29            B/c
     728              : /// Above     1/4          19:00            A/b
     729              : /// Above     1/4          19:05            B/b
     730              : /// Above     2/4          20:00            B/a
     731              : /// Above     2/4          20:03            A/a
     732              : /// Below     3/4          20:30            A/d
     733              : /// Below     3/4          20:40            B/d
     734              : /// Below     4/4          20:45            B/e
     735              : /// Below     4/4          20:58            A/e
     736              : /// ```
     737              : ///
     738              : /// With tenants having the same number of layers the picture does not change much. The same with
     739              : /// A having many more layers **resident** (not all of them listed):
     740              : ///
     741              : /// ```text
     742              : /// Above       0/100      18:30            A/c
     743              : /// Above       0/4        18:29            B/c
     744              : /// Above       1/100      19:00            A/b
     745              : /// Above       2/100      20:03            A/a
     746              : /// Above       3/100      20:03            A/nth_3
     747              : /// Above       4/100      20:03            A/nth_4
     748              : ///             ...
     749              : /// Above       1/4        19:05            B/b
     750              : /// Above      25/100      20:04            A/nth_25
     751              : ///             ...
     752              : /// Above       2/4        20:00            B/a
     753              : /// Above      50/100      20:10            A/nth_50
     754              : ///             ...
     755              : /// Below       3/4        20:40            B/d
     756              : /// Below      99/100      20:30            A/nth_99
     757              : /// Below       4/4        20:45            B/e
     758              : /// Below     100/100      20:58            A/nth_100
     759              : /// ```
     760              : ///
     761              : /// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
     762              : /// difficult to see is what happens on the next round assuming the evicting 23 from the above list
     763              : /// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
     764              : /// appeared:
     765              : ///
     766              : /// ```text
     767              : /// Above       0/87       20:04            A/nth_23
     768              : /// Above       0/3        19:05            B/b
     769              : /// Above       0/50       20:59            C/nth_0
     770              : /// Above       1/87       20:04            A/nth_24
     771              : /// Above       1/50       21:00            C/nth_1
     772              : /// Above       2/87       20:04            A/nth_25
     773              : ///             ...
     774              : /// Above      16/50       21:02            C/nth_16
     775              : /// Above       1/3        20:00            B/a
     776              : /// Above      27/87       20:10            A/nth_50
     777              : ///             ...
     778              : /// Below       2/3        20:40            B/d
     779              : /// Below      49/50       21:05            C/nth_49
     780              : /// Below      86/87       20:30            A/nth_99
     781              : /// Below       3/3        20:45            B/e
     782              : /// Below      50/50       21:05            C/nth_50
     783              : /// Below      87/87       20:58            A/nth_100
     784              : /// ```
     785              : ///
     786              : /// Now relieving pressure with 23 layers would cost:
     787              : /// - tenant A 14 layers
     788              : /// - tenant B 1 layer
     789              : /// - tenant C 8 layers
     790            0 : async fn collect_eviction_candidates(
     791            0 :     tenant_manager: &Arc<TenantManager>,
     792            0 :     eviction_order: EvictionOrder,
     793            0 :     cancel: &CancellationToken,
     794            0 : ) -> anyhow::Result<EvictionCandidates> {
     795              :     const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
     796              : 
     797              :     // get a snapshot of the list of tenants
     798            0 :     let tenants = tenant_manager
     799            0 :         .list_tenants()
     800            0 :         .context("get list of tenants")?;
     801              : 
     802              :     // TODO: avoid listing every layer in every tenant: this loop can block the executor,
     803              :     // and the resulting data structure can be huge.
     804              :     // (https://github.com/neondatabase/neon/issues/6224)
     805            0 :     let mut candidates = Vec::new();
     806              : 
     807            0 :     for (tenant_id, _state, _gen) in tenants {
     808            0 :         if cancel.is_cancelled() {
     809            0 :             return Ok(EvictionCandidates::Cancelled);
     810            0 :         }
     811            0 :         let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
     812            0 :             Ok(tenant) if tenant.is_active() => tenant,
     813              :             Ok(_) => {
     814            0 :                 debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
     815            0 :                 continue;
     816              :             }
     817            0 :             Err(e) => {
     818              :                 // this can happen if tenant has lifecycle transition after we fetched it
     819            0 :                 debug!("failed to get tenant: {e:#}");
     820            0 :                 continue;
     821              :             }
     822              :         };
     823              : 
     824            0 :         if tenant.cancel.is_cancelled() {
     825            0 :             info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
     826            0 :             continue;
     827            0 :         }
     828              : 
     829            0 :         let started_at = std::time::Instant::now();
     830              : 
     831              :         // collect layers from all timelines in this tenant
     832              :         //
     833              :         // If one of the timelines becomes `!is_active()` during the iteration,
     834              :         // for example because we're shutting down, then `max_layer_size` can be too small.
     835              :         // That's OK. This code only runs under a disk pressure situation, and being
     836              :         // a little unfair to tenants during shutdown in such a situation is tolerable.
     837            0 :         let mut tenant_candidates = Vec::new();
     838            0 :         let mut max_layer_size = 0;
     839            0 :         for tl in tenant.list_timelines() {
     840            0 :             if !tl.is_active() {
     841            0 :                 continue;
     842            0 :             }
     843            0 :             let info = tl.get_local_layers_for_disk_usage_eviction().await;
     844            0 :             debug!(
     845            0 :                 tenant_id=%tl.tenant_shard_id.tenant_id,
     846            0 :                 shard_id=%tl.tenant_shard_id.shard_slug(),
     847            0 :                 timeline_id=%tl.timeline_id,
     848            0 :                 "timeline resident layers count: {}", info.resident_layers.len()
     849              :             );
     850              : 
     851            0 :             tenant_candidates.extend(info.resident_layers.into_iter());
     852            0 :             max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
     853              : 
     854            0 :             if cancel.is_cancelled() {
     855            0 :                 return Ok(EvictionCandidates::Cancelled);
     856            0 :             }
     857              :         }
     858              : 
     859              :         // Also consider layers of timelines being imported for eviction
     860            0 :         for tl in tenant.list_importing_timelines() {
     861            0 :             let info = tl.timeline.get_local_layers_for_disk_usage_eviction().await;
     862            0 :             debug!(
     863            0 :                 tenant_id=%tl.timeline.tenant_shard_id.tenant_id,
     864            0 :                 shard_id=%tl.timeline.tenant_shard_id.shard_slug(),
     865            0 :                 timeline_id=%tl.timeline.timeline_id,
     866            0 :                 "timeline resident layers count: {}", info.resident_layers.len()
     867              :             );
     868              : 
     869            0 :             tenant_candidates.extend(info.resident_layers.into_iter());
     870            0 :             max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
     871              : 
     872            0 :             if cancel.is_cancelled() {
     873            0 :                 return Ok(EvictionCandidates::Cancelled);
     874            0 :             }
     875              :         }
     876              : 
     877              :         // `min_resident_size` defaults to maximum layer file size of the tenant.
     878              :         // This ensures that each tenant can have at least one layer resident at a given time,
     879              :         // ensuring forward progress for a single Timeline::get in that tenant.
     880              :         // It's a questionable heuristic since, usually, there are many Timeline::get
     881              :         // requests going on for a tenant, and, at least in Neon prod, the median
     882              :         // layer file size is much smaller than the compaction target size.
     883              :         // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
     884              :         // That's what's typically used by the various background loops.
     885              :         //
     886              :         // The default can be overridden with a fixed value in the tenant conf.
     887              :         // A default override can be put in the default tenant conf in the pageserver.toml.
     888            0 :         let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
     889            0 :             debug!(
     890            0 :                 tenant_id=%tenant.tenant_shard_id().tenant_id,
     891            0 :                 shard_id=%tenant.tenant_shard_id().shard_slug(),
     892              :                 overridden_size=s,
     893            0 :                 "using overridden min resident size for tenant"
     894              :             );
     895            0 :             s
     896              :         } else {
     897            0 :             debug!(
     898            0 :                 tenant_id=%tenant.tenant_shard_id().tenant_id,
     899            0 :                 shard_id=%tenant.tenant_shard_id().shard_slug(),
     900              :                 max_layer_size,
     901            0 :                 "using max layer size as min_resident_size for tenant",
     902              :             );
     903            0 :             max_layer_size
     904              :         };
     905              : 
     906              :         // Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer,
     907              :         // where the inputs are:
     908              :         //  - whether the layer is visible
     909              :         //  - whether the layer is above/below the min_resident_size cutline
     910            0 :         tenant_candidates
     911            0 :             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
     912            0 :         let mut cumsum: i128 = 0;
     913              : 
     914            0 :         let total = tenant_candidates.len();
     915              : 
     916            0 :         let tenant_candidates =
     917            0 :             tenant_candidates
     918            0 :                 .into_iter()
     919            0 :                 .enumerate()
     920            0 :                 .map(|(i, mut candidate)| {
     921              :                     // as we iterate this reverse sorted list, the most recently accessed layer will always
     922              :                     // be 1.0; this is for us to evict it last.
     923            0 :                     candidate.relative_last_activity =
     924            0 :                         eviction_order.relative_last_activity(total, i);
     925              : 
     926            0 :                     let partition = match candidate.visibility {
     927              :                         LayerVisibilityHint::Covered => {
     928              :                             // Covered layers are evicted first
     929            0 :                             EvictionPartition::EvictNow
     930              :                         }
     931              :                         LayerVisibilityHint::Visible => {
     932            0 :                             cumsum += i128::from(candidate.layer.get_file_size());
     933              : 
     934            0 :                             if cumsum > min_resident_size as i128 {
     935            0 :                                 EvictionPartition::Above
     936              :                             } else {
     937              :                                 // The most recent layers below the min_resident_size threshold
     938              :                                 // are the last to be evicted.
     939            0 :                                 EvictionPartition::Below
     940              :                             }
     941              :                         }
     942              :                     };
     943              : 
     944            0 :                     (partition, candidate)
     945            0 :                 });
     946              : 
     947            0 :         METRICS
     948            0 :             .tenant_layer_count
     949            0 :             .observe(tenant_candidates.len() as f64);
     950              : 
     951            0 :         candidates.extend(tenant_candidates);
     952              : 
     953            0 :         let elapsed = started_at.elapsed();
     954            0 :         METRICS
     955            0 :             .tenant_collection_time
     956            0 :             .observe(elapsed.as_secs_f64());
     957              : 
     958            0 :         if elapsed > LOG_DURATION_THRESHOLD {
     959            0 :             tracing::info!(
     960            0 :                 tenant_id=%tenant.tenant_shard_id().tenant_id,
     961            0 :                 shard_id=%tenant.tenant_shard_id().shard_slug(),
     962            0 :                 elapsed_ms = elapsed.as_millis(),
     963            0 :                 "collection took longer than threshold"
     964              :             );
     965            0 :         }
     966              :     }
     967              : 
     968              :     // Note: the same tenant ID might be hit twice, if it transitions from attached to
     969              :     // secondary while we run.  That is okay: when we eventually try and run the eviction,
     970              :     // the `Gate` on the object will ensure that whichever one has already been shut down
     971              :     // will not delete anything.
     972              : 
     973            0 :     let mut secondary_tenants = Vec::new();
     974            0 :     tenant_manager.foreach_secondary_tenants(
     975            0 :         |_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
     976            0 :             secondary_tenants.push(state.clone());
     977            0 :         },
     978              :     );
     979              : 
     980            0 :     for tenant in secondary_tenants {
     981              :         // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
     982              :         // to prevent repeated disk usage based evictions from completely draining less often
     983              :         // updating secondaries.
     984            0 :         let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
     985              : 
     986            0 :         debug_assert!(
     987            0 :             total_layers >= layer_info.resident_layers.len(),
     988            0 :             "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
     989            0 :             layer_info.resident_layers.len()
     990              :         );
     991              : 
     992            0 :         let started_at = std::time::Instant::now();
     993              : 
     994            0 :         layer_info
     995            0 :             .resident_layers
     996            0 :             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
     997              : 
     998            0 :         let tenant_candidates =
     999            0 :             layer_info
    1000            0 :                 .resident_layers
    1001            0 :                 .into_iter()
    1002            0 :                 .enumerate()
    1003            0 :                 .map(|(i, mut candidate)| {
    1004            0 :                     candidate.relative_last_activity =
    1005            0 :                         eviction_order.relative_last_activity(total_layers, i);
    1006            0 :                     (
    1007            0 :                         // Secondary locations' layers are always considered above the min resident size,
    1008            0 :                         // i.e. secondary locations are permitted to be trimmed to zero layers if all
    1009            0 :                         // the layers have sufficiently old access times.
    1010            0 :                         EvictionPartition::Above,
    1011            0 :                         candidate,
    1012            0 :                     )
    1013            0 :                 });
    1014              : 
    1015            0 :         METRICS
    1016            0 :             .tenant_layer_count
    1017            0 :             .observe(tenant_candidates.len() as f64);
    1018            0 :         candidates.extend(tenant_candidates);
    1019              : 
    1020            0 :         tokio::task::yield_now().await;
    1021              : 
    1022            0 :         let elapsed = started_at.elapsed();
    1023              : 
    1024            0 :         METRICS
    1025            0 :             .tenant_collection_time
    1026            0 :             .observe(elapsed.as_secs_f64());
    1027              : 
    1028            0 :         if elapsed > LOG_DURATION_THRESHOLD {
    1029            0 :             tracing::info!(
    1030            0 :                 tenant_id=%tenant.tenant_shard_id().tenant_id,
    1031            0 :                 shard_id=%tenant.tenant_shard_id().shard_slug(),
    1032            0 :                 elapsed_ms = elapsed.as_millis(),
    1033            0 :                 "collection took longer than threshold"
    1034              :             );
    1035            0 :         }
    1036              :     }
    1037              : 
    1038            0 :     debug_assert!(
    1039            0 :         EvictionPartition::Above < EvictionPartition::Below,
    1040            0 :         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"
    1041              :     );
    1042            0 :     debug_assert!(
    1043            0 :         EvictionPartition::EvictNow < EvictionPartition::Above,
    1044            0 :         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"
    1045              :     );
    1046              : 
    1047            0 :     eviction_order.sort(&mut candidates);
    1048              : 
    1049            0 :     Ok(EvictionCandidates::Finished(candidates))
    1050            0 : }
    1051              : 
    1052              : /// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
    1053              : /// relieve pressure.
    1054              : ///
    1055              : /// Returns the amount of candidates selected, with the planned usage.
    1056            0 : fn select_victims<U: Usage>(
    1057            0 :     candidates: &[(EvictionPartition, EvictionCandidate)],
    1058            0 :     usage_pre: U,
    1059            0 : ) -> VictimSelection<U> {
    1060            0 :     let mut usage_when_switched = None;
    1061            0 :     let mut usage_planned = usage_pre;
    1062            0 :     let mut evicted_amount = 0;
    1063              : 
    1064            0 :     for (i, (partition, candidate)) in candidates.iter().enumerate() {
    1065            0 :         if !usage_planned.has_pressure() {
    1066            0 :             break;
    1067            0 :         }
    1068              : 
    1069            0 :         if partition == &EvictionPartition::Below && usage_when_switched.is_none() {
    1070            0 :             usage_when_switched = Some((usage_planned, i));
    1071            0 :         }
    1072              : 
    1073            0 :         usage_planned.add_available_bytes(candidate.layer.get_file_size());
    1074            0 :         evicted_amount += 1;
    1075              :     }
    1076              : 
    1077            0 :     VictimSelection {
    1078            0 :         amount: evicted_amount,
    1079            0 :         usage_pre,
    1080            0 :         usage_when_switched,
    1081            0 :         usage_planned,
    1082            0 :     }
    1083            0 : }
    1084              : 
    1085              : struct VictimSelection<U> {
    1086              :     amount: usize,
    1087              :     usage_pre: U,
    1088              :     usage_when_switched: Option<(U, usize)>,
    1089              :     usage_planned: U,
    1090              : }
    1091              : 
    1092              : impl<U: Usage> VictimSelection<U> {
    1093            0 :     fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
    1094            0 :         debug!(
    1095              :             evicted_amount=%self.amount,
    1096            0 :             "took enough candidates for pressure to be relieved"
    1097              :         );
    1098              : 
    1099            0 :         if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
    1100            0 :             warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
    1101            0 :         }
    1102              : 
    1103            0 :         let planned = match self.usage_when_switched {
    1104            0 :             Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
    1105            0 :                 respecting_tenant_min_resident_size,
    1106            0 :                 fallback_to_global_lru: Some(self.usage_planned),
    1107            0 :             },
    1108            0 :             None => PlannedUsage {
    1109            0 :                 respecting_tenant_min_resident_size: self.usage_planned,
    1110            0 :                 fallback_to_global_lru: None,
    1111            0 :             },
    1112              :         };
    1113              : 
    1114            0 :         (self.amount, planned)
    1115            0 :     }
    1116              : }
    1117              : 
    1118              : /// A totally ordered f32 subset we can use with sorting functions.
    1119              : pub(crate) mod finite_f32 {
    1120              : 
    1121              :     /// A totally ordered f32 subset we can use with sorting functions.
    1122              :     #[derive(Clone, Copy, PartialEq)]
    1123              :     pub struct FiniteF32(f32);
    1124              : 
    1125              :     impl std::fmt::Debug for FiniteF32 {
    1126            0 :         fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1127            0 :             std::fmt::Debug::fmt(&self.0, f)
    1128            0 :         }
    1129              :     }
    1130              : 
    1131              :     impl std::fmt::Display for FiniteF32 {
    1132            0 :         fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1133            0 :             std::fmt::Display::fmt(&self.0, f)
    1134            0 :         }
    1135              :     }
    1136              : 
    1137              :     impl std::cmp::Eq for FiniteF32 {}
    1138              : 
    1139              :     impl std::cmp::PartialOrd for FiniteF32 {
    1140            0 :         fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
    1141            0 :             Some(self.cmp(other))
    1142            0 :         }
    1143              :     }
    1144              : 
    1145              :     impl std::cmp::Ord for FiniteF32 {
    1146            0 :         fn cmp(&self, other: &Self) -> std::cmp::Ordering {
    1147            0 :             self.0.total_cmp(&other.0)
    1148            0 :         }
    1149              :     }
    1150              : 
    1151              :     impl TryFrom<f32> for FiniteF32 {
    1152              :         type Error = f32;
    1153              : 
    1154            0 :         fn try_from(value: f32) -> Result<Self, Self::Error> {
    1155            0 :             if value.is_finite() {
    1156            0 :                 Ok(FiniteF32(value))
    1157              :             } else {
    1158            0 :                 Err(value)
    1159              :             }
    1160            0 :         }
    1161              :     }
    1162              : 
    1163              :     impl From<FiniteF32> for f32 {
    1164           20 :         fn from(value: FiniteF32) -> f32 {
    1165           20 :             value.0
    1166           20 :         }
    1167              :     }
    1168              : 
    1169              :     impl FiniteF32 {
    1170              :         pub const ZERO: FiniteF32 = FiniteF32(0.0);
    1171              : 
    1172           20 :         pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
    1173           20 :             if (0.0..=1.0).contains(&value) {
    1174              :                 // -0.0 is within the range, make sure it is assumed 0.0..=1.0
    1175           20 :                 let value = value.abs();
    1176           20 :                 Ok(FiniteF32(value))
    1177              :             } else {
    1178            0 :                 Err(value)
    1179              :             }
    1180           20 :         }
    1181              : 
    1182           20 :         pub fn into_inner(self) -> f32 {
    1183           20 :             self.into()
    1184           20 :         }
    1185              :     }
    1186              : }
    1187              : 
    1188              : mod filesystem_level_usage {
    1189              :     use anyhow::Context;
    1190              :     use camino::Utf8Path;
    1191              : 
    1192              :     use super::DiskUsageEvictionTaskConfig;
    1193              :     use crate::statvfs::Statvfs;
    1194              : 
    1195              :     #[derive(Debug, Clone, Copy)]
    1196              :     pub struct Usage<'a> {
    1197              :         config: &'a DiskUsageEvictionTaskConfig,
    1198              : 
    1199              :         /// Filesystem capacity
    1200              :         total_bytes: u64,
    1201              :         /// Free filesystem space
    1202              :         avail_bytes: u64,
    1203              :     }
    1204              : 
    1205              :     impl super::Usage for Usage<'_> {
    1206            7 :         fn has_pressure(&self) -> bool {
    1207            7 :             let usage_pct =
    1208            7 :                 (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
    1209              : 
    1210            7 :             let pressures = [
    1211            7 :                 (
    1212            7 :                     "min_avail_bytes",
    1213            7 :                     self.avail_bytes < self.config.min_avail_bytes,
    1214            7 :                 ),
    1215            7 :                 (
    1216            7 :                     "max_usage_pct",
    1217            7 :                     usage_pct >= self.config.max_usage_pct.get() as u64,
    1218            7 :                 ),
    1219            7 :             ];
    1220              : 
    1221            7 :             pressures.into_iter().any(|(_, has_pressure)| has_pressure)
    1222            7 :         }
    1223              : 
    1224            6 :         fn add_available_bytes(&mut self, bytes: u64) {
    1225            6 :             self.avail_bytes += bytes;
    1226            6 :         }
    1227              :     }
    1228              : 
    1229            0 :     pub fn get<'a>(
    1230            0 :         tenants_dir: &Utf8Path,
    1231            0 :         config: &'a DiskUsageEvictionTaskConfig,
    1232            0 :     ) -> anyhow::Result<Usage<'a>> {
    1233            0 :         let mock_config = {
    1234              :             #[cfg(feature = "testing")]
    1235              :             {
    1236            0 :                 config.mock_statvfs.as_ref()
    1237              :             }
    1238              :             #[cfg(not(feature = "testing"))]
    1239              :             {
    1240              :                 None
    1241              :             }
    1242              :         };
    1243              : 
    1244            0 :         let stat = Statvfs::get(tenants_dir, mock_config)
    1245            0 :             .context("statvfs failed, presumably directory got unlinked")?;
    1246              : 
    1247            0 :         let (avail_bytes, total_bytes) = stat.get_avail_total_bytes();
    1248              : 
    1249            0 :         Ok(Usage {
    1250            0 :             config,
    1251            0 :             total_bytes,
    1252            0 :             avail_bytes,
    1253            0 :         })
    1254            0 :     }
    1255              : 
    1256              :     #[test]
    1257            1 :     fn max_usage_pct_pressure() {
    1258              :         use std::time::Duration;
    1259              : 
    1260              :         use utils::serde_percent::Percent;
    1261              : 
    1262              :         use super::Usage as _;
    1263              : 
    1264            1 :         let mut usage = Usage {
    1265            1 :             config: &DiskUsageEvictionTaskConfig {
    1266            1 :                 max_usage_pct: Percent::new(85).unwrap(),
    1267            1 :                 min_avail_bytes: 0,
    1268            1 :                 period: Duration::MAX,
    1269            1 :                 #[cfg(feature = "testing")]
    1270            1 :                 mock_statvfs: None,
    1271            1 :                 eviction_order: pageserver_api::config::EvictionOrder::default(),
    1272            1 :                 enabled: true,
    1273            1 :             },
    1274            1 :             total_bytes: 100_000,
    1275            1 :             avail_bytes: 0,
    1276            1 :         };
    1277              : 
    1278            1 :         assert!(usage.has_pressure(), "expected pressure at 100%");
    1279              : 
    1280            1 :         usage.add_available_bytes(14_000);
    1281            1 :         assert!(usage.has_pressure(), "expected pressure at 86%");
    1282              : 
    1283            1 :         usage.add_available_bytes(999);
    1284            1 :         assert!(usage.has_pressure(), "expected pressure at 85.001%");
    1285              : 
    1286            1 :         usage.add_available_bytes(1);
    1287            1 :         assert!(usage.has_pressure(), "expected pressure at precisely 85%");
    1288              : 
    1289            1 :         usage.add_available_bytes(1);
    1290            1 :         assert!(!usage.has_pressure(), "no pressure at 84.999%");
    1291              : 
    1292            1 :         usage.add_available_bytes(999);
    1293            1 :         assert!(!usage.has_pressure(), "no pressure at 84%");
    1294              : 
    1295            1 :         usage.add_available_bytes(16_000);
    1296            1 :         assert!(!usage.has_pressure());
    1297            1 :     }
    1298              : }
    1299              : 
    1300              : #[cfg(test)]
    1301              : mod tests {
    1302              :     use super::*;
    1303              : 
    1304              :     #[test]
    1305            1 :     fn relative_equal_bounds() {
    1306            1 :         let order = EvictionOrder::RelativeAccessed {
    1307            1 :             highest_layer_count_loses_first: false,
    1308            1 :         };
    1309              : 
    1310            1 :         let len = 10;
    1311            1 :         let v = (0..len)
    1312           10 :             .map(|i| order.relative_last_activity(len, i).into_inner())
    1313            1 :             .collect::<Vec<_>>();
    1314              : 
    1315            1 :         assert_eq!(v.first(), Some(&1.0));
    1316            1 :         assert_eq!(v.last(), Some(&0.0));
    1317            9 :         assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
    1318            1 :     }
    1319              : 
    1320              :     #[test]
    1321            1 :     fn relative_spare_bounds() {
    1322            1 :         let order = EvictionOrder::RelativeAccessed {
    1323            1 :             highest_layer_count_loses_first: true,
    1324            1 :         };
    1325              : 
    1326            1 :         let len = 10;
    1327            1 :         let v = (0..len)
    1328           10 :             .map(|i| order.relative_last_activity(len, i).into_inner())
    1329            1 :             .collect::<Vec<_>>();
    1330              : 
    1331            1 :         assert_eq!(v.first(), Some(&1.0));
    1332            1 :         assert_eq!(v.last(), Some(&0.1));
    1333            9 :         assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
    1334            1 :     }
    1335              : }

Generated by: LCOV version 2.1-beta