LCOV - code coverage report
Current view: top level - pageserver/src - disk_usage_eviction_task.rs (source / functions)
Test:      c639aa5f7ab62b43d647b10f40d15a15686ce8a9.info
Test Date: 2024-02-12 20:26:03

              Coverage    Total    Hit
Lines:          80.5 %      620    499
Functions:      49.3 %      215    106

            Line data    Source code
       1              : //! This module implements the pageserver-global disk-usage-based layer eviction task.
       2              : //!
       3              : //! # Mechanics
       4              : //!
       5              : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
       6              : //! loop that evicts layers in response to a shortage of available bytes
       7              : //! in the $repo/tenants directory's filesystem.
       8              : //!
       9              : //! The loop runs periodically at a configurable `period`.
      10              : //!
      11              : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
      12              : //! It compares the returned usage data against two different types of thresholds.
      13              : //! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
      14              : //! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
      15              : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
      16              : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
      17              : //!
      18              : //! # Eviction Policy
      19              : //!
      20              : //! There are two thresholds:
       21              : //! `max_usage_pct` is the maximum allowed used space, expressed in percent of the total filesystem space.
       22              : //! If the actual usage is higher, the threshold is exceeded.
       23              : //! `min_avail_bytes` is the minimum required available space, in bytes.
       24              : //! If the actual available space is lower, the threshold is exceeded.
       25              : //! If either of these thresholds is exceeded, the system is considered to be under "disk pressure", and the
       26              : //! iteration evicts layers to release disk space and bring the usage back below the thresholds.
       27              : //! The iteration evicts layers in LRU fashion, but with a weak reservation per tenant.
      28              : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
      29              : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
      30              : //! start evicting layers that are part of the reservation, LRU first.
      31              : //!
      32              : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
       33              : //! throughout the code, but no actual variable carries that name.
       34              : //! The per-tenant default value is `max(tenant's layer file sizes)`, counting both local and remote layers.
      35              : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
      36              : //! during page reconstruction.
      37              : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
      38              : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
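
As a concrete illustration of the two thresholds above, here is a minimal sketch of a pressure check over statvfs-style total/available byte counts. The `Thresholds` struct and `has_pressure` function are illustrative only, not this module's code (which works through the `Usage` trait further down):

```rust
// Illustrative only: how `max_usage_pct` and `min_avail_bytes` combine into
// a "disk pressure" decision, given total/available bytes from statvfs.
struct Thresholds {
    max_usage_pct: u64,   // exceeded when used space, as % of total, is higher
    min_avail_bytes: u64, // exceeded when available bytes drop below this
}

fn has_pressure(total_bytes: u64, avail_bytes: u64, t: &Thresholds) -> bool {
    let usage_pct = 100 * (total_bytes - avail_bytes) / total_bytes;
    usage_pct > t.max_usage_pct || avail_bytes < t.min_avail_bytes
}

fn main() {
    let t = Thresholds { max_usage_pct: 85, min_avail_bytes: 10 * 1024 * 1024 * 1024 };
    // 95% used and only 5 GiB free: both thresholds exceeded => pressure.
    assert!(has_pressure(100 << 30, 5 << 30, &t));
    // 50% used with 50 GiB free: no pressure.
    assert!(!has_pressure(100 << 30, 50 << 30, &t));
}
```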
      39              : 
      40              : // Implementation notes:
       41              : // - The `#[allow(dead_code)]` attributes above various structs suppress warnings about only the Debug impl
      42              : //   reading these fields. We use the Debug impl for semi-structured logging, though.
      43              : 
      44              : use std::{
      45              :     sync::Arc,
      46              :     time::{Duration, SystemTime},
      47              : };
      48              : 
      49              : use anyhow::Context;
      50              : use pageserver_api::shard::TenantShardId;
      51              : use remote_storage::GenericRemoteStorage;
      52              : use serde::{Deserialize, Serialize};
      53              : use tokio::time::Instant;
      54              : use tokio_util::sync::CancellationToken;
      55              : use tracing::{debug, error, info, instrument, warn, Instrument};
      56              : use utils::serde_percent::Percent;
      57              : use utils::{completion, id::TimelineId};
      58              : 
      59              : use crate::{
      60              :     config::PageServerConf,
      61              :     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
      62              :     tenant::{
      63              :         self,
      64              :         mgr::TenantManager,
      65              :         remote_timeline_client::LayerFileMetadata,
      66              :         secondary::SecondaryTenant,
      67              :         storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
      68              :         Timeline,
      69              :     },
      70              : };
      71              : 
      72           64 : #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
      73              : pub struct DiskUsageEvictionTaskConfig {
      74              :     pub max_usage_pct: Percent,
      75              :     pub min_avail_bytes: u64,
      76              :     #[serde(with = "humantime_serde")]
      77              :     pub period: Duration,
      78              :     #[cfg(feature = "testing")]
      79              :     pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
      80              :     /// Select sorting for evicted layers
      81              :     #[serde(default)]
      82              :     pub eviction_order: EvictionOrder,
      83              : }
      84              : 
      85              : /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
      86              : /// partitioning.
      87           60 : #[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
      88              : #[serde(tag = "type", content = "args")]
      89              : pub enum EvictionOrder {
      90              :     /// Order the layers to be evicted by how recently they have been accessed in absolute
      91              :     /// time.
      92              :     ///
       93              :     /// This strategy is unfair towards slower-growing tenants when others grow faster: the fast
       94              :     /// growers' layers stay recently accessed, so the slow growers' layers are evicted first.
      95              :     #[default]
      96              :     AbsoluteAccessed,
      97              : 
      98              :     /// Order the layers to be evicted by how recently they have been accessed relatively within
      99              :     /// the set of resident layers of a tenant.
     100              :     RelativeAccessed {
      101              :         /// Determines if the tenant with the most layers should lose first.
      102              :         ///
      103              :         /// Having this enabled is currently the only reasonable option, because the order in which
      104              :         /// we read tenants is deterministic. If we ever need to set this to `false`, we need
      105              :         /// to ensure nondeterminism by adding a random number to break the
      106              :         /// `relative_last_activity==0.0` ties.
     107              :         #[serde(default = "default_highest_layer_count_loses_first")]
     108              :         highest_layer_count_loses_first: bool,
     109              :     },
     110              : }
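
The `#[serde(tag = "type", content = "args")]` attribute above gives `EvictionOrder` an adjacently tagged wire shape. A unit-test-style sketch of that shape, assuming it lives in this crate next to the enum and that `serde_json` is available for the illustration:

```rust
#[test]
fn eviction_order_wire_shape() {
    // Struct variant: the tag goes under "type", the fields under "args".
    let order: EvictionOrder = serde_json::from_value(serde_json::json!({
        "type": "RelativeAccessed",
        "args": { "highest_layer_count_loses_first": true }
    }))
    .unwrap();
    assert_eq!(
        order,
        EvictionOrder::RelativeAccessed { highest_layer_count_loses_first: true }
    );

    // Unit variant (the default): the tag alone is enough.
    let order: EvictionOrder =
        serde_json::from_value(serde_json::json!({ "type": "AbsoluteAccessed" })).unwrap();
    assert_eq!(order, EvictionOrder::AbsoluteAccessed);
}
```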
     111              : 
     112            0 : fn default_highest_layer_count_loses_first() -> bool {
     113            0 :     true
     114            0 : }
     115              : 
     116              : impl EvictionOrder {
     117           17 :     fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
     118           17 :         use EvictionOrder::*;
     119           17 : 
     120           17 :         match self {
     121           10 :             AbsoluteAccessed => {
     122         2822 :                 candidates.sort_unstable_by_key(|(partition, candidate)| {
     123         2822 :                     (*partition, candidate.last_activity_ts)
     124         2822 :                 });
     125           10 :             }
     126         2522 :             RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
     127         2522 :                 (*partition, candidate.relative_last_activity)
     128         2522 :             }),
     129              :         }
     130           17 :     }
     131              : 
      132              :     /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating a
      133              :     /// tenant's layers in **most** recently used order.
     134          617 :     fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
     135          617 :         use EvictionOrder::*;
     136          617 : 
     137          617 :         match self {
     138          331 :             AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
     139              :             RelativeAccessed {
     140          286 :                 highest_layer_count_loses_first,
     141              :             } => {
      142              :                 // whether we subtract 1 below (the "fudge") decides if every tenant should lose its least
      143              :                 // recently accessed layer OR if that should happen in order of highest layer count:
     144          286 :                 let fudge = if *highest_layer_count_loses_first {
     145              :                     // relative_last_activity vs. tenant layer count:
     146              :                     // - 0.1..=1.0 (10 layers)
     147              :                     // - 0.01..=1.0 (100 layers)
     148              :                     // - 0.001..=1.0 (1000 layers)
     149              :                     //
     150              :                     // leading to evicting less of the smallest tenants.
     151           86 :                     0
     152              :                 } else {
      153              :                     // use the full 0.0..=1.0 range, which means even the smallest tenants could always lose a
      154              :                     // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
      155              :                     // be that fewer than 10k layer evictions are enough, so we would not need to evict from
      156              :                     // all tenants.
      157              :                     //
      158              :                     // as the tenant ordering is now deterministic, this could hit the same tenants
      159              :                     // disproportionately on multiple invocations. an alternative could be to remember how many
      160              :                     // layers we evicted last time from this tenant, and inject that as an additional
      161              :                     // fudge here.
     162          200 :                     1
     163              :                 };
     164              : 
     165          286 :                 let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
     166          286 :                 let divider = total as f32;
     167          286 : 
     168          286 :                 // most recently used is always (total - 0) / divider == 1.0
     169          286 :                 // least recently used depends on the fudge:
     170          286 :                 // -       (total - 1) - (total - 1) / total => 0 / total
     171          286 :                 // -             total - (total - 1) / total => 1 / total
     172          286 :                 let distance = (total - index) as f32;
     173          286 : 
     174          286 :                 finite_f32::FiniteF32::try_from_normalized(distance / divider)
     175          286 :                     .unwrap_or_else(|val| {
     176            0 :                         tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
     177            0 :                         finite_f32::FiniteF32::ZERO
     178          286 :                     })
     179              :             }
     180              :         }
     181          617 :     }
     182              : }
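
A worked example of the `relative_last_activity` arithmetic above, replicated with plain `f32` (instead of the crate's `FiniteF32`) as a free function, for a tenant with 10 resident layers:

```rust
// Replicates the formula above; `relative` is a stand-in for
// `EvictionOrder::relative_last_activity`, not the method itself.
fn relative(total: usize, index: usize, highest_layer_count_loses_first: bool) -> f32 {
    let fudge = if highest_layer_count_loses_first { 0 } else { 1 };
    let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
    let divider = total as f32;
    (total - index) as f32 / divider
}

fn main() {
    // highest_layer_count_loses_first = true: a 10-layer tenant spans 0.1..=1.0, so its
    // least recently used layer (0.1) sorts after a 1000-layer tenant's tail layers
    // (~0.001), i.e. the tenant with the highest layer count loses layers first.
    assert_eq!(relative(10, 0, true), 1.0); // most recently used layer
    assert_eq!(relative(10, 9, true), 0.1); // least recently used layer

    // highest_layer_count_loses_first = false: the full 0.0..=1.0 range is used, so every
    // tenant's least recently used layer ties at 0.0 regardless of layer count.
    assert_eq!(relative(10, 0, false), 1.0);
    assert_eq!(relative(10, 9, false), 0.0);
}
```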
     183              : 
     184          624 : #[derive(Default)]
     185              : pub struct State {
      186              :     /// Prevents HTTP-triggered runs and the background task from running at the same time.
     187              :     mutex: tokio::sync::Mutex<()>,
     188              : }
     189              : 
     190          624 : pub fn launch_disk_usage_global_eviction_task(
     191          624 :     conf: &'static PageServerConf,
     192          624 :     storage: GenericRemoteStorage,
     193          624 :     state: Arc<State>,
     194          624 :     tenant_manager: Arc<TenantManager>,
     195          624 :     background_jobs_barrier: completion::Barrier,
     196          624 : ) -> anyhow::Result<()> {
     197          624 :     let Some(task_config) = &conf.disk_usage_based_eviction else {
     198          620 :         info!("disk usage based eviction task not configured");
     199          620 :         return Ok(());
     200              :     };
     201              : 
     202            4 :     info!("launching disk usage based eviction task");
     203              : 
     204            4 :     task_mgr::spawn(
     205            4 :         BACKGROUND_RUNTIME.handle(),
     206            4 :         TaskKind::DiskUsageEviction,
     207            4 :         None,
     208            4 :         None,
     209            4 :         "disk usage based eviction",
     210            4 :         false,
     211            4 :         async move {
     212            4 :             let cancel = task_mgr::shutdown_token();
     213            4 : 
     214            4 :             // wait until initial load is complete, because we cannot evict from loading tenants.
     215            8 :             tokio::select! {
     216            8 :                 _ = cancel.cancelled() => { return Ok(()); },
     217            8 :                 _ = background_jobs_barrier.wait() => { }
     218            8 :             };
     219              : 
     220          107 :             disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
     221            0 :             Ok(())
     222            4 :         },
     223            4 :     );
     224            4 : 
     225            4 :     Ok(())
     226          624 : }
     227              : 
     228            8 : #[instrument(skip_all)]
     229              : async fn disk_usage_eviction_task(
     230              :     state: &State,
     231              :     task_config: &DiskUsageEvictionTaskConfig,
     232              :     storage: &GenericRemoteStorage,
     233              :     tenant_manager: Arc<TenantManager>,
     234              :     cancel: CancellationToken,
     235              : ) {
     236            0 :     scopeguard::defer! {
     237            0 :         info!("disk usage based eviction task finishing");
     238              :     };
     239              : 
     240              :     use crate::tenant::tasks::random_init_delay;
     241              :     {
     242              :         if random_init_delay(task_config.period, &cancel)
     243              :             .await
     244              :             .is_err()
     245              :         {
     246              :             return;
     247              :         }
     248              :     }
     249              : 
     250              :     let mut iteration_no = 0;
     251              :     loop {
     252              :         iteration_no += 1;
     253              :         let start = Instant::now();
     254              : 
     255            7 :         async {
     256            7 :             let res = disk_usage_eviction_task_iteration(
     257            7 :                 state,
     258            7 :                 task_config,
     259            7 :                 storage,
     260            7 :                 &tenant_manager,
     261            7 :                 &cancel,
     262            7 :             )
     263          100 :             .await;
     264              : 
     265            7 :             match res {
     266            6 :                 Ok(()) => {}
     267            1 :                 Err(e) => {
     268            1 :                     // these stat failures are expected to be very rare
     269            1 :                     warn!("iteration failed, unexpected error: {e:#}");
     270              :                 }
     271              :             }
     272            7 :         }
     273              :         .instrument(tracing::info_span!("iteration", iteration_no))
     274              :         .await;
     275              : 
     276              :         let sleep_until = start + task_config.period;
     277              :         if tokio::time::timeout_at(sleep_until, cancel.cancelled())
     278              :             .await
     279              :             .is_ok()
     280              :         {
     281              :             break;
     282              :         }
     283              :     }
     284              : }
     285              : 
     286              : pub trait Usage: Clone + Copy + std::fmt::Debug {
     287              :     fn has_pressure(&self) -> bool;
     288              :     fn add_available_bytes(&mut self, bytes: u64);
     289              : }
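
A toy implementation of `Usage`, illustrating how the planning phase below uses the trait: each planned eviction is fed into `add_available_bytes`, and selection stops once `has_pressure` turns false. This sketch models only the absolute `min_avail_bytes` threshold; the values actually used by the task come from `filesystem_level_usage::get`, as seen further down:

```rust
// Hypothetical test-double for the `Usage` trait above.
#[derive(Debug, Clone, Copy)]
struct ToyUsage {
    avail_bytes: u64,
    min_avail_bytes: u64,
}

impl Usage for ToyUsage {
    fn has_pressure(&self) -> bool {
        // Only the absolute threshold, for brevity.
        self.avail_bytes < self.min_avail_bytes
    }

    fn add_available_bytes(&mut self, bytes: u64) {
        // Planning: pretend this layer's bytes have already been freed.
        self.avail_bytes += bytes;
    }
}
```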
     290              : 
     291            7 : async fn disk_usage_eviction_task_iteration(
     292            7 :     state: &State,
     293            7 :     task_config: &DiskUsageEvictionTaskConfig,
     294            7 :     storage: &GenericRemoteStorage,
     295            7 :     tenant_manager: &Arc<TenantManager>,
     296            7 :     cancel: &CancellationToken,
     297            7 : ) -> anyhow::Result<()> {
     298            7 :     let tenants_dir = tenant_manager.get_conf().tenants_path();
     299            7 :     let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
     300            7 :         .context("get filesystem-level disk usage before evictions")?;
     301            6 :     let res = disk_usage_eviction_task_iteration_impl(
     302            6 :         state,
     303            6 :         storage,
     304            6 :         usage_pre,
     305            6 :         tenant_manager,
     306            6 :         task_config.eviction_order,
     307            6 :         cancel,
     308            6 :     )
     309          100 :     .await;
     310            6 :     match res {
     311            6 :         Ok(outcome) => {
     312            0 :             debug!(?outcome, "disk_usage_eviction_iteration finished");
     313            6 :             match outcome {
     314            2 :                 IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
     315            2 :                     // nothing to do, select statement below will handle things
     316            2 :                 }
     317            4 :                 IterationOutcome::Finished(outcome) => {
     318              :                     // Verify with statvfs whether we made any real progress
     319            4 :                     let after = filesystem_level_usage::get(&tenants_dir, task_config)
     320            4 :                         // It's quite unlikely to hit the error here. Keep the code simple and bail out.
     321            4 :                         .context("get filesystem-level disk usage after evictions")?;
     322              : 
     323            0 :                     debug!(?after, "disk usage");
     324              : 
     325            4 :                     if after.has_pressure() {
     326              :                         // Don't bother doing an out-of-order iteration here now.
     327              :                         // In practice, the task period is set to a value in the tens-of-seconds range,
     328              :                         // which will cause another iteration to happen soon enough.
     329              :                         // TODO: deltas between the three different usages would be helpful,
     330              :                         // consider MiB, GiB, TiB
     331            0 :                         warn!(?outcome, ?after, "disk usage still high");
     332              :                     } else {
     333            4 :                         info!(?outcome, ?after, "disk usage pressure relieved");
     334              :                     }
     335              :                 }
     336              :             }
     337              :         }
     338            0 :         Err(e) => {
     339            0 :             error!("disk_usage_eviction_iteration failed: {:#}", e);
     340              :         }
     341              :     }
     342              : 
     343            6 :     Ok(())
     344            7 : }
     345              : 
     346           13 : #[derive(Debug, Serialize)]
     347              : #[allow(clippy::large_enum_variant)]
     348              : pub enum IterationOutcome<U> {
     349              :     NoPressure,
     350              :     Cancelled,
     351              :     Finished(IterationOutcomeFinished<U>),
     352              : }
     353              : 
     354              : #[allow(dead_code)]
     355           17 : #[derive(Debug, Serialize)]
     356              : pub struct IterationOutcomeFinished<U> {
     357              :     /// The actual usage observed before we started the iteration.
     358              :     before: U,
     359              :     /// The expected value for `after`, according to internal accounting, after phase 1.
     360              :     planned: PlannedUsage<U>,
     361              :     /// The outcome of phase 2, where we actually do the evictions.
     362              :     ///
     363              :     /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
     364              :     /// be the same as `planned`.
     365              :     assumed: AssumedUsage<U>,
     366              : }
     367              : 
     368           17 : #[derive(Debug, Serialize)]
     369              : #[allow(dead_code)]
     370              : struct AssumedUsage<U> {
     371              :     /// The expected value for `after`, after phase 2.
     372              :     projected_after: U,
     373              :     /// The layers we failed to evict during phase 2.
     374              :     failed: LayerCount,
     375              : }
     376              : 
     377              : #[allow(dead_code)]
     378           17 : #[derive(Debug, Serialize)]
     379              : struct PlannedUsage<U> {
     380              :     respecting_tenant_min_resident_size: U,
     381              :     fallback_to_global_lru: Option<U>,
     382              : }
     383              : 
     384              : #[allow(dead_code)]
     385           17 : #[derive(Debug, Default, Serialize)]
     386              : struct LayerCount {
     387              :     file_sizes: u64,
     388              :     count: usize,
     389              : }
     390              : 
     391           19 : pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     392           19 :     state: &State,
     393           19 :     _storage: &GenericRemoteStorage,
     394           19 :     usage_pre: U,
     395           19 :     tenant_manager: &Arc<TenantManager>,
     396           19 :     eviction_order: EvictionOrder,
     397           19 :     cancel: &CancellationToken,
     398           19 : ) -> anyhow::Result<IterationOutcome<U>> {
     399              :     // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
     400           19 :     let _g = state
     401           19 :         .mutex
     402           19 :         .try_lock()
     403           19 :         .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
     404              : 
     405            0 :     debug!(?usage_pre, "disk usage");
     406              : 
     407           19 :     if !usage_pre.has_pressure() {
     408            2 :         return Ok(IterationOutcome::NoPressure);
     409           17 :     }
     410              : 
     411           17 :     warn!(
     412           17 :         ?usage_pre,
     413           17 :         "running disk usage based eviction due to pressure"
     414           17 :     );
     415              : 
     416           17 :     let candidates =
     417           25 :         match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
     418              :             EvictionCandidates::Cancelled => {
     419            0 :                 return Ok(IterationOutcome::Cancelled);
     420              :             }
     421           17 :             EvictionCandidates::Finished(partitioned) => partitioned,
     422           17 :         };
     423           17 : 
     424           17 :     // Debug-log the list of candidates
     425           17 :     let now = SystemTime::now();
     426          577 :     for (i, (partition, candidate)) in candidates.iter().enumerate() {
     427          577 :         let nth = i + 1;
     428          577 :         let total_candidates = candidates.len();
     429          577 :         let size = candidate.layer.get_file_size();
     430          577 :         let rel = candidate.relative_last_activity;
     431            0 :         debug!(
     432            0 :             "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
     433            0 :             now.duration_since(candidate.last_activity_ts)
     434            0 :                 .unwrap()
     435            0 :                 .as_micros(),
     436            0 :             candidate.layer.get_tenant_shard_id(),
     437            0 :             candidate.layer.get_timeline_id(),
     438            0 :             candidate.layer.get_name(),
     439            0 :         );
     440              :     }
     441              : 
     442              :     // phase1: select victims to relieve pressure
     443              :     //
     444              :     // Walk through the list of candidates, until we have accumulated enough layers to get
     445              :     // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
     446              :     // how much disk space would be used after evicting all the layers up to the current
     447              :     // point in the list.
     448              :     //
     449              :     // If we get far enough in the list that we start to evict layers that are below
     450              :     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
     451              :     // usage at that point, in 'usage_planned_min_resident_size_respecting'.
     452              : 
     453           17 :     let selection = select_victims(&candidates, usage_pre);
     454           17 : 
     455           17 :     let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
     456           17 : 
     457           17 :     // phase2: evict layers
     458           17 : 
     459           17 :     let mut js = tokio::task::JoinSet::new();
     460           17 :     let limit = 1000;
     461           17 : 
     462           17 :     let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
     463           17 :     let mut consumed_all = false;
     464           17 : 
     465           17 :     // After the evictions, `usage_assumed` is the post-eviction usage,
     466           17 :     // according to internal accounting.
     467           17 :     let mut usage_assumed = usage_pre;
     468           17 :     let mut evictions_failed = LayerCount::default();
     469           17 : 
     470           17 :     let evict_layers = async move {
     471              :         loop {
     472          524 :             let next = if js.len() >= limit || consumed_all {
     473          224 :                 js.join_next().await
     474          300 :             } else if !js.is_empty() {
     475              :                 // opportunistically consume ready result, one per each new evicted
     476          283 :                 futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
     477              :             } else {
     478           17 :                 None
     479              :             };
     480              : 
     481          524 :             if let Some(next) = next {
     482            0 :                 match next {
     483          283 :                     Ok(Ok(file_size)) => {
     484          283 :                         usage_assumed.add_available_bytes(file_size);
     485          283 :                     }
     486            0 :                     Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
     487            0 :                         evictions_failed.file_sizes += file_size;
     488            0 :                         evictions_failed.count += 1;
     489            0 :                     }
     490            0 :                     Err(je) if je.is_cancelled() => unreachable!("not used"),
     491            0 :                     Err(je) if je.is_panic() => { /* already logged */ }
     492            0 :                     Err(je) => tracing::error!("unknown JoinError: {je:?}"),
     493              :                 }
     494          241 :             }
     495              : 
     496          524 :             if consumed_all && js.is_empty() {
     497           17 :                 break;
     498          507 :             }
     499              : 
     500              :             // calling again when consumed_all is fine as evicted is fused.
     501          507 :             let Some((_partition, candidate)) = evicted.next() else {
     502          224 :                 consumed_all = true;
     503          224 :                 continue;
     504              :             };
     505              : 
     506          283 :             match candidate.layer {
     507          270 :                 EvictionLayer::Attached(layer) => {
     508          270 :                     let file_size = layer.layer_desc().file_size;
     509          270 :                     js.spawn(async move {
     510          270 :                         layer
     511          270 :                             .evict_and_wait()
     512          270 :                             .await
     513          270 :                             .map(|()| file_size)
     514          270 :                             .map_err(|e| (file_size, e))
     515          270 :                     });
     516          270 :                 }
     517           13 :                 EvictionLayer::Secondary(layer) => {
     518           13 :                     let file_size = layer.metadata.file_size();
     519           13 :                     let tenant_manager = tenant_manager.clone();
     520           13 : 
     521           13 :                     js.spawn(async move {
     522           13 :                         layer
     523           13 :                             .secondary_tenant
     524           13 :                             .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
     525           13 :                             .await;
     526           13 :                         Ok(file_size)
     527           13 :                     });
     528           13 :                 }
     529              :             }
     530          283 :             tokio::task::yield_now().await;
     531              :         }
     532              : 
     533           17 :         (usage_assumed, evictions_failed)
     534           17 :     };
     535              : 
     536           17 :     let (usage_assumed, evictions_failed) = tokio::select! {
     537           17 :         tuple = evict_layers => { tuple },
     538              :         _ = cancel.cancelled() => {
     539              :             // dropping joinset will abort all pending evict_and_waits and that is fine, our
     540              :             // requests will still stand
     541              :             return Ok(IterationOutcome::Cancelled);
     542              :         }
     543              :     };
     544              : 
     545           17 :     Ok(IterationOutcome::Finished(IterationOutcomeFinished {
     546           17 :         before: usage_pre,
     547           17 :         planned: usage_planned,
     548           17 :         assumed: AssumedUsage {
     549           17 :             projected_after: usage_assumed,
     550           17 :             failed: evictions_failed,
     551           17 :         },
     552           17 :     }))
     553           19 : }
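
The phase-2 loop above keeps at most `limit` evictions in flight in a `JoinSet` and opportunistically reaps finished ones with `now_or_never`. A self-contained sketch of the same pattern over dummy work items (assuming `tokio` and `futures` as dependencies):

```rust
use futures::future::FutureExt;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let limit = 4;
    let mut js: JoinSet<u64> = JoinSet::new();
    let mut work = (0..16u64).fuse();
    let mut consumed_all = false;
    let mut total = 0;

    loop {
        // Wait for a slot only when saturated or out of new work;
        // otherwise just reap a task that happens to be finished already.
        let next = if js.len() >= limit || consumed_all {
            js.join_next().await
        } else if !js.is_empty() {
            js.join_next().now_or_never().and_then(|x| x)
        } else {
            None
        };

        if let Some(Ok(n)) = next {
            total += n;
        }

        if consumed_all && js.is_empty() {
            break;
        }

        // Calling again after `consumed_all` is fine because `work` is fused.
        let Some(n) = work.next() else {
            consumed_all = true;
            continue;
        };
        js.spawn(async move { n });
    }

    assert_eq!(total, (0..16u64).sum::<u64>());
}
```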
     554              : 
     555            0 : #[derive(Clone)]
     556              : pub(crate) struct EvictionSecondaryLayer {
     557              :     pub(crate) secondary_tenant: Arc<SecondaryTenant>,
     558              :     pub(crate) timeline_id: TimelineId,
     559              :     pub(crate) name: LayerFileName,
     560              :     pub(crate) metadata: LayerFileMetadata,
     561              : }
     562              : 
     563              : /// Full [`Layer`] objects are specific to tenants in attached mode.  This type is a layer
     564              : /// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
     565            0 : #[derive(Clone)]
     566              : pub(crate) enum EvictionLayer {
     567              :     Attached(Layer),
     568              :     #[allow(dead_code)]
     569              :     Secondary(EvictionSecondaryLayer),
     570              : }
     571              : 
     572              : impl From<Layer> for EvictionLayer {
     573          539 :     fn from(value: Layer) -> Self {
     574          539 :         Self::Attached(value)
     575          539 :     }
     576              : }
     577              : 
     578              : impl EvictionLayer {
     579            0 :     pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
     580            0 :         match self {
     581            0 :             Self::Attached(l) => &l.layer_desc().tenant_shard_id,
     582            0 :             Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
     583              :         }
     584            0 :     }
     585              : 
     586            0 :     pub(crate) fn get_timeline_id(&self) -> &TimelineId {
     587            0 :         match self {
     588            0 :             Self::Attached(l) => &l.layer_desc().timeline_id,
     589            0 :             Self::Secondary(sl) => &sl.timeline_id,
     590              :         }
     591            0 :     }
     592              : 
     593            0 :     pub(crate) fn get_name(&self) -> LayerFileName {
     594            0 :         match self {
     595            0 :             Self::Attached(l) => l.layer_desc().filename(),
     596            0 :             Self::Secondary(sl) => sl.name.clone(),
     597              :         }
     598            0 :     }
     599              : 
     600         1437 :     pub(crate) fn get_file_size(&self) -> u64 {
     601         1437 :         match self {
     602         1348 :             Self::Attached(l) => l.layer_desc().file_size,
     603           89 :             Self::Secondary(sl) => sl.metadata.file_size(),
     604              :         }
     605         1437 :     }
     606              : }
     607              : 
     608            0 : #[derive(Clone)]
     609              : pub(crate) struct EvictionCandidate {
     610              :     pub(crate) layer: EvictionLayer,
     611              :     pub(crate) last_activity_ts: SystemTime,
     612              :     pub(crate) relative_last_activity: finite_f32::FiniteF32,
     613              : }
     614              : 
     615              : impl std::fmt::Display for EvictionLayer {
     616            0 :     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
     617            0 :         match self {
     618            0 :             Self::Attached(l) => l.fmt(f),
     619            0 :             Self::Secondary(sl) => {
     620            0 :                 write!(f, "{}/{}", sl.timeline_id, sl.name)
     621              :             }
     622              :         }
     623            0 :     }
     624              : }
     625              : 
     626            2 : #[derive(Default)]
     627              : pub(crate) struct DiskUsageEvictionInfo {
     628              :     /// Timeline's largest layer (remote or resident)
     629              :     pub max_layer_size: Option<u64>,
     630              :     /// Timeline's resident layers
     631              :     pub resident_layers: Vec<EvictionCandidate>,
     632              : }
     633              : 
     634              : impl std::fmt::Debug for EvictionCandidate {
     635            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     636            0 :         // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
     637            0 :         // having to allocate a string to this is bad, but it will rarely be formatted
     638            0 :         let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
     639            0 :         let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
     640            0 :         struct DisplayIsDebug<'a, T>(&'a T);
     641            0 :         impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
     642            0 :             fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     643            0 :                 write!(f, "{}", self.0)
     644            0 :             }
     645            0 :         }
     646            0 :         f.debug_struct("LocalLayerInfoForDiskUsageEviction")
     647            0 :             .field("layer", &DisplayIsDebug(&self.layer))
     648            0 :             .field("last_activity", &ts)
     649            0 :             .finish()
     650            0 :     }
     651              : }
     652              : 
     653         2689 : #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
     654              : enum MinResidentSizePartition {
     655              :     Above,
     656              :     Below,
     657              : }
     658              : 
     659              : enum EvictionCandidates {
     660              :     Cancelled,
     661              :     Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
     662              : }
     663              : 
     664              : /// Gather the eviction candidates.
     665              : ///
     666              : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
     667              : /// order. A caller that evicts in that order, until pressure is relieved, implements
     668              : /// the eviction policy outlined in the module comment.
     669              : ///
     670              : /// # Example with EvictionOrder::AbsoluteAccessed
     671              : ///
     672              : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
     673              : /// Each layer has size 100, and both tenant's min_resident_size is 150.
     674              : /// The eviction order would be
     675              : ///
     676              : /// ```text
     677              : /// partition last_activity_ts tenant/layer
     678              : /// Above     18:30            A/c
     679              : /// Above     19:00            A/b
     680              : /// Above     18:29            B/c
     681              : /// Above     19:05            B/b
     682              : /// Above     20:00            B/a
     683              : /// Above     20:03            A/a
     684              : /// Below     20:30            A/d
     685              : /// Below     20:40            B/d
     686              : /// Below     20:45            B/e
     687              : /// Below     20:58            A/e
     688              : /// ```
     689              : ///
     690              : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
     691              : /// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
     692              : ///
     693              : /// But, if we need to evict 900 bytes to relieve pressure, we'd evict
     694              : /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
     695              : /// after exhauting the `Above` partition.
     696              : /// So, we did not respect each tenant's min_resident_size.
     697              : ///
     698              : /// # Example with EvictionOrder::RelativeAccessed
     699              : ///
     700              : /// ```text
     701              : /// partition relative_age last_activity_ts tenant/layer
     702              : /// Above     0/4          18:30            A/c
     703              : /// Above     0/4          18:29            B/c
     704              : /// Above     1/4          19:00            A/b
     705              : /// Above     1/4          19:05            B/b
     706              : /// Above     2/4          20:00            B/a
     707              : /// Above     2/4          20:03            A/a
     708              : /// Below     3/4          20:30            A/d
     709              : /// Below     3/4          20:40            B/d
     710              : /// Below     4/4          20:45            B/e
     711              : /// Below     4/4          20:58            A/e
     712              : /// ```
     713              : ///
     714              : /// With tenants having the same number of layers the picture does not change much. The same with
     715              : /// A having many more layers **resident** (not all of them listed):
     716              : ///
     717              : /// ```text
     718              : /// Above       0/100      18:30            A/c
     719              : /// Above       0/4        18:29            B/c
     720              : /// Above       1/100      19:00            A/b
     721              : /// Above       2/100      20:03            A/a
     722              : /// Above       3/100      20:03            A/nth_3
     723              : /// Above       4/100      20:03            A/nth_4
     724              : ///             ...
     725              : /// Above       1/4        19:05            B/b
     726              : /// Above      25/100      20:04            A/nth_25
     727              : ///             ...
     728              : /// Above       2/4        20:00            B/a
     729              : /// Above      50/100      20:10            A/nth_50
     730              : ///             ...
     731              : /// Below       3/4        20:40            B/d
     732              : /// Below      99/100      20:30            A/nth_99
     733              : /// Below       4/4        20:45            B/e
     734              : /// Below     100/100      20:58            A/nth_100
     735              : /// ```
     736              : ///
     737              : /// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
     738              : /// difficult to see is what happens on the next round assuming the evicting 23 from the above list
     739              : /// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
     740              : /// appeared:
     741              : ///
     742              : /// ```text
     743              : /// Above       0/87       20:04            A/nth_23
     744              : /// Above       0/3        19:05            B/b
     745              : /// Above       0/50       20:59            C/nth_0
     746              : /// Above       1/87       20:04            A/nth_24
     747              : /// Above       1/50       21:00            C/nth_1
     748              : /// Above       2/87       20:04            A/nth_25
     749              : ///             ...
     750              : /// Above      16/50       21:02            C/nth_16
     751              : /// Above       1/3        20:00            B/a
     752              : /// Above      27/87       20:10            A/nth_50
     753              : ///             ...
     754              : /// Below       2/3        20:40            B/d
     755              : /// Below      49/50       21:05            C/nth_49
     756              : /// Below      86/87       20:30            A/nth_99
     757              : /// Below       3/3        20:45            B/e
     758              : /// Below      50/50       21:05            C/nth_50
     759              : /// Below      87/87       20:58            A/nth_100
     760              : /// ```
     761              : ///
     762              : /// Now relieving pressure with 23 layers would cost:
     763              : /// - tenant A 14 layers
     764              : /// - tenant B 1 layer
     765              : /// - tenant C 8 layers
     766           17 : async fn collect_eviction_candidates(
     767           17 :     tenant_manager: &Arc<TenantManager>,
     768           17 :     eviction_order: EvictionOrder,
     769           17 :     cancel: &CancellationToken,
     770           17 : ) -> anyhow::Result<EvictionCandidates> {
     771              :     // get a snapshot of the list of tenants
     772           17 :     let tenants = tenant::mgr::list_tenants()
     773            0 :         .await
     774           17 :         .context("get list of tenants")?;
     775              : 
     776              :     // TODO: avoid listing every layer in every tenant: this loop can block the executor,
     777              :     // and the resulting data structure can be huge.
     778              :     // (https://github.com/neondatabase/neon/issues/6224)
     779           17 :     let mut candidates = Vec::new();
     780              : 
     781           55 :     for (tenant_id, _state, _gen) in tenants {
     782           38 :         if cancel.is_cancelled() {
     783            0 :             return Ok(EvictionCandidates::Cancelled);
     784           38 :         }
     785           38 :         let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
     786           37 :             Ok(tenant) => tenant,
     787            1 :             Err(e) => {
     788              :                 // this can happen if tenant has lifecycle transition after we fetched it
     789            0 :                 debug!("failed to get tenant: {e:#}");
     790            1 :                 continue;
     791              :             }
     792              :         };
     793              : 
     794           37 :         if tenant.cancel.is_cancelled() {
     795            0 :             info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
     796            0 :             continue;
     797           37 :         }
     798           37 : 
     799           37 :         // collect layers from all timelines in this tenant
     800           37 :         //
     801           37 :         // If one of the timelines becomes `!is_active()` during the iteration,
     802           37 :         // for example because we're shutting down, then `max_layer_size` can be too small.
     803           37 :         // That's OK. This code only runs under a disk pressure situation, and being
     804           37 :         // a little unfair to tenants during shutdown in such a situation is tolerable.
     805           37 :         let mut tenant_candidates = Vec::new();
     806           37 :         let mut max_layer_size = 0;
     807           37 :         for tl in tenant.list_timelines() {
     808           37 :             if !tl.is_active() {
     809            0 :                 continue;
     810           37 :             }
     811           37 :             let info = tl.get_local_layers_for_disk_usage_eviction().await;
     812            0 :             debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
     813           37 :             tenant_candidates.extend(info.resident_layers.into_iter());
     814           37 :             max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
     815           37 : 
     816           37 :             if cancel.is_cancelled() {
     817            0 :                 return Ok(EvictionCandidates::Cancelled);
     818           37 :             }
     819              :         }
     820              : 
     821              :         // `min_resident_size` defaults to maximum layer file size of the tenant.
     822              :         // This ensures that each tenant can have at least one layer resident at a given time,
     823              :         // ensuring forward progress for a single Timeline::get in that tenant.
     824              :         // It's a questionable heuristic since, usually, there are many Timeline::get
     825              :         // requests going on for a tenant, and, at least in Neon prod, the median
     826              :         // layer file size is much smaller than the compaction target size.
     827              :         // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
     828              :         // That's what's typically used by the various background loops.
     829              :         //
     830              :         // The default can be overridden with a fixed value in the tenant conf.
     831              :         // A default override can be put in the default tenant conf in the pageserver.toml.
     832           37 :         let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
     833            0 :             debug!(
     834            0 :                 tenant_id=%tenant.tenant_shard_id().tenant_id,
     835            0 :                 shard_id=%tenant.tenant_shard_id().shard_slug(),
     836            0 :                 overridden_size=s,
     837            0 :                 "using overridden min resident size for tenant"
     838            0 :             );
     839            4 :             s
     840              :         } else {
     841            0 :             debug!(
     842            0 :                 tenant_id=%tenant.tenant_shard_id().tenant_id,
     843            0 :                 shard_id=%tenant.tenant_shard_id().shard_slug(),
     844            0 :                 max_layer_size,
     845            0 :                 "using max layer size as min_resident_size for tenant",
     846            0 :             );
     847           33 :             max_layer_size
     848              :         };
     849              : 
     850              :         // Sort layers most-recently-used first, then partition by
     851              :         // cumsum above/below min_resident_size.
     852           37 :         tenant_candidates
     853         4874 :             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
     854           37 :         let mut cumsum: i128 = 0;
     855           37 : 
     856           37 :         let total = tenant_candidates.len();
     857           37 : 
     858           37 :         let tenant_candidates =
     859           37 :             tenant_candidates
     860           37 :                 .into_iter()
     861           37 :                 .enumerate()
     862          539 :                 .map(|(i, mut candidate)| {
     863          539 :                     // as we iterate this reverse sorted list, the most recently accessed layer will always
     864          539 :                     // be 1.0; this is for us to evict it last.
     865          539 :                     candidate.relative_last_activity =
     866          539 :                         eviction_order.relative_last_activity(total, i);
     867              : 
     868          539 :                     let partition = if cumsum > min_resident_size as i128 {
     869          408 :                         MinResidentSizePartition::Above
     870              :                     } else {
     871          131 :                         MinResidentSizePartition::Below
     872              :                     };
     873          539 :                     cumsum += i128::from(candidate.layer.get_file_size());
     874          539 : 
     875          539 :                     (partition, candidate)
     876          539 :                 });
     877           37 : 
     878           37 :         candidates.extend(tenant_candidates);
     879              :     }
     880              : 
     881              :     // Note: the same tenant ID might be hit twice, if it transitions from attached to
     882              :     // secondary while we run.  That is okay: when we eventually try and run the eviction,
     883              :     // the `Gate` on the object will ensure that whichever one has already been shut down
     884              :     // will not delete anything.
     885              : 
     886           17 :     let mut secondary_tenants = Vec::new();
     887           17 :     tenant_manager.foreach_secondary_tenants(
     888           17 :         |_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
     889            2 :             secondary_tenants.push(state.clone());
     890           17 :         },
     891           17 :     );
     892              : 
     893           19 :     for secondary_tenant in secondary_tenants {
     894              :         // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
     895              :         // to prevent repeated disk usage based evictions from completely draining less often
     896              :         // updating secondaries.
     897            2 :         let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
     898              : 
     899              :         debug_assert!(
     900            2 :             total_layers >= layer_info.resident_layers.len(),
     901            0 :             "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
     902            0 :             layer_info.resident_layers.len()
     903              :         );
     904              : 
     905            2 :         layer_info
     906            2 :             .resident_layers
     907          158 :             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
     908            2 : 
     909            2 :         let tenant_candidates =
     910            2 :             layer_info
     911            2 :                 .resident_layers
     912            2 :                 .into_iter()
     913            2 :                 .enumerate()
     914           38 :                 .map(|(i, mut candidate)| {
     915           38 :                     candidate.relative_last_activity =
     916           38 :                         eviction_order.relative_last_activity(total_layers, i);
     917           38 :                     (
     918           38 :                         // Secondary locations' layers are always considered above the min resident size,
     919           38 :                         // i.e. secondary locations are permitted to be trimmed to zero layers if all
     920           38 :                         // the layers have sufficiently old access times.
     921           38 :                         MinResidentSizePartition::Above,
     922           38 :                         candidate,
     923           38 :                     )
     924           38 :                 });
     925            2 : 
     926            2 :         candidates.extend(tenant_candidates);
     927            2 : 
     928            2 :         tokio::task::yield_now().await;
     929              :     }
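
To see why counting already-evicted layers in `total_layers` protects rarely-updated secondaries, consider a hypothetical linear ramp for `relative_last_activity`, consistent with the `relative_spare_bounds` test at the end of this file (the real `EvictionOrder` implementation is not shown in this excerpt):

    // Hypothetical ramp matching the `relative_spare_bounds` test below: index 0 maps
    // to 1.0 and later indexes decrease by 1/total. Assumption for illustration only.
    fn relative_last_activity(total: usize, i: usize) -> f32 {
        1.0 - (i as f32 / total as f32)
    }

    fn main() {
        // Secondary tenant: 100 layers counted in total, but only 10 still resident.
        // Its resident layers score 1.00 down to roughly 0.91, i.e. they rank as if they
        // were recently used, so a global sort on this value will not keep draining the
        // same secondary on every eviction iteration.
        let scores: Vec<f32> = (0..10).map(|i| relative_last_activity(100, i)).collect();
        assert!(scores.iter().all(|s| *s > 0.9));
    }
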
     930              : 
     931           17 :     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
     932            0 :         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
     933              : 
     934           17 :     eviction_order.sort(&mut candidates);
     935           17 : 
     936           17 :     Ok(EvictionCandidates::Finished(candidates))
     937           17 : }
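
The `debug_assert!` above relies on the derived ordering of `MinResidentSizePartition`: variants declared earlier compare as smaller, so a sort on `(partition, ...)` keys visits every `Above` candidate before any `Below` one. A small standalone sketch with a placeholder enum and string labels (hypothetical, for illustration):

    // Placeholder enum mirroring the Above-before-Below ordering relied upon above:
    // deriving Ord makes earlier variants sort first, so reserved (Below) layers are
    // only reached once the non-reserved (Above) candidates are exhausted.
    #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
    enum Partition {
        Above,
        Below,
    }

    fn main() {
        let mut candidates = vec![
            (Partition::Below, "recently used, inside the reservation"),
            (Partition::Above, "old layer outside the reservation"),
            (Partition::Above, "older layer outside the reservation"),
        ];
        candidates.sort();
        assert_eq!(candidates[0].0, Partition::Above);
        assert_eq!(candidates[1].0, Partition::Above);
        assert_eq!(candidates[2].0, Partition::Below);
    }
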
     938              : 
     939              : /// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
     940              : /// relieve pressure.
     941              : ///
     942              : /// Returns the number of candidates selected, together with the planned usage.
     943           17 : fn select_victims<U: Usage>(
     944           17 :     candidates: &[(MinResidentSizePartition, EvictionCandidate)],
     945           17 :     usage_pre: U,
     946           17 : ) -> VictimSelection<U> {
     947           17 :     let mut usage_when_switched = None;
     948           17 :     let mut usage_planned = usage_pre;
     949           17 :     let mut evicted_amount = 0;
     950              : 
     951          297 :     for (i, (partition, candidate)) in candidates.iter().enumerate() {
     952          297 :         if !usage_planned.has_pressure() {
     953           14 :             break;
     954          283 :         }
     955          283 : 
     956          283 :         if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
     957            3 :             usage_when_switched = Some((usage_planned, i));
     958          280 :         }
     959              : 
     960          283 :         usage_planned.add_available_bytes(candidate.layer.get_file_size());
     961          283 :         evicted_amount += 1;
     962              :     }
     963              : 
     964           17 :     VictimSelection {
     965           17 :         amount: evicted_amount,
     966           17 :         usage_pre,
     967           17 :         usage_when_switched,
     968           17 :         usage_planned,
     969           17 :     }
     970           17 : }
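
As a rough model of the greedy loop in `select_victims`, the following standalone sketch uses a toy usage type and made-up byte counts (none of these names exist in the real code); it stops as soon as the planned available space relieves pressure and records the index at which selection first had to dip into the reserved (`Below`) partition:

    // Toy model of the selection loop: walk the pre-sorted candidates, stop once the
    // planned free space is sufficient, and remember whether the reserved partition
    // had to be touched. `ToyUsage` is a stand-in for the real `Usage` trait.
    #[derive(Clone, Copy, Debug)]
    struct ToyUsage {
        avail_bytes: u64,
        min_avail_bytes: u64,
    }

    impl ToyUsage {
        fn has_pressure(&self) -> bool {
            self.avail_bytes < self.min_avail_bytes
        }
        fn add_available_bytes(&mut self, bytes: u64) {
            self.avail_bytes += bytes;
        }
    }

    fn main() {
        // (partition, layer size): two non-reserved layers, then one reserved layer.
        let candidates = [("above", 40u64), ("above", 40), ("below", 40)];
        let mut planned = ToyUsage { avail_bytes: 0, min_avail_bytes: 100 };
        let mut evicted_amount = 0;
        let mut switched_at = None;

        for (i, (partition, size)) in candidates.iter().enumerate() {
            if !planned.has_pressure() {
                break;
            }
            if *partition == "below" && switched_at.is_none() {
                switched_at = Some(i);
            }
            planned.add_available_bytes(*size);
            evicted_amount += 1;
        }

        // 40 + 40 bytes from the "above" partition is not enough to reach 100, so one
        // reserved layer is selected as well; this is the case that triggers the warning
        // in `into_amount_and_planned`.
        assert_eq!((evicted_amount, switched_at), (3, Some(2)));
        assert!(!planned.has_pressure());
    }
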
     971              : 
     972              : struct VictimSelection<U> {
     973              :     amount: usize,
     974              :     usage_pre: U,
     975              :     usage_when_switched: Option<(U, usize)>,
     976              :     usage_planned: U,
     977              : }
     978              : 
     979              : impl<U: Usage> VictimSelection<U> {
     980           17 :     fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
     981           17 :         debug!(
     982            0 :             evicted_amount=%self.amount,
     983            0 :             "took enough candidates for pressure to be relieved"
     984            0 :         );
     985              : 
     986           17 :         if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
     987            3 :             warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
     988           14 :         }
     989              : 
     990           17 :         let planned = match self.usage_when_switched {
     991            3 :             Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
     992            3 :                 respecting_tenant_min_resident_size,
     993            3 :                 fallback_to_global_lru: Some(self.usage_planned),
     994            3 :             },
     995           14 :             None => PlannedUsage {
     996           14 :                 respecting_tenant_min_resident_size: self.usage_planned,
     997           14 :                 fallback_to_global_lru: None,
     998           14 :             },
     999              :         };
    1000              : 
    1001           17 :         (self.amount, planned)
    1002           17 :     }
    1003              : }
    1004              : 
    1005              : struct TimelineKey(Arc<Timeline>);
    1006              : 
    1007              : impl PartialEq for TimelineKey {
    1008            0 :     fn eq(&self, other: &Self) -> bool {
    1009            0 :         Arc::ptr_eq(&self.0, &other.0)
    1010            0 :     }
    1011              : }
    1012              : 
    1013              : impl Eq for TimelineKey {}
    1014              : 
    1015              : impl std::hash::Hash for TimelineKey {
    1016            0 :     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
    1017            0 :         Arc::as_ptr(&self.0).hash(state);
    1018            0 :     }
    1019              : }
    1020              : 
    1021              : impl std::ops::Deref for TimelineKey {
    1022              :     type Target = Timeline;
    1023              : 
    1024            0 :     fn deref(&self) -> &Self::Target {
    1025            0 :         self.0.as_ref()
    1026            0 :     }
    1027              : }
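
`TimelineKey` keys timelines by the identity of the `Arc` allocation rather than by value. The following is a hypothetical usage demonstration with a placeholder `Timeline` type (the real keying site is not part of this excerpt):

    use std::collections::HashSet;
    use std::sync::Arc;

    // Placeholder type standing in for the real pageserver Timeline.
    struct Timeline;

    // Same pattern as TimelineKey above: equality and hashing follow the Arc pointer.
    struct Key(Arc<Timeline>);

    impl PartialEq for Key {
        fn eq(&self, other: &Self) -> bool {
            Arc::ptr_eq(&self.0, &other.0)
        }
    }
    impl Eq for Key {}
    impl std::hash::Hash for Key {
        fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
            Arc::as_ptr(&self.0).hash(state);
        }
    }

    fn main() {
        let a = Arc::new(Timeline);
        let b = Arc::new(Timeline);

        let mut set = HashSet::new();
        set.insert(Key(a.clone()));

        assert!(set.contains(&Key(a)));  // clone of the same allocation: equal
        assert!(!set.contains(&Key(b))); // different allocation: not equal
    }
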
    1028              : 
    1029              : /// A totally ordered f32 subset we can use with sorting functions.
    1030              : pub(crate) mod finite_f32 {
    1031              : 
    1032              :     /// A totally ordered f32 subset we can use with sorting functions.
    1033            0 :     #[derive(Clone, Copy, PartialEq)]
    1034              :     pub struct FiniteF32(f32);
    1035              : 
    1036              :     impl std::fmt::Debug for FiniteF32 {
    1037            0 :         fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1038            0 :             std::fmt::Debug::fmt(&self.0, f)
    1039            0 :         }
    1040              :     }
    1041              : 
    1042              :     impl std::fmt::Display for FiniteF32 {
    1043            0 :         fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    1044            0 :             std::fmt::Display::fmt(&self.0, f)
    1045            0 :         }
    1046              :     }
    1047              : 
    1048              :     impl std::cmp::Eq for FiniteF32 {}
    1049              : 
    1050              :     impl std::cmp::PartialOrd for FiniteF32 {
    1051         1019 :         fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
    1052         1019 :             Some(self.cmp(other))
    1053         1019 :         }
    1054              :     }
    1055              : 
    1056              :     impl std::cmp::Ord for FiniteF32 {
    1057         1019 :         fn cmp(&self, other: &Self) -> std::cmp::Ordering {
    1058         1019 :             self.0.total_cmp(&other.0)
    1059         1019 :         }
    1060              :     }
    1061              : 
    1062              :     impl TryFrom<f32> for FiniteF32 {
    1063              :         type Error = f32;
    1064              : 
    1065            0 :         fn try_from(value: f32) -> Result<Self, Self::Error> {
    1066            0 :             if value.is_finite() {
    1067            0 :                 Ok(FiniteF32(value))
    1068              :             } else {
    1069            0 :                 Err(value)
    1070              :             }
    1071            0 :         }
    1072              :     }
    1073              : 
    1074              :     impl From<FiniteF32> for f32 {
    1075           40 :         fn from(value: FiniteF32) -> f32 {
    1076           40 :             value.0
    1077           40 :         }
    1078              :     }
    1079              : 
    1080              :     impl FiniteF32 {
    1081              :         pub const ZERO: FiniteF32 = FiniteF32(0.0);
    1082              : 
    1083          286 :         pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
    1084          286 :             if (0.0..=1.0).contains(&value) {
    1085              :                 // -0.0 is within the range; normalize it to 0.0 so the value stays in 0.0..=1.0
    1086          286 :                 let value = value.abs();
    1087          286 :                 Ok(FiniteF32(value))
    1088              :             } else {
    1089            0 :                 Err(value)
    1090              :             }
    1091          286 :         }
    1092              : 
    1093           40 :         pub fn into_inner(self) -> f32 {
    1094           40 :             self.into()
    1095           40 :         }
    1096              :     }
    1097              : }
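
A brief usage sketch for the `finite_f32` module (assuming it is in scope under the same name; the values are arbitrary): because `FiniteF32` implements `Ord`, it can serve directly as a sort key without the `partial_cmp` workarounds that raw `f32` would need.

    // Assumed to run with the finite_f32 module above in scope.
    use finite_f32::FiniteF32;

    fn main() {
        let mut scores: Vec<FiniteF32> = [0.3_f32, 1.0, 0.0, 0.7]
            .into_iter()
            .map(|v| FiniteF32::try_from_normalized(v).expect("values are within 0.0..=1.0"))
            .collect();

        // Plain f32 is only PartialOrd; FiniteF32 is Ord, so sort() works directly.
        scores.sort();

        let inner: Vec<f32> = scores.into_iter().map(FiniteF32::into_inner).collect();
        assert_eq!(inner, [0.0, 0.3, 0.7, 1.0]);
    }
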
    1098              : 
    1099              : mod filesystem_level_usage {
    1100              :     use anyhow::Context;
    1101              :     use camino::Utf8Path;
    1102              : 
    1103              :     use crate::statvfs::Statvfs;
    1104              : 
    1105              :     use super::DiskUsageEvictionTaskConfig;
    1106              : 
    1107           20 :     #[derive(Debug, Clone, Copy)]
    1108              :     #[allow(dead_code)]
    1109              :     pub struct Usage<'a> {
    1110              :         config: &'a DiskUsageEvictionTaskConfig,
    1111              : 
    1112              :         /// Filesystem capacity
    1113              :         total_bytes: u64,
    1114              :         /// Free filesystem space
    1115              :         avail_bytes: u64,
    1116              :     }
    1117              : 
    1118              :     impl super::Usage for Usage<'_> {
    1119           81 :         fn has_pressure(&self) -> bool {
    1120           81 :             let usage_pct =
    1121           81 :                 (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
    1122           81 : 
    1123           81 :             let pressures = [
    1124           81 :                 (
    1125           81 :                     "min_avail_bytes",
    1126           81 :                     self.avail_bytes < self.config.min_avail_bytes,
    1127           81 :                 ),
    1128           81 :                 (
    1129           81 :                     "max_usage_pct",
    1130           81 :                     usage_pct >= self.config.max_usage_pct.get() as u64,
    1131           81 :                 ),
    1132           81 :             ];
    1133           81 : 
    1134          134 :             pressures.into_iter().any(|(_, has_pressure)| has_pressure)
    1135           81 :         }
    1136              : 
    1137          118 :         fn add_available_bytes(&mut self, bytes: u64) {
    1138          118 :             self.avail_bytes += bytes;
    1139          118 :         }
    1140              :     }
    1141              : 
    1142           11 :     pub fn get<'a>(
    1143           11 :         tenants_dir: &Utf8Path,
    1144           11 :         config: &'a DiskUsageEvictionTaskConfig,
    1145           11 :     ) -> anyhow::Result<Usage<'a>> {
    1146           11 :         let mock_config = {
    1147           11 :             #[cfg(feature = "testing")]
    1148           11 :             {
    1149           11 :                 config.mock_statvfs.as_ref()
    1150              :             }
    1151              :             #[cfg(not(feature = "testing"))]
    1152              :             {
    1153              :                 None
    1154              :             }
    1155              :         };
    1156              : 
    1157           11 :         let stat = Statvfs::get(tenants_dir, mock_config)
    1158           11 :             .context("statvfs failed, presumably directory got unlinked")?;
    1159              : 
    1160              :         // https://unix.stackexchange.com/a/703650
    1161           10 :         let blocksize = if stat.fragment_size() > 0 {
    1162           10 :             stat.fragment_size()
    1163              :         } else {
    1164            0 :             stat.block_size()
    1165              :         };
    1166              : 
    1167              :         // use blocks_available (b_avail) since the pageserver runs as an unprivileged user
    1168           10 :         let avail_bytes = stat.blocks_available() * blocksize;
    1169           10 :         let total_bytes = stat.blocks() * blocksize;
    1170           10 : 
    1171           10 :         Ok(Usage {
    1172           10 :             config,
    1173           10 :             total_bytes,
    1174           10 :             avail_bytes,
    1175           10 :         })
    1176           11 :     }
    1177              : 
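
As a quick arithmetic check of how `get` and `has_pressure` above fit together (the block counts and thresholds below are made up): statvfs reports block counts, `get` multiplies them by the fragment/block size to obtain byte figures, and `has_pressure` turns those into a usage percentage to compare against `max_usage_pct`.

    // Made-up statvfs figures: a ~100 GiB filesystem with ~12.5 GiB available to
    // unprivileged users, evaluated against a max_usage_pct of 85.
    fn main() {
        let fragment_size: u64 = 4096;          // f_frsize
        let blocks: u64 = 26_214_400;           // f_blocks  -> 100 GiB total
        let blocks_available: u64 = 3_276_800;  // f_bavail  -> 12.5 GiB available

        let total_bytes = blocks * fragment_size;
        let avail_bytes = blocks_available * fragment_size;

        // Same formula as has_pressure: percentage of the filesystem that is in use.
        let usage_pct = (100.0 * (1.0 - (avail_bytes as f64 / total_bytes as f64))) as u64;
        assert_eq!(usage_pct, 87); // 87.5 % used, truncated to 87

        let max_usage_pct: u64 = 85;
        assert!(usage_pct >= max_usage_pct); // under disk pressure -> eviction will run
    }
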
    1178            2 :     #[test]
    1179            2 :     fn max_usage_pct_pressure() {
    1180            2 :         use super::EvictionOrder;
    1181            2 :         use super::Usage as _;
    1182            2 :         use std::time::Duration;
    1183            2 :         use utils::serde_percent::Percent;
    1184            2 : 
    1185            2 :         let mut usage = Usage {
    1186            2 :             config: &DiskUsageEvictionTaskConfig {
    1187            2 :                 max_usage_pct: Percent::new(85).unwrap(),
    1188            2 :                 min_avail_bytes: 0,
    1189            2 :                 period: Duration::MAX,
    1190            2 :                 #[cfg(feature = "testing")]
    1191            2 :                 mock_statvfs: None,
    1192            2 :                 eviction_order: EvictionOrder::default(),
    1193            2 :             },
    1194            2 :             total_bytes: 100_000,
    1195            2 :             avail_bytes: 0,
    1196            2 :         };
    1197            2 : 
    1198            2 :         assert!(usage.has_pressure(), "expected pressure at 100%");
    1199              : 
    1200            2 :         usage.add_available_bytes(14_000);
    1201            2 :         assert!(usage.has_pressure(), "expected pressure at 86%");
    1202              : 
    1203            2 :         usage.add_available_bytes(999);
    1204            2 :         assert!(usage.has_pressure(), "expected pressure at 85.001%");
    1205              : 
    1206            2 :         usage.add_available_bytes(1);
    1207            2 :         assert!(usage.has_pressure(), "expected pressure at precisely 85%");
    1208              : 
    1209            2 :         usage.add_available_bytes(1);
    1210            2 :         assert!(!usage.has_pressure(), "no pressure at 84.999%");
    1211              : 
    1212            2 :         usage.add_available_bytes(999);
    1213            2 :         assert!(!usage.has_pressure(), "no pressure at 84%");
    1214              : 
    1215            2 :         usage.add_available_bytes(16_000);
    1216            2 :         assert!(!usage.has_pressure());
    1217            2 :     }
    1218              : }
    1219              : 
    1220              : #[cfg(test)]
    1221              : mod tests {
    1222              :     use super::*;
    1223              : 
    1224            2 :     #[test]
    1225            2 :     fn relative_equal_bounds() {
    1226            2 :         let order = EvictionOrder::RelativeAccessed {
    1227            2 :             highest_layer_count_loses_first: false,
    1228            2 :         };
    1229            2 : 
    1230            2 :         let len = 10;
    1231            2 :         let v = (0..len)
    1232           20 :             .map(|i| order.relative_last_activity(len, i).into_inner())
    1233            2 :             .collect::<Vec<_>>();
    1234            2 : 
    1235            2 :         assert_eq!(v.first(), Some(&1.0));
    1236            2 :         assert_eq!(v.last(), Some(&0.0));
    1237           18 :         assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
    1238            2 :     }
    1239              : 
    1240            2 :     #[test]
    1241            2 :     fn relative_spare_bounds() {
    1242            2 :         let order = EvictionOrder::RelativeAccessed {
    1243            2 :             highest_layer_count_loses_first: true,
    1244            2 :         };
    1245            2 : 
    1246            2 :         let len = 10;
    1247            2 :         let v = (0..len)
    1248           20 :             .map(|i| order.relative_last_activity(len, i).into_inner())
    1249            2 :             .collect::<Vec<_>>();
    1250            2 : 
    1251            2 :         assert_eq!(v.first(), Some(&1.0));
    1252            2 :         assert_eq!(v.last(), Some(&0.1));
    1253           18 :         assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
    1254            2 :     }
    1255              : }
        

Generated by: LCOV version 2.1-beta