LCOV - code coverage report
Current view: top level - pageserver/src - disk_usage_eviction_task.rs (source / functions)
Test: 8ac049b474321fdc72ddcb56d7165153a1a900e8.info
Test Date: 2023-09-06 10:18:01
Coverage: Lines: 80.4 % (291 of 362 hit), Functions: 54.5 % (61 of 112 hit)

            Line data    Source code
       1              : //! This module implements the pageserver-global disk-usage-based layer eviction task.
       2              : //!
       3              : //! # Mechanics
       4              : //!
       5              : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
       6              : //! loop that evicts layers in response to a shortage of available bytes
       7              : //! in the $repo/tenants directory's filesystem.
       8              : //!
       9              : //! The loop runs periodically at a configurable `period`.
      10              : //!
      11              : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
      12              : //! It compares the returned usage data against two different types of thresholds.
      13              : //! The iteration tries to evict layers until the app-internal accounting says we should be below the thresholds.
      14              : //! We cross-check this internal accounting against the real world with another `statvfs` call at the end of the iteration.
      15              : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
      16              : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
      17              : //!
      18              : //! # Eviction Policy
      19              : //!
      20              : //! There are two thresholds:
      21              : //! `max_usage_pct` bounds used space, expressed in percent of the total filesystem space.
      22              : //! If the actual usage is higher, the threshold is exceeded.
      23              : //! `min_avail_bytes` bounds the absolute available space in bytes.
      24              : //! If the actual available space is lower, the threshold is exceeded.
      25              : //! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and the
      26              : //! iteration evicts layers to release disk space and bring the usage below the thresholds again.
      27              : //! The iteration evicts layers in LRU fashion, but with a weak per-tenant reservation.
      28              : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
      29              : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
      30              : //! start evicting layers that are part of the reservation, LRU first.
      31              : //!
      32              : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
      33              : //! throughout the code, though no actual variable carries that name.
      34              : //! The per-tenant default value is the maximum of the tenant's layer file sizes, counting both local and remote layers.
      35              : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
      36              : //! during page reconstruction.
      37              : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
      38              : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
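//!
//! As an illustrative sketch (the values below are assumptions, not defaults), a config
//! under which pressure starts once the filesystem is at least 85 % full or has less
//! than 10 GiB available, checked once per minute:
//!
//! ```ignore
//! DiskUsageEvictionTaskConfig {
//!     max_usage_pct: Percent::new(85).unwrap(),
//!     min_avail_bytes: 10 * 1024 * 1024 * 1024,
//!     period: Duration::from_secs(60),
//!     #[cfg(feature = "testing")]
//!     mock_statvfs: None,
//! }
//! ```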
      39              : 
      40              : // Implementation notes:
      41              : // - The `#[allow(dead_code)]` attributes above various structs suppress warnings about only the
      42              : //   Debug impl reading these fields. We use the Debug impl for semi-structured logging, though.
      43              : 
      44              : use std::{
      45              :     collections::HashMap,
      46              :     path::Path,
      47              :     sync::Arc,
      48              :     time::{Duration, SystemTime},
      49              : };
      50              : 
      51              : use anyhow::Context;
      52              : use remote_storage::GenericRemoteStorage;
      53              : use serde::{Deserialize, Serialize};
      54              : use tokio::time::Instant;
      55              : use tokio_util::sync::CancellationToken;
      56              : use tracing::{debug, error, info, instrument, warn, Instrument};
      57              : use utils::completion;
      58              : use utils::serde_percent::Percent;
      59              : 
      60              : use crate::{
      61              :     config::PageServerConf,
      62              :     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
      63              :     tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
      64              : };
      65              : 
      66           38 : #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
      67              : pub struct DiskUsageEvictionTaskConfig {
      68              :     pub max_usage_pct: Percent,
      69              :     pub min_avail_bytes: u64,
      70              :     #[serde(with = "humantime_serde")]
      71              :     pub period: Duration,
      72              :     #[cfg(feature = "testing")]
      73              :     pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
      74              : }
      75              : 
      76          575 : #[derive(Default)]
      77              : pub struct State {
      78              :     /// Ensures that HTTP requests and the background task don't run at the same time.
      79              :     mutex: tokio::sync::Mutex<()>,
      80              : }
      81              : 
      82          303 : pub fn launch_disk_usage_global_eviction_task(
      83          303 :     conf: &'static PageServerConf,
      84          303 :     storage: GenericRemoteStorage,
      85          303 :     state: Arc<State>,
      86          303 :     background_jobs_barrier: completion::Barrier,
      87          303 : ) -> anyhow::Result<()> {
      88          303 :     let Some(task_config) = &conf.disk_usage_based_eviction else {
      89          300 :         info!("disk usage based eviction task not configured");
      90          300 :         return Ok(());
      91              :     };
      92              : 
      93            3 :     info!("launching disk usage based eviction task");
      94              : 
      95            3 :     task_mgr::spawn(
      96            3 :         BACKGROUND_RUNTIME.handle(),
      97            3 :         TaskKind::DiskUsageEviction,
      98            3 :         None,
      99            3 :         None,
     100            3 :         "disk usage based eviction",
     101            3 :         false,
     102            3 :         async move {
     103            3 :             let cancel = task_mgr::shutdown_token();
     104            3 : 
     105            3 :             // wait until initial load is complete, because we cannot evict from loading tenants.
     106            6 :             tokio::select! {
     107            6 :                 _ = cancel.cancelled() => { return Ok(()); },
     108            6 :                 _ = background_jobs_barrier.wait() => { }
     109            6 :             };
     110              : 
     111            3 :             disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
     112            4 :                 .await;
     113            0 :             Ok(())
     114            3 :         },
     115            3 :     );
     116            3 : 
     117            3 :     Ok(())
     118          303 : }
     119              : 
     120            9 : #[instrument(skip_all)]
     121              : async fn disk_usage_eviction_task(
     122              :     state: &State,
     123              :     task_config: &DiskUsageEvictionTaskConfig,
     124              :     storage: GenericRemoteStorage,
     125              :     tenants_dir: &Path,
     126              :     cancel: CancellationToken,
     127              : ) {
     128            0 :     scopeguard::defer! {
     129            0 :         info!("disk usage based eviction task finishing");
     130              :     };
     131              : 
     132              :     use crate::tenant::tasks::random_init_delay;
     133              :     {
     134              :         if random_init_delay(task_config.period, &cancel)
     135              :             .await
     136              :             .is_err()
     137              :         {
     138              :             return;
     139              :         }
     140              :     }
     141              : 
     142              :     let mut iteration_no = 0;
     143              :     loop {
     144              :         iteration_no += 1;
     145              :         let start = Instant::now();
     146              : 
     147            4 :         async {
     148            4 :             let res = disk_usage_eviction_task_iteration(
     149            4 :                 state,
     150            4 :                 task_config,
     151            4 :                 &storage,
     152            4 :                 tenants_dir,
     153            4 :                 &cancel,
     154            4 :             )
     155            0 :             .await;
     156              : 
     157            4 :             match res {
     158            2 :                 Ok(()) => {}
     159            2 :                 Err(e) => {
     160            2 :                     // these stat failures are expected to be very rare
     161            2 :                     warn!("iteration failed, unexpected error: {e:#}");
     162              :                 }
     163              :             }
     164            4 :         }
     165              :         .instrument(tracing::info_span!("iteration", iteration_no))
     166              :         .await;
     167              : 
     168              :         let sleep_until = start + task_config.period;
     169              :         if tokio::time::timeout_at(sleep_until, cancel.cancelled())
     170              :             .await
     171              :             .is_ok()
     172              :         {
     173              :             break;
     174              :         }
     175              :     }
     176              : }
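
// A minimal sketch of the scheduling idiom used in the loop above, extracted for
// clarity (the function name is illustrative, not part of this module). Note the
// inverted meaning of `timeout_at`'s result here: `Ok(..)` means `cancel` fired
// before the deadline, so we stop; `Err(Elapsed)` means the deadline passed, so
// we proceed with the next iteration.
//
//     async fn run_every(period: std::time::Duration, cancel: tokio_util::sync::CancellationToken) {
//         loop {
//             let start = tokio::time::Instant::now();
//             // ... one iteration of work ...
//             let sleep_until = start + period;
//             if tokio::time::timeout_at(sleep_until, cancel.cancelled()).await.is_ok() {
//                 break; // cancelled
//             }
//         }
//     }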
     177              : 
     178              : pub trait Usage: Clone + Copy + std::fmt::Debug {
     179              :     fn has_pressure(&self) -> bool;
     180              :     fn add_available_bytes(&mut self, bytes: u64);
     181              : }
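
// A hypothetical test-only implementation of `Usage` (an assumption for illustration,
// not part of this file): pressure exists while fewer than `want_free` bytes are
// available, and evictions feed back through `add_available_bytes`.
//
//     #[derive(Debug, Clone, Copy)]
//     struct MockUsage {
//         avail_bytes: u64,
//         want_free: u64,
//     }
//
//     impl Usage for MockUsage {
//         fn has_pressure(&self) -> bool {
//             self.avail_bytes < self.want_free
//         }
//         fn add_available_bytes(&mut self, bytes: u64) {
//             self.avail_bytes += bytes;
//         }
//     }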
     182              : 
     183            4 : async fn disk_usage_eviction_task_iteration(
     184            4 :     state: &State,
     185            4 :     task_config: &DiskUsageEvictionTaskConfig,
     186            4 :     storage: &GenericRemoteStorage,
     187            4 :     tenants_dir: &Path,
     188            4 :     cancel: &CancellationToken,
     189            4 : ) -> anyhow::Result<()> {
     190            4 :     let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
     191            4 :         .context("get filesystem-level disk usage before evictions")?;
     192            2 :     let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
     193            2 :     match res {
     194            2 :         Ok(outcome) => {
     195            0 :             debug!(?outcome, "disk_usage_eviction_iteration finished");
     196            2 :             match outcome {
     197            0 :                 IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
     198            0 :                     // nothing to do, select statement below will handle things
     199            0 :                 }
     200            2 :                 IterationOutcome::Finished(outcome) => {
     201              :                     // Verify with statvfs whether we made any real progress
     202            2 :                     let after = filesystem_level_usage::get(tenants_dir, task_config)
     203            2 :                         // It's quite unlikely to hit the error here. Keep the code simple and bail out.
     204            2 :                         .context("get filesystem-level disk usage after evictions")?;
     205              : 
     206            0 :                     debug!(?after, "disk usage");
     207              : 
     208            2 :                     if after.has_pressure() {
     209              :                         // Don't bother doing an out-of-order iteration here now.
     210              :                         // In practice, the task period is set to a value in the tens-of-seconds range,
     211              :                         // which will cause another iteration to happen soon enough.
     212              :                         // TODO: deltas between the three different usages would be helpful,
     213              :                         // consider MiB, GiB, TiB
     214            0 :                         warn!(?outcome, ?after, "disk usage still high");
     215              :                     } else {
     216            2 :                         info!(?outcome, ?after, "disk usage pressure relieved");
     217              :                     }
     218              :                 }
     219              :             }
     220              :         }
     221            0 :         Err(e) => {
     222            0 :             error!("disk_usage_eviction_iteration failed: {:#}", e);
     223              :         }
     224              :     }
     225              : 
     226            2 :     Ok(())
     227            4 : }
     228              : 
     229            5 : #[derive(Debug, Serialize)]
     230              : #[allow(clippy::large_enum_variant)]
     231              : pub enum IterationOutcome<U> {
     232              :     NoPressure,
     233              :     Cancelled,
     234              :     Finished(IterationOutcomeFinished<U>),
     235              : }
     236              : 
     237              : #[allow(dead_code)]
     238            7 : #[derive(Debug, Serialize)]
     239              : pub struct IterationOutcomeFinished<U> {
     240              :     /// The actual usage observed before we started the iteration.
     241              :     before: U,
     242              :     /// The expected value for `after`, according to internal accounting, after phase 1.
     243              :     planned: PlannedUsage<U>,
     244              :     /// The outcome of phase 2, where we actually do the evictions.
     245              :     ///
     246              :     /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
     247              :     /// be the same as `planned`.
     248              :     assumed: AssumedUsage<U>,
     249              : }
     250              : 
     251            7 : #[derive(Debug, Serialize)]
     252              : #[allow(dead_code)]
     253              : struct AssumedUsage<U> {
     254              :     /// The expected value for `after`, after phase 2.
     255              :     projected_after: U,
     256              :     /// The layers we failed to evict during phase 2.
     257              :     failed: LayerCount,
     258              : }
     259              : 
     260              : #[allow(dead_code)]
     261            7 : #[derive(Debug, Serialize)]
     262              : struct PlannedUsage<U> {
     263              :     respecting_tenant_min_resident_size: U,
     264              :     fallback_to_global_lru: Option<U>,
     265              : }
     266              : 
     267              : #[allow(dead_code)]
     268            7 : #[derive(Debug, Default, Serialize)]
     269              : struct LayerCount {
     270              :     file_sizes: u64,
     271              :     count: usize,
     272              : }
     273              : 
     274            7 : pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     275            7 :     state: &State,
     276            7 :     storage: &GenericRemoteStorage,
     277            7 :     usage_pre: U,
     278            7 :     cancel: &CancellationToken,
     279            7 : ) -> anyhow::Result<IterationOutcome<U>> {
     280              :     // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
     281            7 :     let _g = state
     282            7 :         .mutex
     283            7 :         .try_lock()
     284            7 :         .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
     285              : 
     286            0 :     debug!(?usage_pre, "disk usage");
     287              : 
     288            7 :     if !usage_pre.has_pressure() {
     289            0 :         return Ok(IterationOutcome::NoPressure);
     290            7 :     }
     291              : 
     292            7 :     warn!(
     293            7 :         ?usage_pre,
     294            7 :         "running disk usage based eviction due to pressure"
     295            7 :     );
     296              : 
     297            7 :     let candidates = match collect_eviction_candidates(cancel).await? {
     298              :         EvictionCandidates::Cancelled => {
     299            0 :             return Ok(IterationOutcome::Cancelled);
     300              :         }
     301            7 :         EvictionCandidates::Finished(partitioned) => partitioned,
     302            7 :     };
     303            7 : 
     304            7 :     // Debug-log the list of candidates
     305            7 :     let now = SystemTime::now();
     306          250 :     for (i, (partition, candidate)) in candidates.iter().enumerate() {
     307          250 :         let desc = candidate.layer.layer_desc();
     308            0 :         debug!(
     309            0 :             "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
     310            0 :             i + 1,
     311            0 :             candidates.len(),
     312            0 :             desc.file_size,
     313            0 :             now.duration_since(candidate.last_activity_ts)
     314            0 :                 .unwrap()
     315            0 :                 .as_micros(),
     316            0 :             partition,
     317            0 :             desc.tenant_id,
     318            0 :             desc.timeline_id,
     319            0 :             candidate.layer,
     320            0 :         );
     321              :     }
     322              : 
     323              :     // phase1: select victims to relieve pressure
     324              :     //
     325              :     // Walk through the list of candidates, until we have accumulated enough layers to get
     326              :     // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
     327              :     // how much disk space would be used after evicting all the layers up to the current
     328              :     // point in the list. The layers are collected in 'batched', grouped per timeline.
     329              :     //
     330              :     // If we get far enough in the list that we start to evict layers that are below
     331              :     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
     332              :     // usage at that point in `warned` (reported as `PlannedUsage::respecting_tenant_min_resident_size`).
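    //
    // For example (hypothetical numbers): if `usage_pre` is 5 GiB short of relieving
    // pressure, the loop keeps taking candidates until `add_available_bytes` has
    // credited at least 5 GiB of layer file sizes to `usage_planned`, at which point
    // `has_pressure()` turns false and we break out.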
     333            7 :     let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
     334            7 :     let mut warned = None;
     335            7 :     let mut usage_planned = usage_pre;
     336          148 :     for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
     337          148 :         if !usage_planned.has_pressure() {
     338            0 :             debug!(
     339            0 :                 no_candidates_evicted = i,
     340            0 :                 "took enough candidates for pressure to be relieved"
     341            0 :             );
     342            5 :             break;
     343          143 :         }
     344          143 : 
     345          143 :         if partition == MinResidentSizePartition::Below && warned.is_none() {
     346            2 :             warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
     347            2 :             warned = Some(usage_planned);
     348          141 :         }
     349              : 
     350          143 :         usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
     351          143 : 
     352          143 :         batched
     353          143 :             .entry(TimelineKey(candidate.timeline))
     354          143 :             .or_default()
     355          143 :             .push(candidate.layer);
     356              :     }
     357              : 
     358            7 :     let usage_planned = match warned {
     359            2 :         Some(respecting_tenant_min_resident_size) => PlannedUsage {
     360            2 :             respecting_tenant_min_resident_size,
     361            2 :             fallback_to_global_lru: Some(usage_planned),
     362            2 :         },
     363            5 :         None => PlannedUsage {
     364            5 :             respecting_tenant_min_resident_size: usage_planned,
     365            5 :             fallback_to_global_lru: None,
     366            5 :         },
     367              :     };
     368            0 :     debug!(?usage_planned, "usage planned");
     369              : 
     370              :     // phase2: evict victims batched by timeline
     371              : 
     372              :     // After the loop, `usage_assumed` is the post-eviction usage,
     373              :     // according to internal accounting.
     374            7 :     let mut usage_assumed = usage_pre;
     375            7 :     let mut evictions_failed = LayerCount::default();
     376           18 :     for (timeline, batch) in batched {
     377           11 :         let tenant_id = timeline.tenant_id;
     378           11 :         let timeline_id = timeline.timeline_id;
     379           11 :         let batch_size = batch.len();
     380              : 
     381            0 :         debug!(%timeline_id, "evicting batch for timeline");
     382              : 
     383           11 :         async {
     384           11 :             let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
     385              : 
     386           11 :             match results {
     387            0 :                 Err(e) => {
     388            0 :                     warn!("failed to evict batch: {:#}", e);
     389              :                 }
     390           11 :                 Ok(results) => {
     391           11 :                     assert_eq!(results.len(), batch.len());
     392          143 :                     for (result, layer) in results.into_iter().zip(batch.iter()) {
     393          143 :                         let file_size = layer.layer_desc().file_size;
     394            0 :                         match result {
     395          143 :                             Some(Ok(())) => {
     396          143 :                                 usage_assumed.add_available_bytes(file_size);
     397          143 :                             }
     398              :                             Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
     399            0 :                                 unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
     400              :                             }
     401            0 :                             Some(Err(EvictionError::FileNotFound)) => {
     402            0 :                                 evictions_failed.file_sizes += file_size;
     403            0 :                                 evictions_failed.count += 1;
     404            0 :                             }
     405              :                             Some(Err(
     406            0 :                                 e @ EvictionError::LayerNotFound(_)
     407            0 :                                 | e @ EvictionError::StatFailed(_),
     408              :                             )) => {
     409            0 :                                 let e = utils::error::report_compact_sources(&e);
     410            0 :                                 warn!(%layer, "failed to evict layer: {e}");
     411            0 :                                 evictions_failed.file_sizes += file_size;
     412            0 :                                 evictions_failed.count += 1;
     413              :                             }
     414              :                             None => {
     415            0 :                                 assert!(cancel.is_cancelled());
     416            0 :                                 return;
     417              :                             }
     418              :                         }
     419              :                     }
     420              :                 }
     421              :             }
     422           11 :         }
     423           11 :         .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
     424            0 :         .await;
     425              : 
     426           11 :         if cancel.is_cancelled() {
     427            0 :             return Ok(IterationOutcome::Cancelled);
     428           11 :         }
     429              :     }
     430              : 
     431            7 :     Ok(IterationOutcome::Finished(IterationOutcomeFinished {
     432            7 :         before: usage_pre,
     433            7 :         planned: usage_planned,
     434            7 :         assumed: AssumedUsage {
     435            7 :             projected_after: usage_assumed,
     436            7 :             failed: evictions_failed,
     437            7 :         },
     438            7 :     }))
     439            7 : }
     440              : 
     441            0 : #[derive(Clone)]
     442              : struct EvictionCandidate {
     443              :     timeline: Arc<Timeline>,
     444              :     layer: Arc<dyn PersistentLayer>,
     445              :     last_activity_ts: SystemTime,
     446              : }
     447              : 
     448         1050 : #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
     449              : enum MinResidentSizePartition {
     450              :     Above,
     451              :     Below,
     452              : }
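
// The derived `Ord` follows variant declaration order, so `Above < Below`. The global
// sort in `collect_eviction_candidates` relies on this to place evict-first (`Above`)
// candidates before reservation (`Below`) candidates; a one-line sketch of the guarantee:
//
//     assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below);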
     453              : 
     454              : enum EvictionCandidates {
     455              :     Cancelled,
     456              :     Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
     457              : }
     458              : 
     459              : /// Gather the eviction candidates.
     460              : ///
     461              : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
     462              : /// order. A caller that evicts in that order, until pressure is relieved, implements
     463              : /// the eviction policy outlined in the module comment.
     464              : ///
     465              : /// # Example
     466              : ///
     467              : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
     468              : /// Each layer has size 100, and each tenant's min_resident_size is 150.
     469              : /// The eviction order would be
     470              : ///
     471              : /// ```text
     472              : /// partition last_activity_ts    tenant/layer
     473              : /// Above     18:29               B/c
     474              : /// Above     18:30               A/c
     475              : /// Above     19:00               A/b
     476              : /// Above     19:05               B/b
     477              : /// Above     20:00               B/a
     478              : /// Above     20:03               A/a
     479              : /// Below     20:30               A/d
     480              : /// Below     20:40               B/d
     481              : /// Below     20:45               B/e
     482              : /// Below     20:58               A/e
     483              : /// ```
     484              : ///
     485              : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `B/c, A/c, A/b`.
     486              : /// They are all in the `Above` partition, so we respected each tenant's min_resident_size.
     487              : ///
     488              : /// But, if we need to evict 900 bytes to relieve pressure, we'd evict
     489              : /// `B/c, A/c, A/b, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
     490              : /// after exhausting the `Above` partition.
     491              : /// So, we did not respect each tenant's min_resident_size.
     492            7 : async fn collect_eviction_candidates(
     493            7 :     cancel: &CancellationToken,
     494            7 : ) -> anyhow::Result<EvictionCandidates> {
     495              :     // get a snapshot of the list of tenants
     496            7 :     let tenants = tenant::mgr::list_tenants()
     497            0 :         .await
     498            7 :         .context("get list of tenants")?;
     499              : 
     500            7 :     let mut candidates = Vec::new();
     501              : 
     502           21 :     for (tenant_id, _state) in &tenants {
     503           14 :         if cancel.is_cancelled() {
     504            0 :             return Ok(EvictionCandidates::Cancelled);
     505           14 :         }
     506           14 :         let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
     507           13 :             Ok(tenant) => tenant,
     508            1 :             Err(e) => {
     509              :                 // this can happen if tenant has lifecycle transition after we fetched it
     510            0 :                 debug!("failed to get tenant: {e:#}");
     511            1 :                 continue;
     512              :             }
     513              :         };
     514              : 
     515              :         // collect layers from all timelines in this tenant
     516              :         //
     517              :         // If one of the timelines becomes `!is_active()` during the iteration,
     518              :         // for example because we're shutting down, then `max_layer_size` can be too small.
     519              :         // That's OK. This code only runs under a disk pressure situation, and being
     520              :         // a little unfair to tenants during shutdown in such a situation is tolerable.
     521           13 :         let mut tenant_candidates = Vec::new();
     522           13 :         let mut max_layer_size = 0;
     523           13 :         for tl in tenant.list_timelines() {
     524           13 :             if !tl.is_active() {
     525            0 :                 continue;
     526           13 :             }
     527           13 :             let info = tl.get_local_layers_for_disk_usage_eviction().await;
     528            0 :             debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
     529           13 :             tenant_candidates.extend(
     530           13 :                 info.resident_layers
     531           13 :                     .into_iter()
     532          250 :                     .map(|layer_infos| (tl.clone(), layer_infos)),
     533           13 :             );
     534           13 :             max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
     535           13 : 
     536           13 :             if cancel.is_cancelled() {
     537            0 :                 return Ok(EvictionCandidates::Cancelled);
     538           13 :             }
     539              :         }
     540              : 
     541              :         // `min_resident_size` defaults to maximum layer file size of the tenant.
     542              :         // This ensures that each tenant can have at least one layer resident at a given time,
     543              :         // ensuring forward progress for a single Timeline::get in that tenant.
     544              :         // It's a questionable heuristic since, usually, there are many Timeline::get
     545              :         // requests going on for a tenant, and, at least in Neon prod, the median
     546              :         // layer file size is much smaller than the compaction target size.
     547              :         // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
     548              :         // That's what's typically used by the various background loops.
     549              :         //
     550              :         // The default can be overridden with a fixed value in the tenant conf.
     551              :         // A default override can be put in the default tenant conf in the pageserver.toml.
     552           13 :         let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
     553            0 :             debug!(
     554            0 :                 tenant_id=%tenant.tenant_id(),
     555            0 :                 overridden_size=s,
     556            0 :                 "using overridden min resident size for tenant"
     557            0 :             );
     558            2 :             s
     559              :         } else {
     560            0 :             debug!(
     561            0 :                 tenant_id=%tenant.tenant_id(),
     562            0 :                 max_layer_size,
     563            0 :                 "using max layer size as min_resident_size for tenant",
     564            0 :             );
     565           11 :             max_layer_size
     566              :         };
     567              : 
     568              :         // Sort layers most-recently-used first, then partition by
     569              :         // cumsum above/below min_resident_size.
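        //
        // Worked example, reusing the scenario from the doc comment above: five layers
        // of size 100 each and min_resident_size = 150. Iterating most-recent first,
        // `cumsum` is 0 for the first layer, 100 for the second, and 200 for the third;
        // since the test is `cumsum > min_resident_size`, the two most recent layers
        // fall into `Below` (the reservation) and the remaining three into `Above`.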
     570           13 :         tenant_candidates
     571         2328 :             .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
     572           13 :         let mut cumsum: i128 = 0;
     573          250 :         for (timeline, layer_info) in tenant_candidates.into_iter() {
     574          250 :             let file_size = layer_info.file_size();
     575          250 :             let candidate = EvictionCandidate {
     576          250 :                 timeline,
     577          250 :                 last_activity_ts: layer_info.last_activity_ts,
     578          250 :                 layer: layer_info.layer,
     579          250 :             };
     580          250 :             let partition = if cumsum > min_resident_size as i128 {
     581          195 :                 MinResidentSizePartition::Above
     582              :             } else {
     583           55 :                 MinResidentSizePartition::Below
     584              :             };
     585          250 :             candidates.push((partition, candidate));
     586          250 :             cumsum += i128::from(file_size);
     587              :         }
     588              :     }
     589              : 
     590            7 :     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
     591            0 :         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
     592            7 :     candidates
     593         2086 :         .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
     594            7 : 
     595            7 :     Ok(EvictionCandidates::Finished(candidates))
     596            7 : }
     597              : 
     598              : struct TimelineKey(Arc<Timeline>);
     599              : 
     600              : impl PartialEq for TimelineKey {
     601          132 :     fn eq(&self, other: &Self) -> bool {
     602          132 :         Arc::ptr_eq(&self.0, &other.0)
     603          132 :     }
     604              : }
     605              : 
     606              : impl Eq for TimelineKey {}
     607              : 
     608              : impl std::hash::Hash for TimelineKey {
     609          143 :     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
     610          143 :         Arc::as_ptr(&self.0).hash(state);
     611          143 :     }
     612              : }
     613              : 
     614              : impl std::ops::Deref for TimelineKey {
     615              :     type Target = Timeline;
     616              : 
     617           33 :     fn deref(&self) -> &Self::Target {
     618           33 :         self.0.as_ref()
     619           33 :     }
     620              : }
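
// `TimelineKey` gives the per-timeline batching `HashMap` pointer identity: two keys
// compare equal only if they wrap the same `Arc` allocation. A minimal sketch of the
// idea, using `Arc<String>` in place of `Arc<Timeline>`:
//
//     use std::sync::Arc;
//     let a = Arc::new(String::from("tl"));
//     let b = Arc::clone(&a);
//     let c = Arc::new(String::from("tl"));
//     assert!(Arc::ptr_eq(&a, &b));  // same allocation => keys compare equal
//     assert!(!Arc::ptr_eq(&a, &c)); // equal contents, different allocation => not equal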
     621              : 
     622              : mod filesystem_level_usage {
     623              :     use std::path::Path;
     624              : 
     625              :     use anyhow::Context;
     626              : 
     627              :     use crate::statvfs::Statvfs;
     628              : 
     629              :     use super::DiskUsageEvictionTaskConfig;
     630              : 
     631           10 :     #[derive(Debug, Clone, Copy)]
     632              :     #[allow(dead_code)]
     633              :     pub struct Usage<'a> {
     634              :         config: &'a DiskUsageEvictionTaskConfig,
     635              : 
     636              :         /// Filesystem capacity
     637              :         total_bytes: u64,
     638              :         /// Free filesystem space
     639              :         avail_bytes: u64,
     640              :     }
     641              : 
     642              :     impl super::Usage for Usage<'_> {
     643           51 :         fn has_pressure(&self) -> bool {
     644           51 :             let usage_pct =
     645           51 :                 (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
     646           51 : 
     647           51 :             let pressures = [
     648           51 :                 (
     649           51 :                     "min_avail_bytes",
     650           51 :                     self.avail_bytes < self.config.min_avail_bytes,
     651           51 :                 ),
     652           51 :                 (
     653           51 :                     "max_usage_pct",
     654           51 :                     usage_pct >= self.config.max_usage_pct.get() as u64,
     655           51 :                 ),
     656           51 :             ];
     657           51 : 
     658           88 :             pressures.into_iter().any(|(_, has_pressure)| has_pressure)
     659           51 :         }
     660              : 
     661           82 :         fn add_available_bytes(&mut self, bytes: u64) {
     662           82 :             self.avail_bytes += bytes;
     663           82 :         }
     664              :     }
     665              : 
     666            6 :     pub fn get<'a>(
     667            6 :         tenants_dir: &Path,
     668            6 :         config: &'a DiskUsageEvictionTaskConfig,
     669            6 :     ) -> anyhow::Result<Usage<'a>> {
     670            6 :         let mock_config = {
     671            6 :             #[cfg(feature = "testing")]
     672            6 :             {
     673            6 :                 config.mock_statvfs.as_ref()
     674              :             }
     675              :             #[cfg(not(feature = "testing"))]
     676              :             {
     677              :                 None
     678              :             }
     679              :         };
     680              : 
     681            6 :         let stat = Statvfs::get(tenants_dir, mock_config)
     682            6 :             .context("statvfs failed, presumably directory got unlinked")?;
     683              : 
     684              :         // https://unix.stackexchange.com/a/703650
     685            4 :         let blocksize = if stat.fragment_size() > 0 {
     686            4 :             stat.fragment_size()
     687              :         } else {
     688            0 :             stat.block_size()
     689              :         };
     690              : 
     691              :         // use blocks_available (f_bavail), since the pageserver runs as an unprivileged user
     692            4 :         let avail_bytes = stat.blocks_available() * blocksize;
     693            4 :         let total_bytes = stat.blocks() * blocksize;
     694            4 : 
     695            4 :         Ok(Usage {
     696            4 :             config,
     697            4 :             total_bytes,
     698            4 :             avail_bytes,
     699            4 :         })
     700            6 :     }
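
    // Worked example (assumed numbers): with fragment_size = 4096 and
    // blocks_available = 2_621_440, avail_bytes = 2_621_440 * 4096
    // = 10_737_418_240 bytes = 10 GiB.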
     701              : 
     702            1 :     #[test]
     703            1 :     fn max_usage_pct_pressure() {
     704            1 :         use super::Usage as _;
     705            1 :         use std::time::Duration;
     706            1 :         use utils::serde_percent::Percent;
     707            1 : 
     708            1 :         let mut usage = Usage {
     709            1 :             config: &DiskUsageEvictionTaskConfig {
     710            1 :                 max_usage_pct: Percent::new(85).unwrap(),
     711            1 :                 min_avail_bytes: 0,
     712            1 :                 period: Duration::MAX,
     713            1 :                 #[cfg(feature = "testing")]
     714            1 :                 mock_statvfs: None,
     715            1 :             },
     716            1 :             total_bytes: 100_000,
     717            1 :             avail_bytes: 0,
     718            1 :         };
     719            1 : 
     720            1 :         assert!(usage.has_pressure(), "expected pressure at 100%");
     721              : 
     722            1 :         usage.add_available_bytes(14_000);
     723            1 :         assert!(usage.has_pressure(), "expected pressure at 86%");
     724              : 
     725            1 :         usage.add_available_bytes(999);
     726            1 :         assert!(usage.has_pressure(), "expected pressure at 85.001%");
     727              : 
     728            1 :         usage.add_available_bytes(1);
     729            1 :         assert!(usage.has_pressure(), "expected pressure at precisely 85%");
     730              : 
     731            1 :         usage.add_available_bytes(1);
     732            1 :         assert!(!usage.has_pressure(), "no pressure at 84.999%");
     733              : 
     734            1 :         usage.add_available_bytes(999);
     735            1 :         assert!(!usage.has_pressure(), "no pressure at 84%");
     736              : 
     737            1 :         usage.add_available_bytes(16_000);
     738            1 :         assert!(!usage.has_pressure());
     739            1 :     }
     740              : }
        

Generated by: LCOV version 2.1-beta