TLA Line data Source code
1 : //! This module implements the pageserver-global disk-usage-based layer eviction task.
2 : //!
3 : //! # Mechanics
4 : //!
5 : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
6 : //! loop that evicts layers in response to a shortage of available bytes
7 : //! in the $repo/tenants directory's filesystem.
8 : //!
9 : //! The loop runs periodically at a configurable `period`.
10 : //!
11 : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
12 : //! It compares the returned usage data against two different types of thresholds.
13 : //! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
14 : //! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
15 : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
16 : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
17 : //!
18 : //! # Eviction Policy
19 : //!
20 : //! There are two thresholds:
21 : //! `max_usage_pct` is the maximum allowed usage, expressed in percent of the total filesystem space.
22 : //! If the actual usage is higher, the threshold is exceeded.
23 : //! `min_avail_bytes` is the minimum required available space, in bytes.
24 : //! If the actual available space is lower, the threshold is exceeded.
25 : //! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
26 : //! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
27 : //! The iteration evicts layers in LRU fashion, but with a weak reservation per tenant.
28 : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
29 : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
30 : //! start evicting layers that are part of the reservation, LRU first.
31 : //!
32 : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
33 : //! throughout the code, but no actual variable carries that name.
34 : //! The per-tenant default value is the tenant's largest layer file size, regardless of whether that layer is local or remote.
35 : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
36 : //! during page reconstruction.
37 : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
38 : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
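//!
//! # Example Configuration
//!
//! As an illustrative sketch (the numbers are made up, not recommendations), this is the
//! shape of the [`DiskUsageEvictionTaskConfig`] that drives the task; it is what
//! `PageServerConf::disk_usage_based_eviction` carries when the task is enabled:
//!
//! ```text
//! let task_config = DiskUsageEvictionTaskConfig {
//!     // pressure if more than 80% of the filesystem is used ...
//!     max_usage_pct: Percent::new(80).unwrap(),
//!     // ... or if less than 10 GiB is available
//!     min_avail_bytes: 10 * 1024 * 1024 * 1024,
//!     // run an iteration every 60 seconds
//!     period: Duration::from_secs(60),
//!     #[cfg(feature = "testing")]
//!     mock_statvfs: None,
//! };
//! ```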
39 :
40 : // Implementation notes:
41 : // - The `#[allow(dead_code)]` attributes above various structs suppress warnings about the Debug impl
42 : //   being the only reader of these fields. We do use the Debug impl, for semi-structured logging.
43 :
44 : use std::{
45 : collections::HashMap,
46 : sync::Arc,
47 : time::{Duration, SystemTime},
48 : };
49 :
50 : use anyhow::Context;
51 : use camino::Utf8Path;
52 : use remote_storage::GenericRemoteStorage;
53 : use serde::{Deserialize, Serialize};
54 : use tokio::time::Instant;
55 : use tokio_util::sync::CancellationToken;
56 : use tracing::{debug, error, info, instrument, warn, Instrument};
57 : use utils::completion;
58 : use utils::serde_percent::Percent;
59 :
60 : use crate::{
61 : config::PageServerConf,
62 : task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
63 : tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
64 : };
65 :
66 CBC 38 : #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
67 : pub struct DiskUsageEvictionTaskConfig {
68 : pub max_usage_pct: Percent,
69 : pub min_avail_bytes: u64,
70 : #[serde(with = "humantime_serde")]
71 : pub period: Duration,
72 : #[cfg(feature = "testing")]
73 : pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
74 : }
75 :
76 560 : #[derive(Default)]
77 : pub struct State {
78 : /// Prevents HTTP-triggered runs and the background task from running at the same time.
79 : mutex: tokio::sync::Mutex<()>,
80 : }
81 :
82 560 : pub fn launch_disk_usage_global_eviction_task(
83 560 : conf: &'static PageServerConf,
84 560 : storage: GenericRemoteStorage,
85 560 : state: Arc<State>,
86 560 : background_jobs_barrier: completion::Barrier,
87 560 : ) -> anyhow::Result<()> {
88 560 : let Some(task_config) = &conf.disk_usage_based_eviction else {
89 557 : info!("disk usage based eviction task not configured");
90 557 : return Ok(());
91 : };
92 :
93 3 : info!("launching disk usage based eviction task");
94 :
95 3 : task_mgr::spawn(
96 3 : BACKGROUND_RUNTIME.handle(),
97 3 : TaskKind::DiskUsageEviction,
98 3 : None,
99 3 : None,
100 3 : "disk usage based eviction",
101 3 : false,
102 3 : async move {
103 3 : let cancel = task_mgr::shutdown_token();
104 3 :
105 3 : // wait until initial load is complete, because we cannot evict from loading tenants.
106 6 : tokio::select! {
107 6 : _ = cancel.cancelled() => { return Ok(()); },
108 6 : _ = background_jobs_barrier.wait() => { }
109 6 : };
110 :
111 3 : disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
112 3 : .await;
113 UBC 0 : Ok(())
114 CBC 3 : },
115 3 : );
116 3 :
117 3 : Ok(())
118 560 : }
119 :
120 9 : #[instrument(skip_all)]
121 : async fn disk_usage_eviction_task(
122 : state: &State,
123 : task_config: &DiskUsageEvictionTaskConfig,
124 : storage: GenericRemoteStorage,
125 : tenants_dir: &Utf8Path,
126 : cancel: CancellationToken,
127 : ) {
128 UBC 0 : scopeguard::defer! {
129 0 : info!("disk usage based eviction task finishing");
130 : };
131 :
132 : use crate::tenant::tasks::random_init_delay;
133 : {
134 : if random_init_delay(task_config.period, &cancel)
135 : .await
136 : .is_err()
137 : {
138 : return;
139 : }
140 : }
141 :
142 : let mut iteration_no = 0;
143 : loop {
144 : iteration_no += 1;
145 : let start = Instant::now();
146 :
147 CBC 3 : async {
148 3 : let res = disk_usage_eviction_task_iteration(
149 3 : state,
150 3 : task_config,
151 3 : &storage,
152 3 : tenants_dir,
153 3 : &cancel,
154 3 : )
155 UBC 0 : .await;
156 :
157 CBC 3 : match res {
158 2 : Ok(()) => {}
159 1 : Err(e) => {
160 1 : // these stat failures are expected to be very rare
161 1 : warn!("iteration failed, unexpected error: {e:#}");
162 : }
163 : }
164 3 : }
165 : .instrument(tracing::info_span!("iteration", iteration_no))
166 : .await;
167 :
168 : let sleep_until = start + task_config.period;
169 : if tokio::time::timeout_at(sleep_until, cancel.cancelled())
170 : .await
171 : .is_ok()
172 : {
173 : break;
174 : }
175 : }
176 : }
177 :
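/// Abstraction over the disk usage accounting that the planning code needs: whether there
/// is pressure, and how crediting back a layer's bytes would change that.
///
/// The real implementation is `filesystem_level_usage::Usage`. As an illustrative sketch
/// (the type and field names below are hypothetical, not part of this module), a trivial
/// in-memory implementation for tests could look like this:
///
/// ```text
/// #[derive(Debug, Clone, Copy)]
/// struct FakeUsage {
///     avail_bytes: u64,
///     want_avail_bytes: u64,
/// }
///
/// impl Usage for FakeUsage {
///     fn has_pressure(&self) -> bool {
///         self.avail_bytes < self.want_avail_bytes
///     }
///     fn add_available_bytes(&mut self, bytes: u64) {
///         self.avail_bytes += bytes;
///     }
/// }
/// ```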
178 : pub trait Usage: Clone + Copy + std::fmt::Debug {
179 : fn has_pressure(&self) -> bool;
180 : fn add_available_bytes(&mut self, bytes: u64);
181 : }
182 :
183 3 : async fn disk_usage_eviction_task_iteration(
184 3 : state: &State,
185 3 : task_config: &DiskUsageEvictionTaskConfig,
186 3 : storage: &GenericRemoteStorage,
187 3 : tenants_dir: &Utf8Path,
188 3 : cancel: &CancellationToken,
189 3 : ) -> anyhow::Result<()> {
190 3 : let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
191 3 : .context("get filesystem-level disk usage before evictions")?;
192 2 : let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
193 2 : match res {
194 2 : Ok(outcome) => {
195 UBC 0 : debug!(?outcome, "disk_usage_eviction_iteration finished");
196 CBC 2 : match outcome {
197 UBC 0 : IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
198 0 : // nothing to do, select statement below will handle things
199 0 : }
200 CBC 2 : IterationOutcome::Finished(outcome) => {
201 : // Verify with statvfs whether we made any real progress
202 2 : let after = filesystem_level_usage::get(tenants_dir, task_config)
203 2 : // It's quite unlikely to hit the error here. Keep the code simple and bail out.
204 2 : .context("get filesystem-level disk usage after evictions")?;
205 :
206 UBC 0 : debug!(?after, "disk usage");
207 :
208 CBC 2 : if after.has_pressure() {
209 : // Don't bother doing an out-of-order iteration here now.
210 : // In practice, the task period is set to a value in the tens-of-seconds range,
211 : // which will cause another iteration to happen soon enough.
212 : // TODO: deltas between the three different usages would be helpful,
213 : // consider MiB, GiB, TiB
214 UBC 0 : warn!(?outcome, ?after, "disk usage still high");
215 : } else {
216 CBC 2 : info!(?outcome, ?after, "disk usage pressure relieved");
217 : }
218 : }
219 : }
220 : }
221 UBC 0 : Err(e) => {
222 0 : error!("disk_usage_eviction_iteration failed: {:#}", e);
223 : }
224 : }
225 :
226 CBC 2 : Ok(())
227 3 : }
228 :
229 5 : #[derive(Debug, Serialize)]
230 : #[allow(clippy::large_enum_variant)]
231 : pub enum IterationOutcome<U> {
232 : NoPressure,
233 : Cancelled,
234 : Finished(IterationOutcomeFinished<U>),
235 : }
236 :
237 : #[allow(dead_code)]
238 7 : #[derive(Debug, Serialize)]
239 : pub struct IterationOutcomeFinished<U> {
240 : /// The actual usage observed before we started the iteration.
241 : before: U,
242 : /// The expected value for `after`, according to internal accounting, after phase 1.
243 : planned: PlannedUsage<U>,
244 : /// The outcome of phase 2, where we actually do the evictions.
245 : ///
246 : /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
247 : /// be the same as `planned`.
248 : assumed: AssumedUsage<U>,
249 : }
250 :
251 7 : #[derive(Debug, Serialize)]
252 : #[allow(dead_code)]
253 : struct AssumedUsage<U> {
254 : /// The expected value for `after`, after phase 2.
255 : projected_after: U,
256 : /// The layers we failed to evict during phase 2.
257 : failed: LayerCount,
258 : }
259 :
260 : #[allow(dead_code)]
261 7 : #[derive(Debug, Serialize)]
262 : struct PlannedUsage<U> {
263 : respecting_tenant_min_resident_size: U,
264 : fallback_to_global_lru: Option<U>,
265 : }
266 :
267 : #[allow(dead_code)]
268 7 : #[derive(Debug, Default, Serialize)]
269 : struct LayerCount {
270 : file_sizes: u64,
271 : count: usize,
272 : }
273 :
274 7 : pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
275 7 : state: &State,
276 7 : storage: &GenericRemoteStorage,
277 7 : usage_pre: U,
278 7 : cancel: &CancellationToken,
279 7 : ) -> anyhow::Result<IterationOutcome<U>> {
280 : // Use tokio's mutex so the guard can be held across await points (std::sync::MutexGuard is not Send).
281 7 : let _g = state
282 7 : .mutex
283 7 : .try_lock()
284 7 : .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
285 :
286 UBC 0 : debug!(?usage_pre, "disk usage");
287 :
288 CBC 7 : if !usage_pre.has_pressure() {
289 UBC 0 : return Ok(IterationOutcome::NoPressure);
290 CBC 7 : }
291 :
292 7 : warn!(
293 7 : ?usage_pre,
294 7 : "running disk usage based eviction due to pressure"
295 7 : );
296 :
297 7 : let candidates = match collect_eviction_candidates(cancel).await? {
298 : EvictionCandidates::Cancelled => {
299 UBC 0 : return Ok(IterationOutcome::Cancelled);
300 : }
301 CBC 7 : EvictionCandidates::Finished(partitioned) => partitioned,
302 7 : };
303 7 :
304 7 : // Debug-log the list of candidates
305 7 : let now = SystemTime::now();
306 250 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
307 250 : let desc = candidate.layer.layer_desc();
308 UBC 0 : debug!(
309 0 : "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
310 0 : i + 1,
311 0 : candidates.len(),
312 0 : desc.file_size,
313 0 : now.duration_since(candidate.last_activity_ts)
314 0 : .unwrap()
315 0 : .as_micros(),
316 0 : partition,
317 0 : desc.tenant_id,
318 0 : desc.timeline_id,
319 0 : candidate.layer,
320 0 : );
321 : }
322 :
323 : // phase1: select victims to relieve pressure
324 : //
325 : // Walk through the list of candidates, until we have accumulated enough layers to get
326 : // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
327 : // how much disk space would be used after evicting all the layers up to the current
328 : // point in the list. The layers are collected in 'batched', grouped per timeline.
329 : //
330 : // If we get far enough in the list that we start to evict layers that are below
331 : // the tenant's min-resident-size threshold, print a warning, and memorize the disk
332 : // usage at that point in 'warned'; it becomes `PlannedUsage::respecting_tenant_min_resident_size`.
333 CBC 7 : let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
334 7 : let mut warned = None;
335 7 : let mut usage_planned = usage_pre;
336 148 : for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
337 148 : if !usage_planned.has_pressure() {
338 UBC 0 : debug!(
339 0 : no_candidates_evicted = i,
340 0 : "took enough candidates for pressure to be relieved"
341 0 : );
342 CBC 5 : break;
343 143 : }
344 143 :
345 143 : if partition == MinResidentSizePartition::Below && warned.is_none() {
346 2 : warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
347 2 : warned = Some(usage_planned);
348 141 : }
349 :
350 143 : usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
351 143 :
352 143 : batched
353 143 : .entry(TimelineKey(candidate.timeline))
354 143 : .or_default()
355 143 : .push(candidate.layer);
356 : }
357 :
358 7 : let usage_planned = match warned {
359 2 : Some(respecting_tenant_min_resident_size) => PlannedUsage {
360 2 : respecting_tenant_min_resident_size,
361 2 : fallback_to_global_lru: Some(usage_planned),
362 2 : },
363 5 : None => PlannedUsage {
364 5 : respecting_tenant_min_resident_size: usage_planned,
365 5 : fallback_to_global_lru: None,
366 5 : },
367 : };
368 UBC 0 : debug!(?usage_planned, "usage planned");
369 :
370 : // phase2: evict victims batched by timeline
371 :
372 : // After the loop, `usage_assumed` is the post-eviction usage,
373 : // according to internal accounting.
374 CBC 7 : let mut usage_assumed = usage_pre;
375 7 : let mut evictions_failed = LayerCount::default();
376 18 : for (timeline, batch) in batched {
377 11 : let tenant_id = timeline.tenant_id;
378 11 : let timeline_id = timeline.timeline_id;
379 11 : let batch_size = batch.len();
380 :
381 UBC 0 : debug!(%timeline_id, "evicting batch for timeline");
382 :
383 CBC 11 : async {
384 11 : let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
385 :
386 11 : match results {
387 UBC 0 : Err(e) => {
388 0 : warn!("failed to evict batch: {:#}", e);
389 : }
390 CBC 11 : Ok(results) => {
391 11 : assert_eq!(results.len(), batch.len());
392 143 : for (result, layer) in results.into_iter().zip(batch.iter()) {
393 143 : let file_size = layer.layer_desc().file_size;
394 UBC 0 : match result {
395 CBC 143 : Some(Ok(())) => {
396 143 : usage_assumed.add_available_bytes(file_size);
397 143 : }
398 : Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
399 UBC 0 : unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
400 : }
401 0 : Some(Err(EvictionError::FileNotFound)) => {
402 0 : evictions_failed.file_sizes += file_size;
403 0 : evictions_failed.count += 1;
404 0 : }
405 : Some(Err(
406 0 : e @ EvictionError::LayerNotFound(_)
407 0 : | e @ EvictionError::StatFailed(_),
408 : )) => {
409 0 : let e = utils::error::report_compact_sources(&e);
410 0 : warn!(%layer, "failed to evict layer: {e}");
411 0 : evictions_failed.file_sizes += file_size;
412 0 : evictions_failed.count += 1;
413 : }
414 0 : Some(Err(EvictionError::MetadataInconsistency(detail))) => {
415 0 : warn!(%layer, "failed to evict layer: {detail}");
416 0 : evictions_failed.file_sizes += file_size;
417 0 : evictions_failed.count += 1;
418 : }
419 : None => {
420 0 : assert!(cancel.is_cancelled());
421 0 : return;
422 : }
423 : }
424 : }
425 : }
426 : }
427 CBC 11 : }
428 11 : .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
429 UBC 0 : .await;
430 :
431 CBC 11 : if cancel.is_cancelled() {
432 UBC 0 : return Ok(IterationOutcome::Cancelled);
433 CBC 11 : }
434 : }
435 :
436 7 : Ok(IterationOutcome::Finished(IterationOutcomeFinished {
437 7 : before: usage_pre,
438 7 : planned: usage_planned,
439 7 : assumed: AssumedUsage {
440 7 : projected_after: usage_assumed,
441 7 : failed: evictions_failed,
442 7 : },
443 7 : }))
444 7 : }
445 :
446 UBC 0 : #[derive(Clone)]
447 : struct EvictionCandidate {
448 : timeline: Arc<Timeline>,
449 : layer: Arc<dyn PersistentLayer>,
450 : last_activity_ts: SystemTime,
451 : }
452 :
453 CBC 1007 : #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
454 : enum MinResidentSizePartition {
455 : Above,
456 : Below,
457 : }
458 :
459 : enum EvictionCandidates {
460 : Cancelled,
461 : Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
462 : }
463 :
464 : /// Gather the eviction candidates.
465 : ///
466 : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
467 : /// order. A caller that evicts in that order, until pressure is relieved, implements
468 : /// the eviction policy outlined in the module comment.
469 : ///
470 : /// # Example
471 : ///
472 : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
473 : /// Each layer has size 100, and both tenants' min_resident_size is 150.
474 : /// The eviction order would be
475 : ///
476 : /// ```text
477 : /// partition last_activity_ts tenant/layer
478 : /// Above 18:29 B/c
479 : /// Above 18:30 A/c
480 : /// Above 19:00 A/b
481 : /// Above 19:05 B/b
482 : /// Above 20:00 B/a
483 : /// Above 20:03 A/a
484 : /// Below 20:30 A/d
485 : /// Below 20:40 B/d
486 : /// Below 20:45 B/e
487 : /// Below 20:58 A/e
488 : /// ```
489 : ///
490 : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `B/c, A/c, A/b`.
491 : /// They are all in the `Above` partition, so we respected each tenant's min_resident_size.
492 : ///
493 : /// But if we need to evict 900 bytes to relieve pressure, we'd evict
494 : /// `B/c, A/c, A/b, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
495 : /// after exhausting the `Above` partition.
496 : /// So, we did not respect each tenant's min_resident_size.
497 7 : async fn collect_eviction_candidates(
498 7 : cancel: &CancellationToken,
499 7 : ) -> anyhow::Result<EvictionCandidates> {
500 : // get a snapshot of the list of tenants
501 7 : let tenants = tenant::mgr::list_tenants()
502 UBC 0 : .await
503 CBC 7 : .context("get list of tenants")?;
504 :
505 7 : let mut candidates = Vec::new();
506 :
507 21 : for (tenant_id, _state) in &tenants {
508 14 : if cancel.is_cancelled() {
509 UBC 0 : return Ok(EvictionCandidates::Cancelled);
510 CBC 14 : }
511 14 : let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
512 13 : Ok(tenant) => tenant,
513 1 : Err(e) => {
514 : // this can happen if tenant has lifecycle transition after we fetched it
515 UBC 0 : debug!("failed to get tenant: {e:#}");
516 CBC 1 : continue;
517 : }
518 : };
519 :
520 : // collect layers from all timelines in this tenant
521 : //
522 : // If one of the timelines becomes `!is_active()` during the iteration,
523 : // for example because we're shutting down, then `max_layer_size` can be too small.
524 : // That's OK. This code only runs under a disk pressure situation, and being
525 : // a little unfair to tenants during shutdown in such a situation is tolerable.
526 13 : let mut tenant_candidates = Vec::new();
527 13 : let mut max_layer_size = 0;
528 13 : for tl in tenant.list_timelines() {
529 13 : if !tl.is_active() {
530 UBC 0 : continue;
531 CBC 13 : }
532 13 : let info = tl.get_local_layers_for_disk_usage_eviction().await;
533 UBC 0 : debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
534 CBC 13 : tenant_candidates.extend(
535 13 : info.resident_layers
536 13 : .into_iter()
537 250 : .map(|layer_infos| (tl.clone(), layer_infos)),
538 13 : );
539 13 : max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
540 13 :
541 13 : if cancel.is_cancelled() {
542 UBC 0 : return Ok(EvictionCandidates::Cancelled);
543 CBC 13 : }
544 : }
545 :
546 : // `min_resident_size` defaults to maximum layer file size of the tenant.
547 : // This ensures that each tenant can have at least one layer resident at a given time,
548 : // ensuring forward progress for a single Timeline::get in that tenant.
549 : // It's a questionable heuristic since, usually, there are many Timeline::get
550 : // requests going on for a tenant, and, at least in Neon prod, the median
551 : // layer file size is much smaller than the compaction target size.
552 : // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
553 : // That's what's typically used by the various background loops.
554 : //
555 : // The default can be overridden with a fixed value in the tenant conf.
556 : // A default override can be put in the default tenant conf in the pageserver.toml.
557 13 : let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
558 UBC 0 : debug!(
559 0 : tenant_id=%tenant.tenant_id(),
560 0 : overridden_size=s,
561 0 : "using overridden min resident size for tenant"
562 0 : );
563 CBC 2 : s
564 : } else {
565 UBC 0 : debug!(
566 0 : tenant_id=%tenant.tenant_id(),
567 0 : max_layer_size,
568 0 : "using max layer size as min_resident_size for tenant",
569 0 : );
570 CBC 11 : max_layer_size
571 : };
572 :
573 : // Sort layers most-recently-used first, then partition by
574 : // cumsum above/below min_resident_size.
575 13 : tenant_candidates
576 2384 : .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
577 13 : let mut cumsum: i128 = 0;
578 250 : for (timeline, layer_info) in tenant_candidates.into_iter() {
579 250 : let file_size = layer_info.file_size();
580 250 : let candidate = EvictionCandidate {
581 250 : timeline,
582 250 : last_activity_ts: layer_info.last_activity_ts,
583 250 : layer: layer_info.layer,
584 250 : };
585 250 : let partition = if cumsum > min_resident_size as i128 {
586 195 : MinResidentSizePartition::Above
587 : } else {
588 55 : MinResidentSizePartition::Below
589 : };
590 250 : candidates.push((partition, candidate));
591 250 : cumsum += i128::from(file_size);
592 : }
593 : }
594 :
595 7 : debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
596 UBC 0 : "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
597 CBC 7 : candidates
598 2000 : .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
599 7 :
600 7 : Ok(EvictionCandidates::Finished(candidates))
601 7 : }
602 :
603 : struct TimelineKey(Arc<Timeline>);
604 :
605 : impl PartialEq for TimelineKey {
606 132 : fn eq(&self, other: &Self) -> bool {
607 132 : Arc::ptr_eq(&self.0, &other.0)
608 132 : }
609 : }
610 :
611 : impl Eq for TimelineKey {}
612 :
613 : impl std::hash::Hash for TimelineKey {
614 143 : fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
615 143 : Arc::as_ptr(&self.0).hash(state);
616 143 : }
617 : }
618 :
619 : impl std::ops::Deref for TimelineKey {
620 : type Target = Timeline;
621 :
622 33 : fn deref(&self) -> &Self::Target {
623 33 : self.0.as_ref()
624 33 : }
625 : }
626 :
627 : mod filesystem_level_usage {
628 : use anyhow::Context;
629 : use camino::Utf8Path;
630 :
631 : use crate::statvfs::Statvfs;
632 :
633 : use super::DiskUsageEvictionTaskConfig;
634 :
635 10 : #[derive(Debug, Clone, Copy)]
636 : #[allow(dead_code)]
637 : pub struct Usage<'a> {
638 : config: &'a DiskUsageEvictionTaskConfig,
639 :
640 : /// Filesystem capacity
641 : total_bytes: u64,
642 : /// Free filesystem space
643 : avail_bytes: u64,
644 : }
645 :
646 : impl super::Usage for Usage<'_> {
647 51 : fn has_pressure(&self) -> bool {
648 51 : let usage_pct =
649 51 : (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
650 51 :
651 51 : let pressures = [
652 51 : (
653 51 : "min_avail_bytes",
654 51 : self.avail_bytes < self.config.min_avail_bytes,
655 51 : ),
656 51 : (
657 51 : "max_usage_pct",
658 51 : usage_pct >= self.config.max_usage_pct.get() as u64,
659 51 : ),
660 51 : ];
661 51 :
662 88 : pressures.into_iter().any(|(_, has_pressure)| has_pressure)
663 51 : }
664 :
665 82 : fn add_available_bytes(&mut self, bytes: u64) {
666 82 : self.avail_bytes += bytes;
667 82 : }
668 : }
669 :
670 5 : pub fn get<'a>(
671 5 : tenants_dir: &Utf8Path,
672 5 : config: &'a DiskUsageEvictionTaskConfig,
673 5 : ) -> anyhow::Result<Usage<'a>> {
674 5 : let mock_config = {
675 5 : #[cfg(feature = "testing")]
676 5 : {
677 5 : config.mock_statvfs.as_ref()
678 : }
679 : #[cfg(not(feature = "testing"))]
680 : {
681 : None
682 : }
683 : };
684 :
685 5 : let stat = Statvfs::get(tenants_dir, mock_config)
686 5 : .context("statvfs failed, presumably directory got unlinked")?;
687 :
688 : // https://unix.stackexchange.com/a/703650
689 4 : let blocksize = if stat.fragment_size() > 0 {
690 4 : stat.fragment_size()
691 : } else {
692 UBC 0 : stat.block_size()
693 : };
694 :
695 : // use blocks_available (b_avail) since the pageserver runs as an unprivileged user
696 CBC 4 : let avail_bytes = stat.blocks_available() * blocksize;
697 4 : let total_bytes = stat.blocks() * blocksize;
698 4 :
699 4 : Ok(Usage {
700 4 : config,
701 4 : total_bytes,
702 4 : avail_bytes,
703 4 : })
704 5 : }
705 :
706 1 : #[test]
707 1 : fn max_usage_pct_pressure() {
708 1 : use super::Usage as _;
709 1 : use std::time::Duration;
710 1 : use utils::serde_percent::Percent;
711 1 :
712 1 : let mut usage = Usage {
713 1 : config: &DiskUsageEvictionTaskConfig {
714 1 : max_usage_pct: Percent::new(85).unwrap(),
715 1 : min_avail_bytes: 0,
716 1 : period: Duration::MAX,
717 1 : #[cfg(feature = "testing")]
718 1 : mock_statvfs: None,
719 1 : },
720 1 : total_bytes: 100_000,
721 1 : avail_bytes: 0,
722 1 : };
723 1 :
724 1 : assert!(usage.has_pressure(), "expected pressure at 100%");
725 :
726 1 : usage.add_available_bytes(14_000);
727 1 : assert!(usage.has_pressure(), "expected pressure at 86%");
728 :
729 1 : usage.add_available_bytes(999);
730 1 : assert!(usage.has_pressure(), "expected pressure at 85.001%");
731 :
732 1 : usage.add_available_bytes(1);
733 1 : assert!(usage.has_pressure(), "expected pressure at precisely 85%");
734 :
735 1 : usage.add_available_bytes(1);
736 1 : assert!(!usage.has_pressure(), "no pressure at 84.999%");
737 :
738 1 : usage.add_available_bytes(999);
739 1 : assert!(!usage.has_pressure(), "no pressure at 84%");
740 :
741 1 : usage.add_available_bytes(16_000);
742 1 : assert!(!usage.has_pressure());
743 1 : }
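    // Hedged companion sketch (not part of the original test suite): the same style of test
    // could exercise the `min_avail_bytes` threshold. The test name and the constants below
    // are made up for illustration.
    #[test]
    fn min_avail_bytes_pressure() {
        use super::Usage as _;
        use std::time::Duration;
        use utils::serde_percent::Percent;

        let mut usage = Usage {
            config: &DiskUsageEvictionTaskConfig {
                // set the percentage threshold high enough that it never triggers here
                max_usage_pct: Percent::new(99).unwrap(),
                min_avail_bytes: 16_000,
                period: Duration::MAX,
                #[cfg(feature = "testing")]
                mock_statvfs: None,
            },
            total_bytes: 100_000,
            avail_bytes: 2_000,
        };

        assert!(usage.has_pressure(), "expected pressure below min_avail_bytes");

        usage.add_available_bytes(13_999);
        assert!(usage.has_pressure(), "still one byte short of min_avail_bytes");

        usage.add_available_bytes(1);
        assert!(!usage.has_pressure(), "exactly min_avail_bytes available, no pressure");
    }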
744 : }