TLA Line data Source code
1 : //! This module implements the pageserver-global disk-usage-based layer eviction task.
2 : //!
3 : //! # Mechanics
4 : //!
5 : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
6 : //! loop that evicts layers in response to a shortage of available bytes
7 : //! in the $repo/tenants directory's filesystem.
8 : //!
9 : //! The loop runs periodically at a configurable `period`.
10 : //!
11 : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
12 : //! It compares the returned usage data against two different types of thresholds.
13 : //! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
14 : //! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
15 : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
16 : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
17 : //!
18 : //! # Eviction Policy
19 : //!
20 : //! There are two thresholds:
21 : //! `max_usage_pct` is the maximum allowed usage, expressed in percent of the total filesystem space.
22 : //! If the actual usage is higher, the threshold is exceeded.
23 : //! `min_avail_bytes` is the minimum required available space, in bytes.
24 : //! If the actual available space is lower, the threshold is exceeded.
25 : //! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
26 : //! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
27 : //! The iteration evicts layers in LRU fashion, but with a weak reservation per tenant.
28 : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
29 : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
30 : //! start evicting layers that are part of the reservation, LRU first.
31 : //!
32 : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
33 : //! throughout the code, but no actual variable carries that name.
34 : //! The per-tenant default value is the tenant's largest layer file size, regardless of whether that layer is local or remote.
35 : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
36 : //! during page reconstruction.
37 : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
38 : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
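//!
//! # Example Configuration
//!
//! As an illustrative sketch (the numbers are made up, not recommendations), this is the
//! shape of the [`DiskUsageEvictionTaskConfig`] that drives the task; it is what
//! `PageServerConf::disk_usage_based_eviction` carries when the task is enabled:
//!
//! ```text
//! let task_config = DiskUsageEvictionTaskConfig {
//!     // pressure if more than 80% of the filesystem is used ...
//!     max_usage_pct: Percent::new(80).unwrap(),
//!     // ... or if less than 10 GiB is available
//!     min_avail_bytes: 10 * 1024 * 1024 * 1024,
//!     // run an iteration every 60 seconds
//!     period: Duration::from_secs(60),
//!     #[cfg(feature = "testing")]
//!     mock_statvfs: None,
//! };
//! ```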
39 :
40 : // Implementation notes:
41 : // - The `#[allow(dead_code)]` attributes above various structs suppress warnings about the Debug impl
42 : //   being the only reader of these fields. We do use the Debug impl, for semi-structured logging.
43 :
44 : use std::{
45 : collections::HashMap,
46 : sync::Arc,
47 : time::{Duration, SystemTime},
48 : };
49 :
50 : use anyhow::Context;
51 : use camino::Utf8Path;
52 : use remote_storage::GenericRemoteStorage;
53 : use serde::{Deserialize, Serialize};
54 : use tokio::time::Instant;
55 : use tokio_util::sync::CancellationToken;
56 : use tracing::{debug, error, info, instrument, warn, Instrument};
57 : use utils::completion;
58 : use utils::serde_percent::Percent;
59 :
60 : use crate::{
61 : config::PageServerConf,
62 : task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
63 : tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
64 : };
65 :
66 CBC 38 : #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
67 : pub struct DiskUsageEvictionTaskConfig {
68 : pub max_usage_pct: Percent,
69 : pub min_avail_bytes: u64,
70 : #[serde(with = "humantime_serde")]
71 : pub period: Duration,
72 : #[cfg(feature = "testing")]
73 : pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
74 : }
75 :
76 560 : #[derive(Default)]
77 : pub struct State {
78 : /// Prevents HTTP-triggered runs and the background task from running at the same time.
79 : mutex: tokio::sync::Mutex<()>,
80 : }
81 :
82 560 : pub fn launch_disk_usage_global_eviction_task(
83 560 : conf: &'static PageServerConf,
84 560 : storage: GenericRemoteStorage,
85 560 : state: Arc<State>,
86 560 : background_jobs_barrier: completion::Barrier,
87 560 : ) -> anyhow::Result<()> {
88 560 : let Some(task_config) = &conf.disk_usage_based_eviction else {
89 557 : info!("disk usage based eviction task not configured");
90 557 : return Ok(());
91 : };
92 :
93 3 : info!("launching disk usage based eviction task");
94 :
95 3 : task_mgr::spawn(
96 3 : BACKGROUND_RUNTIME.handle(),
97 3 : TaskKind::DiskUsageEviction,
98 3 : None,
99 3 : None,
100 3 : "disk usage based eviction",
101 3 : false,
102 3 : async move {
103 3 : let cancel = task_mgr::shutdown_token();
104 3 :
105 3 : // wait until initial load is complete, because we cannot evict from loading tenants.
106 6 : tokio::select! {
107 6 : _ = cancel.cancelled() => { return Ok(()); },
108 6 : _ = background_jobs_barrier.wait() => { }
109 6 : };
110 :
111 3 : disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
112 3 : .await;
113 UBC 0 : Ok(())
114 CBC 3 : },
115 3 : );
116 3 :
117 3 : Ok(())
118 560 : }
119 :
120 9 : #[instrument(skip_all)]
121 : async fn disk_usage_eviction_task(
122 : state: &State,
123 : task_config: &DiskUsageEvictionTaskConfig,
124 : storage: GenericRemoteStorage,
125 : tenants_dir: &Utf8Path,
126 : cancel: CancellationToken,
127 : ) {
128 UBC 0 : scopeguard::defer! {
129 0 : info!("disk usage based eviction task finishing");
130 : };
131 :
132 : use crate::tenant::tasks::random_init_delay;
133 : {
134 : if random_init_delay(task_config.period, &cancel)
135 : .await
136 : .is_err()
137 : {
138 : return;
139 : }
140 : }
141 :
142 : let mut iteration_no = 0;
143 : loop {
144 : iteration_no += 1;
145 : let start = Instant::now();
146 :
147 CBC 3 : async {
148 3 : let res = disk_usage_eviction_task_iteration(
149 3 : state,
150 3 : task_config,
151 3 : &storage,
152 3 : tenants_dir,
153 3 : &cancel,
154 3 : )
155 UBC 0 : .await;
156 :
157 CBC 3 : match res {
158 2 : Ok(()) => {}
159 1 : Err(e) => {
160 1 : // these stat failures are expected to be very rare
161 1 : warn!("iteration failed, unexpected error: {e:#}");
162 : }
163 : }
164 3 : }
165 : .instrument(tracing::info_span!("iteration", iteration_no))
166 : .await;
167 :
168 : let sleep_until = start + task_config.period;
169 : if tokio::time::timeout_at(sleep_until, cancel.cancelled())
170 : .await
171 : .is_ok()
172 : {
173 : break;
174 : }
175 : }
176 : }
177 :
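/// Abstraction over the disk usage accounting that the planning code needs: whether there
/// is pressure, and how crediting back a layer's bytes would change that.
///
/// The real implementation is `filesystem_level_usage::Usage`. As an illustrative sketch
/// (the type and field names below are hypothetical, not part of this module), a trivial
/// in-memory implementation for tests could look like this:
///
/// ```text
/// #[derive(Debug, Clone, Copy)]
/// struct FakeUsage {
///     avail_bytes: u64,
///     want_avail_bytes: u64,
/// }
///
/// impl Usage for FakeUsage {
///     fn has_pressure(&self) -> bool {
///         self.avail_bytes < self.want_avail_bytes
///     }
///     fn add_available_bytes(&mut self, bytes: u64) {
///         self.avail_bytes += bytes;
///     }
/// }
/// ```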
178 : pub trait Usage: Clone + Copy + std::fmt::Debug {
179 : fn has_pressure(&self) -> bool;
180 : fn add_available_bytes(&mut self, bytes: u64);
181 : }
182 :
183 3 : async fn disk_usage_eviction_task_iteration(
184 3 : state: &State,
185 3 : task_config: &DiskUsageEvictionTaskConfig,
186 3 : storage: &GenericRemoteStorage,
187 3 : tenants_dir: &Utf8Path,
188 3 : cancel: &CancellationToken,
189 3 : ) -> anyhow::Result<()> {
190 3 : let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
191 3 : .context("get filesystem-level disk usage before evictions")?;
192 2 : let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
193 2 : match res {
194 2 : Ok(outcome) => {
195 UBC 0 : debug!(?outcome, "disk_usage_eviction_iteration finished");
196 CBC 2 : match outcome {
197 UBC 0 : IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
198 0 : // nothing to do, select statement below will handle things
199 0 : }
200 CBC 2 : IterationOutcome::Finished(outcome) => {
201 : // Verify with statvfs whether we made any real progress
202 2 : let after = filesystem_level_usage::get(tenants_dir, task_config)
203 2 : // It's quite unlikely to hit the error here. Keep the code simple and bail out.
204 2 : .context("get filesystem-level disk usage after evictions")?;
205 :
206 UBC 0 : debug!(?after, "disk usage");
207 :
208 CBC 2 : if after.has_pressure() {
209 : // Don't bother doing an out-of-order iteration here now.
210 : // In practice, the task period is set to a value in the tens-of-seconds range,
211 : // which will cause another iteration to happen soon enough.
212 : // TODO: deltas between the three different usages would be helpful,
213 : // consider MiB, GiB, TiB
214 UBC 0 : warn!(?outcome, ?after, "disk usage still high");
215 : } else {
216 CBC 2 : info!(?outcome, ?after, "disk usage pressure relieved");
217 : }
218 : }
219 : }
220 : }
221 UBC 0 : Err(e) => {
222 0 : error!("disk_usage_eviction_iteration failed: {:#}", e);
223 : }
224 : }
225 :
226 CBC 2 : Ok(())
227 3 : }
228 :
229 5 : #[derive(Debug, Serialize)]
230 : #[allow(clippy::large_enum_variant)]
231 : pub enum IterationOutcome<U> {
232 : NoPressure,
233 : Cancelled,
234 : Finished(IterationOutcomeFinished<U>),
235 : }
236 :
237 : #[allow(dead_code)]
238 7 : #[derive(Debug, Serialize)]
239 : pub struct IterationOutcomeFinished<U> {
240 : /// The actual usage observed before we started the iteration.
241 : before: U,
242 : /// The expected value for `after`, according to internal accounting, after phase 1.
243 : planned: PlannedUsage<U>,
244 : /// The outcome of phase 2, where we actually do the evictions.
245 : ///
246 : /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
247 : /// be the same as `planned`.
248 : assumed: AssumedUsage<U>,
249 : }
250 :
251 7 : #[derive(Debug, Serialize)]
252 : #[allow(dead_code)]
253 : struct AssumedUsage<U> {
254 : /// The expected value for `after`, after phase 2.
255 : projected_after: U,
256 : /// The layers we failed to evict during phase 2.
257 : failed: LayerCount,
258 : }
259 :
260 : #[allow(dead_code)]
261 7 : #[derive(Debug, Serialize)]
262 : struct PlannedUsage<U> {
263 : respecting_tenant_min_resident_size: U,
264 : fallback_to_global_lru: Option<U>,
265 : }
266 :
267 : #[allow(dead_code)]
268 7 : #[derive(Debug, Default, Serialize)]
269 : struct LayerCount {
270 : file_sizes: u64,
271 : count: usize,
272 : }
273 :
274 7 : pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
275 7 : state: &State,
276 7 : storage: &GenericRemoteStorage,
277 7 : usage_pre: U,
278 7 : cancel: &CancellationToken,
279 7 : ) -> anyhow::Result<IterationOutcome<U>> {
280 : // Use tokio's mutex so the guard can be held across await points (std::sync::MutexGuard is not Send).
281 7 : let _g = state
282 7 : .mutex
283 7 : .try_lock()
284 7 : .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
285 :
286 UBC 0 : debug!(?usage_pre, "disk usage");
287 :
288 CBC 7 : if !usage_pre.has_pressure() {
289 UBC 0 : return Ok(IterationOutcome::NoPressure);
290 CBC 7 : }
291 :
292 7 : warn!(
293 7 : ?usage_pre,
294 7 : "running disk usage based eviction due to pressure"
295 7 : );
296 :
297 7 : let candidates = match collect_eviction_candidates(cancel).await? {
298 : EvictionCandidates::Cancelled => {
299 UBC 0 : return Ok(IterationOutcome::Cancelled);
300 : }
301 CBC 7 : EvictionCandidates::Finished(partitioned) => partitioned,
302 7 : };
303 7 :
304 7 : // Debug-log the list of candidates
305 7 : let now = SystemTime::now();
306 250 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
307 250 : let desc = candidate.layer.layer_desc();
308 UBC 0 : debug!(
309 0 : "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
310 0 : i + 1,
311 0 : candidates.len(),
312 0 : desc.file_size,
313 0 : now.duration_since(candidate.last_activity_ts)
314 0 : .unwrap()
315 0 : .as_micros(),
316 0 : partition,
317 0 : desc.tenant_id,
318 0 : desc.timeline_id,
319 0 : candidate.layer,
320 0 : );
321 : }
322 :
323 : // phase1: select victims to relieve pressure
324 : //
325 : // Walk through the list of candidates, until we have accumulated enough layers to get
326 : // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
327 : // how much disk space would be used after evicting all the layers up to the current
328 : // point in the list. The layers are collected in 'batched', grouped per timeline.
329 : //
330 : // If we get far enough in the list that we start to evict layers that are below
331 : // the tenant's min-resident-size threshold, print a warning, and memorize the disk
332 : // usage at that point in 'warned'; it becomes `PlannedUsage::respecting_tenant_min_resident_size`.
333 CBC 7 : let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
334 7 : let mut warned = None;
335 7 : let mut usage_planned = usage_pre;
336 148 : for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
337 148 : if !usage_planned.has_pressure() {
338 UBC 0 : debug!(
339 0 : no_candidates_evicted = i,
340 0 : "took enough candidates for pressure to be relieved"
341 0 : );
342 CBC 5 : break;
343 143 : }
344 143 :
345 143 : if partition == MinResidentSizePartition::Below && warned.is_none() {
346 2 : warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
347 2 : warned = Some(usage_planned);
348 141 : }
349 :
350 143 : usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
351 143 :
352 143 : batched
353 143 : .entry(TimelineKey(candidate.timeline))
354 143 : .or_default()
355 143 : .push(candidate.layer);
356 : }
357 :
358 7 : let usage_planned = match warned {
359 2 : Some(respecting_tenant_min_resident_size) => PlannedUsage {
360 2 : respecting_tenant_min_resident_size,
361 2 : fallback_to_global_lru: Some(usage_planned),
362 2 : },
363 5 : None => PlannedUsage {
364 5 : respecting_tenant_min_resident_size: usage_planned,
365 5 : fallback_to_global_lru: None,
366 5 : },
367 : };
368 UBC 0 : debug!(?usage_planned, "usage planned");
369 :
370 : // phase2: evict victims batched by timeline
371 :
372 : // After the loop, `usage_assumed` is the post-eviction usage,
373 : // according to internal accounting.
374 CBC 7 : let mut usage_assumed = usage_pre;
375 7 : let mut evictions_failed = LayerCount::default();
376 18 : for (timeline, batch) in batched {
377 11 : let tenant_id = timeline.tenant_id;
378 11 : let timeline_id = timeline.timeline_id;
379 11 : let batch_size = batch.len();
380 :
381 UBC 0 : debug!(%timeline_id, "evicting batch for timeline");
382 :
383 CBC 11 : async {
384 11 : let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
385 :
386 11 : match results {
387 UBC 0 : Err(e) => {
388 0 : warn!("failed to evict batch: {:#}", e);
389 : }
390 CBC 11 : Ok(results) => {
391 11 : assert_eq!(results.len(), batch.len());
392 143 : for (result, layer) in results.into_iter().zip(batch.iter()) {
393 143 : let file_size = layer.layer_desc().file_size;
394 UBC 0 : match result {
395 CBC 143 : Some(Ok(())) => {
396 143 : usage_assumed.add_available_bytes(file_size);
397 143 : }
398 : Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
399 UBC 0 : unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
400 : }
401 0 : Some(Err(EvictionError::FileNotFound)) => {
402 0 : evictions_failed.file_sizes += file_size;
403 0 : evictions_failed.count += 1;
404 0 : }
405 : Some(Err(
406 0 : e @ EvictionError::LayerNotFound(_)
407 0 : | e @ EvictionError::StatFailed(_),
408 : )) => {
409 0 : let e = utils::error::report_compact_sources(&e);
410 0 : warn!(%layer, "failed to evict layer: {e}");
411 0 : evictions_failed.file_sizes += file_size;
412 0 : evictions_failed.count += 1;
413 : }
414 0 : Some(Err(EvictionError::MetadataInconsistency(detail))) => {
415 0 : warn!(%layer, "failed to evict layer: {detail}");
416 0 : evictions_failed.file_sizes += file_size;
417 0 : evictions_failed.count += 1;
418 : }
419 : None => {
420 0 : assert!(cancel.is_cancelled());
421 0 : return;
422 : }
423 : }
424 : }
425 : }
426 : }
427 CBC 11 : }
428 11 : .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
429 UBC 0 : .await;
430 :
431 CBC 11 : if cancel.is_cancelled() {
432 UBC 0 : return Ok(IterationOutcome::Cancelled);
433 CBC 11 : }
434 : }
435 :
436 7 : Ok(IterationOutcome::Finished(IterationOutcomeFinished {
437 7 : before: usage_pre,
438 7 : planned: usage_planned,
439 7 : assumed: AssumedUsage {
440 7 : projected_after: usage_assumed,
441 7 : failed: evictions_failed,
442 7 : },
443 7 : }))
444 7 : }
445 :
446 UBC 0 : #[derive(Clone)]
447 : struct EvictionCandidate {
448 : timeline: Arc<Timeline>,
449 : layer: Arc<dyn PersistentLayer>,
450 : last_activity_ts: SystemTime,
451 : }
452 :
453 CBC 1007 : #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
454 : enum MinResidentSizePartition {
455 : Above,
456 : Below,
457 : }
458 :
459 : enum EvictionCandidates {
460 : Cancelled,
461 : Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
462 : }
463 :
464 : /// Gather the eviction candidates.
465 : ///
466 : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
467 : /// order. A caller that evicts in that order, until pressure is relieved, implements
468 : /// the eviction policy outlined in the module comment.
469 : ///
470 : /// # Example
471 : ///
472 : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
473 : /// Each layer has size 100, and both tenants' min_resident_size is 150.
474 : /// The eviction order would be
475 : ///
476 : /// ```text
477 : /// partition last_activity_ts tenant/layer
478 : /// Above 18:29 B/c
479 : /// Above 18:30 A/c
480 : /// Above 19:00 A/b
481 : /// Above 19:05 B/b
482 : /// Above 20:00 B/a
483 : /// Above 20:03 A/a
484 : /// Below 20:30 A/d
485 : /// Below 20:40 B/d
486 : /// Below 20:45 B/e
487 : /// Below 20:58 A/e
488 : /// ```
489 : ///
490 : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `B/c, A/c, A/b`.
491 : /// They are all in the `Above` partition, so we respected each tenant's min_resident_size.
492 : ///
493 : /// But if we need to evict 900 bytes to relieve pressure, we'd evict
494 : /// `B/c, A/c, A/b, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
495 : /// after exhausting the `Above` partition.
496 : /// So, we did not respect each tenant's min_resident_size.
497 7 : async fn collect_eviction_candidates(
498 7 : cancel: &CancellationToken,
499 7 : ) -> anyhow::Result<EvictionCandidates> {
500 : // get a snapshot of the list of tenants
501 7 : let tenants = tenant::mgr::list_tenants()
502 UBC 0 : .await
503 CBC 7 : .context("get list of tenants")?;
504 :
505 7 : let mut candidates = Vec::new();
506 :
507 21 : for (tenant_id, _state) in &tenants {
508 14 : if cancel.is_cancelled() {
509 UBC 0 : return Ok(EvictionCandidates::Cancelled);
510 CBC 14 : }
511 14 : let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
512 13 : Ok(tenant) => tenant,
513 1 : Err(e) => {
514 : // this can happen if tenant has lifecycle transition after we fetched it
515 UBC 0 : debug!("failed to get tenant: {e:#}");
516 CBC 1 : continue;
517 : }
518 : };
519 :
520 : // collect layers from all timelines in this tenant
521 : //
522 : // If one of the timelines becomes `!is_active()` during the iteration,
523 : // for example because we're shutting down, then `max_layer_size` can be too small.
524 : // That's OK. This code only runs under a disk pressure situation, and being
525 : // a little unfair to tenants during shutdown in such a situation is tolerable.
526 13 : let mut tenant_candidates = Vec::new();
527 13 : let mut max_layer_size = 0;
528 13 : for tl in tenant.list_timelines() {
529 13 : if !tl.is_active() {
530 UBC 0 : continue;
531 CBC 13 : }
532 13 : let info = tl.get_local_layers_for_disk_usage_eviction().await;
533 UBC 0 : debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
534 CBC 13 : tenant_candidates.extend(
535 13 : info.resident_layers
536 13 : .into_iter()
537 250 : .map(|layer_infos| (tl.clone(), layer_infos)),
538 13 : );
539 13 : max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
540 13 :
541 13 : if cancel.is_cancelled() {
542 UBC 0 : return Ok(EvictionCandidates::Cancelled);
543 CBC 13 : }
544 : }
545 :
546 : // `min_resident_size` defaults to maximum layer file size of the tenant.
547 : // This ensures that each tenant can have at least one layer resident at a given time,
548 : // ensuring forward progress for a single Timeline::get in that tenant.
549 : // It's a questionable heuristic since, usually, there are many Timeline::get
550 : // requests going on for a tenant, and, at least in Neon prod, the median
551 : // layer file size is much smaller than the compaction target size.
552 : // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
553 : // That's what's typically used by the various background loops.
554 : //
555 : // The default can be overridden with a fixed value in the tenant conf.
556 : // A default override can be put in the default tenant conf in the pageserver.toml.
557 13 : let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
558 UBC 0 : debug!(
559 0 : tenant_id=%tenant.tenant_id(),
560 0 : overridden_size=s,
561 0 : "using overridden min resident size for tenant"
562 0 : );
563 CBC 2 : s
564 : } else {
565 UBC 0 : debug!(
566 0 : tenant_id=%tenant.tenant_id(),
567 0 : max_layer_size,
568 0 : "using max layer size as min_resident_size for tenant",
569 0 : );
570 CBC 11 : max_layer_size
571 : };
572 :
573 : // Sort layers most-recently-used first, then partition by
574 : // cumsum above/below min_resident_size.
575 13 : tenant_candidates
576 2384 : .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
577 13 : let mut cumsum: i128 = 0;
578 250 : for (timeline, layer_info) in tenant_candidates.into_iter() {
579 250 : let file_size = layer_info.file_size();
580 250 : let candidate = EvictionCandidate {
581 250 : timeline,
582 250 : last_activity_ts: layer_info.last_activity_ts,
583 250 : layer: layer_info.layer,
584 250 : };
585 250 : let partition = if cumsum > min_resident_size as i128 {
586 195 : MinResidentSizePartition::Above
587 : } else {
588 55 : MinResidentSizePartition::Below
589 : };
590 250 : candidates.push((partition, candidate));
591 250 : cumsum += i128::from(file_size);
592 : }
593 : }
594 :
595 7 : debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
596 UBC 0 : "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
597 CBC 7 : candidates
598 2000 : .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
599 7 :
600 7 : Ok(EvictionCandidates::Finished(candidates))
601 7 : }
602 :
603 : struct TimelineKey(Arc<Timeline>);
604 :
605 : impl PartialEq for TimelineKey {
606 132 : fn eq(&self, other: &Self) -> bool {
607 132 : Arc::ptr_eq(&self.0, &other.0)
608 132 : }
609 : }
610 :
611 : impl Eq for TimelineKey {}
612 :
613 : impl std::hash::Hash for TimelineKey {
614 143 : fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
615 143 : Arc::as_ptr(&self.0).hash(state);
616 143 : }
617 : }
618 :
619 : impl std::ops::Deref for TimelineKey {
620 : type Target = Timeline;
621 :
622 33 : fn deref(&self) -> &Self::Target {
623 33 : self.0.as_ref()
624 33 : }
625 : }
626 :
627 : mod filesystem_level_usage {
628 : use anyhow::Context;
629 : use camino::Utf8Path;
630 :
631 : use crate::statvfs::Statvfs;
632 :
633 : use super::DiskUsageEvictionTaskConfig;
634 :
635 10 : #[derive(Debug, Clone, Copy)]
636 : #[allow(dead_code)]
637 : pub struct Usage<'a> {
638 : config: &'a DiskUsageEvictionTaskConfig,
639 :
640 : /// Filesystem capacity
641 : total_bytes: u64,
642 : /// Free filesystem space
643 : avail_bytes: u64,
644 : }
645 :
646 : impl super::Usage for Usage<'_> {
647 51 : fn has_pressure(&self) -> bool {
648 51 : let usage_pct =
649 51 : (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
650 51 :
651 51 : let pressures = [
652 51 : (
653 51 : "min_avail_bytes",
654 51 : self.avail_bytes < self.config.min_avail_bytes,
655 51 : ),
656 51 : (
657 51 : "max_usage_pct",
658 51 : usage_pct >= self.config.max_usage_pct.get() as u64,
659 51 : ),
660 51 : ];
661 51 :
662 88 : pressures.into_iter().any(|(_, has_pressure)| has_pressure)
663 51 : }
664 :
665 82 : fn add_available_bytes(&mut self, bytes: u64) {
666 82 : self.avail_bytes += bytes;
667 82 : }
668 : }
669 :
670 5 : pub fn get<'a>(
671 5 : tenants_dir: &Utf8Path,
672 5 : config: &'a DiskUsageEvictionTaskConfig,
673 5 : ) -> anyhow::Result<Usage<'a>> {
674 5 : let mock_config = {
675 5 : #[cfg(feature = "testing")]
676 5 : {
677 5 : config.mock_statvfs.as_ref()
678 : }
679 : #[cfg(not(feature = "testing"))]
680 : {
681 : None
682 : }
683 : };
684 :
685 5 : let stat = Statvfs::get(tenants_dir, mock_config)
686 5 : .context("statvfs failed, presumably directory got unlinked")?;
687 :
688 : // https://unix.stackexchange.com/a/703650
689 4 : let blocksize = if stat.fragment_size() > 0 {
690 4 : stat.fragment_size()
691 : } else {
692 UBC 0 : stat.block_size()
693 : };
694 :
695 : // use blocks_available (b_avail) since the pageserver runs as an unprivileged user
696 CBC 4 : let avail_bytes = stat.blocks_available() * blocksize;
697 4 : let total_bytes = stat.blocks() * blocksize;
698 4 :
699 4 : Ok(Usage {
700 4 : config,
701 4 : total_bytes,
702 4 : avail_bytes,
703 4 : })
704 5 : }
705 :
706 1 : #[test]
707 1 : fn max_usage_pct_pressure() {
708 1 : use super::Usage as _;
709 1 : use std::time::Duration;
710 1 : use utils::serde_percent::Percent;
711 1 :
712 1 : let mut usage = Usage {
713 1 : config: &DiskUsageEvictionTaskConfig {
714 1 : max_usage_pct: Percent::new(85).unwrap(),
715 1 : min_avail_bytes: 0,
716 1 : period: Duration::MAX,
717 1 : #[cfg(feature = "testing")]
718 1 : mock_statvfs: None,
719 1 : },
720 1 : total_bytes: 100_000,
721 1 : avail_bytes: 0,
722 1 : };
723 1 :
724 1 : assert!(usage.has_pressure(), "expected pressure at 100%");
725 :
726 1 : usage.add_available_bytes(14_000);
727 1 : assert!(usage.has_pressure(), "expected pressure at 86%");
728 :
729 1 : usage.add_available_bytes(999);
730 1 : assert!(usage.has_pressure(), "expected pressure at 85.001%");
731 :
732 1 : usage.add_available_bytes(1);
733 1 : assert!(usage.has_pressure(), "expected pressure at precisely 85%");
734 :
735 1 : usage.add_available_bytes(1);
736 1 : assert!(!usage.has_pressure(), "no pressure at 84.999%");
737 :
738 1 : usage.add_available_bytes(999);
739 1 : assert!(!usage.has_pressure(), "no pressure at 84%");
740 :
741 1 : usage.add_available_bytes(16_000);
742 1 : assert!(!usage.has_pressure());
743 1 : }
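    // Hedged companion sketch (not part of the original test suite): the same style of test
    // could exercise the `min_avail_bytes` threshold. The test name and the constants below
    // are made up for illustration.
    #[test]
    fn min_avail_bytes_pressure() {
        use super::Usage as _;
        use std::time::Duration;
        use utils::serde_percent::Percent;

        let mut usage = Usage {
            config: &DiskUsageEvictionTaskConfig {
                // set the percentage threshold high enough that it never triggers here
                max_usage_pct: Percent::new(99).unwrap(),
                min_avail_bytes: 16_000,
                period: Duration::MAX,
                #[cfg(feature = "testing")]
                mock_statvfs: None,
            },
            total_bytes: 100_000,
            avail_bytes: 2_000,
        };

        assert!(usage.has_pressure(), "expected pressure below min_avail_bytes");

        usage.add_available_bytes(13_999);
        assert!(usage.has_pressure(), "still one byte short of min_avail_bytes");

        usage.add_available_bytes(1);
        assert!(!usage.has_pressure(), "exactly min_avail_bytes available, no pressure");
    }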
744 : }