Line data Source code
1 : //! This module implements the pageserver-global disk-usage-based layer eviction task.
2 : //!
3 : //! # Mechanics
4 : //!
5 : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
6 : //! loop that evicts layers in response to a shortage of available bytes
7 : //! in the $repo/tenants directory's filesystem.
8 : //!
9 : //! The loop runs periodically at a configurable `period`.
10 : //!
11 : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
12 : //! It compares the returned usage data against two different types of thresholds.
13 : //! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
14 : //! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
15 : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
16 : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
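   : //!
   : //! In rough pseudocode, one iteration does the following (a sketch of the control flow
   : //! implemented in this module, not an exact transcription; names are illustrative):
   : //!
   : //! ```text
   : //! usage_pre = statvfs($repo/tenants)
   : //! if !usage_pre.has_pressure(): nothing to do
   : //! candidates = resident layers of all tenants, in eviction order (see below)
   : //! planned    = shortest prefix of candidates whose sizes relieve the pressure
   : //! evict planned layers, batched per timeline
   : //! usage_post = statvfs($repo/tenants)
   : //! if usage_post.has_pressure(): warn and leave it to the operator
   : //! ```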
17 : //!
18 : //! # Eviction Policy
19 : //!
20 : //! There are two thresholds:
21 : //! `max_usage_pct` is a limit on used space, expressed as a percentage of the total filesystem space.
22 : //! If the actual usage percentage is higher, the threshold is exceeded.
23 : //! `min_avail_bytes` is a lower bound on available space, in absolute bytes.
24 : //! If the actual available space is lower, the threshold is exceeded.
25 : //! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
26 : //! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
27 : //! The iteration evicts layers in LRU fashion, but with a weak reservation per tenant.
28 : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
29 : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
30 : //! start evicting layers that are part of the reservation, LRU first.
31 : //!
32 : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
33 : //! throughout the code, but no actual variable carries that name.
34 : //! The per-tenant default is the size of the tenant's largest layer file, counting both local and remote layers.
35 : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
36 : //! during page reconstruction.
37 : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
38 : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
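   : //!
   : //! # Example Configuration
   : //!
   : //! The task is enabled through the pageserver config. The fragment below is only an
   : //! illustration: the values and the exact TOML layout are assumptions, while the field
   : //! names follow `DiskUsageEvictionTaskConfig` and `min_resident_size_override` above.
   : //!
   : //! ```toml
   : //! # global task settings
   : //! disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 2000000000, period = "60s" }
   : //!
   : //! # optional: default per-tenant reservation for all tenants
   : //! [tenant_config]
   : //! min_resident_size_override = 1000000000
   : //! ```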
39 :
40 : // Implementation notes:
41 : // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
42 : // reading these fields. We use the Debug impl for semi-structured logging, though.
43 :
44 : use std::{
45 : collections::HashMap,
46 : path::Path,
47 : sync::Arc,
48 : time::{Duration, SystemTime},
49 : };
50 :
51 : use anyhow::Context;
52 : use remote_storage::GenericRemoteStorage;
53 : use serde::{Deserialize, Serialize};
54 : use tokio::time::Instant;
55 : use tokio_util::sync::CancellationToken;
56 : use tracing::{debug, error, info, instrument, warn, Instrument};
57 : use utils::completion;
58 : use utils::serde_percent::Percent;
59 :
60 : use crate::{
61 : config::PageServerConf,
62 : task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
63 : tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
64 : };
65 :
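   : /// Configuration of the disk-usage-based eviction task; the semantics of the two
   : /// thresholds and of `period` are described in the module comment.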
66 38 : #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
67 : pub struct DiskUsageEvictionTaskConfig {
68 : pub max_usage_pct: Percent,
69 : pub min_avail_bytes: u64,
70 : #[serde(with = "humantime_serde")]
71 : pub period: Duration,
72 : #[cfg(feature = "testing")]
73 : pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
74 : }
75 :
76 575 : #[derive(Default)]
77 : pub struct State {
78 : /// Prevents HTTP-triggered runs and the background task from executing an iteration at the same time.
79 : mutex: tokio::sync::Mutex<()>,
80 : }
81 :
82 303 : pub fn launch_disk_usage_global_eviction_task(
83 303 : conf: &'static PageServerConf,
84 303 : storage: GenericRemoteStorage,
85 303 : state: Arc<State>,
86 303 : background_jobs_barrier: completion::Barrier,
87 303 : ) -> anyhow::Result<()> {
88 303 : let Some(task_config) = &conf.disk_usage_based_eviction else {
89 300 : info!("disk usage based eviction task not configured");
90 300 : return Ok(());
91 : };
92 :
93 3 : info!("launching disk usage based eviction task");
94 :
95 3 : task_mgr::spawn(
96 3 : BACKGROUND_RUNTIME.handle(),
97 3 : TaskKind::DiskUsageEviction,
98 3 : None,
99 3 : None,
100 3 : "disk usage based eviction",
101 3 : false,
102 3 : async move {
103 3 : let cancel = task_mgr::shutdown_token();
104 3 :
105 3 : // wait until initial load is complete, because we cannot evict from loading tenants.
106 6 : tokio::select! {
107 6 : _ = cancel.cancelled() => { return Ok(()); },
108 6 : _ = background_jobs_barrier.wait() => { }
109 6 : };
110 :
111 3 : disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
112 4 : .await;
113 0 : Ok(())
114 3 : },
115 3 : );
116 3 :
117 3 : Ok(())
118 303 : }
119 :
120 9 : #[instrument(skip_all)]
121 : async fn disk_usage_eviction_task(
122 : state: &State,
123 : task_config: &DiskUsageEvictionTaskConfig,
124 : storage: GenericRemoteStorage,
125 : tenants_dir: &Path,
126 : cancel: CancellationToken,
127 : ) {
128 0 : scopeguard::defer! {
129 0 : info!("disk usage based eviction task finishing");
130 : };
131 :
132 : use crate::tenant::tasks::random_init_delay;
133 : {
134 : if random_init_delay(task_config.period, &cancel)
135 : .await
136 : .is_err()
137 : {
138 : return;
139 : }
140 : }
141 :
142 : let mut iteration_no = 0;
143 : loop {
144 : iteration_no += 1;
145 : let start = Instant::now();
146 :
147 4 : async {
148 4 : let res = disk_usage_eviction_task_iteration(
149 4 : state,
150 4 : task_config,
151 4 : &storage,
152 4 : tenants_dir,
153 4 : &cancel,
154 4 : )
155 0 : .await;
156 :
157 4 : match res {
158 2 : Ok(()) => {}
159 2 : Err(e) => {
160 2 : // these stat failures are expected to be very rare
161 2 : warn!("iteration failed, unexpected error: {e:#}");
162 : }
163 : }
164 4 : }
165 : .instrument(tracing::info_span!("iteration", iteration_no))
166 : .await;
167 :
168 : let sleep_until = start + task_config.period;
169 : if tokio::time::timeout_at(sleep_until, cancel.cancelled())
170 : .await
171 : .is_ok()
172 : {
173 : break;
174 : }
175 : }
176 : }
177 :
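   : /// An observation of disk usage, together with the thresholds to judge it against.
   : ///
   : /// `has_pressure` reports whether any threshold is exceeded. `add_available_bytes`
   : /// credits bytes that an eviction has freed (or is planned to free) against the
   : /// observation. The filesystem-backed implementation lives in `filesystem_level_usage`.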
178 : pub trait Usage: Clone + Copy + std::fmt::Debug {
179 : fn has_pressure(&self) -> bool;
180 : fn add_available_bytes(&mut self, bytes: u64);
181 : }
182 :
183 4 : async fn disk_usage_eviction_task_iteration(
184 4 : state: &State,
185 4 : task_config: &DiskUsageEvictionTaskConfig,
186 4 : storage: &GenericRemoteStorage,
187 4 : tenants_dir: &Path,
188 4 : cancel: &CancellationToken,
189 4 : ) -> anyhow::Result<()> {
190 4 : let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
191 4 : .context("get filesystem-level disk usage before evictions")?;
192 2 : let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
193 2 : match res {
194 2 : Ok(outcome) => {
195 0 : debug!(?outcome, "disk_usage_eviction_iteration finished");
196 2 : match outcome {
197 0 : IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
198 0 : // nothing to do, select statement below will handle things
199 0 : }
200 2 : IterationOutcome::Finished(outcome) => {
201 : // Verify with statvfs whether we made any real progress
202 2 : let after = filesystem_level_usage::get(tenants_dir, task_config)
203 2 : // It's quite unlikely to hit the error here. Keep the code simple and bail out.
204 2 : .context("get filesystem-level disk usage after evictions")?;
205 :
206 0 : debug!(?after, "disk usage");
207 :
208 2 : if after.has_pressure() {
209 : // Don't bother doing an out-of-order iteration here now.
210 : // In practice, the task period is set to a value in the tens-of-seconds range,
211 : // which will cause another iteration to happen soon enough.
212 : // TODO: deltas between the three different usages would be helpful,
213 : // consider MiB, GiB, TiB
214 0 : warn!(?outcome, ?after, "disk usage still high");
215 : } else {
216 2 : info!(?outcome, ?after, "disk usage pressure relieved");
217 : }
218 : }
219 : }
220 : }
221 0 : Err(e) => {
222 0 : error!("disk_usage_eviction_iteration failed: {:#}", e);
223 : }
224 : }
225 :
226 2 : Ok(())
227 4 : }
228 :
229 5 : #[derive(Debug, Serialize)]
230 : #[allow(clippy::large_enum_variant)]
231 : pub enum IterationOutcome<U> {
232 : NoPressure,
233 : Cancelled,
234 : Finished(IterationOutcomeFinished<U>),
235 : }
236 :
237 : #[allow(dead_code)]
238 7 : #[derive(Debug, Serialize)]
239 : pub struct IterationOutcomeFinished<U> {
240 : /// The actual usage observed before we started the iteration.
241 : before: U,
242 : /// The expected value for `after`, according to internal accounting, after phase 1.
243 : planned: PlannedUsage<U>,
244 : /// The outcome of phase 2, where we actually do the evictions.
245 : ///
246 : /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
247 : /// be the same as `planned`.
248 : assumed: AssumedUsage<U>,
249 : }
250 :
251 7 : #[derive(Debug, Serialize)]
252 : #[allow(dead_code)]
253 : struct AssumedUsage<U> {
254 : /// The expected value for `after`, after phase 2.
255 : projected_after: U,
256 : /// The layers we failed to evict during phase 2.
257 : failed: LayerCount,
258 : }
259 :
260 : #[allow(dead_code)]
261 7 : #[derive(Debug, Serialize)]
262 : struct PlannedUsage<U> {
263 : respecting_tenant_min_resident_size: U,
264 : fallback_to_global_lru: Option<U>,
265 : }
266 :
267 : #[allow(dead_code)]
268 7 : #[derive(Debug, Default, Serialize)]
269 : struct LayerCount {
270 : file_sizes: u64,
271 : count: usize,
272 : }
273 :
274 7 : pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
275 7 : state: &State,
276 7 : storage: &GenericRemoteStorage,
277 7 : usage_pre: U,
278 7 : cancel: &CancellationToken,
279 7 : ) -> anyhow::Result<IterationOutcome<U>> {
280 : // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
281 7 : let _g = state
282 7 : .mutex
283 7 : .try_lock()
284 7 : .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
285 :
286 0 : debug!(?usage_pre, "disk usage");
287 :
288 7 : if !usage_pre.has_pressure() {
289 0 : return Ok(IterationOutcome::NoPressure);
290 7 : }
291 :
292 7 : warn!(
293 7 : ?usage_pre,
294 7 : "running disk usage based eviction due to pressure"
295 7 : );
296 :
297 7 : let candidates = match collect_eviction_candidates(cancel).await? {
298 : EvictionCandidates::Cancelled => {
299 0 : return Ok(IterationOutcome::Cancelled);
300 : }
301 7 : EvictionCandidates::Finished(partitioned) => partitioned,
302 7 : };
303 7 :
304 7 : // Debug-log the list of candidates
305 7 : let now = SystemTime::now();
306 250 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
307 250 : let desc = candidate.layer.layer_desc();
308 0 : debug!(
309 0 : "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
310 0 : i + 1,
311 0 : candidates.len(),
312 0 : desc.file_size,
313 0 : now.duration_since(candidate.last_activity_ts)
314 0 : .unwrap()
315 0 : .as_micros(),
316 0 : partition,
317 0 : desc.tenant_id,
318 0 : desc.timeline_id,
319 0 : candidate.layer,
320 0 : );
321 : }
322 :
323 : // phase1: select victims to relieve pressure
324 : //
325 : // Walk through the list of candidates, until we have accumulated enough layers to get
326 : // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
327 : // how much disk space would be used after evicting all the layers up to the current
328 : // point in the list. The layers are collected in 'batched', grouped per timeline.
329 : //
330 : // If we get far enough in the list that we start to evict layers that are below
331 : // the tenant's min-resident-size threshold, print a warning, and memorize the disk
332 : // usage at that point in 'warned' (it is reported as 'respecting_tenant_min_resident_size').
333 7 : let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
334 7 : let mut warned = None;
335 7 : let mut usage_planned = usage_pre;
336 148 : for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
337 148 : if !usage_planned.has_pressure() {
338 0 : debug!(
339 0 : no_candidates_evicted = i,
340 0 : "took enough candidates for pressure to be relieved"
341 0 : );
342 5 : break;
343 143 : }
344 143 :
345 143 : if partition == MinResidentSizePartition::Below && warned.is_none() {
346 2 : warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
347 2 : warned = Some(usage_planned);
348 141 : }
349 :
350 143 : usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
351 143 :
352 143 : batched
353 143 : .entry(TimelineKey(candidate.timeline))
354 143 : .or_default()
355 143 : .push(candidate.layer);
356 : }
357 :
358 7 : let usage_planned = match warned {
359 2 : Some(respecting_tenant_min_resident_size) => PlannedUsage {
360 2 : respecting_tenant_min_resident_size,
361 2 : fallback_to_global_lru: Some(usage_planned),
362 2 : },
363 5 : None => PlannedUsage {
364 5 : respecting_tenant_min_resident_size: usage_planned,
365 5 : fallback_to_global_lru: None,
366 5 : },
367 : };
368 0 : debug!(?usage_planned, "usage planned");
369 :
370 : // phase2: evict victims batched by timeline
371 :
372 : // After the loop, `usage_assumed` is the post-eviction usage,
373 : // according to internal accounting.
374 7 : let mut usage_assumed = usage_pre;
375 7 : let mut evictions_failed = LayerCount::default();
376 18 : for (timeline, batch) in batched {
377 11 : let tenant_id = timeline.tenant_id;
378 11 : let timeline_id = timeline.timeline_id;
379 11 : let batch_size = batch.len();
380 :
381 0 : debug!(%timeline_id, "evicting batch for timeline");
382 :
383 11 : async {
384 11 : let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
385 :
386 11 : match results {
387 0 : Err(e) => {
388 0 : warn!("failed to evict batch: {:#}", e);
389 : }
390 11 : Ok(results) => {
391 11 : assert_eq!(results.len(), batch.len());
392 143 : for (result, layer) in results.into_iter().zip(batch.iter()) {
393 143 : let file_size = layer.layer_desc().file_size;
394 0 : match result {
395 143 : Some(Ok(())) => {
396 143 : usage_assumed.add_available_bytes(file_size);
397 143 : }
398 : Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
399 0 : unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
400 : }
401 0 : Some(Err(EvictionError::FileNotFound)) => {
402 0 : evictions_failed.file_sizes += file_size;
403 0 : evictions_failed.count += 1;
404 0 : }
405 : Some(Err(
406 0 : e @ EvictionError::LayerNotFound(_)
407 0 : | e @ EvictionError::StatFailed(_),
408 : )) => {
409 0 : let e = utils::error::report_compact_sources(&e);
410 0 : warn!(%layer, "failed to evict layer: {e}");
411 0 : evictions_failed.file_sizes += file_size;
412 0 : evictions_failed.count += 1;
413 : }
414 : None => {
415 0 : assert!(cancel.is_cancelled());
416 0 : return;
417 : }
418 : }
419 : }
420 : }
421 : }
422 11 : }
423 11 : .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
424 0 : .await;
425 :
426 11 : if cancel.is_cancelled() {
427 0 : return Ok(IterationOutcome::Cancelled);
428 11 : }
429 : }
430 :
431 7 : Ok(IterationOutcome::Finished(IterationOutcomeFinished {
432 7 : before: usage_pre,
433 7 : planned: usage_planned,
434 7 : assumed: AssumedUsage {
435 7 : projected_after: usage_assumed,
436 7 : failed: evictions_failed,
437 7 : },
438 7 : }))
439 7 : }
440 :
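   : /// A resident layer, its owning timeline, and its last activity timestamp,
   : /// considered for eviction during phase 1.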
441 0 : #[derive(Clone)]
442 : struct EvictionCandidate {
443 : timeline: Arc<Timeline>,
444 : layer: Arc<dyn PersistentLayer>,
445 : last_activity_ts: SystemTime,
446 : }
447 :
448 1050 : #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
449 : enum MinResidentSizePartition {
450 : Above,
451 : Below,
452 : }
453 :
454 : enum EvictionCandidates {
455 : Cancelled,
456 : Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
457 : }
458 :
459 : /// Gather the eviction candidates.
460 : ///
461 : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
462 : /// order. A caller that evicts in that order, until pressure is relieved, implements
463 : /// the eviction policy outlined in the module comment.
464 : ///
465 : /// # Example
466 : ///
467 : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
468 : /// Each layer has size 100, and each tenant's min_resident_size is 150.
469 : /// The eviction order would be
470 : ///
471 : /// ```text
472 : /// partition last_activity_ts tenant/layer
473 : /// Above 18:29 B/c
474 : /// Above 18:30 A/c
475 : /// Above 19:00 A/b
476 : /// Above 19:05 B/b
477 : /// Above 20:00 B/a
478 : /// Above 20:03 A/a
479 : /// Below 20:30 A/d
480 : /// Below 20:40 B/d
481 : /// Below 20:45 B/e
482 : /// Below 20:58 A/e
483 : /// ```
484 : ///
485 : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `B/c, A/c, A/b`.
486 : /// They are all in the `Above` partition, so we respected each tenant's min_resident_size.
487 : ///
488 : /// But if we need to evict 900 bytes to relieve pressure, we'd evict
489 : /// `B/c, A/c, A/b, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
490 : /// after exhausting the `Above` partition.
491 : /// So, we did not respect each tenant's min_resident_size.
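   : ///
   : /// For reference, tenant A's partition assignment in the example falls out of the
   : /// per-tenant rule implemented below: walk the tenant's layers most-recently-used first,
   : /// and assign `Below` while the cumulative size of the layers already walked is still
   : /// <= min_resident_size, `Above` afterwards:
   : ///
   : /// ```text
   : /// layer  last_activity_ts  cumsum before layer  partition
   : /// A/e    20:58             0                    Below
   : /// A/d    20:30             100                  Below
   : /// A/a    20:03             200                  Above
   : /// A/b    19:00             300                  Above
   : /// A/c    18:30             400                  Above
   : /// ```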
492 7 : async fn collect_eviction_candidates(
493 7 : cancel: &CancellationToken,
494 7 : ) -> anyhow::Result<EvictionCandidates> {
495 : // get a snapshot of the list of tenants
496 7 : let tenants = tenant::mgr::list_tenants()
497 0 : .await
498 7 : .context("get list of tenants")?;
499 :
500 7 : let mut candidates = Vec::new();
501 :
502 21 : for (tenant_id, _state) in &tenants {
503 14 : if cancel.is_cancelled() {
504 0 : return Ok(EvictionCandidates::Cancelled);
505 14 : }
506 14 : let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
507 13 : Ok(tenant) => tenant,
508 1 : Err(e) => {
509 : // this can happen if tenant has lifecycle transition after we fetched it
510 0 : debug!("failed to get tenant: {e:#}");
511 1 : continue;
512 : }
513 : };
514 :
515 : // collect layers from all timelines in this tenant
516 : //
517 : // If one of the timelines becomes `!is_active()` during the iteration,
518 : // for example because we're shutting down, then `max_layer_size` can be too small.
519 : // That's OK. This code only runs under a disk pressure situation, and being
520 : // a little unfair to tenants during shutdown in such a situation is tolerable.
521 13 : let mut tenant_candidates = Vec::new();
522 13 : let mut max_layer_size = 0;
523 13 : for tl in tenant.list_timelines() {
524 13 : if !tl.is_active() {
525 0 : continue;
526 13 : }
527 13 : let info = tl.get_local_layers_for_disk_usage_eviction().await;
528 0 : debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
529 13 : tenant_candidates.extend(
530 13 : info.resident_layers
531 13 : .into_iter()
532 250 : .map(|layer_infos| (tl.clone(), layer_infos)),
533 13 : );
534 13 : max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
535 13 :
536 13 : if cancel.is_cancelled() {
537 0 : return Ok(EvictionCandidates::Cancelled);
538 13 : }
539 : }
540 :
541 : // `min_resident_size` defaults to maximum layer file size of the tenant.
542 : // This ensures that each tenant can have at least one layer resident at a given time,
543 : // ensuring forward progress for a single Timeline::get in that tenant.
544 : // It's a questionable heuristic since, usually, there are many Timeline::get
545 : // requests going on for a tenant, and, at least in Neon prod, the median
546 : // layer file size is much smaller than the compaction target size.
547 : // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
548 : // That's what's typically used by the various background loops.
549 : //
550 : // The default can be overridden with a fixed value in the tenant conf.
551 : // A default override can be put in the default tenant conf in the pageserver.toml.
552 13 : let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
553 0 : debug!(
554 0 : tenant_id=%tenant.tenant_id(),
555 0 : overridden_size=s,
556 0 : "using overridden min resident size for tenant"
557 0 : );
558 2 : s
559 : } else {
560 0 : debug!(
561 0 : tenant_id=%tenant.tenant_id(),
562 0 : max_layer_size,
563 0 : "using max layer size as min_resident_size for tenant",
564 0 : );
565 11 : max_layer_size
566 : };
567 :
568 : // Sort layers most-recently-used first, then partition by
569 : // cumsum above/below min_resident_size.
570 13 : tenant_candidates
571 2328 : .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
572 13 : let mut cumsum: i128 = 0;
573 250 : for (timeline, layer_info) in tenant_candidates.into_iter() {
574 250 : let file_size = layer_info.file_size();
575 250 : let candidate = EvictionCandidate {
576 250 : timeline,
577 250 : last_activity_ts: layer_info.last_activity_ts,
578 250 : layer: layer_info.layer,
579 250 : };
580 250 : let partition = if cumsum > min_resident_size as i128 {
581 195 : MinResidentSizePartition::Above
582 : } else {
583 55 : MinResidentSizePartition::Below
584 : };
585 250 : candidates.push((partition, candidate));
586 250 : cumsum += i128::from(file_size);
587 : }
588 : }
589 :
590 7 : debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
591 0 : "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
592 7 : candidates
593 2086 : .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
594 7 :
595 7 : Ok(EvictionCandidates::Finished(candidates))
596 7 : }
597 :
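   : /// Newtype so that `Timeline`s can be used as `HashMap` keys: equality and hashing are
   : /// by `Arc` pointer identity, giving each timeline instance its own eviction batch.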
598 : struct TimelineKey(Arc<Timeline>);
599 :
600 : impl PartialEq for TimelineKey {
601 132 : fn eq(&self, other: &Self) -> bool {
602 132 : Arc::ptr_eq(&self.0, &other.0)
603 132 : }
604 : }
605 :
606 : impl Eq for TimelineKey {}
607 :
608 : impl std::hash::Hash for TimelineKey {
609 143 : fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
610 143 : Arc::as_ptr(&self.0).hash(state);
611 143 : }
612 : }
613 :
614 : impl std::ops::Deref for TimelineKey {
615 : type Target = Timeline;
616 :
617 33 : fn deref(&self) -> &Self::Target {
618 33 : self.0.as_ref()
619 33 : }
620 : }
621 :
622 : mod filesystem_level_usage {
623 : use std::path::Path;
624 :
625 : use anyhow::Context;
626 :
627 : use crate::statvfs::Statvfs;
628 :
629 : use super::DiskUsageEvictionTaskConfig;
630 :
631 10 : #[derive(Debug, Clone, Copy)]
632 : #[allow(dead_code)]
633 : pub struct Usage<'a> {
634 : config: &'a DiskUsageEvictionTaskConfig,
635 :
636 : /// Filesystem capacity
637 : total_bytes: u64,
638 : /// Free filesystem space
639 : avail_bytes: u64,
640 : }
641 :
642 : impl super::Usage for Usage<'_> {
643 51 : fn has_pressure(&self) -> bool {
644 51 : let usage_pct =
645 51 : (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
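   : // Note: the `as u64` cast truncates. E.g. with total_bytes=100_000 and
   : // avail_bytes=15_001, usage_pct is 84, which does not exceed a max_usage_pct
   : // of 85 (see `max_usage_pct_pressure` below).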
646 51 :
647 51 : let pressures = [
648 51 : (
649 51 : "min_avail_bytes",
650 51 : self.avail_bytes < self.config.min_avail_bytes,
651 51 : ),
652 51 : (
653 51 : "max_usage_pct",
654 51 : usage_pct >= self.config.max_usage_pct.get() as u64,
655 51 : ),
656 51 : ];
657 51 :
658 88 : pressures.into_iter().any(|(_, has_pressure)| has_pressure)
659 51 : }
660 :
661 82 : fn add_available_bytes(&mut self, bytes: u64) {
662 82 : self.avail_bytes += bytes;
663 82 : }
664 : }
665 :
666 6 : pub fn get<'a>(
667 6 : tenants_dir: &Path,
668 6 : config: &'a DiskUsageEvictionTaskConfig,
669 6 : ) -> anyhow::Result<Usage<'a>> {
670 6 : let mock_config = {
671 6 : #[cfg(feature = "testing")]
672 6 : {
673 6 : config.mock_statvfs.as_ref()
674 : }
675 : #[cfg(not(feature = "testing"))]
676 : {
677 : None
678 : }
679 : };
680 :
681 6 : let stat = Statvfs::get(tenants_dir, mock_config)
682 6 : .context("statvfs failed, presumably directory got unlinked")?;
683 :
684 : // https://unix.stackexchange.com/a/703650
685 4 : let blocksize = if stat.fragment_size() > 0 {
686 4 : stat.fragment_size()
687 : } else {
688 0 : stat.block_size()
689 : };
690 :
691 : // use blocks_available (f_bavail) since the pageserver runs as an unprivileged user
692 4 : let avail_bytes = stat.blocks_available() * blocksize;
693 4 : let total_bytes = stat.blocks() * blocksize;
694 4 :
695 4 : Ok(Usage {
696 4 : config,
697 4 : total_bytes,
698 4 : avail_bytes,
699 4 : })
700 6 : }
701 :
702 1 : #[test]
703 1 : fn max_usage_pct_pressure() {
704 1 : use super::Usage as _;
705 1 : use std::time::Duration;
706 1 : use utils::serde_percent::Percent;
707 1 :
708 1 : let mut usage = Usage {
709 1 : config: &DiskUsageEvictionTaskConfig {
710 1 : max_usage_pct: Percent::new(85).unwrap(),
711 1 : min_avail_bytes: 0,
712 1 : period: Duration::MAX,
713 1 : #[cfg(feature = "testing")]
714 1 : mock_statvfs: None,
715 1 : },
716 1 : total_bytes: 100_000,
717 1 : avail_bytes: 0,
718 1 : };
719 1 :
720 1 : assert!(usage.has_pressure(), "expected pressure at 100%");
721 :
722 1 : usage.add_available_bytes(14_000);
723 1 : assert!(usage.has_pressure(), "expected pressure at 86%");
724 :
725 1 : usage.add_available_bytes(999);
726 1 : assert!(usage.has_pressure(), "expected pressure at 85.001%");
727 :
728 1 : usage.add_available_bytes(1);
729 1 : assert!(usage.has_pressure(), "expected pressure at precisely 85%");
730 :
731 1 : usage.add_available_bytes(1);
732 1 : assert!(!usage.has_pressure(), "no pressure at 84.999%");
733 :
734 1 : usage.add_available_bytes(999);
735 1 : assert!(!usage.has_pressure(), "no pressure at 84%");
736 :
737 1 : usage.add_available_bytes(16_000);
738 1 : assert!(!usage.has_pressure());
739 1 : }
740 : }