Line data Source code
1 : //! This module implements the pageserver-global disk-usage-based layer eviction task.
2 : //!
3 : //! # Mechanics
4 : //!
5 : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
6 : //! loop that evicts layers in response to a shortage of available bytes
7 : //! in the $repo/tenants directory's filesystem.
8 : //!
9 : //! The loop runs periodically at a configurable `period`.
10 : //!
11 : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
12 : //! It compares the returned usage data against two different types of thresholds.
13 : //! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
14 : //! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
15 : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
16 : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
17 : //!
18 : //! # Eviction Policy
19 : //!
20 : //! There are two thresholds:
//! `max_usage_pct` is the maximum relative disk usage, expressed in percent of the total filesystem space.
//! If the actual usage is higher, the threshold is exceeded.
//! `min_avail_bytes` is the minimum absolute available space in bytes.
//! If the actual available space is lower, the threshold is exceeded.
25 : //! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
26 : //! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
27 : //! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
28 : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
29 : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
30 : //! start evicting layers that are part of the reservation, LRU first.
31 : //!
32 : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
33 : //! throughout the code, but, no actual variable carries that name.
34 : //! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
35 : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
36 : //! during page reconstruction.
37 : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
38 : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
39 :
40 : // Implementation notes:
41 : // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
42 : // reading these fields. We use the Debug impl for semi-structured logging, though.
43 :
44 : use std::{
45 : sync::Arc,
46 : time::{Duration, SystemTime},
47 : };
48 :
49 : use anyhow::Context;
50 : use pageserver_api::shard::TenantShardId;
51 : use remote_storage::GenericRemoteStorage;
52 : use serde::{Deserialize, Serialize};
53 : use tokio::time::Instant;
54 : use tokio_util::sync::CancellationToken;
55 : use tracing::{debug, error, info, instrument, warn, Instrument};
56 : use utils::serde_percent::Percent;
57 : use utils::{completion, id::TimelineId};
58 :
59 : use crate::{
60 : config::PageServerConf,
61 : metrics::disk_usage_based_eviction::METRICS,
62 : task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
63 : tenant::{
64 : mgr::TenantManager,
65 : remote_timeline_client::LayerFileMetadata,
66 : secondary::SecondaryTenant,
67 : storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
68 : },
69 : };
70 :
/// Configuration of the disk-usage-based eviction background task.
///
/// The meaning of the two thresholds is described in the module-level documentation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DiskUsageEvictionTaskConfig {
    /// Usage threshold, as a percentage of the total filesystem space.
    pub max_usage_pct: Percent,
    /// Available-space threshold, in bytes.
    pub min_avail_bytes: u64,
    /// Interval at which the eviction loop starts a new iteration.
    /// (De)serialized through `humantime_serde`.
    #[serde(with = "humantime_serde")]
    pub period: Duration,
    /// Only compiled in with the `testing` feature.
    /// NOTE(review): presumably substitutes a mocked `statvfs` result — confirm in `crate::statvfs`.
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
    /// Select sorting for evicted layers
    #[serde(default)]
    pub eviction_order: EvictionOrder,
}
83 :
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
/// partitioning.
///
/// Serialized with serde's adjacently tagged representation: the variant name goes into
/// `type` and the variant's fields into `args`.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
pub enum EvictionOrder {
    /// Order the layers to be evicted by how recently they have been accessed in absolute
    /// time.
    ///
    /// This strategy is unfair when some tenants grow faster than others towards the slower
    /// growing.
    ///
    /// This is the default variant.
    #[default]
    AbsoluteAccessed,

    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
    RelativeAccessed {
        /// Determines if the tenant with most layers should lose first.
        ///
        /// Having this enabled is currently the only reasonable option, because the order in which
        /// we read tenants is deterministic. If we find the need to use this as `false`, we need
        /// to ensure nondeterminism by adding in a random number to break the
        /// `relative_last_activity==0.0` ties.
        ///
        /// When the field is missing from the serialized form it defaults to `true`.
        #[serde(default = "default_highest_layer_count_loses_first")]
        highest_layer_count_loses_first: bool,
    },
}
110 :
/// Serde default for the `highest_layer_count_loses_first` field of
/// `EvictionOrder::RelativeAccessed`: the option is enabled unless configured otherwise.
fn default_highest_layer_count_loses_first() -> bool { true }
114 :
115 : impl EvictionOrder {
116 0 : fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
117 0 : use EvictionOrder::*;
118 0 :
119 0 : match self {
120 0 : AbsoluteAccessed => {
121 0 : candidates.sort_unstable_by_key(|(partition, candidate)| {
122 0 : (*partition, candidate.last_activity_ts)
123 0 : });
124 0 : }
125 0 : RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
126 0 : (*partition, candidate.relative_last_activity)
127 0 : }),
128 : }
129 0 : }
130 :
131 : /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
132 : /// layers in **most** recently used order.
133 40 : fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
134 40 : use EvictionOrder::*;
135 40 :
136 40 : match self {
137 0 : AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
138 : RelativeAccessed {
139 40 : highest_layer_count_loses_first,
140 : } => {
141 : // keeping the -1 or not decides if every tenant should lose their least recently accessed
142 : // layer OR if this should happen in the order of having highest layer count:
143 40 : let fudge = if *highest_layer_count_loses_first {
144 : // relative_last_activity vs. tenant layer count:
145 : // - 0.1..=1.0 (10 layers)
146 : // - 0.01..=1.0 (100 layers)
147 : // - 0.001..=1.0 (1000 layers)
148 : //
149 : // leading to evicting less of the smallest tenants.
150 20 : 0
151 : } else {
152 : // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
153 : // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
154 : // be that less than 10k layer evictions is enough, so we would not need to evict from
155 : // all tenants.
156 : //
157 : // as the tenant ordering is now deterministic this could hit the same tenants
158 : // disproportionetly on multiple invocations. alternative could be to remember how many
159 : // layers did we evict last time from this tenant, and inject that as an additional
160 : // fudge here.
161 20 : 1
162 : };
163 :
164 40 : let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
165 40 : let divider = total as f32;
166 40 :
167 40 : // most recently used is always (total - 0) / divider == 1.0
168 40 : // least recently used depends on the fudge:
169 40 : // - (total - 1) - (total - 1) / total => 0 / total
170 40 : // - total - (total - 1) / total => 1 / total
171 40 : let distance = (total - index) as f32;
172 40 :
173 40 : finite_f32::FiniteF32::try_from_normalized(distance / divider)
174 40 : .unwrap_or_else(|val| {
175 0 : tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
176 0 : finite_f32::FiniteF32::ZERO
177 40 : })
178 : }
179 : }
180 40 : }
181 : }
182 :
/// Shared state of the disk-usage-based eviction machinery.
///
/// An eviction iteration takes `mutex` with `try_lock` and fails with
/// "iteration is already executing" if it is already held.
#[derive(Default)]
pub struct State {
    /// Exclude http requests and background task from running at the same time.
    mutex: tokio::sync::Mutex<()>,
}
188 :
189 0 : pub fn launch_disk_usage_global_eviction_task(
190 0 : conf: &'static PageServerConf,
191 0 : storage: GenericRemoteStorage,
192 0 : state: Arc<State>,
193 0 : tenant_manager: Arc<TenantManager>,
194 0 : background_jobs_barrier: completion::Barrier,
195 0 : ) -> anyhow::Result<()> {
196 0 : let Some(task_config) = &conf.disk_usage_based_eviction else {
197 0 : info!("disk usage based eviction task not configured");
198 0 : return Ok(());
199 : };
200 :
201 0 : info!("launching disk usage based eviction task");
202 :
203 0 : task_mgr::spawn(
204 0 : BACKGROUND_RUNTIME.handle(),
205 0 : TaskKind::DiskUsageEviction,
206 0 : None,
207 0 : None,
208 0 : "disk usage based eviction",
209 0 : false,
210 0 : async move {
211 0 : let cancel = task_mgr::shutdown_token();
212 :
213 : // wait until initial load is complete, because we cannot evict from loading tenants.
214 : tokio::select! {
215 : _ = cancel.cancelled() => { return Ok(()); },
216 : _ = background_jobs_barrier.wait() => { }
217 : };
218 :
219 0 : disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
220 0 : Ok(())
221 0 : },
222 0 : );
223 0 :
224 0 : Ok(())
225 0 : }
226 :
227 0 : #[instrument(skip_all)]
228 : async fn disk_usage_eviction_task(
229 : state: &State,
230 : task_config: &DiskUsageEvictionTaskConfig,
231 : storage: &GenericRemoteStorage,
232 : tenant_manager: Arc<TenantManager>,
233 : cancel: CancellationToken,
234 : ) {
235 : scopeguard::defer! {
236 : info!("disk usage based eviction task finishing");
237 : };
238 :
239 : use crate::tenant::tasks::random_init_delay;
240 : {
241 : if random_init_delay(task_config.period, &cancel)
242 : .await
243 : .is_err()
244 : {
245 : return;
246 : }
247 : }
248 :
249 : let mut iteration_no = 0;
250 : loop {
251 : iteration_no += 1;
252 : let start = Instant::now();
253 :
254 0 : async {
255 0 : let res = disk_usage_eviction_task_iteration(
256 0 : state,
257 0 : task_config,
258 0 : storage,
259 0 : &tenant_manager,
260 0 : &cancel,
261 0 : )
262 0 : .await;
263 :
264 0 : match res {
265 0 : Ok(()) => {}
266 0 : Err(e) => {
267 0 : // these stat failures are expected to be very rare
268 0 : warn!("iteration failed, unexpected error: {e:#}");
269 : }
270 : }
271 0 : }
272 : .instrument(tracing::info_span!("iteration", iteration_no))
273 : .await;
274 :
275 : let sleep_until = start + task_config.period;
276 : if tokio::time::timeout_at(sleep_until, cancel.cancelled())
277 : .await
278 : .is_ok()
279 : {
280 : break;
281 : }
282 : }
283 : }
284 :
/// Abstraction over a disk usage measurement that the eviction iteration reasons about.
///
/// The iteration copies the initial measurement and adjusts the copy as layers are evicted,
/// hence the `Clone + Copy` bound; `Debug` is required because usages are logged.
pub trait Usage: Clone + Copy + std::fmt::Debug {
    /// Whether this usage counts as disk pressure. The iteration only evicts when the
    /// usage measured at its start reports pressure.
    fn has_pressure(&self) -> bool;
    /// Accounts for `bytes` having been freed. The iteration calls this with the file size of
    /// each layer it evicted successfully.
    fn add_available_bytes(&mut self, bytes: u64);
}
289 :
290 0 : async fn disk_usage_eviction_task_iteration(
291 0 : state: &State,
292 0 : task_config: &DiskUsageEvictionTaskConfig,
293 0 : storage: &GenericRemoteStorage,
294 0 : tenant_manager: &Arc<TenantManager>,
295 0 : cancel: &CancellationToken,
296 0 : ) -> anyhow::Result<()> {
297 0 : let tenants_dir = tenant_manager.get_conf().tenants_path();
298 0 : let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
299 0 : .context("get filesystem-level disk usage before evictions")?;
300 0 : let res = disk_usage_eviction_task_iteration_impl(
301 0 : state,
302 0 : storage,
303 0 : usage_pre,
304 0 : tenant_manager,
305 0 : task_config.eviction_order,
306 0 : cancel,
307 0 : )
308 0 : .await;
309 0 : match res {
310 0 : Ok(outcome) => {
311 0 : debug!(?outcome, "disk_usage_eviction_iteration finished");
312 0 : match outcome {
313 0 : IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
314 0 : // nothing to do, select statement below will handle things
315 0 : }
316 0 : IterationOutcome::Finished(outcome) => {
317 : // Verify with statvfs whether we made any real progress
318 0 : let after = filesystem_level_usage::get(&tenants_dir, task_config)
319 0 : // It's quite unlikely to hit the error here. Keep the code simple and bail out.
320 0 : .context("get filesystem-level disk usage after evictions")?;
321 :
322 0 : debug!(?after, "disk usage");
323 :
324 0 : if after.has_pressure() {
325 : // Don't bother doing an out-of-order iteration here now.
326 : // In practice, the task period is set to a value in the tens-of-seconds range,
327 : // which will cause another iteration to happen soon enough.
328 : // TODO: deltas between the three different usages would be helpful,
329 : // consider MiB, GiB, TiB
330 0 : warn!(?outcome, ?after, "disk usage still high");
331 : } else {
332 0 : info!(?outcome, ?after, "disk usage pressure relieved");
333 : }
334 : }
335 : }
336 : }
337 0 : Err(e) => {
338 0 : error!("disk_usage_eviction_iteration failed: {:#}", e);
339 : }
340 : }
341 :
342 0 : Ok(())
343 0 : }
344 :
/// Result of one eviction iteration, generic over the usage representation `U`.
#[derive(Debug, Serialize)]
// NOTE(review): `Finished` carries several usages while the other variants are empty;
// presumably the size difference is accepted rather than boxing the payload — confirm.
#[allow(clippy::large_enum_variant)]
pub enum IterationOutcome<U> {
    /// The usage measured before the iteration showed no pressure; nothing was evicted.
    NoPressure,
    /// Cancellation was observed before the iteration ran to completion.
    Cancelled,
    /// The iteration went through candidate selection and eviction; see the payload.
    Finished(IterationOutcomeFinished<U>),
}
352 :
/// Details of an eviction iteration that ran to completion.
///
/// Phase 1 selects the layers to evict; phase 2 performs the evictions.
#[derive(Debug, Serialize)]
pub struct IterationOutcomeFinished<U> {
    /// The actual usage observed before we started the iteration.
    before: U,
    /// The expected value for `after`, according to internal accounting, after phase 1.
    planned: PlannedUsage<U>,
    /// The outcome of phase 2, where we actually do the evictions.
    ///
    /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
    /// be the same as `planned`.
    assumed: AssumedUsage<U>,
}
365 :
/// Usage according to internal accounting once the evictions of phase 2 have been attempted.
#[derive(Debug, Serialize)]
struct AssumedUsage<U> {
    /// The expected value for `after`, after phase 2.
    ///
    /// Computed from the usage before the iteration by adding the file size of every layer
    /// whose eviction succeeded.
    projected_after: U,
    /// The layers we failed to evict during phase 2.
    failed: LayerCount,
}
373 :
/// Usage that phase 1 (victim selection) expects to reach, according to internal accounting.
///
/// NOTE(review): the values are produced by `select_victims`, which is not shown alongside
/// this type; the field descriptions below are inferred from the field names and the module
/// docs and should be confirmed there.
#[derive(Debug, Serialize)]
struct PlannedUsage<U> {
    /// Presumably the planned usage when only layers outside of each tenant's
    /// `min_resident_size` reservation are evicted.
    respecting_tenant_min_resident_size: U,
    /// Presumably `Some` only if the selection had to reach into the reservations, holding
    /// the planned usage after doing so.
    fallback_to_global_lru: Option<U>,
}
379 :
/// A number of layers together with their accumulated file size.
///
/// Used to report the layers whose eviction failed.
#[derive(Debug, Default, Serialize)]
struct LayerCount {
    /// Sum of the file sizes of the counted layers, in bytes.
    file_sizes: u64,
    /// Number of counted layers.
    count: usize,
}
385 :
386 0 : pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
387 0 : state: &State,
388 0 : _storage: &GenericRemoteStorage,
389 0 : usage_pre: U,
390 0 : tenant_manager: &Arc<TenantManager>,
391 0 : eviction_order: EvictionOrder,
392 0 : cancel: &CancellationToken,
393 0 : ) -> anyhow::Result<IterationOutcome<U>> {
394 : // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
395 0 : let _g = state
396 0 : .mutex
397 0 : .try_lock()
398 0 : .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
399 :
400 0 : debug!(?usage_pre, "disk usage");
401 :
402 0 : if !usage_pre.has_pressure() {
403 0 : return Ok(IterationOutcome::NoPressure);
404 0 : }
405 0 :
406 0 : warn!(
407 : ?usage_pre,
408 0 : "running disk usage based eviction due to pressure"
409 : );
410 :
411 0 : let (candidates, collection_time) = {
412 0 : let started_at = std::time::Instant::now();
413 0 : match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
414 : EvictionCandidates::Cancelled => {
415 0 : return Ok(IterationOutcome::Cancelled);
416 : }
417 0 : EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
418 0 : }
419 0 : };
420 0 :
421 0 : METRICS.layers_collected.inc_by(candidates.len() as u64);
422 0 :
423 0 : tracing::info!(
424 0 : elapsed_ms = collection_time.as_millis(),
425 0 : total_layers = candidates.len(),
426 0 : "collection completed"
427 : );
428 :
429 : // Debug-log the list of candidates
430 0 : let now = SystemTime::now();
431 0 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
432 0 : let nth = i + 1;
433 0 : let total_candidates = candidates.len();
434 0 : let size = candidate.layer.get_file_size();
435 0 : let rel = candidate.relative_last_activity;
436 0 : debug!(
437 0 : "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
438 0 : now.duration_since(candidate.last_activity_ts)
439 0 : .unwrap()
440 0 : .as_micros(),
441 0 : candidate.layer.get_tenant_shard_id(),
442 0 : candidate.layer.get_timeline_id(),
443 0 : candidate.layer.get_name(),
444 : );
445 : }
446 :
447 : // phase1: select victims to relieve pressure
448 : //
449 : // Walk through the list of candidates, until we have accumulated enough layers to get
450 : // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
451 : // how much disk space would be used after evicting all the layers up to the current
452 : // point in the list.
453 : //
454 : // If we get far enough in the list that we start to evict layers that are below
455 : // the tenant's min-resident-size threshold, print a warning, and memorize the disk
456 : // usage at that point, in 'usage_planned_min_resident_size_respecting'.
457 :
458 0 : let (evicted_amount, usage_planned) =
459 0 : select_victims(&candidates, usage_pre).into_amount_and_planned();
460 0 :
461 0 : METRICS.layers_selected.inc_by(evicted_amount as u64);
462 0 :
463 0 : // phase2: evict layers
464 0 :
465 0 : let mut js = tokio::task::JoinSet::new();
466 0 : let limit = 1000;
467 0 :
468 0 : let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
469 0 : let mut consumed_all = false;
470 0 :
471 0 : // After the evictions, `usage_assumed` is the post-eviction usage,
472 0 : // according to internal accounting.
473 0 : let mut usage_assumed = usage_pre;
474 0 : let mut evictions_failed = LayerCount::default();
475 0 :
476 0 : let evict_layers = async move {
477 : loop {
478 0 : let next = if js.len() >= limit || consumed_all {
479 0 : js.join_next().await
480 0 : } else if !js.is_empty() {
481 : // opportunistically consume ready result, one per each new evicted
482 0 : futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
483 : } else {
484 0 : None
485 : };
486 :
487 0 : if let Some(next) = next {
488 0 : match next {
489 0 : Ok(Ok(file_size)) => {
490 0 : METRICS.layers_evicted.inc();
491 0 : usage_assumed.add_available_bytes(file_size);
492 0 : }
493 : Ok(Err((
494 0 : file_size,
495 : EvictionError::NotFound
496 : | EvictionError::Downloaded
497 : | EvictionError::Timeout,
498 0 : ))) => {
499 0 : evictions_failed.file_sizes += file_size;
500 0 : evictions_failed.count += 1;
501 0 : }
502 0 : Err(je) if je.is_cancelled() => unreachable!("not used"),
503 0 : Err(je) if je.is_panic() => { /* already logged */ }
504 0 : Err(je) => tracing::error!("unknown JoinError: {je:?}"),
505 : }
506 0 : }
507 :
508 0 : if consumed_all && js.is_empty() {
509 0 : break;
510 0 : }
511 :
512 : // calling again when consumed_all is fine as evicted is fused.
513 0 : let Some((_partition, candidate)) = evicted.next() else {
514 0 : if !consumed_all {
515 0 : tracing::info!("all evictions started, waiting");
516 0 : consumed_all = true;
517 0 : }
518 0 : continue;
519 : };
520 :
521 0 : match candidate.layer {
522 0 : EvictionLayer::Attached(layer) => {
523 0 : let file_size = layer.layer_desc().file_size;
524 0 : js.spawn(async move {
525 0 : // have a low eviction waiting timeout because our LRU calculations go stale fast;
526 0 : // also individual layer evictions could hang because of bugs and we do not want to
527 0 : // pause disk_usage_based_eviction for such.
528 0 : let timeout = std::time::Duration::from_secs(5);
529 0 :
530 0 : match layer.evict_and_wait(timeout).await {
531 0 : Ok(()) => Ok(file_size),
532 0 : Err(e) => Err((file_size, e)),
533 : }
534 0 : });
535 0 : }
536 0 : EvictionLayer::Secondary(layer) => {
537 0 : let file_size = layer.metadata.file_size();
538 0 : let tenant_manager = tenant_manager.clone();
539 0 :
540 0 : js.spawn(async move {
541 0 : layer
542 0 : .secondary_tenant
543 0 : .evict_layer(
544 0 : tenant_manager.get_conf(),
545 0 : layer.timeline_id,
546 0 : layer.name,
547 0 : layer.metadata,
548 0 : )
549 0 : .await;
550 0 : Ok(file_size)
551 0 : });
552 0 : }
553 : }
554 0 : tokio::task::yield_now().await;
555 : }
556 :
557 0 : (usage_assumed, evictions_failed)
558 0 : };
559 :
560 0 : let started_at = std::time::Instant::now();
561 0 :
562 0 : let evict_layers = async move {
563 0 : let mut evict_layers = std::pin::pin!(evict_layers);
564 0 :
565 0 : let maximum_expected = std::time::Duration::from_secs(10);
566 :
567 0 : let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
568 0 : let tuple = if let Ok(tuple) = res {
569 0 : tuple
570 : } else {
571 0 : let elapsed = started_at.elapsed();
572 0 : tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
573 0 : evict_layers.await
574 : };
575 :
576 0 : let elapsed = started_at.elapsed();
577 0 : tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
578 0 : tuple
579 0 : };
580 :
581 0 : let evict_layers =
582 0 : evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
583 :
584 0 : let (usage_assumed, evictions_failed) = tokio::select! {
585 : tuple = evict_layers => { tuple },
586 : _ = cancel.cancelled() => {
587 : // dropping joinset will abort all pending evict_and_waits and that is fine, our
588 : // requests will still stand
589 : return Ok(IterationOutcome::Cancelled);
590 : }
591 : };
592 :
593 0 : Ok(IterationOutcome::Finished(IterationOutcomeFinished {
594 0 : before: usage_pre,
595 0 : planned: usage_planned,
596 0 : assumed: AssumedUsage {
597 0 : projected_after: usage_assumed,
598 0 : failed: evictions_failed,
599 0 : },
600 0 : }))
601 0 : }
602 :
/// Identifies a layer of a secondary tenant as an eviction candidate.
///
/// The fields are what the eviction passes to `SecondaryTenant::evict_layer`.
#[derive(Clone)]
pub(crate) struct EvictionSecondaryLayer {
    /// The secondary tenant that the layer belongs to and that performs the eviction.
    pub(crate) secondary_tenant: Arc<SecondaryTenant>,
    /// The timeline that the layer belongs to.
    pub(crate) timeline_id: TimelineId,
    /// The name of the layer.
    pub(crate) name: LayerName,
    /// The layer's file metadata; its file size is what the eviction accounts as freed.
    pub(crate) metadata: LayerFileMetadata,
}
610 :
/// Full [`Layer`] objects are specific to tenants in attached mode. This type is a layer
/// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
#[derive(Clone)]
pub(crate) enum EvictionLayer {
    /// A layer of a tenant in attached mode.
    Attached(Layer),
    /// A layer of a secondary tenant.
    Secondary(EvictionSecondaryLayer),
}
618 :
619 : impl From<Layer> for EvictionLayer {
620 0 : fn from(value: Layer) -> Self {
621 0 : Self::Attached(value)
622 0 : }
623 : }
624 :
625 : impl EvictionLayer {
626 0 : pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
627 0 : match self {
628 0 : Self::Attached(l) => &l.layer_desc().tenant_shard_id,
629 0 : Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
630 : }
631 0 : }
632 :
633 0 : pub(crate) fn get_timeline_id(&self) -> &TimelineId {
634 0 : match self {
635 0 : Self::Attached(l) => &l.layer_desc().timeline_id,
636 0 : Self::Secondary(sl) => &sl.timeline_id,
637 : }
638 0 : }
639 :
640 0 : pub(crate) fn get_name(&self) -> LayerName {
641 0 : match self {
642 0 : Self::Attached(l) => l.layer_desc().layer_name(),
643 0 : Self::Secondary(sl) => sl.name.clone(),
644 : }
645 0 : }
646 :
647 0 : pub(crate) fn get_file_size(&self) -> u64 {
648 0 : match self {
649 0 : Self::Attached(l) => l.layer_desc().file_size,
650 0 : Self::Secondary(sl) => sl.metadata.file_size(),
651 : }
652 0 : }
653 : }
654 :
/// A resident layer that may be evicted, together with the data used to order candidates.
#[derive(Clone)]
pub(crate) struct EvictionCandidate {
    /// The layer that would be evicted.
    pub(crate) layer: EvictionLayer,
    /// When the layer was last accessed; the sort key of `EvictionOrder::AbsoluteAccessed`.
    pub(crate) last_activity_ts: SystemTime,
    /// Access rank of the layer relative to its tenant's other layers; the sort key of
    /// `EvictionOrder::RelativeAccessed`. Filled in by `EvictionOrder::relative_last_activity`.
    pub(crate) relative_last_activity: finite_f32::FiniteF32,
}
661 :
662 : impl std::fmt::Display for EvictionLayer {
663 0 : fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
664 0 : match self {
665 0 : Self::Attached(l) => l.fmt(f),
666 0 : Self::Secondary(sl) => {
667 0 : write!(f, "{}/{}", sl.timeline_id, sl.name)
668 : }
669 : }
670 0 : }
671 : }
672 :
/// Per-timeline input to the disk-usage-based eviction.
///
/// The candidate collection reads one of these per timeline.
#[derive(Default)]
pub(crate) struct DiskUsageEvictionInfo {
    /// Timeline's largest layer (remote or resident)
    ///
    /// The collection treats `None` as zero when computing the tenant-wide maximum.
    pub max_layer_size: Option<u64>,
    /// Timeline's resident layers
    pub resident_layers: Vec<EvictionCandidate>,
}
680 :
681 : impl std::fmt::Debug for EvictionCandidate {
682 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
683 0 : // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
684 0 : // having to allocate a string to this is bad, but it will rarely be formatted
685 0 : let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
686 0 : let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
687 0 : struct DisplayIsDebug<'a, T>(&'a T);
688 0 : impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
689 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
690 0 : write!(f, "{}", self.0)
691 0 : }
692 0 : }
693 0 : f.debug_struct("LocalLayerInfoForDiskUsageEviction")
694 0 : .field("layer", &DisplayIsDebug(&self.layer))
695 0 : .field("last_activity", &ts)
696 0 : .finish()
697 0 : }
698 : }
699 :
/// Which side of its tenant's `min_resident_size` reservation a candidate falls on.
///
/// The variant order matters: the derived `Ord` sorts `Above` before `Below`, and the
/// partition is the primary sort key of the eviction order, so all `Above` candidates are
/// evicted before any `Below` candidate.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum MinResidentSizePartition {
    /// Outside of the reservation; evicting these respects the tenant's `min_resident_size`.
    Above,
    /// Part of the reservation; only evicted once the `Above` candidates are exhausted.
    Below,
}
705 :
/// Result of gathering the eviction candidates.
enum EvictionCandidates {
    /// Cancellation was observed while gathering; no candidates are returned.
    Cancelled,
    /// The gathered candidates, each tagged with its partition.
    Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
}
710 :
711 : /// Gather the eviction candidates.
712 : ///
713 : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
714 : /// order. A caller that evicts in that order, until pressure is relieved, implements
715 : /// the eviction policy outlined in the module comment.
716 : ///
717 : /// # Example with EvictionOrder::AbsoluteAccessed
718 : ///
719 : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
720 : /// Each layer has size 100, and both tenant's min_resident_size is 150.
721 : /// The eviction order would be
722 : ///
723 : /// ```text
724 : /// partition last_activity_ts tenant/layer
725 : /// Above 18:30 A/c
726 : /// Above 19:00 A/b
727 : /// Above 18:29 B/c
728 : /// Above 19:05 B/b
729 : /// Above 20:00 B/a
730 : /// Above 20:03 A/a
731 : /// Below 20:30 A/d
732 : /// Below 20:40 B/d
733 : /// Below 20:45 B/e
734 : /// Below 20:58 A/e
735 : /// ```
736 : ///
737 : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
738 : /// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
739 : ///
740 : /// But, if we need to evict 900 bytes to relieve pressure, we'd evict
741 : /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
/// after exhausting the `Above` partition.
743 : /// So, we did not respect each tenant's min_resident_size.
744 : ///
745 : /// # Example with EvictionOrder::RelativeAccessed
746 : ///
747 : /// ```text
748 : /// partition relative_age last_activity_ts tenant/layer
749 : /// Above 0/4 18:30 A/c
750 : /// Above 0/4 18:29 B/c
751 : /// Above 1/4 19:00 A/b
752 : /// Above 1/4 19:05 B/b
753 : /// Above 2/4 20:00 B/a
754 : /// Above 2/4 20:03 A/a
755 : /// Below 3/4 20:30 A/d
756 : /// Below 3/4 20:40 B/d
757 : /// Below 4/4 20:45 B/e
758 : /// Below 4/4 20:58 A/e
759 : /// ```
760 : ///
761 : /// With tenants having the same number of layers the picture does not change much. The same with
762 : /// A having many more layers **resident** (not all of them listed):
763 : ///
764 : /// ```text
765 : /// Above 0/100 18:30 A/c
766 : /// Above 0/4 18:29 B/c
767 : /// Above 1/100 19:00 A/b
768 : /// Above 2/100 20:03 A/a
769 : /// Above 3/100 20:03 A/nth_3
770 : /// Above 4/100 20:03 A/nth_4
771 : /// ...
772 : /// Above 1/4 19:05 B/b
773 : /// Above 25/100 20:04 A/nth_25
774 : /// ...
775 : /// Above 2/4 20:00 B/a
776 : /// Above 50/100 20:10 A/nth_50
777 : /// ...
778 : /// Below 3/4 20:40 B/d
779 : /// Below 99/100 20:30 A/nth_99
780 : /// Below 4/4 20:45 B/e
781 : /// Below 100/100 20:58 A/nth_100
782 : /// ```
783 : ///
784 : /// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
785 : /// difficult to see is what happens on the next round assuming the evicting 23 from the above list
786 : /// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
787 : /// appeared:
788 : ///
789 : /// ```text
790 : /// Above 0/87 20:04 A/nth_23
791 : /// Above 0/3 19:05 B/b
792 : /// Above 0/50 20:59 C/nth_0
793 : /// Above 1/87 20:04 A/nth_24
794 : /// Above 1/50 21:00 C/nth_1
795 : /// Above 2/87 20:04 A/nth_25
796 : /// ...
797 : /// Above 16/50 21:02 C/nth_16
798 : /// Above 1/3 20:00 B/a
799 : /// Above 27/87 20:10 A/nth_50
800 : /// ...
801 : /// Below 2/3 20:40 B/d
802 : /// Below 49/50 21:05 C/nth_49
803 : /// Below 86/87 20:30 A/nth_99
804 : /// Below 3/3 20:45 B/e
805 : /// Below 50/50 21:05 C/nth_50
806 : /// Below 87/87 20:58 A/nth_100
807 : /// ```
808 : ///
809 : /// Now relieving pressure with 23 layers would cost:
810 : /// - tenant A 14 layers
811 : /// - tenant B 1 layer
812 : /// - tenant C 8 layers
813 0 : async fn collect_eviction_candidates(
814 0 : tenant_manager: &Arc<TenantManager>,
815 0 : eviction_order: EvictionOrder,
816 0 : cancel: &CancellationToken,
817 0 : ) -> anyhow::Result<EvictionCandidates> {
818 : const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
819 :
820 : // get a snapshot of the list of tenants
821 0 : let tenants = tenant_manager
822 0 : .list_tenants()
823 0 : .context("get list of tenants")?;
824 :
825 : // TODO: avoid listing every layer in every tenant: this loop can block the executor,
826 : // and the resulting data structure can be huge.
827 : // (https://github.com/neondatabase/neon/issues/6224)
828 0 : let mut candidates = Vec::new();
829 :
830 0 : for (tenant_id, _state, _gen) in tenants {
831 0 : if cancel.is_cancelled() {
832 0 : return Ok(EvictionCandidates::Cancelled);
833 0 : }
834 0 : let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
835 0 : Ok(tenant) if tenant.is_active() => tenant,
836 : Ok(_) => {
837 0 : debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
838 0 : continue;
839 : }
840 0 : Err(e) => {
841 0 : // this can happen if tenant has lifecycle transition after we fetched it
842 0 : debug!("failed to get tenant: {e:#}");
843 0 : continue;
844 : }
845 : };
846 :
847 0 : if tenant.cancel.is_cancelled() {
848 0 : info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
849 0 : continue;
850 0 : }
851 0 :
852 0 : let started_at = std::time::Instant::now();
853 0 :
854 0 : // collect layers from all timelines in this tenant
855 0 : //
856 0 : // If one of the timelines becomes `!is_active()` during the iteration,
857 0 : // for example because we're shutting down, then `max_layer_size` can be too small.
858 0 : // That's OK. This code only runs under a disk pressure situation, and being
859 0 : // a little unfair to tenants during shutdown in such a situation is tolerable.
860 0 : let mut tenant_candidates = Vec::new();
861 0 : let mut max_layer_size = 0;
862 0 : for tl in tenant.list_timelines() {
863 0 : if !tl.is_active() {
864 0 : continue;
865 0 : }
866 0 : let info = tl.get_local_layers_for_disk_usage_eviction().await;
867 0 : debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
868 :
869 0 : tenant_candidates.extend(info.resident_layers.into_iter());
870 0 : max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
871 0 :
872 0 : if cancel.is_cancelled() {
873 0 : return Ok(EvictionCandidates::Cancelled);
874 0 : }
875 : }
876 :
877 : // `min_resident_size` defaults to maximum layer file size of the tenant.
878 : // This ensures that each tenant can have at least one layer resident at a given time,
879 : // ensuring forward progress for a single Timeline::get in that tenant.
880 : // It's a questionable heuristic since, usually, there are many Timeline::get
881 : // requests going on for a tenant, and, at least in Neon prod, the median
882 : // layer file size is much smaller than the compaction target size.
883 : // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
884 : // That's what's typically used by the various background loops.
885 : //
886 : // The default can be overridden with a fixed value in the tenant conf.
887 : // A default override can be put in the default tenant conf in the pageserver.toml.
888 0 : let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
889 0 : debug!(
890 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
891 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
892 0 : overridden_size=s,
893 0 : "using overridden min resident size for tenant"
894 : );
895 0 : s
896 : } else {
897 0 : debug!(
898 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
899 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
900 0 : max_layer_size,
901 0 : "using max layer size as min_resident_size for tenant",
902 : );
903 0 : max_layer_size
904 : };
905 :
906 : // Sort layers most-recently-used first, then partition by
907 : // cumsum above/below min_resident_size.
908 0 : tenant_candidates
909 0 : .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
910 0 : let mut cumsum: i128 = 0;
911 0 :
912 0 : let total = tenant_candidates.len();
913 0 :
914 0 : let tenant_candidates =
915 0 : tenant_candidates
916 0 : .into_iter()
917 0 : .enumerate()
918 0 : .map(|(i, mut candidate)| {
919 0 : // as we iterate this reverse sorted list, the most recently accessed layer will always
920 0 : // be 1.0; this is for us to evict it last.
921 0 : candidate.relative_last_activity =
922 0 : eviction_order.relative_last_activity(total, i);
923 :
924 0 : let partition = if cumsum > min_resident_size as i128 {
925 0 : MinResidentSizePartition::Above
926 : } else {
927 0 : MinResidentSizePartition::Below
928 : };
929 0 : cumsum += i128::from(candidate.layer.get_file_size());
930 0 :
931 0 : (partition, candidate)
932 0 : });
933 0 :
934 0 : METRICS
935 0 : .tenant_layer_count
936 0 : .observe(tenant_candidates.len() as f64);
937 0 :
938 0 : candidates.extend(tenant_candidates);
939 0 :
940 0 : let elapsed = started_at.elapsed();
941 0 : METRICS
942 0 : .tenant_collection_time
943 0 : .observe(elapsed.as_secs_f64());
944 0 :
945 0 : if elapsed > LOG_DURATION_THRESHOLD {
946 0 : tracing::info!(
947 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
948 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
949 0 : elapsed_ms = elapsed.as_millis(),
950 0 : "collection took longer than threshold"
951 : );
952 0 : }
953 : }
954 :
955 : // Note: the same tenant ID might be hit twice, if it transitions from attached to
956 : // secondary while we run. That is okay: when we eventually try and run the eviction,
957 : // the `Gate` on the object will ensure that whichever one has already been shut down
958 : // will not delete anything.
959 :
960 0 : let mut secondary_tenants = Vec::new();
961 0 : tenant_manager.foreach_secondary_tenants(
962 0 : |_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
963 0 : secondary_tenants.push(state.clone());
964 0 : },
965 0 : );
966 :
967 0 : for tenant in secondary_tenants {
968 : // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
969 : // to prevent repeated disk usage based evictions from completely draining less often
970 : // updating secondaries.
971 0 : let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
972 0 :
973 0 : debug_assert!(
974 0 : total_layers >= layer_info.resident_layers.len(),
975 0 : "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
976 0 : layer_info.resident_layers.len()
977 : );
978 :
979 0 : let started_at = std::time::Instant::now();
980 0 :
981 0 : layer_info
982 0 : .resident_layers
983 0 : .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
984 0 :
985 0 : let tenant_candidates =
986 0 : layer_info
987 0 : .resident_layers
988 0 : .into_iter()
989 0 : .enumerate()
990 0 : .map(|(i, mut candidate)| {
991 0 : candidate.relative_last_activity =
992 0 : eviction_order.relative_last_activity(total_layers, i);
993 0 : (
994 0 : // Secondary locations' layers are always considered above the min resident size,
995 0 : // i.e. secondary locations are permitted to be trimmed to zero layers if all
996 0 : // the layers have sufficiently old access times.
997 0 : MinResidentSizePartition::Above,
998 0 : candidate,
999 0 : )
1000 0 : });
1001 0 :
1002 0 : METRICS
1003 0 : .tenant_layer_count
1004 0 : .observe(tenant_candidates.len() as f64);
1005 0 : candidates.extend(tenant_candidates);
1006 0 :
1007 0 : tokio::task::yield_now().await;
1008 :
1009 0 : let elapsed = started_at.elapsed();
1010 0 :
1011 0 : METRICS
1012 0 : .tenant_collection_time
1013 0 : .observe(elapsed.as_secs_f64());
1014 0 :
1015 0 : if elapsed > LOG_DURATION_THRESHOLD {
1016 0 : tracing::info!(
1017 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
1018 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
1019 0 : elapsed_ms = elapsed.as_millis(),
1020 0 : "collection took longer than threshold"
1021 : );
1022 0 : }
1023 : }
1024 :
1025 0 : debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
1026 0 : "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
1027 :
1028 0 : eviction_order.sort(&mut candidates);
1029 0 :
1030 0 : Ok(EvictionCandidates::Finished(candidates))
1031 0 : }
1032 :
1033 : /// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
1034 : /// relieve pressure.
1035 : ///
1036 : /// Returns the amount of candidates selected, with the planned usage.
1037 0 : fn select_victims<U: Usage>(
1038 0 : candidates: &[(MinResidentSizePartition, EvictionCandidate)],
1039 0 : usage_pre: U,
1040 0 : ) -> VictimSelection<U> {
1041 0 : let mut usage_when_switched = None;
1042 0 : let mut usage_planned = usage_pre;
1043 0 : let mut evicted_amount = 0;
1044 :
1045 0 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
1046 0 : if !usage_planned.has_pressure() {
1047 0 : break;
1048 0 : }
1049 0 :
1050 0 : if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
1051 0 : usage_when_switched = Some((usage_planned, i));
1052 0 : }
1053 :
1054 0 : usage_planned.add_available_bytes(candidate.layer.get_file_size());
1055 0 : evicted_amount += 1;
1056 : }
1057 :
1058 0 : VictimSelection {
1059 0 : amount: evicted_amount,
1060 0 : usage_pre,
1061 0 : usage_when_switched,
1062 0 : usage_planned,
1063 0 : }
1064 0 : }
1065 :
1066 : struct VictimSelection<U> {
1067 : amount: usize,
1068 : usage_pre: U,
1069 : usage_when_switched: Option<(U, usize)>,
1070 : usage_planned: U,
1071 : }
1072 :
1073 : impl<U: Usage> VictimSelection<U> {
1074 0 : fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
1075 0 : debug!(
1076 : evicted_amount=%self.amount,
1077 0 : "took enough candidates for pressure to be relieved"
1078 : );
1079 :
1080 0 : if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
1081 0 : warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
1082 0 : }
1083 :
1084 0 : let planned = match self.usage_when_switched {
1085 0 : Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
1086 0 : respecting_tenant_min_resident_size,
1087 0 : fallback_to_global_lru: Some(self.usage_planned),
1088 0 : },
1089 0 : None => PlannedUsage {
1090 0 : respecting_tenant_min_resident_size: self.usage_planned,
1091 0 : fallback_to_global_lru: None,
1092 0 : },
1093 : };
1094 :
1095 0 : (self.amount, planned)
1096 0 : }
1097 : }
1098 :
/// A totally ordered f32 subset we can use with sorting functions.
pub(crate) mod finite_f32 {

    /// A finite `f32` that can be used as a sort key.
    ///
    /// Invariant: the wrapped value is finite and is never `-0.0`. Under that invariant the
    /// derived `PartialEq` (IEEE equality) and the `Ord` implementation (`f32::total_cmp`)
    /// agree with each other, as the `Eq`/`Ord` contracts require. Both constructors uphold
    /// the invariant.
    #[derive(Clone, Copy, PartialEq)]
    pub struct FiniteF32(f32);

    impl std::fmt::Debug for FiniteF32 {
        /// Formats exactly like the wrapped `f32`.
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            std::fmt::Debug::fmt(&self.0, f)
        }
    }

    impl std::fmt::Display for FiniteF32 {
        /// Formats exactly like the wrapped `f32`.
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            std::fmt::Display::fmt(&self.0, f)
        }
    }

    impl std::cmp::Eq for FiniteF32 {}

    impl std::cmp::PartialOrd for FiniteF32 {
        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
            Some(self.cmp(other))
        }
    }

    impl std::cmp::Ord for FiniteF32 {
        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
            self.0.total_cmp(&other.0)
        }
    }

    impl std::convert::TryFrom<f32> for FiniteF32 {
        type Error = f32;

        /// Wraps `value` if it is finite; otherwise hands the rejected value back as the error.
        ///
        /// `-0.0` is stored as `+0.0`: `total_cmp` orders `-0.0` before `+0.0` while `==`
        /// treats them as equal, so keeping the sign would make `Ord` and `PartialEq` disagree.
        fn try_from(value: f32) -> Result<Self, Self::Error> {
            if value.is_finite() {
                let value = if value == 0.0 { 0.0 } else { value };
                Ok(FiniteF32(value))
            } else {
                Err(value)
            }
        }
    }

    impl From<FiniteF32> for f32 {
        fn from(value: FiniteF32) -> f32 {
            value.0
        }
    }

    impl FiniteF32 {
        pub const ZERO: FiniteF32 = FiniteF32(0.0);

        /// Wraps `value` if it lies within `0.0..=1.0`; otherwise returns it as the error.
        ///
        /// NaN and infinities are outside of the range and therefore rejected.
        pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
            if (0.0..=1.0).contains(&value) {
                // -0.0 is within the range, make sure it is assumed 0.0..=1.0
                let value = value.abs();
                Ok(FiniteF32(value))
            } else {
                Err(value)
            }
        }

        /// Returns the wrapped `f32`.
        pub fn into_inner(self) -> f32 {
            self.into()
        }
    }
}
1168 :
1169 : mod filesystem_level_usage {
1170 : use anyhow::Context;
1171 : use camino::Utf8Path;
1172 :
1173 : use crate::statvfs::Statvfs;
1174 :
1175 : use super::DiskUsageEvictionTaskConfig;
1176 :
1177 : #[derive(Debug, Clone, Copy)]
1178 : pub struct Usage<'a> {
1179 : config: &'a DiskUsageEvictionTaskConfig,
1180 :
1181 : /// Filesystem capacity
1182 : total_bytes: u64,
1183 : /// Free filesystem space
1184 : avail_bytes: u64,
1185 : }
1186 :
1187 : impl super::Usage for Usage<'_> {
1188 14 : fn has_pressure(&self) -> bool {
1189 14 : let usage_pct =
1190 14 : (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
1191 14 :
1192 14 : let pressures = [
1193 14 : (
1194 14 : "min_avail_bytes",
1195 14 : self.avail_bytes < self.config.min_avail_bytes,
1196 14 : ),
1197 14 : (
1198 14 : "max_usage_pct",
1199 14 : usage_pct >= self.config.max_usage_pct.get() as u64,
1200 14 : ),
1201 14 : ];
1202 14 :
1203 28 : pressures.into_iter().any(|(_, has_pressure)| has_pressure)
1204 14 : }
1205 :
1206 12 : fn add_available_bytes(&mut self, bytes: u64) {
1207 12 : self.avail_bytes += bytes;
1208 12 : }
1209 : }
1210 :
1211 0 : pub fn get<'a>(
1212 0 : tenants_dir: &Utf8Path,
1213 0 : config: &'a DiskUsageEvictionTaskConfig,
1214 0 : ) -> anyhow::Result<Usage<'a>> {
1215 0 : let mock_config = {
1216 0 : #[cfg(feature = "testing")]
1217 0 : {
1218 0 : config.mock_statvfs.as_ref()
1219 : }
1220 : #[cfg(not(feature = "testing"))]
1221 : {
1222 : None
1223 : }
1224 : };
1225 :
1226 0 : let stat = Statvfs::get(tenants_dir, mock_config)
1227 0 : .context("statvfs failed, presumably directory got unlinked")?;
1228 :
1229 : // https://unix.stackexchange.com/a/703650
1230 0 : let blocksize = if stat.fragment_size() > 0 {
1231 0 : stat.fragment_size()
1232 : } else {
1233 0 : stat.block_size()
1234 : };
1235 :
1236 : // use blocks_available (b_avail) since, pageserver runs as unprivileged user
1237 0 : let avail_bytes = stat.blocks_available() * blocksize;
1238 0 : let total_bytes = stat.blocks() * blocksize;
1239 0 :
1240 0 : Ok(Usage {
1241 0 : config,
1242 0 : total_bytes,
1243 0 : avail_bytes,
1244 0 : })
1245 0 : }
1246 :
1247 : #[test]
1248 2 : fn max_usage_pct_pressure() {
1249 2 : use super::EvictionOrder;
1250 2 : use super::Usage as _;
1251 2 : use std::time::Duration;
1252 2 : use utils::serde_percent::Percent;
1253 2 :
1254 2 : let mut usage = Usage {
1255 2 : config: &DiskUsageEvictionTaskConfig {
1256 2 : max_usage_pct: Percent::new(85).unwrap(),
1257 2 : min_avail_bytes: 0,
1258 2 : period: Duration::MAX,
1259 2 : #[cfg(feature = "testing")]
1260 2 : mock_statvfs: None,
1261 2 : eviction_order: EvictionOrder::default(),
1262 2 : },
1263 2 : total_bytes: 100_000,
1264 2 : avail_bytes: 0,
1265 2 : };
1266 2 :
1267 2 : assert!(usage.has_pressure(), "expected pressure at 100%");
1268 :
1269 2 : usage.add_available_bytes(14_000);
1270 2 : assert!(usage.has_pressure(), "expected pressure at 86%");
1271 :
1272 2 : usage.add_available_bytes(999);
1273 2 : assert!(usage.has_pressure(), "expected pressure at 85.001%");
1274 :
1275 2 : usage.add_available_bytes(1);
1276 2 : assert!(usage.has_pressure(), "expected pressure at precisely 85%");
1277 :
1278 2 : usage.add_available_bytes(1);
1279 2 : assert!(!usage.has_pressure(), "no pressure at 84.999%");
1280 :
1281 2 : usage.add_available_bytes(999);
1282 2 : assert!(!usage.has_pressure(), "no pressure at 84%");
1283 :
1284 2 : usage.add_available_bytes(16_000);
1285 2 : assert!(!usage.has_pressure());
1286 2 : }
1287 : }
1288 :
#[cfg(test)]
mod tests {
    use super::*;

    /// Relative last-activity values for every index of a `len`-element layer list.
    fn relative_activities(order: EvictionOrder, len: usize) -> Vec<f32> {
        let mut values = Vec::with_capacity(len);
        for index in 0..len {
            values.push(order.relative_last_activity(len, index).into_inner());
        }
        values
    }

    /// `true` if every element is strictly greater than its successor.
    fn strictly_decreasing(values: &[f32]) -> bool {
        values
            .iter()
            .zip(values.iter().skip(1))
            .all(|(earlier, later)| earlier > later)
    }

    /// Without the layer-count bias the values span the full `1.0..=0.0` range.
    #[test]
    fn relative_equal_bounds() {
        let values = relative_activities(
            EvictionOrder::RelativeAccessed {
                highest_layer_count_loses_first: false,
            },
            10,
        );

        assert_eq!(values.first().copied(), Some(1.0));
        assert_eq!(values.last().copied(), Some(0.0));
        assert!(strictly_decreasing(&values));
    }

    /// With the layer-count bias the last of ten values stays at `0.1` instead of `0.0`.
    #[test]
    fn relative_spare_bounds() {
        let values = relative_activities(
            EvictionOrder::RelativeAccessed {
                highest_layer_count_loses_first: true,
            },
            10,
        );

        assert_eq!(values.first().copied(), Some(1.0));
        assert_eq!(values.last().copied(), Some(0.1));
        assert!(strictly_decreasing(&values));
    }
}
|