Line data Source code
1 : //! This module implements the pageserver-global disk-usage-based layer eviction task.
2 : //!
3 : //! # Mechanics
4 : //!
5 : //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
6 : //! loop that evicts layers in response to a shortage of available bytes
7 : //! in the $repo/tenants directory's filesystem.
8 : //!
9 : //! The loop runs periodically at a configurable `period`.
10 : //!
11 : //! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
12 : //! It compares the returned usage data against two different types of thresholds.
13 : //! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
14 : //! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
15 : //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
16 : //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
17 : //!
18 : //! # Eviction Policy
19 : //!
20 : //! There are two thresholds:
21 : //! `max_usage_pct` is the maximum used space, expressed in percent of the total filesystem space.
22 : //! If the actual usage is higher, the threshold is exceeded.
23 : //! `min_avail_bytes` is the minimum available space, in absolute bytes.
24 : //! If the actually available space is lower, the threshold is exceeded.
25 : //! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
26 : //! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
27 : //! The iteration evicts layers in LRU fashion, but with a weak reservation per tenant.
28 : //! The reservation is to keep the most recently accessed X bytes per tenant resident.
29 : //! If we cannot relieve pressure by evicting layers outside of the reservation, we
30 : //! start evicting layers that are part of the reservation, LRU first.
31 : //!
32 : //! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
33 : //! throughout the code, but no actual variable carries that name.
34 : //! The per-tenant default value is `max(tenant's layer file sizes)`, counting both local and remote layers.
35 : //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
36 : //! during page reconstruction.
37 : //! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
38 : //! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
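//!
//! # Example Configuration
//!
//! For illustration, a sketch of what enabling this task in `pageserver.toml` could look like.
//! The section name is assumed to match the `disk_usage_based_eviction` field on `PageServerConf`,
//! the field names mirror `DiskUsageEvictionTaskConfig` below, and the concrete values are made up
//! rather than being recommendations:
//!
//! ```toml
//! [disk_usage_based_eviction]
//! # evict until at most 80% of the filesystem is used ...
//! max_usage_pct = 80
//! # ... and at least 10 GiB are available
//! min_avail_bytes = 10737418240
//! # run one iteration per minute (parsed by humantime)
//! period = "60s"
//! # optional; defaults to "AbsoluteAccessed"
//! eviction_order = { type = "RelativeAccessed", args = { highest_layer_count_loses_first = true } }
//! ```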
39 :
40 : // Implementation notes:
41 : // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
42 : // reading these fields. We use the Debug impl for semi-structured logging, though.
43 :
44 : use std::{
45 : sync::Arc,
46 : time::{Duration, SystemTime},
47 : };
48 :
49 : use anyhow::Context;
50 : use pageserver_api::shard::TenantShardId;
51 : use remote_storage::GenericRemoteStorage;
52 : use serde::{Deserialize, Serialize};
53 : use tokio::time::Instant;
54 : use tokio_util::sync::CancellationToken;
55 : use tracing::{debug, error, info, instrument, warn, Instrument};
56 : use utils::serde_percent::Percent;
57 : use utils::{completion, id::TimelineId};
58 :
59 : use crate::{
60 : config::PageServerConf,
61 : task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
62 : tenant::{
63 : self,
64 : mgr::TenantManager,
65 : remote_timeline_client::LayerFileMetadata,
66 : secondary::SecondaryTenant,
67 : storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
68 : Timeline,
69 : },
70 : };
71 :
72 64 : #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
73 : pub struct DiskUsageEvictionTaskConfig {
74 : pub max_usage_pct: Percent,
75 : pub min_avail_bytes: u64,
76 : #[serde(with = "humantime_serde")]
77 : pub period: Duration,
78 : #[cfg(feature = "testing")]
79 : pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
80 : /// Select sorting for evicted layers
81 : #[serde(default)]
82 : pub eviction_order: EvictionOrder,
83 : }
84 :
85 : /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
86 : /// partitioning.
87 60 : #[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
88 : #[serde(tag = "type", content = "args")]
89 : pub enum EvictionOrder {
90 : /// Order the layers to be evicted by how recently they have been accessed in absolute
91 : /// time.
92 : ///
93 :     /// This strategy is unfair to the slower-growing tenants when some tenants grow faster
94 :     /// than others.
95 : #[default]
96 : AbsoluteAccessed,
97 :
98 : /// Order the layers to be evicted by how recently they have been accessed relatively within
99 : /// the set of resident layers of a tenant.
100 : RelativeAccessed {
101 : /// Determines if the tenant with most layers should lose first.
102 : ///
103 : /// Having this enabled is currently the only reasonable option, because the order in which
104 : /// we read tenants is deterministic. If we find the need to use this as `false`, we need
105 : /// to ensure nondeterminism by adding in a random number to break the
106 : /// `relative_last_activity==0.0` ties.
107 : #[serde(default = "default_highest_layer_count_loses_first")]
108 : highest_layer_count_loses_first: bool,
109 : },
110 : }
111 :
112 0 : fn default_highest_layer_count_loses_first() -> bool {
113 0 : true
114 0 : }
115 :
116 : impl EvictionOrder {
117 17 : fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
118 17 : use EvictionOrder::*;
119 17 :
120 17 : match self {
121 10 : AbsoluteAccessed => {
122 2822 : candidates.sort_unstable_by_key(|(partition, candidate)| {
123 2822 : (*partition, candidate.last_activity_ts)
124 2822 : });
125 10 : }
126 2522 : RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
127 2522 : (*partition, candidate.relative_last_activity)
128 2522 : }),
129 : }
130 17 : }
131 :
132 :     /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating a
133 :     /// tenant's layers in **most** recently used order.
134 617 : fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
135 617 : use EvictionOrder::*;
136 617 :
137 617 : match self {
138 331 : AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
139 : RelativeAccessed {
140 286 : highest_layer_count_loses_first,
141 : } => {
142 :                 // Keeping the -1 (or not) decides whether every tenant should lose its least recently
143 :                 // accessed layer, OR whether this should happen in the order of highest layer count:
144 286 : let fudge = if *highest_layer_count_loses_first {
145 : // relative_last_activity vs. tenant layer count:
146 : // - 0.1..=1.0 (10 layers)
147 : // - 0.01..=1.0 (100 layers)
148 : // - 0.001..=1.0 (1000 layers)
149 : //
150 : // leading to evicting less of the smallest tenants.
151 86 : 0
152 : } else {
153 : // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
154 : // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
155 : // be that less than 10k layer evictions is enough, so we would not need to evict from
156 : // all tenants.
157 : //
158 : // as the tenant ordering is now deterministic this could hit the same tenants
159 : // disproportionetly on multiple invocations. alternative could be to remember how many
160 : // layers did we evict last time from this tenant, and inject that as an additional
161 : // fudge here.
162 200 : 1
163 : };
164 :
165 286 : let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
166 286 : let divider = total as f32;
167 286 :
168 286 : // most recently used is always (total - 0) / divider == 1.0
169 286 : // least recently used depends on the fudge:
170 286 : // - (total - 1) - (total - 1) / total => 0 / total
171 286 : // - total - (total - 1) / total => 1 / total
172 286 : let distance = (total - index) as f32;
173 286 :
174 286 : finite_f32::FiniteF32::try_from_normalized(distance / divider)
175 286 : .unwrap_or_else(|val| {
176 0 : tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
177 0 : finite_f32::FiniteF32::ZERO
178 286 : })
179 : }
180 : }
181 617 : }
182 : }
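// Worked example (illustrative, not from the original source) of `relative_last_activity`
// above, for a hypothetical tenant with `total = 10` resident layers iterated most recently
// used first:
//
//   highest_layer_count_loses_first = true  (fudge = 0, divider = 10):
//       index 0 (MRU) -> (10 - 0) / 10 = 1.0
//       index 9 (LRU) -> (10 - 9) / 10 = 0.1
//
//   highest_layer_count_loses_first = false (fudge = 1, divider = 9):
//       index 0 (MRU) -> (9 - 0) / 9 = 1.0
//       index 9 (LRU) -> (9 - 9) / 9 = 0.0
//
// With `true`, a small tenant's least recently used layer therefore bottoms out at 1/total
// (0.1 here) instead of 0.0, so tenants with more layers sort earlier and lose theirs first;
// compare the `relative_equal_bounds` and `relative_spare_bounds` tests at the bottom of this file.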
183 :
184 624 : #[derive(Default)]
185 : pub struct State {
186 : /// Exclude http requests and background task from running at the same time.
187 : mutex: tokio::sync::Mutex<()>,
188 : }
189 :
190 624 : pub fn launch_disk_usage_global_eviction_task(
191 624 : conf: &'static PageServerConf,
192 624 : storage: GenericRemoteStorage,
193 624 : state: Arc<State>,
194 624 : tenant_manager: Arc<TenantManager>,
195 624 : background_jobs_barrier: completion::Barrier,
196 624 : ) -> anyhow::Result<()> {
197 624 : let Some(task_config) = &conf.disk_usage_based_eviction else {
198 620 : info!("disk usage based eviction task not configured");
199 620 : return Ok(());
200 : };
201 :
202 4 : info!("launching disk usage based eviction task");
203 :
204 4 : task_mgr::spawn(
205 4 : BACKGROUND_RUNTIME.handle(),
206 4 : TaskKind::DiskUsageEviction,
207 4 : None,
208 4 : None,
209 4 : "disk usage based eviction",
210 4 : false,
211 4 : async move {
212 4 : let cancel = task_mgr::shutdown_token();
213 4 :
214 4 : // wait until initial load is complete, because we cannot evict from loading tenants.
215 8 : tokio::select! {
216 8 : _ = cancel.cancelled() => { return Ok(()); },
217 8 : _ = background_jobs_barrier.wait() => { }
218 8 : };
219 :
220 107 : disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
221 0 : Ok(())
222 4 : },
223 4 : );
224 4 :
225 4 : Ok(())
226 624 : }
227 :
228 8 : #[instrument(skip_all)]
229 : async fn disk_usage_eviction_task(
230 : state: &State,
231 : task_config: &DiskUsageEvictionTaskConfig,
232 : storage: &GenericRemoteStorage,
233 : tenant_manager: Arc<TenantManager>,
234 : cancel: CancellationToken,
235 : ) {
236 0 : scopeguard::defer! {
237 0 : info!("disk usage based eviction task finishing");
238 : };
239 :
240 : use crate::tenant::tasks::random_init_delay;
241 : {
242 : if random_init_delay(task_config.period, &cancel)
243 : .await
244 : .is_err()
245 : {
246 : return;
247 : }
248 : }
249 :
250 : let mut iteration_no = 0;
251 : loop {
252 : iteration_no += 1;
253 : let start = Instant::now();
254 :
255 7 : async {
256 7 : let res = disk_usage_eviction_task_iteration(
257 7 : state,
258 7 : task_config,
259 7 : storage,
260 7 : &tenant_manager,
261 7 : &cancel,
262 7 : )
263 100 : .await;
264 :
265 7 : match res {
266 6 : Ok(()) => {}
267 1 : Err(e) => {
268 1 : // these stat failures are expected to be very rare
269 1 : warn!("iteration failed, unexpected error: {e:#}");
270 : }
271 : }
272 7 : }
273 : .instrument(tracing::info_span!("iteration", iteration_no))
274 : .await;
275 :
276 : let sleep_until = start + task_config.period;
277 : if tokio::time::timeout_at(sleep_until, cancel.cancelled())
278 : .await
279 : .is_ok()
280 : {
281 : break;
282 : }
283 : }
284 : }
285 :
286 : pub trait Usage: Clone + Copy + std::fmt::Debug {
287 : fn has_pressure(&self) -> bool;
288 : fn add_available_bytes(&mut self, bytes: u64);
289 : }
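// A minimal sketch (not part of the original module) of what a `Usage` implementation looks
// like: it tracks only available bytes against a fixed floor, whereas the real implementation
// in this file, `filesystem_level_usage::Usage`, additionally checks `max_usage_pct`.
// `BytesOnlyUsage` and its fields are hypothetical names used for illustration only.
#[derive(Debug, Clone, Copy)]
struct BytesOnlyUsage {
    avail_bytes: u64,
    min_avail_bytes: u64,
}

impl Usage for BytesOnlyUsage {
    fn has_pressure(&self) -> bool {
        // pressure as long as free space is below the configured floor
        self.avail_bytes < self.min_avail_bytes
    }

    fn add_available_bytes(&mut self, bytes: u64) {
        // the planning/eviction code credits back the size of each layer it (plans to) evict
        self.avail_bytes += bytes;
    }
}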
290 :
291 7 : async fn disk_usage_eviction_task_iteration(
292 7 : state: &State,
293 7 : task_config: &DiskUsageEvictionTaskConfig,
294 7 : storage: &GenericRemoteStorage,
295 7 : tenant_manager: &Arc<TenantManager>,
296 7 : cancel: &CancellationToken,
297 7 : ) -> anyhow::Result<()> {
298 7 : let tenants_dir = tenant_manager.get_conf().tenants_path();
299 7 : let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
300 7 : .context("get filesystem-level disk usage before evictions")?;
301 6 : let res = disk_usage_eviction_task_iteration_impl(
302 6 : state,
303 6 : storage,
304 6 : usage_pre,
305 6 : tenant_manager,
306 6 : task_config.eviction_order,
307 6 : cancel,
308 6 : )
309 100 : .await;
310 6 : match res {
311 6 : Ok(outcome) => {
312 0 : debug!(?outcome, "disk_usage_eviction_iteration finished");
313 6 : match outcome {
314 2 : IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
315 2 : // nothing to do, select statement below will handle things
316 2 : }
317 4 : IterationOutcome::Finished(outcome) => {
318 : // Verify with statvfs whether we made any real progress
319 4 : let after = filesystem_level_usage::get(&tenants_dir, task_config)
320 4 : // It's quite unlikely to hit the error here. Keep the code simple and bail out.
321 4 : .context("get filesystem-level disk usage after evictions")?;
322 :
323 0 : debug!(?after, "disk usage");
324 :
325 4 : if after.has_pressure() {
326 : // Don't bother doing an out-of-order iteration here now.
327 : // In practice, the task period is set to a value in the tens-of-seconds range,
328 : // which will cause another iteration to happen soon enough.
329 : // TODO: deltas between the three different usages would be helpful,
330 : // consider MiB, GiB, TiB
331 0 : warn!(?outcome, ?after, "disk usage still high");
332 : } else {
333 4 : info!(?outcome, ?after, "disk usage pressure relieved");
334 : }
335 : }
336 : }
337 : }
338 0 : Err(e) => {
339 0 : error!("disk_usage_eviction_iteration failed: {:#}", e);
340 : }
341 : }
342 :
343 6 : Ok(())
344 7 : }
345 :
346 13 : #[derive(Debug, Serialize)]
347 : #[allow(clippy::large_enum_variant)]
348 : pub enum IterationOutcome<U> {
349 : NoPressure,
350 : Cancelled,
351 : Finished(IterationOutcomeFinished<U>),
352 : }
353 :
354 : #[allow(dead_code)]
355 17 : #[derive(Debug, Serialize)]
356 : pub struct IterationOutcomeFinished<U> {
357 : /// The actual usage observed before we started the iteration.
358 : before: U,
359 : /// The expected value for `after`, according to internal accounting, after phase 1.
360 : planned: PlannedUsage<U>,
361 : /// The outcome of phase 2, where we actually do the evictions.
362 : ///
363 : /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
364 : /// be the same as `planned`.
365 : assumed: AssumedUsage<U>,
366 : }
367 :
368 17 : #[derive(Debug, Serialize)]
369 : #[allow(dead_code)]
370 : struct AssumedUsage<U> {
371 : /// The expected value for `after`, after phase 2.
372 : projected_after: U,
373 : /// The layers we failed to evict during phase 2.
374 : failed: LayerCount,
375 : }
376 :
377 : #[allow(dead_code)]
378 17 : #[derive(Debug, Serialize)]
379 : struct PlannedUsage<U> {
380 : respecting_tenant_min_resident_size: U,
381 : fallback_to_global_lru: Option<U>,
382 : }
383 :
384 : #[allow(dead_code)]
385 17 : #[derive(Debug, Default, Serialize)]
386 : struct LayerCount {
387 : file_sizes: u64,
388 : count: usize,
389 : }
390 :
391 19 : pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
392 19 : state: &State,
393 19 : _storage: &GenericRemoteStorage,
394 19 : usage_pre: U,
395 19 : tenant_manager: &Arc<TenantManager>,
396 19 : eviction_order: EvictionOrder,
397 19 : cancel: &CancellationToken,
398 19 : ) -> anyhow::Result<IterationOutcome<U>> {
399 : // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
400 19 : let _g = state
401 19 : .mutex
402 19 : .try_lock()
403 19 : .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
404 :
405 0 : debug!(?usage_pre, "disk usage");
406 :
407 19 : if !usage_pre.has_pressure() {
408 2 : return Ok(IterationOutcome::NoPressure);
409 17 : }
410 :
411 17 : warn!(
412 17 : ?usage_pre,
413 17 : "running disk usage based eviction due to pressure"
414 17 : );
415 :
416 17 : let candidates =
417 25 : match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
418 : EvictionCandidates::Cancelled => {
419 0 : return Ok(IterationOutcome::Cancelled);
420 : }
421 17 : EvictionCandidates::Finished(partitioned) => partitioned,
422 17 : };
423 17 :
424 17 : // Debug-log the list of candidates
425 17 : let now = SystemTime::now();
426 577 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
427 577 : let nth = i + 1;
428 577 : let total_candidates = candidates.len();
429 577 : let size = candidate.layer.get_file_size();
430 577 : let rel = candidate.relative_last_activity;
431 0 : debug!(
432 0 : "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
433 0 : now.duration_since(candidate.last_activity_ts)
434 0 : .unwrap()
435 0 : .as_micros(),
436 0 : candidate.layer.get_tenant_shard_id(),
437 0 : candidate.layer.get_timeline_id(),
438 0 : candidate.layer.get_name(),
439 0 : );
440 : }
441 :
442 : // phase1: select victims to relieve pressure
443 : //
444 : // Walk through the list of candidates, until we have accumulated enough layers to get
445 : // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
446 : // how much disk space would be used after evicting all the layers up to the current
447 : // point in the list.
448 : //
449 :     // If we get far enough in the list that we start to evict layers that are below
450 :     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
451 :     // usage at that point (reported as `respecting_tenant_min_resident_size` in the outcome).
452 :
453 17 : let selection = select_victims(&candidates, usage_pre);
454 17 :
455 17 : let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
456 17 :
457 17 : // phase2: evict layers
458 17 :
459 17 : let mut js = tokio::task::JoinSet::new();
460 17 : let limit = 1000;
461 17 :
462 17 : let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
463 17 : let mut consumed_all = false;
464 17 :
465 17 : // After the evictions, `usage_assumed` is the post-eviction usage,
466 17 : // according to internal accounting.
467 17 : let mut usage_assumed = usage_pre;
468 17 : let mut evictions_failed = LayerCount::default();
469 17 :
470 17 : let evict_layers = async move {
471 : loop {
472 524 : let next = if js.len() >= limit || consumed_all {
473 224 : js.join_next().await
474 300 : } else if !js.is_empty() {
475 :                 // opportunistically consume a ready result, one for each newly spawned eviction
476 283 : futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
477 : } else {
478 17 : None
479 : };
480 :
481 524 : if let Some(next) = next {
482 0 : match next {
483 283 : Ok(Ok(file_size)) => {
484 283 : usage_assumed.add_available_bytes(file_size);
485 283 : }
486 0 : Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
487 0 : evictions_failed.file_sizes += file_size;
488 0 : evictions_failed.count += 1;
489 0 : }
490 0 : Err(je) if je.is_cancelled() => unreachable!("not used"),
491 0 : Err(je) if je.is_panic() => { /* already logged */ }
492 0 : Err(je) => tracing::error!("unknown JoinError: {je:?}"),
493 : }
494 241 : }
495 :
496 524 : if consumed_all && js.is_empty() {
497 17 : break;
498 507 : }
499 :
500 :             // calling `next()` again once consumed_all is set is fine, as `evicted` is fused.
501 507 : let Some((_partition, candidate)) = evicted.next() else {
502 224 : consumed_all = true;
503 224 : continue;
504 : };
505 :
506 283 : match candidate.layer {
507 270 : EvictionLayer::Attached(layer) => {
508 270 : let file_size = layer.layer_desc().file_size;
509 270 : js.spawn(async move {
510 270 : layer
511 270 : .evict_and_wait()
512 270 : .await
513 270 : .map(|()| file_size)
514 270 : .map_err(|e| (file_size, e))
515 270 : });
516 270 : }
517 13 : EvictionLayer::Secondary(layer) => {
518 13 : let file_size = layer.metadata.file_size();
519 13 : let tenant_manager = tenant_manager.clone();
520 13 :
521 13 : js.spawn(async move {
522 13 : layer
523 13 : .secondary_tenant
524 13 : .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
525 13 : .await;
526 13 : Ok(file_size)
527 13 : });
528 13 : }
529 : }
530 283 : tokio::task::yield_now().await;
531 : }
532 :
533 17 : (usage_assumed, evictions_failed)
534 17 : };
535 :
536 17 : let (usage_assumed, evictions_failed) = tokio::select! {
537 17 : tuple = evict_layers => { tuple },
538 : _ = cancel.cancelled() => {
539 : // dropping joinset will abort all pending evict_and_waits and that is fine, our
540 : // requests will still stand
541 : return Ok(IterationOutcome::Cancelled);
542 : }
543 : };
544 :
545 17 : Ok(IterationOutcome::Finished(IterationOutcomeFinished {
546 17 : before: usage_pre,
547 17 : planned: usage_planned,
548 17 : assumed: AssumedUsage {
549 17 : projected_after: usage_assumed,
550 17 : failed: evictions_failed,
551 17 : },
552 17 : }))
553 19 : }
554 :
555 0 : #[derive(Clone)]
556 : pub(crate) struct EvictionSecondaryLayer {
557 : pub(crate) secondary_tenant: Arc<SecondaryTenant>,
558 : pub(crate) timeline_id: TimelineId,
559 : pub(crate) name: LayerFileName,
560 : pub(crate) metadata: LayerFileMetadata,
561 : }
562 :
563 : /// Full [`Layer`] objects are specific to tenants in attached mode. This type is a layer
564 : /// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
565 0 : #[derive(Clone)]
566 : pub(crate) enum EvictionLayer {
567 : Attached(Layer),
568 : #[allow(dead_code)]
569 : Secondary(EvictionSecondaryLayer),
570 : }
571 :
572 : impl From<Layer> for EvictionLayer {
573 539 : fn from(value: Layer) -> Self {
574 539 : Self::Attached(value)
575 539 : }
576 : }
577 :
578 : impl EvictionLayer {
579 0 : pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
580 0 : match self {
581 0 : Self::Attached(l) => &l.layer_desc().tenant_shard_id,
582 0 : Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
583 : }
584 0 : }
585 :
586 0 : pub(crate) fn get_timeline_id(&self) -> &TimelineId {
587 0 : match self {
588 0 : Self::Attached(l) => &l.layer_desc().timeline_id,
589 0 : Self::Secondary(sl) => &sl.timeline_id,
590 : }
591 0 : }
592 :
593 0 : pub(crate) fn get_name(&self) -> LayerFileName {
594 0 : match self {
595 0 : Self::Attached(l) => l.layer_desc().filename(),
596 0 : Self::Secondary(sl) => sl.name.clone(),
597 : }
598 0 : }
599 :
600 1437 : pub(crate) fn get_file_size(&self) -> u64 {
601 1437 : match self {
602 1348 : Self::Attached(l) => l.layer_desc().file_size,
603 89 : Self::Secondary(sl) => sl.metadata.file_size(),
604 : }
605 1437 : }
606 : }
607 :
608 0 : #[derive(Clone)]
609 : pub(crate) struct EvictionCandidate {
610 : pub(crate) layer: EvictionLayer,
611 : pub(crate) last_activity_ts: SystemTime,
612 : pub(crate) relative_last_activity: finite_f32::FiniteF32,
613 : }
614 :
615 : impl std::fmt::Display for EvictionLayer {
616 0 : fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
617 0 : match self {
618 0 : Self::Attached(l) => l.fmt(f),
619 0 : Self::Secondary(sl) => {
620 0 : write!(f, "{}/{}", sl.timeline_id, sl.name)
621 : }
622 : }
623 0 : }
624 : }
625 :
626 2 : #[derive(Default)]
627 : pub(crate) struct DiskUsageEvictionInfo {
628 : /// Timeline's largest layer (remote or resident)
629 : pub max_layer_size: Option<u64>,
630 : /// Timeline's resident layers
631 : pub resident_layers: Vec<EvictionCandidate>,
632 : }
633 :
634 : impl std::fmt::Debug for EvictionCandidate {
635 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
636 0 :         // format the tv_sec, tv_nsec as RFC 3339 in case someone is looking at it;
637 0 :         // having to allocate a string for this is unfortunate, but it will rarely be formatted
638 0 : let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
639 0 : let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
640 0 : struct DisplayIsDebug<'a, T>(&'a T);
641 0 : impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
642 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
643 0 : write!(f, "{}", self.0)
644 0 : }
645 0 : }
646 0 : f.debug_struct("LocalLayerInfoForDiskUsageEviction")
647 0 : .field("layer", &DisplayIsDebug(&self.layer))
648 0 : .field("last_activity", &ts)
649 0 : .finish()
650 0 : }
651 : }
652 :
653 2689 : #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
654 : enum MinResidentSizePartition {
655 : Above,
656 : Below,
657 : }
658 :
659 : enum EvictionCandidates {
660 : Cancelled,
661 : Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
662 : }
663 :
664 : /// Gather the eviction candidates.
665 : ///
666 : /// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
667 : /// order. A caller that evicts in that order, until pressure is relieved, implements
668 : /// the eviction policy outlined in the module comment.
669 : ///
670 : /// # Example with EvictionOrder::AbsoluteAccessed
671 : ///
672 : /// Imagine that there are two tenants, A and B, with five layers each, a-e.
673 : /// Each layer has size 100, and both tenants' min_resident_size is 150.
674 : /// The eviction order would be
675 : ///
676 : /// ```text
677 : /// partition last_activity_ts tenant/layer
678 : /// Above 18:30 A/c
679 : /// Above 19:00 A/b
680 : /// Above 18:29 B/c
681 : /// Above 19:05 B/b
682 : /// Above 20:00 B/a
683 : /// Above 20:03 A/a
684 : /// Below 20:30 A/d
685 : /// Below 20:40 B/d
686 : /// Below 20:45 B/e
687 : /// Below 20:58 A/e
688 : /// ```
689 : ///
690 : /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
691 : /// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
692 : ///
693 : /// But, if we need to evict 900 bytes to relieve pressure, we'd evict
694 : /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
695 : /// after exhausting the `Above` partition.
696 : /// So, we did not respect each tenant's min_resident_size.
697 : ///
698 : /// # Example with EvictionOrder::RelativeAccessed
699 : ///
700 : /// ```text
701 : /// partition relative_age last_activity_ts tenant/layer
702 : /// Above 0/4 18:30 A/c
703 : /// Above 0/4 18:29 B/c
704 : /// Above 1/4 19:00 A/b
705 : /// Above 1/4 19:05 B/b
706 : /// Above 2/4 20:00 B/a
707 : /// Above 2/4 20:03 A/a
708 : /// Below 3/4 20:30 A/d
709 : /// Below 3/4 20:40 B/d
710 : /// Below 4/4 20:45 B/e
711 : /// Below 4/4 20:58 A/e
712 : /// ```
713 : ///
714 : /// With tenants having the same number of layers, the picture does not change much. Here is the
715 : /// same example, but with A having many more layers **resident** (not all of them listed):
716 : ///
717 : /// ```text
718 : /// Above 0/100 18:30 A/c
719 : /// Above 0/4 18:29 B/c
720 : /// Above 1/100 19:00 A/b
721 : /// Above 2/100 20:03 A/a
722 : /// Above 3/100 20:03 A/nth_3
723 : /// Above 4/100 20:03 A/nth_4
724 : /// ...
725 : /// Above 1/4 19:05 B/b
726 : /// Above 25/100 20:04 A/nth_25
727 : /// ...
728 : /// Above 2/4 20:00 B/a
729 : /// Above 50/100 20:10 A/nth_50
730 : /// ...
731 : /// Below 3/4 20:40 B/d
732 : /// Below 99/100 20:30 A/nth_99
733 : /// Below 4/4 20:45 B/e
734 : /// Below 100/100 20:58 A/nth_100
735 : /// ```
736 : ///
737 : /// Now it's easier to see that, because A has grown fast, it has more layers to get evicted. What is
738 : /// harder to see is what happens on the next round, assuming that evicting the 23 layers from the above
739 : /// list relieves the pressure (22 A layers gone, 1 B layer gone) but a new fast-growing tenant C has
740 : /// appeared:
741 : ///
742 : /// ```text
743 : /// Above 0/87 20:04 A/nth_23
744 : /// Above 0/3 19:05 B/b
745 : /// Above 0/50 20:59 C/nth_0
746 : /// Above 1/87 20:04 A/nth_24
747 : /// Above 1/50 21:00 C/nth_1
748 : /// Above 2/87 20:04 A/nth_25
749 : /// ...
750 : /// Above 16/50 21:02 C/nth_16
751 : /// Above 1/3 20:00 B/a
752 : /// Above 27/87 20:10 A/nth_50
753 : /// ...
754 : /// Below 2/3 20:40 B/d
755 : /// Below 49/50 21:05 C/nth_49
756 : /// Below 86/87 20:30 A/nth_99
757 : /// Below 3/3 20:45 B/e
758 : /// Below 50/50 21:05 C/nth_50
759 : /// Below 87/87 20:58 A/nth_100
760 : /// ```
761 : ///
762 : /// Now relieving pressure with 23 layers would cost:
763 : /// - tenant A 14 layers
764 : /// - tenant B 1 layer
765 : /// - tenant C 8 layers
766 17 : async fn collect_eviction_candidates(
767 17 : tenant_manager: &Arc<TenantManager>,
768 17 : eviction_order: EvictionOrder,
769 17 : cancel: &CancellationToken,
770 17 : ) -> anyhow::Result<EvictionCandidates> {
771 : // get a snapshot of the list of tenants
772 17 : let tenants = tenant::mgr::list_tenants()
773 0 : .await
774 17 : .context("get list of tenants")?;
775 :
776 : // TODO: avoid listing every layer in every tenant: this loop can block the executor,
777 : // and the resulting data structure can be huge.
778 : // (https://github.com/neondatabase/neon/issues/6224)
779 17 : let mut candidates = Vec::new();
780 :
781 55 : for (tenant_id, _state, _gen) in tenants {
782 38 : if cancel.is_cancelled() {
783 0 : return Ok(EvictionCandidates::Cancelled);
784 38 : }
785 38 : let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
786 37 : Ok(tenant) => tenant,
787 1 : Err(e) => {
788 : // this can happen if tenant has lifecycle transition after we fetched it
789 0 : debug!("failed to get tenant: {e:#}");
790 1 : continue;
791 : }
792 : };
793 :
794 37 : if tenant.cancel.is_cancelled() {
795 0 : info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
796 0 : continue;
797 37 : }
798 37 :
799 37 : // collect layers from all timelines in this tenant
800 37 : //
801 37 : // If one of the timelines becomes `!is_active()` during the iteration,
802 37 : // for example because we're shutting down, then `max_layer_size` can be too small.
803 37 : // That's OK. This code only runs under a disk pressure situation, and being
804 37 : // a little unfair to tenants during shutdown in such a situation is tolerable.
805 37 : let mut tenant_candidates = Vec::new();
806 37 : let mut max_layer_size = 0;
807 37 : for tl in tenant.list_timelines() {
808 37 : if !tl.is_active() {
809 0 : continue;
810 37 : }
811 37 : let info = tl.get_local_layers_for_disk_usage_eviction().await;
812 0 : debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
813 37 : tenant_candidates.extend(info.resident_layers.into_iter());
814 37 : max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
815 37 :
816 37 : if cancel.is_cancelled() {
817 0 : return Ok(EvictionCandidates::Cancelled);
818 37 : }
819 : }
820 :
821 :         // `min_resident_size` defaults to the maximum layer file size of the tenant.
822 : // This ensures that each tenant can have at least one layer resident at a given time,
823 : // ensuring forward progress for a single Timeline::get in that tenant.
824 : // It's a questionable heuristic since, usually, there are many Timeline::get
825 : // requests going on for a tenant, and, at least in Neon prod, the median
826 : // layer file size is much smaller than the compaction target size.
827 : // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
828 : // That's what's typically used by the various background loops.
829 : //
830 : // The default can be overridden with a fixed value in the tenant conf.
831 : // A default override can be put in the default tenant conf in the pageserver.toml.
832 37 : let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
833 0 : debug!(
834 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
835 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
836 0 : overridden_size=s,
837 0 : "using overridden min resident size for tenant"
838 0 : );
839 4 : s
840 : } else {
841 0 : debug!(
842 0 : tenant_id=%tenant.tenant_shard_id().tenant_id,
843 0 : shard_id=%tenant.tenant_shard_id().shard_slug(),
844 0 : max_layer_size,
845 0 : "using max layer size as min_resident_size for tenant",
846 0 : );
847 33 : max_layer_size
848 : };
849 :
850 : // Sort layers most-recently-used first, then partition by
851 : // cumsum above/below min_resident_size.
852 37 : tenant_candidates
853 4874 : .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
854 37 : let mut cumsum: i128 = 0;
855 37 :
856 37 : let total = tenant_candidates.len();
857 37 :
858 37 : let tenant_candidates =
859 37 : tenant_candidates
860 37 : .into_iter()
861 37 : .enumerate()
862 539 : .map(|(i, mut candidate)| {
863 539 : // as we iterate this reverse sorted list, the most recently accessed layer will always
864 539 : // be 1.0; this is for us to evict it last.
865 539 : candidate.relative_last_activity =
866 539 : eviction_order.relative_last_activity(total, i);
867 :
868 539 : let partition = if cumsum > min_resident_size as i128 {
869 408 : MinResidentSizePartition::Above
870 : } else {
871 131 : MinResidentSizePartition::Below
872 : };
873 539 : cumsum += i128::from(candidate.layer.get_file_size());
874 539 :
875 539 : (partition, candidate)
876 539 : });
877 37 :
878 37 : candidates.extend(tenant_candidates);
879 : }
880 :
881 : // Note: the same tenant ID might be hit twice, if it transitions from attached to
882 : // secondary while we run. That is okay: when we eventually try and run the eviction,
883 : // the `Gate` on the object will ensure that whichever one has already been shut down
884 : // will not delete anything.
885 :
886 17 : let mut secondary_tenants = Vec::new();
887 17 : tenant_manager.foreach_secondary_tenants(
888 17 : |_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
889 2 : secondary_tenants.push(state.clone());
890 17 : },
891 17 : );
892 :
893 19 : for secondary_tenant in secondary_tenants {
894 : // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
895 : // to prevent repeated disk usage based evictions from completely draining less often
896 : // updating secondaries.
897 2 : let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
898 :
899 : debug_assert!(
900 2 : total_layers >= layer_info.resident_layers.len(),
901 0 : "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
902 0 : layer_info.resident_layers.len()
903 : );
904 :
905 2 : layer_info
906 2 : .resident_layers
907 158 : .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
908 2 :
909 2 : let tenant_candidates =
910 2 : layer_info
911 2 : .resident_layers
912 2 : .into_iter()
913 2 : .enumerate()
914 38 : .map(|(i, mut candidate)| {
915 38 : candidate.relative_last_activity =
916 38 : eviction_order.relative_last_activity(total_layers, i);
917 38 : (
918 38 : // Secondary locations' layers are always considered above the min resident size,
919 38 : // i.e. secondary locations are permitted to be trimmed to zero layers if all
920 38 : // the layers have sufficiently old access times.
921 38 : MinResidentSizePartition::Above,
922 38 : candidate,
923 38 : )
924 38 : });
925 2 :
926 2 : candidates.extend(tenant_candidates);
927 2 :
928 2 : tokio::task::yield_now().await;
929 : }
930 :
931 17 : debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
932 0 : "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
933 :
934 17 : eviction_order.sort(&mut candidates);
935 17 :
936 17 : Ok(EvictionCandidates::Finished(candidates))
937 17 : }
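// Worked example (illustrative) of the partitioning above, matching the module-level
// AbsoluteAccessed example: one tenant with five resident layers of 100 bytes each, sorted
// most recently used first, and min_resident_size = 150. The partition is decided from the
// cumulative size of the *more recently used* layers, before adding the current layer:
//
//   layer    cumsum before   partition           cumsum after
//   e (MRU)        0         Below (0   <= 150)       100
//   d            100         Below (100 <= 150)       200
//   a            200         Above (200 >  150)       300
//   b            300         Above                    400
//   c (LRU)      400         Above                    500
//
// i.e. the two most recently used layers end up in the `Below` partition and are only
// considered for eviction after the `Above` partition of every tenant has been exhausted.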
938 :
939 : /// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
940 : /// relieve pressure.
941 : ///
942 : /// Returns the number of candidates selected, together with the planned usage.
943 17 : fn select_victims<U: Usage>(
944 17 : candidates: &[(MinResidentSizePartition, EvictionCandidate)],
945 17 : usage_pre: U,
946 17 : ) -> VictimSelection<U> {
947 17 : let mut usage_when_switched = None;
948 17 : let mut usage_planned = usage_pre;
949 17 : let mut evicted_amount = 0;
950 :
951 297 : for (i, (partition, candidate)) in candidates.iter().enumerate() {
952 297 : if !usage_planned.has_pressure() {
953 14 : break;
954 283 : }
955 283 :
956 283 : if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
957 3 : usage_when_switched = Some((usage_planned, i));
958 280 : }
959 :
960 283 : usage_planned.add_available_bytes(candidate.layer.get_file_size());
961 283 : evicted_amount += 1;
962 : }
963 :
964 17 : VictimSelection {
965 17 : amount: evicted_amount,
966 17 : usage_pre,
967 17 : usage_when_switched,
968 17 : usage_planned,
969 17 : }
970 17 : }
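// Worked example (illustrative) of `select_victims`, reusing the AbsoluteAccessed example from
// the module-level docs: ten candidates of 100 bytes each, six in `Above`, four in `Below`.
//
// - If pressure clears after freeing 300 bytes, the loop stops after the first three `Above`
//   candidates; `usage_when_switched` stays `None` and the plan only reports
//   `respecting_tenant_min_resident_size`.
// - If 900 bytes are needed, the loop crosses into the `Below` partition at candidate index 6;
//   `usage_when_switched` records the planned usage at that point, which
//   `into_amount_and_planned` reports as `respecting_tenant_min_resident_size`, with the final
//   planned usage in `fallback_to_global_lru`, alongside a warning.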
971 :
972 : struct VictimSelection<U> {
973 : amount: usize,
974 : usage_pre: U,
975 : usage_when_switched: Option<(U, usize)>,
976 : usage_planned: U,
977 : }
978 :
979 : impl<U: Usage> VictimSelection<U> {
980 17 : fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
981 17 : debug!(
982 0 : evicted_amount=%self.amount,
983 0 : "took enough candidates for pressure to be relieved"
984 0 : );
985 :
986 17 : if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
987 3 : warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
988 14 : }
989 :
990 17 : let planned = match self.usage_when_switched {
991 3 : Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
992 3 : respecting_tenant_min_resident_size,
993 3 : fallback_to_global_lru: Some(self.usage_planned),
994 3 : },
995 14 : None => PlannedUsage {
996 14 : respecting_tenant_min_resident_size: self.usage_planned,
997 14 : fallback_to_global_lru: None,
998 14 : },
999 : };
1000 :
1001 17 : (self.amount, planned)
1002 17 : }
1003 : }
1004 :
1005 : struct TimelineKey(Arc<Timeline>);
1006 :
1007 : impl PartialEq for TimelineKey {
1008 0 : fn eq(&self, other: &Self) -> bool {
1009 0 : Arc::ptr_eq(&self.0, &other.0)
1010 0 : }
1011 : }
1012 :
1013 : impl Eq for TimelineKey {}
1014 :
1015 : impl std::hash::Hash for TimelineKey {
1016 0 : fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
1017 0 : Arc::as_ptr(&self.0).hash(state);
1018 0 : }
1019 : }
1020 :
1021 : impl std::ops::Deref for TimelineKey {
1022 : type Target = Timeline;
1023 :
1024 0 : fn deref(&self) -> &Self::Target {
1025 0 : self.0.as_ref()
1026 0 : }
1027 : }
1028 :
1029 : /// A totally ordered f32 subset we can use with sorting functions.
1030 : pub(crate) mod finite_f32 {
1031 :
1032 : /// A totally ordered f32 subset we can use with sorting functions.
1033 0 : #[derive(Clone, Copy, PartialEq)]
1034 : pub struct FiniteF32(f32);
1035 :
1036 : impl std::fmt::Debug for FiniteF32 {
1037 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1038 0 : std::fmt::Debug::fmt(&self.0, f)
1039 0 : }
1040 : }
1041 :
1042 : impl std::fmt::Display for FiniteF32 {
1043 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1044 0 : std::fmt::Display::fmt(&self.0, f)
1045 0 : }
1046 : }
1047 :
1048 : impl std::cmp::Eq for FiniteF32 {}
1049 :
1050 : impl std::cmp::PartialOrd for FiniteF32 {
1051 1019 : fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
1052 1019 : Some(self.cmp(other))
1053 1019 : }
1054 : }
1055 :
1056 : impl std::cmp::Ord for FiniteF32 {
1057 1019 : fn cmp(&self, other: &Self) -> std::cmp::Ordering {
1058 1019 : self.0.total_cmp(&other.0)
1059 1019 : }
1060 : }
1061 :
1062 : impl TryFrom<f32> for FiniteF32 {
1063 : type Error = f32;
1064 :
1065 0 : fn try_from(value: f32) -> Result<Self, Self::Error> {
1066 0 : if value.is_finite() {
1067 0 : Ok(FiniteF32(value))
1068 : } else {
1069 0 : Err(value)
1070 : }
1071 0 : }
1072 : }
1073 :
1074 : impl From<FiniteF32> for f32 {
1075 40 : fn from(value: FiniteF32) -> f32 {
1076 40 : value.0
1077 40 : }
1078 : }
1079 :
1080 : impl FiniteF32 {
1081 : pub const ZERO: FiniteF32 = FiniteF32(0.0);
1082 :
1083 286 : pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
1084 286 : if (0.0..=1.0).contains(&value) {
1085 : // -0.0 is within the range, make sure it is assumed 0.0..=1.0
1086 286 : let value = value.abs();
1087 286 : Ok(FiniteF32(value))
1088 : } else {
1089 0 : Err(value)
1090 : }
1091 286 : }
1092 :
1093 40 : pub fn into_inner(self) -> f32 {
1094 40 : self.into()
1095 40 : }
1096 : }
1097 : }
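// Illustrative usage (hypothetical values): `FiniteF32` exists because plain `f32` is not `Ord`
// and therefore cannot be used directly as a key in `sort_unstable_by_key`.
//
//     let a = finite_f32::FiniteF32::try_from_normalized(0.25).unwrap();
//     let b = finite_f32::FiniteF32::try_from_normalized(0.75).unwrap();
//     assert!(a < b);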
1098 :
1099 : mod filesystem_level_usage {
1100 : use anyhow::Context;
1101 : use camino::Utf8Path;
1102 :
1103 : use crate::statvfs::Statvfs;
1104 :
1105 : use super::DiskUsageEvictionTaskConfig;
1106 :
1107 20 : #[derive(Debug, Clone, Copy)]
1108 : #[allow(dead_code)]
1109 : pub struct Usage<'a> {
1110 : config: &'a DiskUsageEvictionTaskConfig,
1111 :
1112 : /// Filesystem capacity
1113 : total_bytes: u64,
1114 : /// Free filesystem space
1115 : avail_bytes: u64,
1116 : }
1117 :
1118 : impl super::Usage for Usage<'_> {
1119 81 : fn has_pressure(&self) -> bool {
1120 81 : let usage_pct =
1121 81 : (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
1122 81 :
1123 81 : let pressures = [
1124 81 : (
1125 81 : "min_avail_bytes",
1126 81 : self.avail_bytes < self.config.min_avail_bytes,
1127 81 : ),
1128 81 : (
1129 81 : "max_usage_pct",
1130 81 : usage_pct >= self.config.max_usage_pct.get() as u64,
1131 81 : ),
1132 81 : ];
1133 81 :
1134 134 : pressures.into_iter().any(|(_, has_pressure)| has_pressure)
1135 81 : }
1136 :
1137 118 : fn add_available_bytes(&mut self, bytes: u64) {
1138 118 : self.avail_bytes += bytes;
1139 118 : }
1140 : }
1141 :
1142 11 : pub fn get<'a>(
1143 11 : tenants_dir: &Utf8Path,
1144 11 : config: &'a DiskUsageEvictionTaskConfig,
1145 11 : ) -> anyhow::Result<Usage<'a>> {
1146 11 : let mock_config = {
1147 11 : #[cfg(feature = "testing")]
1148 11 : {
1149 11 : config.mock_statvfs.as_ref()
1150 : }
1151 : #[cfg(not(feature = "testing"))]
1152 : {
1153 : None
1154 : }
1155 : };
1156 :
1157 11 : let stat = Statvfs::get(tenants_dir, mock_config)
1158 11 : .context("statvfs failed, presumably directory got unlinked")?;
1159 :
1160 : // https://unix.stackexchange.com/a/703650
1161 10 : let blocksize = if stat.fragment_size() > 0 {
1162 10 : stat.fragment_size()
1163 : } else {
1164 0 : stat.block_size()
1165 : };
1166 :
1167 :         // use blocks_available (b_avail) since the pageserver runs as an unprivileged user
1168 10 : let avail_bytes = stat.blocks_available() * blocksize;
1169 10 : let total_bytes = stat.blocks() * blocksize;
1170 10 :
1171 10 : Ok(Usage {
1172 10 : config,
1173 10 : total_bytes,
1174 10 : avail_bytes,
1175 10 : })
1176 11 : }
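    // Worked example with hypothetical statvfs values: fragment_size = 4096,
    // blocks = 262_144 and blocks_available = 39_321 give
    //
    //     total_bytes = 262_144 * 4096 = 1_073_741_824   (1 GiB)
    //     avail_bytes =  39_321 * 4096 =   161_058_816   (~153.6 MiB)
    //     usage_pct   = 100 * (1 - 161_058_816 / 1_073_741_824) ≈ 85
    //
    // which, with `max_usage_pct = 85` as in the `max_usage_pct_pressure` test below,
    // reports pressure.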
1177 :
1178 2 : #[test]
1179 2 : fn max_usage_pct_pressure() {
1180 2 : use super::EvictionOrder;
1181 2 : use super::Usage as _;
1182 2 : use std::time::Duration;
1183 2 : use utils::serde_percent::Percent;
1184 2 :
1185 2 : let mut usage = Usage {
1186 2 : config: &DiskUsageEvictionTaskConfig {
1187 2 : max_usage_pct: Percent::new(85).unwrap(),
1188 2 : min_avail_bytes: 0,
1189 2 : period: Duration::MAX,
1190 2 : #[cfg(feature = "testing")]
1191 2 : mock_statvfs: None,
1192 2 : eviction_order: EvictionOrder::default(),
1193 2 : },
1194 2 : total_bytes: 100_000,
1195 2 : avail_bytes: 0,
1196 2 : };
1197 2 :
1198 2 : assert!(usage.has_pressure(), "expected pressure at 100%");
1199 :
1200 2 : usage.add_available_bytes(14_000);
1201 2 : assert!(usage.has_pressure(), "expected pressure at 86%");
1202 :
1203 2 : usage.add_available_bytes(999);
1204 2 : assert!(usage.has_pressure(), "expected pressure at 85.001%");
1205 :
1206 2 : usage.add_available_bytes(1);
1207 2 : assert!(usage.has_pressure(), "expected pressure at precisely 85%");
1208 :
1209 2 : usage.add_available_bytes(1);
1210 2 : assert!(!usage.has_pressure(), "no pressure at 84.999%");
1211 :
1212 2 : usage.add_available_bytes(999);
1213 2 : assert!(!usage.has_pressure(), "no pressure at 84%");
1214 :
1215 2 : usage.add_available_bytes(16_000);
1216 2 : assert!(!usage.has_pressure());
1217 2 : }
1218 : }
1219 :
1220 : #[cfg(test)]
1221 : mod tests {
1222 : use super::*;
1223 :
1224 2 : #[test]
1225 2 : fn relative_equal_bounds() {
1226 2 : let order = EvictionOrder::RelativeAccessed {
1227 2 : highest_layer_count_loses_first: false,
1228 2 : };
1229 2 :
1230 2 : let len = 10;
1231 2 : let v = (0..len)
1232 20 : .map(|i| order.relative_last_activity(len, i).into_inner())
1233 2 : .collect::<Vec<_>>();
1234 2 :
1235 2 : assert_eq!(v.first(), Some(&1.0));
1236 2 : assert_eq!(v.last(), Some(&0.0));
1237 18 : assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
1238 2 : }
1239 :
1240 2 : #[test]
1241 2 : fn relative_spare_bounds() {
1242 2 : let order = EvictionOrder::RelativeAccessed {
1243 2 : highest_layer_count_loses_first: true,
1244 2 : };
1245 2 :
1246 2 : let len = 10;
1247 2 : let v = (0..len)
1248 20 : .map(|i| order.relative_last_activity(len, i).into_inner())
1249 2 : .collect::<Vec<_>>();
1250 2 :
1251 2 : assert_eq!(v.first(), Some(&1.0));
1252 2 : assert_eq!(v.last(), Some(&0.1));
1253 18 : assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
1254 2 : }
1255 : }