Line data Source code
1 : //! This module contains functions to serve per-tenant background processes,
2 : //! such as compaction and GC
3 :
4 : use std::ops::ControlFlow;
5 : use std::str::FromStr;
6 : use std::sync::Arc;
7 : use std::time::{Duration, Instant};
8 :
9 : use crate::context::{DownloadBehavior, RequestContext};
10 : use crate::metrics::TENANT_TASK_EVENTS;
11 : use crate::task_mgr;
12 : use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
13 : use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
14 : use crate::tenant::throttle::Stats;
15 : use crate::tenant::timeline::CompactionError;
16 : use crate::tenant::{Tenant, TenantState};
17 : use rand::Rng;
18 : use tokio_util::sync::CancellationToken;
19 : use tracing::*;
20 : use utils::{backoff, completion, pausable_failpoint};
21 :
22 : static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
23 60 : once_cell::sync::Lazy::new(|| {
24 60 : let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
25 60 : let permits = usize::max(
26 60 : 1,
27 60 : // while a lot of the work is done on spawn_blocking, we still do
28 60 : // repartitioning in the async context. this should leave some workers
29 60 : // unblocked for other work, hopefully easing any externally visible
30 60 : // effects of restarts.
31 60 : //
32 60 : // 6/8 is a guess; previously we ran with unlimited 8 and more from
33 60 : // spawn_blocking.
34 60 : (total_threads * 3).checked_div(4).unwrap_or(0),
35 60 : );
36 60 : assert_ne!(permits, 0, "we will not be adding in permits later");
37 60 : assert!(
38 60 : permits < total_threads,
39 0 : "need threads avail for shorter work"
40 : );
41 60 : tokio::sync::Semaphore::new(permits)
42 60 : });
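// Illustrative arithmetic (a sketch, not part of the original source): with, say, 8 tokio
// worker threads, the expression above yields 6 permits, leaving 2 workers free for the
// shorter work mentioned in the assertion message:
//
//     let total_threads = 8usize;
//     let permits = usize::max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
//     assert_eq!(permits, 6);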
43 :
44 1080 : #[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)]
45 : #[strum(serialize_all = "snake_case")]
46 : pub(crate) enum BackgroundLoopKind {
47 : Compaction,
48 : Gc,
49 : Eviction,
50 : IngestHouseKeeping,
51 : ConsumptionMetricsCollectMetrics,
52 : ConsumptionMetricsSyntheticSizeWorker,
53 : InitialLogicalSizeCalculation,
54 : HeatmapUpload,
55 : SecondaryDownload,
56 : }
57 :
58 : impl BackgroundLoopKind {
59 0 : fn as_static_str(&self) -> &'static str {
60 0 : self.into()
61 0 : }
62 : }
63 :
64 : /// Cancellation safe.
65 1092 : pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
66 1092 : loop_kind: BackgroundLoopKind,
67 1092 : _ctx: &RequestContext,
68 1092 : ) -> tokio::sync::SemaphorePermit<'static> {
69 1092 : let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
70 :
71 : pausable_failpoint!(
72 : "initial-size-calculation-permit-pause",
73 : loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
74 : );
75 :
76 : // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
77 1092 : match CONCURRENT_BACKGROUND_TASKS.acquire().await {
78 1092 : Ok(permit) => permit,
79 0 : Err(_closed) => unreachable!("we never close the semaphore"),
80 : }
81 1092 : }
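// Illustrative usage sketch (hypothetical caller, not part of the original source): a
// background iteration acquires the permit before its heavy work; the permit is released
// when dropped:
//
//     let permit = concurrent_background_tasks_rate_limit_permit(
//         BackgroundLoopKind::Compaction,
//         &ctx,
//     )
//     .await;
//     // ... rate-limited work runs while `permit` is held ...
//     drop(permit);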
82 :
83 : /// Start per tenant background loops: compaction and gc.
84 0 : pub fn start_background_loops(
85 0 : tenant: &Arc<Tenant>,
86 0 : background_jobs_can_start: Option<&completion::Barrier>,
87 0 : ) {
88 0 : let tenant_shard_id = tenant.tenant_shard_id;
89 0 : task_mgr::spawn(
90 0 : BACKGROUND_RUNTIME.handle(),
91 0 : TaskKind::Compaction,
92 0 : tenant_shard_id,
93 0 : None,
94 0 : &format!("compactor for tenant {tenant_shard_id}"),
95 0 : {
96 0 : let tenant = Arc::clone(tenant);
97 0 : let background_jobs_can_start = background_jobs_can_start.cloned();
98 0 : async move {
99 0 : let cancel = task_mgr::shutdown_token();
100 : tokio::select! {
101 : _ = cancel.cancelled() => { return Ok(()) },
102 : _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
103 : };
104 0 : compaction_loop(tenant, cancel)
105 0 : // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
106 0 : .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
107 0 : .await;
108 0 : Ok(())
109 0 : }
110 0 : },
111 0 : );
112 0 : task_mgr::spawn(
113 0 : BACKGROUND_RUNTIME.handle(),
114 0 : TaskKind::GarbageCollector,
115 0 : tenant_shard_id,
116 0 : None,
117 0 : &format!("garbage collector for tenant {tenant_shard_id}"),
118 0 : {
119 0 : let tenant = Arc::clone(tenant);
120 0 : let background_jobs_can_start = background_jobs_can_start.cloned();
121 0 : async move {
122 0 : let cancel = task_mgr::shutdown_token();
123 : tokio::select! {
124 : _ = cancel.cancelled() => { return Ok(()) },
125 : _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
126 : };
127 0 : gc_loop(tenant, cancel)
128 0 : .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
129 0 : .await;
130 0 : Ok(())
131 0 : }
132 0 : },
133 0 : );
134 0 :
135 0 : task_mgr::spawn(
136 0 : BACKGROUND_RUNTIME.handle(),
137 0 : TaskKind::IngestHousekeeping,
138 0 : tenant_shard_id,
139 0 : None,
140 0 : &format!("ingest housekeeping for tenant {tenant_shard_id}"),
141 0 : {
142 0 : let tenant = Arc::clone(tenant);
143 0 : let background_jobs_can_start = background_jobs_can_start.cloned();
144 0 : async move {
145 0 : let cancel = task_mgr::shutdown_token();
146 : tokio::select! {
147 : _ = cancel.cancelled() => { return Ok(()) },
148 : _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
149 : };
150 0 : ingest_housekeeping_loop(tenant, cancel)
151 0 : .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
152 0 : .await;
153 0 : Ok(())
154 0 : }
155 0 : },
156 0 : );
157 0 : }
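// Illustrative call-site sketch (hypothetical names, not part of the original source):
// the loops are spawned during tenant activation and gated on a barrier that is released
// once background jobs are allowed to start:
//
//     start_background_loops(&tenant, Some(&background_jobs_barrier));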
158 :
159 : ///
160 : /// Compaction task's main loop
161 : ///
162 0 : async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
163 0 : const MAX_BACKOFF_SECS: f64 = 300.0;
164 0 : // How many errors we have seen consecutively
165 0 : let mut error_run_count = 0;
166 0 :
167 0 : let mut last_throttle_flag_reset_at = Instant::now();
168 0 :
169 0 : TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
170 0 : async {
171 0 : let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
172 0 : let mut first = true;
173 0 : loop {
174 0 : tokio::select! {
175 : _ = cancel.cancelled() => {
176 : return;
177 : },
178 : tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
179 : ControlFlow::Break(()) => return,
180 : ControlFlow::Continue(()) => (),
181 : },
182 : }
183 :
184 0 : let period = tenant.get_compaction_period();
185 0 :
186 0 : // TODO: we shouldn't need to await to find the tenant, and this could be moved outside of
187 0 : // the loop, #3501. There are also additional "allowed_errors" in tests.
188 0 : if first {
189 0 : first = false;
190 0 : if random_init_delay(period, &cancel).await.is_err() {
191 0 : break;
192 0 : }
193 0 : }
194 :
195 :
196 :
197 : let sleep_duration;
198 0 : if period == Duration::ZERO {
199 : #[cfg(not(feature = "testing"))]
200 : info!("automatic compaction is disabled");
201 : // check again in 10 seconds, in case it's been enabled again.
202 0 : sleep_duration = Duration::from_secs(10)
203 : } else {
204 0 : let iteration = Iteration {
205 0 : started_at: Instant::now(),
206 0 : period,
207 0 : kind: BackgroundLoopKind::Compaction,
208 0 : };
209 :
210 : // Run compaction
211 0 : let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
212 0 : match output {
213 0 : Ok(has_pending_task) => {
214 0 : error_run_count = 0;
215 0 : // schedule the next compaction immediately in case there is a pending compaction task
216 0 : sleep_duration = if has_pending_task { Duration::ZERO } else { period };
217 : }
218 0 : Err(e) => {
219 0 : let wait_duration = backoff::exponential_backoff_duration_seconds(
220 0 : error_run_count + 1,
221 0 : 1.0,
222 0 : MAX_BACKOFF_SECS,
223 0 : );
224 0 : error_run_count += 1;
225 0 : let wait_duration = Duration::from_secs_f64(wait_duration);
226 0 : log_compaction_error(
227 0 : &e,
228 0 : error_run_count,
229 0 : &wait_duration,
230 0 : cancel.is_cancelled(),
231 0 : );
232 0 : sleep_duration = wait_duration;
233 0 : }
234 : }
235 :
236 : // the duration is recorded by performance tests by enabling debug in this function
237 0 : tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
238 : };
239 :
240 :
241 : // Perhaps we did no work and the walredo process has been idle for some time:
242 : // give it a chance to shut down, to avoid leaving a walredo process running indefinitely.
243 0 : if let Some(walredo_mgr) = &tenant.walredo_mgr {
244 0 : walredo_mgr.maybe_quiesce(period * 10);
245 0 : }
246 :
247 : // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
248 : // so we get some upper-bound guarantee on when the walredo quiesce / this throttling reporting happens.
249 0 : info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
250 0 : let now = Instant::now();
251 0 : let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
252 0 : let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
253 0 : if count_throttled == 0 {
254 0 : return;
255 0 : }
256 0 : let allowed_rps = tenant.timeline_get_throttle.steady_rps();
257 0 : let delta = now - prev;
258 0 : info!(
259 0 : n_seconds=%format_args!("{:.3}",
260 0 : delta.as_secs_f64()),
261 : count_accounted,
262 : count_throttled,
263 : sum_throttled_usecs,
264 0 : allowed_rps=%format_args!("{allowed_rps:.0}"),
265 0 : "shard was throttled in the last n_seconds"
266 : );
267 0 : });
268 0 :
269 0 : // Sleep
270 0 : if tokio::time::timeout(sleep_duration, cancel.cancelled())
271 0 : .await
272 0 : .is_ok()
273 : {
274 0 : break;
275 0 : }
276 : }
277 0 : }
278 0 : .await;
279 0 : TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
280 0 : }
281 :
282 0 : fn log_compaction_error(
283 0 : e: &CompactionError,
284 0 : error_run_count: u32,
285 0 : sleep_duration: &std::time::Duration,
286 0 : task_cancelled: bool,
287 0 : ) {
288 : use crate::tenant::upload_queue::NotInitialized;
289 : use crate::tenant::PageReconstructError;
290 : use CompactionError::*;
291 :
292 : enum LooksLike {
293 : Info,
294 : Error,
295 : }
296 :
297 0 : let decision = match e {
298 0 : ShuttingDown => None,
299 0 : _ if task_cancelled => Some(LooksLike::Info),
300 0 : Other(e) => {
301 0 : let root_cause = e.root_cause();
302 :
303 0 : let is_stopping = {
304 0 : let upload_queue = root_cause
305 0 : .downcast_ref::<NotInitialized>()
306 0 : .is_some_and(|e| e.is_stopping());
307 0 :
308 0 : let timeline = root_cause
309 0 : .downcast_ref::<PageReconstructError>()
310 0 : .is_some_and(|e| e.is_stopping());
311 0 :
312 0 : upload_queue || timeline
313 : };
314 :
315 0 : if is_stopping {
316 0 : Some(LooksLike::Info)
317 : } else {
318 0 : Some(LooksLike::Error)
319 : }
320 : }
321 : };
322 :
323 0 : match decision {
324 0 : Some(LooksLike::Info) => info!(
325 0 : "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
326 : ),
327 0 : Some(LooksLike::Error) => error!(
328 0 : "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
329 : ),
330 0 : None => {}
331 : }
332 0 : }
333 :
334 : ///
335 : /// GC task's main loop
336 : ///
337 0 : async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
338 0 : const MAX_BACKOFF_SECS: f64 = 300.0;
339 0 : // How many errors we have seen consecutively
340 0 : let mut error_run_count = 0;
341 0 :
342 0 : TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
343 0 : async {
344 0 : // GC might require downloading, to find the cutoff LSN that corresponds to the
345 0 : // cutoff specified as time.
346 0 : let ctx =
347 0 : RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
348 0 :
349 0 : let mut first = true;
350 0 : loop {
351 0 : tokio::select! {
352 : _ = cancel.cancelled() => {
353 : return;
354 : },
355 : tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
356 : ControlFlow::Break(()) => return,
357 : ControlFlow::Continue(()) => (),
358 : },
359 : }
360 :
361 0 : let period = tenant.get_gc_period();
362 0 :
363 0 : if first {
364 0 : first = false;
365 0 :
366 0 : let delays = async {
367 0 : delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
368 0 : random_init_delay(period, &cancel).await?;
369 0 : Ok::<_, Cancelled>(())
370 0 : };
371 :
372 0 : if delays.await.is_err() {
373 0 : break;
374 0 : }
375 0 : }
376 :
377 0 : let gc_horizon = tenant.get_gc_horizon();
378 0 : let sleep_duration;
379 0 : if period == Duration::ZERO || gc_horizon == 0 {
380 0 : #[cfg(not(feature = "testing"))]
381 0 : info!("automatic GC is disabled");
382 0 : // check again in 10 seconds, in case it's been enabled again.
383 0 : sleep_duration = Duration::from_secs(10);
384 0 : } else {
385 0 : let iteration = Iteration {
386 0 : started_at: Instant::now(),
387 0 : period,
388 0 : kind: BackgroundLoopKind::Gc,
389 0 : };
390 : // Run gc
391 0 : let IterationResult { output, elapsed: _ } =
392 0 : iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx))
393 0 : .await;
394 0 : match output {
395 0 : Ok(_) => {
396 0 : error_run_count = 0;
397 0 : sleep_duration = period;
398 0 : }
399 : Err(crate::tenant::GcError::TenantCancelled) => {
400 0 : return;
401 : }
402 0 : Err(e) => {
403 0 : let wait_duration = backoff::exponential_backoff_duration_seconds(
404 0 : error_run_count + 1,
405 0 : 1.0,
406 0 : MAX_BACKOFF_SECS,
407 0 : );
408 0 : error_run_count += 1;
409 0 : let wait_duration = Duration::from_secs_f64(wait_duration);
410 :
411 0 : if matches!(e, crate::tenant::GcError::TimelineCancelled) {
412 : // Timeline was cancelled during gc. We might either be in an event
413 : // that affects the entire tenant (tenant deletion, pageserver shutdown),
414 : // or in one that affects the timeline only (timeline deletion).
415 : // Therefore, don't exit the loop.
416 0 : info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
417 : } else {
418 0 : error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
419 : }
420 :
421 0 : sleep_duration = wait_duration;
422 : }
423 : }
424 : };
425 :
426 0 : if tokio::time::timeout(sleep_duration, cancel.cancelled())
427 0 : .await
428 0 : .is_ok()
429 : {
430 0 : break;
431 0 : }
432 : }
433 0 : }
434 0 : .await;
435 0 : TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
436 0 : }
437 :
438 0 : async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
439 0 : TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
440 0 : async {
441 0 : loop {
442 0 : tokio::select! {
443 : _ = cancel.cancelled() => {
444 : return;
445 : },
446 : tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
447 : ControlFlow::Break(()) => return,
448 : ControlFlow::Continue(()) => (),
449 : },
450 : }
451 :
452 : // We run ingest housekeeping with the same frequency as compaction: it is not worth
453 : // having a distinct setting. But we don't run it in the same task, because compaction
454 : // blocks on acquiring the background job semaphore.
455 0 : let period = tenant.get_compaction_period();
456 :
457 : // If compaction period is set to zero (to disable it), then we will use a reasonable default
458 0 : let period = if period == Duration::ZERO {
459 0 : humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD)
460 0 : .unwrap()
461 0 : .into()
462 : } else {
463 0 : period
464 : };
465 :
466 : // Jitter the period by +/- 5%
467 0 : let period =
468 0 : rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100);
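// Illustrative arithmetic (not part of the original source): for a 20 s period,
// (period * 95) / 100 = 19 s and (period * 105) / 100 = 21 s, so the jittered period is
// drawn uniformly from the half-open range [19 s, 21 s).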
469 0 :
470 0 : // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of
471 0 : // a tenant, since it won't have started writing any ephemeral files yet.
472 0 : if tokio::time::timeout(period, cancel.cancelled())
473 0 : .await
474 0 : .is_ok()
475 : {
476 0 : break;
477 0 : }
478 0 :
479 0 : let iteration = Iteration {
480 0 : started_at: Instant::now(),
481 0 : period,
482 0 : kind: BackgroundLoopKind::IngestHouseKeeping,
483 0 : };
484 0 : iteration.run(tenant.ingest_housekeeping()).await;
485 : }
486 0 : }
487 0 : .await;
488 0 : TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
489 0 : }
490 :
491 0 : async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
492 0 : // if the tenant has a proper status already, no need to wait for anything
493 0 : if tenant.current_state() == TenantState::Active {
494 0 : ControlFlow::Continue(())
495 : } else {
496 0 : let mut tenant_state_updates = tenant.subscribe_for_state_updates();
497 : loop {
498 0 : match tenant_state_updates.changed().await {
499 : Ok(()) => {
500 0 : let new_state = &*tenant_state_updates.borrow();
501 0 : match new_state {
502 : TenantState::Active => {
503 0 : debug!("Tenant state changed to active, continuing the task loop");
504 0 : return ControlFlow::Continue(());
505 : }
506 0 : state => {
507 0 : debug!("Not running the task loop, tenant is not active: {state:?}");
508 0 : continue;
509 : }
510 : }
511 : }
512 0 : Err(_sender_dropped_error) => {
513 0 : return ControlFlow::Break(());
514 : }
515 : }
516 : }
517 : }
518 0 : }
519 :
520 0 : #[derive(thiserror::Error, Debug)]
521 : #[error("cancelled")]
522 : pub(crate) struct Cancelled;
523 :
524 : /// Provide a random delay for background task initialization.
525 : ///
526 : /// This delay prevents a thundering herd of background tasks and keeps their iterations
527 : /// out of phase with one another, for more stable load.
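///
/// An illustrative sketch (hypothetical values, not part of the original docs): with a
/// 20 second period, the first iteration is pushed back by a uniformly random delay in
/// `0..=20s`, or returns `Err(Cancelled)` if the cancellation token fires first:
///
/// ```ignore
/// random_init_delay(Duration::from_secs(20), &cancel).await?;
/// ```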
528 0 : pub(crate) async fn random_init_delay(
529 0 : period: Duration,
530 0 : cancel: &CancellationToken,
531 0 : ) -> Result<(), Cancelled> {
532 0 : if period == Duration::ZERO {
533 0 : return Ok(());
534 0 : }
535 0 :
536 0 : let d = {
537 0 : let mut rng = rand::thread_rng();
538 0 : rng.gen_range(Duration::ZERO..=period)
539 0 : };
540 0 :
541 0 : match tokio::time::timeout(d, cancel.cancelled()).await {
542 0 : Ok(_) => Err(Cancelled),
543 0 : Err(_) => Ok(()),
544 : }
545 0 : }
546 :
547 : /// Delays GC by the default lease length at restart.
548 : ///
549 : /// We do this because the lease mappings are not persisted to disk. By delaying GC by the
550 : /// default lease length, we guarantee that all the leases we granted before the restart
551 : /// will have expired by the time we run GC for the first time after the restart.
552 0 : pub(crate) async fn delay_by_lease_length(
553 0 : length: Duration,
554 0 : cancel: &CancellationToken,
555 0 : ) -> Result<(), Cancelled> {
556 0 : match tokio::time::timeout(length, cancel.cancelled()).await {
557 0 : Ok(_) => Err(Cancelled),
558 0 : Err(_) => Ok(()),
559 : }
560 0 : }
561 :
562 : struct Iteration {
563 : started_at: Instant,
564 : period: Duration,
565 : kind: BackgroundLoopKind,
566 : }
567 :
568 : struct IterationResult<O> {
569 : output: O,
570 : elapsed: Duration,
571 : }
572 :
573 : impl Iteration {
574 0 : #[instrument(skip_all)]
575 : pub(crate) async fn run<Fut, O>(self, fut: Fut) -> IterationResult<O>
576 : where
577 : Fut: std::future::Future<Output = O>,
578 : {
579 : let Self {
580 : started_at,
581 : period,
582 : kind,
583 : } = self;
584 :
585 : let mut fut = std::pin::pin!(fut);
586 :
587 : // Wrap `fut` into a future that logs a message every `period` so that we get a
588 : // very obvious breadcrumb in the logs _while_ a slow iteration is happening.
589 0 : let liveness_logger = async move {
590 : loop {
591 0 : match tokio::time::timeout(period, &mut fut).await {
592 0 : Ok(x) => return x,
593 : Err(_) => {
594 : // info level, for the same reason that warn_when_period_overrun logs at info
595 : // => https://github.com/neondatabase/neon/pull/5724
596 0 : info!("still running");
597 : }
598 : }
599 : }
600 0 : };
601 :
602 : let output = liveness_logger.await;
603 :
604 : let elapsed = started_at.elapsed();
605 : warn_when_period_overrun(elapsed, period, kind);
606 :
607 : IterationResult { output, elapsed }
608 : }
609 : }
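// Illustrative usage sketch (hypothetical `some_future`, not part of the original source):
//
//     let iteration = Iteration {
//         started_at: Instant::now(),
//         period: Duration::from_secs(20),
//         kind: BackgroundLoopKind::Gc,
//     };
//     // While `some_future` is in flight, "still running" is logged every `period`; once
//     // it completes, warn_when_period_overrun() reports iterations that exceeded it.
//     let IterationResult { output, elapsed } = iteration.run(some_future).await;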
610 : /// Attention: the `task` and `period` become labels of a pageserver-wide Prometheus metric.
611 0 : pub(crate) fn warn_when_period_overrun(
612 0 : elapsed: Duration,
613 0 : period: Duration,
614 0 : task: BackgroundLoopKind,
615 0 : ) {
616 0 : // Duration::ZERO will happen because it's the "disable [bgtask]" value.
617 0 : if elapsed >= period && period != Duration::ZERO {
618 : // humantime does not clamp significant digits, whereas Duration's Debug output is a bit
619 : // more intelligent. however, it makes sense to keep the "configuration format" for period,
620 : // even though there's no way to output the actual config value.
621 0 : info!(
622 : ?elapsed,
623 0 : period = %humantime::format_duration(period),
624 0 : ?task,
625 0 : "task iteration took longer than the configured period"
626 : );
627 0 : crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
628 0 : .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
629 0 : .inc();
630 0 : }
631 0 : }
|