Line data Source code
1 : use std::collections::HashMap;
2 : use std::num::NonZeroUsize;
3 : use std::os::fd::RawFd;
4 : use std::pin::Pin;
5 : use std::sync::atomic::AtomicU64;
6 : use std::sync::{Arc, Mutex};
7 : use std::task::{Context, Poll};
8 : use std::time::{Duration, Instant};
9 :
10 : use enum_map::{Enum as _, EnumMap};
11 : use futures::Future;
12 : use metrics::{
13 : register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
14 : register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
15 : register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
16 : Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
17 : IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
18 : };
19 : use once_cell::sync::Lazy;
20 : use pageserver_api::config::{
21 : PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
22 : PageServiceProtocolPipelinedExecutionStrategy,
23 : };
24 : use pageserver_api::models::InMemoryLayerInfo;
25 : use pageserver_api::shard::TenantShardId;
26 : use pin_project_lite::pin_project;
27 : use postgres_backend::{is_expected_io_error, QueryError};
28 : use pq_proto::framed::ConnectionError;
29 :
30 : use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
31 : use strum_macros::{IntoStaticStr, VariantNames};
32 : use utils::id::TimelineId;
33 :
34 : use crate::config::PageServerConf;
35 : use crate::context::{PageContentKind, RequestContext};
36 : use crate::pgdatadir_mapping::DatadirModificationStats;
37 : use crate::task_mgr::TaskKind;
38 : use crate::tenant::layer_map::LayerMap;
39 : use crate::tenant::mgr::TenantSlot;
40 : use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
41 : use crate::tenant::tasks::BackgroundLoopKind;
42 : use crate::tenant::throttle::ThrottleResult;
43 : use crate::tenant::Timeline;
44 :
45 : /// Prometheus histogram buckets (in seconds) for operations in the critical
46 : /// path. In other words, operations that directly affect the latency of user
47 : /// queries.
48 : ///
49 : /// The buckets capture the majority of latencies in the microsecond and
50 : /// millisecond range but also extend far enough up to distinguish "bad" from
51 : /// "really bad".
52 : const CRITICAL_OP_BUCKETS: &[f64] = &[
53 : 0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
54 : 0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
55 : 1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
56 : ];
57 :
58 : // Metrics collected on operations on the storage repository.
59 : #[derive(Debug, VariantNames, IntoStaticStr)]
60 : #[strum(serialize_all = "kebab_case")]
61 : pub(crate) enum StorageTimeOperation {
62 : #[strum(serialize = "layer flush")]
63 : LayerFlush,
64 :
65 : #[strum(serialize = "layer flush delay")]
66 : LayerFlushDelay,
67 :
68 : #[strum(serialize = "compact")]
69 : Compact,
70 :
71 : #[strum(serialize = "create images")]
72 : CreateImages,
73 :
74 : #[strum(serialize = "logical size")]
75 : LogicalSize,
76 :
77 : #[strum(serialize = "imitate logical size")]
78 : ImitateLogicalSize,
79 :
80 : #[strum(serialize = "load layer map")]
81 : LoadLayerMap,
82 :
83 : #[strum(serialize = "gc")]
84 : Gc,
85 :
86 : #[strum(serialize = "find gc cutoffs")]
87 : FindGcCutoffs,
88 : }
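// The `IntoStaticStr` derive, together with the explicit `serialize` attributes above, is what
// turns a variant into its metric label value. A minimal sketch (not a specific call site):
//
//     let op: &'static str = StorageTimeOperation::LayerFlush.into(); // yields "layer flush"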
89 :
90 404 : pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
91 404 : register_counter_vec!(
92 404 : "pageserver_storage_operations_seconds_sum",
93 404 : "Total time spent on storage operations with operation, tenant and timeline dimensions",
94 404 : &["operation", "tenant_id", "shard_id", "timeline_id"],
95 404 : )
96 404 : .expect("failed to define a metric")
97 404 : });
98 :
99 404 : pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
100 404 : register_int_counter_vec!(
101 404 : "pageserver_storage_operations_seconds_count",
102 404 : "Count of storage operations with operation, tenant and timeline dimensions",
103 404 : &["operation", "tenant_id", "shard_id", "timeline_id"],
104 404 : )
105 404 : .expect("failed to define a metric")
106 404 : });
107 :
108 : // Buckets for background operation duration in seconds, like compaction, GC, size calculation.
109 : const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
110 :
111 404 : pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
112 404 : register_histogram_vec!(
113 404 : "pageserver_storage_operations_seconds_global",
114 404 : "Time spent on storage operations",
115 404 : &["operation"],
116 404 : STORAGE_OP_BUCKETS.into(),
117 404 : )
118 404 : .expect("failed to define a metric")
119 404 : });
120 :
121 : /// Measures layers visited per read (i.e. read amplification).
122 : ///
123 : /// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits
124 : /// is amortized across the batch, and some layers may not intersect with a given key, each visited
125 : /// layer contributes directly to the observed latency for every read in the batch, which is what we
126 : /// care about.
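///
/// For example, a batch of 10 reads that together visits 5 layers records an observation of 5
/// for each of the 10 reads, rather than amortizing the visits to 0.5 layers per read.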
127 404 : pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
128 404 : register_histogram_vec!(
129 404 : "pageserver_layers_per_read",
130 404 : "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
131 404 : &["tenant_id", "shard_id", "timeline_id"],
132 404 : // Low resolution to reduce cardinality.
133 404 : vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
134 404 : )
135 404 : .expect("failed to define a metric")
136 404 : });
137 :
138 396 : pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
139 396 : register_histogram!(
140 396 : "pageserver_layers_per_read_global",
141 396 : "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
142 396 : vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
143 396 : )
144 396 : .expect("failed to define a metric")
145 396 : });
146 :
147 396 : pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
148 396 : // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
149 396 : register_histogram!(
150 396 : "pageserver_deltas_per_read_global",
151 396 : "Number of delta pages applied to image page per read",
152 396 : vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
153 396 : )
154 396 : .expect("failed to define a metric")
155 396 : });
156 :
157 0 : pub(crate) static CONCURRENT_INITDBS: Lazy<UIntGauge> = Lazy::new(|| {
158 0 : register_uint_gauge!(
159 0 : "pageserver_concurrent_initdb",
160 0 : "Number of initdb processes running"
161 0 : )
162 0 : .expect("failed to define a metric")
163 0 : });
164 :
165 0 : pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy<Histogram> = Lazy::new(|| {
166 0 : register_histogram!(
167 0 : "pageserver_initdb_semaphore_seconds_global",
168 0 : "Time spent getting a permit from the global initdb semaphore",
169 0 : STORAGE_OP_BUCKETS.into()
170 0 : )
171 0 : .expect("failed to define metric")
172 0 : });
173 :
174 0 : pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
175 0 : register_histogram!(
176 0 : "pageserver_initdb_seconds_global",
177 0 : "Time spent performing initdb",
178 0 : STORAGE_OP_BUCKETS.into()
179 0 : )
180 0 : .expect("failed to define metric")
181 0 : });
182 :
183 : pub(crate) struct GetVectoredLatency {
184 : map: EnumMap<TaskKind, Option<Histogram>>,
185 : }
186 :
187 : #[allow(dead_code)]
188 : pub(crate) struct ScanLatency {
189 : map: EnumMap<TaskKind, Option<Histogram>>,
190 : }
191 :
192 : impl GetVectoredLatency {
193 : // Only these task types perform vectored gets. Filter all other tasks out to reduce total
194 : // cardinality of the metric.
195 : const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
196 :
197 39380 : pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
198 39380 : self.map[task_kind].as_ref()
199 39380 : }
200 : }
201 :
202 : impl ScanLatency {
203 : // Only these task types perform scans. Filter all other tasks out to reduce total
204 : // cardinality of the metric.
205 : const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler];
206 :
207 24 : pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
208 24 : self.map[task_kind].as_ref()
209 24 : }
210 : }
211 :
212 : pub(crate) struct ScanLatencyOngoingRecording<'a> {
213 : parent: &'a Histogram,
214 : start: std::time::Instant,
215 : }
216 :
217 : impl<'a> ScanLatencyOngoingRecording<'a> {
218 0 : pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> {
219 0 : let start = Instant::now();
220 0 : ScanLatencyOngoingRecording { parent, start }
221 0 : }
222 :
223 0 : pub(crate) fn observe(self) {
224 0 : let elapsed = self.start.elapsed();
225 0 : self.parent.observe(elapsed.as_secs_f64());
226 0 : }
227 : }
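// Intended usage, as a sketch (the `parent` histogram is assumed to come from a tracked
// `SCAN_LATENCY` task kind; the scan body is hypothetical):
//
//     let recording = ScanLatencyOngoingRecording::start_recording(parent);
//     // ... perform the scan ...
//     recording.observe();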
228 :
229 388 : pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
230 388 : let inner = register_histogram_vec!(
231 388 : "pageserver_get_vectored_seconds",
232 388 : "Time spent in get_vectored.",
233 388 : &["task_kind"],
234 388 : CRITICAL_OP_BUCKETS.into(),
235 388 : )
236 388 : .expect("failed to define a metric");
237 388 :
238 388 : GetVectoredLatency {
239 12028 : map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
240 12028 : let task_kind = TaskKind::from_usize(task_kind_idx);
241 12028 :
242 12028 : if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
243 776 : let task_kind = task_kind.into();
244 776 : Some(inner.with_label_values(&[task_kind]))
245 : } else {
246 11252 : None
247 : }
248 12028 : })),
249 388 : }
250 388 : });
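// Recording into this histogram is gated on the task kind. A minimal sketch (hypothetical call
// site; `ctx` is a RequestContext and `started_at` an Instant captured before the vectored get):
//
//     if let Some(histogram) = GET_VECTORED_LATENCY.for_task_kind(ctx.task_kind()) {
//         histogram.observe(started_at.elapsed().as_secs_f64());
//     }
//
// Task kinds outside `TRACKED_TASK_KINDS` get `None` and record nothing.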
251 :
252 8 : pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
253 8 : let inner = register_histogram_vec!(
254 8 : "pageserver_scan_seconds",
255 8 : "Time spent in scan.",
256 8 : &["task_kind"],
257 8 : CRITICAL_OP_BUCKETS.into(),
258 8 : )
259 8 : .expect("failed to define a metric");
260 8 :
261 8 : ScanLatency {
262 248 : map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
263 248 : let task_kind = TaskKind::from_usize(task_kind_idx);
264 248 :
265 248 : if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
266 8 : let task_kind = task_kind.into();
267 8 : Some(inner.with_label_values(&[task_kind]))
268 : } else {
269 240 : None
270 : }
271 248 : })),
272 8 : }
273 8 : });
274 :
275 : pub(crate) struct PageCacheMetricsForTaskKind {
276 : pub read_accesses_immutable: IntCounter,
277 : pub read_hits_immutable: IntCounter,
278 : }
279 :
280 : pub(crate) struct PageCacheMetrics {
281 : map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
282 : }
283 :
284 184 : static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
285 184 : register_int_counter_vec!(
286 184 : "pageserver_page_cache_read_hits_total",
287 184 : "Number of read accesses to the page cache that hit",
288 184 : &["task_kind", "key_kind", "content_kind", "hit_kind"]
289 184 : )
290 184 : .expect("failed to define a metric")
291 184 : });
292 :
293 184 : static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
294 184 : register_int_counter_vec!(
295 184 : "pageserver_page_cache_read_accesses_total",
296 184 : "Number of read accesses to the page cache",
297 184 : &["task_kind", "key_kind", "content_kind"]
298 184 : )
299 184 : .expect("failed to define a metric")
300 184 : });
301 :
302 184 : pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
303 5704 : map: EnumMap::from_array(std::array::from_fn(|task_kind| {
304 5704 : let task_kind = TaskKind::from_usize(task_kind);
305 5704 : let task_kind: &'static str = task_kind.into();
306 45632 : EnumMap::from_array(std::array::from_fn(|content_kind| {
307 45632 : let content_kind = PageContentKind::from_usize(content_kind);
308 45632 : let content_kind: &'static str = content_kind.into();
309 45632 : PageCacheMetricsForTaskKind {
310 45632 : read_accesses_immutable: {
311 45632 : PAGE_CACHE_READ_ACCESSES
312 45632 : .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
313 45632 : .unwrap()
314 45632 : },
315 45632 :
316 45632 : read_hits_immutable: {
317 45632 : PAGE_CACHE_READ_HITS
318 45632 : .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
319 45632 : .unwrap()
320 45632 : },
321 45632 : }
322 45632 : }))
323 5704 : })),
324 184 : });
325 :
326 : impl PageCacheMetrics {
327 1944592 : pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
328 1944592 : &self.map[ctx.task_kind()][ctx.page_content_kind()]
329 1944592 : }
330 : }
331 :
332 : pub(crate) struct PageCacheSizeMetrics {
333 : pub max_bytes: UIntGauge,
334 :
335 : pub current_bytes_immutable: UIntGauge,
336 : }
337 :
338 184 : static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
339 184 : register_uint_gauge_vec!(
340 184 : "pageserver_page_cache_size_current_bytes",
341 184 : "Current size of the page cache in bytes, by key kind",
342 184 : &["key_kind"]
343 184 : )
344 184 : .expect("failed to define a metric")
345 184 : });
346 :
347 : pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
348 184 : Lazy::new(|| PageCacheSizeMetrics {
349 184 : max_bytes: {
350 184 : register_uint_gauge!(
351 184 : "pageserver_page_cache_size_max_bytes",
352 184 : "Maximum size of the page cache in bytes"
353 184 : )
354 184 : .expect("failed to define a metric")
355 184 : },
356 184 : current_bytes_immutable: {
357 184 : PAGE_CACHE_SIZE_CURRENT_BYTES
358 184 : .get_metric_with_label_values(&["immutable"])
359 184 : .unwrap()
360 184 : },
361 184 : });
362 :
363 : pub(crate) mod page_cache_eviction_metrics {
364 : use std::num::NonZeroUsize;
365 :
366 : use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
367 : use once_cell::sync::Lazy;
368 :
369 : #[derive(Clone, Copy)]
370 : pub(crate) enum Outcome {
371 : FoundSlotUnused { iters: NonZeroUsize },
372 : FoundSlotEvicted { iters: NonZeroUsize },
373 : ItersExceeded { iters: NonZeroUsize },
374 : }
375 :
376 184 : static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
377 184 : register_int_counter_vec!(
378 184 : "pageserver_page_cache_find_victim_iters_total",
379 184 : "Counter for the number of iterations in the find_victim loop",
380 184 : &["outcome"],
381 184 : )
382 184 : .expect("failed to define a metric")
383 184 : });
384 :
385 184 : static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
386 184 : register_int_counter_vec!(
387 184 : "pageserver_page_cache_find_victim_calls",
388 184 : "Incremented at the end of each find_victim() call.\
389 184 : Filter by outcome to get e.g., eviction rate.",
390 184 : &["outcome"]
391 184 : )
392 184 : .unwrap()
393 184 : });
394 :
395 63357 : pub(crate) fn observe(outcome: Outcome) {
396 : macro_rules! dry {
397 : ($label:literal, $iters:expr) => {{
398 : static LABEL: &'static str = $label;
399 : static ITERS_TOTAL: Lazy<IntCounter> =
400 224 : Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
401 : static CALLS: Lazy<IntCounter> =
402 224 : Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
403 : ITERS_TOTAL.inc_by(($iters.get()) as u64);
404 : CALLS.inc();
405 : }};
406 : }
407 63357 : match outcome {
408 3272 : Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
409 60085 : Outcome::FoundSlotEvicted { iters } => {
410 60085 : dry!("found_evicted", iters)
411 : }
412 0 : Outcome::ItersExceeded { iters } => {
413 0 : dry!("err_iters_exceeded", iters);
414 0 : super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
415 0 : }
416 : }
417 63357 : }
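// Call-site sketch (the iteration count is hypothetical): report that find_victim() evicted a
// slot on its third iteration:
//
//     observe(Outcome::FoundSlotEvicted { iters: NonZeroUsize::new(3).unwrap() });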
418 : }
419 :
420 0 : static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
421 0 : register_int_counter_vec!(
422 0 : "page_cache_errors_total",
423 0 : "Number of timeouts while acquiring a pinned slot in the page cache",
424 0 : &["error_kind"]
425 0 : )
426 0 : .expect("failed to define a metric")
427 0 : });
428 :
429 : #[derive(IntoStaticStr)]
430 : #[strum(serialize_all = "kebab_case")]
431 : pub(crate) enum PageCacheErrorKind {
432 : AcquirePinnedSlotTimeout,
433 : EvictIterLimit,
434 : }
435 :
436 0 : pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
437 0 : PAGE_CACHE_ERRORS
438 0 : .get_metric_with_label_values(&[error_kind.into()])
439 0 : .unwrap()
440 0 : .inc();
441 0 : }
442 :
443 40 : pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
444 40 : register_histogram!(
445 40 : "pageserver_wait_lsn_seconds",
446 40 : "Time spent waiting for WAL to arrive",
447 40 : CRITICAL_OP_BUCKETS.into(),
448 40 : )
449 40 : .expect("failed to define a metric")
450 40 : });
451 :
452 404 : static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
453 404 : register_gauge_vec!(
454 404 : "pageserver_flush_wait_upload_seconds",
455 404 : "Time spent waiting for preceding uploads during layer flush",
456 404 : &["tenant_id", "shard_id", "timeline_id"]
457 404 : )
458 404 : .expect("failed to define a metric")
459 404 : });
460 :
461 404 : static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
462 404 : register_int_gauge_vec!(
463 404 : "pageserver_last_record_lsn",
464 404 : "Last record LSN grouped by timeline",
465 404 : &["tenant_id", "shard_id", "timeline_id"]
466 404 : )
467 404 : .expect("failed to define a metric")
468 404 : });
469 :
470 404 : static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
471 404 : register_int_gauge_vec!(
472 404 : "pageserver_disk_consistent_lsn",
473 404 : "Disk consistent LSN grouped by timeline",
474 404 : &["tenant_id", "shard_id", "timeline_id"]
475 404 : )
476 404 : .expect("failed to define a metric")
477 404 : });
478 :
479 404 : pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
480 404 : register_uint_gauge_vec!(
481 404 : "pageserver_projected_remote_consistent_lsn",
482 404 : "Projected remote consistent LSN grouped by timeline",
483 404 : &["tenant_id", "shard_id", "timeline_id"]
484 404 : )
485 404 : .expect("failed to define a metric")
486 404 : });
487 :
488 404 : static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
489 404 : register_uint_gauge_vec!(
490 404 : "pageserver_pitr_history_size",
491 404 : "Data written since PITR cutoff on this timeline",
492 404 : &["tenant_id", "shard_id", "timeline_id"]
493 404 : )
494 404 : .expect("failed to define a metric")
495 404 : });
496 :
497 : #[derive(
498 240 : strum_macros::EnumIter,
499 0 : strum_macros::EnumString,
500 : strum_macros::Display,
501 : strum_macros::IntoStaticStr,
502 : )]
503 : #[strum(serialize_all = "kebab_case")]
504 : pub(crate) enum LayerKind {
505 : Delta,
506 : Image,
507 : }
508 :
509 : #[derive(
510 100 : strum_macros::EnumIter,
511 0 : strum_macros::EnumString,
512 : strum_macros::Display,
513 : strum_macros::IntoStaticStr,
514 : )]
515 : #[strum(serialize_all = "kebab_case")]
516 : pub(crate) enum LayerLevel {
517 : // We don't track the currently open ephemeral layer, since there's always exactly 1 and its
518 : // size changes. See `TIMELINE_EPHEMERAL_BYTES`.
519 : Frozen,
520 : L0,
521 : L1,
522 : }
523 :
524 396 : static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
525 396 : register_uint_gauge_vec!(
526 396 : "pageserver_layer_bytes",
527 396 : "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)",
528 396 : &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
529 396 : )
530 396 : .expect("failed to define a metric")
531 396 : });
532 :
533 396 : static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
534 396 : register_uint_gauge_vec!(
535 396 : "pageserver_layer_count",
536 396 : "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)",
537 396 : &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
538 396 : )
539 396 : .expect("failed to define a metric")
540 396 : });
541 :
542 404 : static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
543 404 : register_uint_gauge_vec!(
544 404 : "pageserver_archive_size",
545 404 : "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
546 404 : &["tenant_id", "shard_id", "timeline_id"]
547 404 : )
548 404 : .expect("failed to define a metric")
549 404 : });
550 :
551 404 : static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
552 404 : register_int_gauge_vec!(
553 404 : "pageserver_standby_horizon",
554 404 : "Standby apply LSN for which GC is hold off, by timeline.",
555 404 : &["tenant_id", "shard_id", "timeline_id"]
556 404 : )
557 404 : .expect("failed to define a metric")
558 404 : });
559 :
560 404 : static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
561 404 : register_uint_gauge_vec!(
562 404 : "pageserver_resident_physical_size",
563 404 : "The size of the layer files present in the pageserver's filesystem, for attached locations.",
564 404 : &["tenant_id", "shard_id", "timeline_id"]
565 404 : )
566 404 : .expect("failed to define a metric")
567 404 : });
568 :
569 404 : static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
570 404 : register_uint_gauge_vec!(
571 404 : "pageserver_visible_physical_size",
572 404 : "The size of the layer files present in the pageserver's filesystem.",
573 404 : &["tenant_id", "shard_id", "timeline_id"]
574 404 : )
575 404 : .expect("failed to define a metric")
576 404 : });
577 :
578 396 : pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
579 396 : register_uint_gauge!(
580 396 : "pageserver_resident_physical_size_global",
581 396 : "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
582 396 : )
583 396 : .expect("failed to define a metric")
584 396 : });
585 :
586 404 : static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
587 404 : register_uint_gauge_vec!(
588 404 : "pageserver_remote_physical_size",
589 404 : "The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
590 404 : // Corollary: If any files are missing from the index part, they won't be included here.
591 404 : &["tenant_id", "shard_id", "timeline_id"]
592 404 : )
593 404 : .expect("failed to define a metric")
594 404 : });
595 :
596 404 : static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
597 404 : register_uint_gauge!(
598 404 : "pageserver_remote_physical_size_global",
599 404 : "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
600 404 : )
601 404 : .expect("failed to define a metric")
602 404 : });
603 :
604 12 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
605 12 : register_int_counter!(
606 12 : "pageserver_remote_ondemand_downloaded_layers_total",
607 12 : "Total on-demand downloaded layers"
608 12 : )
609 12 : .unwrap()
610 12 : });
611 :
612 12 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
613 12 : register_int_counter!(
614 12 : "pageserver_remote_ondemand_downloaded_bytes_total",
615 12 : "Total bytes of layers on-demand downloaded",
616 12 : )
617 12 : .unwrap()
618 12 : });
619 :
620 404 : static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
621 404 : register_uint_gauge_vec!(
622 404 : "pageserver_current_logical_size",
623 404 : "Current logical size grouped by timeline",
624 404 : &["tenant_id", "shard_id", "timeline_id"]
625 404 : )
626 404 : .expect("failed to define current logical size metric")
627 404 : });
628 :
629 404 : static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
630 404 : register_int_gauge_vec!(
631 404 : "pageserver_aux_file_estimated_size",
632 404 : "The size of all aux files for a timeline in aux file v2 store.",
633 404 : &["tenant_id", "shard_id", "timeline_id"]
634 404 : )
635 404 : .expect("failed to define a metric")
636 404 : });
637 :
638 404 : static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
639 404 : register_uint_gauge_vec!(
640 404 : "pageserver_valid_lsn_lease_count",
641 404 : "The number of valid leases after refreshing gc info.",
642 404 : &["tenant_id", "shard_id", "timeline_id"],
643 404 : )
644 404 : .expect("failed to define a metric")
645 404 : });
646 :
647 0 : pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
648 0 : register_int_counter!(
649 0 : "pageserver_circuit_breaker_broken",
650 0 : "How many times a circuit breaker has broken"
651 0 : )
652 0 : .expect("failed to define a metric")
653 0 : });
654 :
655 0 : pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
656 0 : register_int_counter!(
657 0 : "pageserver_circuit_breaker_unbroken",
658 0 : "How many times a circuit breaker has been un-broken (recovered)"
659 0 : )
660 0 : .expect("failed to define a metric")
661 0 : });
662 :
663 388 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
664 388 : register_int_counter!(
665 388 : "pageserver_compression_image_in_bytes_total",
666 388 : "Size of data written into image layers before compression"
667 388 : )
668 388 : .expect("failed to define a metric")
669 388 : });
670 :
671 388 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
672 388 : register_int_counter!(
673 388 : "pageserver_compression_image_in_bytes_considered",
674 388 : "Size of potentially compressible data written into image layers before compression"
675 388 : )
676 388 : .expect("failed to define a metric")
677 388 : });
678 :
679 388 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
680 388 : register_int_counter!(
681 388 : "pageserver_compression_image_in_bytes_chosen",
682 388 : "Size of data whose compressed form was written into image layers"
683 388 : )
684 388 : .expect("failed to define a metric")
685 388 : });
686 :
687 388 : pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
688 388 : register_int_counter!(
689 388 : "pageserver_compression_image_out_bytes_total",
690 388 : "Size of compressed image layer written"
691 388 : )
692 388 : .expect("failed to define a metric")
693 388 : });
694 :
695 20 : pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
696 20 : register_uint_gauge!(
697 20 : "pageserver_relsize_cache_entries",
698 20 : "Number of entries in the relation size cache",
699 20 : )
700 20 : .expect("failed to define a metric")
701 20 : });
702 :
703 20 : pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
704 20 : register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
705 20 : .expect("failed to define a metric")
706 20 : });
707 :
708 20 : pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
709 20 : register_int_counter!(
710 20 : "pageserver_relsize_cache_misses",
711 20 : "Relation size cache misses",
712 20 : )
713 20 : .expect("failed to define a metric")
714 20 : });
715 :
716 8 : pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
717 8 : register_int_counter!(
718 8 : "pageserver_relsize_cache_misses_old",
719 8 : "Relation size cache misses where the lookup LSN is older than the last relation update"
720 8 : )
721 8 : .expect("failed to define a metric")
722 8 : });
723 :
724 : pub(crate) mod initial_logical_size {
725 : use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
726 : use once_cell::sync::Lazy;
727 :
728 : pub(crate) struct StartCalculation(IntCounterVec);
729 404 : pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
730 404 : StartCalculation(
731 404 : register_int_counter_vec!(
732 404 : "pageserver_initial_logical_size_start_calculation",
733 404 : "Incremented each time we start an initial logical size calculation attempt. \
734 404 : The `circumstances` label provides some additional details.",
735 404 : &["attempt", "circumstances"]
736 404 : )
737 404 : .unwrap(),
738 404 : )
739 404 : });
740 :
741 : struct DropCalculation {
742 : first: IntCounter,
743 : retry: IntCounter,
744 : }
745 :
746 404 : static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
747 404 : let vec = register_int_counter_vec!(
748 404 : "pageserver_initial_logical_size_drop_calculation",
749 404 : "Incremented each time we abort a started size calculation attmpt.",
750 404 : &["attempt"]
751 404 : )
752 404 : .unwrap();
753 404 : DropCalculation {
754 404 : first: vec.with_label_values(&["first"]),
755 404 : retry: vec.with_label_values(&["retry"]),
756 404 : }
757 404 : });
758 :
759 : pub(crate) struct Calculated {
760 : pub(crate) births: IntCounter,
761 : pub(crate) deaths: IntCounter,
762 : }
763 :
764 404 : pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
765 404 : births: register_int_counter!(
766 404 : "pageserver_initial_logical_size_finish_calculation",
767 404 : "Incremented every time we finish calculation of initial logical size.\
768 404 : If everything is working well, this should happen at most once per Timeline object."
769 404 : )
770 404 : .unwrap(),
771 404 : deaths: register_int_counter!(
772 404 : "pageserver_initial_logical_size_drop_finished_calculation",
773 404 : "Incremented when we drop a finished initial logical size calculation result.\
774 404 : Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
775 404 : )
776 404 : .unwrap(),
777 404 : });
778 :
779 : pub(crate) struct OngoingCalculationGuard {
780 : inc_drop_calculation: Option<IntCounter>,
781 : }
782 :
783 : #[derive(strum_macros::IntoStaticStr)]
784 : pub(crate) enum StartCircumstances {
785 : EmptyInitial,
786 : SkippedConcurrencyLimiter,
787 : AfterBackgroundTasksRateLimit,
788 : }
789 :
790 : impl StartCalculation {
791 428 : pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
792 428 : let circumstances_label: &'static str = circumstances.into();
793 428 : self.0
794 428 : .with_label_values(&["first", circumstances_label])
795 428 : .inc();
796 428 : OngoingCalculationGuard {
797 428 : inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
798 428 : }
799 428 : }
800 0 : pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
801 0 : let circumstances_label: &'static str = circumstances.into();
802 0 : self.0
803 0 : .with_label_values(&["retry", circumstances_label])
804 0 : .inc();
805 0 : OngoingCalculationGuard {
806 0 : inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
807 0 : }
808 0 : }
809 : }
810 :
811 : impl Drop for OngoingCalculationGuard {
812 428 : fn drop(&mut self) {
813 428 : if let Some(counter) = self.inc_drop_calculation.take() {
814 0 : counter.inc();
815 428 : }
816 428 : }
817 : }
818 :
819 : impl OngoingCalculationGuard {
820 428 : pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
821 428 : drop(self.inc_drop_calculation.take());
822 428 : CALCULATED.births.inc();
823 428 : FinishedCalculationGuard {
824 428 : inc_on_drop: CALCULATED.deaths.clone(),
825 428 : }
826 428 : }
827 : }
828 :
829 : pub(crate) struct FinishedCalculationGuard {
830 : inc_on_drop: IntCounter,
831 : }
832 :
833 : impl Drop for FinishedCalculationGuard {
834 12 : fn drop(&mut self) {
835 12 : self.inc_on_drop.inc();
836 12 : }
837 : }
838 :
839 : // context: https://github.com/neondatabase/neon/issues/5963
840 : pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
841 0 : Lazy::new(|| {
842 0 : register_int_counter!(
843 0 : "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
844 0 : "Counter for the following event: walreceiver calls\
845 0 : Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
846 0 : )
847 0 : .unwrap()
848 0 : });
849 : }
850 :
851 0 : static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
852 0 : register_uint_gauge_vec!(
853 0 : "pageserver_directory_entries_count",
854 0 : "Sum of the entries in pageserver-stored directory listings",
855 0 : &["tenant_id", "shard_id", "timeline_id"]
856 0 : )
857 0 : .expect("failed to define a metric")
858 0 : });
859 :
860 408 : pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
861 408 : register_uint_gauge_vec!(
862 408 : "pageserver_tenant_states_count",
863 408 : "Count of tenants per state",
864 408 : &["state"]
865 408 : )
866 408 : .expect("Failed to register pageserver_tenant_states_count metric")
867 408 : });
868 :
869 : /// A set of broken tenants.
870 : ///
871 : /// These are expected to be so rare that a set is fine. Set as in a new timeseries for each broken
872 : /// tenant.
873 20 : pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
874 20 : register_uint_gauge_vec!(
875 20 : "pageserver_broken_tenants_count",
876 20 : "Set of broken tenants",
877 20 : &["tenant_id", "shard_id"]
878 20 : )
879 20 : .expect("Failed to register pageserver_broken_tenants_count metric")
880 20 : });
881 :
882 12 : pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
883 12 : register_uint_gauge_vec!(
884 12 : "pageserver_tenant_synthetic_cached_size_bytes",
885 12 : "Synthetic size of each tenant in bytes",
886 12 : &["tenant_id"]
887 12 : )
888 12 : .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
889 12 : });
890 :
891 0 : pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
892 0 : register_histogram_vec!(
893 0 : "pageserver_eviction_iteration_duration_seconds_global",
894 0 : "Time spent on a single eviction iteration",
895 0 : &["period_secs", "threshold_secs"],
896 0 : STORAGE_OP_BUCKETS.into(),
897 0 : )
898 0 : .expect("failed to define a metric")
899 0 : });
900 :
901 404 : static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
902 404 : register_int_counter_vec!(
903 404 : "pageserver_evictions",
904 404 : "Number of layers evicted from the pageserver",
905 404 : &["tenant_id", "shard_id", "timeline_id"]
906 404 : )
907 404 : .expect("failed to define a metric")
908 404 : });
909 :
910 404 : static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
911 404 : register_int_counter_vec!(
912 404 : "pageserver_evictions_with_low_residence_duration",
913 404 : "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
914 404 : Residence duration is determined using the `residence_duration_data_source`.",
915 404 : &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
916 404 : )
917 404 : .expect("failed to define a metric")
918 404 : });
919 :
920 0 : pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
921 0 : register_int_counter!(
922 0 : "pageserver_unexpected_ondemand_downloads_count",
923 0 : "Number of unexpected on-demand downloads. \
924 0 : We log more context for each increment, so, forgo any labels in this metric.",
925 0 : )
926 0 : .expect("failed to define a metric")
927 0 : });
928 :
929 : /// How long did we take to start up? Broken down by labels to describe
930 : /// different phases of startup.
931 0 : pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
932 0 : register_gauge_vec!(
933 0 : "pageserver_startup_duration_seconds",
934 0 : "Time taken by phases of pageserver startup, in seconds",
935 0 : &["phase"]
936 0 : )
937 0 : .expect("Failed to register pageserver_startup_duration_seconds metric")
938 0 : });
939 :
940 0 : pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
941 0 : register_uint_gauge!(
942 0 : "pageserver_startup_is_loading",
943 0 : "1 while in initial startup load of tenants, 0 at other times"
944 0 : )
945 0 : .expect("Failed to register pageserver_startup_is_loading")
946 0 : });
947 :
948 396 : pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
949 396 : register_uint_gauge!(
950 396 : "pageserver_timeline_ephemeral_bytes",
951 396 : "Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated."
952 396 : )
953 396 : .expect("Failed to register metric")
954 396 : });
955 :
956 : /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
957 : /// like how long it took to load.
958 : ///
959 : /// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
960 : /// metrics are rather expensive, and usually fine grained stuff makes more sense
961 : /// at a timeline level than tenant level.
962 : pub(crate) struct TenantMetrics {
963 : /// How long did tenants take to go from construction to active state?
964 : pub(crate) activation: Histogram,
965 : pub(crate) preload: Histogram,
966 : pub(crate) attach: Histogram,
967 :
968 : /// How many tenants are included in the initial startup of the pageserver?
969 : pub(crate) startup_scheduled: IntCounter,
970 : pub(crate) startup_complete: IntCounter,
971 : }
972 :
973 0 : pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
974 0 : TenantMetrics {
975 0 : activation: register_histogram!(
976 0 : "pageserver_tenant_activation_seconds",
977 0 : "Time taken by tenants to activate, in seconds",
978 0 : CRITICAL_OP_BUCKETS.into()
979 0 : )
980 0 : .expect("Failed to register metric"),
981 0 : preload: register_histogram!(
982 0 : "pageserver_tenant_preload_seconds",
983 0 : "Time taken by tenants to load remote metadata on startup/attach, in seconds",
984 0 : CRITICAL_OP_BUCKETS.into()
985 0 : )
986 0 : .expect("Failed to register metric"),
987 0 : attach: register_histogram!(
988 0 : "pageserver_tenant_attach_seconds",
989 0 : "Time taken by tenants to intialize, after remote metadata is already loaded",
990 0 : CRITICAL_OP_BUCKETS.into()
991 0 : )
992 0 : .expect("Failed to register metric"),
993 0 : startup_scheduled: register_int_counter!(
994 0 : "pageserver_tenant_startup_scheduled",
995 0 : "Number of tenants included in pageserver startup (doesn't count tenants attached later)"
996 0 : ).expect("Failed to register metric"),
997 0 : startup_complete: register_int_counter!(
998 0 : "pageserver_tenant_startup_complete",
999 0 : "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
1000 0 : should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \
1001 0 : tenants: such cases will lead to this metric never reaching the scheduled count."
1002 0 : ).expect("Failed to register metric"),
1003 0 : }
1004 0 : });
1005 :
1006 : /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
1007 : #[derive(Debug)]
1008 : pub(crate) struct EvictionsWithLowResidenceDuration {
1009 : data_source: &'static str,
1010 : threshold: Duration,
1011 : counter: Option<IntCounter>,
1012 : }
1013 :
1014 : pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
1015 : data_source: &'static str,
1016 : threshold: Duration,
1017 : }
1018 :
1019 : impl EvictionsWithLowResidenceDurationBuilder {
1020 896 : pub fn new(data_source: &'static str, threshold: Duration) -> Self {
1021 896 : Self {
1022 896 : data_source,
1023 896 : threshold,
1024 896 : }
1025 896 : }
1026 :
1027 896 : fn build(
1028 896 : &self,
1029 896 : tenant_id: &str,
1030 896 : shard_id: &str,
1031 896 : timeline_id: &str,
1032 896 : ) -> EvictionsWithLowResidenceDuration {
1033 896 : let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
1034 896 : .get_metric_with_label_values(&[
1035 896 : tenant_id,
1036 896 : shard_id,
1037 896 : timeline_id,
1038 896 : self.data_source,
1039 896 : &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
1040 896 : ])
1041 896 : .unwrap();
1042 896 : EvictionsWithLowResidenceDuration {
1043 896 : data_source: self.data_source,
1044 896 : threshold: self.threshold,
1045 896 : counter: Some(counter),
1046 896 : }
1047 896 : }
1048 : }
1049 :
1050 : impl EvictionsWithLowResidenceDuration {
1051 916 : fn threshold_label_value(threshold: Duration) -> String {
1052 916 : format!("{}", threshold.as_secs())
1053 916 : }
1054 :
1055 8 : pub fn observe(&self, observed_value: Duration) {
1056 8 : if observed_value < self.threshold {
1057 8 : self.counter
1058 8 : .as_ref()
1059 8 : .expect("nobody calls this function after `remove_from_vec`")
1060 8 : .inc();
1061 8 : }
1062 8 : }
1063 :
1064 0 : pub fn change_threshold(
1065 0 : &mut self,
1066 0 : tenant_id: &str,
1067 0 : shard_id: &str,
1068 0 : timeline_id: &str,
1069 0 : new_threshold: Duration,
1070 0 : ) {
1071 0 : if new_threshold == self.threshold {
1072 0 : return;
1073 0 : }
1074 0 : let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
1075 0 : self.data_source,
1076 0 : new_threshold,
1077 0 : )
1078 0 : .build(tenant_id, shard_id, timeline_id);
1079 0 : std::mem::swap(self, &mut with_new);
1080 0 : with_new.remove(tenant_id, shard_id, timeline_id);
1081 0 : }
1082 :
1083 : // This could be a `Drop` impl, but we need the `tenant_id` and `timeline_id`.
1084 20 : fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
1085 20 : let Some(_counter) = self.counter.take() else {
1086 0 : return;
1087 : };
1088 :
1089 20 : let threshold = Self::threshold_label_value(self.threshold);
1090 20 :
1091 20 : let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
1092 20 : tenant_id,
1093 20 : shard_id,
1094 20 : timeline_id,
1095 20 : self.data_source,
1096 20 : &threshold,
1097 20 : ]);
1098 20 :
1099 20 : match removed {
1100 0 : Err(e) => {
1101 0 : // this has been hit in staging as
1102 0 : // <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
1103 0 : // because we can be in the drop path already, don't risk:
1104 0 : // - "double-panic => illegal instruction" or
1105 0 : // - future "drop panic => abort"
1106 0 : //
1107 0 : // so just nag: (the error has the labels)
1108 0 : tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}");
1109 : }
1110 : Ok(()) => {
1111 : // to help identify cases where we double-remove the same values, let's log all
1112 : // deletions?
1113 20 : tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source);
1114 : }
1115 : }
1116 20 : }
1117 : }
1118 :
1119 : // Metrics collected on disk IO operations
1120 : //
1121 : // Roughly logarithmic scale.
1122 : const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
1123 : 0.000030, // 30 usec
1124 : 0.001000, // 1000 usec
1125 : 0.030, // 30 ms
1126 : 1.000, // 1000 ms
1127 : 30.000, // 30000 ms
1128 : ];
1129 :
1130 : /// VirtualFile fs operation variants.
1131 : ///
1132 : /// Operations:
1133 : /// - open ([`std::fs::OpenOptions::open`])
1134 : /// - close (dropping [`crate::virtual_file::VirtualFile`])
1135 : /// - close-by-replace (close by replacement algorithm)
1136 : /// - read (`read_at`)
1137 : /// - write (`write_at`)
1138 : /// - seek (modify internal position or file length query)
1139 : /// - fsync ([`std::fs::File::sync_all`])
1140 : /// - metadata ([`std::fs::File::metadata`])
1141 : #[derive(
1142 0 : Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
1143 : )]
1144 : pub(crate) enum StorageIoOperation {
1145 : Open,
1146 : OpenAfterReplace,
1147 : Close,
1148 : CloseByReplace,
1149 : Read,
1150 : Write,
1151 : Seek,
1152 : Fsync,
1153 : Metadata,
1154 : }
1155 :
1156 : impl StorageIoOperation {
1157 4176 : pub fn as_str(&self) -> &'static str {
1158 4176 : match self {
1159 464 : StorageIoOperation::Open => "open",
1160 464 : StorageIoOperation::OpenAfterReplace => "open-after-replace",
1161 464 : StorageIoOperation::Close => "close",
1162 464 : StorageIoOperation::CloseByReplace => "close-by-replace",
1163 464 : StorageIoOperation::Read => "read",
1164 464 : StorageIoOperation::Write => "write",
1165 464 : StorageIoOperation::Seek => "seek",
1166 464 : StorageIoOperation::Fsync => "fsync",
1167 464 : StorageIoOperation::Metadata => "metadata",
1168 : }
1169 4176 : }
1170 : }
1171 :
1172 : /// Tracks time taken by fs operations near VirtualFile.
1173 : #[derive(Debug)]
1174 : pub(crate) struct StorageIoTime {
1175 : metrics: [Histogram; StorageIoOperation::COUNT],
1176 : }
1177 :
1178 : impl StorageIoTime {
1179 464 : fn new() -> Self {
1180 464 : let storage_io_histogram_vec = register_histogram_vec!(
1181 464 : "pageserver_io_operations_seconds",
1182 464 : "Time spent in IO operations",
1183 464 : &["operation"],
1184 464 : STORAGE_IO_TIME_BUCKETS.into()
1185 464 : )
1186 464 : .expect("failed to define a metric");
1187 4176 : let metrics = std::array::from_fn(|i| {
1188 4176 : let op = StorageIoOperation::from_repr(i).unwrap();
1189 4176 : storage_io_histogram_vec
1190 4176 : .get_metric_with_label_values(&[op.as_str()])
1191 4176 : .unwrap()
1192 4176 : });
1193 464 : Self { metrics }
1194 464 : }
1195 :
1196 4030855 : pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
1197 4030855 : &self.metrics[op as usize]
1198 4030855 : }
1199 : }
1200 :
1201 : pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
1202 :
1203 : const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
1204 :
1205 : // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
1206 456 : pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
1207 456 : register_int_gauge_vec!(
1208 456 : "pageserver_io_operations_bytes_total",
1209 456 : "Total amount of bytes read/written in IO operations",
1210 456 : &["operation", "tenant_id", "shard_id", "timeline_id"]
1211 456 : )
1212 456 : .expect("failed to define a metric")
1213 456 : });
1214 :
1215 : #[cfg(not(test))]
1216 : pub(crate) mod virtual_file_descriptor_cache {
1217 : use super::*;
1218 :
1219 0 : pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
1220 0 : register_uint_gauge!(
1221 0 : "pageserver_virtual_file_descriptor_cache_size_max",
1222 0 : "Maximum number of open file descriptors in the cache."
1223 0 : )
1224 0 : .unwrap()
1225 0 : });
1226 :
1227 : // SIZE_CURRENT: derive it like so:
1228 : // ```
1229 : // sum(pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$"})
1230 : // - ignoring(operation)
1231 : // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"})
1232 : // ```
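// In other words, opens minus closes approximates the number of file descriptors
// currently held open by the cache.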
1233 : }
1234 :
1235 : #[cfg(not(test))]
1236 : pub(crate) mod virtual_file_io_engine {
1237 : use super::*;
1238 :
1239 0 : pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
1240 0 : register_uint_gauge_vec!(
1241 0 : "pageserver_virtual_file_io_engine_kind",
1242 0 : "The configured io engine for VirtualFile",
1243 0 : &["kind"],
1244 0 : )
1245 0 : .unwrap()
1246 0 : });
1247 : }
1248 :
1249 : pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
1250 : pub(crate) struct SmgrOpTimerInner {
1251 : global_execution_latency_histo: Histogram,
1252 : per_timeline_execution_latency_histo: Option<Histogram>,
1253 :
1254 : global_batch_wait_time: Histogram,
1255 : per_timeline_batch_wait_time: Histogram,
1256 :
1257 : global_flush_in_progress_micros: IntCounter,
1258 : per_timeline_flush_in_progress_micros: IntCounter,
1259 :
1260 : throttling: Arc<tenant_throttling::Pagestream>,
1261 :
1262 : timings: SmgrOpTimerState,
1263 : }
1264 :
1265 : /// The stages of request processing are represented by the enum variants.
1266 : /// Used as part of [`SmgrOpTimerInner::timings`].
1267 : ///
1268 : /// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
1269 : /// transition points.
1270 : /// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
1271 : /// to the next state.
1272 : ///
1273 : /// Each request goes through every stage, in all configurations.
1274 : ///
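/// The happy-path sequence, as driven by the `observe_*` methods below, is:
/// `Received` -> `Throttling` -> `Batching` -> `Executing` -> `Flushing`.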
1275 : #[derive(Debug)]
1276 : enum SmgrOpTimerState {
1277 : Received {
1278 : // In the future, we may want to track the full time the request spent
1279 : // inside pageserver process (time spent in kernel buffers can't be tracked).
1280 : // `received_at` would be used for that.
1281 : #[allow(dead_code)]
1282 : received_at: Instant,
1283 : },
1284 : Throttling {
1285 : throttle_started_at: Instant,
1286 : },
1287 : Batching {
1288 : throttle_done_at: Instant,
1289 : },
1290 : Executing {
1291 : execution_started_at: Instant,
1292 : },
1293 : Flushing,
1294 : // NB: when adding observation points, remember to update the Drop impl.
1295 : }
1296 :
1297 : // NB: when adding observation points, remember to update the Drop impl.
1298 : impl SmgrOpTimer {
1299 : /// See [`SmgrOpTimerState`] for more context.
1300 0 : pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
1301 0 : let Some(inner) = self.0.as_mut() else {
1302 0 : return;
1303 : };
1304 0 : let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
1305 0 : return;
1306 : };
1307 0 : inner.throttling.count_accounted_start.inc();
1308 0 : inner.timings = SmgrOpTimerState::Throttling {
1309 0 : throttle_started_at: at,
1310 0 : };
1311 0 : }
1312 :
1313 : /// See [`SmgrOpTimerState`] for more context.
1314 0 : pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
1315 0 : let Some(inner) = self.0.as_mut() else {
1316 0 : return;
1317 : };
1318 : let SmgrOpTimerState::Throttling {
1319 0 : throttle_started_at,
1320 0 : } = &inner.timings
1321 : else {
1322 0 : return;
1323 : };
1324 0 : inner.throttling.count_accounted_finish.inc();
1325 0 : match throttle {
1326 0 : ThrottleResult::NotThrottled { end } => {
1327 0 : inner.timings = SmgrOpTimerState::Batching {
1328 0 : throttle_done_at: end,
1329 0 : };
1330 0 : }
1331 0 : ThrottleResult::Throttled { end } => {
1332 0 : // update metrics
1333 0 : inner.throttling.count_throttled.inc();
1334 0 : inner
1335 0 : .throttling
1336 0 : .wait_time
1337 0 : .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
1338 0 : // state transition
1339 0 : inner.timings = SmgrOpTimerState::Batching {
1340 0 : throttle_done_at: end,
1341 0 : };
1342 0 : }
1343 : }
1344 0 : }
1345 :
1346 : /// See [`SmgrOpTimerState`] for more context.
1347 0 : pub(crate) fn observe_execution_start(&mut self, at: Instant) {
1348 0 : let Some(inner) = self.0.as_mut() else {
1349 0 : return;
1350 : };
1351 0 : let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
1352 0 : return;
1353 : };
1354 : // update metrics
1355 0 : let batch = at - *throttle_done_at;
1356 0 : inner.global_batch_wait_time.observe(batch.as_secs_f64());
1357 0 : inner
1358 0 : .per_timeline_batch_wait_time
1359 0 : .observe(batch.as_secs_f64());
1360 0 : // state transition
1361 0 : inner.timings = SmgrOpTimerState::Executing {
1362 0 : execution_started_at: at,
1363 0 : }
1364 0 : }
1365 :
1366 : /// For all but the first caller, this is a no-op.
1367 : /// The first caller receives Some, subsequent ones None.
1368 : ///
1369 : /// See [`SmgrOpTimerState`] for more context.
1370 0 : pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option<SmgrOpFlushInProgress> {
1371 : // NB: unlike the other observe_* methods, this one take()s.
1372 : #[allow(clippy::question_mark)] // maintain similar code pattern.
1373 0 : let Some(mut inner) = self.0.take() else {
1374 0 : return None;
1375 : };
1376 : let SmgrOpTimerState::Executing {
1377 0 : execution_started_at,
1378 0 : } = &inner.timings
1379 : else {
1380 0 : return None;
1381 : };
1382 : // update metrics
1383 0 : let execution = at - *execution_started_at;
1384 0 : inner
1385 0 : .global_execution_latency_histo
1386 0 : .observe(execution.as_secs_f64());
1387 0 : if let Some(per_timeline_execution_latency_histo) =
1388 0 : &inner.per_timeline_execution_latency_histo
1389 0 : {
1390 0 : per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
1391 0 : }
1392 :
1393 : // state transition
1394 0 : inner.timings = SmgrOpTimerState::Flushing;
1395 0 :
1396 0 : // return the flush in progress object which
1397 0 : // will do the remaining metrics updates
1398 0 : let SmgrOpTimerInner {
1399 0 : global_flush_in_progress_micros,
1400 0 : per_timeline_flush_in_progress_micros,
1401 0 : ..
1402 0 : } = inner;
1403 0 : Some(SmgrOpFlushInProgress {
1404 0 : global_micros: global_flush_in_progress_micros,
1405 0 : per_timeline_micros: per_timeline_flush_in_progress_micros,
1406 0 : })
1407 0 : }
1408 : }
1409 :
1410 : /// The last stage of request processing is serializing and flushing the request
1411 : /// into the TCP connection. We want to make slow flushes observable
1412 : /// _while they are occurring_, so this struct provides a wrapper method [`Self::measure`]
1413 : /// to periodically bump the metric.
1414 : ///
1415 : /// If in the future we decide that we're not interested in live updates, we can
1416 : /// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
1417 : /// and remove this struct from the code base.
1418 : pub(crate) struct SmgrOpFlushInProgress {
1419 : global_micros: IntCounter,
1420 : per_timeline_micros: IntCounter,
1421 : }
1422 :
1423 : impl Drop for SmgrOpTimer {
1424 0 : fn drop(&mut self) {
1425 0 : // In case of early drop, update any of the remaining metrics with
1426 0 : // observations so that (started,finished) counter pairs balance out
1427 0 : // and all counters on the latency path have the same number of
1428 0 : // observations.
1429 0 : // It's technically lying and it would be better if each metric had
1430 0 : // a separate label or similar for cancelled requests.
1431 0 : // But we don't have that right now and counter pairs balancing
1432 0 : // out is useful when using the metrics in panels and whatnot.
1433 0 : let now = Instant::now();
1434 0 : self.observe_throttle_start(now);
1435 0 : self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
1436 0 : self.observe_execution_start(now);
1437 0 : let maybe_flush_timer = self.observe_execution_end(now);
1438 0 : drop(maybe_flush_timer);
1439 0 : }
1440 : }
1441 :
1442 : impl SmgrOpFlushInProgress {
1443 : /// The caller must guarantee that `socket_fd` outlives this function.
1444 0 : pub(crate) async fn measure<Fut, O>(
1445 0 : self,
1446 0 : started_at: Instant,
1447 0 : mut fut: Fut,
1448 0 : socket_fd: RawFd,
1449 0 : ) -> O
1450 0 : where
1451 0 : Fut: std::future::Future<Output = O>,
1452 0 : {
1453 0 : let mut fut = std::pin::pin!(fut);
1454 0 :
1455 0 : let mut logged = false;
1456 0 : let mut last_counter_increment_at = started_at;
1457 0 : let mut observe_guard = scopeguard::guard(
1458 0 : |is_timeout| {
1459 0 : let now = Instant::now();
1460 0 :
1461 0 : // Increment counter
1462 0 : {
1463 0 : let elapsed_since_last_observe = now - last_counter_increment_at;
1464 0 : self.global_micros
1465 0 : .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
1466 0 : self.per_timeline_micros
1467 0 : .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
1468 0 : last_counter_increment_at = now;
1469 0 : }
1470 0 :
1471 0 : // Log something on every timeout, and on completion but only if we hit a timeout.
1472 0 : if is_timeout || logged {
1473 0 : logged = true;
1474 0 : let elapsed_total = now - started_at;
1475 0 : let msg = if is_timeout {
1476 0 : "slow flush ongoing"
1477 : } else {
1478 0 : "slow flush completed or cancelled"
1479 : };
1480 :
1481 0 : let (inq, outq) = {
1482 0 : // SAFETY: caller guarantees that `socket_fd` outlives this function.
1483 0 : #[cfg(target_os = "linux")]
1484 0 : unsafe {
1485 0 : (
1486 0 : utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2),
1487 0 : utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2),
1488 0 : )
1489 0 : }
1490 0 : #[cfg(not(target_os = "linux"))]
1491 0 : {
1492 0 : _ = socket_fd; // appease unused lint on macOS
1493 0 : (-1, -1)
1494 0 : }
1495 0 : };
1496 0 :
1497 0 : let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
1498 0 : tracing::info!(elapsed_total_secs, inq, outq, msg);
1499 0 : }
1500 0 : },
1501 0 : |mut observe| {
1502 0 : observe(false);
1503 0 : },
1504 0 : );
1505 :
1506 : loop {
1507 0 : match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
1508 0 : Ok(v) => return v,
1509 0 : Err(_timeout) => {
1510 0 : (*observe_guard)(true);
1511 0 : }
1512 : }
1513 : }
1514 0 : }
1515 : }
1516 :
1517 : #[derive(
1518 : Debug,
1519 : Clone,
1520 : Copy,
1521 : IntoStaticStr,
1522 : strum_macros::EnumCount,
1523 0 : strum_macros::EnumIter,
1524 : strum_macros::FromRepr,
1525 : enum_map::Enum,
1526 : )]
1527 : #[strum(serialize_all = "snake_case")]
1528 : pub enum SmgrQueryType {
1529 : GetRelExists,
1530 : GetRelSize,
1531 : GetPageAtLsn,
1532 : GetDbSize,
1533 : GetSlruSegment,
1534 : #[cfg(feature = "testing")]
1535 : Test,
1536 : }
1537 :
1538 : pub(crate) struct SmgrQueryTimePerTimeline {
1539 : global_started: [IntCounter; SmgrQueryType::COUNT],
1540 : global_latency: [Histogram; SmgrQueryType::COUNT],
1541 : per_timeline_getpage_started: IntCounter,
1542 : per_timeline_getpage_latency: Histogram,
1543 : global_batch_size: Histogram,
1544 : per_timeline_batch_size: Histogram,
1545 : global_flush_in_progress_micros: IntCounter,
1546 : per_timeline_flush_in_progress_micros: IntCounter,
1547 : global_batch_wait_time: Histogram,
1548 : per_timeline_batch_wait_time: Histogram,
1549 : throttling: Arc<tenant_throttling::Pagestream>,
1550 : }
1551 :
1552 404 : static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
1553 404 : register_int_counter_vec!(
1554 404 : // it's a counter, but the name is prepared to extend it to a histogram of queue depth
1555 404 : "pageserver_smgr_query_started_global_count",
1556 404 : "Number of smgr queries started, aggregated by query type.",
1557 404 : &["smgr_query_type"],
1558 404 : )
1559 404 : .expect("failed to define a metric")
1560 404 : });
1561 :
1562 404 : static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
1563 404 : register_int_counter_vec!(
1564 404 : // it's a counter, but the name is prepared to extend it to a histogram of queue depth
1565 404 : "pageserver_smgr_query_started_count",
1566 404 : "Number of smgr queries started, aggregated by query type and tenant/timeline.",
1567 404 : &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
1568 404 : )
1569 404 : .expect("failed to define a metric")
1570 404 : });
1571 :
1572 : // Alias so all histograms recording per-timeline smgr timings use the same buckets.
1573 : static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
1574 :
1575 404 : static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
1576 404 : register_histogram_vec!(
1577 404 : "pageserver_smgr_query_seconds",
1578 404 : "Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
1579 404 : &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
1580 404 : SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
1581 404 : )
1582 404 : .expect("failed to define a metric")
1583 404 : });
1584 :
1585 404 : static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
1586 404 : [
1587 404 : 1,
1588 404 : 10,
1589 404 : 20,
1590 404 : 40,
1591 404 : 60,
1592 404 : 80,
1593 404 : 100,
1594 404 : 200,
1595 404 : 300,
1596 404 : 400,
1597 404 : 500,
1598 404 : 600,
1599 404 : 700,
1600 404 : 800,
1601 404 : 900,
1602 404 : 1_000, // 1ms
1603 404 : 2_000,
1604 404 : 4_000,
1605 404 : 6_000,
1606 404 : 8_000,
1607 404 : 10_000, // 10ms
1608 404 : 20_000,
1609 404 : 40_000,
1610 404 : 60_000,
1611 404 : 80_000,
1612 404 : 100_000,
1613 404 : 200_000,
1614 404 : 400_000,
1615 404 : 600_000,
1616 404 : 800_000,
1617 404 : 1_000_000, // 1s
1618 404 : 2_000_000,
1619 404 : 4_000_000,
1620 404 : 6_000_000,
1621 404 : 8_000_000,
1622 404 : 10_000_000, // 10s
1623 404 : 20_000_000,
1624 404 : 50_000_000,
1625 404 : 100_000_000,
1626 404 : 200_000_000,
1627 404 : 1_000_000_000, // 1000s
1628 404 : ]
1629 404 : .into_iter()
1630 404 : .map(Duration::from_micros)
1631 16564 : .map(|d| d.as_secs_f64())
1632 404 : .collect()
1633 404 : });
1634 :
1635 404 : static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
1636 404 : register_histogram_vec!(
1637 404 : "pageserver_smgr_query_seconds_global",
1638 404 : "Like pageserver_smgr_query_seconds, but aggregated to instance level.",
1639 404 : &["smgr_query_type"],
1640 404 : SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
1641 404 : )
1642 404 : .expect("failed to define a metric")
1643 404 : });
1644 :
1645 404 : static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy<Vec<f64>> = Lazy::new(|| {
1646 404 : (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap())
1647 12928 : .map(|v| v.into())
1648 404 : .collect()
1649 404 : });
1650 :
1651 404 : static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
1652 404 : register_histogram!(
1653 404 : "pageserver_page_service_batch_size_global",
1654 404 : "Batch size of pageserver page service requests",
1655 404 : PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(),
1656 404 : )
1657 404 : .expect("failed to define a metric")
1658 404 : });
1659 :
1660 404 : static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy<Vec<f64>> = Lazy::new(|| {
1661 404 : let mut buckets = Vec::new();
1662 2828 : for i in 0.. {
1663 2828 : let bucket = 1 << i;
1664 2828 : if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() {
1665 404 : break;
1666 2424 : }
1667 2424 : buckets.push(bucket.into());
1668 : }
1669 404 : buckets
1670 404 : });
1671 :
1672 404 : static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
1673 404 : register_histogram_vec!(
1674 404 : "pageserver_page_service_batch_size",
1675 404 : "Batch size of pageserver page service requests",
1676 404 : &["tenant_id", "shard_id", "timeline_id"],
1677 404 : PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone()
1678 404 : )
1679 404 : .expect("failed to define a metric")
1680 404 : });
1681 :
1682 0 : pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
1683 0 : register_int_gauge_vec!(
1684 0 : "pageserver_page_service_config_max_batch_size",
1685 0 : "Configured maximum batch size for the server-side batching functionality of page_service. \
1686 0 : Labels expose more of the configuration parameters.",
1687 0 : &["mode", "execution"]
1688 0 : )
1689 0 : .expect("failed to define a metric")
1690 0 : });
1691 :
1692 0 : fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
1693 0 : PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
1694 0 : let (label_values, value) = match conf {
1695 0 : PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
1696 : PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
1697 0 : max_batch_size,
1698 0 : execution,
1699 0 : }) => {
1700 0 : let mode = "pipelined";
1701 0 : let execution = match execution {
1702 : PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
1703 0 : "concurrent-futures"
1704 : }
1705 0 : PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
1706 : };
1707 0 : ([mode, execution], max_batch_size.get())
1708 : }
1709 : };
1710 0 : PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
1711 0 : .with_label_values(&label_values)
1712 0 : .set(value.try_into().unwrap());
1713 0 : }
1714 :
1715 404 : static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
1716 404 : register_int_counter_vec!(
1717 404 : "pageserver_page_service_pagestream_flush_in_progress_micros",
1718 404 : "Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
1719 404 : If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
1720 404 : easily discoverable in monitoring. \
1721 404 : Hence, this is NOT a completion latency histogram.",
1722 404 : &["tenant_id", "shard_id", "timeline_id"],
1723 404 : )
1724 404 : .expect("failed to define a metric")
1725 404 : });
1726 :
1727 404 : static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
1728 404 : register_int_counter!(
1729 404 : "pageserver_page_service_pagestream_flush_in_progress_micros_global",
1730 404 : "Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
1731 404 : )
1732 404 : .expect("failed to define a metric")
1733 404 : });
1734 :
1735 404 : static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
1736 404 : register_histogram_vec!(
1737 404 : "pageserver_page_service_pagestream_batch_wait_time_seconds",
1738 404 : "Time a request spent waiting in its batch until the batch moved to throttle&execution.",
1739 404 : &["tenant_id", "shard_id", "timeline_id"],
1740 404 : SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
1741 404 : )
1742 404 : .expect("failed to define a metric")
1743 404 : });
1744 :
1745 404 : static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
1746 404 : register_histogram!(
1747 404 : "pageserver_page_service_pagestream_batch_wait_time_seconds_global",
1748 404 : "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
1749 404 : SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
1750 404 : )
1751 404 : .expect("failed to define a metric")
1752 404 : });
1753 :
1754 : impl SmgrQueryTimePerTimeline {
1755 896 : pub(crate) fn new(
1756 896 : tenant_shard_id: &TenantShardId,
1757 896 : timeline_id: &TimelineId,
1758 896 : pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
1759 896 : ) -> Self {
1760 896 : let tenant_id = tenant_shard_id.tenant_id.to_string();
1761 896 : let shard_slug = format!("{}", tenant_shard_id.shard_slug());
1762 896 : let timeline_id = timeline_id.to_string();
1763 5376 : let global_started = std::array::from_fn(|i| {
1764 5376 : let op = SmgrQueryType::from_repr(i).unwrap();
1765 5376 : SMGR_QUERY_STARTED_GLOBAL
1766 5376 : .get_metric_with_label_values(&[op.into()])
1767 5376 : .unwrap()
1768 5376 : });
1769 5376 : let global_latency = std::array::from_fn(|i| {
1770 5376 : let op = SmgrQueryType::from_repr(i).unwrap();
1771 5376 : SMGR_QUERY_TIME_GLOBAL
1772 5376 : .get_metric_with_label_values(&[op.into()])
1773 5376 : .unwrap()
1774 5376 : });
1775 896 :
1776 896 : let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
1777 896 : .get_metric_with_label_values(&[
1778 896 : SmgrQueryType::GetPageAtLsn.into(),
1779 896 : &tenant_id,
1780 896 : &shard_slug,
1781 896 : &timeline_id,
1782 896 : ])
1783 896 : .unwrap();
1784 896 : let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
1785 896 : .get_metric_with_label_values(&[
1786 896 : SmgrQueryType::GetPageAtLsn.into(),
1787 896 : &tenant_id,
1788 896 : &shard_slug,
1789 896 : &timeline_id,
1790 896 : ])
1791 896 : .unwrap();
1792 896 :
1793 896 : let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone();
1794 896 : let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE
1795 896 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
1796 896 : .unwrap();
1797 896 :
1798 896 : let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
1799 896 : let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
1800 896 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
1801 896 : .unwrap();
1802 896 :
1803 896 : let global_flush_in_progress_micros =
1804 896 : PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
1805 896 : let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
1806 896 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
1807 896 : .unwrap();
1808 896 :
1809 896 : Self {
1810 896 : global_started,
1811 896 : global_latency,
1812 896 : per_timeline_getpage_latency,
1813 896 : per_timeline_getpage_started,
1814 896 : global_batch_size,
1815 896 : per_timeline_batch_size,
1816 896 : global_flush_in_progress_micros,
1817 896 : per_timeline_flush_in_progress_micros,
1818 896 : global_batch_wait_time,
1819 896 : per_timeline_batch_wait_time,
1820 896 : throttling: pagestream_throttle_metrics,
1821 896 : }
1822 896 : }
1823 0 : pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
1824 0 : self.global_started[op as usize].inc();
1825 :
1826 0 : let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
1827 0 : self.per_timeline_getpage_started.inc();
1828 0 : Some(self.per_timeline_getpage_latency.clone())
1829 : } else {
1830 0 : None
1831 : };
1832 :
1833 0 : SmgrOpTimer(Some(SmgrOpTimerInner {
1834 0 : global_execution_latency_histo: self.global_latency[op as usize].clone(),
1835 0 : per_timeline_execution_latency_histo: per_timeline_latency_histo,
1836 0 : global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
1837 0 : per_timeline_flush_in_progress_micros: self
1838 0 : .per_timeline_flush_in_progress_micros
1839 0 : .clone(),
1840 0 : global_batch_wait_time: self.global_batch_wait_time.clone(),
1841 0 : per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
1842 0 : throttling: self.throttling.clone(),
1843 0 : timings: SmgrOpTimerState::Received { received_at },
1844 0 : }))
1845 0 : }
1846 :
1847 : /// TODO: reconsider this; it seems odd that we also have a similar call on SmgrOpTimer.
1848 0 : pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
1849 0 : self.global_batch_size.observe(batch_size as f64);
1850 0 : self.per_timeline_batch_size.observe(batch_size as f64);
1851 0 : }
1852 : }
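// Illustrative usage sketch (not part of the original code): a pagestream
// request handler would start the timer as soon as the request is received and
// then feed per-phase observations into it; the `timeline` binding and its
// `query_metrics` field are hypothetical stand-ins here.
//
//     let received_at = Instant::now();
//     let timer = timeline
//         .query_metrics
//         .start_smgr_op(SmgrQueryType::GetPageAtLsn, received_at);
//     // ... batching, throttling, and execution update `timer` via its
//     // observe_* methods; dropping it early still balances the counter
//     // pairs (see the `Drop` impl on `SmgrOpTimer` above).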
1853 :
1854 : // keep in sync with control plane Go code so that we can validate
1855 : // compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
1856 0 : static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
1857 0 : // Go code uses milliseconds. Variable is called `computeStartupBuckets`
1858 0 : [
1859 0 : 5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
1860 0 : 1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
1861 0 : ]
1862 0 : .map(|ms| (ms as f64) / 1000.0)
1863 0 : });
1864 :
1865 : pub(crate) struct BasebackupQueryTime {
1866 : ok: Histogram,
1867 : error: Histogram,
1868 : client_error: Histogram,
1869 : }
1870 :
1871 0 : pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
1872 0 : let vec = register_histogram_vec!(
1873 0 : "pageserver_basebackup_query_seconds",
1874 0 : "Histogram of basebackup queries durations, by result type",
1875 0 : &["result"],
1876 0 : COMPUTE_STARTUP_BUCKETS.to_vec(),
1877 0 : )
1878 0 : .expect("failed to define a metric");
1879 0 : BasebackupQueryTime {
1880 0 : ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
1881 0 : error: vec.get_metric_with_label_values(&["error"]).unwrap(),
1882 0 : client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(),
1883 0 : }
1884 0 : });
1885 :
1886 : pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> {
1887 : parent: &'a BasebackupQueryTime,
1888 : start: std::time::Instant,
1889 : }
1890 :
1891 : impl BasebackupQueryTime {
1892 0 : pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> {
1893 0 : let start = Instant::now();
1894 0 : BasebackupQueryTimeOngoingRecording {
1895 0 : parent: self,
1896 0 : start,
1897 0 : }
1898 0 : }
1899 : }
1900 :
1901 : impl BasebackupQueryTimeOngoingRecording<'_> {
1902 0 : pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
1903 0 : let elapsed = self.start.elapsed().as_secs_f64();
1904 : // If you want to change categorize of a specific error, also change it in `log_query_error`.
1905 0 : let metric = match res {
1906 0 : Ok(_) => &self.parent.ok,
1907 0 : Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
1908 0 : if is_expected_io_error(io_error) =>
1909 0 : {
1910 0 : &self.parent.client_error
1911 : }
1912 0 : Err(_) => &self.parent.error,
1913 : };
1914 0 : metric.observe(elapsed);
1915 0 : }
1916 : }
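// Illustrative usage sketch (not part of the original code): a basebackup
// handler would bracket the query with `start_recording`/`observe`, roughly
// like this; `send_basebackup` is a hypothetical helper returning
// `Result<_, QueryError>`.
//
//     let recording = BASEBACKUP_QUERY_TIME.start_recording();
//     let res = send_basebackup(/* ... */).await;
//     recording.observe(&res);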
1917 :
1918 0 : pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
1919 0 : register_int_counter_pair_vec!(
1920 0 : "pageserver_live_connections_started",
1921 0 : "Number of network connections that we started handling",
1922 0 : "pageserver_live_connections_finished",
1923 0 : "Number of network connections that we finished handling",
1924 0 : &["pageserver_connection_kind"]
1925 0 : )
1926 0 : .expect("failed to define a metric")
1927 0 : });
1928 :
1929 : #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
1930 : pub(crate) enum ComputeCommandKind {
1931 : PageStreamV3,
1932 : PageStreamV2,
1933 : Basebackup,
1934 : Fullbackup,
1935 : LeaseLsn,
1936 : }
1937 :
1938 : pub(crate) struct ComputeCommandCounters {
1939 : map: EnumMap<ComputeCommandKind, IntCounter>,
1940 : }
1941 :
1942 0 : pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy::new(|| {
1943 0 : let inner = register_int_counter_vec!(
1944 0 : "pageserver_compute_commands",
1945 0 : "Number of compute -> pageserver commands processed",
1946 0 : &["command"]
1947 0 : )
1948 0 : .expect("failed to define a metric");
1949 0 :
1950 0 : ComputeCommandCounters {
1951 0 : map: EnumMap::from_array(std::array::from_fn(|i| {
1952 0 : let command = ComputeCommandKind::from_usize(i);
1953 0 : let command_str: &'static str = command.into();
1954 0 : inner.with_label_values(&[command_str])
1955 0 : })),
1956 0 : }
1957 0 : });
1958 :
1959 : impl ComputeCommandCounters {
1960 0 : pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter {
1961 0 : &self.map[command]
1962 0 : }
1963 : }
1964 :
1965 : // remote storage metrics
1966 :
1967 396 : static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
1968 396 : register_int_counter_pair_vec!(
1969 396 : "pageserver_remote_timeline_client_calls_started",
1970 396 : "Number of started calls to remote timeline client.",
1971 396 : "pageserver_remote_timeline_client_calls_finished",
1972 396 : "Number of finshed calls to remote timeline client.",
1973 396 : "Number of finished calls to remote timeline client.",
1974 396 : "tenant_id",
1975 396 : "shard_id",
1976 396 : "timeline_id",
1977 396 : "file_kind",
1978 396 : "op_kind"
1979 396 : ],
1980 396 : )
1981 396 : .unwrap()
1982 396 : });
1983 :
1984 : static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
1985 392 : Lazy::new(|| {
1986 392 : register_int_counter_vec!(
1987 392 : "pageserver_remote_timeline_client_bytes_started",
1988 392 : "Incremented by the number of bytes associated with a remote timeline client operation. \
1989 392 : The increment happens when the operation is scheduled.",
1990 392 : &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
1991 392 : )
1992 392 : .expect("failed to define a metric")
1993 392 : });
1994 :
1995 392 : static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
1996 392 : register_int_counter_vec!(
1997 392 : "pageserver_remote_timeline_client_bytes_finished",
1998 392 : "Incremented by the number of bytes associated with a remote timeline client operation. \
1999 392 : The increment happens when the operation finishes (regardless of success/failure/shutdown).",
2000 392 : &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
2001 392 : )
2002 392 : .expect("failed to define a metric")
2003 392 : });
2004 :
2005 : pub(crate) struct TenantManagerMetrics {
2006 : tenant_slots_attached: UIntGauge,
2007 : tenant_slots_secondary: UIntGauge,
2008 : tenant_slots_inprogress: UIntGauge,
2009 : pub(crate) tenant_slot_writes: IntCounter,
2010 : pub(crate) unexpected_errors: IntCounter,
2011 : }
2012 :
2013 : impl TenantManagerMetrics {
2014 : /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects
2015 : /// exactly: they track the lifetime of the slots _in the tenant map_.
2016 4 : pub(crate) fn slot_inserted(&self, slot: &TenantSlot) {
2017 4 : match slot {
2018 0 : TenantSlot::Attached(_) => {
2019 0 : self.tenant_slots_attached.inc();
2020 0 : }
2021 0 : TenantSlot::Secondary(_) => {
2022 0 : self.tenant_slots_secondary.inc();
2023 0 : }
2024 4 : TenantSlot::InProgress(_) => {
2025 4 : self.tenant_slots_inprogress.inc();
2026 4 : }
2027 : }
2028 4 : }
2029 :
2030 4 : pub(crate) fn slot_removed(&self, slot: &TenantSlot) {
2031 4 : match slot {
2032 4 : TenantSlot::Attached(_) => {
2033 4 : self.tenant_slots_attached.dec();
2034 4 : }
2035 0 : TenantSlot::Secondary(_) => {
2036 0 : self.tenant_slots_secondary.dec();
2037 0 : }
2038 0 : TenantSlot::InProgress(_) => {
2039 0 : self.tenant_slots_inprogress.dec();
2040 0 : }
2041 : }
2042 4 : }
2043 :
2044 : #[cfg(all(debug_assertions, not(test)))]
2045 0 : pub(crate) fn slots_total(&self) -> u64 {
2046 0 : self.tenant_slots_attached.get()
2047 0 : + self.tenant_slots_secondary.get()
2048 0 : + self.tenant_slots_inprogress.get()
2049 0 : }
2050 : }
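// Illustrative usage sketch (not part of the original code): callers that
// mutate the tenant map are expected to keep these gauges in sync by pairing
// each map mutation with the corresponding metrics call; `slots` and `barrier`
// below are hypothetical stand-ins for the tenant map and an in-progress marker.
//
//     let new_slot = TenantSlot::InProgress(barrier);
//     TENANT_MANAGER.slot_inserted(&new_slot);
//     if let Some(old_slot) = slots.insert(tenant_shard_id, new_slot) {
//         TENANT_MANAGER.slot_removed(&old_slot);
//     }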
2051 :
2052 4 : pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
2053 4 : let tenant_slots = register_uint_gauge_vec!(
2054 4 : "pageserver_tenant_manager_slots",
2055 4 : "How many slots currently exist, including all attached, secondary and in-progress operations",
2056 4 : &["mode"]
2057 4 : )
2058 4 : .expect("failed to define a metric");
2059 4 : TenantManagerMetrics {
2060 4 : tenant_slots_attached: tenant_slots
2061 4 : .get_metric_with_label_values(&["attached"])
2062 4 : .unwrap(),
2063 4 : tenant_slots_secondary: tenant_slots
2064 4 : .get_metric_with_label_values(&["secondary"])
2065 4 : .unwrap(),
2066 4 : tenant_slots_inprogress: tenant_slots
2067 4 : .get_metric_with_label_values(&["inprogress"])
2068 4 : .unwrap(),
2069 4 : tenant_slot_writes: register_int_counter!(
2070 4 : "pageserver_tenant_manager_slot_writes",
2071 4 : "Writes to a tenant slot, including all of create/attach/detach/delete"
2072 4 : )
2073 4 : .expect("failed to define a metric"),
2074 4 : unexpected_errors: register_int_counter!(
2075 4 : "pageserver_tenant_manager_unexpected_errors_total",
2076 4 : "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
2077 4 : )
2078 4 : .expect("failed to define a metric"),
2079 4 : }
2080 4 : });
2081 :
2082 : pub(crate) struct DeletionQueueMetrics {
2083 : pub(crate) keys_submitted: IntCounter,
2084 : pub(crate) keys_dropped: IntCounter,
2085 : pub(crate) keys_executed: IntCounter,
2086 : pub(crate) keys_validated: IntCounter,
2087 : pub(crate) dropped_lsn_updates: IntCounter,
2088 : pub(crate) unexpected_errors: IntCounter,
2089 : pub(crate) remote_errors: IntCounterVec,
2090 : }
2091 67 : pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
2092 67 : DeletionQueueMetrics{
2093 67 :
2094 67 : keys_submitted: register_int_counter!(
2095 67 : "pageserver_deletion_queue_submitted_total",
2096 67 : "Number of objects submitted for deletion"
2097 67 : )
2098 67 : .expect("failed to define a metric"),
2099 67 :
2100 67 : keys_dropped: register_int_counter!(
2101 67 : "pageserver_deletion_queue_dropped_total",
2102 67 : "Number of object deletions dropped due to stale generation."
2103 67 : )
2104 67 : .expect("failed to define a metric"),
2105 67 :
2106 67 : keys_executed: register_int_counter!(
2107 67 : "pageserver_deletion_queue_executed_total",
2108 67 : "Number of objects deleted. Only includes objects that we actually deleted; sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
2109 67 : )
2110 67 : .expect("failed to define a metric"),
2111 67 :
2112 67 : keys_validated: register_int_counter!(
2113 67 : "pageserver_deletion_queue_validated_total",
2114 67 : "Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
2115 67 : )
2116 67 : .expect("failed to define a metric"),
2117 67 :
2118 67 : dropped_lsn_updates: register_int_counter!(
2119 67 : "pageserver_deletion_queue_dropped_lsn_updates_total",
2120 67 : "Updates to remote_consistent_lsn dropped due to stale generation number."
2121 67 : )
2122 67 : .expect("failed to define a metric"),
2123 67 : unexpected_errors: register_int_counter!(
2124 67 : "pageserver_deletion_queue_unexpected_errors_total",
2125 67 : "Number of unexpected conditions that may stall the queue: any value above zero is unexpected."
2126 67 : )
2127 67 : .expect("failed to define a metric"),
2128 67 : remote_errors: register_int_counter_vec!(
2129 67 : "pageserver_deletion_queue_remote_errors_total",
2130 67 : "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
2131 67 : &["op_kind"],
2132 67 : )
2133 67 : .expect("failed to define a metric")
2134 67 : }
2135 67 : });
2136 :
2137 : pub(crate) struct SecondaryModeMetrics {
2138 : pub(crate) upload_heatmap: IntCounter,
2139 : pub(crate) upload_heatmap_errors: IntCounter,
2140 : pub(crate) upload_heatmap_duration: Histogram,
2141 : pub(crate) download_heatmap: IntCounter,
2142 : pub(crate) download_layer: IntCounter,
2143 : }
2144 0 : pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
2145 0 : SecondaryModeMetrics {
2146 0 : upload_heatmap: register_int_counter!(
2147 0 : "pageserver_secondary_upload_heatmap",
2148 0 : "Number of heatmaps written to remote storage by attached tenants"
2149 0 : )
2150 0 : .expect("failed to define a metric"),
2151 0 : upload_heatmap_errors: register_int_counter!(
2152 0 : "pageserver_secondary_upload_heatmap_errors",
2153 0 : "Failures writing heatmap to remote storage"
2154 0 : )
2155 0 : .expect("failed to define a metric"),
2156 0 : upload_heatmap_duration: register_histogram!(
2157 0 : "pageserver_secondary_upload_heatmap_duration",
2158 0 : "Time to build and upload a heatmap, including any waiting inside the remote storage client"
2159 0 : )
2160 0 : .expect("failed to define a metric"),
2161 0 : download_heatmap: register_int_counter!(
2162 0 : "pageserver_secondary_download_heatmap",
2163 0 : "Number of downloads of heatmaps by secondary mode locations, including when the heatmap hasn't changed"
2164 0 : )
2165 0 : .expect("failed to define a metric"),
2166 0 : download_layer: register_int_counter!(
2167 0 : "pageserver_secondary_download_layer",
2168 0 : "Number of downloads of layers by secondary mode locations"
2169 0 : )
2170 0 : .expect("failed to define a metric"),
2171 0 : }
2172 0 : });
2173 :
2174 0 : pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
2175 0 : register_uint_gauge_vec!(
2176 0 : "pageserver_secondary_resident_physical_size",
2177 0 : "The size of the layer files present in the pageserver's filesystem, for secondary locations.",
2178 0 : &["tenant_id", "shard_id"]
2179 0 : )
2180 0 : .expect("failed to define a metric")
2181 0 : });
2182 :
2183 0 : pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
2184 0 : register_uint_gauge!(
2185 0 : "pageserver_utilization_score",
2186 0 : "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
2187 0 : )
2188 0 : .expect("failed to define a metric")
2189 0 : });
2190 :
2191 0 : pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
2192 0 : register_uint_gauge_vec!(
2193 0 : "pageserver_secondary_heatmap_total_size",
2194 0 : "The total size in bytes of all layers in the most recently downloaded heatmap.",
2195 0 : &["tenant_id", "shard_id"]
2196 0 : )
2197 0 : .expect("failed to define a metric")
2198 0 : });
2199 :
2200 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
2201 : pub enum RemoteOpKind {
2202 : Upload,
2203 : Download,
2204 : Delete,
2205 : }
2206 : impl RemoteOpKind {
2207 30515 : pub fn as_str(&self) -> &'static str {
2208 30515 : match self {
2209 28723 : Self::Upload => "upload",
2210 136 : Self::Download => "download",
2211 1656 : Self::Delete => "delete",
2212 : }
2213 30515 : }
2214 : }
2215 :
2216 : #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
2217 : pub enum RemoteOpFileKind {
2218 : Layer,
2219 : Index,
2220 : }
2221 : impl RemoteOpFileKind {
2222 30515 : pub fn as_str(&self) -> &'static str {
2223 30515 : match self {
2224 21416 : Self::Layer => "layer",
2225 9099 : Self::Index => "index",
2226 : }
2227 30515 : }
2228 : }
2229 :
2230 392 : pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
2231 392 : register_histogram_vec!(
2232 392 : "pageserver_remote_operation_seconds",
2233 392 : "Time spent on remote storage operations. \
2234 392 : Grouped by file_kind, op_kind and status. \
2235 392 : Does not account for time spent waiting in remote timeline client's queues.",
2236 392 : &["file_kind", "op_kind", "status"]
2237 392 : )
2238 392 : .expect("failed to define a metric")
2239 392 : });
2240 :
2241 0 : pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
2242 0 : register_int_counter_vec!(
2243 0 : "pageserver_tenant_task_events",
2244 0 : "Number of task start/stop/fail events.",
2245 0 : &["event"],
2246 0 : )
2247 0 : .expect("Failed to register tenant_task_events metric")
2248 0 : });
2249 :
2250 : pub struct BackgroundLoopSemaphoreMetrics {
2251 : counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
2252 : durations: EnumMap<BackgroundLoopKind, Histogram>,
2253 : waiting_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
2254 : running_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
2255 : }
2256 :
2257 : pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> =
2258 40 : Lazy::new(|| {
2259 40 : let counters = register_int_counter_pair_vec!(
2260 40 : "pageserver_background_loop_semaphore_wait_start_count",
2261 40 : "Counter for background loop concurrency-limiting semaphore acquire calls started",
2262 40 : "pageserver_background_loop_semaphore_wait_finish_count",
2263 40 : "Counter for background loop concurrency-limiting semaphore acquire calls finished",
2264 40 : &["task"],
2265 40 : )
2266 40 : .unwrap();
2267 40 :
2268 40 : let durations = register_histogram_vec!(
2269 40 : "pageserver_background_loop_semaphore_wait_seconds",
2270 40 : "Seconds spent waiting on background loop semaphore acquisition",
2271 40 : &["task"],
2272 40 : vec![0.01, 1.0, 5.0, 10.0, 30.0, 60.0, 180.0, 300.0, 600.0],
2273 40 : )
2274 40 : .unwrap();
2275 40 :
2276 40 : let waiting_tasks = register_int_gauge_vec!(
2277 40 : "pageserver_background_loop_semaphore_waiting_tasks",
2278 40 : "Number of background loop tasks waiting for semaphore",
2279 40 : &["task"],
2280 40 : )
2281 40 : .unwrap();
2282 40 :
2283 40 : let running_tasks = register_int_gauge_vec!(
2284 40 : "pageserver_background_loop_semaphore_running_tasks",
2285 40 : "Number of background loop tasks running concurrently",
2286 40 : &["task"],
2287 40 : )
2288 40 : .unwrap();
2289 40 :
2290 40 : BackgroundLoopSemaphoreMetrics {
2291 400 : counters: EnumMap::from_array(std::array::from_fn(|i| {
2292 400 : let kind = BackgroundLoopKind::from_usize(i);
2293 400 : counters.with_label_values(&[kind.into()])
2294 400 : })),
2295 400 : durations: EnumMap::from_array(std::array::from_fn(|i| {
2296 400 : let kind = BackgroundLoopKind::from_usize(i);
2297 400 : durations.with_label_values(&[kind.into()])
2298 400 : })),
2299 400 : waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| {
2300 400 : let kind = BackgroundLoopKind::from_usize(i);
2301 400 : waiting_tasks.with_label_values(&[kind.into()])
2302 400 : })),
2303 400 : running_tasks: EnumMap::from_array(std::array::from_fn(|i| {
2304 400 : let kind = BackgroundLoopKind::from_usize(i);
2305 400 : running_tasks.with_label_values(&[kind.into()])
2306 400 : })),
2307 40 : }
2308 40 : });
2309 :
2310 : impl BackgroundLoopSemaphoreMetrics {
2311 : /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the
2312 : /// semaphore is acquired, and drop it when the task completes or is cancelled.
2313 728 : pub(crate) fn record(
2314 728 : &self,
2315 728 : task: BackgroundLoopKind,
2316 728 : ) -> BackgroundLoopSemaphoreMetricsRecorder {
2317 728 : BackgroundLoopSemaphoreMetricsRecorder::start(self, task)
2318 728 : }
2319 : }
2320 :
2321 : /// Records metrics for a background task.
2322 : pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> {
2323 : metrics: &'a BackgroundLoopSemaphoreMetrics,
2324 : task: BackgroundLoopKind,
2325 : start: Instant,
2326 : wait_counter_guard: Option<metrics::IntCounterPairGuard>,
2327 : }
2328 :
2329 : impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> {
2330 : /// Starts recording semaphore metrics, by recording wait time and incrementing
2331 : /// `wait_start_count` and `waiting_tasks`.
2332 728 : fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self {
2333 728 : metrics.waiting_tasks[task].inc();
2334 728 : Self {
2335 728 : metrics,
2336 728 : task,
2337 728 : start: Instant::now(),
2338 728 : wait_counter_guard: Some(metrics.counters[task].guard()),
2339 728 : }
2340 728 : }
2341 :
2342 : /// Signals that the semaphore has been acquired, and updates relevant metrics.
2343 728 : pub fn acquired(&mut self) -> Duration {
2344 728 : let waited = self.start.elapsed();
2345 728 : self.wait_counter_guard.take().expect("already acquired");
2346 728 : self.metrics.durations[self.task].observe(waited.as_secs_f64());
2347 728 : self.metrics.waiting_tasks[self.task].dec();
2348 728 : self.metrics.running_tasks[self.task].inc();
2349 728 : waited
2350 728 : }
2351 : }
2352 :
2353 : impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> {
2354 : /// The task either completed or was cancelled.
2355 728 : fn drop(&mut self) {
2356 728 : if self.wait_counter_guard.take().is_some() {
2357 0 : // Waiting.
2358 0 : self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64());
2359 0 : self.metrics.waiting_tasks[self.task].dec();
2360 728 : } else {
2361 728 : // Running.
2362 728 : self.metrics.running_tasks[self.task].dec();
2363 728 : }
2364 728 : }
2365 : }
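// Illustrative usage sketch (not part of the original code): a background loop
// wraps semaphore acquisition in a recorder so wait and run times are tracked
// even if the task is cancelled while waiting; `semaphore` is a hypothetical
// `tokio::sync::Semaphore` limiting background concurrency.
//
//     let mut recorder = BACKGROUND_LOOP_SEMAPHORE.record(BackgroundLoopKind::Compaction);
//     let _permit = semaphore.acquire().await.expect("semaphore closed");
//     let waited = recorder.acquired();
//     tracing::debug!("acquired background loop semaphore after {waited:?}");
//     // ... run the background iteration; dropping `recorder` afterwards
//     // decrements `running_tasks`.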
2366 :
2367 0 : pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
2368 0 : register_int_counter_vec!(
2369 0 : "pageserver_background_loop_period_overrun_count",
2370 0 : "Incremented whenever warn_when_period_overrun() logs a warning.",
2371 0 : &["task", "period"],
2372 0 : )
2373 0 : .expect("failed to define a metric")
2374 0 : });
2375 :
2376 : // walreceiver metrics
2377 :
2378 0 : pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
2379 0 : register_int_counter!(
2380 0 : "pageserver_walreceiver_started_connections_total",
2381 0 : "Number of started walreceiver connections"
2382 0 : )
2383 0 : .expect("failed to define a metric")
2384 0 : });
2385 :
2386 0 : pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
2387 0 : register_int_gauge!(
2388 0 : "pageserver_walreceiver_active_managers",
2389 0 : "Number of active walreceiver managers"
2390 0 : )
2391 0 : .expect("failed to define a metric")
2392 0 : });
2393 :
2394 0 : pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
2395 0 : register_int_counter_vec!(
2396 0 : "pageserver_walreceiver_switches_total",
2397 0 : "Number of walreceiver manager change_connection calls",
2398 0 : &["reason"]
2399 0 : )
2400 0 : .expect("failed to define a metric")
2401 0 : });
2402 :
2403 0 : pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
2404 0 : register_int_counter!(
2405 0 : "pageserver_walreceiver_broker_updates_total",
2406 0 : "Number of received broker updates in walreceiver"
2407 0 : )
2408 0 : .expect("failed to define a metric")
2409 0 : });
2410 :
2411 4 : pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
2412 4 : register_int_counter_vec!(
2413 4 : "pageserver_walreceiver_candidates_events_total",
2414 4 : "Number of walreceiver candidate events",
2415 4 : &["event"]
2416 4 : )
2417 4 : .expect("failed to define a metric")
2418 4 : });
2419 :
2420 : pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
2421 0 : Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
2422 :
2423 : pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
2424 4 : Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
2425 :
2426 : // Metrics collected on WAL redo operations
2427 : //
2428 : // We collect the time spent in actual WAL redo ('redo'), and time waiting
2429 : // for access to the postgres process ('wait') since there is only one for
2430 : // each tenant.
2431 :
2432 : /// Time buckets are small because we want to be able to measure the
2433 : /// smallest redo processing times. These buckets allow us to measure down
2434 : /// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
2435 : /// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
2436 : ///
2437 : /// Values up to 1s are recorded because metrics show that we have redo
2438 : /// durations and lock times larger than 0.250s.
2439 : macro_rules! redo_histogram_time_buckets {
2440 : () => {
2441 : vec![
2442 : 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
2443 : 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
2444 : 1.000_000,
2445 : ]
2446 : };
2447 : }
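// Worked example for the smallest bucket above: a 5 us redo corresponds to
// 1 / 0.000_005 s = 200_000 pages/s, and with 8 KiB Postgres pages that is
// 200_000 * 8192 B ≈ 1.6 GB/s of replayed page data, matching the doc comment.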
2448 :
2449 : /// While we're at it, also measure the amount of records replayed in each
2450 : /// While we're at it, also measure the number of records replayed in each
2451 : /// as useful as 'what is the skew for how many records we replay in one
2452 : /// operation'.
2453 : macro_rules! redo_histogram_count_buckets {
2454 : () => {
2455 : vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0]
2456 : };
2457 : }
2458 :
2459 : macro_rules! redo_bytes_histogram_count_buckets {
2460 : () => {
2461 : // powers of (2^.5), from 2^4.5 to 2^15 (22 buckets)
2462 : // rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too.
2463 : vec![
2464 : 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0,
2465 : 2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0,
2466 : ]
2467 : };
2468 : }
2469 :
2470 : pub(crate) struct WalIngestMetrics {
2471 : pub(crate) bytes_received: IntCounter,
2472 : pub(crate) records_received: IntCounter,
2473 : pub(crate) records_observed: IntCounter,
2474 : pub(crate) records_committed: IntCounter,
2475 : pub(crate) records_filtered: IntCounter,
2476 : pub(crate) values_committed_metadata_images: IntCounter,
2477 : pub(crate) values_committed_metadata_deltas: IntCounter,
2478 : pub(crate) values_committed_data_images: IntCounter,
2479 : pub(crate) values_committed_data_deltas: IntCounter,
2480 : pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
2481 : }
2482 :
2483 : impl WalIngestMetrics {
2484 0 : pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
2485 0 : if stats.metadata_images > 0 {
2486 0 : self.values_committed_metadata_images
2487 0 : .inc_by(stats.metadata_images);
2488 0 : }
2489 0 : if stats.metadata_deltas > 0 {
2490 0 : self.values_committed_metadata_deltas
2491 0 : .inc_by(stats.metadata_deltas);
2492 0 : }
2493 0 : if stats.data_images > 0 {
2494 0 : self.values_committed_data_images.inc_by(stats.data_images);
2495 0 : }
2496 0 : if stats.data_deltas > 0 {
2497 0 : self.values_committed_data_deltas.inc_by(stats.data_deltas);
2498 0 : }
2499 0 : }
2500 : }
2501 :
2502 20 : pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
2503 20 : let values_committed = register_int_counter_vec!(
2504 20 : "pageserver_wal_ingest_values_committed",
2505 20 : "Number of values committed to pageserver storage from WAL records",
2506 20 : &["class", "kind"],
2507 20 : )
2508 20 : .expect("failed to define a metric");
2509 20 :
2510 20 : WalIngestMetrics {
2511 20 : bytes_received: register_int_counter!(
2512 20 : "pageserver_wal_ingest_bytes_received",
2513 20 : "Bytes of WAL ingested from safekeepers",
2514 20 : )
2515 20 : .unwrap(),
2516 20 : records_received: register_int_counter!(
2517 20 : "pageserver_wal_ingest_records_received",
2518 20 : "Number of WAL records received from safekeepers"
2519 20 : )
2520 20 : .expect("failed to define a metric"),
2521 20 : records_observed: register_int_counter!(
2522 20 : "pageserver_wal_ingest_records_observed",
2523 20 : "Number of WAL records observed from safekeepers. These are metadata only records for shard 0."
2524 20 : )
2525 20 : .expect("failed to define a metric"),
2526 20 : records_committed: register_int_counter!(
2527 20 : "pageserver_wal_ingest_records_committed",
2528 20 : "Number of WAL records which resulted in writes to pageserver storage"
2529 20 : )
2530 20 : .expect("failed to define a metric"),
2531 20 : records_filtered: register_int_counter!(
2532 20 : "pageserver_wal_ingest_records_filtered",
2533 20 : "Number of WAL records filtered out due to sharding"
2534 20 : )
2535 20 : .expect("failed to define a metric"),
2536 20 : values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
2537 20 : values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
2538 20 : values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
2539 20 : values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
2540 20 : gap_blocks_zeroed_on_rel_extend: register_int_counter!(
2541 20 : "pageserver_gap_blocks_zeroed_on_rel_extend",
2542 20 : "Total number of zero gap blocks written on relation extends"
2543 20 : )
2544 20 : .expect("failed to define a metric"),
2545 20 : }
2546 20 : });
2547 :
2548 404 : pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
2549 404 : register_int_counter_vec!(
2550 404 : "pageserver_timeline_wal_records_received",
2551 404 : "Number of WAL records received per shard",
2552 404 : &["tenant_id", "shard_id", "timeline_id"]
2553 404 : )
2554 404 : .expect("failed to define a metric")
2555 404 : });
2556 :
2557 12 : pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
2558 12 : register_histogram!(
2559 12 : "pageserver_wal_redo_seconds",
2560 12 : "Time spent on WAL redo",
2561 12 : redo_histogram_time_buckets!()
2562 12 : )
2563 12 : .expect("failed to define a metric")
2564 12 : });
2565 :
2566 12 : pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2567 12 : register_histogram!(
2568 12 : "pageserver_wal_redo_records_histogram",
2569 12 : "Histogram of number of records replayed per redo in the Postgres WAL redo process",
2570 12 : redo_histogram_count_buckets!(),
2571 12 : )
2572 12 : .expect("failed to define a metric")
2573 12 : });
2574 :
2575 12 : pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2576 12 : register_histogram!(
2577 12 : "pageserver_wal_redo_bytes_histogram",
2578 12 : "Histogram of number of bytes sent to the Postgres WAL redo process per redo request",
2579 12 : redo_bytes_histogram_count_buckets!(),
2580 12 : )
2581 12 : .expect("failed to define a metric")
2582 12 : });
2583 :
2584 : // FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
2585 12 : pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
2586 12 : register_int_counter!(
2587 12 : "pageserver_replayed_wal_records_total",
2588 12 : "Number of WAL records replayed in WAL redo process"
2589 12 : )
2590 12 : .unwrap()
2591 12 : });
2592 :
2593 : #[rustfmt::skip]
2594 16 : pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2595 16 : register_histogram!(
2596 16 : "pageserver_wal_redo_process_launch_duration",
2597 16 : "Histogram of the duration of successful WalRedoProcess::launch calls",
2598 16 : vec![
2599 16 : 0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
2600 16 : 0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
2601 16 : 0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
2602 16 : 0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
2603 16 : 1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
2604 16 : ],
2605 16 : )
2606 16 : .expect("failed to define a metric")
2607 16 : });
2608 :
2609 : pub(crate) struct WalRedoProcessCounters {
2610 : pub(crate) started: IntCounter,
2611 : pub(crate) killed_by_cause: EnumMap<WalRedoKillCause, IntCounter>,
2612 : pub(crate) active_stderr_logger_tasks_started: IntCounter,
2613 : pub(crate) active_stderr_logger_tasks_finished: IntCounter,
2614 : }
2615 :
2616 : #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
2617 : pub(crate) enum WalRedoKillCause {
2618 : WalRedoProcessDrop,
2619 : NoLeakChildDrop,
2620 : Startup,
2621 : }
2622 :
2623 : impl Default for WalRedoProcessCounters {
2624 16 : fn default() -> Self {
2625 16 : let started = register_int_counter!(
2626 16 : "pageserver_wal_redo_process_started_total",
2627 16 : "Number of WAL redo processes started",
2628 16 : )
2629 16 : .unwrap();
2630 16 :
2631 16 : let killed = register_int_counter_vec!(
2632 16 : "pageserver_wal_redo_process_stopped_total",
2633 16 : "Number of WAL redo processes stopped",
2634 16 : &["cause"],
2635 16 : )
2636 16 : .unwrap();
2637 16 :
2638 16 : let active_stderr_logger_tasks_started = register_int_counter!(
2639 16 : "pageserver_walredo_stderr_logger_tasks_started_total",
2640 16 : "Number of active walredo stderr logger tasks that have started",
2641 16 : )
2642 16 : .unwrap();
2643 16 :
2644 16 : let active_stderr_logger_tasks_finished = register_int_counter!(
2645 16 : "pageserver_walredo_stderr_logger_tasks_finished_total",
2646 16 : "Number of active walredo stderr logger tasks that have finished",
2647 16 : )
2648 16 : .unwrap();
2649 16 :
2650 16 : Self {
2651 16 : started,
2652 48 : killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
2653 48 : let cause = WalRedoKillCause::from_usize(i);
2654 48 : let cause_str: &'static str = cause.into();
2655 48 : killed.with_label_values(&[cause_str])
2656 48 : })),
2657 16 : active_stderr_logger_tasks_started,
2658 16 : active_stderr_logger_tasks_finished,
2659 16 : }
2660 16 : }
2661 : }
2662 :
2663 : pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
2664 : Lazy::new(WalRedoProcessCounters::default);
2665 :
2666 : /// Similar to `prometheus::HistogramTimer` but does not record on drop.
2667 : pub(crate) struct StorageTimeMetricsTimer {
2668 : metrics: StorageTimeMetrics,
2669 : start: Instant,
2670 : }
2671 :
2672 : impl StorageTimeMetricsTimer {
2673 4244 : fn new(metrics: StorageTimeMetrics) -> Self {
2674 4244 : Self {
2675 4244 : metrics,
2676 4244 : start: Instant::now(),
2677 4244 : }
2678 4244 : }
2679 :
2680 : /// Returns the elapsed duration of the timer.
2681 4244 : pub fn elapsed(&self) -> Duration {
2682 4244 : self.start.elapsed()
2683 4244 : }
2684 :
2685 : /// Record the time from creation to now and return it.
2686 4244 : pub fn stop_and_record(self) -> Duration {
2687 4244 : let duration = self.elapsed();
2688 4244 : let seconds = duration.as_secs_f64();
2689 4244 : self.metrics.timeline_sum.inc_by(seconds);
2690 4244 : self.metrics.timeline_count.inc();
2691 4244 : self.metrics.global_histogram.observe(seconds);
2692 4244 : duration
2693 4244 : }
2694 :
2695 : /// Converts this timer into one that always records on drop -- useful for recording
2696 : /// even when an early `?` return path is taken in a function.
2697 8 : pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer {
2698 8 : AlwaysRecordingStorageTimeMetricsTimer(Some(self))
2699 8 : }
2700 : }
2701 :
2702 : pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
2703 :
2704 : impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
2705 8 : fn drop(&mut self) {
2706 8 : if let Some(inner) = self.0.take() {
2707 8 : inner.stop_and_record();
2708 8 : }
2709 8 : }
2710 : }
2711 :
2712 : impl AlwaysRecordingStorageTimeMetricsTimer {
2713 : /// Returns the elapsed duration of the timer.
2714 0 : pub fn elapsed(&self) -> Duration {
2715 0 : self.0.as_ref().expect("not dropped yet").elapsed()
2716 0 : }
2717 : }
2718 :
2719 : /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
2720 : /// Timing facilities for a globally histogrammed metric, supplemented by a per-tenant-and-timeline
2721 : /// total sum and count.
2722 : pub(crate) struct StorageTimeMetrics {
2723 : /// Sum of f64 seconds, per operation, tenant_id and timeline_id
2724 : timeline_sum: Counter,
2725 : /// Number of oeprations, per operation, tenant_id and timeline_id
2726 : /// Number of operations, per operation, tenant_id and timeline_id
2727 : /// Global histogram having only the "operation" label.
2728 : global_histogram: Histogram,
2729 : }
2730 :
2731 : impl StorageTimeMetrics {
2732 8064 : pub fn new(
2733 8064 : operation: StorageTimeOperation,
2734 8064 : tenant_id: &str,
2735 8064 : shard_id: &str,
2736 8064 : timeline_id: &str,
2737 8064 : ) -> Self {
2738 8064 : let operation: &'static str = operation.into();
2739 8064 :
2740 8064 : let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
2741 8064 : .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
2742 8064 : .unwrap();
2743 8064 : let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
2744 8064 : .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
2745 8064 : .unwrap();
2746 8064 : let global_histogram = STORAGE_TIME_GLOBAL
2747 8064 : .get_metric_with_label_values(&[operation])
2748 8064 : .unwrap();
2749 8064 :
2750 8064 : StorageTimeMetrics {
2751 8064 : timeline_sum,
2752 8064 : timeline_count,
2753 8064 : global_histogram,
2754 8064 : }
2755 8064 : }
2756 :
2757 : /// Starts timing a new operation.
2758 : ///
2759 : /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
2760 4244 : pub fn start_timer(&self) -> StorageTimeMetricsTimer {
2761 4244 : StorageTimeMetricsTimer::new(self.clone())
2762 4244 : }
2763 : }
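// Illustrative usage sketch (not part of the original code): a storage
// operation brackets its work with a timer and records explicitly, or converts
// to the always-recording variant when early returns should still be counted;
// `metrics` is a hypothetical `StorageTimeMetrics` handle (e.g.
// `TimelineMetrics::compact_time_histo`).
//
//     let timer = metrics.start_timer();
//     do_compaction()?; // hypothetical fallible work
//     let elapsed = timer.stop_and_record();
//
//     // or, to record even on an early `?` return:
//     let _timer = metrics.start_timer().record_on_drop();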
2764 :
2765 : #[derive(Debug)]
2766 : pub(crate) struct TimelineMetrics {
2767 : tenant_id: String,
2768 : shard_id: String,
2769 : timeline_id: String,
2770 : pub flush_time_histo: StorageTimeMetrics,
2771 : pub flush_delay_histo: StorageTimeMetrics,
2772 : pub flush_wait_upload_time_gauge: Gauge,
2773 : pub compact_time_histo: StorageTimeMetrics,
2774 : pub create_images_time_histo: StorageTimeMetrics,
2775 : pub logical_size_histo: StorageTimeMetrics,
2776 : pub imitate_logical_size_histo: StorageTimeMetrics,
2777 : pub load_layer_map_histo: StorageTimeMetrics,
2778 : pub garbage_collect_histo: StorageTimeMetrics,
2779 : pub find_gc_cutoffs_histo: StorageTimeMetrics,
2780 : pub last_record_lsn_gauge: IntGauge,
2781 : pub disk_consistent_lsn_gauge: IntGauge,
2782 : pub pitr_history_size: UIntGauge,
2783 : pub archival_size: UIntGauge,
2784 : pub layers_per_read: Histogram,
2785 : pub standby_horizon_gauge: IntGauge,
2786 : pub resident_physical_size_gauge: UIntGauge,
2787 : pub visible_physical_size_gauge: UIntGauge,
2788 : /// copy of LayeredTimeline.current_logical_size
2789 : pub current_logical_size_gauge: UIntGauge,
2790 : pub aux_file_size_gauge: IntGauge,
2791 : pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
2792 : pub evictions: IntCounter,
2793 : pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
2794 : /// Number of valid LSN leases.
2795 : pub valid_lsn_lease_count_gauge: UIntGauge,
2796 : pub wal_records_received: IntCounter,
2797 : shutdown: std::sync::atomic::AtomicBool,
2798 : }
2799 :
2800 : impl TimelineMetrics {
2801 896 : pub fn new(
2802 896 : tenant_shard_id: &TenantShardId,
2803 896 : timeline_id_raw: &TimelineId,
2804 896 : evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
2805 896 : ) -> Self {
2806 896 : let tenant_id = tenant_shard_id.tenant_id.to_string();
2807 896 : let shard_id = format!("{}", tenant_shard_id.shard_slug());
2808 896 : let timeline_id = timeline_id_raw.to_string();
2809 896 : let flush_time_histo = StorageTimeMetrics::new(
2810 896 : StorageTimeOperation::LayerFlush,
2811 896 : &tenant_id,
2812 896 : &shard_id,
2813 896 : &timeline_id,
2814 896 : );
2815 896 : let flush_delay_histo = StorageTimeMetrics::new(
2816 896 : StorageTimeOperation::LayerFlushDelay,
2817 896 : &tenant_id,
2818 896 : &shard_id,
2819 896 : &timeline_id,
2820 896 : );
2821 896 : let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
2822 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2823 896 : .unwrap();
2824 896 : let compact_time_histo = StorageTimeMetrics::new(
2825 896 : StorageTimeOperation::Compact,
2826 896 : &tenant_id,
2827 896 : &shard_id,
2828 896 : &timeline_id,
2829 896 : );
2830 896 : let create_images_time_histo = StorageTimeMetrics::new(
2831 896 : StorageTimeOperation::CreateImages,
2832 896 : &tenant_id,
2833 896 : &shard_id,
2834 896 : &timeline_id,
2835 896 : );
2836 896 : let logical_size_histo = StorageTimeMetrics::new(
2837 896 : StorageTimeOperation::LogicalSize,
2838 896 : &tenant_id,
2839 896 : &shard_id,
2840 896 : &timeline_id,
2841 896 : );
2842 896 : let imitate_logical_size_histo = StorageTimeMetrics::new(
2843 896 : StorageTimeOperation::ImitateLogicalSize,
2844 896 : &tenant_id,
2845 896 : &shard_id,
2846 896 : &timeline_id,
2847 896 : );
2848 896 : let load_layer_map_histo = StorageTimeMetrics::new(
2849 896 : StorageTimeOperation::LoadLayerMap,
2850 896 : &tenant_id,
2851 896 : &shard_id,
2852 896 : &timeline_id,
2853 896 : );
2854 896 : let garbage_collect_histo = StorageTimeMetrics::new(
2855 896 : StorageTimeOperation::Gc,
2856 896 : &tenant_id,
2857 896 : &shard_id,
2858 896 : &timeline_id,
2859 896 : );
2860 896 : let find_gc_cutoffs_histo = StorageTimeMetrics::new(
2861 896 : StorageTimeOperation::FindGcCutoffs,
2862 896 : &tenant_id,
2863 896 : &shard_id,
2864 896 : &timeline_id,
2865 896 : );
2866 896 : let last_record_lsn_gauge = LAST_RECORD_LSN
2867 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2868 896 : .unwrap();
2869 896 :
2870 896 : let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
2871 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2872 896 : .unwrap();
2873 896 :
2874 896 : let pitr_history_size = PITR_HISTORY_SIZE
2875 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2876 896 : .unwrap();
2877 896 :
2878 896 : let archival_size = TIMELINE_ARCHIVE_SIZE
2879 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2880 896 : .unwrap();
2881 896 :
2882 896 : let layers_per_read = LAYERS_PER_READ
2883 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2884 896 : .unwrap();
2885 896 :
2886 896 : let standby_horizon_gauge = STANDBY_HORIZON
2887 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2888 896 : .unwrap();
2889 896 : let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
2890 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2891 896 : .unwrap();
2892 896 : let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
2893 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2894 896 : .unwrap();
2895 896 : // TODO: we shouldn't expose this metric
2896 896 : let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
2897 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2898 896 : .unwrap();
2899 896 : let aux_file_size_gauge = AUX_FILE_SIZE
2900 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2901 896 : .unwrap();
2902 896 : // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
2903 896 : let directory_entries_count_gauge_closure = {
2904 896 : let tenant_shard_id = *tenant_shard_id;
2905 896 : let timeline_id_raw = *timeline_id_raw;
2906 0 : move || {
2907 0 : let tenant_id = tenant_shard_id.tenant_id.to_string();
2908 0 : let shard_id = format!("{}", tenant_shard_id.shard_slug());
2909 0 : let timeline_id = timeline_id_raw.to_string();
2910 0 : let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT
2911 0 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2912 0 : .unwrap();
2913 0 : gauge
2914 0 : }
2915 : };
2916 896 : let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
2917 896 : Lazy::new(Box::new(directory_entries_count_gauge_closure));
2918 896 : let evictions = EVICTIONS
2919 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2920 896 : .unwrap();
2921 896 : let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
2922 896 : .build(&tenant_id, &shard_id, &timeline_id);
2923 896 :
2924 896 : let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
2925 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2926 896 : .unwrap();
2927 896 :
2928 896 : let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
2929 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2930 896 : .unwrap();
2931 896 :
2932 896 : TimelineMetrics {
2933 896 : tenant_id,
2934 896 : shard_id,
2935 896 : timeline_id,
2936 896 : flush_time_histo,
2937 896 : flush_delay_histo,
2938 896 : flush_wait_upload_time_gauge,
2939 896 : compact_time_histo,
2940 896 : create_images_time_histo,
2941 896 : logical_size_histo,
2942 896 : imitate_logical_size_histo,
2943 896 : garbage_collect_histo,
2944 896 : find_gc_cutoffs_histo,
2945 896 : load_layer_map_histo,
2946 896 : last_record_lsn_gauge,
2947 896 : disk_consistent_lsn_gauge,
2948 896 : pitr_history_size,
2949 896 : archival_size,
2950 896 : layers_per_read,
2951 896 : standby_horizon_gauge,
2952 896 : resident_physical_size_gauge,
2953 896 : visible_physical_size_gauge,
2954 896 : current_logical_size_gauge,
2955 896 : aux_file_size_gauge,
2956 896 : directory_entries_count_gauge,
2957 896 : evictions,
2958 896 : evictions_with_low_residence_duration: std::sync::RwLock::new(
2959 896 : evictions_with_low_residence_duration,
2960 896 : ),
2961 896 : valid_lsn_lease_count_gauge,
2962 896 : wal_records_received,
2963 896 : shutdown: std::sync::atomic::AtomicBool::default(),
2964 896 : }
2965 896 : }
2966 :
2967 3148 : pub(crate) fn record_new_file_metrics(&self, sz: u64) {
2968 3148 : self.resident_physical_size_add(sz);
2969 3148 : }
2970 :
2971 1067 : pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
2972 1067 : self.resident_physical_size_gauge.sub(sz);
2973 1067 : crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
2974 1067 : }
2975 :
2976 3420 : pub(crate) fn resident_physical_size_add(&self, sz: u64) {
2977 3420 : self.resident_physical_size_gauge.add(sz);
2978 3420 : crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
2979 3420 : }
2980 :
2981 20 : pub(crate) fn resident_physical_size_get(&self) -> u64 {
2982 20 : self.resident_physical_size_gauge.get()
2983 20 : }
2984 :
2985 2348 : pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
2986 2348 : self.flush_wait_upload_time_gauge.add(duration);
2987 2348 : crate::metrics::FLUSH_WAIT_UPLOAD_TIME
2988 2348 : .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id])
2989 2348 : .unwrap()
2990 2348 : .add(duration);
2991 2348 : }
2992 :
2993 : /// Generates TIMELINE_LAYER labels for a persistent layer.
2994 5226 : fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] {
2995 5226 : let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) {
2996 2848 : true => LayerLevel::L0,
2997 2378 : false => LayerLevel::L1,
2998 : };
2999 5226 : let kind = match layer_desc.is_delta() {
3000 4359 : true => LayerKind::Delta,
3001 867 : false => LayerKind::Image,
3002 : };
3003 5226 : [
3004 5226 : &self.tenant_id,
3005 5226 : &self.shard_id,
3006 5226 : &self.timeline_id,
3007 5226 : level.into(),
3008 5226 : kind.into(),
3009 5226 : ]
3010 5226 : }
3011 :
3012 : /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer.
3013 4696 : fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] {
3014 4696 : [
3015 4696 : &self.tenant_id,
3016 4696 : &self.shard_id,
3017 4696 : &self.timeline_id,
3018 4696 : LayerLevel::Frozen.into(),
3019 4696 : LayerKind::Delta.into(), // by definition
3020 4696 : ]
3021 4696 : }
3022 :
3023 : /// Removes a frozen ephemeral layer from TIMELINE_LAYER metrics.
3024 2348 : pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
3025 2348 : assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
3026 2348 : let labels = self.make_frozen_layer_labels(layer);
3027 2348 : let size = layer.try_len().expect("frozen layer should have no writer");
3028 2348 : TIMELINE_LAYER_COUNT
3029 2348 : .get_metric_with_label_values(&labels)
3030 2348 : .unwrap()
3031 2348 : .dec();
3032 2348 : TIMELINE_LAYER_SIZE
3033 2348 : .get_metric_with_label_values(&labels)
3034 2348 : .unwrap()
3035 2348 : .sub(size);
3036 2348 : }
3037 :
3038 : /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics.
3039 2348 : pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
3040 2348 : assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
3041 2348 : let labels = self.make_frozen_layer_labels(layer);
3042 2348 : let size = layer.try_len().expect("frozen layer should have no writer");
3043 2348 : TIMELINE_LAYER_COUNT
3044 2348 : .get_metric_with_label_values(&labels)
3045 2348 : .unwrap()
3046 2348 : .inc();
3047 2348 : TIMELINE_LAYER_SIZE
3048 2348 : .get_metric_with_label_values(&labels)
3049 2348 : .unwrap()
3050 2348 : .add(size);
3051 2348 : }
3052 :
3053 : /// Removes a persistent layer from TIMELINE_LAYER metrics.
3054 1382 : pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) {
3055 1382 : let labels = self.make_layer_labels(layer_desc);
3056 1382 : TIMELINE_LAYER_COUNT
3057 1382 : .get_metric_with_label_values(&labels)
3058 1382 : .unwrap()
3059 1382 : .dec();
3060 1382 : TIMELINE_LAYER_SIZE
3061 1382 : .get_metric_with_label_values(&labels)
3062 1382 : .unwrap()
3063 1382 : .sub(layer_desc.file_size);
3064 1382 : }
3065 :
3066 : /// Adds a persistent layer to TIMELINE_LAYER metrics.
3067 3844 : pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) {
3068 3844 : let labels = self.make_layer_labels(layer_desc);
3069 3844 : TIMELINE_LAYER_COUNT
3070 3844 : .get_metric_with_label_values(&labels)
3071 3844 : .unwrap()
3072 3844 : .inc();
3073 3844 : TIMELINE_LAYER_SIZE
3074 3844 : .get_metric_with_label_values(&labels)
3075 3844 : .unwrap()
3076 3844 : .add(layer_desc.file_size);
3077 3844 : }
3078 :
3079 20 : pub(crate) fn shutdown(&self) {
3080 20 : let was_shutdown = self
3081 20 : .shutdown
3082 20 : .swap(true, std::sync::atomic::Ordering::Relaxed);
3083 20 :
3084 20 : if was_shutdown {
3085 : // this happens on tenant deletion because tenant first shuts down timelines, then
3086 : // invokes timeline deletion which first shuts down the timeline again.
3087 : // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 is resolved.
3088 0 : return;
3089 20 : }
3090 20 :
3091 20 : let tenant_id = &self.tenant_id;
3092 20 : let timeline_id = &self.timeline_id;
3093 20 : let shard_id = &self.shard_id;
3094 20 : let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3095 20 : let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3096 20 : let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3097 20 : let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3098 20 : {
3099 20 : RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
3100 20 : let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3101 20 : }
3102 20 : let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3103 20 : let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3104 20 : if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
3105 0 : let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3106 20 : }
3107 :
3108 20 : let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3109 20 : let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3110 :
3111 80 : for ref level in LayerLevel::iter() {
3112 180 : for ref kind in LayerKind::iter() {
3113 120 : let labels: [&str; 5] =
3114 120 : [tenant_id, shard_id, timeline_id, level.into(), kind.into()];
3115 120 : let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels);
3116 120 : let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels);
3117 120 : }
3118 : }
3119 :
3120 20 : let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3121 20 :
3122 20 : let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3123 20 : let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3124 20 : let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3125 20 :
3126 20 : self.evictions_with_low_residence_duration
3127 20 : .write()
3128 20 : .unwrap()
3129 20 : .remove(tenant_id, shard_id, timeline_id);
3130 :
3131 : // The following metrics are born outside of the TimelineMetrics lifecycle but still
3132 : // removed at the end of it. The idea is to have the metrics outlive the
3133 : // entity during whose lifetime they're observed, e.g., the smgr metrics shall
3134 : // outlive an individual smgr connection, but not the timeline.
3135 :
3136 200 : for op in StorageTimeOperation::VARIANTS {
3137 180 : let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
3138 180 : op,
3139 180 : tenant_id,
3140 180 : shard_id,
3141 180 : timeline_id,
3142 180 : ]);
3143 180 : let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
3144 180 : op,
3145 180 : tenant_id,
3146 180 : shard_id,
3147 180 : timeline_id,
3148 180 : ]);
3149 180 : }
3150 :
3151 60 : for op in STORAGE_IO_SIZE_OPERATIONS {
3152 40 : let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
3153 40 : }
3154 :
3155 20 : let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
3156 20 : SmgrQueryType::GetPageAtLsn.into(),
3157 20 : tenant_id,
3158 20 : shard_id,
3159 20 : timeline_id,
3160 20 : ]);
3161 20 : let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
3162 20 : SmgrQueryType::GetPageAtLsn.into(),
3163 20 : tenant_id,
3164 20 : shard_id,
3165 20 : timeline_id,
3166 20 : ]);
3167 20 : let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[
3168 20 : tenant_id,
3169 20 : shard_id,
3170 20 : timeline_id,
3171 20 : ]);
3172 20 : let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
3173 20 : tenant_id,
3174 20 : shard_id,
3175 20 : timeline_id,
3176 20 : ]);
3177 20 : let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
3178 20 : tenant_id,
3179 20 : shard_id,
3180 20 : timeline_id,
3181 20 : ]);
3182 20 : let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
3183 20 : tenant_id,
3184 20 : shard_id,
3185 20 : timeline_id,
3186 20 : ]);
3187 20 : }
3188 : }
3189 :
3190 12 : pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
3191 12 : // Only shard zero deals in synthetic sizes
3192 12 : if tenant_shard_id.is_shard_zero() {
3193 12 : let tid = tenant_shard_id.tenant_id.to_string();
3194 12 : let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
3195 12 : }
3196 :
3197 12 : tenant_throttling::remove_tenant_metrics(tenant_shard_id);
3198 12 :
3199 12 : // we leave the BROKEN_TENANTS_SET entry if any
3200 12 : }
3201 :
3202 : /// Maintain a per timeline gauge in addition to the global gauge.
3203 : pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
3204 : last_set: AtomicU64,
3205 : gauge: UIntGauge,
3206 : }
3207 :
3208 : impl PerTimelineRemotePhysicalSizeGauge {
3209 916 : fn new(per_timeline_gauge: UIntGauge) -> Self {
3210 916 : Self {
3211 916 : last_set: AtomicU64::new(0),
3212 916 : gauge: per_timeline_gauge,
3213 916 : }
3214 916 : }
3215 3861 : pub(crate) fn set(&self, sz: u64) {
3216 3861 : self.gauge.set(sz);
3217 3861 : let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
3218 3861 : if sz < prev {
3219 71 : REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
3220 3790 : } else {
3221 3790 : REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
3222 3790 : };
3223 3861 : }
3224 4 : pub(crate) fn get(&self) -> u64 {
3225 4 : self.gauge.get()
3226 4 : }
3227 : }
3228 :
3229 : impl Drop for PerTimelineRemotePhysicalSizeGauge {
3230 40 : fn drop(&mut self) {
3231 40 : REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
3232 40 : }
3233 : }
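// Editorial note (not part of the original source): `set()` keeps
// REMOTE_PHYSICAL_SIZE_GLOBAL in sync by applying only the delta against the last
// value this timeline reported, and `Drop` rolls back whatever was last contributed.
// A minimal worked example, assuming a gauge freshly built by
// `PerTimelineRemotePhysicalSizeGauge::new`:
//
//     gauge.set(100); // per-timeline = 100, global += 100 (previous value was 0)
//     gauge.set(250); // per-timeline = 250, global += 150 (delta vs. 100)
//     gauge.set(200); // per-timeline = 200, global -=  50 (delta vs. 250)
//     drop(gauge);    // global -= 200, i.e. the last value this timeline contributed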
3234 :
3235 : pub(crate) struct RemoteTimelineClientMetrics {
3236 : tenant_id: String,
3237 : shard_id: String,
3238 : timeline_id: String,
3239 : pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
3240 : calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
3241 : bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
3242 : bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
3243 : pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
3244 : }
3245 :
3246 : impl RemoteTimelineClientMetrics {
3247 916 : pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
3248 916 : let tenant_id_str = tenant_shard_id.tenant_id.to_string();
3249 916 : let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
3250 916 : let timeline_id_str = timeline_id.to_string();
3251 916 :
3252 916 : let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
3253 916 : REMOTE_PHYSICAL_SIZE
3254 916 : .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
3255 916 : .unwrap(),
3256 916 : );
3257 916 :
3258 916 : let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
3259 916 : .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
3260 916 : .unwrap();
3261 916 :
3262 916 : RemoteTimelineClientMetrics {
3263 916 : tenant_id: tenant_id_str,
3264 916 : shard_id: shard_id_str,
3265 916 : timeline_id: timeline_id_str,
3266 916 : calls: Mutex::new(HashMap::default()),
3267 916 : bytes_started_counter: Mutex::new(HashMap::default()),
3268 916 : bytes_finished_counter: Mutex::new(HashMap::default()),
3269 916 : remote_physical_size_gauge,
3270 916 : projected_remote_consistent_lsn_gauge,
3271 916 : }
3272 916 : }
3273 :
3274 6125 : pub fn remote_operation_time(
3275 6125 : &self,
3276 6125 : file_kind: &RemoteOpFileKind,
3277 6125 : op_kind: &RemoteOpKind,
3278 6125 : status: &'static str,
3279 6125 : ) -> Histogram {
3280 6125 : let key = (file_kind.as_str(), op_kind.as_str(), status);
3281 6125 : REMOTE_OPERATION_TIME
3282 6125 : .get_metric_with_label_values(&[key.0, key.1, key.2])
3283 6125 : .unwrap()
3284 6125 : }
3285 :
3286 14338 : fn calls_counter_pair(
3287 14338 : &self,
3288 14338 : file_kind: &RemoteOpFileKind,
3289 14338 : op_kind: &RemoteOpKind,
3290 14338 : ) -> IntCounterPair {
3291 14338 : let mut guard = self.calls.lock().unwrap();
3292 14338 : let key = (file_kind.as_str(), op_kind.as_str());
3293 14338 : let metric = guard.entry(key).or_insert_with(move || {
3294 1646 : REMOTE_TIMELINE_CLIENT_CALLS
3295 1646 : .get_metric_with_label_values(&[
3296 1646 : &self.tenant_id,
3297 1646 : &self.shard_id,
3298 1646 : &self.timeline_id,
3299 1646 : key.0,
3300 1646 : key.1,
3301 1646 : ])
3302 1646 : .unwrap()
3303 14338 : });
3304 14338 : metric.clone()
3305 14338 : }
3306 :
3307 3468 : fn bytes_started_counter(
3308 3468 : &self,
3309 3468 : file_kind: &RemoteOpFileKind,
3310 3468 : op_kind: &RemoteOpKind,
3311 3468 : ) -> IntCounter {
3312 3468 : let mut guard = self.bytes_started_counter.lock().unwrap();
3313 3468 : let key = (file_kind.as_str(), op_kind.as_str());
3314 3468 : let metric = guard.entry(key).or_insert_with(move || {
3315 648 : REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
3316 648 : .get_metric_with_label_values(&[
3317 648 : &self.tenant_id,
3318 648 : &self.shard_id,
3319 648 : &self.timeline_id,
3320 648 : key.0,
3321 648 : key.1,
3322 648 : ])
3323 648 : .unwrap()
3324 3468 : });
3325 3468 : metric.clone()
3326 3468 : }
3327 :
3328 6560 : fn bytes_finished_counter(
3329 6560 : &self,
3330 6560 : file_kind: &RemoteOpFileKind,
3331 6560 : op_kind: &RemoteOpKind,
3332 6560 : ) -> IntCounter {
3333 6560 : let mut guard = self.bytes_finished_counter.lock().unwrap();
3334 6560 : let key = (file_kind.as_str(), op_kind.as_str());
3335 6560 : let metric = guard.entry(key).or_insert_with(move || {
3336 648 : REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
3337 648 : .get_metric_with_label_values(&[
3338 648 : &self.tenant_id,
3339 648 : &self.shard_id,
3340 648 : &self.timeline_id,
3341 648 : key.0,
3342 648 : key.1,
3343 648 : ])
3344 648 : .unwrap()
3345 6560 : });
3346 6560 : metric.clone()
3347 6560 : }
3348 : }
3349 :
3350 : #[cfg(test)]
3351 : impl RemoteTimelineClientMetrics {
3352 12 : pub fn get_bytes_started_counter_value(
3353 12 : &self,
3354 12 : file_kind: &RemoteOpFileKind,
3355 12 : op_kind: &RemoteOpKind,
3356 12 : ) -> Option<u64> {
3357 12 : let guard = self.bytes_started_counter.lock().unwrap();
3358 12 : let key = (file_kind.as_str(), op_kind.as_str());
3359 12 : guard.get(&key).map(|counter| counter.get())
3360 12 : }
3361 :
3362 12 : pub fn get_bytes_finished_counter_value(
3363 12 : &self,
3364 12 : file_kind: &RemoteOpFileKind,
3365 12 : op_kind: &RemoteOpKind,
3366 12 : ) -> Option<u64> {
3367 12 : let guard = self.bytes_finished_counter.lock().unwrap();
3368 12 : let key = (file_kind.as_str(), op_kind.as_str());
3369 12 : guard.get(&key).map(|counter| counter.get())
3370 12 : }
3371 : }
3372 :
3373 : /// See [`RemoteTimelineClientMetrics::call_begin`].
3374 : #[must_use]
3375 : pub(crate) struct RemoteTimelineClientCallMetricGuard {
3376 : /// Decremented on drop.
3377 : calls_counter_pair: Option<IntCounterPair>,
3378 : /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
3379 : bytes_finished: Option<(IntCounter, u64)>,
3380 : }
3381 :
3382 : impl RemoteTimelineClientCallMetricGuard {
3383 : /// Consume this guard object without performing the metric updates it would do on `drop()`.
3384 : /// The caller vouches to do the metric updates manually.
3385 7555 : pub fn will_decrement_manually(mut self) {
3386 7555 : let RemoteTimelineClientCallMetricGuard {
3387 7555 : calls_counter_pair,
3388 7555 : bytes_finished,
3389 7555 : } = &mut self;
3390 7555 : calls_counter_pair.take();
3391 7555 : bytes_finished.take();
3392 7555 : }
3393 : }
3394 :
3395 : impl Drop for RemoteTimelineClientCallMetricGuard {
3396 7623 : fn drop(&mut self) {
3397 7623 : let RemoteTimelineClientCallMetricGuard {
3398 7623 : calls_counter_pair,
3399 7623 : bytes_finished,
3400 7623 : } = self;
3401 7623 : if let Some(guard) = calls_counter_pair.take() {
3402 68 : guard.dec();
3403 7555 : }
3404 7623 : if let Some((bytes_finished_metric, value)) = bytes_finished {
3405 0 : bytes_finished_metric.inc_by(*value);
3406 7623 : }
3407 7623 : }
3408 : }
3409 :
3410 : /// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
3411 : /// track the byte size of this call in applicable metric(s).
3412 : pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
3413 : /// Do not account for this call's byte size in any metrics.
3414 : /// The `reason` field is there to make the call sites self-documenting
3415 : /// about why they don't need the metric.
3416 : DontTrackSize { reason: &'static str },
3417 : /// Track the byte size of the call in applicable metric(s).
3418 : Bytes(u64),
3419 : }
3420 :
3421 : impl RemoteTimelineClientMetrics {
3422 : /// Update the metrics that change when a call to the remote timeline client instance starts.
3423 : ///
3424 : /// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
3425 : /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
3426 : /// is more suitable.
3427 : /// Never do both.
3428 7623 : pub(crate) fn call_begin(
3429 7623 : &self,
3430 7623 : file_kind: &RemoteOpFileKind,
3431 7623 : op_kind: &RemoteOpKind,
3432 7623 : size: RemoteTimelineClientMetricsCallTrackSize,
3433 7623 : ) -> RemoteTimelineClientCallMetricGuard {
3434 7623 : let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
3435 7623 : calls_counter_pair.inc();
3436 :
3437 7623 : let bytes_finished = match size {
3438 4155 : RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
3439 4155 : // nothing to do
3440 4155 : None
3441 : }
3442 3468 : RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
3443 3468 : self.bytes_started_counter(file_kind, op_kind).inc_by(size);
3444 3468 : let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
3445 3468 : Some((finished_counter, size))
3446 : }
3447 : };
3448 7623 : RemoteTimelineClientCallMetricGuard {
3449 7623 : calls_counter_pair: Some(calls_counter_pair),
3450 7623 : bytes_finished,
3451 7623 : }
3452 7623 : }
3453 :
3454 : /// Manually update the metrics that track completions, instead of using the guard object.
3455 : /// Using the guard object is generally preferable.
3456 : /// See [`call_begin`](Self::call_begin) for more context.
3457 6715 : pub(crate) fn call_end(
3458 6715 : &self,
3459 6715 : file_kind: &RemoteOpFileKind,
3460 6715 : op_kind: &RemoteOpKind,
3461 6715 : size: RemoteTimelineClientMetricsCallTrackSize,
3462 6715 : ) {
3463 6715 : let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
3464 6715 : calls_counter_pair.dec();
3465 6715 : match size {
3466 3623 : RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
3467 3092 : RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
3468 3092 : self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
3469 3092 : }
3470 : }
3471 6715 : }
3472 : }
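// Editorial sketch (not part of the original source): the two ways a call site might
// close out a call started with `call_begin`, assuming `metrics: &RemoteTimelineClientMetrics`
// and hypothetical `file_kind`, `op_kind`, and `layer_bytes` values are in scope:
//
//     // 1. Guard-based: dropping the guard decrements the in-progress counter pair and,
//     //    if size tracking was requested, bumps bytes_finished by the tracked size.
//     let guard = metrics.call_begin(
//         &file_kind,
//         &op_kind,
//         RemoteTimelineClientMetricsCallTrackSize::Bytes(layer_bytes),
//     );
//     // ... perform the operation ...
//     drop(guard);
//
//     // 2. Manual: opt out of the guard's bookkeeping and report completion yourself.
//     let guard = metrics.call_begin(
//         &file_kind,
//         &op_kind,
//         RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: "example" },
//     );
//     guard.will_decrement_manually();
//     // ... later, once the operation actually finishes ...
//     metrics.call_end(
//         &file_kind,
//         &op_kind,
//         RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: "example" },
//     );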
3473 :
3474 : impl Drop for RemoteTimelineClientMetrics {
3475 40 : fn drop(&mut self) {
3476 40 : let RemoteTimelineClientMetrics {
3477 40 : tenant_id,
3478 40 : shard_id,
3479 40 : timeline_id,
3480 40 : remote_physical_size_gauge,
3481 40 : calls,
3482 40 : bytes_started_counter,
3483 40 : bytes_finished_counter,
3484 40 : projected_remote_consistent_lsn_gauge,
3485 40 : } = self;
3486 48 : for ((a, b), _) in calls.get_mut().unwrap().drain() {
3487 48 : let mut res = [Ok(()), Ok(())];
3488 48 : REMOTE_TIMELINE_CLIENT_CALLS
3489 48 : .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]);
3490 48 : // don't care about results
3491 48 : }
3492 40 : for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
3493 12 : let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
3494 12 : tenant_id,
3495 12 : shard_id,
3496 12 : timeline_id,
3497 12 : a,
3498 12 : b,
3499 12 : ]);
3500 12 : }
3501 40 : for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
3502 12 : let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
3503 12 : tenant_id,
3504 12 : shard_id,
3505 12 : timeline_id,
3506 12 : a,
3507 12 : b,
3508 12 : ]);
3509 12 : }
3510 40 : {
3511 40 : let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
3512 40 : let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3513 40 : }
3514 40 : {
3515 40 : let _ = projected_remote_consistent_lsn_gauge;
3516 40 : let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
3517 40 : tenant_id,
3518 40 : shard_id,
3519 40 : timeline_id,
3520 40 : ]);
3521 40 : }
3522 40 : }
3523 : }
3524 :
3525 : /// Extension trait for wrapping a future in [`MeasuredRemoteOp`], which measures the time spent
3526 : /// by a remote storage operation and records the duration and success/failure as a prometheus metric.
3527 : pub(crate) trait MeasureRemoteOp: Sized {
3528 6400 : fn measure_remote_op(
3529 6400 : self,
3530 6400 : file_kind: RemoteOpFileKind,
3531 6400 : op: RemoteOpKind,
3532 6400 : metrics: Arc<RemoteTimelineClientMetrics>,
3533 6400 : ) -> MeasuredRemoteOp<Self> {
3534 6400 : let start = Instant::now();
3535 6400 : MeasuredRemoteOp {
3536 6400 : inner: self,
3537 6400 : file_kind,
3538 6400 : op,
3539 6400 : start,
3540 6400 : metrics,
3541 6400 : }
3542 6400 : }
3543 : }
3544 :
3545 : impl<T: Sized> MeasureRemoteOp for T {}
3546 :
3547 : pin_project! {
3548 : pub(crate) struct MeasuredRemoteOp<F>
3549 : {
3550 : #[pin]
3551 : inner: F,
3552 : file_kind: RemoteOpFileKind,
3553 : op: RemoteOpKind,
3554 : start: Instant,
3555 : metrics: Arc<RemoteTimelineClientMetrics>,
3556 : }
3557 : }
3558 :
3559 : impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
3560 : type Output = Result<O, E>;
3561 :
3562 96939 : fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
3563 96939 : let this = self.project();
3564 96939 : let poll_result = this.inner.poll(cx);
3565 96939 : if let Poll::Ready(ref res) = poll_result {
3566 6125 : let duration = this.start.elapsed();
3567 6125 : let status = if res.is_ok() { &"success" } else { &"failure" };
3568 6125 : this.metrics
3569 6125 : .remote_operation_time(this.file_kind, this.op, status)
3570 6125 : .observe(duration.as_secs_f64());
3571 90814 : }
3572 96939 : poll_result
3573 96939 : }
3574 : }
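// Editorial sketch (not part of the original source): since `MeasureRemoteOp` is
// blanket-implemented for all sized types, any future returning a `Result` can be wrapped.
// Assuming hypothetical `download_layer()`, `file_kind`, `op_kind`, and `metrics` values:
//
//     let res = download_layer()
//         .measure_remote_op(file_kind, op_kind, Arc::clone(&metrics))
//         .await;
//     // On completion, the elapsed time is observed in REMOTE_OPERATION_TIME with a
//     // "success" or "failure" status label derived from the Result.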
3575 :
3576 : pub mod tokio_epoll_uring {
3577 : use std::{
3578 : collections::HashMap,
3579 : sync::{Arc, Mutex},
3580 : };
3581 :
3582 : use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge};
3583 : use once_cell::sync::Lazy;
3584 :
3585 : /// Shared storage for tokio-epoll-uring thread local metrics.
3586 : pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
3587 234 : Lazy::new(|| {
3588 234 : let slots_submission_queue_depth = register_histogram!(
3589 234 : "pageserver_tokio_epoll_uring_slots_submission_queue_depth",
3590 234 : "The slots waiters queue depth of each tokio_epoll_uring system",
3591 234 : vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
3592 234 : )
3593 234 : .expect("failed to define a metric");
3594 234 : ThreadLocalMetricsStorage {
3595 234 : observers: Mutex::new(HashMap::new()),
3596 234 : slots_submission_queue_depth,
3597 234 : }
3598 234 : });
3599 :
3600 : pub struct ThreadLocalMetricsStorage {
3601 : /// List of thread local metrics observers.
3602 : observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
3603 : /// A histogram shared between all thread local systems
3604 : /// for collecting slots submission queue depth.
3605 : slots_submission_queue_depth: Histogram,
3606 : }
3607 :
3608 : /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
3609 : /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
3610 : ///
3611 : /// The System makes observations into [`Self`] and periodically, the collector
3612 : /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
3613 : ///
3614 : /// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`].
3615 : /// But except for the periodic flush, the lock is uncontended so there's no waiting
3616 : /// for the cache coherence protocol to get an exclusive cache line.
3617 : pub struct ThreadLocalMetrics {
3618 : /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
3619 : slots_submission_queue_depth: Mutex<LocalHistogram>,
3620 : }
3621 :
3622 : impl ThreadLocalMetricsStorage {
3623 : /// Registers a new thread local system. Returns a thread local metrics observer.
3624 1018 : pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
3625 1018 : let per_system_metrics = Arc::new(ThreadLocalMetrics::new(
3626 1018 : self.slots_submission_queue_depth.local(),
3627 1018 : ));
3628 1018 : let mut g = self.observers.lock().unwrap();
3629 1018 : g.insert(id, Arc::clone(&per_system_metrics));
3630 1018 : per_system_metrics
3631 1018 : }
3632 :
3633 : /// Removes metrics observer for a thread local system.
3634 : /// This should be called before dropping a thread local system.
3635 234 : pub fn remove_system(&self, id: u64) {
3636 234 : let mut g = self.observers.lock().unwrap();
3637 234 : g.remove(&id);
3638 234 : }
3639 :
3640 : /// Flush all thread local metrics to the shared storage.
3641 0 : pub fn flush_thread_local_metrics(&self) {
3642 0 : let g = self.observers.lock().unwrap();
3643 0 : g.values().for_each(|local| {
3644 0 : local.flush();
3645 0 : });
3646 0 : }
3647 : }
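// Editorial sketch (not part of the original source): the intended lifecycle of a
// thread-local tokio-epoll-uring system's metrics, assuming a hypothetical `system_id`:
//
//     let per_system = THREAD_LOCAL_METRICS_STORAGE.register_system(system_id);
//     // ... hand `per_system` to the System, which calls
//     //     observe_slots_submission_queue_depth() on it as it runs ...
//     THREAD_LOCAL_METRICS_STORAGE.flush_thread_local_metrics(); // done by the Collector on scrape
//     THREAD_LOCAL_METRICS_STORAGE.remove_system(system_id);     // before dropping the System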
3648 :
3649 : impl ThreadLocalMetrics {
3650 1018 : pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
3651 1018 : ThreadLocalMetrics {
3652 1018 : slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth),
3653 1018 : }
3654 1018 : }
3655 :
3656 : /// Flushes the thread local metrics to shared aggregator.
3657 0 : pub fn flush(&self) {
3658 0 : let Self {
3659 0 : slots_submission_queue_depth,
3660 0 : } = self;
3661 0 : slots_submission_queue_depth.lock().unwrap().flush();
3662 0 : }
3663 : }
3664 :
3665 : impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
3666 1819434 : fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
3667 1819434 : let Self {
3668 1819434 : slots_submission_queue_depth,
3669 1819434 : } = self;
3670 1819434 : slots_submission_queue_depth
3671 1819434 : .lock()
3672 1819434 : .unwrap()
3673 1819434 : .observe(queue_depth as f64);
3674 1819434 : }
3675 : }
3676 :
3677 : pub struct Collector {
3678 : descs: Vec<metrics::core::Desc>,
3679 : systems_created: UIntGauge,
3680 : systems_destroyed: UIntGauge,
3681 : thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
3682 : }
3683 :
3684 : impl metrics::core::Collector for Collector {
3685 0 : fn desc(&self) -> Vec<&metrics::core::Desc> {
3686 0 : self.descs.iter().collect()
3687 0 : }
3688 :
3689 0 : fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
3690 0 : let mut mfs = Vec::with_capacity(Self::NMETRICS);
3691 0 : let tokio_epoll_uring::metrics::GlobalMetrics {
3692 0 : systems_created,
3693 0 : systems_destroyed,
3694 0 : } = tokio_epoll_uring::metrics::global();
3695 0 : self.systems_created.set(systems_created);
3696 0 : mfs.extend(self.systems_created.collect());
3697 0 : self.systems_destroyed.set(systems_destroyed);
3698 0 : mfs.extend(self.systems_destroyed.collect());
3699 0 :
3700 0 : self.thread_local_metrics_storage
3701 0 : .flush_thread_local_metrics();
3702 0 :
3703 0 : mfs.extend(
3704 0 : self.thread_local_metrics_storage
3705 0 : .slots_submission_queue_depth
3706 0 : .collect(),
3707 0 : );
3708 0 : mfs
3709 0 : }
3710 : }
3711 :
3712 : impl Collector {
3713 : const NMETRICS: usize = 3;
3714 :
3715 : #[allow(clippy::new_without_default)]
3716 0 : pub fn new() -> Self {
3717 0 : let mut descs = Vec::new();
3718 0 :
3719 0 : let systems_created = UIntGauge::new(
3720 0 : "pageserver_tokio_epoll_uring_systems_created",
3721 0 : "counter of tokio-epoll-uring systems that were created",
3722 0 : )
3723 0 : .unwrap();
3724 0 : descs.extend(
3725 0 : metrics::core::Collector::desc(&systems_created)
3726 0 : .into_iter()
3727 0 : .cloned(),
3728 0 : );
3729 0 :
3730 0 : let systems_destroyed = UIntGauge::new(
3731 0 : "pageserver_tokio_epoll_uring_systems_destroyed",
3732 0 : "counter of tokio-epoll-uring systems that were destroyed",
3733 0 : )
3734 0 : .unwrap();
3735 0 : descs.extend(
3736 0 : metrics::core::Collector::desc(&systems_destroyed)
3737 0 : .into_iter()
3738 0 : .cloned(),
3739 0 : );
3740 0 :
3741 0 : Self {
3742 0 : descs,
3743 0 : systems_created,
3744 0 : systems_destroyed,
3745 0 : thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
3746 0 : }
3747 0 : }
3748 : }
3749 :
3750 234 : pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
3751 234 : register_int_counter!(
3752 234 : "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
3753 234 : "Number of times where thread_local_system creation spanned multiple executor threads",
3754 234 : )
3755 234 : .unwrap()
3756 234 : });
3757 :
3758 0 : pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
3759 0 : register_int_counter!(
3760 0 : "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
3761 0 : "Number of times thread_local_system creation failed and was retried after back-off.",
3762 0 : )
3763 0 : .unwrap()
3764 0 : });
3765 : }
3766 :
3767 : pub(crate) mod tenant_throttling {
3768 : use metrics::{register_int_counter_vec, IntCounter};
3769 : use once_cell::sync::Lazy;
3770 : use utils::shard::TenantShardId;
3771 :
3772 : pub(crate) struct GlobalAndPerTenantIntCounter {
3773 : global: IntCounter,
3774 : per_tenant: IntCounter,
3775 : }
3776 :
3777 : impl GlobalAndPerTenantIntCounter {
3778 : #[inline(always)]
3779 0 : pub(crate) fn inc(&self) {
3780 0 : self.inc_by(1)
3781 0 : }
3782 : #[inline(always)]
3783 0 : pub(crate) fn inc_by(&self, n: u64) {
3784 0 : self.global.inc_by(n);
3785 0 : self.per_tenant.inc_by(n);
3786 0 : }
3787 : }
3788 :
3789 : pub(crate) struct Metrics<const KIND: usize> {
3790 : pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
3791 : pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
3792 : pub(super) wait_time: GlobalAndPerTenantIntCounter,
3793 : pub(super) count_throttled: GlobalAndPerTenantIntCounter,
3794 : }
3795 :
3796 408 : static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3797 408 : register_int_counter_vec!(
3798 408 : "pageserver_tenant_throttling_count_accounted_start_global",
3799 408 : "Count of tenant throttling starts, by kind of throttle.",
3800 408 : &["kind"]
3801 408 : )
3802 408 : .unwrap()
3803 408 : });
3804 408 : static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3805 408 : register_int_counter_vec!(
3806 408 : "pageserver_tenant_throttling_count_accounted_start",
3807 408 : "Count of tenant throttling starts, by kind of throttle.",
3808 408 : &["kind", "tenant_id", "shard_id"]
3809 408 : )
3810 408 : .unwrap()
3811 408 : });
3812 408 : static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3813 408 : register_int_counter_vec!(
3814 408 : "pageserver_tenant_throttling_count_accounted_finish_global",
3815 408 : "Count of tenant throttling finishes, by kind of throttle.",
3816 408 : &["kind"]
3817 408 : )
3818 408 : .unwrap()
3819 408 : });
3820 408 : static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3821 408 : register_int_counter_vec!(
3822 408 : "pageserver_tenant_throttling_count_accounted_finish",
3823 408 : "Count of tenant throttling finishes, by kind of throttle.",
3824 408 : &["kind", "tenant_id", "shard_id"]
3825 408 : )
3826 408 : .unwrap()
3827 408 : });
3828 408 : static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3829 408 : register_int_counter_vec!(
3830 408 : "pageserver_tenant_throttling_wait_usecs_sum_global",
3831 408 : "Sum of microseconds that spent waiting throttle by kind of throttle.",
3832 408 : &["kind"]
3833 408 : )
3834 408 : .unwrap()
3835 408 : });
3836 408 : static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3837 408 : register_int_counter_vec!(
3838 408 : "pageserver_tenant_throttling_wait_usecs_sum",
3839 408 : "Sum of microseconds that spent waiting throttle by kind of throttle.",
3840 408 : &["kind", "tenant_id", "shard_id"]
3841 408 : )
3842 408 : .unwrap()
3843 408 : });
3844 :
3845 408 : static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3846 408 : register_int_counter_vec!(
3847 408 : "pageserver_tenant_throttling_count_global",
3848 408 : "Count of tenant throttlings, by kind of throttle.",
3849 408 : &["kind"]
3850 408 : )
3851 408 : .unwrap()
3852 408 : });
3853 408 : static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3854 408 : register_int_counter_vec!(
3855 408 : "pageserver_tenant_throttling_count",
3856 408 : "Count of tenant throttlings, by kind of throttle.",
3857 408 : &["kind", "tenant_id", "shard_id"]
3858 408 : )
3859 408 : .unwrap()
3860 408 : });
3861 :
3862 : const KINDS: &[&str] = &["pagestream"];
3863 : pub type Pagestream = Metrics<0>;
3864 :
3865 : impl<const KIND: usize> Metrics<KIND> {
3866 444 : pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
3867 444 : let per_tenant_label_values = &[
3868 444 : KINDS[KIND],
3869 444 : &tenant_shard_id.tenant_id.to_string(),
3870 444 : &tenant_shard_id.shard_slug().to_string(),
3871 444 : ];
3872 444 : Metrics {
3873 444 : count_accounted_start: {
3874 444 : GlobalAndPerTenantIntCounter {
3875 444 : global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]),
3876 444 : per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
3877 444 : .with_label_values(per_tenant_label_values),
3878 444 : }
3879 444 : },
3880 444 : count_accounted_finish: {
3881 444 : GlobalAndPerTenantIntCounter {
3882 444 : global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]),
3883 444 : per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
3884 444 : .with_label_values(per_tenant_label_values),
3885 444 : }
3886 444 : },
3887 444 : wait_time: {
3888 444 : GlobalAndPerTenantIntCounter {
3889 444 : global: WAIT_USECS.with_label_values(&[KINDS[KIND]]),
3890 444 : per_tenant: WAIT_USECS_PER_TENANT
3891 444 : .with_label_values(per_tenant_label_values),
3892 444 : }
3893 444 : },
3894 444 : count_throttled: {
3895 444 : GlobalAndPerTenantIntCounter {
3896 444 : global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]),
3897 444 : per_tenant: WAIT_COUNT_PER_TENANT
3898 444 : .with_label_values(per_tenant_label_values),
3899 444 : }
3900 444 : },
3901 444 : }
3902 444 : }
3903 : }
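// Editorial sketch (not part of the original source): `Metrics<0>` (aliased as `Pagestream`)
// uses KINDS[0] = "pagestream" as its label value, and each counter updates the global and
// per-tenant series together. Assuming a `tenant_shard_id` and a hypothetical `wait` Duration:
//
//     let throttling = Pagestream::new(&tenant_shard_id);
//     throttling.count_accounted_start.inc();
//     throttling.count_throttled.inc();
//     throttling.wait_time.inc_by(wait.as_micros() as u64);
//     throttling.count_accounted_finish.inc();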
3904 :
3905 0 : pub(crate) fn preinitialize_global_metrics() {
3906 0 : Lazy::force(&COUNT_ACCOUNTED_START);
3907 0 : Lazy::force(&COUNT_ACCOUNTED_FINISH);
3908 0 : Lazy::force(&WAIT_USECS);
3909 0 : Lazy::force(&WAIT_COUNT);
3910 0 : }
3911 :
3912 12 : pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
3913 48 : for m in &[
3914 12 : &COUNT_ACCOUNTED_START_PER_TENANT,
3915 12 : &COUNT_ACCOUNTED_FINISH_PER_TENANT,
3916 12 : &WAIT_USECS_PER_TENANT,
3917 12 : &WAIT_COUNT_PER_TENANT,
3918 12 : ] {
3919 96 : for kind in KINDS {
3920 48 : let _ = m.remove_label_values(&[
3921 48 : kind,
3922 48 : &tenant_shard_id.tenant_id.to_string(),
3923 48 : &tenant_shard_id.shard_slug().to_string(),
3924 48 : ]);
3925 48 : }
3926 : }
3927 12 : }
3928 : }
3929 :
3930 : pub(crate) mod disk_usage_based_eviction {
3931 : use super::*;
3932 :
3933 : pub(crate) struct Metrics {
3934 : pub(crate) tenant_collection_time: Histogram,
3935 : pub(crate) tenant_layer_count: Histogram,
3936 : pub(crate) layers_collected: IntCounter,
3937 : pub(crate) layers_selected: IntCounter,
3938 : pub(crate) layers_evicted: IntCounter,
3939 : }
3940 :
3941 : impl Default for Metrics {
3942 0 : fn default() -> Self {
3943 0 : let tenant_collection_time = register_histogram!(
3944 0 : "pageserver_disk_usage_based_eviction_tenant_collection_seconds",
3945 0 : "Time spent collecting layers from a tenant -- not normalized by collected layer amount",
3946 0 : vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
3947 0 : )
3948 0 : .unwrap();
3949 0 :
3950 0 : let tenant_layer_count = register_histogram!(
3951 0 : "pageserver_disk_usage_based_eviction_tenant_collected_layers",
3952 0 : "Amount of layers gathered from a tenant",
3953 0 : vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
3954 0 : )
3955 0 : .unwrap();
3956 0 :
3957 0 : let layers_collected = register_int_counter!(
3958 0 : "pageserver_disk_usage_based_eviction_collected_layers_total",
3959 0 : "Amount of layers collected"
3960 0 : )
3961 0 : .unwrap();
3962 0 :
3963 0 : let layers_selected = register_int_counter!(
3964 0 : "pageserver_disk_usage_based_eviction_select_layers_total",
3965 0 : "Amount of layers selected"
3966 0 : )
3967 0 : .unwrap();
3968 0 :
3969 0 : let layers_evicted = register_int_counter!(
3970 0 : "pageserver_disk_usage_based_eviction_evicted_layers_total",
3971 0 : "Amount of layers successfully evicted"
3972 0 : )
3973 0 : .unwrap();
3974 0 :
3975 0 : Self {
3976 0 : tenant_collection_time,
3977 0 : tenant_layer_count,
3978 0 : layers_collected,
3979 0 : layers_selected,
3980 0 : layers_evicted,
3981 0 : }
3982 0 : }
3983 : }
3984 :
3985 : pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
3986 : }
3987 :
3988 396 : static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
3989 396 : register_uint_gauge_vec!(
3990 396 : "pageserver_tokio_executor_thread_configured_count",
3991 396 : "Total number of configued tokio executor threads in the process.
3992 396 : The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
3993 396 : &["setup"],
3994 396 : )
3995 396 : .unwrap()
3996 396 : });
3997 :
3998 396 : pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
3999 : static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
4000 396 : let _guard = SERIALIZE.lock().unwrap();
4001 396 : TOKIO_EXECUTOR_THREAD_COUNT.reset();
4002 396 : TOKIO_EXECUTOR_THREAD_COUNT
4003 396 : .get_metric_with_label_values(&[setup])
4004 396 : .unwrap()
4005 396 : .set(u64::try_from(num_threads.get()).unwrap());
4006 396 : }
4007 :
4008 0 : pub fn preinitialize_metrics(conf: &'static PageServerConf) {
4009 0 : set_page_service_config_max_batch_size(&conf.page_service_pipelining);
4010 0 :
4011 0 : // Python tests need these and on some we do alerting.
4012 0 : //
4013 0 : // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
4014 0 : // order:
4015 0 : // - global metrics reside in a Lazy<PageserverMetrics>
4016 0 : // - access via crate::metrics::PS_METRICS.some_metric.inc()
4017 0 : // - could move the statics into TimelineMetrics::new()?
4018 0 :
4019 0 : // counters
4020 0 : [
4021 0 : &UNEXPECTED_ONDEMAND_DOWNLOADS,
4022 0 : &WALRECEIVER_STARTED_CONNECTIONS,
4023 0 : &WALRECEIVER_BROKER_UPDATES,
4024 0 : &WALRECEIVER_CANDIDATES_ADDED,
4025 0 : &WALRECEIVER_CANDIDATES_REMOVED,
4026 0 : &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
4027 0 : &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
4028 0 : &REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
4029 0 : &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
4030 0 : &CIRCUIT_BREAKERS_BROKEN,
4031 0 : &CIRCUIT_BREAKERS_UNBROKEN,
4032 0 : &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
4033 0 : ]
4034 0 : .into_iter()
4035 0 : .for_each(|c| {
4036 0 : Lazy::force(c);
4037 0 : });
4038 0 :
4039 0 : // Deletion queue stats
4040 0 : Lazy::force(&DELETION_QUEUE);
4041 0 :
4042 0 : // Tenant stats
4043 0 : Lazy::force(&TENANT);
4044 0 :
4045 0 : // Tenant manager stats
4046 0 : Lazy::force(&TENANT_MANAGER);
4047 0 :
4048 0 : Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
4049 0 : Lazy::force(&disk_usage_based_eviction::METRICS);
4050 :
4051 0 : for state_name in pageserver_api::models::TenantState::VARIANTS {
4052 0 : // initialize the metric for all gauges, otherwise the time series might seemingly show
4053 0 : // values from the last restart.
4054 0 : TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
4055 0 : }
4056 :
4057 : // countervecs
4058 0 : [
4059 0 : &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
4060 0 : &SMGR_QUERY_STARTED_GLOBAL,
4061 0 : ]
4062 0 : .into_iter()
4063 0 : .for_each(|c| {
4064 0 : Lazy::force(c);
4065 0 : });
4066 0 :
4067 0 : // gauges
4068 0 : WALRECEIVER_ACTIVE_MANAGERS.get();
4069 0 :
4070 0 : // histograms
4071 0 : [
4072 0 : &LAYERS_PER_READ_GLOBAL,
4073 0 : &DELTAS_PER_READ_GLOBAL,
4074 0 : &WAIT_LSN_TIME,
4075 0 : &WAL_REDO_TIME,
4076 0 : &WAL_REDO_RECORDS_HISTOGRAM,
4077 0 : &WAL_REDO_BYTES_HISTOGRAM,
4078 0 : &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
4079 0 : &PAGE_SERVICE_BATCH_SIZE_GLOBAL,
4080 0 : &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
4081 0 : ]
4082 0 : .into_iter()
4083 0 : .for_each(|h| {
4084 0 : Lazy::force(h);
4085 0 : });
4086 0 :
4087 0 : // Custom
4088 0 : Lazy::force(&BASEBACKUP_QUERY_TIME);
4089 0 : Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
4090 0 : Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);
4091 0 :
4092 0 : tenant_throttling::preinitialize_global_metrics();
4093 0 : }
|