Line data Source code
1 : use std::collections::HashMap;
2 : use std::num::NonZeroUsize;
3 : use std::os::fd::RawFd;
4 : use std::pin::Pin;
5 : use std::sync::atomic::AtomicU64;
6 : use std::sync::{Arc, Mutex};
7 : use std::task::{Context, Poll};
8 : use std::time::{Duration, Instant};
9 :
10 : use enum_map::{Enum as _, EnumMap};
11 : use futures::Future;
12 : use metrics::{
13 : register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
14 : register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
15 : register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
16 : Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
17 : IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
18 : };
19 : use once_cell::sync::Lazy;
20 : use pageserver_api::config::{
21 : PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
22 : PageServiceProtocolPipelinedExecutionStrategy,
23 : };
24 : use pageserver_api::models::InMemoryLayerInfo;
25 : use pageserver_api::shard::TenantShardId;
26 : use pin_project_lite::pin_project;
27 : use postgres_backend::{is_expected_io_error, QueryError};
28 : use pq_proto::framed::ConnectionError;
29 :
30 : use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
31 : use strum_macros::{IntoStaticStr, VariantNames};
32 : use utils::id::TimelineId;
33 :
34 : use crate::config::PageServerConf;
35 : use crate::context::{PageContentKind, RequestContext};
36 : use crate::pgdatadir_mapping::DatadirModificationStats;
37 : use crate::task_mgr::TaskKind;
38 : use crate::tenant::layer_map::LayerMap;
39 : use crate::tenant::mgr::TenantSlot;
40 : use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
41 : use crate::tenant::tasks::BackgroundLoopKind;
42 : use crate::tenant::throttle::ThrottleResult;
43 : use crate::tenant::Timeline;
44 :
45 : /// Prometheus histogram buckets (in seconds) for operations in the critical
46 : /// path. In other words, operations that directly affect the latency of user
47 : /// queries.
48 : ///
49 : /// The buckets capture the majority of latencies in the microsecond and
50 : /// millisecond range but also extend far enough up to distinguish "bad" from
51 : /// "really bad".
52 : const CRITICAL_OP_BUCKETS: &[f64] = &[
53 : 0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
54 : 0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
55 : 1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
56 : ];
57 :
58 : // Metrics collected on operations on the storage repository.
59 : #[derive(Debug, VariantNames, IntoStaticStr)]
60 : #[strum(serialize_all = "kebab_case")]
61 : pub(crate) enum StorageTimeOperation {
62 : #[strum(serialize = "layer flush")]
63 : LayerFlush,
64 :
65 : #[strum(serialize = "layer flush delay")]
66 : LayerFlushDelay,
67 :
68 : #[strum(serialize = "compact")]
69 : Compact,
70 :
71 : #[strum(serialize = "create images")]
72 : CreateImages,
73 :
74 : #[strum(serialize = "logical size")]
75 : LogicalSize,
76 :
77 : #[strum(serialize = "imitate logical size")]
78 : ImitateLogicalSize,
79 :
80 : #[strum(serialize = "load layer map")]
81 : LoadLayerMap,
82 :
83 : #[strum(serialize = "gc")]
84 : Gc,
85 :
86 : #[strum(serialize = "find gc cutoffs")]
87 : FindGcCutoffs,
88 : }
89 :
90 404 : pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
91 404 : register_counter_vec!(
92 404 : "pageserver_storage_operations_seconds_sum",
93 404 : "Total time spent on storage operations with operation, tenant and timeline dimensions",
94 404 : &["operation", "tenant_id", "shard_id", "timeline_id"],
95 404 : )
96 404 : .expect("failed to define a metric")
97 404 : });
98 :
99 404 : pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
100 404 : register_int_counter_vec!(
101 404 : "pageserver_storage_operations_seconds_count",
102 404 : "Count of storage operations with operation, tenant and timeline dimensions",
103 404 : &["operation", "tenant_id", "shard_id", "timeline_id"],
104 404 : )
105 404 : .expect("failed to define a metric")
106 404 : });
107 :
108 : // Buckets for background operation duration in seconds, like compaction, GC, size calculation.
109 : const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
110 :
111 404 : pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
112 404 : register_histogram_vec!(
113 404 : "pageserver_storage_operations_seconds_global",
114 404 : "Time spent on storage operations",
115 404 : &["operation"],
116 404 : STORAGE_OP_BUCKETS.into(),
117 404 : )
118 404 : .expect("failed to define a metric")
119 404 : });
120 :
121 : /// Measures layers visited per read (i.e. read amplification).
122 : ///
123 : /// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits
124 : /// is amortized across the batch, and some layers may not intersect with a given key, each visited
125 : /// layer contributes directly to the observed latency for every read in the batch, which is what we
126 : /// care about.
127 404 : pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
128 404 : register_histogram_vec!(
129 404 : "pageserver_layers_per_read",
130 404 : "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
131 404 : &["tenant_id", "shard_id", "timeline_id"],
132 404 : // Low resolution to reduce cardinality.
133 404 : vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
134 404 : )
135 404 : .expect("failed to define a metric")
136 404 : });
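// A minimal sketch (not the actual read path) of the batch accounting described
// above: every read in a batch observes the total number of layers visited for
// the whole batch, because each visited layer contributes to that read's latency.
// `reads_in_batch` and `layers_visited` are hypothetical inputs.
#[allow(dead_code)]
fn record_layers_per_read_sketch(histo: &Histogram, reads_in_batch: usize, layers_visited: usize) {
    for _ in 0..reads_in_batch {
        histo.observe(layers_visited as f64);
    }
}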
137 :
138 396 : pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
139 396 : register_histogram!(
140 396 : "pageserver_layers_per_read_global",
141 396 : "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
142 396 : vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
143 396 : )
144 396 : .expect("failed to define a metric")
145 396 : });
146 :
147 396 : pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
148 396 : // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
149 396 : register_histogram!(
150 396 : "pageserver_deltas_per_read_global",
151 396 : "Number of delta pages applied to image page per read",
152 396 : vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
153 396 : )
154 396 : .expect("failed to define a metric")
155 396 : });
156 :
157 0 : pub(crate) static CONCURRENT_INITDBS: Lazy<UIntGauge> = Lazy::new(|| {
158 0 : register_uint_gauge!(
159 0 : "pageserver_concurrent_initdb",
160 0 : "Number of initdb processes running"
161 0 : )
162 0 : .expect("failed to define a metric")
163 0 : });
164 :
165 0 : pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy<Histogram> = Lazy::new(|| {
166 0 : register_histogram!(
167 0 : "pageserver_initdb_semaphore_seconds_global",
168 0 : "Time spent getting a permit from the global initdb semaphore",
169 0 : STORAGE_OP_BUCKETS.into()
170 0 : )
171 0 : .expect("failed to define metric")
172 0 : });
173 :
174 0 : pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
175 0 : register_histogram!(
176 0 : "pageserver_initdb_seconds_global",
177 0 : "Time spent performing initdb",
178 0 : STORAGE_OP_BUCKETS.into()
179 0 : )
180 0 : .expect("failed to define metric")
181 0 : });
182 :
183 : pub(crate) struct GetVectoredLatency {
184 : map: EnumMap<TaskKind, Option<Histogram>>,
185 : }
186 :
187 : #[allow(dead_code)]
188 : pub(crate) struct ScanLatency {
189 : map: EnumMap<TaskKind, Option<Histogram>>,
190 : }
191 :
192 : impl GetVectoredLatency {
193 : // Only these task types perform vectored gets. Filter all other tasks out to reduce total
194 : // cardinality of the metric.
195 : const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
196 :
197 39332 : pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
198 39332 : self.map[task_kind].as_ref()
199 39332 : }
200 : }
201 :
202 : impl ScanLatency {
203 : // Only these task types perform scans. Filter all other tasks out to reduce total
204 : // cardinality of the metric.
205 : const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler];
206 :
207 24 : pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
208 24 : self.map[task_kind].as_ref()
209 24 : }
210 : }
211 :
212 : pub(crate) struct ScanLatencyOngoingRecording<'a> {
213 : parent: &'a Histogram,
214 : start: std::time::Instant,
215 : }
216 :
217 : impl<'a> ScanLatencyOngoingRecording<'a> {
218 0 : pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> {
219 0 : let start = Instant::now();
220 0 : ScanLatencyOngoingRecording { parent, start }
221 0 : }
222 :
223 0 : pub(crate) fn observe(self) {
224 0 : let elapsed = self.start.elapsed();
225 0 : self.parent.observe(elapsed.as_secs_f64());
226 0 : }
227 : }
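// A minimal usage sketch (hypothetical caller within this crate) for the
// recording guard above: resolve the per-task-kind histogram, start the guard,
// do the work, then consume the guard to record the elapsed time.
#[allow(dead_code)]
fn scan_latency_recording_sketch(ctx: &RequestContext) {
    if let Some(histo) = SCAN_LATENCY.for_task_kind(ctx.task_kind()) {
        let recording = ScanLatencyOngoingRecording::start_recording(histo);
        // ... perform the scan here ...
        recording.observe();
    }
}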
228 :
229 388 : pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
230 388 : let inner = register_histogram_vec!(
231 388 : "pageserver_get_vectored_seconds",
232 388 : "Time spent in get_vectored.",
233 388 : &["task_kind"],
234 388 : CRITICAL_OP_BUCKETS.into(),
235 388 : )
236 388 : .expect("failed to define a metric");
237 388 :
238 388 : GetVectoredLatency {
239 12028 : map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
240 12028 : let task_kind = TaskKind::from_usize(task_kind_idx);
241 12028 :
242 12028 : if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
243 776 : let task_kind = task_kind.into();
244 776 : Some(inner.with_label_values(&[task_kind]))
245 : } else {
246 11252 : None
247 : }
248 12028 : })),
249 388 : }
250 388 : });
251 :
252 8 : pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
253 8 : let inner = register_histogram_vec!(
254 8 : "pageserver_scan_seconds",
255 8 : "Time spent in scan.",
256 8 : &["task_kind"],
257 8 : CRITICAL_OP_BUCKETS.into(),
258 8 : )
259 8 : .expect("failed to define a metric");
260 8 :
261 8 : ScanLatency {
262 248 : map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
263 248 : let task_kind = TaskKind::from_usize(task_kind_idx);
264 248 :
265 248 : if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
266 8 : let task_kind = task_kind.into();
267 8 : Some(inner.with_label_values(&[task_kind]))
268 : } else {
269 240 : None
270 : }
271 248 : })),
272 8 : }
273 8 : });
274 :
275 : pub(crate) struct PageCacheMetricsForTaskKind {
276 : pub read_accesses_immutable: IntCounter,
277 : pub read_hits_immutable: IntCounter,
278 : }
279 :
280 : pub(crate) struct PageCacheMetrics {
281 : map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
282 : }
283 :
284 184 : static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
285 184 : register_int_counter_vec!(
286 184 : "pageserver_page_cache_read_hits_total",
287 184 : "Number of read accesses to the page cache that hit",
288 184 : &["task_kind", "key_kind", "content_kind", "hit_kind"]
289 184 : )
290 184 : .expect("failed to define a metric")
291 184 : });
292 :
293 184 : static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
294 184 : register_int_counter_vec!(
295 184 : "pageserver_page_cache_read_accesses_total",
296 184 : "Number of read accesses to the page cache",
297 184 : &["task_kind", "key_kind", "content_kind"]
298 184 : )
299 184 : .expect("failed to define a metric")
300 184 : });
301 :
302 184 : pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
303 5704 : map: EnumMap::from_array(std::array::from_fn(|task_kind| {
304 5704 : let task_kind = TaskKind::from_usize(task_kind);
305 5704 : let task_kind: &'static str = task_kind.into();
306 45632 : EnumMap::from_array(std::array::from_fn(|content_kind| {
307 45632 : let content_kind = PageContentKind::from_usize(content_kind);
308 45632 : let content_kind: &'static str = content_kind.into();
309 45632 : PageCacheMetricsForTaskKind {
310 45632 : read_accesses_immutable: {
311 45632 : PAGE_CACHE_READ_ACCESSES
312 45632 : .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
313 45632 : .unwrap()
314 45632 : },
315 45632 :
316 45632 : read_hits_immutable: {
317 45632 : PAGE_CACHE_READ_HITS
318 45632 : .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
319 45632 : .unwrap()
320 45632 : },
321 45632 : }
322 45632 : }))
323 5704 : })),
324 184 : });
325 :
326 : impl PageCacheMetrics {
327 2329480 : pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
328 2329480 : &self.map[ctx.task_kind()][ctx.page_content_kind()]
329 2329480 : }
330 : }
331 :
332 : pub(crate) struct PageCacheSizeMetrics {
333 : pub max_bytes: UIntGauge,
334 :
335 : pub current_bytes_immutable: UIntGauge,
336 : }
337 :
338 184 : static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
339 184 : register_uint_gauge_vec!(
340 184 : "pageserver_page_cache_size_current_bytes",
341 184 : "Current size of the page cache in bytes, by key kind",
342 184 : &["key_kind"]
343 184 : )
344 184 : .expect("failed to define a metric")
345 184 : });
346 :
347 : pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
348 184 : Lazy::new(|| PageCacheSizeMetrics {
349 184 : max_bytes: {
350 184 : register_uint_gauge!(
351 184 : "pageserver_page_cache_size_max_bytes",
352 184 : "Maximum size of the page cache in bytes"
353 184 : )
354 184 : .expect("failed to define a metric")
355 184 : },
356 184 : current_bytes_immutable: {
357 184 : PAGE_CACHE_SIZE_CURRENT_BYTES
358 184 : .get_metric_with_label_values(&["immutable"])
359 184 : .unwrap()
360 184 : },
361 184 : });
362 :
363 : pub(crate) mod page_cache_eviction_metrics {
364 : use std::num::NonZeroUsize;
365 :
366 : use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
367 : use once_cell::sync::Lazy;
368 :
369 : #[derive(Clone, Copy)]
370 : pub(crate) enum Outcome {
371 : FoundSlotUnused { iters: NonZeroUsize },
372 : FoundSlotEvicted { iters: NonZeroUsize },
373 : ItersExceeded { iters: NonZeroUsize },
374 : }
375 :
376 184 : static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
377 184 : register_int_counter_vec!(
378 184 : "pageserver_page_cache_find_victim_iters_total",
379 184 : "Counter for the number of iterations in the find_victim loop",
380 184 : &["outcome"],
381 184 : )
382 184 : .expect("failed to define a metric")
383 184 : });
384 :
385 184 : static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
386 184 : register_int_counter_vec!(
387 184 : "pageserver_page_cache_find_victim_calls",
388 184 : "Incremented at the end of each find_victim() call.\
389 184 : Filter by outcome to get e.g., eviction rate.",
390 184 : &["outcome"]
391 184 : )
392 184 : .unwrap()
393 184 : });
394 :
395 63464 : pub(crate) fn observe(outcome: Outcome) {
396 : macro_rules! dry {
397 : ($label:literal, $iters:expr) => {{
398 : static LABEL: &'static str = $label;
399 : static ITERS_TOTAL: Lazy<IntCounter> =
400 224 : Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
401 : static CALLS: Lazy<IntCounter> =
402 224 : Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
403 : ITERS_TOTAL.inc_by(($iters.get()) as u64);
404 : CALLS.inc();
405 : }};
406 : }
407 63464 : match outcome {
408 3248 : Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
409 60216 : Outcome::FoundSlotEvicted { iters } => {
410 60216 : dry!("found_evicted", iters)
411 : }
412 0 : Outcome::ItersExceeded { iters } => {
413 0 : dry!("err_iters_exceeded", iters);
414 0 : super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
415 0 : }
416 : }
417 63464 : }
418 : }
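// The `dry!` macro above caches each labelled child counter in a `static Lazy`,
// so the label lookup on the counter vec happens once per label rather than on
// every observation. A stand-alone sketch of the same pattern, with a
// hypothetical counter vec (illustrative only, not registered by this module):
//
//     static REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
//         register_int_counter_vec!("example_requests_total", "Example counter", &["outcome"]).unwrap()
//     });
//     static REQUESTS_OK: Lazy<IntCounter> = Lazy::new(|| REQUESTS.with_label_values(&["ok"]));
//
//     fn on_success() {
//         REQUESTS_OK.inc(); // no label lookup on the hot path
//     }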
419 :
420 0 : static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
421 0 : register_int_counter_vec!(
422 0 : "page_cache_errors_total",
423 0 : "Number of timeouts while acquiring a pinned slot in the page cache",
424 0 : &["error_kind"]
425 0 : )
426 0 : .expect("failed to define a metric")
427 0 : });
428 :
429 : #[derive(IntoStaticStr)]
430 : #[strum(serialize_all = "kebab_case")]
431 : pub(crate) enum PageCacheErrorKind {
432 : AcquirePinnedSlotTimeout,
433 : EvictIterLimit,
434 : }
435 :
436 0 : pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
437 0 : PAGE_CACHE_ERRORS
438 0 : .get_metric_with_label_values(&[error_kind.into()])
439 0 : .unwrap()
440 0 : .inc();
441 0 : }
442 :
443 40 : pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
444 40 : register_histogram!(
445 40 : "pageserver_wait_lsn_seconds",
446 40 : "Time spent waiting for WAL to arrive",
447 40 : CRITICAL_OP_BUCKETS.into(),
448 40 : )
449 40 : .expect("failed to define a metric")
450 40 : });
451 :
452 404 : static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
453 404 : register_gauge_vec!(
454 404 : "pageserver_flush_wait_upload_seconds",
455 404 : "Time spent waiting for preceding uploads during layer flush",
456 404 : &["tenant_id", "shard_id", "timeline_id"]
457 404 : )
458 404 : .expect("failed to define a metric")
459 404 : });
460 :
461 404 : static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
462 404 : register_int_gauge_vec!(
463 404 : "pageserver_last_record_lsn",
464 404 : "Last record LSN grouped by timeline",
465 404 : &["tenant_id", "shard_id", "timeline_id"]
466 404 : )
467 404 : .expect("failed to define a metric")
468 404 : });
469 :
470 404 : static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
471 404 : register_int_gauge_vec!(
472 404 : "pageserver_disk_consistent_lsn",
473 404 : "Disk consistent LSN grouped by timeline",
474 404 : &["tenant_id", "shard_id", "timeline_id"]
475 404 : )
476 404 : .expect("failed to define a metric")
477 404 : });
478 :
479 404 : pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
480 404 : register_uint_gauge_vec!(
481 404 : "pageserver_projected_remote_consistent_lsn",
482 404 : "Projected remote consistent LSN grouped by timeline",
483 404 : &["tenant_id", "shard_id", "timeline_id"]
484 404 : )
485 404 : .expect("failed to define a metric")
486 404 : });
487 :
488 404 : static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
489 404 : register_uint_gauge_vec!(
490 404 : "pageserver_pitr_history_size",
491 404 : "Data written since PITR cutoff on this timeline",
492 404 : &["tenant_id", "shard_id", "timeline_id"]
493 404 : )
494 404 : .expect("failed to define a metric")
495 404 : });
496 :
497 : #[derive(
498 240 : strum_macros::EnumIter,
499 0 : strum_macros::EnumString,
500 : strum_macros::Display,
501 : strum_macros::IntoStaticStr,
502 : )]
503 : #[strum(serialize_all = "kebab_case")]
504 : pub(crate) enum LayerKind {
505 : Delta,
506 : Image,
507 : }
508 :
509 : #[derive(
510 100 : strum_macros::EnumIter,
511 0 : strum_macros::EnumString,
512 : strum_macros::Display,
513 : strum_macros::IntoStaticStr,
514 : )]
515 : #[strum(serialize_all = "kebab_case")]
516 : pub(crate) enum LayerLevel {
517 : // We don't track the currently open ephemeral layer, since there's always exactly 1 and its
518 : // size changes. See `TIMELINE_EPHEMERAL_BYTES`.
519 : Frozen,
520 : L0,
521 : L1,
522 : }
523 :
524 396 : static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
525 396 : register_uint_gauge_vec!(
526 396 : "pageserver_layer_bytes",
527 396 : "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)",
528 396 : &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
529 396 : )
530 396 : .expect("failed to define a metric")
531 396 : });
532 :
533 396 : static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
534 396 : register_uint_gauge_vec!(
535 396 : "pageserver_layer_count",
536 396 : "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)",
537 396 : &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
538 396 : )
539 396 : .expect("failed to define a metric")
540 396 : });
541 :
542 404 : static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
543 404 : register_uint_gauge_vec!(
544 404 : "pageserver_archive_size",
545 404 : "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
546 404 : &["tenant_id", "shard_id", "timeline_id"]
547 404 : )
548 404 : .expect("failed to define a metric")
549 404 : });
550 :
551 404 : static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
552 404 : register_int_gauge_vec!(
553 404 : "pageserver_standby_horizon",
554 404 : "Standby apply LSN for which GC is hold off, by timeline.",
555 404 : &["tenant_id", "shard_id", "timeline_id"]
556 404 : )
557 404 : .expect("failed to define a metric")
558 404 : });
559 :
560 404 : static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
561 404 : register_uint_gauge_vec!(
562 404 : "pageserver_resident_physical_size",
563 404 : "The size of the layer files present in the pageserver's filesystem, for attached locations.",
564 404 : &["tenant_id", "shard_id", "timeline_id"]
565 404 : )
566 404 : .expect("failed to define a metric")
567 404 : });
568 :
569 404 : static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
570 404 : register_uint_gauge_vec!(
571 404 : "pageserver_visible_physical_size",
572 404 : "The size of the layer files present in the pageserver's filesystem.",
573 404 : &["tenant_id", "shard_id", "timeline_id"]
574 404 : )
575 404 : .expect("failed to define a metric")
576 404 : });
577 :
578 396 : pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
579 396 : register_uint_gauge!(
580 396 : "pageserver_resident_physical_size_global",
581 396 : "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
582 396 : )
583 396 : .expect("failed to define a metric")
584 396 : });
585 :
586 404 : static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
587 404 : register_uint_gauge_vec!(
588 404 : "pageserver_remote_physical_size",
589 404 : "The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
590 404 : // Corollary: If any files are missing from the index part, they won't be included here.
591 404 : &["tenant_id", "shard_id", "timeline_id"]
592 404 : )
593 404 : .expect("failed to define a metric")
594 404 : });
595 :
596 404 : static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
597 404 : register_uint_gauge!(
598 404 : "pageserver_remote_physical_size_global",
599 404 : "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
600 404 : )
601 404 : .expect("failed to define a metric")
602 404 : });
603 :
604 12 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
605 12 : register_int_counter!(
606 12 : "pageserver_remote_ondemand_downloaded_layers_total",
607 12 : "Total on-demand downloaded layers"
608 12 : )
609 12 : .unwrap()
610 12 : });
611 :
612 12 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
613 12 : register_int_counter!(
614 12 : "pageserver_remote_ondemand_downloaded_bytes_total",
615 12 : "Total bytes of layers on-demand downloaded",
616 12 : )
617 12 : .unwrap()
618 12 : });
619 :
620 404 : static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
621 404 : register_uint_gauge_vec!(
622 404 : "pageserver_current_logical_size",
623 404 : "Current logical size grouped by timeline",
624 404 : &["tenant_id", "shard_id", "timeline_id"]
625 404 : )
626 404 : .expect("failed to define current logical size metric")
627 404 : });
628 :
629 404 : static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
630 404 : register_int_gauge_vec!(
631 404 : "pageserver_aux_file_estimated_size",
632 404 : "The size of all aux files for a timeline in aux file v2 store.",
633 404 : &["tenant_id", "shard_id", "timeline_id"]
634 404 : )
635 404 : .expect("failed to define a metric")
636 404 : });
637 :
638 404 : static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
639 404 : register_uint_gauge_vec!(
640 404 : "pageserver_valid_lsn_lease_count",
641 404 : "The number of valid leases after refreshing gc info.",
642 404 : &["tenant_id", "shard_id", "timeline_id"],
643 404 : )
644 404 : .expect("failed to define a metric")
645 404 : });
646 :
647 0 : pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
648 0 : register_int_counter!(
649 0 : "pageserver_circuit_breaker_broken",
650 0 : "How many times a circuit breaker has broken"
651 0 : )
652 0 : .expect("failed to define a metric")
653 0 : });
654 :
655 0 : pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
656 0 : register_int_counter!(
657 0 : "pageserver_circuit_breaker_unbroken",
658 0 : "How many times a circuit breaker has been un-broken (recovered)"
659 0 : )
660 0 : .expect("failed to define a metric")
661 0 : });
662 :
663 388 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
664 388 : register_int_counter!(
665 388 : "pageserver_compression_image_in_bytes_total",
666 388 : "Size of data written into image layers before compression"
667 388 : )
668 388 : .expect("failed to define a metric")
669 388 : });
670 :
671 388 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
672 388 : register_int_counter!(
673 388 : "pageserver_compression_image_in_bytes_considered",
674 388 : "Size of potentially compressible data written into image layers before compression"
675 388 : )
676 388 : .expect("failed to define a metric")
677 388 : });
678 :
679 388 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
680 388 : register_int_counter!(
681 388 : "pageserver_compression_image_in_bytes_chosen",
682 388 : "Size of data whose compressed form was written into image layers"
683 388 : )
684 388 : .expect("failed to define a metric")
685 388 : });
686 :
687 388 : pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
688 388 : register_int_counter!(
689 388 : "pageserver_compression_image_out_bytes_total",
690 388 : "Size of compressed image layer written"
691 388 : )
692 388 : .expect("failed to define a metric")
693 388 : });
694 :
695 20 : pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
696 20 : register_uint_gauge!(
697 20 : "pageserver_relsize_cache_entries",
698 20 : "Number of entries in the relation size cache",
699 20 : )
700 20 : .expect("failed to define a metric")
701 20 : });
702 :
703 20 : pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
704 20 : register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
705 20 : .expect("failed to define a metric")
706 20 : });
707 :
708 20 : pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
709 20 : register_int_counter!(
710 20 : "pageserver_relsize_cache_misses",
711 20 : "Relation size cache misses",
712 20 : )
713 20 : .expect("failed to define a metric")
714 20 : });
715 :
716 8 : pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
717 8 : register_int_counter!(
718 8 : "pageserver_relsize_cache_misses_old",
719 8 : "Relation size cache misses where the lookup LSN is older than the last relation update"
720 8 : )
721 8 : .expect("failed to define a metric")
722 8 : });
723 :
724 : pub(crate) mod initial_logical_size {
725 : use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
726 : use once_cell::sync::Lazy;
727 :
728 : pub(crate) struct StartCalculation(IntCounterVec);
729 404 : pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
730 404 : StartCalculation(
731 404 : register_int_counter_vec!(
732 404 : "pageserver_initial_logical_size_start_calculation",
733 404 : "Incremented each time we start an initial logical size calculation attempt. \
734 404 : The `circumstances` label provides some additional details.",
735 404 : &["attempt", "circumstances"]
736 404 : )
737 404 : .unwrap(),
738 404 : )
739 404 : });
740 :
741 : struct DropCalculation {
742 : first: IntCounter,
743 : retry: IntCounter,
744 : }
745 :
746 404 : static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
747 404 : let vec = register_int_counter_vec!(
748 404 : "pageserver_initial_logical_size_drop_calculation",
749 404 : "Incremented each time we abort a started size calculation attmpt.",
750 404 : &["attempt"]
751 404 : )
752 404 : .unwrap();
753 404 : DropCalculation {
754 404 : first: vec.with_label_values(&["first"]),
755 404 : retry: vec.with_label_values(&["retry"]),
756 404 : }
757 404 : });
758 :
759 : pub(crate) struct Calculated {
760 : pub(crate) births: IntCounter,
761 : pub(crate) deaths: IntCounter,
762 : }
763 :
764 404 : pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
765 404 : births: register_int_counter!(
766 404 : "pageserver_initial_logical_size_finish_calculation",
767 404 : "Incremented every time we finish calculation of initial logical size.\
768 404 : If everything is working well, this should happen at most once per Timeline object."
769 404 : )
770 404 : .unwrap(),
771 404 : deaths: register_int_counter!(
772 404 : "pageserver_initial_logical_size_drop_finished_calculation",
773 404 : "Incremented when we drop a finished initial logical size calculation result.\
774 404 : Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
775 404 : )
776 404 : .unwrap(),
777 404 : });
778 :
779 : pub(crate) struct OngoingCalculationGuard {
780 : inc_drop_calculation: Option<IntCounter>,
781 : }
782 :
783 : #[derive(strum_macros::IntoStaticStr)]
784 : pub(crate) enum StartCircumstances {
785 : EmptyInitial,
786 : SkippedConcurrencyLimiter,
787 : AfterBackgroundTasksRateLimit,
788 : }
789 :
790 : impl StartCalculation {
791 428 : pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
792 428 : let circumstances_label: &'static str = circumstances.into();
793 428 : self.0
794 428 : .with_label_values(&["first", circumstances_label])
795 428 : .inc();
796 428 : OngoingCalculationGuard {
797 428 : inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
798 428 : }
799 428 : }
800 0 : pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
801 0 : let circumstances_label: &'static str = circumstances.into();
802 0 : self.0
803 0 : .with_label_values(&["retry", circumstances_label])
804 0 : .inc();
805 0 : OngoingCalculationGuard {
806 0 : inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
807 0 : }
808 0 : }
809 : }
810 :
811 : impl Drop for OngoingCalculationGuard {
812 428 : fn drop(&mut self) {
813 428 : if let Some(counter) = self.inc_drop_calculation.take() {
814 0 : counter.inc();
815 428 : }
816 428 : }
817 : }
818 :
819 : impl OngoingCalculationGuard {
820 428 : pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
821 428 : drop(self.inc_drop_calculation.take());
822 428 : CALCULATED.births.inc();
823 428 : FinishedCalculationGuard {
824 428 : inc_on_drop: CALCULATED.deaths.clone(),
825 428 : }
826 428 : }
827 : }
828 :
829 : pub(crate) struct FinishedCalculationGuard {
830 : inc_on_drop: IntCounter,
831 : }
832 :
833 : impl Drop for FinishedCalculationGuard {
834 12 : fn drop(&mut self) {
835 12 : self.inc_on_drop.inc();
836 12 : }
837 : }
838 :
839 : // context: https://github.com/neondatabase/neon/issues/5963
840 : pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
841 0 : Lazy::new(|| {
842 0 : register_int_counter!(
843 0 : "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
844 0 : "Counter for the following event: walreceiver calls\
845 0 : Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
846 0 : )
847 0 : .unwrap()
848 0 : });
849 : }
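// A minimal usage sketch (hypothetical caller) of the guard pattern in the
// module above: `first()`/`retry()` bump the start counter and arm a drop
// counter; `calculation_result_saved()` disarms it and hands back a guard that
// instead counts drops of the finished result.
#[allow(dead_code)]
fn initial_logical_size_guard_sketch() {
    let guard = initial_logical_size::START_CALCULATION
        .first(initial_logical_size::StartCircumstances::SkippedConcurrencyLimiter);
    // ... compute the initial logical size; bailing out here would increment
    // the "drop calculation" counter via the guard's Drop impl ...
    let _finished = guard.calculation_result_saved();
    // dropping `_finished` increments the "drop finished calculation" counter
}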
850 :
851 0 : static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
852 0 : register_uint_gauge_vec!(
853 0 : "pageserver_directory_entries_count",
854 0 : "Sum of the entries in pageserver-stored directory listings",
855 0 : &["tenant_id", "shard_id", "timeline_id"]
856 0 : )
857 0 : .expect("failed to define a metric")
858 0 : });
859 :
860 408 : pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
861 408 : register_uint_gauge_vec!(
862 408 : "pageserver_tenant_states_count",
863 408 : "Count of tenants per state",
864 408 : &["state"]
865 408 : )
866 408 : .expect("Failed to register pageserver_tenant_states_count metric")
867 408 : });
868 :
869 : /// A set of broken tenants.
870 : ///
871 : /// These are expected to be so rare that a set is fine. Set as in a new timeseries for each broken
872 : /// tenant.
873 20 : pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
874 20 : register_uint_gauge_vec!(
875 20 : "pageserver_broken_tenants_count",
876 20 : "Set of broken tenants",
877 20 : &["tenant_id", "shard_id"]
878 20 : )
879 20 : .expect("Failed to register pageserver_tenant_states_count metric")
880 20 : });
881 :
882 12 : pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
883 12 : register_uint_gauge_vec!(
884 12 : "pageserver_tenant_synthetic_cached_size_bytes",
885 12 : "Synthetic size of each tenant in bytes",
886 12 : &["tenant_id"]
887 12 : )
888 12 : .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
889 12 : });
890 :
891 0 : pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
892 0 : register_histogram_vec!(
893 0 : "pageserver_eviction_iteration_duration_seconds_global",
894 0 : "Time spent on a single eviction iteration",
895 0 : &["period_secs", "threshold_secs"],
896 0 : STORAGE_OP_BUCKETS.into(),
897 0 : )
898 0 : .expect("failed to define a metric")
899 0 : });
900 :
901 404 : static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
902 404 : register_int_counter_vec!(
903 404 : "pageserver_evictions",
904 404 : "Number of layers evicted from the pageserver",
905 404 : &["tenant_id", "shard_id", "timeline_id"]
906 404 : )
907 404 : .expect("failed to define a metric")
908 404 : });
909 :
910 404 : static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
911 404 : register_int_counter_vec!(
912 404 : "pageserver_evictions_with_low_residence_duration",
913 404 : "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
914 404 : Residence duration is determined using the `residence_duration_data_source`.",
915 404 : &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
916 404 : )
917 404 : .expect("failed to define a metric")
918 404 : });
919 :
920 0 : pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
921 0 : register_int_counter!(
922 0 : "pageserver_unexpected_ondemand_downloads_count",
923 0 : "Number of unexpected on-demand downloads. \
924 0 : We log more context for each increment, so we forgo any labels in this metric.",
925 0 : )
926 0 : .expect("failed to define a metric")
927 0 : });
928 :
929 : /// How long did we take to start up? Broken down by labels to describe
930 : /// different phases of startup.
931 0 : pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
932 0 : register_gauge_vec!(
933 0 : "pageserver_startup_duration_seconds",
934 0 : "Time taken by phases of pageserver startup, in seconds",
935 0 : &["phase"]
936 0 : )
937 0 : .expect("Failed to register pageserver_startup_duration_seconds metric")
938 0 : });
939 :
940 0 : pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
941 0 : register_uint_gauge!(
942 0 : "pageserver_startup_is_loading",
943 0 : "1 while in initial startup load of tenants, 0 at other times"
944 0 : )
945 0 : .expect("Failed to register pageserver_startup_is_loading")
946 0 : });
947 :
948 396 : pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
949 396 : register_uint_gauge!(
950 396 : "pageserver_timeline_ephemeral_bytes",
951 396 : "Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated."
952 396 : )
953 396 : .expect("Failed to register metric")
954 396 : });
955 :
956 : /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
957 : /// like how long it took to load.
958 : ///
959 : /// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
960 : /// metrics are rather expensive, and usually fine grained stuff makes more sense
961 : /// at a timeline level than tenant level.
962 : pub(crate) struct TenantMetrics {
963 : /// How long did tenants take to go from construction to active state?
964 : pub(crate) activation: Histogram,
965 : pub(crate) preload: Histogram,
966 : pub(crate) attach: Histogram,
967 :
968 : /// How many tenants are included in the initial startup of the pagesrever?
969 : pub(crate) startup_scheduled: IntCounter,
970 : pub(crate) startup_complete: IntCounter,
971 : }
972 :
973 0 : pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
974 0 : TenantMetrics {
975 0 : activation: register_histogram!(
976 0 : "pageserver_tenant_activation_seconds",
977 0 : "Time taken by tenants to activate, in seconds",
978 0 : CRITICAL_OP_BUCKETS.into()
979 0 : )
980 0 : .expect("Failed to register metric"),
981 0 : preload: register_histogram!(
982 0 : "pageserver_tenant_preload_seconds",
983 0 : "Time taken by tenants to load remote metadata on startup/attach, in seconds",
984 0 : CRITICAL_OP_BUCKETS.into()
985 0 : )
986 0 : .expect("Failed to register metric"),
987 0 : attach: register_histogram!(
988 0 : "pageserver_tenant_attach_seconds",
989 0 : "Time taken by tenants to intialize, after remote metadata is already loaded",
990 0 : CRITICAL_OP_BUCKETS.into()
991 0 : )
992 0 : .expect("Failed to register metric"),
993 0 : startup_scheduled: register_int_counter!(
994 0 : "pageserver_tenant_startup_scheduled",
995 0 : "Number of tenants included in pageserver startup (doesn't count tenants attached later)"
996 0 : ).expect("Failed to register metric"),
997 0 : startup_complete: register_int_counter!(
998 0 : "pageserver_tenant_startup_complete",
999 0 : "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
1000 0 : should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \
1001 0 : tenants: such cases will lead to this metric never reaching the scheduled count."
1002 0 : ).expect("Failed to register metric"),
1003 0 : }
1004 0 : });
1005 :
1006 : /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
1007 : #[derive(Debug)]
1008 : pub(crate) struct EvictionsWithLowResidenceDuration {
1009 : data_source: &'static str,
1010 : threshold: Duration,
1011 : counter: Option<IntCounter>,
1012 : }
1013 :
1014 : pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
1015 : data_source: &'static str,
1016 : threshold: Duration,
1017 : }
1018 :
1019 : impl EvictionsWithLowResidenceDurationBuilder {
1020 896 : pub fn new(data_source: &'static str, threshold: Duration) -> Self {
1021 896 : Self {
1022 896 : data_source,
1023 896 : threshold,
1024 896 : }
1025 896 : }
1026 :
1027 896 : fn build(
1028 896 : &self,
1029 896 : tenant_id: &str,
1030 896 : shard_id: &str,
1031 896 : timeline_id: &str,
1032 896 : ) -> EvictionsWithLowResidenceDuration {
1033 896 : let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
1034 896 : .get_metric_with_label_values(&[
1035 896 : tenant_id,
1036 896 : shard_id,
1037 896 : timeline_id,
1038 896 : self.data_source,
1039 896 : &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
1040 896 : ])
1041 896 : .unwrap();
1042 896 : EvictionsWithLowResidenceDuration {
1043 896 : data_source: self.data_source,
1044 896 : threshold: self.threshold,
1045 896 : counter: Some(counter),
1046 896 : }
1047 896 : }
1048 : }
1049 :
1050 : impl EvictionsWithLowResidenceDuration {
1051 916 : fn threshold_label_value(threshold: Duration) -> String {
1052 916 : format!("{}", threshold.as_secs())
1053 916 : }
1054 :
1055 8 : pub fn observe(&self, observed_value: Duration) {
1056 8 : if observed_value < self.threshold {
1057 8 : self.counter
1058 8 : .as_ref()
1059 8 : .expect("nobody calls this function after `remove_from_vec`")
1060 8 : .inc();
1061 8 : }
1062 8 : }
1063 :
1064 0 : pub fn change_threshold(
1065 0 : &mut self,
1066 0 : tenant_id: &str,
1067 0 : shard_id: &str,
1068 0 : timeline_id: &str,
1069 0 : new_threshold: Duration,
1070 0 : ) {
1071 0 : if new_threshold == self.threshold {
1072 0 : return;
1073 0 : }
1074 0 : let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
1075 0 : self.data_source,
1076 0 : new_threshold,
1077 0 : )
1078 0 : .build(tenant_id, shard_id, timeline_id);
1079 0 : std::mem::swap(self, &mut with_new);
1080 0 : with_new.remove(tenant_id, shard_id, timeline_id);
1081 0 : }
1082 :
1083 : // This could be a `Drop` impl, but we need the `tenant_id` and `timeline_id`.
1084 20 : fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
1085 20 : let Some(_counter) = self.counter.take() else {
1086 0 : return;
1087 : };
1088 :
1089 20 : let threshold = Self::threshold_label_value(self.threshold);
1090 20 :
1091 20 : let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
1092 20 : tenant_id,
1093 20 : shard_id,
1094 20 : timeline_id,
1095 20 : self.data_source,
1096 20 : &threshold,
1097 20 : ]);
1098 20 :
1099 20 : match removed {
1100 0 : Err(e) => {
1101 0 : // this has been hit in staging as
1102 0 : // <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
1103 0 : // because we can be in the drop path already, don't risk:
1104 0 : // - "double-panic => illegal instruction" or
1105 0 : // - future "drop panick => abort"
1106 0 : //
1107 0 : // so just nag: (the error has the labels)
1108 0 : tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}");
1109 : }
1110 : Ok(()) => {
1111 : // to help identify cases where we double-remove the same values, let's log all
1112 : // deletions?
1113 20 : tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source);
1114 : }
1115 : }
1116 20 : }
1117 : }
1118 :
1119 : // Metrics collected on disk IO operations
1120 : //
1121 : // Roughly logarithmic scale.
1122 : const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
1123 : 0.000030, // 30 usec
1124 : 0.001000, // 1000 usec
1125 : 0.030, // 30 ms
1126 : 1.000, // 1000 ms
1127 : 30.000, // 30000 ms
1128 : ];
1129 :
1130 : /// VirtualFile fs operation variants.
1131 : ///
1132 : /// Operations:
1133 : /// - open ([`std::fs::OpenOptions::open`])
1134 : /// - close (dropping [`crate::virtual_file::VirtualFile`])
1135 : /// - close-by-replace (close by replacement algorithm)
1136 : /// - read (`read_at`)
1137 : /// - write (`write_at`)
1138 : /// - seek (modify internal position or file length query)
1139 : /// - fsync ([`std::fs::File::sync_all`])
1140 : /// - metadata ([`std::fs::File::metadata`])
1141 : #[derive(
1142 0 : Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
1143 : )]
1144 : pub(crate) enum StorageIoOperation {
1145 : Open,
1146 : OpenAfterReplace,
1147 : Close,
1148 : CloseByReplace,
1149 : Read,
1150 : Write,
1151 : Seek,
1152 : Fsync,
1153 : Metadata,
1154 : }
1155 :
1156 : impl StorageIoOperation {
1157 4176 : pub fn as_str(&self) -> &'static str {
1158 4176 : match self {
1159 464 : StorageIoOperation::Open => "open",
1160 464 : StorageIoOperation::OpenAfterReplace => "open-after-replace",
1161 464 : StorageIoOperation::Close => "close",
1162 464 : StorageIoOperation::CloseByReplace => "close-by-replace",
1163 464 : StorageIoOperation::Read => "read",
1164 464 : StorageIoOperation::Write => "write",
1165 464 : StorageIoOperation::Seek => "seek",
1166 464 : StorageIoOperation::Fsync => "fsync",
1167 464 : StorageIoOperation::Metadata => "metadata",
1168 : }
1169 4176 : }
1170 : }
1171 :
1172 : /// Tracks time taken by fs operations near VirtualFile.
1173 : #[derive(Debug)]
1174 : pub(crate) struct StorageIoTime {
1175 : metrics: [Histogram; StorageIoOperation::COUNT],
1176 : }
1177 :
1178 : impl StorageIoTime {
1179 464 : fn new() -> Self {
1180 464 : let storage_io_histogram_vec = register_histogram_vec!(
1181 464 : "pageserver_io_operations_seconds",
1182 464 : "Time spent in IO operations",
1183 464 : &["operation"],
1184 464 : STORAGE_IO_TIME_BUCKETS.into()
1185 464 : )
1186 464 : .expect("failed to define a metric");
1187 4176 : let metrics = std::array::from_fn(|i| {
1188 4176 : let op = StorageIoOperation::from_repr(i).unwrap();
1189 4176 : storage_io_histogram_vec
1190 4176 : .get_metric_with_label_values(&[op.as_str()])
1191 4176 : .unwrap()
1192 4176 : });
1193 464 : Self { metrics }
1194 464 : }
1195 :
1196 4061046 : pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
1197 4061046 : &self.metrics[op as usize]
1198 4061046 : }
1199 : }
1200 :
1201 : pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
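// A minimal usage sketch: callers index directly into the fixed-size histogram
// array via the enum discriminant, so the hot path avoids any label lookup.
// The elapsed duration here is a hypothetical input.
#[allow(dead_code)]
fn storage_io_time_sketch(elapsed: Duration) {
    STORAGE_IO_TIME_METRIC
        .get(StorageIoOperation::Read)
        .observe(elapsed.as_secs_f64());
}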
1202 :
1203 : const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
1204 :
1205 : // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
1206 456 : pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
1207 456 : register_int_gauge_vec!(
1208 456 : "pageserver_io_operations_bytes_total",
1209 456 : "Total amount of bytes read/written in IO operations",
1210 456 : &["operation", "tenant_id", "shard_id", "timeline_id"]
1211 456 : )
1212 456 : .expect("failed to define a metric")
1213 456 : });
1214 :
1215 : #[cfg(not(test))]
1216 : pub(crate) mod virtual_file_descriptor_cache {
1217 : use super::*;
1218 :
1219 0 : pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
1220 0 : register_uint_gauge!(
1221 0 : "pageserver_virtual_file_descriptor_cache_size_max",
1222 0 : "Maximum number of open file descriptors in the cache."
1223 0 : )
1224 0 : .unwrap()
1225 0 : });
1226 :
1227 : // SIZE_CURRENT: derive it like so:
1228 : // ```
1229 : // sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$")
1230 : // -ignoring(operation)
1231 : // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"}
1232 : // ```
1233 : }
1234 :
1235 : #[cfg(not(test))]
1236 : pub(crate) mod virtual_file_io_engine {
1237 : use super::*;
1238 :
1239 0 : pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
1240 0 : register_uint_gauge_vec!(
1241 0 : "pageserver_virtual_file_io_engine_kind",
1242 0 : "The configured io engine for VirtualFile",
1243 0 : &["kind"],
1244 0 : )
1245 0 : .unwrap()
1246 0 : });
1247 : }
1248 :
1249 : pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
1250 : pub(crate) struct SmgrOpTimerInner {
1251 : global_execution_latency_histo: Histogram,
1252 : per_timeline_execution_latency_histo: Option<Histogram>,
1253 :
1254 : global_batch_wait_time: Histogram,
1255 : per_timeline_batch_wait_time: Histogram,
1256 :
1257 : global_flush_in_progress_micros: IntCounter,
1258 : per_timeline_flush_in_progress_micros: IntCounter,
1259 :
1260 : throttling: Arc<tenant_throttling::Pagestream>,
1261 :
1262 : timings: SmgrOpTimerState,
1263 : }
1264 :
1265 : /// The stages of request processing are represented by the enum variants.
1266 : /// Used as part of [`SmgrOpTimerInner::timings`].
1267 : ///
1268 : /// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
1269 : /// transition points.
1270 : /// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
1271 : /// to the next state.
1272 : ///
1273 : /// Each request goes through every stage, in all configurations.
1274 : ///
1275 : #[derive(Debug)]
1276 : enum SmgrOpTimerState {
1277 : Received {
1278 : // In the future, we may want to track the full time the request spent
1279 : // inside pageserver process (time spent in kernel buffers can't be tracked).
1280 : // `received_at` would be used for that.
1281 : #[allow(dead_code)]
1282 : received_at: Instant,
1283 : },
1284 : Throttling {
1285 : throttle_started_at: Instant,
1286 : },
1287 : Batching {
1288 : throttle_done_at: Instant,
1289 : },
1290 : Executing {
1291 : execution_started_at: Instant,
1292 : },
1293 : Flushing,
1294 : // NB: when adding observation points, remember to update the Drop impl.
1295 : }
1296 :
1297 : // NB: when adding observation points, remember to update the Drop impl.
1298 : impl SmgrOpTimer {
1299 : /// See [`SmgrOpTimerState`] for more context.
1300 0 : pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
1301 0 : let Some(inner) = self.0.as_mut() else {
1302 0 : return;
1303 : };
1304 0 : let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
1305 0 : return;
1306 : };
1307 0 : inner.throttling.count_accounted_start.inc();
1308 0 : inner.timings = SmgrOpTimerState::Throttling {
1309 0 : throttle_started_at: at,
1310 0 : };
1311 0 : }
1312 :
1313 : /// See [`SmgrOpTimerState`] for more context.
1314 0 : pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
1315 0 : let Some(inner) = self.0.as_mut() else {
1316 0 : return;
1317 : };
1318 : let SmgrOpTimerState::Throttling {
1319 0 : throttle_started_at,
1320 0 : } = &inner.timings
1321 : else {
1322 0 : return;
1323 : };
1324 0 : inner.throttling.count_accounted_finish.inc();
1325 0 : match throttle {
1326 0 : ThrottleResult::NotThrottled { end } => {
1327 0 : inner.timings = SmgrOpTimerState::Batching {
1328 0 : throttle_done_at: end,
1329 0 : };
1330 0 : }
1331 0 : ThrottleResult::Throttled { end } => {
1332 0 : // update metrics
1333 0 : inner.throttling.count_throttled.inc();
1334 0 : inner
1335 0 : .throttling
1336 0 : .wait_time
1337 0 : .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
1338 0 : // state transition
1339 0 : inner.timings = SmgrOpTimerState::Batching {
1340 0 : throttle_done_at: end,
1341 0 : };
1342 0 : }
1343 : }
1344 0 : }
1345 :
1346 : /// See [`SmgrOpTimerState`] for more context.
1347 0 : pub(crate) fn observe_execution_start(&mut self, at: Instant) {
1348 0 : let Some(inner) = self.0.as_mut() else {
1349 0 : return;
1350 : };
1351 0 : let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
1352 0 : return;
1353 : };
1354 : // update metrics
1355 0 : let batch = at - *throttle_done_at;
1356 0 : inner.global_batch_wait_time.observe(batch.as_secs_f64());
1357 0 : inner
1358 0 : .per_timeline_batch_wait_time
1359 0 : .observe(batch.as_secs_f64());
1360 0 : // state transition
1361 0 : inner.timings = SmgrOpTimerState::Executing {
1362 0 : execution_started_at: at,
1363 0 : }
1364 0 : }
1365 :
1366 : /// For all but the first caller, this is a no-op.
1367 : /// The first callers receives Some, subsequent ones None.
1368 : ///
1369 : /// See [`SmgrOpTimerState`] for more context.
1370 0 : pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option<SmgrOpFlushInProgress> {
1371 : // NB: unlike the other observe_* methods, this one take()s.
1372 : #[allow(clippy::question_mark)] // maintain similar code pattern.
1373 0 : let Some(mut inner) = self.0.take() else {
1374 0 : return None;
1375 : };
1376 : let SmgrOpTimerState::Executing {
1377 0 : execution_started_at,
1378 0 : } = &inner.timings
1379 : else {
1380 0 : return None;
1381 : };
1382 : // update metrics
1383 0 : let execution = at - *execution_started_at;
1384 0 : inner
1385 0 : .global_execution_latency_histo
1386 0 : .observe(execution.as_secs_f64());
1387 0 : if let Some(per_timeline_execution_latency_histo) =
1388 0 : &inner.per_timeline_execution_latency_histo
1389 0 : {
1390 0 : per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
1391 0 : }
1392 :
1393 : // state transition
1394 0 : inner.timings = SmgrOpTimerState::Flushing;
1395 0 :
1396 0 : // return the flush in progress object which
1397 0 : // will do the remaining metrics updates
1398 0 : let SmgrOpTimerInner {
1399 0 : global_flush_in_progress_micros,
1400 0 : per_timeline_flush_in_progress_micros,
1401 0 : ..
1402 0 : } = inner;
1403 0 : Some(SmgrOpFlushInProgress {
1404 0 : global_micros: global_flush_in_progress_micros,
1405 0 : per_timeline_micros: per_timeline_flush_in_progress_micros,
1406 0 : })
1407 0 : }
1408 : }
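// A minimal sketch (hypothetical request-handler flow) of the observation points
// above, in the order the state machine expects them; it mirrors the sequence
// used by the Drop impl below.
#[allow(dead_code)]
fn smgr_op_timer_sketch(mut timer: SmgrOpTimer) {
    let now = Instant::now();
    timer.observe_throttle_start(now);
    timer.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
    timer.observe_execution_start(now);
    // ... execute the request ...
    let _maybe_flush = timer.observe_execution_end(Instant::now());
    // `_maybe_flush` is Some only for the first observe_execution_end call and
    // is used to time the flush phase (see SmgrOpFlushInProgress below).
}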
1409 :
1410 : /// The last stage of request processing is serializing and flushing the request
1411 : /// into the TCP connection. We want to make slow flushes observable
1412 : /// _while they are occuring_, so this struct provides a wrapper method [`Self::measure`]
1413 : /// to periodically bump the metric.
1414 : ///
1415 : /// If in the future we decide that we're not interested in live updates, we can
1416 : /// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
1417 : /// and remove this struct from the code base.
1418 : pub(crate) struct SmgrOpFlushInProgress {
1419 : global_micros: IntCounter,
1420 : per_timeline_micros: IntCounter,
1421 : }
1422 :
1423 : impl Drop for SmgrOpTimer {
1424 0 : fn drop(&mut self) {
1425 0 : // In case of early drop, update any of the remaining metrics with
1426 0 : // observations so that (started,finished) counter pairs balance out
1427 0 : // and all counters on the latency path have the the same number of
1428 0 : // observations.
1429 0 : // It's technically lying and it would be better if each metric had
1430 0 : // a separate label or similar for cancelled requests.
1431 0 : // But we don't have that right now and counter pairs balancing
1432 0 : // out is useful when using the metrics in panels and whatnot.
1433 0 : let now = Instant::now();
1434 0 : self.observe_throttle_start(now);
1435 0 : self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
1436 0 : self.observe_execution_start(now);
1437 0 : let maybe_flush_timer = self.observe_execution_end(now);
1438 0 : drop(maybe_flush_timer);
1439 0 : }
1440 : }
1441 :
1442 : impl SmgrOpFlushInProgress {
1443 : /// The caller must guarantee that `socket_fd`` outlives this function.
1444 0 : pub(crate) async fn measure<Fut, O>(
1445 0 : self,
1446 0 : started_at: Instant,
1447 0 : mut fut: Fut,
1448 0 : socket_fd: RawFd,
1449 0 : ) -> O
1450 0 : where
1451 0 : Fut: std::future::Future<Output = O>,
1452 0 : {
1453 0 : let mut fut = std::pin::pin!(fut);
1454 0 :
1455 0 : let mut logged = false;
1456 0 : let mut last_counter_increment_at = started_at;
1457 0 : let mut observe_guard = scopeguard::guard(
1458 0 : |is_timeout| {
1459 0 : let now = Instant::now();
1460 0 :
1461 0 : // Increment counter
1462 0 : {
1463 0 : let elapsed_since_last_observe = now - last_counter_increment_at;
1464 0 : self.global_micros
1465 0 : .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
1466 0 : self.per_timeline_micros
1467 0 : .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
1468 0 : last_counter_increment_at = now;
1469 0 : }
1470 0 :
1471 0 : // Log something on every timeout, and on completion but only if we hit a timeout.
1472 0 : if is_timeout || logged {
1473 0 : logged = true;
1474 0 : let elapsed_total = now - started_at;
1475 0 : let msg = if is_timeout {
1476 0 : "slow flush ongoing"
1477 : } else {
1478 0 : "slow flush completed or cancelled"
1479 : };
1480 :
1481 0 : let (inq, outq) = {
1482 0 : // SAFETY: caller guarantees that `socket_fd` outlives this function.
1483 0 : #[cfg(target_os = "linux")]
1484 0 : unsafe {
1485 0 : (
1486 0 : utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2),
1487 0 : utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2),
1488 0 : )
1489 0 : }
1490 0 : #[cfg(not(target_os = "linux"))]
1491 0 : {
1492 0 : (-1, -1)
1493 0 : }
1494 0 : };
1495 0 :
1496 0 : let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
1497 0 : tracing::info!(elapsed_total_secs, inq, outq, msg);
1498 0 : }
1499 0 : },
1500 0 : |mut observe| {
1501 0 : observe(false);
1502 0 : },
1503 0 : );
1504 :
1505 : loop {
1506 0 : match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
1507 0 : Ok(v) => return v,
1508 0 : Err(_timeout) => {
1509 0 : (*observe_guard)(true);
1510 0 : }
1511 : }
1512 : }
1513 0 : }
1514 : }
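// A minimal usage sketch (hypothetical caller): wrap the flush future in
// `measure` so the in-progress counters keep advancing while the flush is still
// pending. Per the safety note above, `socket_fd` must outlive the call.
#[allow(dead_code)]
async fn smgr_flush_measure_sketch(flush: SmgrOpFlushInProgress, socket_fd: RawFd) {
    let flush_fut = async {
        // ... serialize the response and flush it to the TCP connection ...
    };
    flush.measure(Instant::now(), flush_fut, socket_fd).await;
}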
1515 :
1516 : #[derive(
1517 : Debug,
1518 : Clone,
1519 : Copy,
1520 : IntoStaticStr,
1521 : strum_macros::EnumCount,
1522 0 : strum_macros::EnumIter,
1523 : strum_macros::FromRepr,
1524 : enum_map::Enum,
1525 : )]
1526 : #[strum(serialize_all = "snake_case")]
1527 : pub enum SmgrQueryType {
1528 : GetRelExists,
1529 : GetRelSize,
1530 : GetPageAtLsn,
1531 : GetDbSize,
1532 : GetSlruSegment,
1533 : #[cfg(feature = "testing")]
1534 : Test,
1535 : }
1536 :
1537 : pub(crate) struct SmgrQueryTimePerTimeline {
1538 : global_started: [IntCounter; SmgrQueryType::COUNT],
1539 : global_latency: [Histogram; SmgrQueryType::COUNT],
1540 : per_timeline_getpage_started: IntCounter,
1541 : per_timeline_getpage_latency: Histogram,
1542 : global_batch_size: Histogram,
1543 : per_timeline_batch_size: Histogram,
1544 : global_flush_in_progress_micros: IntCounter,
1545 : per_timeline_flush_in_progress_micros: IntCounter,
1546 : global_batch_wait_time: Histogram,
1547 : per_timeline_batch_wait_time: Histogram,
1548 : throttling: Arc<tenant_throttling::Pagestream>,
1549 : }
1550 :
1551 404 : static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
1552 404 : register_int_counter_vec!(
1553 404 : // it's a counter, but the name is prepared for extending it to a histogram of queue depth
1554 404 : "pageserver_smgr_query_started_global_count",
1555 404 : "Number of smgr queries started, aggregated by query type.",
1556 404 : &["smgr_query_type"],
1557 404 : )
1558 404 : .expect("failed to define a metric")
1559 404 : });
1560 :
1561 404 : static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
1562 404 : register_int_counter_vec!(
1563 404 : // it's a counter, but the name is prepared for extending it to a histogram of queue depth
1564 404 : "pageserver_smgr_query_started_count",
1565 404 : "Number of smgr queries started, aggregated by query type and tenant/timeline.",
1566 404 : &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
1567 404 : )
1568 404 : .expect("failed to define a metric")
1569 404 : });
1570 :
1571 : // Alias so all histograms recording per-timeline smgr timings use the same buckets.
1572 : static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
1573 :
1574 404 : static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
1575 404 : register_histogram_vec!(
1576 404 : "pageserver_smgr_query_seconds",
1577 404 : "Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
1578 404 : &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
1579 404 : SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
1580 404 : )
1581 404 : .expect("failed to define a metric")
1582 404 : });
1583 :
1584 404 : static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
1585 404 : [
1586 404 : 1,
1587 404 : 10,
1588 404 : 20,
1589 404 : 40,
1590 404 : 60,
1591 404 : 80,
1592 404 : 100,
1593 404 : 200,
1594 404 : 300,
1595 404 : 400,
1596 404 : 500,
1597 404 : 600,
1598 404 : 700,
1599 404 : 800,
1600 404 : 900,
1601 404 : 1_000, // 1ms
1602 404 : 2_000,
1603 404 : 4_000,
1604 404 : 6_000,
1605 404 : 8_000,
1606 404 : 10_000, // 10ms
1607 404 : 20_000,
1608 404 : 40_000,
1609 404 : 60_000,
1610 404 : 80_000,
1611 404 : 100_000,
1612 404 : 200_000,
1613 404 : 400_000,
1614 404 : 600_000,
1615 404 : 800_000,
1616 404 : 1_000_000, // 1s
1617 404 : 2_000_000,
1618 404 : 4_000_000,
1619 404 : 6_000_000,
1620 404 : 8_000_000,
1621 404 : 10_000_000, // 10s
1622 404 : 20_000_000,
1623 404 : 50_000_000,
1624 404 : 100_000_000,
1625 404 : 200_000_000,
1626 404 : 1_000_000_000, // 1000s
1627 404 : ]
1628 404 : .into_iter()
1629 404 : .map(Duration::from_micros)
1630 16564 : .map(|d| d.as_secs_f64())
1631 404 : .collect()
1632 404 : });
1633 :
1634 404 : static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
1635 404 : register_histogram_vec!(
1636 404 : "pageserver_smgr_query_seconds_global",
1637 404 : "Like pageserver_smgr_query_seconds, but aggregated to instance level.",
1638 404 : &["smgr_query_type"],
1639 404 : SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
1640 404 : )
1641 404 : .expect("failed to define a metric")
1642 404 : });
1643 :
1644 404 : static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy<Vec<f64>> = Lazy::new(|| {
1645 404 : (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap())
1646 12928 : .map(|v| v.into())
1647 404 : .collect()
1648 404 : });
1649 :
1650 404 : static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
1651 404 : register_histogram!(
1652 404 : "pageserver_page_service_batch_size_global",
1653 404 : "Batch size of pageserver page service requests",
1654 404 : PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(),
1655 404 : )
1656 404 : .expect("failed to define a metric")
1657 404 : });
1658 :
1659 404 : static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy<Vec<f64>> = Lazy::new(|| {
1660 404 : let mut buckets = Vec::new();
1661 2828 : for i in 0.. {
1662 2828 : let bucket = 1 << i;
1663 2828 : if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() {
1664 404 : break;
1665 2424 : }
1666 2424 : buckets.push(bucket.into());
1667 : }
1668 404 : buckets
1669 404 : });
1670 :
1671 404 : static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
1672 404 : register_histogram_vec!(
1673 404 : "pageserver_page_service_batch_size",
1674 404 : "Batch size of pageserver page service requests",
1675 404 : &["tenant_id", "shard_id", "timeline_id"],
1676 404 : PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone()
1677 404 : )
1678 404 : .expect("failed to define a metric")
1679 404 : });
1680 :
1681 0 : pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
1682 0 : register_int_gauge_vec!(
1683 0 : "pageserver_page_service_config_max_batch_size",
1684 0 : "Configured maximum batch size for the server-side batching functionality of page_service. \
1685 0 : Labels expose more of the configuration parameters.",
1686 0 : &["mode", "execution"]
1687 0 : )
1688 0 : .expect("failed to define a metric")
1689 0 : });
1690 :
1691 0 : fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
1692 0 : PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
1693 0 : let (label_values, value) = match conf {
1694 0 : PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
1695 : PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
1696 0 : max_batch_size,
1697 0 : execution,
1698 0 : }) => {
1699 0 : let mode = "pipelined";
1700 0 : let execution = match execution {
1701 : PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
1702 0 : "concurrent-futures"
1703 : }
1704 0 : PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
1705 : };
1706 0 : ([mode, execution], max_batch_size.get())
1707 : }
1708 : };
1709 0 : PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
1710 0 : .with_label_values(&label_values)
1711 0 : .set(value.try_into().unwrap());
1712 0 : }
1713 :
1714 404 : static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
1715 404 : register_int_counter_vec!(
1716 404 : "pageserver_page_service_pagestream_flush_in_progress_micros",
1717 404 : "Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
1718 404 : If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
1719 404 : easily discoverable in monitoring. \
1720 404 : Hence, this is NOT a completion latency histogram.",
1721 404 : &["tenant_id", "shard_id", "timeline_id"],
1722 404 : )
1723 404 : .expect("failed to define a metric")
1724 404 : });
1725 :
1726 404 : static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
1727 404 : register_int_counter!(
1728 404 : "pageserver_page_service_pagestream_flush_in_progress_micros_global",
1729 404 : "Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
1730 404 : )
1731 404 : .expect("failed to define a metric")
1732 404 : });
1733 :
1734 404 : static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
1735 404 : register_histogram_vec!(
1736 404 : "pageserver_page_service_pagestream_batch_wait_time_seconds",
1737 404 : "Time a request spent waiting in its batch until the batch moved to throttle&execution.",
1738 404 : &["tenant_id", "shard_id", "timeline_id"],
1739 404 : SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
1740 404 : )
1741 404 : .expect("failed to define a metric")
1742 404 : });
1743 :
1744 404 : static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
1745 404 : register_histogram!(
1746 404 : "pageserver_page_service_pagestream_batch_wait_time_seconds_global",
1747 404 : "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
1748 404 : SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
1749 404 : )
1750 404 : .expect("failed to define a metric")
1751 404 : });
1752 :
1753 : impl SmgrQueryTimePerTimeline {
1754 896 : pub(crate) fn new(
1755 896 : tenant_shard_id: &TenantShardId,
1756 896 : timeline_id: &TimelineId,
1757 896 : pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
1758 896 : ) -> Self {
1759 896 : let tenant_id = tenant_shard_id.tenant_id.to_string();
1760 896 : let shard_slug = format!("{}", tenant_shard_id.shard_slug());
1761 896 : let timeline_id = timeline_id.to_string();
1762 5376 : let global_started = std::array::from_fn(|i| {
1763 5376 : let op = SmgrQueryType::from_repr(i).unwrap();
1764 5376 : SMGR_QUERY_STARTED_GLOBAL
1765 5376 : .get_metric_with_label_values(&[op.into()])
1766 5376 : .unwrap()
1767 5376 : });
1768 5376 : let global_latency = std::array::from_fn(|i| {
1769 5376 : let op = SmgrQueryType::from_repr(i).unwrap();
1770 5376 : SMGR_QUERY_TIME_GLOBAL
1771 5376 : .get_metric_with_label_values(&[op.into()])
1772 5376 : .unwrap()
1773 5376 : });
1774 896 :
1775 896 : let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
1776 896 : .get_metric_with_label_values(&[
1777 896 : SmgrQueryType::GetPageAtLsn.into(),
1778 896 : &tenant_id,
1779 896 : &shard_slug,
1780 896 : &timeline_id,
1781 896 : ])
1782 896 : .unwrap();
1783 896 : let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
1784 896 : .get_metric_with_label_values(&[
1785 896 : SmgrQueryType::GetPageAtLsn.into(),
1786 896 : &tenant_id,
1787 896 : &shard_slug,
1788 896 : &timeline_id,
1789 896 : ])
1790 896 : .unwrap();
1791 896 :
1792 896 : let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone();
1793 896 : let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE
1794 896 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
1795 896 : .unwrap();
1796 896 :
1797 896 : let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
1798 896 : let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
1799 896 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
1800 896 : .unwrap();
1801 896 :
1802 896 : let global_flush_in_progress_micros =
1803 896 : PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
1804 896 : let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
1805 896 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
1806 896 : .unwrap();
1807 896 :
1808 896 : Self {
1809 896 : global_started,
1810 896 : global_latency,
1811 896 : per_timeline_getpage_latency,
1812 896 : per_timeline_getpage_started,
1813 896 : global_batch_size,
1814 896 : per_timeline_batch_size,
1815 896 : global_flush_in_progress_micros,
1816 896 : per_timeline_flush_in_progress_micros,
1817 896 : global_batch_wait_time,
1818 896 : per_timeline_batch_wait_time,
1819 896 : throttling: pagestream_throttle_metrics,
1820 896 : }
1821 896 : }
1822 0 : pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
1823 0 : self.global_started[op as usize].inc();
1824 :
1825 0 : let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
1826 0 : self.per_timeline_getpage_started.inc();
1827 0 : Some(self.per_timeline_getpage_latency.clone())
1828 : } else {
1829 0 : None
1830 : };
1831 :
1832 0 : SmgrOpTimer(Some(SmgrOpTimerInner {
1833 0 : global_execution_latency_histo: self.global_latency[op as usize].clone(),
1834 0 : per_timeline_execution_latency_histo: per_timeline_latency_histo,
1835 0 : global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
1836 0 : per_timeline_flush_in_progress_micros: self
1837 0 : .per_timeline_flush_in_progress_micros
1838 0 : .clone(),
1839 0 : global_batch_wait_time: self.global_batch_wait_time.clone(),
1840 0 : per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
1841 0 : throttling: self.throttling.clone(),
1842 0 : timings: SmgrOpTimerState::Received { received_at },
1843 0 : }))
1844 0 : }
1845 :
1846 : /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
1847 0 : pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
1848 0 : self.global_batch_size.observe(batch_size as f64);
1849 0 : self.per_timeline_batch_size.observe(batch_size as f64);
1850 0 : }
1851 : }
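// Sketch of the intended timer lifecycle for `start_smgr_op` (a hedged outline; the real call
// sites live in the page_service request handlers, and `metrics`/`received_at` are
// illustrative bindings):
//
//     let mut timer = metrics.start_smgr_op(SmgrQueryType::GetPageAtLsn, received_at);
//     timer.observe_throttle_start(Instant::now());
//     timer.observe_throttle_done(ThrottleResult::NotThrottled { end: Instant::now() });
//     timer.observe_execution_start(Instant::now());
//     // ... execute the request ...
//     let _maybe_flush_timer = timer.observe_execution_end(Instant::now());
//
// If the timer is dropped before all stages were observed, the `Drop` impl above back-fills
// them with the current instant so that started/finished counter pairs still balance.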
1852 :
1853 : // keep in sync with control plane Go code so that we can validate
1854 : // compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
1855 0 : static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
1856 0 : // Go code uses milliseconds. Variable is called `computeStartupBuckets`
1857 0 : [
1858 0 : 5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
1859 0 : 1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
1860 0 : ]
1861 0 : .map(|ms| (ms as f64) / 1000.0)
1862 0 : });
1863 :
1864 : pub(crate) struct BasebackupQueryTime {
1865 : ok: Histogram,
1866 : error: Histogram,
1867 : client_error: Histogram,
1868 : }
1869 :
1870 0 : pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
1871 0 : let vec = register_histogram_vec!(
1872 0 : "pageserver_basebackup_query_seconds",
1873 0 : "Histogram of basebackup queries durations, by result type",
1874 0 : &["result"],
1875 0 : COMPUTE_STARTUP_BUCKETS.to_vec(),
1876 0 : )
1877 0 : .expect("failed to define a metric");
1878 0 : BasebackupQueryTime {
1879 0 : ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
1880 0 : error: vec.get_metric_with_label_values(&["error"]).unwrap(),
1881 0 : client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(),
1882 0 : }
1883 0 : });
1884 :
1885 : pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> {
1886 : parent: &'a BasebackupQueryTime,
1887 : start: std::time::Instant,
1888 : }
1889 :
1890 : impl BasebackupQueryTime {
1891 0 : pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> {
1892 0 : let start = Instant::now();
1893 0 : BasebackupQueryTimeOngoingRecording {
1894 0 : parent: self,
1895 0 : start,
1896 0 : }
1897 0 : }
1898 : }
1899 :
1900 : impl BasebackupQueryTimeOngoingRecording<'_> {
1901 0 : pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
1902 0 : let elapsed = self.start.elapsed().as_secs_f64();
1903 : // If you want to change the categorization of a specific error, also change it in `log_query_error`.
1904 0 : let metric = match res {
1905 0 : Ok(_) => &self.parent.ok,
1906 0 : Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
1907 0 : if is_expected_io_error(io_error) =>
1908 0 : {
1909 0 : &self.parent.client_error
1910 : }
1911 0 : Err(_) => &self.parent.error,
1912 : };
1913 0 : metric.observe(elapsed);
1914 0 : }
1915 : }
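// Usage sketch (hedged; the real call site is the basebackup query handler, and
// `run_basebackup` is an illustrative placeholder):
//
//     let recording = BASEBACKUP_QUERY_TIME.start_recording();
//     let res: Result<(), QueryError> = run_basebackup().await;
//     recording.observe(&res);
//
// Expected client disconnects are counted under the "client_error" label rather than "error".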
1916 :
1917 0 : pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
1918 0 : register_int_counter_pair_vec!(
1919 0 : "pageserver_live_connections_started",
1920 0 : "Number of network connections that we started handling",
1921 0 : "pageserver_live_connections_finished",
1922 0 : "Number of network connections that we finished handling",
1923 0 : &["pageserver_connection_kind"]
1924 0 : )
1925 0 : .expect("failed to define a metric")
1926 0 : });
1927 :
1928 : #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
1929 : pub(crate) enum ComputeCommandKind {
1930 : PageStreamV3,
1931 : PageStreamV2,
1932 : Basebackup,
1933 : Fullbackup,
1934 : LeaseLsn,
1935 : }
1936 :
1937 : pub(crate) struct ComputeCommandCounters {
1938 : map: EnumMap<ComputeCommandKind, IntCounter>,
1939 : }
1940 :
1941 0 : pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy::new(|| {
1942 0 : let inner = register_int_counter_vec!(
1943 0 : "pageserver_compute_commands",
1944 0 : "Number of compute -> pageserver commands processed",
1945 0 : &["command"]
1946 0 : )
1947 0 : .expect("failed to define a metric");
1948 0 :
1949 0 : ComputeCommandCounters {
1950 0 : map: EnumMap::from_array(std::array::from_fn(|i| {
1951 0 : let command = ComputeCommandKind::from_usize(i);
1952 0 : let command_str: &'static str = command.into();
1953 0 : inner.with_label_values(&[command_str])
1954 0 : })),
1955 0 : }
1956 0 : });
1957 :
1958 : impl ComputeCommandCounters {
1959 0 : pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter {
1960 0 : &self.map[command]
1961 0 : }
1962 : }
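// Usage sketch:
//
//     COMPUTE_COMMANDS_COUNTERS
//         .for_command(ComputeCommandKind::Basebackup)
//         .inc();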
1963 :
1964 : // remote storage metrics
1965 :
1966 396 : static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
1967 396 : register_int_counter_pair_vec!(
1968 396 : "pageserver_remote_timeline_client_calls_started",
1969 396 : "Number of started calls to remote timeline client.",
1970 396 : "pageserver_remote_timeline_client_calls_finished",
1971 396 : "Number of finshed calls to remote timeline client.",
1972 396 : &[
1973 396 : "tenant_id",
1974 396 : "shard_id",
1975 396 : "timeline_id",
1976 396 : "file_kind",
1977 396 : "op_kind"
1978 396 : ],
1979 396 : )
1980 396 : .unwrap()
1981 396 : });
1982 :
1983 : static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
1984 392 : Lazy::new(|| {
1985 392 : register_int_counter_vec!(
1986 392 : "pageserver_remote_timeline_client_bytes_started",
1987 392 : "Incremented by the number of bytes associated with a remote timeline client operation. \
1988 392 : The increment happens when the operation is scheduled.",
1989 392 : &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
1990 392 : )
1991 392 : .expect("failed to define a metric")
1992 392 : });
1993 :
1994 392 : static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
1995 392 : register_int_counter_vec!(
1996 392 : "pageserver_remote_timeline_client_bytes_finished",
1997 392 : "Incremented by the number of bytes associated with a remote timeline client operation. \
1998 392 : The increment happens when the operation finishes (regardless of success/failure/shutdown).",
1999 392 : &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
2000 392 : )
2001 392 : .expect("failed to define a metric")
2002 392 : });
2003 :
2004 : pub(crate) struct TenantManagerMetrics {
2005 : tenant_slots_attached: UIntGauge,
2006 : tenant_slots_secondary: UIntGauge,
2007 : tenant_slots_inprogress: UIntGauge,
2008 : pub(crate) tenant_slot_writes: IntCounter,
2009 : pub(crate) unexpected_errors: IntCounter,
2010 : }
2011 :
2012 : impl TenantManagerMetrics {
2013 : /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects
2014 : /// exactly: they track the lifetime of the slots _in the tenant map_.
2015 4 : pub(crate) fn slot_inserted(&self, slot: &TenantSlot) {
2016 4 : match slot {
2017 0 : TenantSlot::Attached(_) => {
2018 0 : self.tenant_slots_attached.inc();
2019 0 : }
2020 0 : TenantSlot::Secondary(_) => {
2021 0 : self.tenant_slots_secondary.inc();
2022 0 : }
2023 4 : TenantSlot::InProgress(_) => {
2024 4 : self.tenant_slots_inprogress.inc();
2025 4 : }
2026 : }
2027 4 : }
2028 :
2029 4 : pub(crate) fn slot_removed(&self, slot: &TenantSlot) {
2030 4 : match slot {
2031 4 : TenantSlot::Attached(_) => {
2032 4 : self.tenant_slots_attached.dec();
2033 4 : }
2034 0 : TenantSlot::Secondary(_) => {
2035 0 : self.tenant_slots_secondary.dec();
2036 0 : }
2037 0 : TenantSlot::InProgress(_) => {
2038 0 : self.tenant_slots_inprogress.dec();
2039 0 : }
2040 : }
2041 4 : }
2042 :
2043 : #[cfg(all(debug_assertions, not(test)))]
2044 0 : pub(crate) fn slots_total(&self) -> u64 {
2045 0 : self.tenant_slots_attached.get()
2046 0 : + self.tenant_slots_secondary.get()
2047 0 : + self.tenant_slots_inprogress.get()
2048 0 : }
2049 : }
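// Usage sketch (hedged; the real bookkeeping happens wherever the tenant map is mutated,
// and `new_slot`/`replaced` are illustrative bindings):
//
//     TENANT_MANAGER.slot_inserted(&new_slot);
//     if let Some(old_slot) = replaced {
//         TENANT_MANAGER.slot_removed(&old_slot);
//     }
//
// In debug (non-test) builds, `slots_total()` can be cross-checked against the tenant map length.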
2050 :
2051 4 : pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
2052 4 : let tenant_slots = register_uint_gauge_vec!(
2053 4 : "pageserver_tenant_manager_slots",
2054 4 : "How many slots currently exist, including all attached, secondary and in-progress operations",
2055 4 : &["mode"]
2056 4 : )
2057 4 : .expect("failed to define a metric");
2058 4 : TenantManagerMetrics {
2059 4 : tenant_slots_attached: tenant_slots
2060 4 : .get_metric_with_label_values(&["attached"])
2061 4 : .unwrap(),
2062 4 : tenant_slots_secondary: tenant_slots
2063 4 : .get_metric_with_label_values(&["secondary"])
2064 4 : .unwrap(),
2065 4 : tenant_slots_inprogress: tenant_slots
2066 4 : .get_metric_with_label_values(&["inprogress"])
2067 4 : .unwrap(),
2068 4 : tenant_slot_writes: register_int_counter!(
2069 4 : "pageserver_tenant_manager_slot_writes",
2070 4 : "Writes to a tenant slot, including all of create/attach/detach/delete"
2071 4 : )
2072 4 : .expect("failed to define a metric"),
2073 4 : unexpected_errors: register_int_counter!(
2074 4 : "pageserver_tenant_manager_unexpected_errors_total",
2075 4 : "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
2076 4 : )
2077 4 : .expect("failed to define a metric"),
2078 4 : }
2079 4 : });
2080 :
2081 : pub(crate) struct DeletionQueueMetrics {
2082 : pub(crate) keys_submitted: IntCounter,
2083 : pub(crate) keys_dropped: IntCounter,
2084 : pub(crate) keys_executed: IntCounter,
2085 : pub(crate) keys_validated: IntCounter,
2086 : pub(crate) dropped_lsn_updates: IntCounter,
2087 : pub(crate) unexpected_errors: IntCounter,
2088 : pub(crate) remote_errors: IntCounterVec,
2089 : }
2090 62 : pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
2091 62 : DeletionQueueMetrics{
2092 62 :
2093 62 : keys_submitted: register_int_counter!(
2094 62 : "pageserver_deletion_queue_submitted_total",
2095 62 : "Number of objects submitted for deletion"
2096 62 : )
2097 62 : .expect("failed to define a metric"),
2098 62 :
2099 62 : keys_dropped: register_int_counter!(
2100 62 : "pageserver_deletion_queue_dropped_total",
2101 62 : "Number of object deletions dropped due to stale generation."
2102 62 : )
2103 62 : .expect("failed to define a metric"),
2104 62 :
2105 62 : keys_executed: register_int_counter!(
2106 62 : "pageserver_deletion_queue_executed_total",
2107 62 : "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
2108 62 : )
2109 62 : .expect("failed to define a metric"),
2110 62 :
2111 62 : keys_validated: register_int_counter!(
2112 62 : "pageserver_deletion_queue_validated_total",
2113 62 : "Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
2114 62 : )
2115 62 : .expect("failed to define a metric"),
2116 62 :
2117 62 : dropped_lsn_updates: register_int_counter!(
2118 62 : "pageserver_deletion_queue_dropped_lsn_updates_total",
2119 62 : "Updates to remote_consistent_lsn dropped due to stale generation number."
2120 62 : )
2121 62 : .expect("failed to define a metric"),
2122 62 : unexpected_errors: register_int_counter!(
2123 62 : "pageserver_deletion_queue_unexpected_errors_total",
2124 62 : "Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
2125 62 : )
2126 62 : .expect("failed to define a metric"),
2127 62 : remote_errors: register_int_counter_vec!(
2128 62 : "pageserver_deletion_queue_remote_errors_total",
2129 62 : "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
2130 62 : &["op_kind"],
2131 62 : )
2132 62 : .expect("failed to define a metric")
2133 62 : }
2134 62 : });
2135 :
2136 : pub(crate) struct SecondaryModeMetrics {
2137 : pub(crate) upload_heatmap: IntCounter,
2138 : pub(crate) upload_heatmap_errors: IntCounter,
2139 : pub(crate) upload_heatmap_duration: Histogram,
2140 : pub(crate) download_heatmap: IntCounter,
2141 : pub(crate) download_layer: IntCounter,
2142 : }
2143 0 : pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
2144 0 : SecondaryModeMetrics {
2145 0 : upload_heatmap: register_int_counter!(
2146 0 : "pageserver_secondary_upload_heatmap",
2147 0 : "Number of heatmaps written to remote storage by attached tenants"
2148 0 : )
2149 0 : .expect("failed to define a metric"),
2150 0 : upload_heatmap_errors: register_int_counter!(
2151 0 : "pageserver_secondary_upload_heatmap_errors",
2152 0 : "Failures writing heatmap to remote storage"
2153 0 : )
2154 0 : .expect("failed to define a metric"),
2155 0 : upload_heatmap_duration: register_histogram!(
2156 0 : "pageserver_secondary_upload_heatmap_duration",
2157 0 : "Time to build and upload a heatmap, including any waiting inside the remote storage client"
2158 0 : )
2159 0 : .expect("failed to define a metric"),
2160 0 : download_heatmap: register_int_counter!(
2161 0 : "pageserver_secondary_download_heatmap",
2162 0 : "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed"
2163 0 : )
2164 0 : .expect("failed to define a metric"),
2165 0 : download_layer: register_int_counter!(
2166 0 : "pageserver_secondary_download_layer",
2167 0 : "Number of downloads of layers by secondary mode locations"
2168 0 : )
2169 0 : .expect("failed to define a metric"),
2170 0 : }
2171 0 : });
2172 :
2173 0 : pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
2174 0 : register_uint_gauge_vec!(
2175 0 : "pageserver_secondary_resident_physical_size",
2176 0 : "The size of the layer files present in the pageserver's filesystem, for secondary locations.",
2177 0 : &["tenant_id", "shard_id"]
2178 0 : )
2179 0 : .expect("failed to define a metric")
2180 0 : });
2181 :
2182 0 : pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
2183 0 : register_uint_gauge!(
2184 0 : "pageserver_utilization_score",
2185 0 : "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
2186 0 : )
2187 0 : .expect("failed to define a metric")
2188 0 : });
2189 :
2190 0 : pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
2191 0 : register_uint_gauge_vec!(
2192 0 : "pageserver_secondary_heatmap_total_size",
2193 0 : "The total size in bytes of all layers in the most recently downloaded heatmap.",
2194 0 : &["tenant_id", "shard_id"]
2195 0 : )
2196 0 : .expect("failed to define a metric")
2197 0 : });
2198 :
2199 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
2200 : pub enum RemoteOpKind {
2201 : Upload,
2202 : Download,
2203 : Delete,
2204 : }
2205 : impl RemoteOpKind {
2206 30722 : pub fn as_str(&self) -> &'static str {
2207 30722 : match self {
2208 28922 : Self::Upload => "upload",
2209 136 : Self::Download => "download",
2210 1664 : Self::Delete => "delete",
2211 : }
2212 30722 : }
2213 : }
2214 :
2215 : #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
2216 : pub enum RemoteOpFileKind {
2217 : Layer,
2218 : Index,
2219 : }
2220 : impl RemoteOpFileKind {
2221 30722 : pub fn as_str(&self) -> &'static str {
2222 30722 : match self {
2223 21544 : Self::Layer => "layer",
2224 9178 : Self::Index => "index",
2225 : }
2226 30722 : }
2227 : }
2228 :
2229 392 : pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
2230 392 : register_histogram_vec!(
2231 392 : "pageserver_remote_operation_seconds",
2232 392 : "Time spent on remote storage operations. \
2233 392 : Grouped by file_kind, op_kind and status. \
2234 392 : Does not account for time spent waiting in remote timeline client's queues.",
2235 392 : &["file_kind", "op_kind", "status"]
2236 392 : )
2237 392 : .expect("failed to define a metric")
2238 392 : });
2239 :
2240 0 : pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
2241 0 : register_int_counter_vec!(
2242 0 : "pageserver_tenant_task_events",
2243 0 : "Number of task start/stop/fail events.",
2244 0 : &["event"],
2245 0 : )
2246 0 : .expect("Failed to register tenant_task_events metric")
2247 0 : });
2248 :
2249 : pub struct BackgroundLoopSemaphoreMetrics {
2250 : counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
2251 : durations: EnumMap<BackgroundLoopKind, Histogram>,
2252 : waiting_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
2253 : running_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
2254 : }
2255 :
2256 : pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> =
2257 40 : Lazy::new(|| {
2258 40 : let counters = register_int_counter_pair_vec!(
2259 40 : "pageserver_background_loop_semaphore_wait_start_count",
2260 40 : "Counter for background loop concurrency-limiting semaphore acquire calls started",
2261 40 : "pageserver_background_loop_semaphore_wait_finish_count",
2262 40 : "Counter for background loop concurrency-limiting semaphore acquire calls finished",
2263 40 : &["task"],
2264 40 : )
2265 40 : .unwrap();
2266 40 :
2267 40 : let durations = register_histogram_vec!(
2268 40 : "pageserver_background_loop_semaphore_wait_seconds",
2269 40 : "Seconds spent waiting on background loop semaphore acquisition",
2270 40 : &["task"],
2271 40 : vec![0.01, 1.0, 5.0, 10.0, 30.0, 60.0, 180.0, 300.0, 600.0],
2272 40 : )
2273 40 : .unwrap();
2274 40 :
2275 40 : let waiting_tasks = register_int_gauge_vec!(
2276 40 : "pageserver_background_loop_semaphore_waiting_tasks",
2277 40 : "Number of background loop tasks waiting for semaphore",
2278 40 : &["task"],
2279 40 : )
2280 40 : .unwrap();
2281 40 :
2282 40 : let running_tasks = register_int_gauge_vec!(
2283 40 : "pageserver_background_loop_semaphore_running_tasks",
2284 40 : "Number of background loop tasks running concurrently",
2285 40 : &["task"],
2286 40 : )
2287 40 : .unwrap();
2288 40 :
2289 40 : BackgroundLoopSemaphoreMetrics {
2290 400 : counters: EnumMap::from_array(std::array::from_fn(|i| {
2291 400 : let kind = BackgroundLoopKind::from_usize(i);
2292 400 : counters.with_label_values(&[kind.into()])
2293 400 : })),
2294 400 : durations: EnumMap::from_array(std::array::from_fn(|i| {
2295 400 : let kind = BackgroundLoopKind::from_usize(i);
2296 400 : durations.with_label_values(&[kind.into()])
2297 400 : })),
2298 400 : waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| {
2299 400 : let kind = BackgroundLoopKind::from_usize(i);
2300 400 : waiting_tasks.with_label_values(&[kind.into()])
2301 400 : })),
2302 400 : running_tasks: EnumMap::from_array(std::array::from_fn(|i| {
2303 400 : let kind = BackgroundLoopKind::from_usize(i);
2304 400 : running_tasks.with_label_values(&[kind.into()])
2305 400 : })),
2306 40 : }
2307 40 : });
2308 :
2309 : impl BackgroundLoopSemaphoreMetrics {
2310 : /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the
2311 : /// semaphore is acquired, and drop it when the task completes or is cancelled.
2312 768 : pub(crate) fn record(
2313 768 : &self,
2314 768 : task: BackgroundLoopKind,
2315 768 : ) -> BackgroundLoopSemaphoreMetricsRecorder {
2316 768 : BackgroundLoopSemaphoreMetricsRecorder::start(self, task)
2317 768 : }
2318 : }
2319 :
2320 : /// Records metrics for a background task.
2321 : pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> {
2322 : metrics: &'a BackgroundLoopSemaphoreMetrics,
2323 : task: BackgroundLoopKind,
2324 : start: Instant,
2325 : wait_counter_guard: Option<metrics::IntCounterPairGuard>,
2326 : }
2327 :
2328 : impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> {
2329 : /// Starts recording semaphore metrics, by recording wait time and incrementing
2330 : /// `wait_start_count` and `waiting_tasks`.
2331 768 : fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self {
2332 768 : metrics.waiting_tasks[task].inc();
2333 768 : Self {
2334 768 : metrics,
2335 768 : task,
2336 768 : start: Instant::now(),
2337 768 : wait_counter_guard: Some(metrics.counters[task].guard()),
2338 768 : }
2339 768 : }
2340 :
2341 : /// Signals that the semaphore has been acquired, and updates relevant metrics.
2342 768 : pub fn acquired(&mut self) -> Duration {
2343 768 : let waited = self.start.elapsed();
2344 768 : self.wait_counter_guard.take().expect("already acquired");
2345 768 : self.metrics.durations[self.task].observe(waited.as_secs_f64());
2346 768 : self.metrics.waiting_tasks[self.task].dec();
2347 768 : self.metrics.running_tasks[self.task].inc();
2348 768 : waited
2349 768 : }
2350 : }
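// Usage sketch, following the doc comment on `record()` (hedged; `kind: BackgroundLoopKind`
// and `semaphore` are illustrative bindings, the semaphore itself lives elsewhere):
//
//     let mut recorder = BACKGROUND_LOOP_SEMAPHORE.record(kind);
//     let _permit = semaphore.acquire().await;
//     let _waited = recorder.acquired();
//     // ... run the background task; dropping `recorder` afterwards decrements running_tasks,
//     // while dropping it before `acquired()` still records the wait time and decrements waiting_tasks.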
2351 :
2352 : impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> {
2353 : /// The task either completed or was cancelled.
2354 768 : fn drop(&mut self) {
2355 768 : if self.wait_counter_guard.take().is_some() {
2356 0 : // Waiting.
2357 0 : self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64());
2358 0 : self.metrics.waiting_tasks[self.task].dec();
2359 768 : } else {
2360 768 : // Running.
2361 768 : self.metrics.running_tasks[self.task].dec();
2362 768 : }
2363 768 : }
2364 : }
2365 :
2366 0 : pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
2367 0 : register_int_counter_vec!(
2368 0 : "pageserver_background_loop_period_overrun_count",
2369 0 : "Incremented whenever warn_when_period_overrun() logs a warning.",
2370 0 : &["task", "period"],
2371 0 : )
2372 0 : .expect("failed to define a metric")
2373 0 : });
2374 :
2375 : // walreceiver metrics
2376 :
2377 0 : pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
2378 0 : register_int_counter!(
2379 0 : "pageserver_walreceiver_started_connections_total",
2380 0 : "Number of started walreceiver connections"
2381 0 : )
2382 0 : .expect("failed to define a metric")
2383 0 : });
2384 :
2385 0 : pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
2386 0 : register_int_gauge!(
2387 0 : "pageserver_walreceiver_active_managers",
2388 0 : "Number of active walreceiver managers"
2389 0 : )
2390 0 : .expect("failed to define a metric")
2391 0 : });
2392 :
2393 0 : pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
2394 0 : register_int_counter_vec!(
2395 0 : "pageserver_walreceiver_switches_total",
2396 0 : "Number of walreceiver manager change_connection calls",
2397 0 : &["reason"]
2398 0 : )
2399 0 : .expect("failed to define a metric")
2400 0 : });
2401 :
2402 0 : pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
2403 0 : register_int_counter!(
2404 0 : "pageserver_walreceiver_broker_updates_total",
2405 0 : "Number of received broker updates in walreceiver"
2406 0 : )
2407 0 : .expect("failed to define a metric")
2408 0 : });
2409 :
2410 4 : pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
2411 4 : register_int_counter_vec!(
2412 4 : "pageserver_walreceiver_candidates_events_total",
2413 4 : "Number of walreceiver candidate events",
2414 4 : &["event"]
2415 4 : )
2416 4 : .expect("failed to define a metric")
2417 4 : });
2418 :
2419 : pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
2420 0 : Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
2421 :
2422 : pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
2423 4 : Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
2424 :
2425 : // Metrics collected on WAL redo operations
2426 : //
2427 : // We collect the time spent in actual WAL redo ('redo'), and time waiting
2428 : // for access to the postgres process ('wait') since there is only one for
2429 : // each tenant.
2430 :
2431 : /// Time buckets are small because we want to be able to measure the
2432 : /// smallest redo processing times. These buckets allow us to measure down
2433 : /// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
2434 : /// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
2435 : ///
2436 : /// Values up to 1s are recorded because metrics show that we have redo
2437 : /// durations and lock times larger than 0.250s.
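/// (Worked out, assuming the usual 8 KiB Postgres page: 1 page / 5 us = 200'000 pages/s,
/// and 200'000 pages/s * 8 KiB ≈ 1.6 GB/s; the old 5 ms floor allowed only 200 pages/s ≈ 1.6 MB/s.)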
2438 : macro_rules! redo_histogram_time_buckets {
2439 : () => {
2440 : vec![
2441 : 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
2442 : 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
2443 : 1.000_000,
2444 : ]
2445 : };
2446 : }
2447 :
2448 : /// While we're at it, also measure the amount of records replayed in each
2449 : /// operation. We have a global 'total replayed' counter, but that's not
2450 : /// as useful as 'what is the skew for how many records we replay in one
2451 : /// operation'.
2452 : macro_rules! redo_histogram_count_buckets {
2453 : () => {
2454 : vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0]
2455 : };
2456 : }
2457 :
2458 : macro_rules! redo_bytes_histogram_count_buckets {
2459 : () => {
2460 : // powers of (2^.5), from 2^4.5 to 2^15 (22 buckets)
2461 : // rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too.
2462 : vec![
2463 : 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0,
2464 : 2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0,
2465 : ]
2466 : };
2467 : }
2468 :
2469 : pub(crate) struct WalIngestMetrics {
2470 : pub(crate) bytes_received: IntCounter,
2471 : pub(crate) records_received: IntCounter,
2472 : pub(crate) records_observed: IntCounter,
2473 : pub(crate) records_committed: IntCounter,
2474 : pub(crate) records_filtered: IntCounter,
2475 : pub(crate) values_committed_metadata_images: IntCounter,
2476 : pub(crate) values_committed_metadata_deltas: IntCounter,
2477 : pub(crate) values_committed_data_images: IntCounter,
2478 : pub(crate) values_committed_data_deltas: IntCounter,
2479 : pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
2480 : }
2481 :
2482 : impl WalIngestMetrics {
2483 0 : pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
2484 0 : if stats.metadata_images > 0 {
2485 0 : self.values_committed_metadata_images
2486 0 : .inc_by(stats.metadata_images);
2487 0 : }
2488 0 : if stats.metadata_deltas > 0 {
2489 0 : self.values_committed_metadata_deltas
2490 0 : .inc_by(stats.metadata_deltas);
2491 0 : }
2492 0 : if stats.data_images > 0 {
2493 0 : self.values_committed_data_images.inc_by(stats.data_images);
2494 0 : }
2495 0 : if stats.data_deltas > 0 {
2496 0 : self.values_committed_data_deltas.inc_by(stats.data_deltas);
2497 0 : }
2498 0 : }
2499 : }
2500 :
2501 20 : pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
2502 20 : let values_committed = register_int_counter_vec!(
2503 20 : "pageserver_wal_ingest_values_committed",
2504 20 : "Number of values committed to pageserver storage from WAL records",
2505 20 : &["class", "kind"],
2506 20 : )
2507 20 : .expect("failed to define a metric");
2508 20 :
2509 20 : WalIngestMetrics {
2510 20 : bytes_received: register_int_counter!(
2511 20 : "pageserver_wal_ingest_bytes_received",
2512 20 : "Bytes of WAL ingested from safekeepers",
2513 20 : )
2514 20 : .unwrap(),
2515 20 : records_received: register_int_counter!(
2516 20 : "pageserver_wal_ingest_records_received",
2517 20 : "Number of WAL records received from safekeepers"
2518 20 : )
2519 20 : .expect("failed to define a metric"),
2520 20 : records_observed: register_int_counter!(
2521 20 : "pageserver_wal_ingest_records_observed",
2522 20 : "Number of WAL records observed from safekeepers. These are metadata only records for shard 0."
2523 20 : )
2524 20 : .expect("failed to define a metric"),
2525 20 : records_committed: register_int_counter!(
2526 20 : "pageserver_wal_ingest_records_committed",
2527 20 : "Number of WAL records which resulted in writes to pageserver storage"
2528 20 : )
2529 20 : .expect("failed to define a metric"),
2530 20 : records_filtered: register_int_counter!(
2531 20 : "pageserver_wal_ingest_records_filtered",
2532 20 : "Number of WAL records filtered out due to sharding"
2533 20 : )
2534 20 : .expect("failed to define a metric"),
2535 20 : values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
2536 20 : values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
2537 20 : values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
2538 20 : values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
2539 20 : gap_blocks_zeroed_on_rel_extend: register_int_counter!(
2540 20 : "pageserver_gap_blocks_zeroed_on_rel_extend",
2541 20 : "Total number of zero gap blocks written on relation extends"
2542 20 : )
2543 20 : .expect("failed to define a metric"),
2544 20 : }
2545 20 : });
2546 :
2547 404 : pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
2548 404 : register_int_counter_vec!(
2549 404 : "pageserver_timeline_wal_records_received",
2550 404 : "Number of WAL records received per shard",
2551 404 : &["tenant_id", "shard_id", "timeline_id"]
2552 404 : )
2553 404 : .expect("failed to define a metric")
2554 404 : });
2555 :
2556 12 : pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
2557 12 : register_histogram!(
2558 12 : "pageserver_wal_redo_seconds",
2559 12 : "Time spent on WAL redo",
2560 12 : redo_histogram_time_buckets!()
2561 12 : )
2562 12 : .expect("failed to define a metric")
2563 12 : });
2564 :
2565 12 : pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2566 12 : register_histogram!(
2567 12 : "pageserver_wal_redo_records_histogram",
2568 12 : "Histogram of number of records replayed per redo in the Postgres WAL redo process",
2569 12 : redo_histogram_count_buckets!(),
2570 12 : )
2571 12 : .expect("failed to define a metric")
2572 12 : });
2573 :
2574 12 : pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2575 12 : register_histogram!(
2576 12 : "pageserver_wal_redo_bytes_histogram",
2577 12 : "Histogram of number of records replayed per redo sent to Postgres",
2578 12 : redo_bytes_histogram_count_buckets!(),
2579 12 : )
2580 12 : .expect("failed to define a metric")
2581 12 : });
2582 :
2583 : // FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
2584 12 : pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
2585 12 : register_int_counter!(
2586 12 : "pageserver_replayed_wal_records_total",
2587 12 : "Number of WAL records replayed in WAL redo process"
2588 12 : )
2589 12 : .unwrap()
2590 12 : });
2591 :
2592 : #[rustfmt::skip]
2593 16 : pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2594 16 : register_histogram!(
2595 16 : "pageserver_wal_redo_process_launch_duration",
2596 16 : "Histogram of the duration of successful WalRedoProcess::launch calls",
2597 16 : vec![
2598 16 : 0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
2599 16 : 0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
2600 16 : 0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
2601 16 : 0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
2602 16 : 1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
2603 16 : ],
2604 16 : )
2605 16 : .expect("failed to define a metric")
2606 16 : });
2607 :
2608 : pub(crate) struct WalRedoProcessCounters {
2609 : pub(crate) started: IntCounter,
2610 : pub(crate) killed_by_cause: EnumMap<WalRedoKillCause, IntCounter>,
2611 : pub(crate) active_stderr_logger_tasks_started: IntCounter,
2612 : pub(crate) active_stderr_logger_tasks_finished: IntCounter,
2613 : }
2614 :
2615 : #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
2616 : pub(crate) enum WalRedoKillCause {
2617 : WalRedoProcessDrop,
2618 : NoLeakChildDrop,
2619 : Startup,
2620 : }
2621 :
2622 : impl Default for WalRedoProcessCounters {
2623 16 : fn default() -> Self {
2624 16 : let started = register_int_counter!(
2625 16 : "pageserver_wal_redo_process_started_total",
2626 16 : "Number of WAL redo processes started",
2627 16 : )
2628 16 : .unwrap();
2629 16 :
2630 16 : let killed = register_int_counter_vec!(
2631 16 : "pageserver_wal_redo_process_stopped_total",
2632 16 : "Number of WAL redo processes stopped",
2633 16 : &["cause"],
2634 16 : )
2635 16 : .unwrap();
2636 16 :
2637 16 : let active_stderr_logger_tasks_started = register_int_counter!(
2638 16 : "pageserver_walredo_stderr_logger_tasks_started_total",
2639 16 : "Number of active walredo stderr logger tasks that have started",
2640 16 : )
2641 16 : .unwrap();
2642 16 :
2643 16 : let active_stderr_logger_tasks_finished = register_int_counter!(
2644 16 : "pageserver_walredo_stderr_logger_tasks_finished_total",
2645 16 : "Number of active walredo stderr logger tasks that have finished",
2646 16 : )
2647 16 : .unwrap();
2648 16 :
2649 16 : Self {
2650 16 : started,
2651 48 : killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
2652 48 : let cause = WalRedoKillCause::from_usize(i);
2653 48 : let cause_str: &'static str = cause.into();
2654 48 : killed.with_label_values(&[cause_str])
2655 48 : })),
2656 16 : active_stderr_logger_tasks_started,
2657 16 : active_stderr_logger_tasks_finished,
2658 16 : }
2659 16 : }
2660 : }
2661 :
2662 : pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
2663 : Lazy::new(WalRedoProcessCounters::default);
2664 :
2665 : /// Similar to `prometheus::HistogramTimer` but does not record on drop.
2666 : pub(crate) struct StorageTimeMetricsTimer {
2667 : metrics: StorageTimeMetrics,
2668 : start: Instant,
2669 : }
2670 :
2671 : impl StorageTimeMetricsTimer {
2672 4324 : fn new(metrics: StorageTimeMetrics) -> Self {
2673 4324 : Self {
2674 4324 : metrics,
2675 4324 : start: Instant::now(),
2676 4324 : }
2677 4324 : }
2678 :
2679 : /// Returns the elapsed duration of the timer.
2680 4324 : pub fn elapsed(&self) -> Duration {
2681 4324 : self.start.elapsed()
2682 4324 : }
2683 :
2684 : /// Record the time from creation to now and return it.
2685 4324 : pub fn stop_and_record(self) -> Duration {
2686 4324 : let duration = self.elapsed();
2687 4324 : let seconds = duration.as_secs_f64();
2688 4324 : self.metrics.timeline_sum.inc_by(seconds);
2689 4324 : self.metrics.timeline_count.inc();
2690 4324 : self.metrics.global_histogram.observe(seconds);
2691 4324 : duration
2692 4324 : }
2693 :
2694 : /// Turns this timer into a timer, which will always record -- usually this means recording
2695 : /// regardless an early `?` path was taken in a function.
2696 8 : pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer {
2697 8 : AlwaysRecordingStorageTimeMetricsTimer(Some(self))
2698 8 : }
2699 : }
2700 :
2701 : pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
2702 :
2703 : impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
2704 8 : fn drop(&mut self) {
2705 8 : if let Some(inner) = self.0.take() {
2706 8 : inner.stop_and_record();
2707 8 : }
2708 8 : }
2709 : }
2710 :
2711 : impl AlwaysRecordingStorageTimeMetricsTimer {
2712 : /// Returns the elapsed duration of the timer.
2713 0 : pub fn elapsed(&self) -> Duration {
2714 0 : self.0.as_ref().expect("not dropped yet").elapsed()
2715 0 : }
2716 : }
2717 :
2718 : /// Timing facilities for a globally histogrammed metric, supplemented by per-tenant and
2719 : /// per-timeline total sum and count.
2720 : #[derive(Clone, Debug)]
2721 : pub(crate) struct StorageTimeMetrics {
2722 : /// Sum of f64 seconds, per operation, tenant_id and timeline_id
2723 : timeline_sum: Counter,
2724 : /// Number of operations, per operation, tenant_id and timeline_id
2725 : timeline_count: IntCounter,
2726 : /// Global histogram having only the "operation" label.
2727 : global_histogram: Histogram,
2728 : }
2729 :
2730 : impl StorageTimeMetrics {
2731 8064 : pub fn new(
2732 8064 : operation: StorageTimeOperation,
2733 8064 : tenant_id: &str,
2734 8064 : shard_id: &str,
2735 8064 : timeline_id: &str,
2736 8064 : ) -> Self {
2737 8064 : let operation: &'static str = operation.into();
2738 8064 :
2739 8064 : let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
2740 8064 : .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
2741 8064 : .unwrap();
2742 8064 : let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
2743 8064 : .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
2744 8064 : .unwrap();
2745 8064 : let global_histogram = STORAGE_TIME_GLOBAL
2746 8064 : .get_metric_with_label_values(&[operation])
2747 8064 : .unwrap();
2748 8064 :
2749 8064 : StorageTimeMetrics {
2750 8064 : timeline_sum,
2751 8064 : timeline_count,
2752 8064 : global_histogram,
2753 8064 : }
2754 8064 : }
2755 :
2756 : /// Starts timing a new operation.
2757 : ///
2758 : /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
2759 4324 : pub fn start_timer(&self) -> StorageTimeMetricsTimer {
2760 4324 : StorageTimeMetricsTimer::new(self.clone())
2761 4324 : }
2762 : }
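// Usage sketch (hedged; real call sites are in the timeline code, and the label arguments
// are illustrative):
//
//     let metrics = StorageTimeMetrics::new(StorageTimeOperation::Compact, tenant_id, shard_id, timeline_id);
//     let timer = metrics.start_timer();
//     // ... do the work being measured ...
//     let _elapsed = timer.stop_and_record();
//
// Use `timer.record_on_drop()` instead when the duration must be recorded even if the
// function returns early via `?`.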
2763 :
2764 : #[derive(Debug)]
2765 : pub(crate) struct TimelineMetrics {
2766 : tenant_id: String,
2767 : shard_id: String,
2768 : timeline_id: String,
2769 : pub flush_time_histo: StorageTimeMetrics,
2770 : pub flush_delay_histo: StorageTimeMetrics,
2771 : pub flush_wait_upload_time_gauge: Gauge,
2772 : pub compact_time_histo: StorageTimeMetrics,
2773 : pub create_images_time_histo: StorageTimeMetrics,
2774 : pub logical_size_histo: StorageTimeMetrics,
2775 : pub imitate_logical_size_histo: StorageTimeMetrics,
2776 : pub load_layer_map_histo: StorageTimeMetrics,
2777 : pub garbage_collect_histo: StorageTimeMetrics,
2778 : pub find_gc_cutoffs_histo: StorageTimeMetrics,
2779 : pub last_record_lsn_gauge: IntGauge,
2780 : pub disk_consistent_lsn_gauge: IntGauge,
2781 : pub pitr_history_size: UIntGauge,
2782 : pub archival_size: UIntGauge,
2783 : pub layers_per_read: Histogram,
2784 : pub standby_horizon_gauge: IntGauge,
2785 : pub resident_physical_size_gauge: UIntGauge,
2786 : pub visible_physical_size_gauge: UIntGauge,
2787 : /// copy of LayeredTimeline.current_logical_size
2788 : pub current_logical_size_gauge: UIntGauge,
2789 : pub aux_file_size_gauge: IntGauge,
2790 : pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
2791 : pub evictions: IntCounter,
2792 : pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
2793 : /// Number of valid LSN leases.
2794 : pub valid_lsn_lease_count_gauge: UIntGauge,
2795 : pub wal_records_received: IntCounter,
2796 : shutdown: std::sync::atomic::AtomicBool,
2797 : }
2798 :
2799 : impl TimelineMetrics {
2800 896 : pub fn new(
2801 896 : tenant_shard_id: &TenantShardId,
2802 896 : timeline_id_raw: &TimelineId,
2803 896 : evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
2804 896 : ) -> Self {
2805 896 : let tenant_id = tenant_shard_id.tenant_id.to_string();
2806 896 : let shard_id = format!("{}", tenant_shard_id.shard_slug());
2807 896 : let timeline_id = timeline_id_raw.to_string();
2808 896 : let flush_time_histo = StorageTimeMetrics::new(
2809 896 : StorageTimeOperation::LayerFlush,
2810 896 : &tenant_id,
2811 896 : &shard_id,
2812 896 : &timeline_id,
2813 896 : );
2814 896 : let flush_delay_histo = StorageTimeMetrics::new(
2815 896 : StorageTimeOperation::LayerFlushDelay,
2816 896 : &tenant_id,
2817 896 : &shard_id,
2818 896 : &timeline_id,
2819 896 : );
2820 896 : let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
2821 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2822 896 : .unwrap();
2823 896 : let compact_time_histo = StorageTimeMetrics::new(
2824 896 : StorageTimeOperation::Compact,
2825 896 : &tenant_id,
2826 896 : &shard_id,
2827 896 : &timeline_id,
2828 896 : );
2829 896 : let create_images_time_histo = StorageTimeMetrics::new(
2830 896 : StorageTimeOperation::CreateImages,
2831 896 : &tenant_id,
2832 896 : &shard_id,
2833 896 : &timeline_id,
2834 896 : );
2835 896 : let logical_size_histo = StorageTimeMetrics::new(
2836 896 : StorageTimeOperation::LogicalSize,
2837 896 : &tenant_id,
2838 896 : &shard_id,
2839 896 : &timeline_id,
2840 896 : );
2841 896 : let imitate_logical_size_histo = StorageTimeMetrics::new(
2842 896 : StorageTimeOperation::ImitateLogicalSize,
2843 896 : &tenant_id,
2844 896 : &shard_id,
2845 896 : &timeline_id,
2846 896 : );
2847 896 : let load_layer_map_histo = StorageTimeMetrics::new(
2848 896 : StorageTimeOperation::LoadLayerMap,
2849 896 : &tenant_id,
2850 896 : &shard_id,
2851 896 : &timeline_id,
2852 896 : );
2853 896 : let garbage_collect_histo = StorageTimeMetrics::new(
2854 896 : StorageTimeOperation::Gc,
2855 896 : &tenant_id,
2856 896 : &shard_id,
2857 896 : &timeline_id,
2858 896 : );
2859 896 : let find_gc_cutoffs_histo = StorageTimeMetrics::new(
2860 896 : StorageTimeOperation::FindGcCutoffs,
2861 896 : &tenant_id,
2862 896 : &shard_id,
2863 896 : &timeline_id,
2864 896 : );
2865 896 : let last_record_lsn_gauge = LAST_RECORD_LSN
2866 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2867 896 : .unwrap();
2868 896 :
2869 896 : let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
2870 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2871 896 : .unwrap();
2872 896 :
2873 896 : let pitr_history_size = PITR_HISTORY_SIZE
2874 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2875 896 : .unwrap();
2876 896 :
2877 896 : let archival_size = TIMELINE_ARCHIVE_SIZE
2878 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2879 896 : .unwrap();
2880 896 :
2881 896 : let layers_per_read = LAYERS_PER_READ
2882 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2883 896 : .unwrap();
2884 896 :
2885 896 : let standby_horizon_gauge = STANDBY_HORIZON
2886 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2887 896 : .unwrap();
2888 896 : let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
2889 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2890 896 : .unwrap();
2891 896 : let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
2892 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2893 896 : .unwrap();
2894 896 : // TODO: we shouldn't expose this metric
2895 896 : let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
2896 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2897 896 : .unwrap();
2898 896 : let aux_file_size_gauge = AUX_FILE_SIZE
2899 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2900 896 : .unwrap();
2901 896 : // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
2902 896 : let directory_entries_count_gauge_closure = {
2903 896 : let tenant_shard_id = *tenant_shard_id;
2904 896 : let timeline_id_raw = *timeline_id_raw;
2905 0 : move || {
2906 0 : let tenant_id = tenant_shard_id.tenant_id.to_string();
2907 0 : let shard_id = format!("{}", tenant_shard_id.shard_slug());
2908 0 : let timeline_id = timeline_id_raw.to_string();
2909 0 : let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT
2910 0 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2911 0 : .unwrap();
2912 0 : gauge
2913 0 : }
2914 : };
2915 896 : let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
2916 896 : Lazy::new(Box::new(directory_entries_count_gauge_closure));
2917 896 : let evictions = EVICTIONS
2918 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2919 896 : .unwrap();
2920 896 : let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
2921 896 : .build(&tenant_id, &shard_id, &timeline_id);
2922 896 :
2923 896 : let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
2924 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2925 896 : .unwrap();
2926 896 :
2927 896 : let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
2928 896 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
2929 896 : .unwrap();
2930 896 :
2931 896 : TimelineMetrics {
2932 896 : tenant_id,
2933 896 : shard_id,
2934 896 : timeline_id,
2935 896 : flush_time_histo,
2936 896 : flush_delay_histo,
2937 896 : flush_wait_upload_time_gauge,
2938 896 : compact_time_histo,
2939 896 : create_images_time_histo,
2940 896 : logical_size_histo,
2941 896 : imitate_logical_size_histo,
2942 896 : garbage_collect_histo,
2943 896 : find_gc_cutoffs_histo,
2944 896 : load_layer_map_histo,
2945 896 : last_record_lsn_gauge,
2946 896 : disk_consistent_lsn_gauge,
2947 896 : pitr_history_size,
2948 896 : archival_size,
2949 896 : layers_per_read,
2950 896 : standby_horizon_gauge,
2951 896 : resident_physical_size_gauge,
2952 896 : visible_physical_size_gauge,
2953 896 : current_logical_size_gauge,
2954 896 : aux_file_size_gauge,
2955 896 : directory_entries_count_gauge,
2956 896 : evictions,
2957 896 : evictions_with_low_residence_duration: std::sync::RwLock::new(
2958 896 : evictions_with_low_residence_duration,
2959 896 : ),
2960 896 : valid_lsn_lease_count_gauge,
2961 896 : wal_records_received,
2962 896 : shutdown: std::sync::atomic::AtomicBool::default(),
2963 896 : }
2964 896 : }
2965 :
2966 3172 : pub(crate) fn record_new_file_metrics(&self, sz: u64) {
2967 3172 : self.resident_physical_size_add(sz);
2968 3172 : }
2969 :
2970 1084 : pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
2971 1084 : self.resident_physical_size_gauge.sub(sz);
2972 1084 : crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
2973 1084 : }
2974 :
2975 3444 : pub(crate) fn resident_physical_size_add(&self, sz: u64) {
2976 3444 : self.resident_physical_size_gauge.add(sz);
2977 3444 : crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
2978 3444 : }
2979 :
2980 20 : pub(crate) fn resident_physical_size_get(&self) -> u64 {
2981 20 : self.resident_physical_size_gauge.get()
2982 20 : }
2983 :
2984 2348 : pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
2985 2348 : self.flush_wait_upload_time_gauge.add(duration);
2986 2348 : crate::metrics::FLUSH_WAIT_UPLOAD_TIME
2987 2348 : .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id])
2988 2348 : .unwrap()
2989 2348 : .add(duration);
2990 2348 : }
2991 :
2992 : /// Generates TIMELINE_LAYER labels for a persistent layer.
2993 5249 : fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] {
2994 5249 : let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) {
2995 2848 : true => LayerLevel::L0,
2996 2401 : false => LayerLevel::L1,
2997 : };
2998 5249 : let kind = match layer_desc.is_delta() {
2999 4396 : true => LayerKind::Delta,
3000 853 : false => LayerKind::Image,
3001 : };
3002 5249 : [
3003 5249 : &self.tenant_id,
3004 5249 : &self.shard_id,
3005 5249 : &self.timeline_id,
3006 5249 : level.into(),
3007 5249 : kind.into(),
3008 5249 : ]
3009 5249 : }
3010 :
3011 : /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer.
3012 4696 : fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] {
3013 4696 : [
3014 4696 : &self.tenant_id,
3015 4696 : &self.shard_id,
3016 4696 : &self.timeline_id,
3017 4696 : LayerLevel::Frozen.into(),
3018 4696 : LayerKind::Delta.into(), // by definition
3019 4696 : ]
3020 4696 : }
3021 :
3022 : /// Removes a frozen ephemeral layer from TIMELINE_LAYER metrics.
3023 2348 : pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
3024 2348 : assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
3025 2348 : let labels = self.make_frozen_layer_labels(layer);
3026 2348 : let size = layer.try_len().expect("frozen layer should have no writer");
3027 2348 : TIMELINE_LAYER_COUNT
3028 2348 : .get_metric_with_label_values(&labels)
3029 2348 : .unwrap()
3030 2348 : .dec();
3031 2348 : TIMELINE_LAYER_SIZE
3032 2348 : .get_metric_with_label_values(&labels)
3033 2348 : .unwrap()
3034 2348 : .sub(size);
3035 2348 : }
3036 :
3037 : /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics.
3038 2348 : pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
3039 2348 : assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
3040 2348 : let labels = self.make_frozen_layer_labels(layer);
3041 2348 : let size = layer.try_len().expect("frozen layer should have no writer");
3042 2348 : TIMELINE_LAYER_COUNT
3043 2348 : .get_metric_with_label_values(&labels)
3044 2348 : .unwrap()
3045 2348 : .inc();
3046 2348 : TIMELINE_LAYER_SIZE
3047 2348 : .get_metric_with_label_values(&labels)
3048 2348 : .unwrap()
3049 2348 : .add(size);
3050 2348 : }
3051 :
3052 : /// Removes a persistent layer from TIMELINE_LAYER metrics.
3053 1381 : pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) {
3054 1381 : let labels = self.make_layer_labels(layer_desc);
3055 1381 : TIMELINE_LAYER_COUNT
3056 1381 : .get_metric_with_label_values(&labels)
3057 1381 : .unwrap()
3058 1381 : .dec();
3059 1381 : TIMELINE_LAYER_SIZE
3060 1381 : .get_metric_with_label_values(&labels)
3061 1381 : .unwrap()
3062 1381 : .sub(layer_desc.file_size);
3063 1381 : }
3064 :
3065 : /// Adds a persistent layer to TIMELINE_LAYER metrics.
3066 3868 : pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) {
3067 3868 : let labels = self.make_layer_labels(layer_desc);
3068 3868 : TIMELINE_LAYER_COUNT
3069 3868 : .get_metric_with_label_values(&labels)
3070 3868 : .unwrap()
3071 3868 : .inc();
3072 3868 : TIMELINE_LAYER_SIZE
3073 3868 : .get_metric_with_label_values(&labels)
3074 3868 : .unwrap()
3075 3868 : .add(layer_desc.file_size);
3076 3868 : }
3077 :
3078 20 : pub(crate) fn shutdown(&self) {
3079 20 : let was_shutdown = self
3080 20 : .shutdown
3081 20 : .swap(true, std::sync::atomic::Ordering::Relaxed);
3082 20 :
3083 20 : if was_shutdown {
3084 : // this happens on tenant deletion because the tenant first shuts down its timelines, then
3085 : // invokes timeline deletion, which shuts down the timeline again as its first step.
3086 : // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 is resolved
3087 0 : return;
3088 20 : }
3089 20 :
3090 20 : let tenant_id = &self.tenant_id;
3091 20 : let timeline_id = &self.timeline_id;
3092 20 : let shard_id = &self.shard_id;
3093 20 : let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3094 20 : let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3095 20 : let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3096 20 : let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3097 20 : {
3098 20 : RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
3099 20 : let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3100 20 : }
3101 20 : let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3102 20 : let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3103 20 : if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
3104 0 : let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3105 20 : }
3106 :
3107 20 : let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3108 20 : let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3109 :
3110 80 : for ref level in LayerLevel::iter() {
3111 180 : for ref kind in LayerKind::iter() {
3112 120 : let labels: [&str; 5] =
3113 120 : [tenant_id, shard_id, timeline_id, level.into(), kind.into()];
3114 120 : let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels);
3115 120 : let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels);
3116 120 : }
3117 : }
3118 :
3119 20 : let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3120 20 :
3121 20 : let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3122 20 : let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3123 20 : let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3124 20 :
3125 20 : self.evictions_with_low_residence_duration
3126 20 : .write()
3127 20 : .unwrap()
3128 20 : .remove(tenant_id, shard_id, timeline_id);
3129 :
3130 : // The following metrics are born outside of the TimelineMetrics lifecycle but are still
3131 : // removed at the end of it. The idea is that the metrics should outlive the
3132 : // entity during whose lifetime they are observed; e.g., the smgr metrics should
3133 : // outlive an individual smgr connection, but not the timeline.
3134 :
3135 200 : for op in StorageTimeOperation::VARIANTS {
3136 180 : let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
3137 180 : op,
3138 180 : tenant_id,
3139 180 : shard_id,
3140 180 : timeline_id,
3141 180 : ]);
3142 180 : let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
3143 180 : op,
3144 180 : tenant_id,
3145 180 : shard_id,
3146 180 : timeline_id,
3147 180 : ]);
3148 180 : }
3149 :
3150 60 : for op in STORAGE_IO_SIZE_OPERATIONS {
3151 40 : let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
3152 40 : }
3153 :
3154 20 : let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
3155 20 : SmgrQueryType::GetPageAtLsn.into(),
3156 20 : tenant_id,
3157 20 : shard_id,
3158 20 : timeline_id,
3159 20 : ]);
3160 20 : let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
3161 20 : SmgrQueryType::GetPageAtLsn.into(),
3162 20 : tenant_id,
3163 20 : shard_id,
3164 20 : timeline_id,
3165 20 : ]);
3166 20 : let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[
3167 20 : tenant_id,
3168 20 : shard_id,
3169 20 : timeline_id,
3170 20 : ]);
3171 20 : let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
3172 20 : tenant_id,
3173 20 : shard_id,
3174 20 : timeline_id,
3175 20 : ]);
3176 20 : let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
3177 20 : tenant_id,
3178 20 : shard_id,
3179 20 : timeline_id,
3180 20 : ]);
3181 20 : let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
3182 20 : tenant_id,
3183 20 : shard_id,
3184 20 : timeline_id,
3185 20 : ]);
3186 20 : }
3187 : }
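// Editor's note: the block below is an illustrative sketch added for clarity; it is not
// part of the original source. It isolates the idempotent-shutdown idiom used by
// `TimelineMetrics::shutdown` above: `AtomicBool::swap` returns the previous value, so
// the label cleanup runs at most once even if shutdown is invoked twice (e.g. timeline
// shutdown followed by timeline deletion).
#[cfg(test)]
mod idempotent_shutdown_sketch {
    use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};

    static CLEANUPS: AtomicUsize = AtomicUsize::new(0);

    struct Metrics {
        shutdown: AtomicBool,
    }

    impl Metrics {
        fn shutdown(&self) {
            // Only the first caller observes `false` here.
            let was_shutdown = self.shutdown.swap(true, Ordering::Relaxed);
            if was_shutdown {
                return;
            }
            // In the real code, the remove_label_values() calls live here.
            CLEANUPS.fetch_add(1, Ordering::Relaxed);
        }
    }

    #[test]
    fn cleanup_runs_at_most_once() {
        let m = Metrics {
            shutdown: AtomicBool::new(false),
        };
        m.shutdown();
        m.shutdown();
        assert_eq!(CLEANUPS.load(Ordering::Relaxed), 1);
    }
}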
3188 :
3189 12 : pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
3190 12 : // Only shard zero deals in synthetic sizes
3191 12 : if tenant_shard_id.is_shard_zero() {
3192 12 : let tid = tenant_shard_id.tenant_id.to_string();
3193 12 : let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
3194 12 : }
3195 :
3196 12 : tenant_throttling::remove_tenant_metrics(tenant_shard_id);
3197 12 :
3198 12 : // we leave the BROKEN_TENANTS_SET entry if any
3199 12 : }
3200 :
3201 : /// Maintain a per timeline gauge in addition to the global gauge.
3202 : pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
3203 : last_set: AtomicU64,
3204 : gauge: UIntGauge,
3205 : }
3206 :
3207 : impl PerTimelineRemotePhysicalSizeGauge {
3208 916 : fn new(per_timeline_gauge: UIntGauge) -> Self {
3209 916 : Self {
3210 916 : last_set: AtomicU64::new(0),
3211 916 : gauge: per_timeline_gauge,
3212 916 : }
3213 916 : }
3214 3886 : pub(crate) fn set(&self, sz: u64) {
3215 3886 : self.gauge.set(sz);
3216 3886 : let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
3217 3886 : if sz < prev {
3218 68 : REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
3219 3818 : } else {
3220 3818 : REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
3221 3818 : };
3222 3886 : }
3223 4 : pub(crate) fn get(&self) -> u64 {
3224 4 : self.gauge.get()
3225 4 : }
3226 : }
3227 :
3228 : impl Drop for PerTimelineRemotePhysicalSizeGauge {
3229 40 : fn drop(&mut self) {
3230 40 : REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
3231 40 : }
3232 : }
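// Editor's note: illustrative sketch, not part of the original source. It reduces the
// delta-tracking pattern of `PerTimelineRemotePhysicalSizeGauge::set` above to plain
// atomics: only the difference against the previously set value is applied to the
// global aggregate, so the sum over all timelines stays consistent whether a timeline
// grows or shrinks. `GLOBAL` stands in for the shared REMOTE_PHYSICAL_SIZE_GLOBAL gauge.
#[cfg(test)]
mod per_timeline_gauge_sketch {
    use std::sync::atomic::{AtomicU64, Ordering};

    static GLOBAL: AtomicU64 = AtomicU64::new(0);

    struct PerTimeline {
        last_set: AtomicU64,
    }

    impl PerTimeline {
        fn set(&self, sz: u64) {
            let prev = self.last_set.swap(sz, Ordering::Relaxed);
            if sz < prev {
                GLOBAL.fetch_sub(prev - sz, Ordering::Relaxed);
            } else {
                GLOBAL.fetch_add(sz - prev, Ordering::Relaxed);
            }
        }
    }

    #[test]
    fn global_tracks_the_sum_of_per_timeline_values() {
        let a = PerTimeline { last_set: AtomicU64::new(0) };
        let b = PerTimeline { last_set: AtomicU64::new(0) };
        a.set(100);
        b.set(50);
        a.set(70); // timeline `a` shrank by 30
        assert_eq!(GLOBAL.load(Ordering::Relaxed), 120);
    }
}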
3233 :
3234 : pub(crate) struct RemoteTimelineClientMetrics {
3235 : tenant_id: String,
3236 : shard_id: String,
3237 : timeline_id: String,
3238 : pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
3239 : calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
3240 : bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
3241 : bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
3242 : pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
3243 : }
3244 :
3245 : impl RemoteTimelineClientMetrics {
3246 916 : pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
3247 916 : let tenant_id_str = tenant_shard_id.tenant_id.to_string();
3248 916 : let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
3249 916 : let timeline_id_str = timeline_id.to_string();
3250 916 :
3251 916 : let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
3252 916 : REMOTE_PHYSICAL_SIZE
3253 916 : .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
3254 916 : .unwrap(),
3255 916 : );
3256 916 :
3257 916 : let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
3258 916 : .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
3259 916 : .unwrap();
3260 916 :
3261 916 : RemoteTimelineClientMetrics {
3262 916 : tenant_id: tenant_id_str,
3263 916 : shard_id: shard_id_str,
3264 916 : timeline_id: timeline_id_str,
3265 916 : calls: Mutex::new(HashMap::default()),
3266 916 : bytes_started_counter: Mutex::new(HashMap::default()),
3267 916 : bytes_finished_counter: Mutex::new(HashMap::default()),
3268 916 : remote_physical_size_gauge,
3269 916 : projected_remote_consistent_lsn_gauge,
3270 916 : }
3271 916 : }
3272 :
3273 6166 : pub fn remote_operation_time(
3274 6166 : &self,
3275 6166 : file_kind: &RemoteOpFileKind,
3276 6166 : op_kind: &RemoteOpKind,
3277 6166 : status: &'static str,
3278 6166 : ) -> Histogram {
3279 6166 : let key = (file_kind.as_str(), op_kind.as_str(), status);
3280 6166 : REMOTE_OPERATION_TIME
3281 6166 : .get_metric_with_label_values(&[key.0, key.1, key.2])
3282 6166 : .unwrap()
3283 6166 : }
3284 :
3285 14440 : fn calls_counter_pair(
3286 14440 : &self,
3287 14440 : file_kind: &RemoteOpFileKind,
3288 14440 : op_kind: &RemoteOpKind,
3289 14440 : ) -> IntCounterPair {
3290 14440 : let mut guard = self.calls.lock().unwrap();
3291 14440 : let key = (file_kind.as_str(), op_kind.as_str());
3292 14440 : let metric = guard.entry(key).or_insert_with(move || {
3293 1646 : REMOTE_TIMELINE_CLIENT_CALLS
3294 1646 : .get_metric_with_label_values(&[
3295 1646 : &self.tenant_id,
3296 1646 : &self.shard_id,
3297 1646 : &self.timeline_id,
3298 1646 : key.0,
3299 1646 : key.1,
3300 1646 : ])
3301 1646 : .unwrap()
3302 14440 : });
3303 14440 : metric.clone()
3304 14440 : }
3305 :
3306 3492 : fn bytes_started_counter(
3307 3492 : &self,
3308 3492 : file_kind: &RemoteOpFileKind,
3309 3492 : op_kind: &RemoteOpKind,
3310 3492 : ) -> IntCounter {
3311 3492 : let mut guard = self.bytes_started_counter.lock().unwrap();
3312 3492 : let key = (file_kind.as_str(), op_kind.as_str());
3313 3492 : let metric = guard.entry(key).or_insert_with(move || {
3314 648 : REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
3315 648 : .get_metric_with_label_values(&[
3316 648 : &self.tenant_id,
3317 648 : &self.shard_id,
3318 648 : &self.timeline_id,
3319 648 : key.0,
3320 648 : key.1,
3321 648 : ])
3322 648 : .unwrap()
3323 3492 : });
3324 3492 : metric.clone()
3325 3492 : }
3326 :
3327 6600 : fn bytes_finished_counter(
3328 6600 : &self,
3329 6600 : file_kind: &RemoteOpFileKind,
3330 6600 : op_kind: &RemoteOpKind,
3331 6600 : ) -> IntCounter {
3332 6600 : let mut guard = self.bytes_finished_counter.lock().unwrap();
3333 6600 : let key = (file_kind.as_str(), op_kind.as_str());
3334 6600 : let metric = guard.entry(key).or_insert_with(move || {
3335 648 : REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
3336 648 : .get_metric_with_label_values(&[
3337 648 : &self.tenant_id,
3338 648 : &self.shard_id,
3339 648 : &self.timeline_id,
3340 648 : key.0,
3341 648 : key.1,
3342 648 : ])
3343 648 : .unwrap()
3344 6600 : });
3345 6600 : metric.clone()
3346 6600 : }
3347 : }
3348 :
3349 : #[cfg(test)]
3350 : impl RemoteTimelineClientMetrics {
3351 12 : pub fn get_bytes_started_counter_value(
3352 12 : &self,
3353 12 : file_kind: &RemoteOpFileKind,
3354 12 : op_kind: &RemoteOpKind,
3355 12 : ) -> Option<u64> {
3356 12 : let guard = self.bytes_started_counter.lock().unwrap();
3357 12 : let key = (file_kind.as_str(), op_kind.as_str());
3358 12 : guard.get(&key).map(|counter| counter.get())
3359 12 : }
3360 :
3361 12 : pub fn get_bytes_finished_counter_value(
3362 12 : &self,
3363 12 : file_kind: &RemoteOpFileKind,
3364 12 : op_kind: &RemoteOpKind,
3365 12 : ) -> Option<u64> {
3366 12 : let guard = self.bytes_finished_counter.lock().unwrap();
3367 12 : let key = (file_kind.as_str(), op_kind.as_str());
3368 12 : guard.get(&key).map(|counter| counter.get())
3369 12 : }
3370 : }
3371 :
3372 : /// See [`RemoteTimelineClientMetrics::call_begin`].
3373 : #[must_use]
3374 : pub(crate) struct RemoteTimelineClientCallMetricGuard {
3375 : /// Decremented on drop.
3376 : calls_counter_pair: Option<IntCounterPair>,
3377 : /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
3378 : bytes_finished: Option<(IntCounter, u64)>,
3379 : }
3380 :
3381 : impl RemoteTimelineClientCallMetricGuard {
3382 : /// Consume this guard object without performing the metric updates it would do on `drop()`.
3383 : /// The caller vouches to do the metric updates manually.
3384 7628 : pub fn will_decrement_manually(mut self) {
3385 7628 : let RemoteTimelineClientCallMetricGuard {
3386 7628 : calls_counter_pair,
3387 7628 : bytes_finished,
3388 7628 : } = &mut self;
3389 7628 : calls_counter_pair.take();
3390 7628 : bytes_finished.take();
3391 7628 : }
3392 : }
3393 :
3394 : impl Drop for RemoteTimelineClientCallMetricGuard {
3395 7696 : fn drop(&mut self) {
3396 7696 : let RemoteTimelineClientCallMetricGuard {
3397 7696 : calls_counter_pair,
3398 7696 : bytes_finished,
3399 7696 : } = self;
3400 7696 : if let Some(guard) = calls_counter_pair.take() {
3401 68 : guard.dec();
3402 7628 : }
3403 7696 : if let Some((bytes_finished_metric, value)) = bytes_finished {
3404 0 : bytes_finished_metric.inc_by(*value);
3405 7696 : }
3406 7696 : }
3407 : }
3408 :
3409 : /// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
3410 : /// track the byte size of this call in applicable metric(s).
3411 : pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
3412 : /// Do not account for this call's byte size in any metrics.
3413 : /// The `reason` field is there to make the call sites self-documenting
3414 : /// about why they don't need the metric.
3415 : DontTrackSize { reason: &'static str },
3416 : /// Track the byte size of the call in applicable metric(s).
3417 : Bytes(u64),
3418 : }
3419 :
3420 : impl RemoteTimelineClientMetrics {
3421 : /// Update the metrics that change when a call to the remote timeline client instance starts.
3422 : ///
3423 : /// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
3424 : /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
3425 : /// is more suitable.
3426 : /// Never do both.
3427 7696 : pub(crate) fn call_begin(
3428 7696 : &self,
3429 7696 : file_kind: &RemoteOpFileKind,
3430 7696 : op_kind: &RemoteOpKind,
3431 7696 : size: RemoteTimelineClientMetricsCallTrackSize,
3432 7696 : ) -> RemoteTimelineClientCallMetricGuard {
3433 7696 : let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
3434 7696 : calls_counter_pair.inc();
3435 :
3436 7696 : let bytes_finished = match size {
3437 4204 : RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
3438 4204 : // nothing to do
3439 4204 : None
3440 : }
3441 3492 : RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
3442 3492 : self.bytes_started_counter(file_kind, op_kind).inc_by(size);
3443 3492 : let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
3444 3492 : Some((finished_counter, size))
3445 : }
3446 : };
3447 7696 : RemoteTimelineClientCallMetricGuard {
3448 7696 : calls_counter_pair: Some(calls_counter_pair),
3449 7696 : bytes_finished,
3450 7696 : }
3451 7696 : }
3452 :
3453 : /// Manually update the metrics that track completions, instead of using the guard object.
3454 : /// Using the guard object is generally preferable.
3455 : /// See [`call_begin`](Self::call_begin) for more context.
3456 6744 : pub(crate) fn call_end(
3457 6744 : &self,
3458 6744 : file_kind: &RemoteOpFileKind,
3459 6744 : op_kind: &RemoteOpKind,
3460 6744 : size: RemoteTimelineClientMetricsCallTrackSize,
3461 6744 : ) {
3462 6744 : let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
3463 6744 : calls_counter_pair.dec();
3464 6744 : match size {
3465 3636 : RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
3466 3108 : RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
3467 3108 : self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
3468 3108 : }
3469 : }
3470 6744 : }
3471 : }
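// Editor's note: illustrative sketch, not part of the original source. It shows the
// guard contract documented on `call_begin` above with a plain atomic counter: dropping
// the guard decrements the in-progress count, unless the caller defuses it with
// `will_decrement_manually` and later balances the count via `call_end`.
#[cfg(test)]
mod call_metric_guard_sketch {
    use std::sync::atomic::{AtomicI64, Ordering};

    static IN_PROGRESS: AtomicI64 = AtomicI64::new(0);

    struct CallGuard {
        armed: bool,
    }

    fn call_begin() -> CallGuard {
        IN_PROGRESS.fetch_add(1, Ordering::Relaxed);
        CallGuard { armed: true }
    }

    fn call_end() {
        IN_PROGRESS.fetch_sub(1, Ordering::Relaxed);
    }

    impl CallGuard {
        fn will_decrement_manually(mut self) {
            // Consume the guard without the drop-time decrement.
            self.armed = false;
        }
    }

    impl Drop for CallGuard {
        fn drop(&mut self) {
            if self.armed {
                IN_PROGRESS.fetch_sub(1, Ordering::Relaxed);
            }
        }
    }

    #[test]
    fn guard_and_manual_end_never_double_count() {
        {
            let _guard = call_begin(); // decremented automatically on drop
        }
        let guard = call_begin();
        guard.will_decrement_manually(); // caller takes over
        call_end();
        assert_eq!(IN_PROGRESS.load(Ordering::Relaxed), 0);
    }
}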
3472 :
3473 : impl Drop for RemoteTimelineClientMetrics {
3474 40 : fn drop(&mut self) {
3475 40 : let RemoteTimelineClientMetrics {
3476 40 : tenant_id,
3477 40 : shard_id,
3478 40 : timeline_id,
3479 40 : remote_physical_size_gauge,
3480 40 : calls,
3481 40 : bytes_started_counter,
3482 40 : bytes_finished_counter,
3483 40 : projected_remote_consistent_lsn_gauge,
3484 40 : } = self;
3485 48 : for ((a, b), _) in calls.get_mut().unwrap().drain() {
3486 48 : let mut res = [Ok(()), Ok(())];
3487 48 : REMOTE_TIMELINE_CLIENT_CALLS
3488 48 : .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]);
3489 48 : // don't care about results
3490 48 : }
3491 40 : for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
3492 12 : let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
3493 12 : tenant_id,
3494 12 : shard_id,
3495 12 : timeline_id,
3496 12 : a,
3497 12 : b,
3498 12 : ]);
3499 12 : }
3500 40 : for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
3501 12 : let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
3502 12 : tenant_id,
3503 12 : shard_id,
3504 12 : timeline_id,
3505 12 : a,
3506 12 : b,
3507 12 : ]);
3508 12 : }
3509 40 : {
3510 40 : let _ = remote_physical_size_gauge; // used to avoid an 'unused' warning in the destructuring above
3511 40 : let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3512 40 : }
3513 40 : {
3514 40 : let _ = projected_remote_consistent_lsn_gauge;
3515 40 : let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
3516 40 : tenant_id,
3517 40 : shard_id,
3518 40 : timeline_id,
3519 40 : ]);
3520 40 : }
3521 40 : }
3522 : }
3523 :
3524 : /// Wrapper future that measures the time spent by a remote storage operation,
3525 : /// and records the time and success/failure as a prometheus metric.
3526 : pub(crate) trait MeasureRemoteOp: Sized {
3527 6447 : fn measure_remote_op(
3528 6447 : self,
3529 6447 : file_kind: RemoteOpFileKind,
3530 6447 : op: RemoteOpKind,
3531 6447 : metrics: Arc<RemoteTimelineClientMetrics>,
3532 6447 : ) -> MeasuredRemoteOp<Self> {
3533 6447 : let start = Instant::now();
3534 6447 : MeasuredRemoteOp {
3535 6447 : inner: self,
3536 6447 : file_kind,
3537 6447 : op,
3538 6447 : start,
3539 6447 : metrics,
3540 6447 : }
3541 6447 : }
3542 : }
3543 :
3544 : impl<T: Sized> MeasureRemoteOp for T {}
3545 :
3546 : pin_project! {
3547 : pub(crate) struct MeasuredRemoteOp<F>
3548 : {
3549 : #[pin]
3550 : inner: F,
3551 : file_kind: RemoteOpFileKind,
3552 : op: RemoteOpKind,
3553 : start: Instant,
3554 : metrics: Arc<RemoteTimelineClientMetrics>,
3555 : }
3556 : }
3557 :
3558 : impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
3559 : type Output = Result<O, E>;
3560 :
3561 98421 : fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
3562 98421 : let this = self.project();
3563 98421 : let poll_result = this.inner.poll(cx);
3564 98421 : if let Poll::Ready(ref res) = poll_result {
3565 6166 : let duration = this.start.elapsed();
3566 6166 : let status = if res.is_ok() { &"success" } else { &"failure" };
3567 6166 : this.metrics
3568 6166 : .remote_operation_time(this.file_kind, this.op, status)
3569 6166 : .observe(duration.as_secs_f64());
3570 92255 : }
3571 98421 : poll_result
3572 98421 : }
3573 : }
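// Editor's note: illustrative sketch, not part of the original source. It strips the
// wrapper-future pattern of `MeasuredRemoteOp` above down to timing only: the wrapper
// polls the inner future and, once it resolves, reports the elapsed wall-clock time
// alongside the output. The sketch requires the inner future to be `Unpin` to avoid
// pin-project, and assumes the `futures` crate's default `executor` feature for
// `block_on` in the test.
#[cfg(test)]
mod measured_future_sketch {
    use std::future::Future;
    use std::pin::Pin;
    use std::task::{Context, Poll};
    use std::time::{Duration, Instant};

    struct Timed<F> {
        inner: F,
        start: Instant,
    }

    impl<F: Future + Unpin> Future for Timed<F> {
        type Output = (F::Output, Duration);

        fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
            // `Timed<F>` is `Unpin` because `F` is, so we can safely take `&mut Self`.
            let this = self.get_mut();
            match Pin::new(&mut this.inner).poll(cx) {
                Poll::Ready(output) => Poll::Ready((output, this.start.elapsed())),
                Poll::Pending => Poll::Pending,
            }
        }
    }

    #[test]
    fn reports_output_and_elapsed_time() {
        let fut = Timed {
            inner: Box::pin(async { 42u32 }),
            start: Instant::now(),
        };
        let (value, elapsed) = futures::executor::block_on(fut);
        assert_eq!(value, 42);
        assert!(elapsed < Duration::from_secs(60));
    }
}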
3574 :
3575 : pub mod tokio_epoll_uring {
3576 : use std::{
3577 : collections::HashMap,
3578 : sync::{Arc, Mutex},
3579 : };
3580 :
3581 : use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge};
3582 : use once_cell::sync::Lazy;
3583 :
3584 : /// Shared storage for tokio-epoll-uring thread local metrics.
3585 : pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
3586 234 : Lazy::new(|| {
3587 234 : let slots_submission_queue_depth = register_histogram!(
3588 234 : "pageserver_tokio_epoll_uring_slots_submission_queue_depth",
3589 234 : "The slots waiters queue depth of each tokio_epoll_uring system",
3590 234 : vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
3591 234 : )
3592 234 : .expect("failed to define a metric");
3593 234 : ThreadLocalMetricsStorage {
3594 234 : observers: Mutex::new(HashMap::new()),
3595 234 : slots_submission_queue_depth,
3596 234 : }
3597 234 : });
3598 :
3599 : pub struct ThreadLocalMetricsStorage {
3600 : /// List of thread local metrics observers.
3601 : observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
3602 : /// A histogram shared between all thread local systems
3603 : /// for collecting slots submission queue depth.
3604 : slots_submission_queue_depth: Histogram,
3605 : }
3606 :
3607 : /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
3608 : /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
3609 : ///
3610 : /// The System makes observations into [`Self`] and periodically, the collector
3611 : /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
3612 : ///
3613 : /// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`].
3614 : /// But except for the periodic flush, the lock is uncontended so there's no waiting
3615 : /// for cache coherence protocol to get an exclusive cache line.
3616 : pub struct ThreadLocalMetrics {
3617 : /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
3618 : slots_submission_queue_depth: Mutex<LocalHistogram>,
3619 : }
3620 :
3621 : impl ThreadLocalMetricsStorage {
3622 : /// Registers a new thread local system. Returns a thread local metrics observer.
3623 1119 : pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
3624 1119 : let per_system_metrics = Arc::new(ThreadLocalMetrics::new(
3625 1119 : self.slots_submission_queue_depth.local(),
3626 1119 : ));
3627 1119 : let mut g = self.observers.lock().unwrap();
3628 1119 : g.insert(id, Arc::clone(&per_system_metrics));
3629 1119 : per_system_metrics
3630 1119 : }
3631 :
3632 : /// Removes metrics observer for a thread local system.
3633 : /// This should be called before dropping a thread local system.
3634 234 : pub fn remove_system(&self, id: u64) {
3635 234 : let mut g = self.observers.lock().unwrap();
3636 234 : g.remove(&id);
3637 234 : }
3638 :
3639 : /// Flush all thread local metrics to the shared storage.
3640 0 : pub fn flush_thread_local_metrics(&self) {
3641 0 : let g = self.observers.lock().unwrap();
3642 0 : g.values().for_each(|local| {
3643 0 : local.flush();
3644 0 : });
3645 0 : }
3646 : }
3647 :
3648 : impl ThreadLocalMetrics {
3649 1119 : pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
3650 1119 : ThreadLocalMetrics {
3651 1119 : slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth),
3652 1119 : }
3653 1119 : }
3654 :
3655 : /// Flushes the thread local metrics to shared aggregator.
3656 0 : pub fn flush(&self) {
3657 0 : let Self {
3658 0 : slots_submission_queue_depth,
3659 0 : } = self;
3660 0 : slots_submission_queue_depth.lock().unwrap().flush();
3661 0 : }
3662 : }
3663 :
3664 : impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
3665 1834281 : fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
3666 1834281 : let Self {
3667 1834281 : slots_submission_queue_depth,
3668 1834281 : } = self;
3669 1834281 : slots_submission_queue_depth
3670 1834281 : .lock()
3671 1834281 : .unwrap()
3672 1834281 : .observe(queue_depth as f64);
3673 1834281 : }
3674 : }
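// Editor's note: illustrative sketch, not part of the original source. It walks through
// the lifecycle described in the doc comments above: register a per-system observer,
// record an observation into the thread-local histogram, and let the collector flush it
// into the shared `slots_submission_queue_depth` histogram. The system id `42` and the
// queue depth `3` are arbitrary placeholder values.
#[cfg(test)]
mod thread_local_metrics_lifecycle_sketch {
    use tokio_epoll_uring::metrics::PerSystemMetrics as _;

    use super::THREAD_LOCAL_METRICS_STORAGE;

    #[test]
    fn register_observe_flush_remove() {
        let per_system = THREAD_LOCAL_METRICS_STORAGE.register_system(42);
        // Observations go to the uncontended thread-local LocalHistogram.
        per_system.observe_slots_submission_queue_depth(3);
        // The metrics collector periodically flushes all thread-local observations
        // into the shared histogram before scraping it.
        THREAD_LOCAL_METRICS_STORAGE.flush_thread_local_metrics();
        THREAD_LOCAL_METRICS_STORAGE.remove_system(42);
    }
}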
3675 :
3676 : pub struct Collector {
3677 : descs: Vec<metrics::core::Desc>,
3678 : systems_created: UIntGauge,
3679 : systems_destroyed: UIntGauge,
3680 : thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
3681 : }
3682 :
3683 : impl metrics::core::Collector for Collector {
3684 0 : fn desc(&self) -> Vec<&metrics::core::Desc> {
3685 0 : self.descs.iter().collect()
3686 0 : }
3687 :
3688 0 : fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
3689 0 : let mut mfs = Vec::with_capacity(Self::NMETRICS);
3690 0 : let tokio_epoll_uring::metrics::GlobalMetrics {
3691 0 : systems_created,
3692 0 : systems_destroyed,
3693 0 : } = tokio_epoll_uring::metrics::global();
3694 0 : self.systems_created.set(systems_created);
3695 0 : mfs.extend(self.systems_created.collect());
3696 0 : self.systems_destroyed.set(systems_destroyed);
3697 0 : mfs.extend(self.systems_destroyed.collect());
3698 0 :
3699 0 : self.thread_local_metrics_storage
3700 0 : .flush_thread_local_metrics();
3701 0 :
3702 0 : mfs.extend(
3703 0 : self.thread_local_metrics_storage
3704 0 : .slots_submission_queue_depth
3705 0 : .collect(),
3706 0 : );
3707 0 : mfs
3708 0 : }
3709 : }
3710 :
3711 : impl Collector {
3712 : const NMETRICS: usize = 3;
3713 :
3714 : #[allow(clippy::new_without_default)]
3715 0 : pub fn new() -> Self {
3716 0 : let mut descs = Vec::new();
3717 0 :
3718 0 : let systems_created = UIntGauge::new(
3719 0 : "pageserver_tokio_epoll_uring_systems_created",
3720 0 : "counter of tokio-epoll-uring systems that were created",
3721 0 : )
3722 0 : .unwrap();
3723 0 : descs.extend(
3724 0 : metrics::core::Collector::desc(&systems_created)
3725 0 : .into_iter()
3726 0 : .cloned(),
3727 0 : );
3728 0 :
3729 0 : let systems_destroyed = UIntGauge::new(
3730 0 : "pageserver_tokio_epoll_uring_systems_destroyed",
3731 0 : "counter of tokio-epoll-uring systems that were destroyed",
3732 0 : )
3733 0 : .unwrap();
3734 0 : descs.extend(
3735 0 : metrics::core::Collector::desc(&systems_destroyed)
3736 0 : .into_iter()
3737 0 : .cloned(),
3738 0 : );
3739 0 :
3740 0 : Self {
3741 0 : descs,
3742 0 : systems_created,
3743 0 : systems_destroyed,
3744 0 : thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
3745 0 : }
3746 0 : }
3747 : }
3748 :
3749 234 : pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
3750 234 : register_int_counter!(
3751 234 : "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
3752 234 : "Number of times where thread_local_system creation spanned multiple executor threads",
3753 234 : )
3754 234 : .unwrap()
3755 234 : });
3756 :
3757 0 : pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
3758 0 : register_int_counter!(
3759 0 : "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
3760 0 : "Number of times thread_local_system creation failed and was retried after back-off.",
3761 0 : )
3762 0 : .unwrap()
3763 0 : });
3764 : }
3765 :
3766 : pub(crate) mod tenant_throttling {
3767 : use metrics::{register_int_counter_vec, IntCounter};
3768 : use once_cell::sync::Lazy;
3769 : use utils::shard::TenantShardId;
3770 :
3771 : pub(crate) struct GlobalAndPerTenantIntCounter {
3772 : global: IntCounter,
3773 : per_tenant: IntCounter,
3774 : }
3775 :
3776 : impl GlobalAndPerTenantIntCounter {
3777 : #[inline(always)]
3778 0 : pub(crate) fn inc(&self) {
3779 0 : self.inc_by(1)
3780 0 : }
3781 : #[inline(always)]
3782 0 : pub(crate) fn inc_by(&self, n: u64) {
3783 0 : self.global.inc_by(n);
3784 0 : self.per_tenant.inc_by(n);
3785 0 : }
3786 : }
3787 :
3788 : pub(crate) struct Metrics<const KIND: usize> {
3789 : pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
3790 : pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
3791 : pub(super) wait_time: GlobalAndPerTenantIntCounter,
3792 : pub(super) count_throttled: GlobalAndPerTenantIntCounter,
3793 : }
3794 :
3795 408 : static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3796 408 : register_int_counter_vec!(
3797 408 : "pageserver_tenant_throttling_count_accounted_start_global",
3798 408 : "Count of tenant throttling starts, by kind of throttle.",
3799 408 : &["kind"]
3800 408 : )
3801 408 : .unwrap()
3802 408 : });
3803 408 : static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3804 408 : register_int_counter_vec!(
3805 408 : "pageserver_tenant_throttling_count_accounted_start",
3806 408 : "Count of tenant throttling starts, by kind of throttle.",
3807 408 : &["kind", "tenant_id", "shard_id"]
3808 408 : )
3809 408 : .unwrap()
3810 408 : });
3811 408 : static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3812 408 : register_int_counter_vec!(
3813 408 : "pageserver_tenant_throttling_count_accounted_finish_global",
3814 408 : "Count of tenant throttling finishes, by kind of throttle.",
3815 408 : &["kind"]
3816 408 : )
3817 408 : .unwrap()
3818 408 : });
3819 408 : static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3820 408 : register_int_counter_vec!(
3821 408 : "pageserver_tenant_throttling_count_accounted_finish",
3822 408 : "Count of tenant throttling finishes, by kind of throttle.",
3823 408 : &["kind", "tenant_id", "shard_id"]
3824 408 : )
3825 408 : .unwrap()
3826 408 : });
3827 408 : static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3828 408 : register_int_counter_vec!(
3829 408 : "pageserver_tenant_throttling_wait_usecs_sum_global",
3830 408 : "Sum of microseconds that spent waiting throttle by kind of throttle.",
3831 408 : &["kind"]
3832 408 : )
3833 408 : .unwrap()
3834 408 : });
3835 408 : static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3836 408 : register_int_counter_vec!(
3837 408 : "pageserver_tenant_throttling_wait_usecs_sum",
3838 408 : "Sum of microseconds that spent waiting throttle by kind of throttle.",
3839 408 : &["kind", "tenant_id", "shard_id"]
3840 408 : )
3841 408 : .unwrap()
3842 408 : });
3843 :
3844 408 : static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3845 408 : register_int_counter_vec!(
3846 408 : "pageserver_tenant_throttling_count_global",
3847 408 : "Count of tenant throttlings, by kind of throttle.",
3848 408 : &["kind"]
3849 408 : )
3850 408 : .unwrap()
3851 408 : });
3852 408 : static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
3853 408 : register_int_counter_vec!(
3854 408 : "pageserver_tenant_throttling_count",
3855 408 : "Count of tenant throttlings, by kind of throttle.",
3856 408 : &["kind", "tenant_id", "shard_id"]
3857 408 : )
3858 408 : .unwrap()
3859 408 : });
3860 :
3861 : const KINDS: &[&str] = &["pagestream"];
3862 : pub type Pagestream = Metrics<0>;
3863 :
3864 : impl<const KIND: usize> Metrics<KIND> {
3865 444 : pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
3866 444 : let per_tenant_label_values = &[
3867 444 : KINDS[KIND],
3868 444 : &tenant_shard_id.tenant_id.to_string(),
3869 444 : &tenant_shard_id.shard_slug().to_string(),
3870 444 : ];
3871 444 : Metrics {
3872 444 : count_accounted_start: {
3873 444 : GlobalAndPerTenantIntCounter {
3874 444 : global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]),
3875 444 : per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
3876 444 : .with_label_values(per_tenant_label_values),
3877 444 : }
3878 444 : },
3879 444 : count_accounted_finish: {
3880 444 : GlobalAndPerTenantIntCounter {
3881 444 : global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]),
3882 444 : per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
3883 444 : .with_label_values(per_tenant_label_values),
3884 444 : }
3885 444 : },
3886 444 : wait_time: {
3887 444 : GlobalAndPerTenantIntCounter {
3888 444 : global: WAIT_USECS.with_label_values(&[KINDS[KIND]]),
3889 444 : per_tenant: WAIT_USECS_PER_TENANT
3890 444 : .with_label_values(per_tenant_label_values),
3891 444 : }
3892 444 : },
3893 444 : count_throttled: {
3894 444 : GlobalAndPerTenantIntCounter {
3895 444 : global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]),
3896 444 : per_tenant: WAIT_COUNT_PER_TENANT
3897 444 : .with_label_values(per_tenant_label_values),
3898 444 : }
3899 444 : },
3900 444 : }
3901 444 : }
3902 : }
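// Editor's note: illustrative sketch, not part of the original source. It isolates the
// const-generic indexing trick used by `Metrics<KIND>` above: the type parameter selects
// the label out of a fixed table at compile time, which is why `Pagestream = Metrics<0>`
// always labels its series with "pagestream".
#[cfg(test)]
mod throttle_kind_sketch {
    const KINDS: &[&str] = &["pagestream"];

    struct Metrics<const KIND: usize> {
        kind_label: &'static str,
    }

    impl<const KIND: usize> Metrics<KIND> {
        fn new() -> Self {
            Metrics {
                kind_label: KINDS[KIND],
            }
        }
    }

    type Pagestream = Metrics<0>;

    #[test]
    fn pagestream_uses_the_first_kind_label() {
        assert_eq!(Pagestream::new().kind_label, "pagestream");
    }
}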
3903 :
3904 0 : pub(crate) fn preinitialize_global_metrics() {
3905 0 : Lazy::force(&COUNT_ACCOUNTED_START);
3906 0 : Lazy::force(&COUNT_ACCOUNTED_FINISH);
3907 0 : Lazy::force(&WAIT_USECS);
3908 0 : Lazy::force(&WAIT_COUNT);
3909 0 : }
3910 :
3911 12 : pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
3912 48 : for m in &[
3913 12 : &COUNT_ACCOUNTED_START_PER_TENANT,
3914 12 : &COUNT_ACCOUNTED_FINISH_PER_TENANT,
3915 12 : &WAIT_USECS_PER_TENANT,
3916 12 : &WAIT_COUNT_PER_TENANT,
3917 12 : ] {
3918 96 : for kind in KINDS {
3919 48 : let _ = m.remove_label_values(&[
3920 48 : kind,
3921 48 : &tenant_shard_id.tenant_id.to_string(),
3922 48 : &tenant_shard_id.shard_slug().to_string(),
3923 48 : ]);
3924 48 : }
3925 : }
3926 12 : }
3927 : }
3928 :
3929 : pub(crate) mod disk_usage_based_eviction {
3930 : use super::*;
3931 :
3932 : pub(crate) struct Metrics {
3933 : pub(crate) tenant_collection_time: Histogram,
3934 : pub(crate) tenant_layer_count: Histogram,
3935 : pub(crate) layers_collected: IntCounter,
3936 : pub(crate) layers_selected: IntCounter,
3937 : pub(crate) layers_evicted: IntCounter,
3938 : }
3939 :
3940 : impl Default for Metrics {
3941 0 : fn default() -> Self {
3942 0 : let tenant_collection_time = register_histogram!(
3943 0 : "pageserver_disk_usage_based_eviction_tenant_collection_seconds",
3944 0 : "Time spent collecting layers from a tenant -- not normalized by collected layer amount",
3945 0 : vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
3946 0 : )
3947 0 : .unwrap();
3948 0 :
3949 0 : let tenant_layer_count = register_histogram!(
3950 0 : "pageserver_disk_usage_based_eviction_tenant_collected_layers",
3951 0 : "Amount of layers gathered from a tenant",
3952 0 : vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
3953 0 : )
3954 0 : .unwrap();
3955 0 :
3956 0 : let layers_collected = register_int_counter!(
3957 0 : "pageserver_disk_usage_based_eviction_collected_layers_total",
3958 0 : "Amount of layers collected"
3959 0 : )
3960 0 : .unwrap();
3961 0 :
3962 0 : let layers_selected = register_int_counter!(
3963 0 : "pageserver_disk_usage_based_eviction_select_layers_total",
3964 0 : "Amount of layers selected"
3965 0 : )
3966 0 : .unwrap();
3967 0 :
3968 0 : let layers_evicted = register_int_counter!(
3969 0 : "pageserver_disk_usage_based_eviction_evicted_layers_total",
3970 0 : "Amount of layers successfully evicted"
3971 0 : )
3972 0 : .unwrap();
3973 0 :
3974 0 : Self {
3975 0 : tenant_collection_time,
3976 0 : tenant_layer_count,
3977 0 : layers_collected,
3978 0 : layers_selected,
3979 0 : layers_evicted,
3980 0 : }
3981 0 : }
3982 : }
3983 :
3984 : pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
3985 : }
3986 :
3987 396 : static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
3988 396 : register_uint_gauge_vec!(
3989 396 : "pageserver_tokio_executor_thread_configured_count",
3990 396 : "Total number of configued tokio executor threads in the process.
3991 396 : The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
3992 396 : &["setup"],
3993 396 : )
3994 396 : .unwrap()
3995 396 : });
3996 :
3997 396 : pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
3998 : static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
3999 396 : let _guard = SERIALIZE.lock().unwrap();
4000 396 : TOKIO_EXECUTOR_THREAD_COUNT.reset();
4001 396 : TOKIO_EXECUTOR_THREAD_COUNT
4002 396 : .get_metric_with_label_values(&[setup])
4003 396 : .unwrap()
4004 396 : .set(u64::try_from(num_threads.get()).unwrap());
4005 396 : }
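// Editor's note: illustrative sketch, not part of the original source. It shows how a
// runtime bootstrap would publish its thread configuration via the function above; the
// reset-then-set sequence is serialized by the internal mutex so concurrent callers
// cannot interleave. The "illustrative-setup" label is a placeholder, not one of the
// real setup names.
#[cfg(test)]
mod tokio_runtime_setup_sketch {
    use std::num::NonZeroUsize;

    #[test]
    fn publishes_the_configured_thread_count() {
        super::set_tokio_runtime_setup("illustrative-setup", NonZeroUsize::new(8).unwrap());
    }
}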
4006 :
4007 0 : pub fn preinitialize_metrics(conf: &'static PageServerConf) {
4008 0 : set_page_service_config_max_batch_size(&conf.page_service_pipelining);
4009 0 :
4010 0 : // Python tests need these, and we alert on some of them.
4011 0 : //
4012 0 : // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
4013 0 : // order:
4014 0 : // - global metrics reside in a Lazy<PageserverMetrics>
4015 0 : // - access via crate::metrics::PS_METRICS.some_metric.inc()
4016 0 : // - could move the statics into TimelineMetrics::new()?
4017 0 :
4018 0 : // counters
4019 0 : [
4020 0 : &UNEXPECTED_ONDEMAND_DOWNLOADS,
4021 0 : &WALRECEIVER_STARTED_CONNECTIONS,
4022 0 : &WALRECEIVER_BROKER_UPDATES,
4023 0 : &WALRECEIVER_CANDIDATES_ADDED,
4024 0 : &WALRECEIVER_CANDIDATES_REMOVED,
4025 0 : &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
4026 0 : &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
4027 0 : &REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
4028 0 : &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
4029 0 : &CIRCUIT_BREAKERS_BROKEN,
4030 0 : &CIRCUIT_BREAKERS_UNBROKEN,
4031 0 : &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
4032 0 : ]
4033 0 : .into_iter()
4034 0 : .for_each(|c| {
4035 0 : Lazy::force(c);
4036 0 : });
4037 0 :
4038 0 : // Deletion queue stats
4039 0 : Lazy::force(&DELETION_QUEUE);
4040 0 :
4041 0 : // Tenant stats
4042 0 : Lazy::force(&TENANT);
4043 0 :
4044 0 : // Tenant manager stats
4045 0 : Lazy::force(&TENANT_MANAGER);
4046 0 :
4047 0 : Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
4048 0 : Lazy::force(&disk_usage_based_eviction::METRICS);
4049 :
4050 0 : for state_name in pageserver_api::models::TenantState::VARIANTS {
4051 0 : // initialize the metric for all gauges, otherwise the time series might appear to show
4052 0 : // values from the last restart.
4053 0 : TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
4054 0 : }
4055 :
4056 : // countervecs
4057 0 : [
4058 0 : &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
4059 0 : &SMGR_QUERY_STARTED_GLOBAL,
4060 0 : ]
4061 0 : .into_iter()
4062 0 : .for_each(|c| {
4063 0 : Lazy::force(c);
4064 0 : });
4065 0 :
4066 0 : // gauges
4067 0 : WALRECEIVER_ACTIVE_MANAGERS.get();
4068 0 :
4069 0 : // histograms
4070 0 : [
4071 0 : &LAYERS_PER_READ_GLOBAL,
4072 0 : &DELTAS_PER_READ_GLOBAL,
4073 0 : &WAIT_LSN_TIME,
4074 0 : &WAL_REDO_TIME,
4075 0 : &WAL_REDO_RECORDS_HISTOGRAM,
4076 0 : &WAL_REDO_BYTES_HISTOGRAM,
4077 0 : &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
4078 0 : &PAGE_SERVICE_BATCH_SIZE_GLOBAL,
4079 0 : &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
4080 0 : ]
4081 0 : .into_iter()
4082 0 : .for_each(|h| {
4083 0 : Lazy::force(h);
4084 0 : });
4085 0 :
4086 0 : // Custom
4087 0 : Lazy::force(&BASEBACKUP_QUERY_TIME);
4088 0 : Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
4089 0 : Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);
4090 0 :
4091 0 : tenant_throttling::preinitialize_global_metrics();
4092 0 : }