Line data Source code
1 : use std::collections::HashMap;
2 : use std::num::NonZeroUsize;
3 : use std::os::fd::RawFd;
4 : use std::sync::atomic::AtomicU64;
5 : use std::sync::{Arc, Mutex};
6 : use std::time::{Duration, Instant};
7 :
8 : use enum_map::{Enum as _, EnumMap};
9 : use futures::Future;
10 : use metrics::{
11 : Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
12 : IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
13 : register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
14 : register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
15 : register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
16 : };
17 : use once_cell::sync::Lazy;
18 : use pageserver_api::config::defaults::DEFAULT_MAX_GET_VECTORED_KEYS;
19 : use pageserver_api::config::{
20 : PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
21 : PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
22 : };
23 : use pageserver_api::models::InMemoryLayerInfo;
24 : use pageserver_api::shard::TenantShardId;
25 : use postgres_backend::{QueryError, is_expected_io_error};
26 : use pq_proto::framed::ConnectionError;
27 : use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
28 : use strum_macros::{IntoStaticStr, VariantNames};
29 : use utils::id::TimelineId;
30 :
31 : use crate::config;
32 : use crate::config::PageServerConf;
33 : use crate::context::{PageContentKind, RequestContext};
34 : use crate::pgdatadir_mapping::DatadirModificationStats;
35 : use crate::task_mgr::TaskKind;
36 : use crate::tenant::layer_map::LayerMap;
37 : use crate::tenant::mgr::TenantSlot;
38 : use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
39 : use crate::tenant::tasks::BackgroundLoopKind;
40 : use crate::tenant::throttle::ThrottleResult;
41 :
42 : /// Prometheus histogram buckets (in seconds) for operations in the critical
43 : /// path. In other words, operations that directly affect the latency of user
44 : /// queries.
45 : ///
46 : /// The buckets capture the majority of latencies in the microsecond and
47 : /// millisecond range but also extend far enough up to distinguish "bad" from
48 : /// "really bad".
49 : const CRITICAL_OP_BUCKETS: &[f64] = &[
50 : 0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
51 : 0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
52 : 1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
53 : ];
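// Worked example of the bucket semantics (illustrative; standard Prometheus
// cumulative buckets): an observation of 2.5ms (0.0025s) falls into every
// bucket with an upper bound >= 0.0025s, i.e. le="0.001" is not incremented
// but le="0.01" and above are, which is what lets histogram_quantile()
// separate millisecond-range latencies from multi-second outliers.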
54 :
55 : // Metrics collected on operations on the storage repository.
56 : #[derive(Debug, VariantNames, IntoStaticStr)]
57 : #[strum(serialize_all = "kebab_case")]
58 : pub(crate) enum StorageTimeOperation {
59 : #[strum(serialize = "layer flush")]
60 : LayerFlush,
61 :
62 : #[strum(serialize = "layer flush delay")]
63 : LayerFlushDelay,
64 :
65 : #[strum(serialize = "compact")]
66 : Compact,
67 :
68 : #[strum(serialize = "create images")]
69 : CreateImages,
70 :
71 : #[strum(serialize = "logical size")]
72 : LogicalSize,
73 :
74 : #[strum(serialize = "imitate logical size")]
75 : ImitateLogicalSize,
76 :
77 : #[strum(serialize = "load layer map")]
78 : LoadLayerMap,
79 :
80 : #[strum(serialize = "gc")]
81 : Gc,
82 :
83 : #[strum(serialize = "find gc cutoffs")]
84 : FindGcCutoffs,
85 : }
86 :
87 108 : pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
88 108 : register_counter_vec!(
89 108 : "pageserver_storage_operations_seconds_sum",
90 108 : "Total time spent on storage operations with operation, tenant and timeline dimensions",
91 108 : &["operation", "tenant_id", "shard_id", "timeline_id"],
92 108 : )
93 108 : .expect("failed to define a metric")
94 108 : });
95 :
96 108 : pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
97 108 : register_int_counter_vec!(
98 108 : "pageserver_storage_operations_seconds_count",
99 108 : "Count of storage operations with operation, tenant and timeline dimensions",
100 108 : &["operation", "tenant_id", "shard_id", "timeline_id"],
101 108 : )
102 108 : .expect("failed to define a metric")
103 108 : });
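// Together, the `_sum` and `_count` metrics above emulate a histogram's
// sum/count pair per timeline without the cost of per-timeline buckets.
// An illustrative PromQL sketch (not part of this source) for the average
// operation latency:
//
//   rate(pageserver_storage_operations_seconds_sum[5m])
//     / rate(pageserver_storage_operations_seconds_count[5m])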
104 :
105 : // Buckets for background operation duration in seconds, like compaction, GC, size calculation.
106 : const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
107 :
108 108 : pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
109 108 : register_histogram_vec!(
110 108 : "pageserver_storage_operations_seconds_global",
111 108 : "Time spent on storage operations",
112 108 : &["operation"],
113 108 : STORAGE_OP_BUCKETS.into(),
114 108 : )
115 108 : .expect("failed to define a metric")
116 108 : });
117 :
118 : /// Measures layers visited per read (i.e. read amplification).
119 : ///
120 : /// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits
121 : /// is amortized across the batch, and some layers may not intersect with a given key, each visited
122 : /// layer contributes directly to the observed latency for every read in the batch, which is what we
123 : /// care about.
124 108 : pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
125 108 : register_histogram_vec!(
126 108 : "pageserver_layers_per_read",
127 108 : "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
128 108 : &["tenant_id", "shard_id", "timeline_id"],
129 108 : // Low resolution to reduce cardinality.
130 108 : vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
131 108 : )
132 108 : .expect("failed to define a metric")
133 108 : });
134 :
135 106 : pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
136 106 : register_histogram!(
137 106 : "pageserver_layers_per_read_global",
138 106 : "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
139 106 : vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
140 106 : )
141 106 : .expect("failed to define a metric")
142 106 : });
143 :
144 106 : pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
145 106 : register_histogram!(
146 106 : "pageserver_layers_per_read_batch_global",
147 106 : "Layers visited to serve a single read batch (read amplification), regardless of number of reads.",
148 106 : vec![
149 106 : 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
150 106 : ],
151 106 : )
152 106 : .expect("failed to define a metric")
153 106 : });
154 :
155 106 : pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
156 106 : register_histogram!(
157 106 : "pageserver_layers_per_read_amortized_global",
158 106 : "Layers visited to serve a single read (read amplification). Amortized across a batch: \
159 106 : all visited layers are divided by the number of reads.",
160 106 : vec![
161 106 : 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
162 106 : ],
163 106 : )
164 106 : .expect("failed to define a metric")
165 106 : });
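// Worked example for the read-amplification histograms above (illustrative
// numbers): a batch of 3 reads that visits 12 layers in total records
//   - pageserver_layers_per_read{...}: an observation of 12 for each of the 3 reads,
//   - pageserver_layers_per_read_batch_global: a single observation of 12,
//   - pageserver_layers_per_read_amortized_global: the per-read share, 12 / 3 = 4.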
166 :
167 106 : pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
168 106 : // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
169 106 : register_histogram!(
170 106 : "pageserver_deltas_per_read_global",
171 106 : "Number of delta pages applied to image page per read",
172 106 : vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
173 106 : )
174 106 : .expect("failed to define a metric")
175 106 : });
176 :
177 0 : pub(crate) static CONCURRENT_INITDBS: Lazy<UIntGauge> = Lazy::new(|| {
178 0 : register_uint_gauge!(
179 0 : "pageserver_concurrent_initdb",
180 0 : "Number of initdb processes running"
181 0 : )
182 0 : .expect("failed to define a metric")
183 0 : });
184 :
185 0 : pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy<Histogram> = Lazy::new(|| {
186 0 : register_histogram!(
187 0 : "pageserver_initdb_semaphore_seconds_global",
188 0 : "Time spent getting a permit from the global initdb semaphore",
189 0 : STORAGE_OP_BUCKETS.into()
190 0 : )
191 0 : .expect("failed to define metric")
192 0 : });
193 :
194 0 : pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
195 0 : register_histogram!(
196 0 : "pageserver_initdb_seconds_global",
197 0 : "Time spent performing initdb",
198 0 : STORAGE_OP_BUCKETS.into()
199 0 : )
200 0 : .expect("failed to define metric")
201 0 : });
202 :
203 : pub(crate) struct GetVectoredLatency {
204 : map: EnumMap<TaskKind, Option<Histogram>>,
205 : }
206 :
207 : #[allow(dead_code)]
208 : pub(crate) struct ScanLatency {
209 : map: EnumMap<TaskKind, Option<Histogram>>,
210 : }
211 :
212 : impl GetVectoredLatency {
213 : // Only these task types perform vectored gets. Filter all other tasks out to reduce total
214 : // cardinality of the metric.
215 : const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
216 :
217 10888 : pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
218 10888 : self.map[task_kind].as_ref()
219 10888 : }
220 : }
221 :
222 : impl ScanLatency {
223 : // Only these task types perform scans. Filter all other tasks out to reduce total
224 : // cardinality of the metric.
225 : const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler];
226 :
227 8 : pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
228 8 : self.map[task_kind].as_ref()
229 8 : }
230 : }
231 :
232 : pub(crate) struct ScanLatencyOngoingRecording<'a> {
233 : parent: &'a Histogram,
234 : start: std::time::Instant,
235 : }
236 :
237 : impl<'a> ScanLatencyOngoingRecording<'a> {
238 0 : pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> {
239 0 : let start = Instant::now();
240 0 : ScanLatencyOngoingRecording { parent, start }
241 0 : }
242 :
243 0 : pub(crate) fn observe(self) {
244 0 : let elapsed = self.start.elapsed();
245 0 : self.parent.observe(elapsed.as_secs_f64());
246 0 : }
247 : }
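// Usage sketch (illustrative; `ctx` and the scan body are assumptions, not
// names from this excerpt):
//
//   if let Some(histo) = SCAN_LATENCY.for_task_kind(ctx.task_kind()) {
//       let recording = ScanLatencyOngoingRecording::start_recording(histo);
//       /* ... perform the scan ... */
//       recording.observe();
//   }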
248 :
249 104 : pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
250 104 : let inner = register_histogram_vec!(
251 104 : "pageserver_get_vectored_seconds",
252 104 : "Time spent in get_vectored.",
253 104 : &["task_kind"],
254 104 : CRITICAL_OP_BUCKETS.into(),
255 104 : )
256 104 : .expect("failed to define a metric");
257 104 :
258 104 : GetVectoredLatency {
259 3328 : map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
260 3328 : let task_kind = TaskKind::from_usize(task_kind_idx);
261 3328 :
262 3328 : if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
263 208 : let task_kind = task_kind.into();
264 208 : Some(inner.with_label_values(&[task_kind]))
265 : } else {
266 3120 : None
267 : }
268 3328 : })),
269 104 : }
270 104 : });
271 :
272 3 : pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
273 3 : let inner = register_histogram_vec!(
274 3 : "pageserver_scan_seconds",
275 3 : "Time spent in scan.",
276 3 : &["task_kind"],
277 3 : CRITICAL_OP_BUCKETS.into(),
278 3 : )
279 3 : .expect("failed to define a metric");
280 3 :
281 3 : ScanLatency {
282 96 : map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
283 96 : let task_kind = TaskKind::from_usize(task_kind_idx);
284 96 :
285 96 : if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
286 3 : let task_kind = task_kind.into();
287 3 : Some(inner.with_label_values(&[task_kind]))
288 : } else {
289 93 : None
290 : }
291 96 : })),
292 3 : }
293 3 : });
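// Usage sketch for the vectored-get histogram (illustrative; `ctx` and
// `started_at` are assumptions, not names from this file):
//
//   if let Some(histo) = GET_VECTORED_LATENCY.for_task_kind(ctx.task_kind()) {
//       histo.observe(started_at.elapsed().as_secs_f64());
//   }
//
// Task kinds outside TRACKED_TASK_KINDS map to `None` and are simply not
// recorded, which keeps the metric's label cardinality at two values.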
294 :
295 : pub(crate) struct PageCacheMetricsForTaskKind {
296 : pub read_accesses_immutable: IntCounter,
297 : pub read_hits_immutable: IntCounter,
298 : }
299 :
300 : pub(crate) struct PageCacheMetrics {
301 : map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
302 : }
303 :
304 50 : static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
305 50 : register_int_counter_vec!(
306 50 : "pageserver_page_cache_read_hits_total",
307 50 : "Number of read accesses to the page cache that hit",
308 50 : &["task_kind", "key_kind", "content_kind", "hit_kind"]
309 50 : )
310 50 : .expect("failed to define a metric")
311 50 : });
312 :
313 50 : static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
314 50 : register_int_counter_vec!(
315 50 : "pageserver_page_cache_read_accesses_total",
316 50 : "Number of read accesses to the page cache",
317 50 : &["task_kind", "key_kind", "content_kind"]
318 50 : )
319 50 : .expect("failed to define a metric")
320 50 : });
321 :
322 50 : pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
323 1600 : map: EnumMap::from_array(std::array::from_fn(|task_kind| {
324 1600 : let task_kind = TaskKind::from_usize(task_kind);
325 1600 : let task_kind: &'static str = task_kind.into();
326 12800 : EnumMap::from_array(std::array::from_fn(|content_kind| {
327 12800 : let content_kind = PageContentKind::from_usize(content_kind);
328 12800 : let content_kind: &'static str = content_kind.into();
329 12800 : PageCacheMetricsForTaskKind {
330 12800 : read_accesses_immutable: {
331 12800 : PAGE_CACHE_READ_ACCESSES
332 12800 : .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
333 12800 : .unwrap()
334 12800 : },
335 12800 :
336 12800 : read_hits_immutable: {
337 12800 : PAGE_CACHE_READ_HITS
338 12800 : .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
339 12800 : .unwrap()
340 12800 : },
341 12800 : }
342 12800 : }))
343 1600 : })),
344 50 : });
345 :
346 : impl PageCacheMetrics {
347 586904 : pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
348 586904 : &self.map[ctx.task_kind()][ctx.page_content_kind()]
349 586904 : }
350 : }
351 :
352 : pub(crate) struct PageCacheSizeMetrics {
353 : pub max_bytes: UIntGauge,
354 :
355 : pub current_bytes_immutable: UIntGauge,
356 : }
357 :
358 50 : static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
359 50 : register_uint_gauge_vec!(
360 50 : "pageserver_page_cache_size_current_bytes",
361 50 : "Current size of the page cache in bytes, by key kind",
362 50 : &["key_kind"]
363 50 : )
364 50 : .expect("failed to define a metric")
365 50 : });
366 :
367 : pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
368 50 : Lazy::new(|| PageCacheSizeMetrics {
369 50 : max_bytes: {
370 50 : register_uint_gauge!(
371 50 : "pageserver_page_cache_size_max_bytes",
372 50 : "Maximum size of the page cache in bytes"
373 50 : )
374 50 : .expect("failed to define a metric")
375 50 : },
376 50 : current_bytes_immutable: {
377 50 : PAGE_CACHE_SIZE_CURRENT_BYTES
378 50 : .get_metric_with_label_values(&["immutable"])
379 50 : .unwrap()
380 50 : },
381 50 : });
382 :
383 : pub(crate) mod page_cache_eviction_metrics {
384 : use std::num::NonZeroUsize;
385 :
386 : use metrics::{IntCounter, IntCounterVec, register_int_counter_vec};
387 : use once_cell::sync::Lazy;
388 :
389 : #[derive(Clone, Copy)]
390 : pub(crate) enum Outcome {
391 : FoundSlotUnused { iters: NonZeroUsize },
392 : FoundSlotEvicted { iters: NonZeroUsize },
393 : ItersExceeded { iters: NonZeroUsize },
394 : }
395 :
396 50 : static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
397 50 : register_int_counter_vec!(
398 50 : "pageserver_page_cache_find_victim_iters_total",
399 50 : "Counter for the number of iterations in the find_victim loop",
400 50 : &["outcome"],
401 50 : )
402 50 : .expect("failed to define a metric")
403 50 : });
404 :
405 50 : static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
406 50 : register_int_counter_vec!(
407 50 : "pageserver_page_cache_find_victim_calls",
408 50 : "Incremented at the end of each find_victim() call.\
409 50 : Filter by outcome to get e.g., eviction rate.",
410 50 : &["outcome"]
411 50 : )
412 50 : .unwrap()
413 50 : });
414 :
415 15429 : pub(crate) fn observe(outcome: Outcome) {
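// The `dry!` macro below caches the per-label `IntCounter`s in `static`
// `Lazy` cells, one pair per expansion site, so the label-value lookup on
// the metric vecs happens once per label rather than on every call.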
416 : macro_rules! dry {
417 : ($label:literal, $iters:expr) => {{
418 : static LABEL: &'static str = $label;
419 : static ITERS_TOTAL: Lazy<IntCounter> =
420 59 : Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
421 : static CALLS: Lazy<IntCounter> =
422 59 : Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
423 : ITERS_TOTAL.inc_by(($iters.get()) as u64);
424 : CALLS.inc();
425 : }};
426 : }
427 15429 : match outcome {
428 820 : Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
429 14609 : Outcome::FoundSlotEvicted { iters } => {
430 14609 : dry!("found_evicted", iters)
431 : }
432 0 : Outcome::ItersExceeded { iters } => {
433 0 : dry!("err_iters_exceeded", iters);
434 0 : super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
435 0 : }
436 : }
437 15429 : }
438 : }
439 :
440 0 : static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
441 0 : register_int_counter_vec!(
442 0 : "page_cache_errors_total",
443 0 : "Number of timeouts while acquiring a pinned slot in the page cache",
444 0 : &["error_kind"]
445 0 : )
446 0 : .expect("failed to define a metric")
447 0 : });
448 :
449 0 : pub(crate) static FEATURE_FLAG_EVALUATION: Lazy<CounterVec> = Lazy::new(|| {
450 0 : register_counter_vec!(
451 0 : "pageserver_feature_flag_evaluation",
452 0 : "Number of times a feature flag is evaluated",
453 0 : &["flag_key", "status", "value"],
454 0 : )
455 0 : .unwrap()
456 0 : });
457 :
458 : #[derive(IntoStaticStr)]
459 : #[strum(serialize_all = "kebab_case")]
460 : pub(crate) enum PageCacheErrorKind {
461 : AcquirePinnedSlotTimeout,
462 : EvictIterLimit,
463 : }
464 :
465 0 : pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
466 0 : PAGE_CACHE_ERRORS
467 0 : .get_metric_with_label_values(&[error_kind.into()])
468 0 : .unwrap()
469 0 : .inc();
470 0 : }
471 :
472 11 : pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
473 11 : register_histogram!(
474 11 : "pageserver_wait_lsn_seconds",
475 11 : "Time spent waiting for WAL to arrive. Updated on completion of the wait_lsn operation.",
476 11 : CRITICAL_OP_BUCKETS.into(),
477 11 : )
478 11 : .expect("failed to define a metric")
479 11 : });
480 :
481 108 : pub(crate) static WAIT_LSN_START_FINISH_COUNTERPAIR: Lazy<IntCounterPairVec> = Lazy::new(|| {
482 108 : register_int_counter_pair_vec!(
483 108 : "pageserver_wait_lsn_started_count",
484 108 : "Number of wait_lsn operations started.",
485 108 : "pageserver_wait_lsn_finished_count",
486 108 : "Number of wait_lsn operations finished.",
487 108 : &["tenant_id", "shard_id", "timeline_id"],
488 108 : )
489 108 : .expect("failed to define a metric")
490 108 : });
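// The started/finished pair above lets dashboards derive a gauge of in-flight
// waits; an illustrative PromQL sketch (not part of this source):
//
//   pageserver_wait_lsn_started_count - pageserver_wait_lsn_finished_count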
491 :
492 108 : pub(crate) static WAIT_LSN_IN_PROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
493 108 : register_int_counter_vec!(
494 108 : "pageserver_wait_lsn_in_progress_micros",
495 108 : "Time spent waiting for WAL to arrive, by timeline_id. Updated periodically while waiting.",
496 108 : &["tenant_id", "shard_id", "timeline_id"],
497 108 : )
498 108 : .expect("failed to define a metric")
499 108 : });
500 :
501 108 : pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::new(|| {
502 108 : register_int_counter!(
503 108 : "pageserver_wait_lsn_in_progress_micros_global",
504 108 : "Time spent waiting for WAL to arrive, globally. Updated periodically while waiting."
505 108 : )
506 108 : .expect("failed to define a metric")
507 108 : });
508 :
509 3 : pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy<IntCounterVec> = Lazy::new(|| {
510 3 : register_int_counter_vec!(
511 3 : "pageserver_ondemand_download_bytes_total",
512 3 : "Total bytes of layers on-demand downloaded",
513 3 : &["task_kind"]
514 3 : )
515 3 : .expect("failed to define a metric")
516 3 : });
517 :
518 3 : pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
519 3 : register_int_counter_vec!(
520 3 : "pageserver_ondemand_download_count",
521 3 : "Total count of layers on-demand downloaded",
522 3 : &["task_kind"]
523 3 : )
524 3 : .expect("failed to define a metric")
525 3 : });
526 :
527 : pub(crate) mod wait_ondemand_download_time {
528 : use super::*;
529 : const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
530 : 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, // 10 ms - 100ms
531 : 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, // 100ms to 1s
532 : 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, // 1s to 10s
533 : 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, // 10s to 1m
534 : ];
535 :
536 : /// The task kinds for which we want to track wait times for on-demand downloads.
537 : /// Other task kinds' wait times are not recorded.
538 : pub(crate) const WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS: [TaskKind; 2] = [
539 : TaskKind::PageRequestHandler,
540 : TaskKind::WalReceiverConnectionHandler,
541 : ];
542 :
543 0 : pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL: Lazy<Vec<Histogram>> = Lazy::new(|| {
544 0 : let histo = register_histogram_vec!(
545 0 : "pageserver_wait_ondemand_download_seconds_global",
546 0 : "Observations are individual tasks' wait times for on-demand downloads. \
547 0 : If N tasks coalesce on an on-demand download, and it takes 10s, then we observe N * 10s.",
548 0 : &["task_kind"],
549 0 : WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS.into(),
550 0 : )
551 0 : .expect("failed to define a metric");
552 0 : WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
553 0 : .iter()
554 0 : .map(|task_kind| histo.with_label_values(&[task_kind.into()]))
555 0 : .collect::<Vec<_>>()
556 0 : });
557 :
558 108 : pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_SUM: Lazy<CounterVec> = Lazy::new(|| {
559 108 : register_counter_vec!(
560 108 : // use a name that _could_ be evolved into a per-timeline histogram later
561 108 : "pageserver_wait_ondemand_download_seconds_sum",
562 108 : "Like `pageserver_wait_ondemand_download_seconds_global` but per timeline",
563 108 : &["tenant_id", "shard_id", "timeline_id", "task_kind"],
564 108 : )
565 108 : .unwrap()
566 108 : });
567 :
568 : pub struct WaitOndemandDownloadTimeSum {
569 : counters: [Counter; WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS.len()],
570 : }
571 :
572 : impl WaitOndemandDownloadTimeSum {
573 234 : pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
574 234 : let counters = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
575 234 : .iter()
576 468 : .map(|task_kind| {
577 468 : WAIT_ONDEMAND_DOWNLOAD_TIME_SUM
578 468 : .get_metric_with_label_values(&[
579 468 : tenant_id,
580 468 : shard_id,
581 468 : timeline_id,
582 468 : task_kind.into(),
583 468 : ])
584 468 : .unwrap()
585 468 : })
586 234 : .collect::<Vec<_>>();
587 234 : Self {
588 234 : counters: counters.try_into().unwrap(),
589 234 : }
590 234 : }
591 12 : pub(crate) fn observe(&self, task_kind: TaskKind, duration: Duration) {
592 12 : let maybe = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
593 12 : .iter()
594 12 : .enumerate()
595 24 : .find(|(_, kind)| **kind == task_kind);
596 12 : let Some((idx, _)) = maybe else {
597 12 : return;
598 : };
599 0 : WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL[idx].observe(duration.as_secs_f64());
600 0 : let counter = &self.counters[idx];
601 0 : counter.inc_by(duration.as_secs_f64());
602 12 : }
603 : }
604 :
605 5 : pub(crate) fn shutdown_timeline(tenant_id: &str, shard_id: &str, timeline_id: &str) {
606 15 : for task_kind in WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS {
607 10 : let _ = WAIT_ONDEMAND_DOWNLOAD_TIME_SUM.remove_label_values(&[
608 10 : tenant_id,
609 10 : shard_id,
610 10 : timeline_id,
611 10 : task_kind.into(),
612 10 : ]);
613 10 : }
614 5 : }
615 :
616 0 : pub(crate) fn preinitialize_global_metrics() {
617 0 : Lazy::force(&WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL);
618 0 : }
619 : }
620 :
621 108 : static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
622 108 : register_int_gauge_vec!(
623 108 : "pageserver_last_record_lsn",
624 108 : "Last record LSN grouped by timeline",
625 108 : &["tenant_id", "shard_id", "timeline_id"]
626 108 : )
627 108 : .expect("failed to define a metric")
628 108 : });
629 :
630 108 : static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
631 108 : register_int_gauge_vec!(
632 108 : "pageserver_disk_consistent_lsn",
633 108 : "Disk consistent LSN grouped by timeline",
634 108 : &["tenant_id", "shard_id", "timeline_id"]
635 108 : )
636 108 : .expect("failed to define a metric")
637 108 : });
638 :
639 108 : pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
640 108 : register_uint_gauge_vec!(
641 108 : "pageserver_projected_remote_consistent_lsn",
642 108 : "Projected remote consistent LSN grouped by timeline",
643 108 : &["tenant_id", "shard_id", "timeline_id"]
644 108 : )
645 108 : .expect("failed to define a metric")
646 108 : });
647 :
648 108 : static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
649 108 : register_uint_gauge_vec!(
650 108 : "pageserver_pitr_history_size",
651 108 : "Data written since PITR cutoff on this timeline",
652 108 : &["tenant_id", "shard_id", "timeline_id"]
653 108 : )
654 108 : .expect("failed to define a metric")
655 108 : });
656 :
657 : #[derive(
658 60 : strum_macros::EnumIter,
659 0 : strum_macros::EnumString,
660 : strum_macros::Display,
661 : strum_macros::IntoStaticStr,
662 : )]
663 : #[strum(serialize_all = "kebab_case")]
664 : pub(crate) enum LayerKind {
665 : Delta,
666 : Image,
667 : }
668 :
669 : #[derive(
670 25 : strum_macros::EnumIter,
671 0 : strum_macros::EnumString,
672 : strum_macros::Display,
673 : strum_macros::IntoStaticStr,
674 : )]
675 : #[strum(serialize_all = "kebab_case")]
676 : pub(crate) enum LayerLevel {
677 : // We don't track the currently open ephemeral layer, since there's always exactly 1 and its
678 : // size changes. See `TIMELINE_EPHEMERAL_BYTES`.
679 : Frozen,
680 : L0,
681 : L1,
682 : }
683 :
684 106 : static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
685 106 : register_uint_gauge_vec!(
686 106 : "pageserver_layer_bytes",
687 106 : "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)",
688 106 : &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
689 106 : )
690 106 : .expect("failed to define a metric")
691 106 : });
692 :
693 106 : static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
694 106 : register_uint_gauge_vec!(
695 106 : "pageserver_layer_count",
696 106 : "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)",
697 106 : &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
698 106 : )
699 106 : .expect("failed to define a metric")
700 106 : });
701 :
702 108 : static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
703 108 : register_uint_gauge_vec!(
704 108 : "pageserver_archive_size",
705 108 : "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
706 108 : &["tenant_id", "shard_id", "timeline_id"]
707 108 : )
708 108 : .expect("failed to define a metric")
709 108 : });
710 :
711 108 : static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
712 108 : register_int_gauge_vec!(
713 108 : "pageserver_standby_horizon",
714 108 : "Standby apply LSN for which GC is hold off, by timeline.",
715 108 : &["tenant_id", "shard_id", "timeline_id"]
716 108 : )
717 108 : .expect("failed to define a metric")
718 108 : });
719 :
720 108 : static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
721 108 : register_uint_gauge_vec!(
722 108 : "pageserver_resident_physical_size",
723 108 : "The size of the layer files present in the pageserver's filesystem, for attached locations.",
724 108 : &["tenant_id", "shard_id", "timeline_id"]
725 108 : )
726 108 : .expect("failed to define a metric")
727 108 : });
728 :
729 108 : static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
730 108 : register_uint_gauge_vec!(
731 108 : "pageserver_visible_physical_size",
732 108 : "The size of the layer files present in the pageserver's filesystem.",
733 108 : &["tenant_id", "shard_id", "timeline_id"]
734 108 : )
735 108 : .expect("failed to define a metric")
736 108 : });
737 :
738 106 : pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
739 106 : register_uint_gauge!(
740 106 : "pageserver_resident_physical_size_global",
741 106 : "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
742 106 : )
743 106 : .expect("failed to define a metric")
744 106 : });
745 :
746 108 : static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
747 108 : register_uint_gauge_vec!(
748 108 : "pageserver_remote_physical_size",
749 108 : "The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
750 108 : // Corollary: If any files are missing from the index part, they won't be included here.
751 108 : &["tenant_id", "shard_id", "timeline_id"]
752 108 : )
753 108 : .expect("failed to define a metric")
754 108 : });
755 :
756 108 : static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
757 108 : register_uint_gauge!(
758 108 : "pageserver_remote_physical_size_global",
759 108 : "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
760 108 : )
761 108 : .expect("failed to define a metric")
762 108 : });
763 :
764 3 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
765 3 : register_int_counter!(
766 3 : "pageserver_remote_ondemand_downloaded_layers_total",
767 3 : "Total on-demand downloaded layers"
768 3 : )
769 3 : .unwrap()
770 3 : });
771 :
772 3 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
773 3 : register_int_counter!(
774 3 : "pageserver_remote_ondemand_downloaded_bytes_total",
775 3 : "Total bytes of layers on-demand downloaded",
776 3 : )
777 3 : .unwrap()
778 3 : });
779 :
780 108 : static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
781 108 : register_uint_gauge_vec!(
782 108 : "pageserver_current_logical_size",
783 108 : "Current logical size grouped by timeline",
784 108 : &["tenant_id", "shard_id", "timeline_id"]
785 108 : )
786 108 : .expect("failed to define current logical size metric")
787 108 : });
788 :
789 108 : static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
790 108 : register_int_gauge_vec!(
791 108 : "pageserver_aux_file_estimated_size",
792 108 : "The size of all aux files for a timeline in aux file v2 store.",
793 108 : &["tenant_id", "shard_id", "timeline_id"]
794 108 : )
795 108 : .expect("failed to define a metric")
796 108 : });
797 :
798 108 : static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
799 108 : register_uint_gauge_vec!(
800 108 : "pageserver_valid_lsn_lease_count",
801 108 : "The number of valid leases after refreshing gc info.",
802 108 : &["tenant_id", "shard_id", "timeline_id"],
803 108 : )
804 108 : .expect("failed to define a metric")
805 108 : });
806 :
807 0 : pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
808 0 : register_int_counter!(
809 0 : "pageserver_circuit_breaker_broken",
810 0 : "How many times a circuit breaker has broken"
811 0 : )
812 0 : .expect("failed to define a metric")
813 0 : });
814 :
815 0 : pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
816 0 : register_int_counter!(
817 0 : "pageserver_circuit_breaker_unbroken",
818 0 : "How many times a circuit breaker has been un-broken (recovered)"
819 0 : )
820 0 : .expect("failed to define a metric")
821 0 : });
822 :
823 104 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
824 104 : register_int_counter!(
825 104 : "pageserver_compression_image_in_bytes_total",
826 104 : "Size of data written into image layers before compression"
827 104 : )
828 104 : .expect("failed to define a metric")
829 104 : });
830 :
831 104 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
832 104 : register_int_counter!(
833 104 : "pageserver_compression_image_in_bytes_considered",
834 104 : "Size of potentially compressible data written into image layers before compression"
835 104 : )
836 104 : .expect("failed to define a metric")
837 104 : });
838 :
839 104 : pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
840 104 : register_int_counter!(
841 104 : "pageserver_compression_image_in_bytes_chosen",
842 104 : "Size of data whose compressed form was written into image layers"
843 104 : )
844 104 : .expect("failed to define a metric")
845 104 : });
846 :
847 104 : pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
848 104 : register_int_counter!(
849 104 : "pageserver_compression_image_out_bytes_total",
850 104 : "Size of compressed image layer written"
851 104 : )
852 104 : .expect("failed to define a metric")
853 104 : });
854 :
855 5 : pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
856 5 : register_uint_gauge!(
857 5 : "pageserver_relsize_latest_cache_entries",
858 5 : "Number of entries in the latest relation size cache",
859 5 : )
860 5 : .expect("failed to define a metric")
861 5 : });
862 :
863 5 : pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
864 5 : register_int_counter!(
865 5 : "pageserver_relsize_latest_cache_hits",
866 5 : "Latest relation size cache hits",
867 5 : )
868 5 : .expect("failed to define a metric")
869 5 : });
870 :
871 4 : pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
872 4 : register_int_counter!(
873 4 : "pageserver_relsize_latest_cache_misses",
874 4 : "Relation size latest cache misses",
875 4 : )
876 4 : .expect("failed to define a metric")
877 4 : });
878 :
879 2 : pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
880 2 : register_uint_gauge!(
881 2 : "pageserver_relsize_snapshot_cache_entries",
882 2 : "Number of entries in the pitr relation size cache",
883 2 : )
884 2 : .expect("failed to define a metric")
885 2 : });
886 :
887 2 : pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
888 2 : register_int_counter!(
889 2 : "pageserver_relsize_snapshot_cache_hits",
890 2 : "Pitr relation size cache hits",
891 2 : )
892 2 : .expect("failed to define a metric")
893 2 : });
894 :
895 3 : pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
896 3 : register_int_counter!(
897 3 : "pageserver_relsize_snapshot_cache_misses",
898 3 : "Relation size snapshot cache misses",
899 3 : )
900 3 : .expect("failed to define a metric")
901 3 : });
902 :
903 2 : pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
904 2 : register_int_counter!(
905 2 : "pageserver_relsize_cache_misses_old",
906 2 : "Relation size cache misses where the lookup LSN is older than the last relation update"
907 2 : )
908 2 : .expect("failed to define a metric")
909 2 : });
910 :
911 : pub(crate) mod initial_logical_size {
912 : use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec};
913 : use once_cell::sync::Lazy;
914 :
915 : pub(crate) struct StartCalculation(IntCounterVec);
916 108 : pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
917 108 : StartCalculation(
918 108 : register_int_counter_vec!(
919 108 : "pageserver_initial_logical_size_start_calculation",
920 108 : "Incremented each time we start an initial logical size calculation attempt. \
921 108 : The `circumstances` label provides some additional details.",
922 108 : &["attempt", "circumstances"]
923 108 : )
924 108 : .unwrap(),
925 108 : )
926 108 : });
927 :
928 : struct DropCalculation {
929 : first: IntCounter,
930 : retry: IntCounter,
931 : }
932 :
933 108 : static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
934 108 : let vec = register_int_counter_vec!(
935 108 : "pageserver_initial_logical_size_drop_calculation",
936 108 : "Incremented each time we abort a started size calculation attmpt.",
937 108 : &["attempt"]
938 108 : )
939 108 : .unwrap();
940 108 : DropCalculation {
941 108 : first: vec.with_label_values(&["first"]),
942 108 : retry: vec.with_label_values(&["retry"]),
943 108 : }
944 108 : });
945 :
946 : pub(crate) struct Calculated {
947 : pub(crate) births: IntCounter,
948 : pub(crate) deaths: IntCounter,
949 : }
950 :
951 108 : pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
952 108 : births: register_int_counter!(
953 108 : "pageserver_initial_logical_size_finish_calculation",
954 108 : "Incremented every time we finish calculation of initial logical size.\
955 108 : If everything is working well, this should happen at most once per Timeline object."
956 108 : )
957 108 : .unwrap(),
958 108 : deaths: register_int_counter!(
959 108 : "pageserver_initial_logical_size_drop_finished_calculation",
960 108 : "Incremented when we drop a finished initial logical size calculation result.\
961 108 : Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
962 108 : )
963 108 : .unwrap(),
964 108 : });
965 :
966 : pub(crate) struct OngoingCalculationGuard {
967 : inc_drop_calculation: Option<IntCounter>,
968 : }
969 :
970 : #[derive(strum_macros::IntoStaticStr)]
971 : pub(crate) enum StartCircumstances {
972 : EmptyInitial,
973 : SkippedConcurrencyLimiter,
974 : AfterBackgroundTasksRateLimit,
975 : }
976 :
977 : impl StartCalculation {
978 114 : pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
979 114 : let circumstances_label: &'static str = circumstances.into();
980 114 : self.0
981 114 : .with_label_values(&["first", circumstances_label])
982 114 : .inc();
983 114 : OngoingCalculationGuard {
984 114 : inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
985 114 : }
986 114 : }
987 0 : pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
988 0 : let circumstances_label: &'static str = circumstances.into();
989 0 : self.0
990 0 : .with_label_values(&["retry", circumstances_label])
991 0 : .inc();
992 0 : OngoingCalculationGuard {
993 0 : inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
994 0 : }
995 0 : }
996 : }
997 :
998 : impl Drop for OngoingCalculationGuard {
999 114 : fn drop(&mut self) {
1000 114 : if let Some(counter) = self.inc_drop_calculation.take() {
1001 0 : counter.inc();
1002 114 : }
1003 114 : }
1004 : }
1005 :
1006 : impl OngoingCalculationGuard {
1007 114 : pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
1008 114 : drop(self.inc_drop_calculation.take());
1009 114 : CALCULATED.births.inc();
1010 114 : FinishedCalculationGuard {
1011 114 : inc_on_drop: CALCULATED.deaths.clone(),
1012 114 : }
1013 114 : }
1014 : }
1015 :
1016 : pub(crate) struct FinishedCalculationGuard {
1017 : inc_on_drop: IntCounter,
1018 : }
1019 :
1020 : impl Drop for FinishedCalculationGuard {
1021 3 : fn drop(&mut self) {
1022 3 : self.inc_on_drop.inc();
1023 3 : }
1024 : }
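// Lifecycle sketch of the two guards (illustrative; `compute_initial_size`
// is a hypothetical helper, not a function in this codebase):
//
//   let guard = START_CALCULATION.first(StartCircumstances::EmptyInitial);
//   let size = compute_initial_size()?;
//   // Converting the guard marks the calculation finished (births.inc());
//   // dropping the returned guard later bumps deaths, so births - deaths
//   // approximates the number of live, finished calculations.
//   let _finished = guard.calculation_result_saved();
//
// If `guard` is instead dropped before conversion, the drop-calculation
// counter is incremented, flagging an aborted attempt.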
1025 :
1026 : // context: https://github.com/neondatabase/neon/issues/5963
1027 : pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
1028 0 : Lazy::new(|| {
1029 0 : register_int_counter!(
1030 0 : "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
1031 0 : "Counter for the following event: walreceiver calls\
1032 0 : Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
1033 0 : )
1034 0 : .unwrap()
1035 0 : });
1036 : }
1037 :
1038 0 : static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
1039 0 : register_uint_gauge_vec!(
1040 0 : "pageserver_directory_entries_count",
1041 0 : "Sum of the entries in pageserver-stored directory listings",
1042 0 : &["tenant_id", "shard_id", "timeline_id"]
1043 0 : )
1044 0 : .expect("failed to define a metric")
1045 0 : });
1046 :
1047 109 : pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
1048 109 : register_uint_gauge_vec!(
1049 109 : "pageserver_tenant_states_count",
1050 109 : "Count of tenants per state",
1051 109 : &["state"]
1052 109 : )
1053 109 : .expect("Failed to register pageserver_tenant_states_count metric")
1054 109 : });
1055 :
1056 108 : pub(crate) static TIMELINE_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
1057 108 : register_uint_gauge_vec!(
1058 108 : "pageserver_timeline_states_count",
1059 108 : "Count of timelines per state",
1060 108 : &["state"]
1061 108 : )
1062 108 : .expect("Failed to register pageserver_timeline_states_count metric")
1063 108 : });
1064 :
1065 : /// A set of broken tenants.
1066 : ///
1067 : /// These are expected to be so rare that a set is fine. Set as in a new timeseries for each broken
1068 : /// tenant.
1069 5 : pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
1070 5 : register_uint_gauge_vec!(
1071 5 : "pageserver_broken_tenants_count",
1072 5 : "Set of broken tenants",
1073 5 : &["tenant_id", "shard_id"]
1074 5 : )
1075 5 : .expect("Failed to register pageserver_tenant_states_count metric")
1076 5 : });
1077 :
1078 3 : pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
1079 3 : register_uint_gauge_vec!(
1080 3 : "pageserver_tenant_synthetic_cached_size_bytes",
1081 3 : "Synthetic size of each tenant in bytes",
1082 3 : &["tenant_id"]
1083 3 : )
1084 3 : .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
1085 3 : });
1086 :
1087 109 : pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy<UIntGaugeVec> = Lazy::new(|| {
1088 109 : register_uint_gauge_vec!(
1089 109 : "pageserver_tenant_offloaded_timelines",
1090 109 : "Number of offloaded timelines of a tenant",
1091 109 : &["tenant_id", "shard_id"]
1092 109 : )
1093 109 : .expect("Failed to register pageserver_tenant_offloaded_timelines metric")
1094 109 : });
1095 :
1096 0 : pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
1097 0 : register_histogram_vec!(
1098 0 : "pageserver_eviction_iteration_duration_seconds_global",
1099 0 : "Time spent on a single eviction iteration",
1100 0 : &["period_secs", "threshold_secs"],
1101 0 : STORAGE_OP_BUCKETS.into(),
1102 0 : )
1103 0 : .expect("failed to define a metric")
1104 0 : });
1105 :
1106 108 : static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
1107 108 : register_int_counter_vec!(
1108 108 : "pageserver_evictions",
1109 108 : "Number of layers evicted from the pageserver",
1110 108 : &["tenant_id", "shard_id", "timeline_id"]
1111 108 : )
1112 108 : .expect("failed to define a metric")
1113 108 : });
1114 :
1115 108 : static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
1116 108 : register_int_counter_vec!(
1117 108 : "pageserver_evictions_with_low_residence_duration",
1118 108 : "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
1119 108 : Residence duration is determined using the `residence_duration_data_source`.",
1120 108 : &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
1121 108 : )
1122 108 : .expect("failed to define a metric")
1123 108 : });
1124 :
1125 0 : pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
1126 0 : register_int_counter!(
1127 0 : "pageserver_unexpected_ondemand_downloads_count",
1128 0 : "Number of unexpected on-demand downloads. \
1129 0 : We log more context for each increment, so we forgo any labels in this metric.",
1130 0 : )
1131 0 : .expect("failed to define a metric")
1132 0 : });
1133 :
1134 : /// How long did we take to start up? Broken down by labels to describe
1135 : /// different phases of startup.
1136 0 : pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
1137 0 : register_gauge_vec!(
1138 0 : "pageserver_startup_duration_seconds",
1139 0 : "Time taken by phases of pageserver startup, in seconds",
1140 0 : &["phase"]
1141 0 : )
1142 0 : .expect("Failed to register pageserver_startup_duration_seconds metric")
1143 0 : });
1144 :
1145 0 : pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
1146 0 : register_uint_gauge!(
1147 0 : "pageserver_startup_is_loading",
1148 0 : "1 while in initial startup load of tenants, 0 at other times"
1149 0 : )
1150 0 : .expect("Failed to register pageserver_startup_is_loading")
1151 0 : });
1152 :
1153 106 : pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
1154 106 : register_uint_gauge!(
1155 106 : "pageserver_timeline_ephemeral_bytes",
1156 106 : "Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated."
1157 106 : )
1158 106 : .expect("Failed to register metric")
1159 106 : });
1160 :
1161 : /// Metrics related to the lifecycle of a [`crate::tenant::TenantShard`] object: things
1162 : /// like how long it took to load.
1163 : ///
1164 : /// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
1165 : /// metrics are rather expensive, and usually fine-grained stuff makes more sense
1166 : /// at a timeline level than tenant level.
1167 : pub(crate) struct TenantMetrics {
1168 : /// How long did tenants take to go from construction to active state?
1169 : pub(crate) activation: Histogram,
1170 : pub(crate) preload: Histogram,
1171 : pub(crate) attach: Histogram,
1172 :
1173 : /// How many tenants are included in the initial startup of the pageserver?
1174 : pub(crate) startup_scheduled: IntCounter,
1175 : pub(crate) startup_complete: IntCounter,
1176 : }
1177 :
1178 0 : pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
1179 0 : TenantMetrics {
1180 0 : activation: register_histogram!(
1181 0 : "pageserver_tenant_activation_seconds",
1182 0 : "Time taken by tenants to activate, in seconds",
1183 0 : CRITICAL_OP_BUCKETS.into()
1184 0 : )
1185 0 : .expect("Failed to register metric"),
1186 0 : preload: register_histogram!(
1187 0 : "pageserver_tenant_preload_seconds",
1188 0 : "Time taken by tenants to load remote metadata on startup/attach, in seconds",
1189 0 : CRITICAL_OP_BUCKETS.into()
1190 0 : )
1191 0 : .expect("Failed to register metric"),
1192 0 : attach: register_histogram!(
1193 0 : "pageserver_tenant_attach_seconds",
1194 0 : "Time taken by tenants to intialize, after remote metadata is already loaded",
1195 0 : CRITICAL_OP_BUCKETS.into()
1196 0 : )
1197 0 : .expect("Failed to register metric"),
1198 0 : startup_scheduled: register_int_counter!(
1199 0 : "pageserver_tenant_startup_scheduled",
1200 0 : "Number of tenants included in pageserver startup (doesn't count tenants attached later)"
1201 0 : ).expect("Failed to register metric"),
1202 0 : startup_complete: register_int_counter!(
1203 0 : "pageserver_tenant_startup_complete",
1204 0 : "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
1205 0 : should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \
1206 0 : tenants: such cases will lead to this metric never reaching the scheduled count."
1207 0 : ).expect("Failed to register metric"),
1208 0 : }
1209 0 : });
1210 :
1211 : /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
1212 : #[derive(Debug)]
1213 : pub(crate) struct EvictionsWithLowResidenceDuration {
1214 : data_source: &'static str,
1215 : threshold: Duration,
1216 : counter: Option<IntCounter>,
1217 : }
1218 :
1219 : pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
1220 : data_source: &'static str,
1221 : threshold: Duration,
1222 : }
1223 :
1224 : impl EvictionsWithLowResidenceDurationBuilder {
1225 234 : pub fn new(data_source: &'static str, threshold: Duration) -> Self {
1226 234 : Self {
1227 234 : data_source,
1228 234 : threshold,
1229 234 : }
1230 234 : }
1231 :
1232 234 : fn build(
1233 234 : &self,
1234 234 : tenant_id: &str,
1235 234 : shard_id: &str,
1236 234 : timeline_id: &str,
1237 234 : ) -> EvictionsWithLowResidenceDuration {
1238 234 : let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
1239 234 : .get_metric_with_label_values(&[
1240 234 : tenant_id,
1241 234 : shard_id,
1242 234 : timeline_id,
1243 234 : self.data_source,
1244 234 : &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
1245 234 : ])
1246 234 : .unwrap();
1247 234 : EvictionsWithLowResidenceDuration {
1248 234 : data_source: self.data_source,
1249 234 : threshold: self.threshold,
1250 234 : counter: Some(counter),
1251 234 : }
1252 234 : }
1253 : }
1254 :
1255 : impl EvictionsWithLowResidenceDuration {
1256 239 : fn threshold_label_value(threshold: Duration) -> String {
1257 239 : format!("{}", threshold.as_secs())
1258 239 : }
1259 :
1260 2 : pub fn observe(&self, observed_value: Duration) {
1261 2 : if observed_value < self.threshold {
1262 2 : self.counter
1263 2 : .as_ref()
1264 2 : .expect("nobody calls this function after `remove_from_vec`")
1265 2 : .inc();
1266 2 : }
1267 2 : }
1268 :
1269 0 : pub fn change_threshold(
1270 0 : &mut self,
1271 0 : tenant_id: &str,
1272 0 : shard_id: &str,
1273 0 : timeline_id: &str,
1274 0 : new_threshold: Duration,
1275 0 : ) {
1276 0 : if new_threshold == self.threshold {
1277 0 : return;
1278 0 : }
1279 0 : let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
1280 0 : self.data_source,
1281 0 : new_threshold,
1282 0 : )
1283 0 : .build(tenant_id, shard_id, timeline_id);
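// After the swap below, `with_new` holds the old threshold's counter;
// removing it drops the stale label values from the metric vec.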
1284 0 : std::mem::swap(self, &mut with_new);
1285 0 : with_new.remove(tenant_id, shard_id, timeline_id);
1286 0 : }
1287 :
1288 : // This could be a `Drop` impl, but we need the `tenant_id` and `timeline_id`.
1289 5 : fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
1290 5 : let Some(_counter) = self.counter.take() else {
1291 0 : return;
1292 : };
1293 :
1294 5 : let threshold = Self::threshold_label_value(self.threshold);
1295 5 :
1296 5 : let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
1297 5 : tenant_id,
1298 5 : shard_id,
1299 5 : timeline_id,
1300 5 : self.data_source,
1301 5 : &threshold,
1302 5 : ]);
1303 5 :
1304 5 : match removed {
1305 0 : Err(e) => {
1306 0 : // this has been hit in staging as
1307 0 : // <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
1308 0 : // because we can be in the drop path already, don't risk:
1309 0 : // - "double-panic => illegal instruction" or
1310 0 : // - future "drop panic => abort"
1311 0 : //
1312 0 : // so just nag: (the error has the labels)
1313 0 : tracing::warn!(
1314 0 : "failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"
1315 : );
1316 : }
1317 : Ok(()) => {
1318 : // to help identify cases where we double-remove the same values, let's log all
1319 : // deletions?
1320 5 : tracing::info!(
1321 0 : "removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}",
1322 : self.data_source
1323 : );
1324 : }
1325 : }
1326 5 : }
1327 : }
1328 :
1329 : // Metrics collected on disk IO operations
1330 : //
1331 : // Roughly logarithmic scale.
1332 : const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
1333 : 0.00005, // 50us
1334 : 0.00006, // 60us
1335 : 0.00007, // 70us
1336 : 0.00008, // 80us
1337 : 0.00009, // 90us
1338 : 0.0001, // 100us
1339 : 0.000110, // 110us
1340 : 0.000120, // 120us
1341 : 0.000130, // 130us
1342 : 0.000140, // 140us
1343 : 0.000150, // 150us
1344 : 0.000160, // 160us
1345 : 0.000170, // 170us
1346 : 0.000180, // 180us
1347 : 0.000190, // 190us
1348 : 0.000200, // 200us
1349 : 0.000210, // 210us
1350 : 0.000220, // 220us
1351 : 0.000230, // 230us
1352 : 0.000240, // 240us
1353 : 0.000250, // 250us
1354 : 0.000300, // 300us
1355 : 0.000350, // 350us
1356 : 0.000400, // 400us
1357 : 0.000450, // 450us
1358 : 0.000500, // 500us
1359 : 0.000600, // 600us
1360 : 0.000700, // 700us
1361 : 0.000800, // 800us
1362 : 0.000900, // 900us
1363 : 0.001000, // 1ms
1364 : 0.002000, // 2ms
1365 : 0.003000, // 3ms
1366 : 0.004000, // 4ms
1367 : 0.005000, // 5ms
1368 : 0.01000, // 10ms
1369 : 0.02000, // 20ms
1370 : 0.05000, // 50ms
1371 : ];
1372 :
1373 : /// VirtualFile fs operation variants.
1374 : ///
1375 : /// Operations:
1376 : /// - open ([`std::fs::OpenOptions::open`])
1377 : /// - close (dropping [`crate::virtual_file::VirtualFile`])
1378 : /// - close-by-replace (close by replacement algorithm)
1379 : /// - read (`read_at`)
1380 : /// - write (`write_at`)
1381 : /// - seek (modify the internal position, or query the file length)
1382 : /// - fsync ([`std::fs::File::sync_all`])
1383 : /// - metadata ([`std::fs::File::metadata`])
1384 : #[derive(
1385 0 : Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
1386 : )]
1387 : pub(crate) enum StorageIoOperation {
1388 : Open,
1389 : OpenAfterReplace,
1390 : Close,
1391 : CloseByReplace,
1392 : Read,
1393 : Write,
1394 : Seek,
1395 : Fsync,
1396 : Metadata,
1397 : SetLen,
1398 : }
1399 :
1400 : impl StorageIoOperation {
1401 1210 : pub fn as_str(&self) -> &'static str {
1402 1210 : match self {
1403 121 : StorageIoOperation::Open => "open",
1404 121 : StorageIoOperation::OpenAfterReplace => "open-after-replace",
1405 121 : StorageIoOperation::Close => "close",
1406 121 : StorageIoOperation::CloseByReplace => "close-by-replace",
1407 121 : StorageIoOperation::Read => "read",
1408 121 : StorageIoOperation::Write => "write",
1409 121 : StorageIoOperation::Seek => "seek",
1410 121 : StorageIoOperation::Fsync => "fsync",
1411 121 : StorageIoOperation::Metadata => "metadata",
1412 121 : StorageIoOperation::SetLen => "set_len",
1413 : }
1414 1210 : }
1415 : }
1416 :
1417 : /// Tracks time taken by fs operations near VirtualFile.
1418 : #[derive(Debug)]
1419 : pub(crate) struct StorageIoTime {
1420 : metrics: [Histogram; StorageIoOperation::COUNT],
1421 : }
1422 :
1423 : impl StorageIoTime {
1424 121 : fn new() -> Self {
1425 121 : let storage_io_histogram_vec = register_histogram_vec!(
1426 121 : "pageserver_io_operations_seconds",
1427 121 : "Time spent in IO operations",
1428 121 : &["operation"],
1429 121 : STORAGE_IO_TIME_BUCKETS.into()
1430 121 : )
1431 121 : .expect("failed to define a metric");
1432 1210 : let metrics = std::array::from_fn(|i| {
1433 1210 : let op = StorageIoOperation::from_repr(i).unwrap();
1434 1210 : storage_io_histogram_vec
1435 1210 : .get_metric_with_label_values(&[op.as_str()])
1436 1210 : .unwrap()
1437 1210 : });
1438 121 : Self { metrics }
1439 121 : }
1440 :
1441 493998 : pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
1442 493998 : &self.metrics[op as usize]
1443 493998 : }
1444 : }
1445 :
1446 : pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
1447 :
1448 : #[derive(Clone, Copy)]
1449 : #[repr(usize)]
1450 : pub(crate) enum StorageIoSizeOperation {
1451 : Read,
1452 : Write,
1453 : }
1454 :
1455 : impl StorageIoSizeOperation {
1456 : pub(crate) const VARIANTS: &'static [&'static str] = &["read", "write"];
1457 :
1458 752 : fn as_str(&self) -> &'static str {
1459 752 : Self::VARIANTS[*self as usize]
1460 752 : }
1461 : }
1462 :
1463 : // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
1464 142 : pub(crate) static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
1465 142 : register_uint_gauge_vec!(
1466 142 : "pageserver_io_operations_bytes_total",
1467 142 : "Total amount of bytes read/written in IO operations",
1468 142 : &["operation", "tenant_id", "shard_id", "timeline_id"]
1469 142 : )
1470 142 : .expect("failed to define a metric")
1471 142 : });
1472 :
1473 : #[derive(Clone, Debug)]
1474 : pub(crate) struct StorageIoSizeMetrics {
1475 : pub read: UIntGauge,
1476 : pub write: UIntGauge,
1477 : }
1478 :
1479 : impl StorageIoSizeMetrics {
1480 376 : pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
1481 376 : let read = STORAGE_IO_SIZE
1482 376 : .get_metric_with_label_values(&[
1483 376 : StorageIoSizeOperation::Read.as_str(),
1484 376 : tenant_id,
1485 376 : shard_id,
1486 376 : timeline_id,
1487 376 : ])
1488 376 : .unwrap();
1489 376 : let write = STORAGE_IO_SIZE
1490 376 : .get_metric_with_label_values(&[
1491 376 : StorageIoSizeOperation::Write.as_str(),
1492 376 : tenant_id,
1493 376 : shard_id,
1494 376 : timeline_id,
1495 376 : ])
1496 376 : .unwrap();
1497 376 : Self { read, write }
1498 376 : }
1499 : }
1500 :
1501 : #[cfg(not(test))]
1502 : pub(crate) mod virtual_file_descriptor_cache {
1503 : use super::*;
1504 :
1505 0 : pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
1506 0 : register_uint_gauge!(
1507 0 : "pageserver_virtual_file_descriptor_cache_size_max",
1508 0 : "Maximum number of open file descriptors in the cache."
1509 0 : )
1510 0 : .unwrap()
1511 0 : });
1512 :
1513 : // SIZE_CURRENT: derive it like so:
1514 : // ```
1515 : // sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$")
1516 : // -ignoring(operation)
1517 : // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"}
1518 : // ```
1519 : }
1520 :
1521 : #[cfg(not(test))]
1522 : pub(crate) mod virtual_file_io_engine {
1523 : use super::*;
1524 :
1525 0 : pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
1526 0 : register_uint_gauge_vec!(
1527 0 : "pageserver_virtual_file_io_engine_kind",
1528 0 : "The configured io engine for VirtualFile",
1529 0 : &["kind"],
1530 0 : )
1531 0 : .unwrap()
1532 0 : });
1533 : }
1534 :
1535 : pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
1536 : pub(crate) struct SmgrOpTimerInner {
1537 : global_execution_latency_histo: Histogram,
1538 : per_timeline_execution_latency_histo: Option<Histogram>,
1539 :
1540 : global_batch_wait_time: Histogram,
1541 : per_timeline_batch_wait_time: Histogram,
1542 :
1543 : global_flush_in_progress_micros: IntCounter,
1544 : per_timeline_flush_in_progress_micros: IntCounter,
1545 :
1546 : throttling: Arc<tenant_throttling::Pagestream>,
1547 :
1548 : timings: SmgrOpTimerState,
1549 : }
1550 :
1551 : /// The stages of request processing are represented by the enum variants.
1552 : /// Used as part of [`SmgrOpTimerInner::timings`].
1553 : ///
1554 : /// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
1555 : /// transition points.
1556 : /// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
1557 : /// to the next state.
1558 : ///
1559 : /// Each request goes through every stage, in all configurations.
1560 : ///
1561 : #[derive(Debug)]
1562 : enum SmgrOpTimerState {
1563 : Received {
1564 : // In the future, we may want to track the full time the request spent
1565 : // inside pageserver process (time spent in kernel buffers can't be tracked).
1566 : // `received_at` would be used for that.
1567 : #[allow(dead_code)]
1568 : received_at: Instant,
1569 : },
1570 : Throttling {
1571 : throttle_started_at: Instant,
1572 : },
1573 : Batching {
1574 : throttle_done_at: Instant,
1575 : },
1576 : Executing {
1577 : execution_started_at: Instant,
1578 : },
1579 : Flushing,
1580 : // NB: when adding observation points, remember to update the Drop impl.
1581 : }
1582 :
1583 : // NB: when adding observation points, remember to update the Drop impl.
1584 : impl SmgrOpTimer {
1585 : /// See [`SmgrOpTimerState`] for more context.
1586 0 : pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
1587 0 : let Some(inner) = self.0.as_mut() else {
1588 0 : return;
1589 : };
1590 0 : let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
1591 0 : return;
1592 : };
1593 0 : inner.throttling.count_accounted_start.inc();
1594 0 : inner.timings = SmgrOpTimerState::Throttling {
1595 0 : throttle_started_at: at,
1596 0 : };
1597 0 : }
1598 :
1599 : /// See [`SmgrOpTimerState`] for more context.
1600 0 : pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
1601 0 : let Some(inner) = self.0.as_mut() else {
1602 0 : return;
1603 : };
1604 : let SmgrOpTimerState::Throttling {
1605 0 : throttle_started_at,
1606 0 : } = &inner.timings
1607 : else {
1608 0 : return;
1609 : };
1610 0 : inner.throttling.count_accounted_finish.inc();
1611 0 : match throttle {
1612 0 : ThrottleResult::NotThrottled { end } => {
1613 0 : inner.timings = SmgrOpTimerState::Batching {
1614 0 : throttle_done_at: end,
1615 0 : };
1616 0 : }
1617 0 : ThrottleResult::Throttled { end } => {
1618 0 : // update metrics
1619 0 : inner.throttling.count_throttled.inc();
1620 0 : inner
1621 0 : .throttling
1622 0 : .wait_time
1623 0 : .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
1624 0 : // state transition
1625 0 : inner.timings = SmgrOpTimerState::Batching {
1626 0 : throttle_done_at: end,
1627 0 : };
1628 0 : }
1629 : }
1630 0 : }
1631 :
1632 : /// See [`SmgrOpTimerState`] for more context.
1633 0 : pub(crate) fn observe_execution_start(&mut self, at: Instant) {
1634 0 : let Some(inner) = self.0.as_mut() else {
1635 0 : return;
1636 : };
1637 0 : let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
1638 0 : return;
1639 : };
1640 : // update metrics
1641 0 : let batch = at - *throttle_done_at;
1642 0 : inner.global_batch_wait_time.observe(batch.as_secs_f64());
1643 0 : inner
1644 0 : .per_timeline_batch_wait_time
1645 0 : .observe(batch.as_secs_f64());
1646 0 : // state transition
1647 0 : inner.timings = SmgrOpTimerState::Executing {
1648 0 : execution_started_at: at,
1649 0 : }
1650 0 : }
1651 :
1652 : /// For all but the first caller, this is a no-op.
1653 : /// The first caller receives Some; subsequent ones receive None.
1654 : ///
1655 : /// See [`SmgrOpTimerState`] for more context.
1656 0 : pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option<SmgrOpFlushInProgress> {
1657 : // NB: unlike the other observe_* methods, this one take()s.
1658 : #[allow(clippy::question_mark)] // maintain similar code pattern.
1659 0 : let Some(mut inner) = self.0.take() else {
1660 0 : return None;
1661 : };
1662 : let SmgrOpTimerState::Executing {
1663 0 : execution_started_at,
1664 0 : } = &inner.timings
1665 : else {
1666 0 : return None;
1667 : };
1668 : // update metrics
1669 0 : let execution = at - *execution_started_at;
1670 0 : inner
1671 0 : .global_execution_latency_histo
1672 0 : .observe(execution.as_secs_f64());
1673 0 : if let Some(per_timeline_execution_latency_histo) =
1674 0 : &inner.per_timeline_execution_latency_histo
1675 0 : {
1676 0 : per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
1677 0 : }
1678 :
1679 : // state transition
1680 0 : inner.timings = SmgrOpTimerState::Flushing;
1681 0 :
1682 0 : // return the flush in progress object which
1683 0 : // will do the remaining metrics updates
1684 0 : let SmgrOpTimerInner {
1685 0 : global_flush_in_progress_micros,
1686 0 : per_timeline_flush_in_progress_micros,
1687 0 : ..
1688 0 : } = inner;
1689 0 : Some(SmgrOpFlushInProgress {
1690 0 : global_micros: global_flush_in_progress_micros,
1691 0 : per_timeline_micros: per_timeline_flush_in_progress_micros,
1692 0 : })
1693 0 : }
1694 : }
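
// Sketch of the full observation sequence for one request (`metrics` is a
// hypothetical `SmgrQueryTimePerTimeline` handle; the call site is an
// assumption, not from this module). Calling a method out of order is
// harmless: each observe_* is a no-op unless `timings` is in the expected
// predecessor state.
//
//     let mut timer = metrics.start_smgr_op(SmgrQueryType::GetPageAtLsn, Instant::now());
//     timer.observe_throttle_start(Instant::now());
//     timer.observe_throttle_done(ThrottleResult::NotThrottled { end: Instant::now() });
//     timer.observe_execution_start(Instant::now());
//     let flush_timer = timer.observe_execution_end(Instant::now()); // Some(...) exactly once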
1695 :
1696 : /// The last stage of request processing is serializing and flushing the request
1697 : /// into the TCP connection. We want to make slow flushes observable
1698 : /// _while they are occurring_, so this struct provides a wrapper method [`Self::measure`]
1699 : /// to periodically bump the metric.
1700 : ///
1701 : /// If in the future we decide that we're not interested in live updates, we can
1702 : /// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
1703 : /// and remove this struct from the code base.
1704 : pub(crate) struct SmgrOpFlushInProgress {
1705 : global_micros: IntCounter,
1706 : per_timeline_micros: IntCounter,
1707 : }
1708 :
1709 : impl Drop for SmgrOpTimer {
1710 0 : fn drop(&mut self) {
1711 0 : // In case of early drop, update any of the remaining metrics with
1712 0 : // observations so that (started,finished) counter pairs balance out
1713 0 : // and all counters on the latency path have the same number of
1714 0 : // observations.
1715 0 : // It's technically lying and it would be better if each metric had
1716 0 : // a separate label or similar for cancelled requests.
1717 0 : // But we don't have that right now and counter pairs balancing
1718 0 : // out is useful when using the metrics in panels and whatnot.
1719 0 : let now = Instant::now();
1720 0 : self.observe_throttle_start(now);
1721 0 : self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
1722 0 : self.observe_execution_start(now);
1723 0 : let maybe_flush_timer = self.observe_execution_end(now);
1724 0 : drop(maybe_flush_timer);
1725 0 : }
1726 : }
1727 :
1728 : impl SmgrOpFlushInProgress {
1729 : /// The caller must guarantee that `socket_fd`` outlives this function.
1730 0 : pub(crate) async fn measure<Fut, O>(
1731 0 : self,
1732 0 : started_at: Instant,
1733 0 : mut fut: Fut,
1734 0 : socket_fd: RawFd,
1735 0 : ) -> O
1736 0 : where
1737 0 : Fut: std::future::Future<Output = O>,
1738 0 : {
1739 0 : let mut fut = std::pin::pin!(fut);
1740 0 :
1741 0 : let mut logged = false;
1742 0 : let mut last_counter_increment_at = started_at;
1743 0 : let mut observe_guard = scopeguard::guard(
1744 0 : |is_timeout| {
1745 0 : let now = Instant::now();
1746 0 :
1747 0 : // Increment counter
1748 0 : {
1749 0 : let elapsed_since_last_observe = now - last_counter_increment_at;
1750 0 : self.global_micros
1751 0 : .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
1752 0 : self.per_timeline_micros
1753 0 : .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
1754 0 : last_counter_increment_at = now;
1755 0 : }
1756 0 :
1757 0 : // Log something on every timeout, and on completion but only if we hit a timeout.
1758 0 : if is_timeout || logged {
1759 0 : logged = true;
1760 0 : let elapsed_total = now - started_at;
1761 0 : let msg = if is_timeout {
1762 0 : "slow flush ongoing"
1763 : } else {
1764 0 : "slow flush completed or cancelled"
1765 : };
1766 :
1767 0 : let (inq, outq) = {
1768 0 : // SAFETY: caller guarantees that `socket_fd` outlives this function.
1769 0 : #[cfg(target_os = "linux")]
1770 0 : unsafe {
1771 0 : (
1772 0 : utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2),
1773 0 : utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2),
1774 0 : )
1775 0 : }
1776 0 : #[cfg(not(target_os = "linux"))]
1777 0 : {
1778 0 : _ = socket_fd; // appease unused lint on macOS
1779 0 : (-1, -1)
1780 0 : }
1781 0 : };
1782 0 :
1783 0 : let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
1784 0 : tracing::info!(elapsed_total_secs, inq, outq, msg);
1785 0 : }
1786 0 : },
1787 0 : |mut observe| {
1788 0 : observe(false);
1789 0 : },
1790 0 : );
1791 :
1792 : loop {
1793 0 : match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
1794 0 : Ok(v) => return v,
1795 0 : Err(_timeout) => {
1796 0 : (*observe_guard)(true);
1797 0 : }
1798 : }
1799 : }
1800 0 : }
1801 : }
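
// Usage sketch (illustration; `pgb` and its `flush()` are hypothetical
// stand-ins for the connection handle): after `observe_execution_end`, the
// response flush is wrapped in `measure` so a flush stalling past the 10s
// timeout keeps incrementing the in-progress counters while it runs.
//
//     if let Some(flush_timer) = timer.observe_execution_end(Instant::now()) {
//         let res = flush_timer
//             .measure(Instant::now(), pgb.flush(), socket_fd)
//             .await;
//     }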
1802 :
1803 : #[derive(
1804 : Debug,
1805 : Clone,
1806 : Copy,
1807 : IntoStaticStr,
1808 : strum_macros::EnumCount,
1809 0 : strum_macros::EnumIter,
1810 : strum_macros::FromRepr,
1811 : enum_map::Enum,
1812 : )]
1813 : #[strum(serialize_all = "snake_case")]
1814 : pub enum SmgrQueryType {
1815 : GetRelExists,
1816 : GetRelSize,
1817 : GetPageAtLsn,
1818 : GetDbSize,
1819 : GetSlruSegment,
1820 : #[cfg(feature = "testing")]
1821 : Test,
1822 : }
1823 :
1824 : #[derive(
1825 : Debug,
1826 : Clone,
1827 : Copy,
1828 : IntoStaticStr,
1829 : strum_macros::EnumCount,
1830 45 : strum_macros::EnumIter,
1831 : strum_macros::FromRepr,
1832 : enum_map::Enum,
1833 : )]
1834 : #[strum(serialize_all = "snake_case")]
1835 : pub enum GetPageBatchBreakReason {
1836 : BatchFull,
1837 : NonBatchableRequest,
1838 : NonUniformLsn,
1839 : SamePageAtDifferentLsn,
1840 : NonUniformTimeline,
1841 : ExecutorSteal,
1842 : #[cfg(feature = "testing")]
1843 : NonUniformKey,
1844 : }
1845 :
1846 : pub(crate) struct SmgrQueryTimePerTimeline {
1847 : global_started: [IntCounter; SmgrQueryType::COUNT],
1848 : global_latency: [Histogram; SmgrQueryType::COUNT],
1849 : per_timeline_getpage_started: IntCounter,
1850 : per_timeline_getpage_latency: Histogram,
1851 : global_batch_size: Histogram,
1852 : per_timeline_batch_size: Histogram,
1853 : global_flush_in_progress_micros: IntCounter,
1854 : per_timeline_flush_in_progress_micros: IntCounter,
1855 : global_batch_wait_time: Histogram,
1856 : per_timeline_batch_wait_time: Histogram,
1857 : global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
1858 : per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
1859 : throttling: Arc<tenant_throttling::Pagestream>,
1860 : }
1861 :
1862 108 : static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
1863 108 : register_int_counter_vec!(
1864 108 : // it's a counter, but, name is prepared to extend it to a histogram of queue depth
1865 108 : "pageserver_smgr_query_started_global_count",
1866 108 : "Number of smgr queries started, aggregated by query type.",
1867 108 : &["smgr_query_type"],
1868 108 : )
1869 108 : .expect("failed to define a metric")
1870 108 : });
1871 :
1872 108 : static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
1873 108 : register_int_counter_vec!(
1874 108 : // it's a counter, but the name is prepared so it can be extended into a histogram of queue depth
1875 108 : "pageserver_smgr_query_started_count",
1876 108 : "Number of smgr queries started, aggregated by query type and tenant/timeline.",
1877 108 : &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
1878 108 : )
1879 108 : .expect("failed to define a metric")
1880 108 : });
1881 :
1882 : /// Per-timeline smgr histogram buckets should be the same as the compute buckets, such that the
1883 : /// metrics are comparable across compute and Pageserver. See also:
1884 : /// <https://github.com/neondatabase/neon/blob/1a87975d956a8ad17ec8b85da32a137ec4893fcc/pgxn/neon/neon_perf_counters.h#L18-L27>
1885 : /// <https://github.com/neondatabase/flux-fleet/blob/556182a939edda87ff1d85a6b02e5cec901e0e9e/apps/base/compute-metrics/scrape-compute-sql-exporter.yaml#L29-L35>
1886 : static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] =
1887 : &[0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.1, 1.0, 3.0];
1888 :
1889 108 : static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
1890 108 : register_histogram_vec!(
1891 108 : "pageserver_smgr_query_seconds",
1892 108 : "Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
1893 108 : &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
1894 108 : SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
1895 108 : )
1896 108 : .expect("failed to define a metric")
1897 108 : });
1898 :
1899 108 : static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
1900 108 : [
1901 108 : 1,
1902 108 : 10,
1903 108 : 20,
1904 108 : 40,
1905 108 : 60,
1906 108 : 80,
1907 108 : 100,
1908 108 : 200,
1909 108 : 300,
1910 108 : 400,
1911 108 : 500,
1912 108 : 600,
1913 108 : 700,
1914 108 : 800,
1915 108 : 900,
1916 108 : 1_000, // 1ms
1917 108 : 2_000,
1918 108 : 4_000,
1919 108 : 6_000,
1920 108 : 8_000,
1921 108 : 10_000, // 10ms
1922 108 : 20_000,
1923 108 : 40_000,
1924 108 : 60_000,
1925 108 : 80_000,
1926 108 : 100_000,
1927 108 : 200_000,
1928 108 : 400_000,
1929 108 : 600_000,
1930 108 : 800_000,
1931 108 : 1_000_000, // 1s
1932 108 : 2_000_000,
1933 108 : 4_000_000,
1934 108 : 6_000_000,
1935 108 : 8_000_000,
1936 108 : 10_000_000, // 10s
1937 108 : 20_000_000,
1938 108 : 50_000_000,
1939 108 : 100_000_000,
1940 108 : 200_000_000,
1941 108 : 1_000_000_000, // 1000s
1942 108 : ]
1943 108 : .into_iter()
1944 108 : .map(Duration::from_micros)
1945 4428 : .map(|d| d.as_secs_f64())
1946 108 : .collect()
1947 108 : });
1948 :
1949 108 : static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
1950 108 : register_histogram_vec!(
1951 108 : "pageserver_smgr_query_seconds_global",
1952 108 : "Like pageserver_smgr_query_seconds, but aggregated to instance level.",
1953 108 : &["smgr_query_type"],
1954 108 : SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
1955 108 : )
1956 108 : .expect("failed to define a metric")
1957 108 : });
1958 :
1959 108 : static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy<Vec<f64>> = Lazy::new(|| {
1960 108 : (1..=u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap())
1961 3456 : .map(|v| v.into())
1962 108 : .collect()
1963 108 : });
1964 :
1965 108 : static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
1966 108 : register_histogram!(
1967 108 : "pageserver_page_service_batch_size_global",
1968 108 : "Batch size of pageserver page service requests",
1969 108 : PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(),
1970 108 : )
1971 108 : .expect("failed to define a metric")
1972 108 : });
1973 :
1974 108 : static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy<Vec<f64>> = Lazy::new(|| {
1975 108 : let mut buckets = Vec::new();
1976 756 : for i in 0.. {
1977 756 : let bucket = 1 << i;
1978 756 : if bucket > u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap() {
1979 108 : break;
1980 648 : }
1981 648 : buckets.push(bucket.into());
1982 : }
1983 108 : buckets
1984 108 : });
1985 :
1986 108 : static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
1987 108 : register_histogram_vec!(
1988 108 : "pageserver_page_service_batch_size",
1989 108 : "Batch size of pageserver page service requests",
1990 108 : &["tenant_id", "shard_id", "timeline_id"],
1991 108 : PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone()
1992 108 : )
1993 108 : .expect("failed to define a metric")
1994 108 : });
1995 :
1996 108 : static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
1997 108 : register_int_counter_vec!(
1999 108 : "pageserver_page_service_batch_break_reason_global",
2000 108 : "Reason for breaking batches of get page requests",
2001 108 : &["reason"],
2002 108 : )
2003 108 : .expect("failed to define a metric")
2004 108 : });
2005 :
2006 : struct GetPageBatchBreakReasonTimelineMetrics {
2007 : map: EnumMap<GetPageBatchBreakReason, IntCounter>,
2008 : }
2009 :
2010 : impl GetPageBatchBreakReasonTimelineMetrics {
2011 234 : fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
2012 234 : GetPageBatchBreakReasonTimelineMetrics {
2013 1638 : map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
2014 1638 : let reason = GetPageBatchBreakReason::from_usize(reason_idx);
2015 1638 : PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
2016 1638 : tenant_id,
2017 1638 : shard_slug,
2018 1638 : timeline_id,
2019 1638 : reason.into(),
2020 1638 : ])
2021 1638 : })),
2022 234 : }
2023 234 : }
2024 :
2025 0 : fn inc(&self, reason: GetPageBatchBreakReason) {
2026 0 : self.map[reason].inc()
2027 0 : }
2028 : }
2029 :
2030 108 : static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
2031 108 : register_int_counter_vec!(
2032 108 : "pageserver_page_service_batch_break_reason",
2033 108 : "Reason for breaking batches of get page requests",
2034 108 : &["tenant_id", "shard_id", "timeline_id", "reason"],
2035 108 : )
2036 108 : .expect("failed to define a metric")
2037 108 : });
2038 :
2039 0 : pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
2040 0 : register_int_gauge_vec!(
2041 0 : "pageserver_page_service_config_max_batch_size",
2042 0 : "Configured maximum batch size for the server-side batching functionality of page_service. \
2043 0 : Labels expose more of the configuration parameters.",
2044 0 : &["mode", "execution", "batching"]
2045 0 : )
2046 0 : .expect("failed to define a metric")
2047 0 : });
2048 :
2049 0 : fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
2050 0 : PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
2051 0 : let (label_values, value) = match conf {
2052 0 : PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
2053 : PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
2054 0 : max_batch_size,
2055 0 : execution,
2056 0 : batching,
2057 0 : }) => {
2058 0 : let mode = "pipelined";
2059 0 : let execution = match execution {
2060 : PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
2061 0 : "concurrent-futures"
2062 : }
2063 0 : PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
2064 : };
2065 0 : let batching = match batching {
2066 0 : PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
2067 0 : PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
2068 : };
2069 :
2070 0 : ([mode, execution, batching], max_batch_size.get())
2071 : }
2072 : };
2073 0 : PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
2074 0 : .with_label_values(&label_values)
2075 0 : .set(value.try_into().unwrap());
2076 0 : }
2077 :
2078 108 : static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
2079 108 : register_int_counter_vec!(
2080 108 : "pageserver_page_service_pagestream_flush_in_progress_micros",
2081 108 : "Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
2082 108 : If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
2083 108 : easily discoverable in monitoring. \
2084 108 : Hence, this is NOT a completion latency histogram.",
2085 108 : &["tenant_id", "shard_id", "timeline_id"],
2086 108 : )
2087 108 : .expect("failed to define a metric")
2088 108 : });
2089 :
2090 108 : static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
2091 108 : register_int_counter!(
2092 108 : "pageserver_page_service_pagestream_flush_in_progress_micros_global",
2093 108 : "Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
2094 108 : )
2095 108 : .expect("failed to define a metric")
2096 108 : });
2097 :
2098 108 : static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
2099 108 : register_histogram_vec!(
2100 108 : "pageserver_page_service_pagestream_batch_wait_time_seconds",
2101 108 : "Time a request spent waiting in its batch until the batch moved to throttle&execution.",
2102 108 : &["tenant_id", "shard_id", "timeline_id"],
2103 108 : SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
2104 108 : )
2105 108 : .expect("failed to define a metric")
2106 108 : });
2107 :
2108 108 : static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
2109 108 : register_histogram!(
2110 108 : "pageserver_page_service_pagestream_batch_wait_time_seconds_global",
2111 108 : "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
2112 108 : SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
2113 108 : )
2114 108 : .expect("failed to define a metric")
2115 108 : });
2116 :
2117 : impl SmgrQueryTimePerTimeline {
2118 234 : pub(crate) fn new(
2119 234 : tenant_shard_id: &TenantShardId,
2120 234 : timeline_id: &TimelineId,
2121 234 : pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
2122 234 : ) -> Self {
2123 234 : let tenant_id = tenant_shard_id.tenant_id.to_string();
2124 234 : let shard_slug = format!("{}", tenant_shard_id.shard_slug());
2125 234 : let timeline_id = timeline_id.to_string();
2126 1404 : let global_started = std::array::from_fn(|i| {
2127 1404 : let op = SmgrQueryType::from_repr(i).unwrap();
2128 1404 : SMGR_QUERY_STARTED_GLOBAL
2129 1404 : .get_metric_with_label_values(&[op.into()])
2130 1404 : .unwrap()
2131 1404 : });
2132 1404 : let global_latency = std::array::from_fn(|i| {
2133 1404 : let op = SmgrQueryType::from_repr(i).unwrap();
2134 1404 : SMGR_QUERY_TIME_GLOBAL
2135 1404 : .get_metric_with_label_values(&[op.into()])
2136 1404 : .unwrap()
2137 1404 : });
2138 234 :
2139 234 : let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
2140 234 : .get_metric_with_label_values(&[
2141 234 : SmgrQueryType::GetPageAtLsn.into(),
2142 234 : &tenant_id,
2143 234 : &shard_slug,
2144 234 : &timeline_id,
2145 234 : ])
2146 234 : .unwrap();
2147 234 : let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
2148 234 : .get_metric_with_label_values(&[
2149 234 : SmgrQueryType::GetPageAtLsn.into(),
2150 234 : &tenant_id,
2151 234 : &shard_slug,
2152 234 : &timeline_id,
2153 234 : ])
2154 234 : .unwrap();
2155 234 :
2156 234 : let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone();
2157 234 : let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE
2158 234 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
2159 234 : .unwrap();
2160 234 :
2161 234 : let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
2162 234 : let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
2163 234 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
2164 234 : .unwrap();
2165 234 :
2166 1638 : let global_batch_break_reason = std::array::from_fn(|i| {
2167 1638 : let reason = GetPageBatchBreakReason::from_usize(i);
2168 1638 : PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL
2169 1638 : .get_metric_with_label_values(&[reason.into()])
2170 1638 : .unwrap()
2171 1638 : });
2172 234 : let per_timeline_batch_break_reason =
2173 234 : GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id);
2174 234 :
2175 234 : let global_flush_in_progress_micros =
2176 234 : PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
2177 234 : let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
2178 234 : .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
2179 234 : .unwrap();
2180 234 :
2181 234 : Self {
2182 234 : global_started,
2183 234 : global_latency,
2184 234 : per_timeline_getpage_latency,
2185 234 : per_timeline_getpage_started,
2186 234 : global_batch_size,
2187 234 : per_timeline_batch_size,
2188 234 : global_flush_in_progress_micros,
2189 234 : per_timeline_flush_in_progress_micros,
2190 234 : global_batch_wait_time,
2191 234 : per_timeline_batch_wait_time,
2192 234 : global_batch_break_reason,
2193 234 : per_timeline_batch_break_reason,
2194 234 : throttling: pagestream_throttle_metrics,
2195 234 : }
2196 234 : }
2197 0 : pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
2198 0 : self.global_started[op as usize].inc();
2199 :
2200 0 : let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
2201 0 : self.per_timeline_getpage_started.inc();
2202 0 : Some(self.per_timeline_getpage_latency.clone())
2203 : } else {
2204 0 : None
2205 : };
2206 :
2207 0 : SmgrOpTimer(Some(SmgrOpTimerInner {
2208 0 : global_execution_latency_histo: self.global_latency[op as usize].clone(),
2209 0 : per_timeline_execution_latency_histo: per_timeline_latency_histo,
2210 0 : global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
2211 0 : per_timeline_flush_in_progress_micros: self
2212 0 : .per_timeline_flush_in_progress_micros
2213 0 : .clone(),
2214 0 : global_batch_wait_time: self.global_batch_wait_time.clone(),
2215 0 : per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
2216 0 : throttling: self.throttling.clone(),
2217 0 : timings: SmgrOpTimerState::Received { received_at },
2218 0 : }))
2219 0 : }
2220 :
2221 : /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
2222 0 : pub(crate) fn observe_getpage_batch_start(
2223 0 : &self,
2224 0 : batch_size: usize,
2225 0 : break_reason: GetPageBatchBreakReason,
2226 0 : ) {
2227 0 : self.global_batch_size.observe(batch_size as f64);
2228 0 : self.per_timeline_batch_size.observe(batch_size as f64);
2229 0 :
2230 0 : self.global_batch_break_reason[break_reason.into_usize()].inc();
2231 0 : self.per_timeline_batch_break_reason.inc(break_reason);
2232 0 : }
2233 : }
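
// Usage sketch (illustration only): how a batcher might account a batch
// that was closed because it reached the maximum size.
//
//     timeline_metrics.observe_getpage_batch_start(32, GetPageBatchBreakReason::BatchFull);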
2234 :
2235 : // Keep in sync with the control plane Go code so that we can validate compute's
2236 : // basebackup_ms metric against the pageserver's perspective for SLI/SLO purposes.
2237 0 : static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
2238 0 : // Go code uses milliseconds. Variable is called `computeStartupBuckets`
2239 0 : [
2240 0 : 5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
2241 0 : 1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
2242 0 : ]
2243 0 : .map(|ms| (ms as f64) / 1000.0)
2244 0 : });
2245 :
2246 : pub(crate) struct BasebackupQueryTime {
2247 : ok: Histogram,
2248 : error: Histogram,
2249 : client_error: Histogram,
2250 : }
2251 :
2252 0 : pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
2253 0 : let vec = register_histogram_vec!(
2254 0 : "pageserver_basebackup_query_seconds",
2255 0 : "Histogram of basebackup queries durations, by result type",
2256 0 : &["result"],
2257 0 : COMPUTE_STARTUP_BUCKETS.to_vec(),
2258 0 : )
2259 0 : .expect("failed to define a metric");
2260 0 : BasebackupQueryTime {
2261 0 : ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
2262 0 : error: vec.get_metric_with_label_values(&["error"]).unwrap(),
2263 0 : client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(),
2264 0 : }
2265 0 : });
2266 :
2267 : pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> {
2268 : parent: &'a BasebackupQueryTime,
2269 : start: std::time::Instant,
2270 : }
2271 :
2272 : impl BasebackupQueryTime {
2273 0 : pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> {
2274 0 : let start = Instant::now();
2275 0 : BasebackupQueryTimeOngoingRecording {
2276 0 : parent: self,
2277 0 : start,
2278 0 : }
2279 0 : }
2280 : }
2281 :
2282 : impl BasebackupQueryTimeOngoingRecording<'_> {
2283 0 : pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
2284 0 : let elapsed = self.start.elapsed().as_secs_f64();
2285 : // If you want to change categorize of a specific error, also change it in `log_query_error`.
2286 0 : let metric = match res {
2287 0 : Ok(_) => &self.parent.ok,
2288 : Err(QueryError::Shutdown) | Err(QueryError::Reconnect) => {
2289 : // Do not observe ok/err for shutdown/reconnect.
2290 : // Reconnect error might be raised when the operation is waiting for LSN and the tenant shutdown interrupts
2291 : // the operation. A reconnect error will be issued and the client will retry.
2292 0 : return;
2293 : }
2294 0 : Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
2295 0 : if is_expected_io_error(io_error) =>
2296 0 : {
2297 0 : &self.parent.client_error
2298 : }
2299 0 : Err(_) => &self.parent.error,
2300 : };
2301 0 : metric.observe(elapsed);
2302 0 : }
2303 : }
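
// Usage sketch: the intended record-then-observe pattern around a basebackup
// request (`run_basebackup` is a hypothetical stand-in, not a real function).
//
//     let recording = BASEBACKUP_QUERY_TIME.start_recording();
//     let res: Result<(), QueryError> = run_basebackup().await;
//     recording.observe(&res);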
2304 :
2305 0 : pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
2306 0 : register_int_counter_pair_vec!(
2307 0 : "pageserver_live_connections_started",
2308 0 : "Number of network connections that we started handling",
2309 0 : "pageserver_live_connections_finished",
2310 0 : "Number of network connections that we finished handling",
2311 0 : &["pageserver_connection_kind"]
2312 0 : )
2313 0 : .expect("failed to define a metric")
2314 0 : });
2315 :
2316 : #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
2317 : pub(crate) enum ComputeCommandKind {
2318 : PageStreamV3,
2319 : PageStreamV2,
2320 : Basebackup,
2321 : Fullbackup,
2322 : LeaseLsn,
2323 : }
2324 :
2325 : pub(crate) struct ComputeCommandCounters {
2326 : map: EnumMap<ComputeCommandKind, IntCounter>,
2327 : }
2328 :
2329 0 : pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy::new(|| {
2330 0 : let inner = register_int_counter_vec!(
2331 0 : "pageserver_compute_commands",
2332 0 : "Number of compute -> pageserver commands processed",
2333 0 : &["command"]
2334 0 : )
2335 0 : .expect("failed to define a metric");
2336 0 :
2337 0 : ComputeCommandCounters {
2338 0 : map: EnumMap::from_array(std::array::from_fn(|i| {
2339 0 : let command = ComputeCommandKind::from_usize(i);
2340 0 : let command_str: &'static str = command.into();
2341 0 : inner.with_label_values(&[command_str])
2342 0 : })),
2343 0 : }
2344 0 : });
2345 :
2346 : impl ComputeCommandCounters {
2347 0 : pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter {
2348 0 : &self.map[command]
2349 0 : }
2350 : }
2351 :
2352 : // remote storage metrics
2353 :
2354 106 : static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
2355 106 : register_int_counter_pair_vec!(
2356 106 : "pageserver_remote_timeline_client_calls_started",
2357 106 : "Number of started calls to remote timeline client.",
2358 106 : "pageserver_remote_timeline_client_calls_finished",
2359 106 : "Number of finshed calls to remote timeline client.",
2360 106 : &[
2361 106 : "tenant_id",
2362 106 : "shard_id",
2363 106 : "timeline_id",
2364 106 : "file_kind",
2365 106 : "op_kind"
2366 106 : ],
2367 106 : )
2368 106 : .unwrap()
2369 106 : });
2370 :
2371 : static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
2372 105 : Lazy::new(|| {
2373 105 : register_int_counter_vec!(
2374 105 : "pageserver_remote_timeline_client_bytes_started",
2375 105 : "Incremented by the number of bytes associated with a remote timeline client operation. \
2376 105 : The increment happens when the operation is scheduled.",
2377 105 : &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
2378 105 : )
2379 105 : .expect("failed to define a metric")
2380 105 : });
2381 :
2382 105 : static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
2383 105 : register_int_counter_vec!(
2384 105 : "pageserver_remote_timeline_client_bytes_finished",
2385 105 : "Incremented by the number of bytes associated with a remote timeline client operation. \
2386 105 : The increment happens when the operation finishes (regardless of success/failure/shutdown).",
2387 105 : &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
2388 105 : )
2389 105 : .expect("failed to define a metric")
2390 105 : });
2391 :
2392 : pub(crate) struct TenantManagerMetrics {
2393 : tenant_slots_attached: UIntGauge,
2394 : tenant_slots_secondary: UIntGauge,
2395 : tenant_slots_inprogress: UIntGauge,
2396 : pub(crate) tenant_slot_writes: IntCounter,
2397 : pub(crate) unexpected_errors: IntCounter,
2398 : }
2399 :
2400 : impl TenantManagerMetrics {
2401 : /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects
2402 : /// exactly: they track the lifetime of the slots _in the tenant map_.
2403 1 : pub(crate) fn slot_inserted(&self, slot: &TenantSlot) {
2404 1 : match slot {
2405 0 : TenantSlot::Attached(_) => {
2406 0 : self.tenant_slots_attached.inc();
2407 0 : }
2408 0 : TenantSlot::Secondary(_) => {
2409 0 : self.tenant_slots_secondary.inc();
2410 0 : }
2411 1 : TenantSlot::InProgress(_) => {
2412 1 : self.tenant_slots_inprogress.inc();
2413 1 : }
2414 : }
2415 1 : }
2416 :
2417 1 : pub(crate) fn slot_removed(&self, slot: &TenantSlot) {
2418 1 : match slot {
2419 1 : TenantSlot::Attached(_) => {
2420 1 : self.tenant_slots_attached.dec();
2421 1 : }
2422 0 : TenantSlot::Secondary(_) => {
2423 0 : self.tenant_slots_secondary.dec();
2424 0 : }
2425 0 : TenantSlot::InProgress(_) => {
2426 0 : self.tenant_slots_inprogress.dec();
2427 0 : }
2428 : }
2429 1 : }
2430 :
2431 : #[cfg(all(debug_assertions, not(test)))]
2432 0 : pub(crate) fn slots_total(&self) -> u64 {
2433 0 : self.tenant_slots_attached.get()
2434 0 : + self.tenant_slots_secondary.get()
2435 0 : + self.tenant_slots_inprogress.get()
2436 0 : }
2437 : }
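
// Sketch (illustration; the slot values are hypothetical): map-mutation sites
// are expected to pair these calls so that the three gauges always sum to the
// number of slots currently in the tenant map.
//
//     TENANT_MANAGER.slot_removed(&old_slot);  // e.g. an InProgress placeholder
//     TENANT_MANAGER.slot_inserted(&new_slot); // e.g. the Attached replacement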
2438 :
2439 1 : pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
2440 1 : let tenant_slots = register_uint_gauge_vec!(
2441 1 : "pageserver_tenant_manager_slots",
2442 1 : "How many slots currently exist, including all attached, secondary and in-progress operations",
2443 1 : &["mode"]
2444 1 : )
2445 1 : .expect("failed to define a metric");
2446 1 : TenantManagerMetrics {
2447 1 : tenant_slots_attached: tenant_slots
2448 1 : .get_metric_with_label_values(&["attached"])
2449 1 : .unwrap(),
2450 1 : tenant_slots_secondary: tenant_slots
2451 1 : .get_metric_with_label_values(&["secondary"])
2452 1 : .unwrap(),
2453 1 : tenant_slots_inprogress: tenant_slots
2454 1 : .get_metric_with_label_values(&["inprogress"])
2455 1 : .unwrap(),
2456 1 : tenant_slot_writes: register_int_counter!(
2457 1 : "pageserver_tenant_manager_slot_writes",
2458 1 : "Writes to a tenant slot, including all of create/attach/detach/delete"
2459 1 : )
2460 1 : .expect("failed to define a metric"),
2461 1 : unexpected_errors: register_int_counter!(
2462 1 : "pageserver_tenant_manager_unexpected_errors_total",
2463 1 : "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
2464 1 : )
2465 1 : .expect("failed to define a metric"),
2466 1 : }
2467 1 : });
2468 :
2469 : pub(crate) struct DeletionQueueMetrics {
2470 : pub(crate) keys_submitted: IntCounter,
2471 : pub(crate) keys_dropped: IntCounter,
2472 : pub(crate) keys_executed: IntCounter,
2473 : pub(crate) keys_validated: IntCounter,
2474 : pub(crate) dropped_lsn_updates: IntCounter,
2475 : pub(crate) unexpected_errors: IntCounter,
2476 : pub(crate) remote_errors: IntCounterVec,
2477 : }
2478 19 : pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
2479 19 : DeletionQueueMetrics{
2480 19 :
2481 19 : keys_submitted: register_int_counter!(
2482 19 : "pageserver_deletion_queue_submitted_total",
2483 19 : "Number of objects submitted for deletion"
2484 19 : )
2485 19 : .expect("failed to define a metric"),
2486 19 :
2487 19 : keys_dropped: register_int_counter!(
2488 19 : "pageserver_deletion_queue_dropped_total",
2489 19 : "Number of object deletions dropped due to stale generation."
2490 19 : )
2491 19 : .expect("failed to define a metric"),
2492 19 :
2493 19 : keys_executed: register_int_counter!(
2494 19 : "pageserver_deletion_queue_executed_total",
2495 19 : "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
2496 19 : )
2497 19 : .expect("failed to define a metric"),
2498 19 :
2499 19 : keys_validated: register_int_counter!(
2500 19 : "pageserver_deletion_queue_validated_total",
2501 19 : "Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
2502 19 : )
2503 19 : .expect("failed to define a metric"),
2504 19 :
2505 19 : dropped_lsn_updates: register_int_counter!(
2506 19 : "pageserver_deletion_queue_dropped_lsn_updates_total",
2507 19 : "Updates to remote_consistent_lsn dropped due to stale generation number."
2508 19 : )
2509 19 : .expect("failed to define a metric"),
2510 19 : unexpected_errors: register_int_counter!(
2511 19 : "pageserver_deletion_queue_unexpected_errors_total",
2512 19 : "Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
2513 19 : )
2514 19 : .expect("failed to define a metric"),
2515 19 : remote_errors: register_int_counter_vec!(
2516 19 : "pageserver_deletion_queue_remote_errors_total",
2517 19 : "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
2518 19 : &["op_kind"],
2519 19 : )
2520 19 : .expect("failed to define a metric")
2521 19 : }
2522 19 : });
2523 :
2524 : pub(crate) struct SecondaryModeMetrics {
2525 : pub(crate) upload_heatmap: IntCounter,
2526 : pub(crate) upload_heatmap_errors: IntCounter,
2527 : pub(crate) upload_heatmap_duration: Histogram,
2528 : pub(crate) download_heatmap: IntCounter,
2529 : pub(crate) download_layer: IntCounter,
2530 : }
2531 0 : pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
2532 0 : SecondaryModeMetrics {
2533 0 : upload_heatmap: register_int_counter!(
2534 0 : "pageserver_secondary_upload_heatmap",
2535 0 : "Number of heatmaps written to remote storage by attached tenants"
2536 0 : )
2537 0 : .expect("failed to define a metric"),
2538 0 : upload_heatmap_errors: register_int_counter!(
2539 0 : "pageserver_secondary_upload_heatmap_errors",
2540 0 : "Failures writing heatmap to remote storage"
2541 0 : )
2542 0 : .expect("failed to define a metric"),
2543 0 : upload_heatmap_duration: register_histogram!(
2544 0 : "pageserver_secondary_upload_heatmap_duration",
2545 0 : "Time to build and upload a heatmap, including any waiting inside the remote storage client"
2546 0 : )
2547 0 : .expect("failed to define a metric"),
2548 0 : download_heatmap: register_int_counter!(
2549 0 : "pageserver_secondary_download_heatmap",
2550 0 : "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed"
2551 0 : )
2552 0 : .expect("failed to define a metric"),
2553 0 : download_layer: register_int_counter!(
2554 0 : "pageserver_secondary_download_layer",
2555 0 : "Number of downloads of layers by secondary mode locations"
2556 0 : )
2557 0 : .expect("failed to define a metric"),
2558 0 : }
2559 0 : });
2560 :
2561 0 : pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
2562 0 : register_uint_gauge_vec!(
2563 0 : "pageserver_secondary_resident_physical_size",
2564 0 : "The size of the layer files present in the pageserver's filesystem, for secondary locations.",
2565 0 : &["tenant_id", "shard_id"]
2566 0 : )
2567 0 : .expect("failed to define a metric")
2568 0 : });
2569 :
2570 0 : pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
2571 0 : register_uint_gauge!(
2572 0 : "pageserver_utilization_score",
2573 0 : "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
2574 0 : )
2575 0 : .expect("failed to define a metric")
2576 0 : });
2577 :
2578 0 : pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
2579 0 : register_uint_gauge_vec!(
2580 0 : "pageserver_secondary_heatmap_total_size",
2581 0 : "The total size in bytes of all layers in the most recently downloaded heatmap.",
2582 0 : &["tenant_id", "shard_id"]
2583 0 : )
2584 0 : .expect("failed to define a metric")
2585 0 : });
2586 :
2587 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
2588 : pub enum RemoteOpKind {
2589 : Upload,
2590 : Download,
2591 : Delete,
2592 : }
2593 : impl RemoteOpKind {
2594 8159 : pub fn as_str(&self) -> &'static str {
2595 8159 : match self {
2596 7663 : Self::Upload => "upload",
2597 34 : Self::Download => "download",
2598 462 : Self::Delete => "delete",
2599 : }
2600 8159 : }
2601 : }
2602 :
2603 : #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
2604 : pub enum RemoteOpFileKind {
2605 : Layer,
2606 : Index,
2607 : }
2608 : impl RemoteOpFileKind {
2609 8159 : pub fn as_str(&self) -> &'static str {
2610 8159 : match self {
2611 5810 : Self::Layer => "layer",
2612 2349 : Self::Index => "index",
2613 : }
2614 8159 : }
2615 : }
2616 :
2617 104 : pub(crate) static REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
2618 104 : register_histogram_vec!(
2619 104 : "pageserver_remote_timeline_client_seconds_global",
2620 104 : "Time spent on remote timeline client operations. \
2621 104 : Grouped by task_kind, file_kind, operation_kind and status. \
2622 104 : The task_kind is \
2623 104 : - for layer downloads, populated from RequestContext (primary objective of having the label) \
2624 104 : - for index downloads, set to 'unknown' \
2625 104 : - for any upload operation, set to 'RemoteUploadTask' \
2626 104 : This keeps dimensionality at bay. \
2627 104 : Does not account for time spent waiting in remote timeline client's queues.",
2628 104 : &["task_kind", "file_kind", "op_kind", "status"]
2629 104 : )
2630 104 : .expect("failed to define a metric")
2631 104 : });
2632 :
2633 0 : pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
2634 0 : register_int_counter_vec!(
2635 0 : "pageserver_tenant_task_events",
2636 0 : "Number of task start/stop/fail events.",
2637 0 : &["event"],
2638 0 : )
2639 0 : .expect("Failed to register tenant_task_events metric")
2640 0 : });
2641 :
2642 : pub struct BackgroundLoopSemaphoreMetrics {
2643 : counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
2644 : durations: EnumMap<BackgroundLoopKind, Histogram>,
2645 : waiting_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
2646 : running_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
2647 : }
2648 :
2649 : pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> =
2650 10 : Lazy::new(|| {
2651 10 : let counters = register_int_counter_pair_vec!(
2652 10 : "pageserver_background_loop_semaphore_wait_start_count",
2653 10 : "Counter for background loop concurrency-limiting semaphore acquire calls started",
2654 10 : "pageserver_background_loop_semaphore_wait_finish_count",
2655 10 : "Counter for background loop concurrency-limiting semaphore acquire calls finished",
2656 10 : &["task"],
2657 10 : )
2658 10 : .unwrap();
2659 10 :
2660 10 : let durations = register_histogram_vec!(
2661 10 : "pageserver_background_loop_semaphore_wait_seconds",
2662 10 : "Seconds spent waiting on background loop semaphore acquisition",
2663 10 : &["task"],
2664 10 : vec![0.01, 1.0, 5.0, 10.0, 30.0, 60.0, 180.0, 300.0, 600.0],
2665 10 : )
2666 10 : .unwrap();
2667 10 :
2668 10 : let waiting_tasks = register_int_gauge_vec!(
2669 10 : "pageserver_background_loop_semaphore_waiting_tasks",
2670 10 : "Number of background loop tasks waiting for semaphore",
2671 10 : &["task"],
2672 10 : )
2673 10 : .unwrap();
2674 10 :
2675 10 : let running_tasks = register_int_gauge_vec!(
2676 10 : "pageserver_background_loop_semaphore_running_tasks",
2677 10 : "Number of background loop tasks running concurrently",
2678 10 : &["task"],
2679 10 : )
2680 10 : .unwrap();
2681 10 :
2682 10 : BackgroundLoopSemaphoreMetrics {
2683 100 : counters: EnumMap::from_array(std::array::from_fn(|i| {
2684 100 : let kind = BackgroundLoopKind::from_usize(i);
2685 100 : counters.with_label_values(&[kind.into()])
2686 100 : })),
2687 100 : durations: EnumMap::from_array(std::array::from_fn(|i| {
2688 100 : let kind = BackgroundLoopKind::from_usize(i);
2689 100 : durations.with_label_values(&[kind.into()])
2690 100 : })),
2691 100 : waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| {
2692 100 : let kind = BackgroundLoopKind::from_usize(i);
2693 100 : waiting_tasks.with_label_values(&[kind.into()])
2694 100 : })),
2695 100 : running_tasks: EnumMap::from_array(std::array::from_fn(|i| {
2696 100 : let kind = BackgroundLoopKind::from_usize(i);
2697 100 : running_tasks.with_label_values(&[kind.into()])
2698 100 : })),
2699 10 : }
2700 10 : });
2701 :
2702 : impl BackgroundLoopSemaphoreMetrics {
2703 : /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the
2704 : /// semaphore is acquired, and drop it when the task completes or is cancelled.
2705 192 : pub(crate) fn record(
2706 192 : &self,
2707 192 : task: BackgroundLoopKind,
2708 192 : ) -> BackgroundLoopSemaphoreMetricsRecorder {
2709 192 : BackgroundLoopSemaphoreMetricsRecorder::start(self, task)
2710 192 : }
2711 : }
2712 :
2713 : /// Records metrics for a background task.
2714 : pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> {
2715 : metrics: &'a BackgroundLoopSemaphoreMetrics,
2716 : task: BackgroundLoopKind,
2717 : start: Instant,
2718 : wait_counter_guard: Option<metrics::IntCounterPairGuard>,
2719 : }
2720 :
2721 : impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> {
2722 : /// Starts recording semaphore metrics, by recording wait time and incrementing
2723 : /// `wait_start_count` and `waiting_tasks`.
2724 192 : fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self {
2725 192 : metrics.waiting_tasks[task].inc();
2726 192 : Self {
2727 192 : metrics,
2728 192 : task,
2729 192 : start: Instant::now(),
2730 192 : wait_counter_guard: Some(metrics.counters[task].guard()),
2731 192 : }
2732 192 : }
2733 :
2734 : /// Signals that the semaphore has been acquired, and updates relevant metrics.
2735 192 : pub fn acquired(&mut self) -> Duration {
2736 192 : let waited = self.start.elapsed();
2737 192 : self.wait_counter_guard.take().expect("already acquired");
2738 192 : self.metrics.durations[self.task].observe(waited.as_secs_f64());
2739 192 : self.metrics.waiting_tasks[self.task].dec();
2740 192 : self.metrics.running_tasks[self.task].inc();
2741 192 : waited
2742 192 : }
2743 : }
2744 :
2745 : impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> {
2746 : /// The task either completed or was cancelled.
2747 192 : fn drop(&mut self) {
2748 192 : if self.wait_counter_guard.take().is_some() {
2749 0 : // Waiting.
2750 0 : self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64());
2751 0 : self.metrics.waiting_tasks[self.task].dec();
2752 192 : } else {
2753 192 : // Running.
2754 192 : self.metrics.running_tasks[self.task].dec();
2755 192 : }
2756 192 : }
2757 : }
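
// Usage sketch (the semaphore handle and loop kind are assumptions, not from
// this module):
//
//     let mut rec = BACKGROUND_LOOP_SEMAPHORE.record(background_loop_kind);
//     let _permit = semaphore.acquire().await.expect("semaphore closed");
//     let waited = rec.acquired(); // waiting_tasks--, running_tasks++
//     // ... run the task; dropping `rec` decrements running_tasks ...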
2758 :
2759 0 : pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
2760 0 : register_int_counter_vec!(
2761 0 : "pageserver_background_loop_period_overrun_count",
2762 0 : "Incremented whenever warn_when_period_overrun() logs a warning.",
2763 0 : &["task", "period"],
2764 0 : )
2765 0 : .expect("failed to define a metric")
2766 0 : });
2767 :
2768 : // walreceiver metrics
2769 :
2770 0 : pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
2771 0 : register_int_counter!(
2772 0 : "pageserver_walreceiver_started_connections_total",
2773 0 : "Number of started walreceiver connections"
2774 0 : )
2775 0 : .expect("failed to define a metric")
2776 0 : });
2777 :
2778 0 : pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
2779 0 : register_int_gauge!(
2780 0 : "pageserver_walreceiver_active_managers",
2781 0 : "Number of active walreceiver managers"
2782 0 : )
2783 0 : .expect("failed to define a metric")
2784 0 : });
2785 :
2786 0 : pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
2787 0 : register_int_counter_vec!(
2788 0 : "pageserver_walreceiver_switches_total",
2789 0 : "Number of walreceiver manager change_connection calls",
2790 0 : &["reason"]
2791 0 : )
2792 0 : .expect("failed to define a metric")
2793 0 : });
2794 :
2795 0 : pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
2796 0 : register_int_counter!(
2797 0 : "pageserver_walreceiver_broker_updates_total",
2798 0 : "Number of received broker updates in walreceiver"
2799 0 : )
2800 0 : .expect("failed to define a metric")
2801 0 : });
2802 :
2803 1 : pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
2804 1 : register_int_counter_vec!(
2805 1 : "pageserver_walreceiver_candidates_events_total",
2806 1 : "Number of walreceiver candidate events",
2807 1 : &["event"]
2808 1 : )
2809 1 : .expect("failed to define a metric")
2810 1 : });
2811 :
2812 : pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
2813 0 : Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
2814 :
2815 : pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
2816 1 : Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
2817 :
2818 : // Metrics collected on WAL redo operations
2819 : //
2820 : // We collect the time spent in actual WAL redo ('redo'), and time waiting
2821 : // for access to the postgres process ('wait') since there is only one for
2822 : // each tenant.
2823 :
2824 : /// Time buckets are small because we want to be able to measure the
2825 : /// smallest redo processing times. These buckets allow us to measure down
2826 : /// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
2827 : /// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
2828 : ///
2829 : /// Values up to 1s are recorded because metrics show that we have redo
2830 : /// durations and lock times larger than 0.250s.
2831 : macro_rules! redo_histogram_time_buckets {
2832 : () => {
2833 : vec![
2834 : 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
2835 : 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
2836 : 1.000_000,
2837 : ]
2838 : };
2839 : }
2840 :
2841 : /// While we're at it, also measure the amount of records replayed in each
2842 : /// operation. We have a global 'total replayed' counter, but that's not
2843 : /// as useful as 'what is the skew for how many records we replay in one
2844 : /// operation'.
2845 : macro_rules! redo_histogram_count_buckets {
2846 : () => {
2847 : vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0]
2848 : };
2849 : }
2850 :
2851 : macro_rules! redo_bytes_histogram_count_buckets {
2852 : () => {
2853 : // powers of (2^.5), from 2^4.5 to 2^15 (22 buckets)
2854 : // rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too.
2855 : vec![
2856 : 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0,
2857 : 2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0,
2858 : ]
2859 : };
2860 : }
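
// The 22 values above can be reproduced programmatically (sketch, not part
// of the original module):
//
//     let buckets: Vec<f64> = (9..=30)            // half-exponents 4.5..=15.0
//         .map(|e| 2f64.powf(e as f64 / 2.0))     // 2^(e/2)
//         .map(|v| (v / 8.0).ceil() * 8.0)        // round up to a multiple of 8
//         .collect();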
2861 :
2862 : pub(crate) struct WalIngestMetrics {
2863 : pub(crate) bytes_received: IntCounter,
2864 : pub(crate) records_received: IntCounter,
2865 : pub(crate) records_observed: IntCounter,
2866 : pub(crate) records_committed: IntCounter,
2867 : pub(crate) values_committed_metadata_images: IntCounter,
2868 : pub(crate) values_committed_metadata_deltas: IntCounter,
2869 : pub(crate) values_committed_data_images: IntCounter,
2870 : pub(crate) values_committed_data_deltas: IntCounter,
2871 : pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
2872 : }
2873 :
2874 : impl WalIngestMetrics {
2875 0 : pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
2876 0 : if stats.metadata_images > 0 {
2877 0 : self.values_committed_metadata_images
2878 0 : .inc_by(stats.metadata_images);
2879 0 : }
2880 0 : if stats.metadata_deltas > 0 {
2881 0 : self.values_committed_metadata_deltas
2882 0 : .inc_by(stats.metadata_deltas);
2883 0 : }
2884 0 : if stats.data_images > 0 {
2885 0 : self.values_committed_data_images.inc_by(stats.data_images);
2886 0 : }
2887 0 : if stats.data_deltas > 0 {
2888 0 : self.values_committed_data_deltas.inc_by(stats.data_deltas);
2889 0 : }
2890 0 : }
2891 : }
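
// Sketch (assumes `DatadirModificationStats` exposes exactly the four
// counters used above): a commit that wrote 1 metadata image, 8 data images
// and 2 data deltas would be accounted as
//
//     WAL_INGEST.inc_values_committed(&DatadirModificationStats {
//         metadata_images: 1,
//         metadata_deltas: 0,
//         data_images: 8,
//         data_deltas: 2,
//     });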
2892 :
2893 5 : pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
2894 5 : let values_committed = register_int_counter_vec!(
2895 5 : "pageserver_wal_ingest_values_committed",
2896 5 : "Number of values committed to pageserver storage from WAL records",
2897 5 : &["class", "kind"],
2898 5 : )
2899 5 : .expect("failed to define a metric");
2900 5 :
2901 5 : WalIngestMetrics {
2902 5 : bytes_received: register_int_counter!(
2903 5 : "pageserver_wal_ingest_bytes_received",
2904 5 : "Bytes of WAL ingested from safekeepers",
2905 5 : )
2906 5 : .unwrap(),
2907 5 : records_received: register_int_counter!(
2908 5 : "pageserver_wal_ingest_records_received",
2909 5 : "Number of WAL records received from safekeepers"
2910 5 : )
2911 5 : .expect("failed to define a metric"),
2912 5 : records_observed: register_int_counter!(
2913 5 : "pageserver_wal_ingest_records_observed",
2914 5 : "Number of WAL records observed from safekeepers. These are metadata only records for shard 0."
2915 5 : )
2916 5 : .expect("failed to define a metric"),
2917 5 : records_committed: register_int_counter!(
2918 5 : "pageserver_wal_ingest_records_committed",
2919 5 : "Number of WAL records which resulted in writes to pageserver storage"
2920 5 : )
2921 5 : .expect("failed to define a metric"),
2922 5 : values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
2923 5 : values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
2924 5 : values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
2925 5 : values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
2926 5 : gap_blocks_zeroed_on_rel_extend: register_int_counter!(
2927 5 : "pageserver_gap_blocks_zeroed_on_rel_extend",
2928 5 : "Total number of zero gap blocks written on relation extends"
2929 5 : )
2930 5 : .expect("failed to define a metric"),
2931 5 : }
2932 5 : });
2933 :
2934 108 : pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
2935 108 : register_int_counter_vec!(
2936 108 : "pageserver_timeline_wal_records_received",
2937 108 : "Number of WAL records received per shard",
2938 108 : &["tenant_id", "shard_id", "timeline_id"]
2939 108 : )
2940 108 : .expect("failed to define a metric")
2941 108 : });
2942 :
2943 3 : pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
2944 3 : register_histogram!(
2945 3 : "pageserver_wal_redo_seconds",
2946 3 : "Time spent on WAL redo",
2947 3 : redo_histogram_time_buckets!()
2948 3 : )
2949 3 : .expect("failed to define a metric")
2950 3 : });
2951 :
2952 3 : pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2953 3 : register_histogram!(
2954 3 : "pageserver_wal_redo_records_histogram",
2955 3 : "Histogram of number of records replayed per redo in the Postgres WAL redo process",
2956 3 : redo_histogram_count_buckets!(),
2957 3 : )
2958 3 : .expect("failed to define a metric")
2959 3 : });
2960 :
2961 3 : pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2962 3 : register_histogram!(
2963 3 : "pageserver_wal_redo_bytes_histogram",
2964 3 : "Histogram of number of records replayed per redo sent to Postgres",
2965 3 : redo_bytes_histogram_count_buckets!(),
2966 3 : )
2967 3 : .expect("failed to define a metric")
2968 3 : });
2969 :
2970 : // FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
2971 3 : pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
2972 3 : register_int_counter!(
2973 3 : "pageserver_replayed_wal_records_total",
2974 3 : "Number of WAL records replayed in WAL redo process"
2975 3 : )
2976 3 : .unwrap()
2977 3 : });
2978 :
2979 : #[rustfmt::skip]
2980 4 : pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
2981 4 : register_histogram!(
2982 4 : "pageserver_wal_redo_process_launch_duration",
2983 4 : "Histogram of the duration of successful WalRedoProcess::launch calls",
2984 4 : vec![
2985 4 : 0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
2986 4 : 0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
2987 4 : 0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
2988 4 : 0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
2989 4 : 1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
2990 4 : ],
2991 4 : )
2992 4 : .expect("failed to define a metric")
2993 4 : });
2994 :
2995 : pub(crate) struct WalRedoProcessCounters {
2996 : pub(crate) started: IntCounter,
2997 : pub(crate) killed_by_cause: EnumMap<WalRedoKillCause, IntCounter>,
2998 : pub(crate) active_stderr_logger_tasks_started: IntCounter,
2999 : pub(crate) active_stderr_logger_tasks_finished: IntCounter,
3000 : }
3001 :
3002 : #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
3003 : pub(crate) enum WalRedoKillCause {
3004 : WalRedoProcessDrop,
3005 : NoLeakChildDrop,
3006 : Startup,
3007 : }
3008 :
3009 : impl Default for WalRedoProcessCounters {
3010 4 : fn default() -> Self {
3011 4 : let started = register_int_counter!(
3012 4 : "pageserver_wal_redo_process_started_total",
3013 4 : "Number of WAL redo processes started",
3014 4 : )
3015 4 : .unwrap();
3016 4 :
3017 4 : let killed = register_int_counter_vec!(
3018 4 : "pageserver_wal_redo_process_stopped_total",
3019 4 : "Number of WAL redo processes stopped",
3020 4 : &["cause"],
3021 4 : )
3022 4 : .unwrap();
3023 4 :
3024 4 : let active_stderr_logger_tasks_started = register_int_counter!(
3025 4 : "pageserver_walredo_stderr_logger_tasks_started_total",
3026 4 : "Number of active walredo stderr logger tasks that have started",
3027 4 : )
3028 4 : .unwrap();
3029 4 :
3030 4 : let active_stderr_logger_tasks_finished = register_int_counter!(
3031 4 : "pageserver_walredo_stderr_logger_tasks_finished_total",
3032 4 : "Number of active walredo stderr logger tasks that have finished",
3033 4 : )
3034 4 : .unwrap();
3035 4 :
3036 4 : Self {
3037 4 : started,
3038 12 : killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
3039 12 : let cause = WalRedoKillCause::from_usize(i);
3040 12 : let cause_str: &'static str = cause.into();
3041 12 : killed.with_label_values(&[cause_str])
3042 12 : })),
3043 4 : active_stderr_logger_tasks_started,
3044 4 : active_stderr_logger_tasks_finished,
3045 4 : }
3046 4 : }
3047 : }
3048 :
3049 : pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
3050 : Lazy::new(WalRedoProcessCounters::default);
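// A minimal sketch (editor's addition): `EnumMap` indexing resolves directly to
// the counter pre-registered for each cause in `default()` above, so recording
// a kill never performs a label-string lookup.
#[allow(dead_code)]
fn example_record_walredo_kill(cause: WalRedoKillCause) {
    WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
}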
3051 :
3052 : /// Similar to `prometheus::HistogramTimer` but does not record on drop.
3053 : pub(crate) struct StorageTimeMetricsTimer {
3054 : metrics: StorageTimeMetrics,
3055 : start: Instant,
3056 : }
3057 :
3058 : impl StorageTimeMetricsTimer {
3059 1112 : fn new(metrics: StorageTimeMetrics) -> Self {
3060 1112 : Self {
3061 1112 : metrics,
3062 1112 : start: Instant::now(),
3063 1112 : }
3064 1112 : }
3065 :
3066 : /// Returns the elapsed duration of the timer.
3067 1111 : pub fn elapsed(&self) -> Duration {
3068 1111 : self.start.elapsed()
3069 1111 : }
3070 :
3071 : /// Record the time from creation to now and return it.
3072 1111 : pub fn stop_and_record(self) -> Duration {
3073 1111 : let duration = self.elapsed();
3074 1111 : let seconds = duration.as_secs_f64();
3075 1111 : self.metrics.timeline_sum.inc_by(seconds);
3076 1111 : self.metrics.timeline_count.inc();
3077 1111 : self.metrics.global_histogram.observe(seconds);
3078 1111 : duration
3079 1111 : }
3080 :
3081 : /// Turns this timer into one that always records -- usually this means recording
3082 : /// regardless of whether an early `?` return path was taken in a function.
3083 10 : pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer {
3084 10 : AlwaysRecordingStorageTimeMetricsTimer(Some(self))
3085 10 : }
3086 : }
3087 :
3088 : pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
3089 :
3090 : impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
3091 10 : fn drop(&mut self) {
3092 10 : if let Some(inner) = self.0.take() {
3093 10 : inner.stop_and_record();
3094 10 : }
3095 10 : }
3096 : }
3097 :
3098 : impl AlwaysRecordingStorageTimeMetricsTimer {
3099 : /// Returns the elapsed duration of the timer.
3100 0 : pub fn elapsed(&self) -> Duration {
3101 0 : self.0.as_ref().expect("not dropped yet").elapsed()
3102 0 : }
3103 : }
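// A minimal sketch (editor's addition) of the drop-guard pattern above;
// `fallible_step` is a hypothetical operation passed in for illustration.
// Because the guard records on drop, the duration is captured even when the
// `?` takes the early-return path.
#[allow(dead_code)]
fn example_record_even_on_error(
    metrics: &StorageTimeMetrics,
    fallible_step: impl FnOnce() -> anyhow::Result<()>,
) -> anyhow::Result<()> {
    let _timer = metrics.start_timer().record_on_drop();
    fallible_step()?;
    Ok(())
}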
3104 :
3105 : /// Timing facilities for a globally histogrammed metric, supplemented by per-tenant
3106 : /// and per-timeline total sum and count.
3107 : #[derive(Clone, Debug)]
3108 : pub(crate) struct StorageTimeMetrics {
3109 : /// Sum of f64 seconds, per operation, tenant_id and timeline_id
3110 : timeline_sum: Counter,
3111 : /// Number of oeprations, per operation, tenant_id and timeline_id
3112 : timeline_count: IntCounter,
3113 : /// Global histogram having only the "operation" label.
3114 : global_histogram: Histogram,
3115 : }
3116 :
3117 : impl StorageTimeMetrics {
3118 2106 : pub fn new(
3119 2106 : operation: StorageTimeOperation,
3120 2106 : tenant_id: &str,
3121 2106 : shard_id: &str,
3122 2106 : timeline_id: &str,
3123 2106 : ) -> Self {
3124 2106 : let operation: &'static str = operation.into();
3125 2106 :
3126 2106 : let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
3127 2106 : .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
3128 2106 : .unwrap();
3129 2106 : let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
3130 2106 : .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
3131 2106 : .unwrap();
3132 2106 : let global_histogram = STORAGE_TIME_GLOBAL
3133 2106 : .get_metric_with_label_values(&[operation])
3134 2106 : .unwrap();
3135 2106 :
3136 2106 : StorageTimeMetrics {
3137 2106 : timeline_sum,
3138 2106 : timeline_count,
3139 2106 : global_histogram,
3140 2106 : }
3141 2106 : }
3142 :
3143 : /// Starts timing a new operation.
3144 : ///
3145 : /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
3146 1112 : pub fn start_timer(&self) -> StorageTimeMetricsTimer {
3147 1112 : StorageTimeMetricsTimer::new(self.clone())
3148 1112 : }
3149 : }
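// Typical use (editor's sketch): construct once per timeline, then time each
// operation explicitly; unlike `prometheus::HistogramTimer`, nothing is
// recorded unless `stop_and_record` (or `record_on_drop`) is used.
#[allow(dead_code)]
fn example_time_storage_operation(metrics: &StorageTimeMetrics) -> Duration {
    let timer = metrics.start_timer();
    // ... perform the storage operation being measured ...
    timer.stop_and_record() // updates per-timeline sum/count and the global histogram
}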
3150 :
3151 : pub(crate) struct TimelineMetrics {
3152 : tenant_id: String,
3153 : shard_id: String,
3154 : timeline_id: String,
3155 : pub flush_time_histo: StorageTimeMetrics,
3156 : pub flush_delay_histo: StorageTimeMetrics,
3157 : pub compact_time_histo: StorageTimeMetrics,
3158 : pub create_images_time_histo: StorageTimeMetrics,
3159 : pub logical_size_histo: StorageTimeMetrics,
3160 : pub imitate_logical_size_histo: StorageTimeMetrics,
3161 : pub load_layer_map_histo: StorageTimeMetrics,
3162 : pub garbage_collect_histo: StorageTimeMetrics,
3163 : pub find_gc_cutoffs_histo: StorageTimeMetrics,
3164 : pub last_record_lsn_gauge: IntGauge,
3165 : pub disk_consistent_lsn_gauge: IntGauge,
3166 : pub pitr_history_size: UIntGauge,
3167 : pub archival_size: UIntGauge,
3168 : pub layers_per_read: Histogram,
3169 : pub standby_horizon_gauge: IntGauge,
3170 : pub resident_physical_size_gauge: UIntGauge,
3171 : pub visible_physical_size_gauge: UIntGauge,
3172 : /// copy of LayeredTimeline.current_logical_size
3173 : pub current_logical_size_gauge: UIntGauge,
3174 : pub aux_file_size_gauge: IntGauge,
3175 : pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
3176 : pub evictions: IntCounter,
3177 : pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
3178 : /// Number of valid LSN leases.
3179 : pub valid_lsn_lease_count_gauge: UIntGauge,
3180 : pub wal_records_received: IntCounter,
3181 : pub storage_io_size: StorageIoSizeMetrics,
3182 : pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter,
3183 : pub wait_lsn_start_finish_counterpair: IntCounterPair,
3184 : pub wait_ondemand_download_time: wait_ondemand_download_time::WaitOndemandDownloadTimeSum,
3185 : shutdown: std::sync::atomic::AtomicBool,
3186 : }
3187 :
3188 : impl TimelineMetrics {
3189 234 : pub fn new(
3190 234 : tenant_shard_id: &TenantShardId,
3191 234 : timeline_id_raw: &TimelineId,
3192 234 : evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
3193 234 : ) -> Self {
3194 234 : let tenant_id = tenant_shard_id.tenant_id.to_string();
3195 234 : let shard_id = format!("{}", tenant_shard_id.shard_slug());
3196 234 : let timeline_id = timeline_id_raw.to_string();
3197 234 : let flush_time_histo = StorageTimeMetrics::new(
3198 234 : StorageTimeOperation::LayerFlush,
3199 234 : &tenant_id,
3200 234 : &shard_id,
3201 234 : &timeline_id,
3202 234 : );
3203 234 : let flush_delay_histo = StorageTimeMetrics::new(
3204 234 : StorageTimeOperation::LayerFlushDelay,
3205 234 : &tenant_id,
3206 234 : &shard_id,
3207 234 : &timeline_id,
3208 234 : );
3209 234 : let compact_time_histo = StorageTimeMetrics::new(
3210 234 : StorageTimeOperation::Compact,
3211 234 : &tenant_id,
3212 234 : &shard_id,
3213 234 : &timeline_id,
3214 234 : );
3215 234 : let create_images_time_histo = StorageTimeMetrics::new(
3216 234 : StorageTimeOperation::CreateImages,
3217 234 : &tenant_id,
3218 234 : &shard_id,
3219 234 : &timeline_id,
3220 234 : );
3221 234 : let logical_size_histo = StorageTimeMetrics::new(
3222 234 : StorageTimeOperation::LogicalSize,
3223 234 : &tenant_id,
3224 234 : &shard_id,
3225 234 : &timeline_id,
3226 234 : );
3227 234 : let imitate_logical_size_histo = StorageTimeMetrics::new(
3228 234 : StorageTimeOperation::ImitateLogicalSize,
3229 234 : &tenant_id,
3230 234 : &shard_id,
3231 234 : &timeline_id,
3232 234 : );
3233 234 : let load_layer_map_histo = StorageTimeMetrics::new(
3234 234 : StorageTimeOperation::LoadLayerMap,
3235 234 : &tenant_id,
3236 234 : &shard_id,
3237 234 : &timeline_id,
3238 234 : );
3239 234 : let garbage_collect_histo = StorageTimeMetrics::new(
3240 234 : StorageTimeOperation::Gc,
3241 234 : &tenant_id,
3242 234 : &shard_id,
3243 234 : &timeline_id,
3244 234 : );
3245 234 : let find_gc_cutoffs_histo = StorageTimeMetrics::new(
3246 234 : StorageTimeOperation::FindGcCutoffs,
3247 234 : &tenant_id,
3248 234 : &shard_id,
3249 234 : &timeline_id,
3250 234 : );
3251 234 : let last_record_lsn_gauge = LAST_RECORD_LSN
3252 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3253 234 : .unwrap();
3254 234 :
3255 234 : let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
3256 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3257 234 : .unwrap();
3258 234 :
3259 234 : let pitr_history_size = PITR_HISTORY_SIZE
3260 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3261 234 : .unwrap();
3262 234 :
3263 234 : let archival_size = TIMELINE_ARCHIVE_SIZE
3264 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3265 234 : .unwrap();
3266 234 :
3267 234 : let layers_per_read = LAYERS_PER_READ
3268 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3269 234 : .unwrap();
3270 234 :
3271 234 : let standby_horizon_gauge = STANDBY_HORIZON
3272 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3273 234 : .unwrap();
3274 234 : let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
3275 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3276 234 : .unwrap();
3277 234 : let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
3278 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3279 234 : .unwrap();
3280 234 : // TODO: we shouldn't expose this metric
3281 234 : let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
3282 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3283 234 : .unwrap();
3284 234 : let aux_file_size_gauge = AUX_FILE_SIZE
3285 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3286 234 : .unwrap();
3287 234 : // TODO use impl Trait syntax here once we have the ability to use it: https://github.com/rust-lang/rust/issues/63065
3288 234 : let directory_entries_count_gauge_closure = {
3289 234 : let tenant_shard_id = *tenant_shard_id;
3290 234 : let timeline_id_raw = *timeline_id_raw;
3291 0 : move || {
3292 0 : let tenant_id = tenant_shard_id.tenant_id.to_string();
3293 0 : let shard_id = format!("{}", tenant_shard_id.shard_slug());
3294 0 : let timeline_id = timeline_id_raw.to_string();
3295 0 : let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT
3296 0 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3297 0 : .unwrap();
3298 0 : gauge
3299 0 : }
3300 : };
3301 234 : let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
3302 234 : Lazy::new(Box::new(directory_entries_count_gauge_closure));
3303 234 : let evictions = EVICTIONS
3304 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3305 234 : .unwrap();
3306 234 : let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
3307 234 : .build(&tenant_id, &shard_id, &timeline_id);
3308 234 :
3309 234 : let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
3310 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3311 234 : .unwrap();
3312 234 :
3313 234 : let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
3314 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3315 234 : .unwrap();
3316 234 :
3317 234 : let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
3318 234 :
3319 234 : let wait_lsn_in_progress_micros = GlobalAndPerTenantIntCounter {
3320 234 : global: WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS.clone(),
3321 234 : per_tenant: WAIT_LSN_IN_PROGRESS_MICROS
3322 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3323 234 : .unwrap(),
3324 234 : };
3325 234 :
3326 234 : let wait_lsn_start_finish_counterpair = WAIT_LSN_START_FINISH_COUNTERPAIR
3327 234 : .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
3328 234 : .unwrap();
3329 234 :
3330 234 : let wait_ondemand_download_time =
3331 234 : wait_ondemand_download_time::WaitOndemandDownloadTimeSum::new(
3332 234 : &tenant_id,
3333 234 : &shard_id,
3334 234 : &timeline_id,
3335 234 : );
3336 234 :
3337 234 : TIMELINE_STATE_METRIC.with_label_values(&["active"]).inc();
3338 234 :
3339 234 : TimelineMetrics {
3340 234 : tenant_id,
3341 234 : shard_id,
3342 234 : timeline_id,
3343 234 : flush_time_histo,
3344 234 : flush_delay_histo,
3345 234 : compact_time_histo,
3346 234 : create_images_time_histo,
3347 234 : logical_size_histo,
3348 234 : imitate_logical_size_histo,
3349 234 : garbage_collect_histo,
3350 234 : find_gc_cutoffs_histo,
3351 234 : load_layer_map_histo,
3352 234 : last_record_lsn_gauge,
3353 234 : disk_consistent_lsn_gauge,
3354 234 : pitr_history_size,
3355 234 : archival_size,
3356 234 : layers_per_read,
3357 234 : standby_horizon_gauge,
3358 234 : resident_physical_size_gauge,
3359 234 : visible_physical_size_gauge,
3360 234 : current_logical_size_gauge,
3361 234 : aux_file_size_gauge,
3362 234 : directory_entries_count_gauge,
3363 234 : evictions,
3364 234 : evictions_with_low_residence_duration: std::sync::RwLock::new(
3365 234 : evictions_with_low_residence_duration,
3366 234 : ),
3367 234 : storage_io_size,
3368 234 : valid_lsn_lease_count_gauge,
3369 234 : wal_records_received,
3370 234 : wait_lsn_in_progress_micros,
3371 234 : wait_lsn_start_finish_counterpair,
3372 234 : wait_ondemand_download_time,
3373 234 : shutdown: std::sync::atomic::AtomicBool::default(),
3374 234 : }
3375 234 : }
3376 :
3377 804 : pub(crate) fn record_new_file_metrics(&self, sz: u64) {
3378 804 : self.resident_physical_size_add(sz);
3379 804 : }
3380 :
3381 274 : pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
3382 274 : self.resident_physical_size_gauge.sub(sz);
3383 274 : crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
3384 274 : }
3385 :
3386 872 : pub(crate) fn resident_physical_size_add(&self, sz: u64) {
3387 872 : self.resident_physical_size_gauge.add(sz);
3388 872 : crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
3389 872 : }
3390 :
3391 5 : pub(crate) fn resident_physical_size_get(&self) -> u64 {
3392 5 : self.resident_physical_size_gauge.get()
3393 5 : }
3394 :
3395 : /// Generates TIMELINE_LAYER labels for a persistent layer.
3396 1338 : fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] {
3397 1338 : let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) {
3398 715 : true => LayerLevel::L0,
3399 623 : false => LayerLevel::L1,
3400 : };
3401 1338 : let kind = match layer_desc.is_delta() {
3402 1107 : true => LayerKind::Delta,
3403 231 : false => LayerKind::Image,
3404 : };
3405 1338 : [
3406 1338 : &self.tenant_id,
3407 1338 : &self.shard_id,
3408 1338 : &self.timeline_id,
3409 1338 : level.into(),
3410 1338 : kind.into(),
3411 1338 : ]
3412 1338 : }
3413 :
3414 : /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer.
3415 1191 : fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] {
3416 1191 : [
3417 1191 : &self.tenant_id,
3418 1191 : &self.shard_id,
3419 1191 : &self.timeline_id,
3420 1191 : LayerLevel::Frozen.into(),
3421 1191 : LayerKind::Delta.into(), // by definition
3422 1191 : ]
3423 1191 : }
3424 :
3425 : /// Removes a frozen ephemeral layer from TIMELINE_LAYER metrics.
3426 595 : pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
3427 595 : assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
3428 595 : let labels = self.make_frozen_layer_labels(layer);
3429 595 : let size = layer.try_len().expect("frozen layer should have no writer");
3430 595 : TIMELINE_LAYER_COUNT
3431 595 : .get_metric_with_label_values(&labels)
3432 595 : .unwrap()
3433 595 : .dec();
3434 595 : TIMELINE_LAYER_SIZE
3435 595 : .get_metric_with_label_values(&labels)
3436 595 : .unwrap()
3437 595 : .sub(size);
3438 595 : }
3439 :
3440 : /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics.
3441 596 : pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
3442 596 : assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
3443 596 : let labels = self.make_frozen_layer_labels(layer);
3444 596 : let size = layer.try_len().expect("frozen layer should have no writer");
3445 596 : TIMELINE_LAYER_COUNT
3446 596 : .get_metric_with_label_values(&labels)
3447 596 : .unwrap()
3448 596 : .inc();
3449 596 : TIMELINE_LAYER_SIZE
3450 596 : .get_metric_with_label_values(&labels)
3451 596 : .unwrap()
3452 596 : .add(size);
3453 596 : }
3454 :
3455 : /// Removes a persistent layer from TIMELINE_LAYER metrics.
3456 349 : pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) {
3457 349 : let labels = self.make_layer_labels(layer_desc);
3458 349 : TIMELINE_LAYER_COUNT
3459 349 : .get_metric_with_label_values(&labels)
3460 349 : .unwrap()
3461 349 : .dec();
3462 349 : TIMELINE_LAYER_SIZE
3463 349 : .get_metric_with_label_values(&labels)
3464 349 : .unwrap()
3465 349 : .sub(layer_desc.file_size);
3466 349 : }
3467 :
3468 : /// Adds a persistent layer to TIMELINE_LAYER metrics.
3469 989 : pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) {
3470 989 : let labels = self.make_layer_labels(layer_desc);
3471 989 : TIMELINE_LAYER_COUNT
3472 989 : .get_metric_with_label_values(&labels)
3473 989 : .unwrap()
3474 989 : .inc();
3475 989 : TIMELINE_LAYER_SIZE
3476 989 : .get_metric_with_label_values(&labels)
3477 989 : .unwrap()
3478 989 : .add(layer_desc.file_size);
3479 989 : }
3480 :
3481 5 : pub(crate) fn shutdown(&self) {
3482 5 : let was_shutdown = self
3483 5 : .shutdown
3484 5 : .swap(true, std::sync::atomic::Ordering::Relaxed);
3485 5 :
3486 5 : if was_shutdown {
3487 : // this happens on tenant deletion because the tenant first shuts down its timelines,
3488 : // then invokes timeline deletion, which shuts down the timeline again.
3489 : // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 is fixed
3490 0 : return;
3491 5 : }
3492 5 :
3493 5 : TIMELINE_STATE_METRIC.with_label_values(&["active"]).dec();
3494 5 :
3495 5 : let tenant_id = &self.tenant_id;
3496 5 : let timeline_id = &self.timeline_id;
3497 5 : let shard_id = &self.shard_id;
3498 5 : let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3499 5 : let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3500 5 : let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3501 5 : {
3502 5 : RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
3503 5 : let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3504 5 : }
3505 5 : let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3506 5 : let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3507 5 : if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
3508 0 : let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3509 5 : }
3510 :
3511 5 : let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3512 5 : let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3513 :
3514 20 : for ref level in LayerLevel::iter() {
3515 45 : for ref kind in LayerKind::iter() {
3516 30 : let labels: [&str; 5] =
3517 30 : [tenant_id, shard_id, timeline_id, level.into(), kind.into()];
3518 30 : let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels);
3519 30 : let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels);
3520 30 : }
3521 : }
3522 :
3523 5 : let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3524 5 :
3525 5 : let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3526 5 : let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3527 5 : let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3528 5 :
3529 5 : self.evictions_with_low_residence_duration
3530 5 : .write()
3531 5 : .unwrap()
3532 5 : .remove(tenant_id, shard_id, timeline_id);
3533 :
3534 : // The following metrics are born outside of the TimelineMetrics lifecycle but still
3535 : // removed at the end of it. The idea is to have the metrics outlive the
3536 : // entity during which they're observed, e.g., the smgr metrics shall
3537 : // outlive an individual smgr connection, but not the timeline.
3538 :
3539 50 : for op in StorageTimeOperation::VARIANTS {
3540 45 : let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
3541 45 : op,
3542 45 : tenant_id,
3543 45 : shard_id,
3544 45 : timeline_id,
3545 45 : ]);
3546 45 : let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
3547 45 : op,
3548 45 : tenant_id,
3549 45 : shard_id,
3550 45 : timeline_id,
3551 45 : ]);
3552 45 : }
3553 :
3554 15 : for op in StorageIoSizeOperation::VARIANTS {
3555 10 : let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
3556 10 : }
3557 :
3558 : let _ =
3559 5 : WAIT_LSN_IN_PROGRESS_MICROS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3560 5 :
3561 5 : {
3562 5 : let mut res = [Ok(()), Ok(())];
3563 5 : WAIT_LSN_START_FINISH_COUNTERPAIR
3564 5 : .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]);
3565 5 : }
3566 5 :
3567 5 : wait_ondemand_download_time::shutdown_timeline(tenant_id, shard_id, timeline_id);
3568 5 :
3569 5 : let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
3570 5 : SmgrQueryType::GetPageAtLsn.into(),
3571 5 : tenant_id,
3572 5 : shard_id,
3573 5 : timeline_id,
3574 5 : ]);
3575 5 : let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
3576 5 : SmgrQueryType::GetPageAtLsn.into(),
3577 5 : tenant_id,
3578 5 : shard_id,
3579 5 : timeline_id,
3580 5 : ]);
3581 5 : let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[
3582 5 : tenant_id,
3583 5 : shard_id,
3584 5 : timeline_id,
3585 5 : ]);
3586 5 : let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
3587 5 : tenant_id,
3588 5 : shard_id,
3589 5 : timeline_id,
3590 5 : ]);
3591 5 : let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
3592 5 : tenant_id,
3593 5 : shard_id,
3594 5 : timeline_id,
3595 5 : ]);
3596 5 : let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
3597 5 : tenant_id,
3598 5 : shard_id,
3599 5 : timeline_id,
3600 5 : ]);
3601 :
3602 40 : for reason in GetPageBatchBreakReason::iter() {
3603 35 : let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[
3604 35 : tenant_id,
3605 35 : shard_id,
3606 35 : timeline_id,
3607 35 : reason.into(),
3608 35 : ]);
3609 35 : }
3610 5 : }
3611 : }
3612 :
3613 3 : pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
3614 3 : let tid = tenant_shard_id.tenant_id.to_string();
3615 3 : let shard_id = tenant_shard_id.shard_slug().to_string();
3616 3 :
3617 3 : // Only shard zero deals in synthetic sizes
3618 3 : if tenant_shard_id.is_shard_zero() {
3619 3 : let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
3620 3 : }
3621 3 : let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]);
3622 3 :
3623 3 : tenant_throttling::remove_tenant_metrics(tenant_shard_id);
3624 3 :
3625 3 : // we leave the BROKEN_TENANTS_SET entry if any
3626 3 : }
3627 :
3628 : /// Maintain a per-timeline gauge in addition to the global gauge.
3629 : pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
3630 : last_set: AtomicU64,
3631 : gauge: UIntGauge,
3632 : }
3633 :
3634 : impl PerTimelineRemotePhysicalSizeGauge {
3635 239 : fn new(per_timeline_gauge: UIntGauge) -> Self {
3636 239 : Self {
3637 239 : last_set: AtomicU64::new(0),
3638 239 : gauge: per_timeline_gauge,
3639 239 : }
3640 239 : }
3641 1002 : pub(crate) fn set(&self, sz: u64) {
3642 1002 : self.gauge.set(sz);
3643 1002 : let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
3644 1002 : if sz < prev {
3645 20 : REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
3646 982 : } else {
3647 982 : REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
3648 982 : };
3649 1002 : }
3650 1 : pub(crate) fn get(&self) -> u64 {
3651 1 : self.gauge.get()
3652 1 : }
3653 : }
3654 :
3655 : impl Drop for PerTimelineRemotePhysicalSizeGauge {
3656 10 : fn drop(&mut self) {
3657 10 : REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
3658 10 : }
3659 : }
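// Worked example (editor's note): if the previous `set` stored 100 and the new
// size is 70, `swap` returns prev = 100 and the global gauge is decremented by
// prev - sz = 30; a later `set(120)` would then add 120 - 70 = 50. The global
// gauge therefore always equals the sum of all per-timeline `last_set` values,
// and `Drop` subtracts the final `last_set` to preserve that invariant when a
// timeline's metrics go away.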
3660 :
3661 : pub(crate) struct RemoteTimelineClientMetrics {
3662 : tenant_id: String,
3663 : shard_id: String,
3664 : timeline_id: String,
3665 : pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
3666 : calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
3667 : bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
3668 : bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
3669 : pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
3670 : }
3671 :
3672 : impl RemoteTimelineClientMetrics {
3673 239 : pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
3674 239 : let tenant_id_str = tenant_shard_id.tenant_id.to_string();
3675 239 : let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
3676 239 : let timeline_id_str = timeline_id.to_string();
3677 239 :
3678 239 : let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
3679 239 : REMOTE_PHYSICAL_SIZE
3680 239 : .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
3681 239 : .unwrap(),
3682 239 : );
3683 239 :
3684 239 : let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
3685 239 : .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
3686 239 : .unwrap();
3687 239 :
3688 239 : RemoteTimelineClientMetrics {
3689 239 : tenant_id: tenant_id_str,
3690 239 : shard_id: shard_id_str,
3691 239 : timeline_id: timeline_id_str,
3692 239 : calls: Mutex::new(HashMap::default()),
3693 239 : bytes_started_counter: Mutex::new(HashMap::default()),
3694 239 : bytes_finished_counter: Mutex::new(HashMap::default()),
3695 239 : remote_physical_size_gauge,
3696 239 : projected_remote_consistent_lsn_gauge,
3697 239 : }
3698 239 : }
3699 :
3700 1667 : pub fn remote_operation_time(
3701 1667 : &self,
3702 1667 : task_kind: Option<TaskKind>,
3703 1667 : file_kind: &RemoteOpFileKind,
3704 1667 : op_kind: &RemoteOpKind,
3705 1667 : status: &'static str,
3706 1667 : ) -> Histogram {
3707 1667 : REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY
3708 1667 : .get_metric_with_label_values(&[
3709 1667 : task_kind.as_ref().map(|tk| tk.into()).unwrap_or("unknown"),
3710 1667 : file_kind.as_str(),
3711 1667 : op_kind.as_str(),
3712 1667 : status,
3713 1667 : ])
3714 1667 : .unwrap()
3715 1667 : }
3716 :
3717 3816 : fn calls_counter_pair(
3718 3816 : &self,
3719 3816 : file_kind: &RemoteOpFileKind,
3720 3816 : op_kind: &RemoteOpKind,
3721 3816 : ) -> IntCounterPair {
3722 3816 : let mut guard = self.calls.lock().unwrap();
3723 3816 : let key = (file_kind.as_str(), op_kind.as_str());
3724 3816 : let metric = guard.entry(key).or_insert_with(move || {
3725 429 : REMOTE_TIMELINE_CLIENT_CALLS
3726 429 : .get_metric_with_label_values(&[
3727 429 : &self.tenant_id,
3728 429 : &self.shard_id,
3729 429 : &self.timeline_id,
3730 429 : key.0,
3731 429 : key.1,
3732 429 : ])
3733 429 : .unwrap()
3734 3816 : });
3735 3816 : metric.clone()
3736 3816 : }
3737 :
3738 894 : fn bytes_started_counter(
3739 894 : &self,
3740 894 : file_kind: &RemoteOpFileKind,
3741 894 : op_kind: &RemoteOpKind,
3742 894 : ) -> IntCounter {
3743 894 : let mut guard = self.bytes_started_counter.lock().unwrap();
3744 894 : let key = (file_kind.as_str(), op_kind.as_str());
3745 894 : let metric = guard.entry(key).or_insert_with(move || {
3746 169 : REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
3747 169 : .get_metric_with_label_values(&[
3748 169 : &self.tenant_id,
3749 169 : &self.shard_id,
3750 169 : &self.timeline_id,
3751 169 : key.0,
3752 169 : key.1,
3753 169 : ])
3754 169 : .unwrap()
3755 894 : });
3756 894 : metric.clone()
3757 894 : }
3758 :
3759 1776 : fn bytes_finished_counter(
3760 1776 : &self,
3761 1776 : file_kind: &RemoteOpFileKind,
3762 1776 : op_kind: &RemoteOpKind,
3763 1776 : ) -> IntCounter {
3764 1776 : let mut guard = self.bytes_finished_counter.lock().unwrap();
3765 1776 : let key = (file_kind.as_str(), op_kind.as_str());
3766 1776 : let metric = guard.entry(key).or_insert_with(move || {
3767 169 : REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
3768 169 : .get_metric_with_label_values(&[
3769 169 : &self.tenant_id,
3770 169 : &self.shard_id,
3771 169 : &self.timeline_id,
3772 169 : key.0,
3773 169 : key.1,
3774 169 : ])
3775 169 : .unwrap()
3776 1776 : });
3777 1776 : metric.clone()
3778 1776 : }
3779 : }
3780 :
3781 : #[cfg(test)]
3782 : impl RemoteTimelineClientMetrics {
3783 3 : pub fn get_bytes_started_counter_value(
3784 3 : &self,
3785 3 : file_kind: &RemoteOpFileKind,
3786 3 : op_kind: &RemoteOpKind,
3787 3 : ) -> Option<u64> {
3788 3 : let guard = self.bytes_started_counter.lock().unwrap();
3789 3 : let key = (file_kind.as_str(), op_kind.as_str());
3790 3 : guard.get(&key).map(|counter| counter.get())
3791 3 : }
3792 :
3793 3 : pub fn get_bytes_finished_counter_value(
3794 3 : &self,
3795 3 : file_kind: &RemoteOpFileKind,
3796 3 : op_kind: &RemoteOpKind,
3797 3 : ) -> Option<u64> {
3798 3 : let guard = self.bytes_finished_counter.lock().unwrap();
3799 3 : let key = (file_kind.as_str(), op_kind.as_str());
3800 3 : guard.get(&key).map(|counter| counter.get())
3801 3 : }
3802 : }
3803 :
3804 : /// See [`RemoteTimelineClientMetrics::call_begin`].
3805 : #[must_use]
3806 : pub(crate) struct RemoteTimelineClientCallMetricGuard {
3807 : /// Decremented on drop.
3808 : calls_counter_pair: Option<IntCounterPair>,
3809 : /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
3810 : bytes_finished: Option<(IntCounter, u64)>,
3811 : }
3812 :
3813 : impl RemoteTimelineClientCallMetricGuard {
3814 : /// Consume this guard object without performing the metric updates it would do on `drop()`.
3815 : /// The caller promises to perform the metric updates manually.
3816 1944 : pub fn will_decrement_manually(mut self) {
3817 1944 : let RemoteTimelineClientCallMetricGuard {
3818 1944 : calls_counter_pair,
3819 1944 : bytes_finished,
3820 1944 : } = &mut self;
3821 1944 : calls_counter_pair.take();
3822 1944 : bytes_finished.take();
3823 1944 : }
3824 : }
3825 :
3826 : impl Drop for RemoteTimelineClientCallMetricGuard {
3827 1961 : fn drop(&mut self) {
3828 1961 : let RemoteTimelineClientCallMetricGuard {
3829 1961 : calls_counter_pair,
3830 1961 : bytes_finished,
3831 1961 : } = self;
3832 1961 : if let Some(guard) = calls_counter_pair.take() {
3833 17 : guard.dec();
3834 1944 : }
3835 1961 : if let Some((bytes_finished_metric, value)) = bytes_finished {
3836 0 : bytes_finished_metric.inc_by(*value);
3837 1961 : }
3838 1961 : }
3839 : }
3840 :
3841 : /// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
3842 : /// track the byte size of this call in applicable metric(s).
3843 : pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
3844 : /// Do not account for this call's byte size in any metrics.
3845 : /// The `reason` field is there to make the call sites self-documenting
3846 : /// about why they don't need the metric.
3847 : DontTrackSize { reason: &'static str },
3848 : /// Track the byte size of the call in applicable metric(s).
3849 : Bytes(u64),
3850 : }
3851 :
3852 : impl RemoteTimelineClientMetrics {
3853 : /// Update the metrics that change when a call to the remote timeline client instance starts.
3854 : ///
3855 : /// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
3856 : /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
3857 : /// is more suitable.
3858 : /// Never do both.
3859 1961 : pub(crate) fn call_begin(
3860 1961 : &self,
3861 1961 : file_kind: &RemoteOpFileKind,
3862 1961 : op_kind: &RemoteOpKind,
3863 1961 : size: RemoteTimelineClientMetricsCallTrackSize,
3864 1961 : ) -> RemoteTimelineClientCallMetricGuard {
3865 1961 : let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
3866 1961 : calls_counter_pair.inc();
3867 :
3868 1961 : let bytes_finished = match size {
3869 1067 : RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
3870 1067 : // nothing to do
3871 1067 : None
3872 : }
3873 894 : RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
3874 894 : self.bytes_started_counter(file_kind, op_kind).inc_by(size);
3875 894 : let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
3876 894 : Some((finished_counter, size))
3877 : }
3878 : };
3879 1961 : RemoteTimelineClientCallMetricGuard {
3880 1961 : calls_counter_pair: Some(calls_counter_pair),
3881 1961 : bytes_finished,
3882 1961 : }
3883 1961 : }
3884 :
3885 : /// Manually update the metrics that track completions, instead of using the guard object.
3886 : /// Using the guard object is generally preferable.
3887 : /// See [`call_begin`](Self::call_begin) for more context.
3888 1855 : pub(crate) fn call_end(
3889 1855 : &self,
3890 1855 : file_kind: &RemoteOpFileKind,
3891 1855 : op_kind: &RemoteOpKind,
3892 1855 : size: RemoteTimelineClientMetricsCallTrackSize,
3893 1855 : ) {
3894 1855 : let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
3895 1855 : calls_counter_pair.dec();
3896 1855 : match size {
3897 973 : RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
3898 882 : RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
3899 882 : self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
3900 882 : }
3901 : }
3902 1855 : }
3903 : }
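// A minimal sketch (editor's addition) pairing `call_begin` with the guard's
// drop-side bookkeeping; the alternative is `will_decrement_manually()`
// followed by a later `call_end` with the same arguments -- never both.
#[allow(dead_code)]
fn example_tracked_call(
    metrics: &RemoteTimelineClientMetrics,
    file_kind: RemoteOpFileKind,
    op_kind: RemoteOpKind,
) {
    // increments the calls counter pair and bytes_started by 1024
    let guard = metrics.call_begin(
        &file_kind,
        &op_kind,
        RemoteTimelineClientMetricsCallTrackSize::Bytes(1024),
    );
    // ... perform the remote operation ...
    drop(guard); // decrements the pair and adds 1024 to bytes_finished
}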
3904 :
3905 : impl Drop for RemoteTimelineClientMetrics {
3906 10 : fn drop(&mut self) {
3907 10 : let RemoteTimelineClientMetrics {
3908 10 : tenant_id,
3909 10 : shard_id,
3910 10 : timeline_id,
3911 10 : remote_physical_size_gauge,
3912 10 : calls,
3913 10 : bytes_started_counter,
3914 10 : bytes_finished_counter,
3915 10 : projected_remote_consistent_lsn_gauge,
3916 10 : } = self;
3917 12 : for ((a, b), _) in calls.get_mut().unwrap().drain() {
3918 12 : let mut res = [Ok(()), Ok(())];
3919 12 : REMOTE_TIMELINE_CLIENT_CALLS
3920 12 : .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]);
3921 12 : // don't care about results
3922 12 : }
3923 10 : for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
3924 3 : let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
3925 3 : tenant_id,
3926 3 : shard_id,
3927 3 : timeline_id,
3928 3 : a,
3929 3 : b,
3930 3 : ]);
3931 3 : }
3932 10 : for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
3933 3 : let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
3934 3 : tenant_id,
3935 3 : shard_id,
3936 3 : timeline_id,
3937 3 : a,
3938 3 : b,
3939 3 : ]);
3940 3 : }
3941 10 : {
3942 10 : let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
3943 10 : let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
3944 10 : }
3945 10 : {
3946 10 : let _ = projected_remote_consistent_lsn_gauge;
3947 10 : let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
3948 10 : tenant_id,
3949 10 : shard_id,
3950 10 : timeline_id,
3951 10 : ]);
3952 10 : }
3953 10 : }
3954 : }
3955 :
3956 : /// Wrapper future that measures the time spent by a remote storage operation,
3957 : /// and records the time and success/failure as a prometheus metric.
3958 : pub(crate) trait MeasureRemoteOp<O, E>: Sized + Future<Output = Result<O, E>> {
3959 1678 : async fn measure_remote_op(
3960 1678 : self,
3961 1678 : task_kind: Option<TaskKind>, // not all caller contexts have a RequestContext / TaskKind handy
3962 1678 : file_kind: RemoteOpFileKind,
3963 1678 : op: RemoteOpKind,
3964 1678 : metrics: Arc<RemoteTimelineClientMetrics>,
3965 1678 : ) -> Result<O, E> {
3966 1678 : let start = Instant::now();
3967 1678 : let res = self.await;
3968 1667 : let duration = start.elapsed();
3969 1667 : let status = if res.is_ok() { &"success" } else { &"failure" };
3970 1667 : metrics
3971 1667 : .remote_operation_time(task_kind, &file_kind, &op, status)
3972 1667 : .observe(duration.as_secs_f64());
3973 1667 : res
3974 1667 : }
3975 : }
3976 :
3977 : impl<Fut, O, E> MeasureRemoteOp<O, E> for Fut where Fut: Sized + Future<Output = Result<O, E>> {}
3978 :
3979 : pub mod tokio_epoll_uring {
3980 : use std::collections::HashMap;
3981 : use std::sync::{Arc, Mutex};
3982 :
3983 : use metrics::{Histogram, LocalHistogram, UIntGauge, register_histogram, register_int_counter};
3984 : use once_cell::sync::Lazy;
3985 :
3986 : /// Shared storage for tokio-epoll-uring thread local metrics.
3987 : pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
3988 121 : Lazy::new(|| {
3989 121 : let slots_submission_queue_depth = register_histogram!(
3990 121 : "pageserver_tokio_epoll_uring_slots_submission_queue_depth",
3991 121 : "The slots waiters queue depth of each tokio_epoll_uring system",
3992 121 : vec![
3993 121 : 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
3994 121 : ],
3995 121 : )
3996 121 : .expect("failed to define a metric");
3997 121 : ThreadLocalMetricsStorage {
3998 121 : observers: Mutex::new(HashMap::new()),
3999 121 : slots_submission_queue_depth,
4000 121 : }
4001 121 : });
4002 :
4003 : pub struct ThreadLocalMetricsStorage {
4004 : /// List of thread local metrics observers.
4005 : observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
4006 : /// A histogram shared between all thread local systems
4007 : /// for collecting slots submission queue depth.
4008 : slots_submission_queue_depth: Histogram,
4009 : }
4010 :
4011 : /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
4012 : /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
4013 : ///
4014 : /// The System makes observations into [`Self`] and periodically, the collector
4015 : /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
4016 : ///
4017 : /// [`LocalHistogram`] is `!Send`, so we need to put it behind a [`Mutex`].
4018 : /// But except for the periodic flush, the lock is uncontended, so there's no waiting
4019 : /// for the cache coherence protocol to get an exclusive cache line.
4020 : pub struct ThreadLocalMetrics {
4021 : /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
4022 : slots_submission_queue_depth: Mutex<LocalHistogram>,
4023 : }
4024 :
4025 : impl ThreadLocalMetricsStorage {
4026 : /// Registers a new thread local system. Returns a thread local metrics observer.
4027 567 : pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
4028 567 : let per_system_metrics = Arc::new(ThreadLocalMetrics::new(
4029 567 : self.slots_submission_queue_depth.local(),
4030 567 : ));
4031 567 : let mut g = self.observers.lock().unwrap();
4032 567 : g.insert(id, Arc::clone(&per_system_metrics));
4033 567 : per_system_metrics
4034 567 : }
4035 :
4036 : /// Removes metrics observer for a thread local system.
4037 : /// This should be called before dropping a thread local system.
4038 121 : pub fn remove_system(&self, id: u64) {
4039 121 : let mut g = self.observers.lock().unwrap();
4040 121 : g.remove(&id);
4041 121 : }
4042 :
4043 : /// Flush all thread local metrics to the shared storage.
4044 0 : pub fn flush_thread_local_metrics(&self) {
4045 0 : let g = self.observers.lock().unwrap();
4046 0 : g.values().for_each(|local| {
4047 0 : local.flush();
4048 0 : });
4049 0 : }
4050 : }
4051 :
4052 : impl ThreadLocalMetrics {
4053 567 : pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
4054 567 : ThreadLocalMetrics {
4055 567 : slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth),
4056 567 : }
4057 567 : }
4058 :
4059 : /// Flushes the thread local metrics to shared aggregator.
4060 0 : pub fn flush(&self) {
4061 0 : let Self {
4062 0 : slots_submission_queue_depth,
4063 0 : } = self;
4064 0 : slots_submission_queue_depth.lock().unwrap().flush();
4065 0 : }
4066 : }
4067 :
4068 : impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
4069 396535 : fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
4070 396535 : let Self {
4071 396535 : slots_submission_queue_depth,
4072 396535 : } = self;
4073 396535 : slots_submission_queue_depth
4074 396535 : .lock()
4075 396535 : .unwrap()
4076 396535 : .observe(queue_depth as f64);
4077 396535 : }
4078 : }
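// A minimal sketch (editor's addition) of the observer lifecycle: each
// thread-local uring system registers under a unique id, observes into its
// local (buffered) histogram while running, and deregisters on shutdown; the
// collector's `flush_thread_local_metrics` periodically folds the buffered
// samples into the shared histogram.
#[allow(dead_code)]
fn example_system_lifecycle(system_id: u64) {
    let local = THREAD_LOCAL_METRICS_STORAGE.register_system(system_id);
    // ... the system reports queue depths through `local` while it runs ...
    local.flush(); // push any still-buffered samples
    THREAD_LOCAL_METRICS_STORAGE.remove_system(system_id);
}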
4079 :
4080 : pub struct Collector {
4081 : descs: Vec<metrics::core::Desc>,
4082 : systems_created: UIntGauge,
4083 : systems_destroyed: UIntGauge,
4084 : thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
4085 : }
4086 :
4087 : impl metrics::core::Collector for Collector {
4088 0 : fn desc(&self) -> Vec<&metrics::core::Desc> {
4089 0 : self.descs.iter().collect()
4090 0 : }
4091 :
4092 0 : fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
4093 0 : let mut mfs = Vec::with_capacity(Self::NMETRICS);
4094 0 : let tokio_epoll_uring::metrics::GlobalMetrics {
4095 0 : systems_created,
4096 0 : systems_destroyed,
4097 0 : } = tokio_epoll_uring::metrics::global();
4098 0 : self.systems_created.set(systems_created);
4099 0 : mfs.extend(self.systems_created.collect());
4100 0 : self.systems_destroyed.set(systems_destroyed);
4101 0 : mfs.extend(self.systems_destroyed.collect());
4102 0 :
4103 0 : self.thread_local_metrics_storage
4104 0 : .flush_thread_local_metrics();
4105 0 :
4106 0 : mfs.extend(
4107 0 : self.thread_local_metrics_storage
4108 0 : .slots_submission_queue_depth
4109 0 : .collect(),
4110 0 : );
4111 0 : mfs
4112 0 : }
4113 : }
4114 :
4115 : impl Collector {
4116 : const NMETRICS: usize = 3;
4117 :
4118 : #[allow(clippy::new_without_default)]
4119 0 : pub fn new() -> Self {
4120 0 : let mut descs = Vec::new();
4121 0 :
4122 0 : let systems_created = UIntGauge::new(
4123 0 : "pageserver_tokio_epoll_uring_systems_created",
4124 0 : "counter of tokio-epoll-uring systems that were created",
4125 0 : )
4126 0 : .unwrap();
4127 0 : descs.extend(
4128 0 : metrics::core::Collector::desc(&systems_created)
4129 0 : .into_iter()
4130 0 : .cloned(),
4131 0 : );
4132 0 :
4133 0 : let systems_destroyed = UIntGauge::new(
4134 0 : "pageserver_tokio_epoll_uring_systems_destroyed",
4135 0 : "counter of tokio-epoll-uring systems that were destroyed",
4136 0 : )
4137 0 : .unwrap();
4138 0 : descs.extend(
4139 0 : metrics::core::Collector::desc(&systems_destroyed)
4140 0 : .into_iter()
4141 0 : .cloned(),
4142 0 : );
4143 0 :
4144 0 : Self {
4145 0 : descs,
4146 0 : systems_created,
4147 0 : systems_destroyed,
4148 0 : thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
4149 0 : }
4150 0 : }
4151 : }
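// Wiring it up (editor's sketch): the collector is meant to be registered with
// the process-wide prometheus registry at startup. Assuming the `metrics`
// facade re-exports prometheus' `register` (treat the exact path as an
// assumption), that would look like:
//
//     metrics::register(Box::new(Collector::new())).unwrap();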
4152 :
4153 121 : pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
4154 121 : register_int_counter!(
4155 121 : "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
4156 121 : "Number of times where thread_local_system creation spanned multiple executor threads",
4157 121 : )
4158 121 : .unwrap()
4159 121 : });
4160 :
4161 0 : pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
4162 0 : register_int_counter!(
4163 0 : "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
4164 0 : "Number of times thread_local_system creation failed and was retried after back-off.",
4165 0 : )
4166 0 : .unwrap()
4167 0 : });
4168 : }
4169 :
4170 : pub(crate) struct GlobalAndPerTenantIntCounter {
4171 : global: IntCounter,
4172 : per_tenant: IntCounter,
4173 : }
4174 :
4175 : impl GlobalAndPerTenantIntCounter {
4176 : #[inline(always)]
4177 0 : pub(crate) fn inc(&self) {
4178 0 : self.inc_by(1)
4179 0 : }
4180 : #[inline(always)]
4181 112854 : pub(crate) fn inc_by(&self, n: u64) {
4182 112854 : self.global.inc_by(n);
4183 112854 : self.per_tenant.inc_by(n);
4184 112854 : }
4185 : }
4186 :
4187 : pub(crate) mod tenant_throttling {
4188 : use metrics::register_int_counter_vec;
4189 : use once_cell::sync::Lazy;
4190 : use utils::shard::TenantShardId;
4191 :
4192 : use super::GlobalAndPerTenantIntCounter;
4193 :
4194 : pub(crate) struct Metrics<const KIND: usize> {
4195 : pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
4196 : pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
4197 : pub(super) wait_time: GlobalAndPerTenantIntCounter,
4198 : pub(super) count_throttled: GlobalAndPerTenantIntCounter,
4199 : }
4200 :
4201 109 : static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
4202 109 : register_int_counter_vec!(
4203 109 : "pageserver_tenant_throttling_count_accounted_start_global",
4204 109 : "Count of tenant throttling starts, by kind of throttle.",
4205 109 : &["kind"]
4206 109 : )
4207 109 : .unwrap()
4208 109 : });
4209 109 : static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
4210 109 : register_int_counter_vec!(
4211 109 : "pageserver_tenant_throttling_count_accounted_start",
4212 109 : "Count of tenant throttling starts, by kind of throttle.",
4213 109 : &["kind", "tenant_id", "shard_id"]
4214 109 : )
4215 109 : .unwrap()
4216 109 : });
4217 109 : static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
4218 109 : register_int_counter_vec!(
4219 109 : "pageserver_tenant_throttling_count_accounted_finish_global",
4220 109 : "Count of tenant throttling finishes, by kind of throttle.",
4221 109 : &["kind"]
4222 109 : )
4223 109 : .unwrap()
4224 109 : });
4225 109 : static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
4226 109 : register_int_counter_vec!(
4227 109 : "pageserver_tenant_throttling_count_accounted_finish",
4228 109 : "Count of tenant throttling finishes, by kind of throttle.",
4229 109 : &["kind", "tenant_id", "shard_id"]
4230 109 : )
4231 109 : .unwrap()
4232 109 : });
4233 109 : static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
4234 109 : register_int_counter_vec!(
4235 109 : "pageserver_tenant_throttling_wait_usecs_sum_global",
4236 109 : "Sum of microseconds that spent waiting throttle by kind of throttle.",
4237 109 : &["kind"]
4238 109 : )
4239 109 : .unwrap()
4240 109 : });
4241 109 : static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
4242 109 : register_int_counter_vec!(
4243 109 : "pageserver_tenant_throttling_wait_usecs_sum",
4244 109 : "Sum of microseconds that spent waiting throttle by kind of throttle.",
4245 109 : &["kind", "tenant_id", "shard_id"]
4246 109 : )
4247 109 : .unwrap()
4248 109 : });
4249 :
4250 109 : static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
4251 109 : register_int_counter_vec!(
4252 109 : "pageserver_tenant_throttling_count_global",
4253 109 : "Count of tenant throttlings, by kind of throttle.",
4254 109 : &["kind"]
4255 109 : )
4256 109 : .unwrap()
4257 109 : });
4258 109 : static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
4259 109 : register_int_counter_vec!(
4260 109 : "pageserver_tenant_throttling_count",
4261 109 : "Count of tenant throttlings, by kind of throttle.",
4262 109 : &["kind", "tenant_id", "shard_id"]
4263 109 : )
4264 109 : .unwrap()
4265 109 : });
4266 :
4267 : const KINDS: &[&str] = &["pagestream"];
4268 : pub type Pagestream = Metrics<0>;
4269 :
4270 : impl<const KIND: usize> Metrics<KIND> {
4271 118 : pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
4272 118 : let per_tenant_label_values = &[
4273 118 : KINDS[KIND],
4274 118 : &tenant_shard_id.tenant_id.to_string(),
4275 118 : &tenant_shard_id.shard_slug().to_string(),
4276 118 : ];
4277 118 : Metrics {
4278 118 : count_accounted_start: {
4279 118 : GlobalAndPerTenantIntCounter {
4280 118 : global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]),
4281 118 : per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
4282 118 : .with_label_values(per_tenant_label_values),
4283 118 : }
4284 118 : },
4285 118 : count_accounted_finish: {
4286 118 : GlobalAndPerTenantIntCounter {
4287 118 : global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]),
4288 118 : per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
4289 118 : .with_label_values(per_tenant_label_values),
4290 118 : }
4291 118 : },
4292 118 : wait_time: {
4293 118 : GlobalAndPerTenantIntCounter {
4294 118 : global: WAIT_USECS.with_label_values(&[KINDS[KIND]]),
4295 118 : per_tenant: WAIT_USECS_PER_TENANT
4296 118 : .with_label_values(per_tenant_label_values),
4297 118 : }
4298 118 : },
4299 118 : count_throttled: {
4300 118 : GlobalAndPerTenantIntCounter {
4301 118 : global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]),
4302 118 : per_tenant: WAIT_COUNT_PER_TENANT
4303 118 : .with_label_values(per_tenant_label_values),
4304 118 : }
4305 118 : },
4306 118 : }
4307 118 : }
4308 : }
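// A minimal sketch (editor's addition): the const-generic `KIND` selects a
// label value from `KINDS` at compile time, so each throttle kind gets its own
// statically-typed metrics bundle; `Pagestream` (= `Metrics<0>`) is the only
// kind defined so far.
#[allow(dead_code)]
fn example_pagestream_metrics(tenant_shard_id: &TenantShardId) -> Pagestream {
    Pagestream::new(tenant_shard_id)
}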
4309 :
4310 0 : pub(crate) fn preinitialize_global_metrics() {
4311 0 : Lazy::force(&COUNT_ACCOUNTED_START);
4312 0 : Lazy::force(&COUNT_ACCOUNTED_FINISH);
4313 0 : Lazy::force(&WAIT_USECS);
4314 0 : Lazy::force(&WAIT_COUNT);
4315 0 : }
4316 :
4317 3 : pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
4318 12 : for m in &[
4319 3 : &COUNT_ACCOUNTED_START_PER_TENANT,
4320 3 : &COUNT_ACCOUNTED_FINISH_PER_TENANT,
4321 3 : &WAIT_USECS_PER_TENANT,
4322 3 : &WAIT_COUNT_PER_TENANT,
4323 3 : ] {
4324 24 : for kind in KINDS {
4325 12 : let _ = m.remove_label_values(&[
4326 12 : kind,
4327 12 : &tenant_shard_id.tenant_id.to_string(),
4328 12 : &tenant_shard_id.shard_slug().to_string(),
4329 12 : ]);
4330 12 : }
4331 : }
4332 3 : }
4333 : }
4334 :
4335 : pub(crate) mod disk_usage_based_eviction {
4336 : use super::*;
4337 :
4338 : pub(crate) struct Metrics {
4339 : pub(crate) tenant_collection_time: Histogram,
4340 : pub(crate) tenant_layer_count: Histogram,
4341 : pub(crate) layers_collected: IntCounter,
4342 : pub(crate) layers_selected: IntCounter,
4343 : pub(crate) layers_evicted: IntCounter,
4344 : }
4345 :
4346 : impl Default for Metrics {
4347 0 : fn default() -> Self {
4348 0 : let tenant_collection_time = register_histogram!(
4349 0 : "pageserver_disk_usage_based_eviction_tenant_collection_seconds",
4350 0 : "Time spent collecting layers from a tenant -- not normalized by collected layer amount",
4351 0 : vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
4352 0 : )
4353 0 : .unwrap();
4354 0 :
4355 0 : let tenant_layer_count = register_histogram!(
4356 0 : "pageserver_disk_usage_based_eviction_tenant_collected_layers",
4357 0 : "Amount of layers gathered from a tenant",
4358 0 : vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
4359 0 : )
4360 0 : .unwrap();
4361 0 :
4362 0 : let layers_collected = register_int_counter!(
4363 0 : "pageserver_disk_usage_based_eviction_collected_layers_total",
4364 0 : "Amount of layers collected"
4365 0 : )
4366 0 : .unwrap();
4367 0 :
4368 0 : let layers_selected = register_int_counter!(
4369 0 : "pageserver_disk_usage_based_eviction_select_layers_total",
4370 0 : "Amount of layers selected"
4371 0 : )
4372 0 : .unwrap();
4373 0 :
4374 0 : let layers_evicted = register_int_counter!(
4375 0 : "pageserver_disk_usage_based_eviction_evicted_layers_total",
4376 0 : "Amount of layers successfully evicted"
4377 0 : )
4378 0 : .unwrap();
4379 0 :
4380 0 : Self {
4381 0 : tenant_collection_time,
4382 0 : tenant_layer_count,
4383 0 : layers_collected,
4384 0 : layers_selected,
4385 0 : layers_evicted,
4386 0 : }
4387 0 : }
4388 : }
4389 :
4390 : pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
4391 : }
4392 :
4393 106 : static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
4394 106 : register_uint_gauge_vec!(
4395 106 : "pageserver_tokio_executor_thread_configured_count",
4396 106 : "Total number of configued tokio executor threads in the process.
4397 106 : The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
4398 106 : &["setup"],
4399 106 : )
4400 106 : .unwrap()
4401 106 : });
4402 :
4403 106 : pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
4404 : static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
4405 106 : let _guard = SERIALIZE.lock().unwrap();
4406 106 : TOKIO_EXECUTOR_THREAD_COUNT.reset();
4407 106 : TOKIO_EXECUTOR_THREAD_COUNT
4408 106 : .get_metric_with_label_values(&[setup])
4409 106 : .unwrap()
4410 106 : .set(u64::try_from(num_threads.get()).unwrap());
4411 106 : }
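// Example call (editor's addition; the setup label value is illustrative):
// reset-then-set under the static mutex ensures at most one `setup` label
// series is exposed even if the runtime configuration is applied repeatedly.
#[allow(dead_code)]
fn example_set_runtime_setup() {
    set_tokio_runtime_setup("multiple-runtimes", NonZeroUsize::new(8).unwrap());
}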
4412 :
4413 0 : pub(crate) static BASEBACKUP_CACHE_READ: Lazy<IntCounterVec> = Lazy::new(|| {
4414 0 : register_int_counter_vec!(
4415 0 : "pageserver_basebackup_cache_read_total",
4416 0 : "Number of read accesses to the basebackup cache grouped by hit/miss/error",
4417 0 : &["result"]
4418 0 : )
4419 0 : .expect("failed to define a metric")
4420 0 : });
4421 :
4422 0 : pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
4423 0 : register_int_counter_vec!(
4424 0 : "pageserver_basebackup_cache_prepare_total",
4425 0 : "Number of prepare requests processed by the basebackup cache grouped by ok/skip/error",
4426 0 : &["result"]
4427 0 : )
4428 0 : .expect("failed to define a metric")
4429 0 : });
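
// Illustrative sketch (hypothetical helper): callers record exactly one
// outcome per cache access, using the label values listed in the help
// strings above.
#[allow(dead_code)]
fn example_record_basebackup_cache_read(hit: bool) {
    let result = if hit { "hit" } else { "miss" };
    BASEBACKUP_CACHE_READ.with_label_values(&[result]).inc();
    // A completed prepare request would be recorded analogously, e.g.:
    // BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]).inc();
}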
4430 :
4431 0 : pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
4432 0 : register_int_gauge!(
4433 0 : "pageserver_basebackup_cache_entries_total",
4434 0 : "Number of entries in the basebackup cache"
4435 0 : )
4436 0 : .expect("failed to define a metric")
4437 0 : });
4438 :
4439 : // FIXME: Support basebackup cache size metrics.
4440 : #[allow(dead_code)]
4441 0 : pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
4442 0 : register_int_gauge!(
4443 0 : "pageserver_basebackup_cache_size_bytes",
4444 0 : "Total size of all basebackup cache entries on disk in bytes"
4445 0 : )
4446 0 : .expect("failed to define a metric")
4447 0 : });
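
// Illustrative sketch (hypothetical helper): whenever the cache adds or
// removes an entry, both gauges would be set to the new totals. The size
// gauge is registered above but not yet wired up (see the FIXME).
#[allow(dead_code)]
fn example_update_basebackup_cache_gauges(entries: i64, total_bytes: i64) {
    BASEBACKUP_CACHE_ENTRIES.set(entries);
    BASEBACKUP_CACHE_SIZE.set(total_bytes);
}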
4448 :
4449 0 : static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
4450 0 : register_uint_gauge_vec!(
4451 0 : "pageserver_config_ignored_items",
4452 0 : "TOML items present in the on-disk configuration file but ignored by the pageserver config parser.\
4453 0 : The `item` label is the dot-separated path of the ignored item in the on-disk configuration file.\
4454 0 : The value for an unknown config item is always 1.\
4455 0 : There is a special label value \"\", which is 0, so that there is always a metric exposed (simplifies dashboards).",
4456 0 : &["item"]
4457 0 : )
4458 0 : .unwrap()
4459 0 : });
4460 :
4461 0 : pub fn preinitialize_metrics(
4462 0 : conf: &'static PageServerConf,
4463 0 : ignored: config::ignored_fields::Paths,
4464 0 : ) {
4465 0 : set_page_service_config_max_batch_size(&conf.page_service_pipelining);
4466 0 :
4467 0 : PAGESERVER_CONFIG_IGNORED_ITEMS
4468 0 : .with_label_values(&[""])
4469 0 : .set(0);
4470 0 : for path in &ignored.paths {
4471 0 : PAGESERVER_CONFIG_IGNORED_ITEMS
4472 0 : .with_label_values(&[path])
4473 0 : .set(1);
4474 0 : }
4475 :
4476 : // Python tests need these, and we alert on some of them.
4477 : //
4478 : // FIXME(4813): make it so that we have no top-level metrics, as this fn easily falls out of
4479 : // sync:
4480 : // - global metrics reside in a Lazy<PageserverMetrics>
4481 : // - access via crate::metrics::PS_METRICS.some_metric.inc()
4482 : // - could move the statics into TimelineMetrics::new()?
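    //
    // A minimal sketch of what that refactor could look like (all names here
    // are hypothetical):
    //
    //     pub(crate) struct PageserverMetrics {
    //         pub(crate) unexpected_ondemand_downloads: IntCounter,
    //         // ... one field per global metric ...
    //     }
    //     pub(crate) static PS_METRICS: Lazy<PageserverMetrics> =
    //         Lazy::new(PageserverMetrics::default);
    //
    // A single `Lazy::force(&PS_METRICS)` would then replace the per-metric
    // lists below.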
4483 :
4484 : // counters
4485 0 : [
4486 0 : &UNEXPECTED_ONDEMAND_DOWNLOADS,
4487 0 : &WALRECEIVER_STARTED_CONNECTIONS,
4488 0 : &WALRECEIVER_BROKER_UPDATES,
4489 0 : &WALRECEIVER_CANDIDATES_ADDED,
4490 0 : &WALRECEIVER_CANDIDATES_REMOVED,
4491 0 : &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
4492 0 : &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
4493 0 : &REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
4494 0 : &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
4495 0 : &CIRCUIT_BREAKERS_BROKEN,
4496 0 : &CIRCUIT_BREAKERS_UNBROKEN,
4497 0 : &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
4498 0 : &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS,
4499 0 : ]
4500 0 : .into_iter()
4501 0 : .for_each(|c| {
4502 0 : Lazy::force(c);
4503 0 : });
4504 0 :
4505 0 : // Deletion queue stats
4506 0 : Lazy::force(&DELETION_QUEUE);
4507 0 :
4508 0 : // Tenant stats
4509 0 : Lazy::force(&TENANT);
4510 0 :
4511 0 : // Tenant manager stats
4512 0 : Lazy::force(&TENANT_MANAGER);
4513 0 :
4514 0 : Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
4515 0 : Lazy::force(&disk_usage_based_eviction::METRICS);
4516 :
4517 0 : for state_name in pageserver_api::models::TenantState::VARIANTS {
4518 0 : // Initialize the gauge for every state label; otherwise the time series might
4519 0 : // appear to retain values from before the last restart.
4520 0 : TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
4521 0 : }
4522 :
4523 : // countervecs
4524 0 : [
4525 0 : &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
4526 0 : &SMGR_QUERY_STARTED_GLOBAL,
4527 0 : &PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL,
4528 0 : ]
4529 0 : .into_iter()
4530 0 : .for_each(|c| {
4531 0 : Lazy::force(c);
4532 0 : });
4533 0 :
4534 0 : // gauges
4535 0 : WALRECEIVER_ACTIVE_MANAGERS.get();
4536 0 :
4537 0 : // histograms
4538 0 : [
4539 0 : &LAYERS_PER_READ_GLOBAL,
4540 0 : &LAYERS_PER_READ_BATCH_GLOBAL,
4541 0 : &LAYERS_PER_READ_AMORTIZED_GLOBAL,
4542 0 : &DELTAS_PER_READ_GLOBAL,
4543 0 : &WAIT_LSN_TIME,
4544 0 : &WAL_REDO_TIME,
4545 0 : &WAL_REDO_RECORDS_HISTOGRAM,
4546 0 : &WAL_REDO_BYTES_HISTOGRAM,
4547 0 : &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
4548 0 : &PAGE_SERVICE_BATCH_SIZE_GLOBAL,
4549 0 : &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
4550 0 : ]
4551 0 : .into_iter()
4552 0 : .for_each(|h| {
4553 0 : Lazy::force(h);
4554 0 : });
4555 0 :
4556 0 : // Custom
4557 0 : Lazy::force(&BASEBACKUP_QUERY_TIME);
4558 0 : Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
4559 0 : Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);
4560 0 :
4561 0 : tenant_throttling::preinitialize_global_metrics();
4562 0 : wait_ondemand_download_time::preinitialize_global_metrics();
4563 0 : }
|