Line data Source code
1 : use metrics::metric_vec_duration::DurationResultObserver;
2 : use metrics::{
3 : register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
4 : register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
5 : register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
6 : HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
7 : };
8 : use once_cell::sync::Lazy;
9 : use strum::{EnumCount, IntoEnumIterator, VariantNames};
10 : use strum_macros::{EnumVariantNames, IntoStaticStr};
11 : use utils::id::{TenantId, TimelineId};
12 :
/// Prometheus histogram buckets (in seconds) for operations in the critical
/// path, i.e. operations that directly affect the latency of user queries.
///
/// The buckets are decades from one microsecond up to one hundred seconds:
/// dense enough for the microsecond/millisecond range, yet extending far
/// enough up to distinguish "bad" from "really bad".
const CRITICAL_OP_BUCKETS: &[f64] = &[
    1e-6, 1e-5, 1e-4, // 1 us, 10 us, 100 us
    1e-3, 1e-2, 1e-1, // 1 ms, 10 ms, 100 ms
    1e0, 1e1, 1e2, // 1 s, 10 s, 100 s
];
25 :
// Metrics collected on operations on the storage repository.
/// Kinds of storage operation.
///
/// Every variant carries an explicit `#[strum(serialize = "...")]` string
/// (lower-case words separated by spaces, e.g. `"layer flush"`).
/// NOTE(review): presumably these strings are the values of the `operation`
/// label on the storage-operation metrics — confirm against the users of
/// this enum.
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub enum StorageTimeOperation {
    /// Serialized as `"layer flush"`.
    #[strum(serialize = "layer flush")]
    LayerFlush,

    /// Serialized as `"compact"`.
    #[strum(serialize = "compact")]
    Compact,

    /// Serialized as `"create images"`.
    #[strum(serialize = "create images")]
    CreateImages,

    /// Serialized as `"logical size"`.
    #[strum(serialize = "logical size")]
    LogicalSize,

    /// Serialized as `"imitate logical size"`.
    #[strum(serialize = "imitate logical size")]
    ImitateLogicalSize,

    /// Serialized as `"load layer map"`.
    #[strum(serialize = "load layer map")]
    LoadLayerMap,

    /// Serialized as `"gc"`.
    #[strum(serialize = "gc")]
    Gc,

    /// Serialized as `"create tenant"`.
    #[strum(serialize = "create tenant")]
    CreateTenant,
}
54 :
55 554 : pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
56 554 : register_counter_vec!(
57 554 : "pageserver_storage_operations_seconds_sum",
58 554 : "Total time spent on storage operations with operation, tenant and timeline dimensions",
59 554 : &["operation", "tenant_id", "timeline_id"],
60 554 : )
61 554 : .expect("failed to define a metric")
62 554 : });
63 :
64 554 : pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
65 554 : register_int_counter_vec!(
66 554 : "pageserver_storage_operations_seconds_count",
67 554 : "Count of storage operations with operation, tenant and timeline dimensions",
68 554 : &["operation", "tenant_id", "timeline_id"],
69 554 : )
70 554 : .expect("failed to define a metric")
71 554 : });
72 :
// Histogram buckets (seconds) for background operations like compaction, GC
// and size calculation: decades from 10 ms up to 1000 s.
const STORAGE_OP_BUCKETS: &[f64] = &[
    1e-2, // 10 ms
    1e-1, // 100 ms
    1e0, 1e1, 1e2, 1e3, // 1 s, 10 s, 100 s, 1000 s
];
75 :
76 554 : pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
77 554 : register_histogram_vec!(
78 554 : "pageserver_storage_operations_seconds_global",
79 554 : "Time spent on storage operations",
80 554 : &["operation"],
81 554 : STORAGE_OP_BUCKETS.into(),
82 554 : )
83 554 : .expect("failed to define a metric")
84 554 : });
85 :
86 576 : pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
87 576 : register_histogram!(
88 576 : "pageserver_read_num_fs_layers",
89 576 : "Number of persistent layers accessed for processing a read request, including those in the cache",
90 576 : vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
91 576 : )
92 576 : .expect("failed to define a metric")
93 576 : });
94 :
95 : // Metrics collected on operations on the storage repository.
96 576 : pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
97 576 : register_histogram!(
98 576 : "pageserver_getpage_reconstruct_seconds",
99 576 : "Time spent in reconstruct_value (reconstruct a page from deltas)",
100 576 : CRITICAL_OP_BUCKETS.into(),
101 576 : )
102 576 : .expect("failed to define a metric")
103 576 : });
104 :
105 575 : pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
106 575 : register_int_counter!(
107 575 : "pageserver_materialized_cache_hits_direct_total",
108 575 : "Number of cache hits from materialized page cache without redo",
109 575 : )
110 575 : .expect("failed to define a metric")
111 575 : });
112 :
113 490 : pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
114 490 : register_histogram!(
115 490 : "pageserver_getpage_get_reconstruct_data_seconds",
116 490 : "Time spent in get_reconstruct_value_data",
117 490 : CRITICAL_OP_BUCKETS.into(),
118 490 : )
119 490 : .expect("failed to define a metric")
120 490 : });
121 :
122 575 : pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
123 575 : register_int_counter!(
124 575 : "pageserver_materialized_cache_hits_total",
125 575 : "Number of cache hits from materialized page cache",
126 575 : )
127 575 : .expect("failed to define a metric")
128 575 : });
129 :
/// Pre-resolved page cache read counters, one field per label combination.
///
/// The `read_accesses_*` fields are children of
/// `pageserver_page_cache_read_accesses_total` (label `key_kind`); the
/// `read_hits_*` fields are children of
/// `pageserver_page_cache_read_hits_total` (labels `key_kind`, `hit_kind`).
pub struct PageCacheMetrics {
    /// Read accesses with `key_kind="materialized_page"`.
    pub read_accesses_materialized_page: IntCounter,
    /// Read accesses with `key_kind="ephemeral"`.
    pub read_accesses_ephemeral: IntCounter,
    /// Read accesses with `key_kind="immutable"`.
    pub read_accesses_immutable: IntCounter,

    /// Read hits with `key_kind="ephemeral"`, `hit_kind="-"`.
    pub read_hits_ephemeral: IntCounter,
    /// Read hits with `key_kind="immutable"`, `hit_kind="-"`.
    pub read_hits_immutable: IntCounter,
    /// Read hits with `key_kind="materialized_page"`, `hit_kind="exact"`.
    pub read_hits_materialized_page_exact: IntCounter,
    /// Read hits with `key_kind="materialized_page"`, `hit_kind="older_lsn"`.
    pub read_hits_materialized_page_older_lsn: IntCounter,
}
140 :
141 490 : static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
142 490 : register_int_counter_vec!(
143 490 : "pageserver_page_cache_read_hits_total",
144 490 : "Number of read accesses to the page cache that hit",
145 490 : &["key_kind", "hit_kind"]
146 490 : )
147 490 : .expect("failed to define a metric")
148 490 : });
149 :
150 490 : static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
151 490 : register_int_counter_vec!(
152 490 : "pageserver_page_cache_read_accesses_total",
153 490 : "Number of read accesses to the page cache",
154 490 : &["key_kind"]
155 490 : )
156 490 : .expect("failed to define a metric")
157 490 : });
158 :
159 490 : pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
160 490 : read_accesses_materialized_page: {
161 490 : PAGE_CACHE_READ_ACCESSES
162 490 : .get_metric_with_label_values(&["materialized_page"])
163 490 : .unwrap()
164 490 : },
165 490 :
166 490 : read_accesses_ephemeral: {
167 490 : PAGE_CACHE_READ_ACCESSES
168 490 : .get_metric_with_label_values(&["ephemeral"])
169 490 : .unwrap()
170 490 : },
171 490 :
172 490 : read_accesses_immutable: {
173 490 : PAGE_CACHE_READ_ACCESSES
174 490 : .get_metric_with_label_values(&["immutable"])
175 490 : .unwrap()
176 490 : },
177 490 :
178 490 : read_hits_ephemeral: {
179 490 : PAGE_CACHE_READ_HITS
180 490 : .get_metric_with_label_values(&["ephemeral", "-"])
181 490 : .unwrap()
182 490 : },
183 490 :
184 490 : read_hits_immutable: {
185 490 : PAGE_CACHE_READ_HITS
186 490 : .get_metric_with_label_values(&["immutable", "-"])
187 490 : .unwrap()
188 490 : },
189 490 :
190 490 : read_hits_materialized_page_exact: {
191 490 : PAGE_CACHE_READ_HITS
192 490 : .get_metric_with_label_values(&["materialized_page", "exact"])
193 490 : .unwrap()
194 490 : },
195 490 :
196 490 : read_hits_materialized_page_older_lsn: {
197 490 : PAGE_CACHE_READ_HITS
198 490 : .get_metric_with_label_values(&["materialized_page", "older_lsn"])
199 490 : .unwrap()
200 490 : },
201 490 : });
202 :
/// Pre-resolved page cache size gauges.
pub struct PageCacheSizeMetrics {
    /// Gauge `pageserver_page_cache_size_max_bytes`.
    pub max_bytes: UIntGauge,

    /// `pageserver_page_cache_size_current_bytes` with `key_kind="ephemeral"`.
    pub current_bytes_ephemeral: UIntGauge,
    /// `pageserver_page_cache_size_current_bytes` with `key_kind="immutable"`.
    pub current_bytes_immutable: UIntGauge,
    /// `pageserver_page_cache_size_current_bytes` with `key_kind="materialized_page"`.
    pub current_bytes_materialized_page: UIntGauge,
}
210 :
211 576 : static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
212 576 : register_uint_gauge_vec!(
213 576 : "pageserver_page_cache_size_current_bytes",
214 576 : "Current size of the page cache in bytes, by key kind",
215 576 : &["key_kind"]
216 576 : )
217 576 : .expect("failed to define a metric")
218 576 : });
219 :
220 576 : pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
221 576 : max_bytes: {
222 576 : register_uint_gauge!(
223 576 : "pageserver_page_cache_size_max_bytes",
224 576 : "Maximum size of the page cache in bytes"
225 576 : )
226 576 : .expect("failed to define a metric")
227 576 : },
228 576 :
229 576 : current_bytes_ephemeral: {
230 576 : PAGE_CACHE_SIZE_CURRENT_BYTES
231 576 : .get_metric_with_label_values(&["ephemeral"])
232 576 : .unwrap()
233 576 : },
234 576 : current_bytes_immutable: {
235 576 : PAGE_CACHE_SIZE_CURRENT_BYTES
236 576 : .get_metric_with_label_values(&["immutable"])
237 576 : .unwrap()
238 576 : },
239 576 : current_bytes_materialized_page: {
240 576 : PAGE_CACHE_SIZE_CURRENT_BYTES
241 576 : .get_metric_with_label_values(&["materialized_page"])
242 576 : .unwrap()
243 576 : },
244 576 : });
245 :
246 576 : pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
247 576 : register_histogram!(
248 576 : "pageserver_wait_lsn_seconds",
249 576 : "Time spent waiting for WAL to arrive",
250 576 : CRITICAL_OP_BUCKETS.into(),
251 576 : )
252 576 : .expect("failed to define a metric")
253 576 : });
254 :
255 554 : static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
256 554 : register_int_gauge_vec!(
257 554 : "pageserver_last_record_lsn",
258 554 : "Last record LSN grouped by timeline",
259 554 : &["tenant_id", "timeline_id"]
260 554 : )
261 554 : .expect("failed to define a metric")
262 554 : });
263 :
264 554 : static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
265 554 : register_uint_gauge_vec!(
266 554 : "pageserver_resident_physical_size",
267 554 : "The size of the layer files present in the pageserver's filesystem.",
268 554 : &["tenant_id", "timeline_id"]
269 554 : )
270 554 : .expect("failed to define a metric")
271 554 : });
272 :
273 288 : static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
274 288 : register_uint_gauge_vec!(
275 288 : "pageserver_remote_physical_size",
276 288 : "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
277 288 : // Corollary: If any files are missing from the index part, they won't be included here.
278 288 : &["tenant_id", "timeline_id"]
279 288 : )
280 288 : .expect("failed to define a metric")
281 288 : });
282 :
283 41 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
284 41 : register_int_counter!(
285 41 : "pageserver_remote_ondemand_downloaded_layers_total",
286 41 : "Total on-demand downloaded layers"
287 41 : )
288 41 : .unwrap()
289 41 : });
290 :
291 41 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
292 41 : register_int_counter!(
293 41 : "pageserver_remote_ondemand_downloaded_bytes_total",
294 41 : "Total bytes of layers on-demand downloaded",
295 41 : )
296 41 : .unwrap()
297 41 : });
298 :
299 554 : static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
300 554 : register_uint_gauge_vec!(
301 554 : "pageserver_current_logical_size",
302 554 : "Current logical size grouped by timeline",
303 554 : &["tenant_id", "timeline_id"]
304 554 : )
305 554 : .expect("failed to define current logical size metric")
306 554 : });
307 :
308 573 : pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
309 573 : register_uint_gauge_vec!(
310 573 : "pageserver_tenant_states_count",
311 573 : "Count of tenants per state",
312 573 : &["state"]
313 573 : )
314 573 : .expect("Failed to register pageserver_tenant_states_count metric")
315 573 : });
316 :
317 : /// A set of broken tenants.
318 : ///
319 : /// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
320 : /// tenant.
321 573 : pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
322 573 : register_uint_gauge_vec!(
323 573 : "pageserver_broken_tenants_count",
324 573 : "Set of broken tenants",
325 573 : &["tenant_id"]
326 573 : )
327 573 : .expect("Failed to register pageserver_tenant_states_count metric")
328 573 : });
329 :
330 117 : pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
331 117 : register_uint_gauge_vec!(
332 117 : "pageserver_tenant_synthetic_cached_size_bytes",
333 117 : "Synthetic size of each tenant in bytes",
334 117 : &["tenant_id"]
335 117 : )
336 117 : .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
337 117 : });
338 :
339 : // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
340 : // or in testing they estimate how much we would upload if we did.
341 554 : static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
342 554 : register_int_counter_vec!(
343 554 : "pageserver_created_persistent_files_total",
344 554 : "Number of files created that are meant to be uploaded to cloud storage",
345 554 : &["tenant_id", "timeline_id"]
346 554 : )
347 554 : .expect("failed to define a metric")
348 554 : });
349 :
350 554 : static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
351 554 : register_int_counter_vec!(
352 554 : "pageserver_written_persistent_bytes_total",
353 554 : "Total bytes written that are meant to be uploaded to cloud storage",
354 554 : &["tenant_id", "timeline_id"]
355 554 : )
356 554 : .expect("failed to define a metric")
357 554 : });
358 :
359 1 : pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
360 1 : register_histogram_vec!(
361 1 : "pageserver_eviction_iteration_duration_seconds_global",
362 1 : "Time spent on a single eviction iteration",
363 1 : &["period_secs", "threshold_secs"],
364 1 : STORAGE_OP_BUCKETS.into(),
365 1 : )
366 1 : .expect("failed to define a metric")
367 1 : });
368 :
369 554 : static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
370 554 : register_int_counter_vec!(
371 554 : "pageserver_evictions",
372 554 : "Number of layers evicted from the pageserver",
373 554 : &["tenant_id", "timeline_id"]
374 554 : )
375 554 : .expect("failed to define a metric")
376 554 : });
377 :
378 554 : static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
379 554 : register_int_counter_vec!(
380 554 : "pageserver_evictions_with_low_residence_duration",
381 554 : "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
382 554 : Residence duration is determined using the `residence_duration_data_source`.",
383 554 : &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
384 554 : )
385 554 : .expect("failed to define a metric")
386 554 : });
387 :
388 575 : pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
389 575 : register_int_counter!(
390 575 : "pageserver_unexpected_ondemand_downloads_count",
391 575 : "Number of unexpected on-demand downloads. \
392 575 : We log more context for each increment, so, forgo any labels in this metric.",
393 575 : )
394 575 : .expect("failed to define a metric")
395 575 : });
396 :
397 : /// How long did we take to start up? Broken down by labels to describe
398 : /// different phases of startup.
399 575 : pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
400 575 : register_gauge_vec!(
401 575 : "pageserver_startup_duration_seconds",
402 575 : "Time taken by phases of pageserver startup, in seconds",
403 575 : &["phase"]
404 575 : )
405 575 : .expect("Failed to register pageserver_startup_duration_seconds metric")
406 575 : });
407 :
408 575 : pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
409 575 : register_uint_gauge!(
410 575 : "pageserver_startup_is_loading",
411 575 : "1 while in initial startup load of tenants, 0 at other times"
412 575 : )
413 575 : .expect("Failed to register pageserver_startup_is_loading")
414 575 : });
415 :
416 : /// How long did tenants take to go from construction to active state?
417 538 : pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
418 538 : register_histogram!(
419 538 : "pageserver_tenant_activation_seconds",
420 538 : "Time taken by tenants to activate, in seconds",
421 538 : CRITICAL_OP_BUCKETS.into()
422 538 : )
423 538 : .expect("Failed to register pageserver_tenant_activation_seconds metric")
424 538 : });
425 :
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub struct EvictionsWithLowResidenceDuration {
    /// Value of the `residence_duration_data_source` label.
    data_source: &'static str,
    /// Observations strictly below this duration are counted; its whole
    /// seconds form the `low_threshold_secs` label.
    threshold: Duration,
    /// The labelled counter; `None` once `remove` has taken it.
    counter: Option<IntCounter>,
}
433 :
/// Holds the data source and threshold needed to build an
/// [`EvictionsWithLowResidenceDuration`] once tenant and timeline ids are known.
pub struct EvictionsWithLowResidenceDurationBuilder {
    /// Value of the `residence_duration_data_source` label.
    data_source: &'static str,
    /// Threshold handed to the built metric.
    threshold: Duration,
}
438 :
439 : impl EvictionsWithLowResidenceDurationBuilder {
440 1402 : pub fn new(data_source: &'static str, threshold: Duration) -> Self {
441 1402 : Self {
442 1402 : data_source,
443 1402 : threshold,
444 1402 : }
445 1402 : }
446 :
447 1402 : fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
448 1402 : let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
449 1402 : .get_metric_with_label_values(&[
450 1402 : tenant_id,
451 1402 : timeline_id,
452 1402 : self.data_source,
453 1402 : &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
454 1402 : ])
455 1402 : .unwrap();
456 1402 : EvictionsWithLowResidenceDuration {
457 1402 : data_source: self.data_source,
458 1402 : threshold: self.threshold,
459 1402 : counter: Some(counter),
460 1402 : }
461 1402 : }
462 : }
463 :
464 : impl EvictionsWithLowResidenceDuration {
465 1720 : fn threshold_label_value(threshold: Duration) -> String {
466 1720 : format!("{}", threshold.as_secs())
467 1720 : }
468 :
469 208 : pub fn observe(&self, observed_value: Duration) {
470 208 : if observed_value < self.threshold {
471 208 : self.counter
472 208 : .as_ref()
473 208 : .expect("nobody calls this function after `remove_from_vec`")
474 208 : .inc();
475 208 : }
476 208 : }
477 :
478 29 : pub fn change_threshold(
479 29 : &mut self,
480 29 : tenant_id: &str,
481 29 : timeline_id: &str,
482 29 : new_threshold: Duration,
483 29 : ) {
484 29 : if new_threshold == self.threshold {
485 21 : return;
486 8 : }
487 8 : let mut with_new =
488 8 : EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
489 8 : .build(tenant_id, timeline_id);
490 8 : std::mem::swap(self, &mut with_new);
491 8 : with_new.remove(tenant_id, timeline_id);
492 29 : }
493 :
494 : // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
495 318 : fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
496 318 : let Some(_counter) = self.counter.take() else {
497 0 : return;
498 : };
499 :
500 318 : let threshold = Self::threshold_label_value(self.threshold);
501 318 :
502 318 : let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
503 318 : tenant_id,
504 318 : timeline_id,
505 318 : self.data_source,
506 318 : &threshold,
507 318 : ]);
508 318 :
509 318 : match removed {
510 0 : Err(e) => {
511 0 : // this has been hit in staging as
512 0 : // <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
513 0 : // because we can be in the drop path already, don't risk:
514 0 : // - "double-panic => illegal instruction" or
515 0 : // - future "drop panick => abort"
516 0 : //
517 0 : // so just nag: (the error has the labels)
518 0 : tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}");
519 : }
520 : Ok(()) => {
521 : // to help identify cases where we double-remove the same values, let's log all
522 : // deletions?
523 318 : tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source);
524 : }
525 : }
526 318 : }
527 : }
528 :
// Histogram buckets (seconds) for metrics collected on disk IO operations.
//
// Roughly logarithmic scale, alternating factors of ~30 between steps.
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    30e-6, // 30 usec
    1e-3,  // 1000 usec
    30e-3, // 30 ms
    1e0,   // 1000 ms
    30e0,  // 30000 ms
];
539 :
540 : /// Tracks time taken by fs operations near VirtualFile.
541 : ///
542 : /// Operations:
543 : /// - open ([`std::fs::OpenOptions::open`])
544 : /// - close (dropping [`std::fs::File`])
545 : /// - close-by-replace (close by replacement algorithm)
546 : /// - read (`read_at`)
547 : /// - write (`write_at`)
548 : /// - seek (modify internal position or file length query)
549 : /// - fsync ([`std::fs::File::sync_all`])
550 : /// - metadata ([`std::fs::File::metadata`])
551 499 : pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
552 499 : register_histogram_vec!(
553 499 : "pageserver_io_operations_seconds",
554 499 : "Time spent in IO operations",
555 499 : &["operation"],
556 499 : STORAGE_IO_TIME_BUCKETS.into()
557 499 : )
558 499 : .expect("failed to define a metric")
559 499 : });
560 :
// Operation names "read" and "write".
// NOTE(review): presumably the `operation` label values used with
// `STORAGE_IO_SIZE` — confirm against the users of this constant.
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
562 :
563 : // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
564 539 : pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
565 539 : register_int_gauge_vec!(
566 539 : "pageserver_io_operations_bytes_total",
567 539 : "Total amount of bytes read/written in IO operations",
568 539 : &["operation", "tenant_id", "timeline_id"]
569 539 : )
570 539 : .expect("failed to define a metric")
571 539 : });
572 :
/// A pair of histograms that always receive the same observation.
#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
    /// Histogram without tenant/timeline labels.
    global: Histogram,
    /// Histogram labelled with tenant and timeline.
    per_tenant_timeline: Histogram,
}
578 :
579 : impl GlobalAndPerTimelineHistogram {
580 4599588 : fn observe(&self, value: f64) {
581 4599588 : self.global.observe(value);
582 4599588 : self.per_tenant_timeline.observe(value);
583 4599588 : }
584 : }
585 :
/// Timer guard: records the elapsed time into `h` when dropped.
struct GlobalAndPerTimelineHistogramTimer<'a> {
    /// Histogram pair that receives the observation.
    h: &'a GlobalAndPerTimelineHistogram,
    /// Instant at which timing started.
    start: std::time::Instant,
}
590 :
591 : impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
592 4599588 : fn drop(&mut self) {
593 4599588 : let elapsed = self.start.elapsed();
594 4599588 : self.h.observe(elapsed.as_secs_f64());
595 4599588 : }
596 : }
597 :
/// Kinds of smgr query that are timed.
///
/// The static-string form is snake_case (e.g. `GetRelExists` becomes
/// `"get_rel_exists"`) and is used as the `smgr_query_type` label value.
/// The variant's integer value indexes the per-type array in
/// `SmgrQueryTimePerTimeline`.
#[derive(
    Debug,
    Clone,
    Copy,
    IntoStaticStr,
    strum_macros::EnumCount,
    strum_macros::EnumIter,
    strum_macros::FromRepr,
)]
#[strum(serialize_all = "snake_case")]
pub enum SmgrQueryType {
    /// Label `"get_rel_exists"`.
    GetRelExists,
    /// Label `"get_rel_size"`.
    GetRelSize,
    /// Label `"get_page_at_lsn"`.
    GetPageAtLsn,
    /// Label `"get_db_size"`.
    GetDbSize,
}
614 :
/// Per-timeline smgr query timing: one global/per-timeline histogram pair
/// for every `SmgrQueryType`.
#[derive(Debug)]
pub struct SmgrQueryTimePerTimeline {
    /// Indexed by `SmgrQueryType as usize`.
    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
}
619 :
620 486 : static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
621 486 : register_histogram_vec!(
622 486 : "pageserver_smgr_query_seconds",
623 486 : "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
624 486 : &["smgr_query_type", "tenant_id", "timeline_id"],
625 486 : CRITICAL_OP_BUCKETS.into(),
626 486 : )
627 486 : .expect("failed to define a metric")
628 486 : });
629 :
630 419 : static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
631 419 : register_histogram_vec!(
632 419 : "pageserver_smgr_query_seconds_global",
633 419 : "Time spent on smgr query handling, aggregated by query type.",
634 419 : &["smgr_query_type"],
635 419 : CRITICAL_OP_BUCKETS.into(),
636 419 : )
637 419 : .expect("failed to define a metric")
638 419 : });
639 :
640 : impl SmgrQueryTimePerTimeline {
641 4557 : pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
642 4557 : let tenant_id = tenant_id.to_string();
643 4557 : let timeline_id = timeline_id.to_string();
644 18228 : let metrics = std::array::from_fn(|i| {
645 18228 : let op = SmgrQueryType::from_repr(i).unwrap();
646 18228 : let global = SMGR_QUERY_TIME_GLOBAL
647 18228 : .get_metric_with_label_values(&[op.into()])
648 18228 : .unwrap();
649 18228 : let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
650 18228 : .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
651 18228 : .unwrap();
652 18228 : GlobalAndPerTimelineHistogram {
653 18228 : global,
654 18228 : per_tenant_timeline,
655 18228 : }
656 18228 : });
657 4557 : Self { metrics }
658 4557 : }
659 4599619 : pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
660 4599619 : let metric = &self.metrics[op as usize];
661 4599619 : GlobalAndPerTimelineHistogramTimer {
662 4599619 : h: metric,
663 4599619 : start: std::time::Instant::now(),
664 4599619 : }
665 4599619 : }
666 : }
667 :
#[cfg(test)]
mod smgr_query_time_tests {
    use strum::IntoEnumIterator;
    use utils::id::{TenantId, TimelineId};

    // Regression test, we used hard-coded string constants before using an enum.
    #[test]
    fn op_label_name() {
        use super::SmgrQueryType::*;
        let expect: [(super::SmgrQueryType, &'static str); 4] = [
            (GetRelExists, "get_rel_exists"),
            (GetRelSize, "get_rel_size"),
            (GetPageAtLsn, "get_page_at_lsn"),
            (GetDbSize, "get_db_size"),
        ];
        for (op, expect) in expect {
            let actual: &'static str = op.into();
            assert_eq!(actual, expect);
        }
    }

    // Dropping a timer adds exactly one sample to the fresh per-timeline
    // histograms and at least one to the (shared) global histograms.
    #[test]
    fn basic() {
        let ops: Vec<_> = super::SmgrQueryType::iter().collect();

        for op in &ops {
            let tenant_id = TenantId::generate();
            let timeline_id = TimelineId::generate();
            let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

            // Sums the sample counts over all query types.
            let get_counts = || {
                let mut global: u64 = 0;
                let mut per_tenant_timeline: u64 = 0;
                for op in &ops {
                    global += metrics.metrics[*op as usize].global.get_sample_count();
                }
                for op in &ops {
                    per_tenant_timeline += metrics.metrics[*op as usize]
                        .per_tenant_timeline
                        .get_sample_count();
                }
                (global, per_tenant_timeline)
            };

            let (pre_global, pre_per_tenant_timeline) = get_counts();
            assert_eq!(pre_per_tenant_timeline, 0);

            drop(metrics.start_timer(*op));

            let (post_global, post_per_tenant_timeline) = get_counts();
            assert_eq!(post_per_tenant_timeline, 1);
            assert!(post_global > pre_global);
        }
    }
}
726 :
727 : // keep in sync with control plane Go code so that we can validate
728 : // compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
729 379 : static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
730 379 : // Go code uses milliseconds. Variable is called `computeStartupBuckets`
731 379 : [
732 379 : 5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
733 379 : 1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
734 379 : ]
735 10612 : .map(|ms| (ms as f64) / 1000.0)
736 379 : });
737 :
/// Newtype around the basebackup duration histogram vec (labelled by `result`).
pub struct BasebackupQueryTime(HistogramVec);
739 379 : pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
740 379 : BasebackupQueryTime({
741 379 : register_histogram_vec!(
742 379 : "pageserver_basebackup_query_seconds",
743 379 : "Histogram of basebackup queries durations, by result type",
744 379 : &["result"],
745 379 : COMPUTE_STARTUP_BUCKETS.to_vec(),
746 379 : )
747 379 : .expect("failed to define a metric")
748 379 : })
749 379 : });
750 :
751 : impl DurationResultObserver for BasebackupQueryTime {
752 660 : fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
753 660 : let label_value = if res.is_ok() { "ok" } else { "error" };
754 660 : let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
755 660 : metric.observe(duration.as_secs_f64());
756 660 : }
757 : }
758 :
759 426 : pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
760 426 : register_int_gauge_vec!(
761 426 : "pageserver_live_connections",
762 426 : "Number of live network connections",
763 426 : &["pageserver_connection_kind"]
764 426 : )
765 426 : .expect("failed to define a metric")
766 426 : });
767 :
768 : // remote storage metrics
769 :
770 : /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
771 288 : static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
772 288 : register_int_gauge_vec!(
773 288 : "pageserver_remote_timeline_client_calls_unfinished",
774 288 : "Number of ongoing calls to remote timeline client. \
775 288 : Used to populate pageserver_remote_timeline_client_calls_started. \
776 288 : This metric is not useful for sampling from Prometheus, but useful in tests.",
777 288 : &["tenant_id", "timeline_id", "file_kind", "op_kind"],
778 288 : )
779 288 : .expect("failed to define a metric")
780 288 : });
781 :
782 288 : static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
783 288 : register_histogram_vec!(
784 288 : "pageserver_remote_timeline_client_calls_started",
785 288 : "When calling a remote timeline client method, we record the current value \
786 288 : of the calls_unfinished gauge in this histogram. Plot the histogram \
787 288 : over time in a heatmap to visualize how many operations were ongoing \
788 288 : at a given instant. It gives you a better idea of the queue depth \
789 288 : than plotting the gauge directly, since operations may complete faster \
790 288 : than the sampling interval.",
791 288 : &["file_kind", "op_kind"],
792 288 : // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
793 288 : vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
794 288 : )
795 288 : .expect("failed to define a metric")
796 288 : });
797 :
798 216 : static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
799 216 : register_int_counter_vec!(
800 216 : "pageserver_remote_timeline_client_bytes_started",
801 216 : "Incremented by the number of bytes associated with a remote timeline client operation. \
802 216 : The increment happens when the operation is scheduled.",
803 216 : &["tenant_id", "timeline_id", "file_kind", "op_kind"],
804 216 : )
805 216 : .expect("failed to define a metric")
806 216 : });
807 :
808 216 : static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
809 216 : register_int_counter_vec!(
810 216 : "pageserver_remote_timeline_client_bytes_finished",
811 216 : "Incremented by the number of bytes associated with a remote timeline client operation. \
812 216 : The increment happens when the operation finishes (regardless of success/failure/shutdown).",
813 216 : &["tenant_id", "timeline_id", "file_kind", "op_kind"],
814 216 : )
815 216 : .expect("failed to define a metric")
816 216 : });
817 :
/// Kind of remote storage operation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
    Upload,
    Download,
    Delete,
}

impl RemoteOpKind {
    /// Returns the lower-case name of the operation kind:
    /// `"upload"`, `"download"` or `"delete"`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            RemoteOpKind::Upload => "upload",
            RemoteOpKind::Download => "download",
            RemoteOpKind::Delete => "delete",
        }
    }
}
833 :
/// Kind of file a remote storage operation acts on.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RemoteOpFileKind {
    Layer,
    Index,
}

impl RemoteOpFileKind {
    /// Returns the lower-case name of the file kind: `"layer"` or `"index"`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            RemoteOpFileKind::Layer => "layer",
            RemoteOpFileKind::Index => "index",
        }
    }
}
847 :
848 288 : pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
849 288 : register_histogram_vec!(
850 288 : "pageserver_remote_operation_seconds",
851 288 : "Time spent on remote storage operations. \
852 288 : Grouped by tenant, timeline, operation_kind and status. \
853 288 : Does not account for time spent waiting in remote timeline client's queues.",
854 288 : &["file_kind", "op_kind", "status"]
855 288 : )
856 288 : .expect("failed to define a metric")
857 288 : });
858 :
859 490 : pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
860 490 : register_int_counter_vec!(
861 490 : "pageserver_tenant_task_events",
862 490 : "Number of task start/stop/fail events.",
863 490 : &["event"],
864 490 : )
865 490 : .expect("Failed to register tenant_task_events metric")
866 490 : });
867 :
868 575 : pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
869 575 : register_int_counter_vec!(
870 575 : "pageserver_background_loop_period_overrun_count",
871 575 : "Incremented whenever warn_when_period_overrun() logs a warning.",
872 575 : &["task", "period"],
873 575 : )
874 575 : .expect("failed to define a metric")
875 575 : });
876 :
877 : // walreceiver metrics
878 :
879 575 : pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
880 575 : register_int_counter!(
881 575 : "pageserver_walreceiver_started_connections_total",
882 575 : "Number of started walreceiver connections"
883 575 : )
884 575 : .expect("failed to define a metric")
885 575 : });
886 :
887 575 : pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
888 575 : register_int_gauge!(
889 575 : "pageserver_walreceiver_active_managers",
890 575 : "Number of active walreceiver managers"
891 575 : )
892 575 : .expect("failed to define a metric")
893 575 : });
894 :
895 401 : pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
896 401 : register_int_counter_vec!(
897 401 : "pageserver_walreceiver_switches_total",
898 401 : "Number of walreceiver manager change_connection calls",
899 401 : &["reason"]
900 401 : )
901 401 : .expect("failed to define a metric")
902 401 : });
903 :
904 575 : pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
905 575 : register_int_counter!(
906 575 : "pageserver_walreceiver_broker_updates_total",
907 575 : "Number of received broker updates in walreceiver"
908 575 : )
909 575 : .expect("failed to define a metric")
910 575 : });
911 :
912 576 : pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
913 576 : register_int_counter_vec!(
914 576 : "pageserver_walreceiver_candidates_events_total",
915 576 : "Number of walreceiver candidate events",
916 576 : &["event"]
917 576 : )
918 576 : .expect("failed to define a metric")
919 576 : });
920 :
921 : pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
922 575 : Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
923 :
924 : pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
925 576 : Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
926 :
927 : // Metrics collected on WAL redo operations
928 : //
929 : // We collect the time spent in actual WAL redo ('redo'), and time waiting
930 : // for access to the postgres process ('wait') since there is only one for
931 : // each tenant.
932 :
/// Histogram buckets (in seconds) for WAL redo timings.
///
/// The buckets start small so that the shortest redo processing times can be
/// resolved: the lowest bound is 5us, which equates to 200'000 pages/sec or
/// 1.6GB/sec. This is much better than the previous 5ms aka 200 pages/sec aka
/// 1.6MB/sec.
///
/// The range extends up to 1s because metrics show redo durations and lock
/// times larger than 0.250s.
macro_rules! redo_histogram_time_buckets {
    () => {
        vec![
            // 5 us .. 50 us
            0.000_005, 0.000_010, 0.000_025, 0.000_050,
            // 100 us .. 500 us
            0.000_100, 0.000_250, 0.000_500,
            // 1 ms .. 5 ms
            0.001_000, 0.002_500, 0.005_000,
            // 10 ms .. 50 ms
            0.010_000, 0.025_000, 0.050_000,
            // 100 ms .. 500 ms
            0.100_000, 0.250_000, 0.500_000,
            // 1 s
            1.000_000,
        ]
    };
}
949 :
/// Histogram buckets for the number of records replayed in a single redo
/// operation.
///
/// A global 'total replayed' counter exists as well, but it cannot show the
/// skew in how many records are replayed per operation; this histogram can.
macro_rules! redo_histogram_count_buckets {
    () => {
        vec![
            0.0, 1.0, 2.0, 5.0, // single digits
            10.0, 25.0, 50.0, // tens
            100.0, 250.0, 500.0, // hundreds
        ]
    };
}
959 :
/// Histogram buckets for byte sizes in WAL redo.
///
/// The bounds are powers of (2^.5), from 2^4.5 to 2^15 (22 buckets), each
/// rounded up to the next multiple of 8 to capture any MAXALIGNed record of
/// that size, too.
macro_rules! redo_bytes_histogram_count_buckets {
    () => {
        vec![
            24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, // up to 2^8
            368.0, 512.0, 728.0, 1024.0, 1456.0, 2048.0, 2904.0, 4096.0, // up to 2^12
            5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0, // up to 2^15
        ]
    };
}
970 :
971 576 : pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
972 576 : register_histogram!(
973 576 : "pageserver_wal_redo_seconds",
974 576 : "Time spent on WAL redo",
975 576 : redo_histogram_time_buckets!()
976 576 : )
977 576 : .expect("failed to define a metric")
978 576 : });
979 :
980 576 : pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
981 576 : register_histogram!(
982 576 : "pageserver_wal_redo_wait_seconds",
983 576 : "Time spent waiting for access to the Postgres WAL redo process",
984 576 : redo_histogram_time_buckets!(),
985 576 : )
986 576 : .expect("failed to define a metric")
987 576 : });
988 :
989 576 : pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
990 576 : register_histogram!(
991 576 : "pageserver_wal_redo_records_histogram",
992 576 : "Histogram of number of records replayed per redo in the Postgres WAL redo process",
993 576 : redo_histogram_count_buckets!(),
994 576 : )
995 576 : .expect("failed to define a metric")
996 576 : });
997 :
998 576 : pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
999 576 : register_histogram!(
1000 576 : "pageserver_wal_redo_bytes_histogram",
1001 576 : "Histogram of number of records replayed per redo sent to Postgres",
1002 576 : redo_bytes_histogram_count_buckets!(),
1003 576 : )
1004 576 : .expect("failed to define a metric")
1005 576 : });
1006 :
1007 : // FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
1008 372 : pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
1009 372 : register_int_counter!(
1010 372 : "pageserver_replayed_wal_records_total",
1011 372 : "Number of WAL records replayed in WAL redo process"
1012 372 : )
1013 372 : .unwrap()
1014 372 : });
1015 :
1016 : /// Similar to `prometheus::HistogramTimer` but does not record on drop.
1017 : pub struct StorageTimeMetricsTimer {
1018 : metrics: StorageTimeMetrics,
1019 : start: Instant,
1020 : }
1021 :
1022 : impl StorageTimeMetricsTimer {
1023 10343 : fn new(metrics: StorageTimeMetrics) -> Self {
1024 10343 : Self {
1025 10343 : metrics,
1026 10343 : start: Instant::now(),
1027 10343 : }
1028 10343 : }
1029 :
1030 : /// Record the time from creation to now.
1031 10315 : pub fn stop_and_record(self) {
1032 10315 : let duration = self.start.elapsed().as_secs_f64();
1033 10315 : self.metrics.timeline_sum.inc_by(duration);
1034 10315 : self.metrics.timeline_count.inc();
1035 10315 : self.metrics.global_histogram.observe(duration);
1036 10315 : }
1037 : }
1038 :
1039 : /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
1040 : /// timeline total sum and count.
1041 10343 : #[derive(Clone, Debug)]
1042 : pub struct StorageTimeMetrics {
1043 : /// Sum of f64 seconds, per operation, tenant_id and timeline_id
1044 : timeline_sum: Counter,
1045 : /// Number of oeprations, per operation, tenant_id and timeline_id
1046 : timeline_count: IntCounter,
1047 : /// Global histogram having only the "operation" label.
1048 : global_histogram: Histogram,
1049 : }
1050 :
1051 : impl StorageTimeMetrics {
1052 9758 : pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
1053 9758 : let operation: &'static str = operation.into();
1054 9758 :
1055 9758 : let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
1056 9758 : .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
1057 9758 : .unwrap();
1058 9758 : let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
1059 9758 : .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
1060 9758 : .unwrap();
1061 9758 : let global_histogram = STORAGE_TIME_GLOBAL
1062 9758 : .get_metric_with_label_values(&[operation])
1063 9758 : .unwrap();
1064 9758 :
1065 9758 : StorageTimeMetrics {
1066 9758 : timeline_sum,
1067 9758 : timeline_count,
1068 9758 : global_histogram,
1069 9758 : }
1070 9758 : }
1071 :
1072 : /// Starts timing a new operation.
1073 : ///
1074 : /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
1075 10343 : pub fn start_timer(&self) -> StorageTimeMetricsTimer {
1076 10343 : StorageTimeMetricsTimer::new(self.clone())
1077 10343 : }
1078 : }
1079 :
1080 0 : #[derive(Debug)]
1081 : pub struct TimelineMetrics {
1082 : tenant_id: String,
1083 : timeline_id: String,
1084 : pub flush_time_histo: StorageTimeMetrics,
1085 : pub compact_time_histo: StorageTimeMetrics,
1086 : pub create_images_time_histo: StorageTimeMetrics,
1087 : pub logical_size_histo: StorageTimeMetrics,
1088 : pub imitate_logical_size_histo: StorageTimeMetrics,
1089 : pub load_layer_map_histo: StorageTimeMetrics,
1090 : pub garbage_collect_histo: StorageTimeMetrics,
1091 : pub last_record_gauge: IntGauge,
1092 : pub resident_physical_size_gauge: UIntGauge,
1093 : /// copy of LayeredTimeline.current_logical_size
1094 : pub current_logical_size_gauge: UIntGauge,
1095 : pub num_persistent_files_created: IntCounter,
1096 : pub persistent_bytes_written: IntCounter,
1097 : pub evictions: IntCounter,
1098 : pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
1099 : }
1100 :
1101 : impl TimelineMetrics {
1102 1394 : pub fn new(
1103 1394 : tenant_id: &TenantId,
1104 1394 : timeline_id: &TimelineId,
1105 1394 : evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
1106 1394 : ) -> Self {
1107 1394 : let tenant_id = tenant_id.to_string();
1108 1394 : let timeline_id = timeline_id.to_string();
1109 1394 : let flush_time_histo =
1110 1394 : StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
1111 1394 : let compact_time_histo =
1112 1394 : StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
1113 1394 : let create_images_time_histo =
1114 1394 : StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
1115 1394 : let logical_size_histo =
1116 1394 : StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
1117 1394 : let imitate_logical_size_histo = StorageTimeMetrics::new(
1118 1394 : StorageTimeOperation::ImitateLogicalSize,
1119 1394 : &tenant_id,
1120 1394 : &timeline_id,
1121 1394 : );
1122 1394 : let load_layer_map_histo =
1123 1394 : StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
1124 1394 : let garbage_collect_histo =
1125 1394 : StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
1126 1394 : let last_record_gauge = LAST_RECORD_LSN
1127 1394 : .get_metric_with_label_values(&[&tenant_id, &timeline_id])
1128 1394 : .unwrap();
1129 1394 : let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
1130 1394 : .get_metric_with_label_values(&[&tenant_id, &timeline_id])
1131 1394 : .unwrap();
1132 1394 : let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
1133 1394 : .get_metric_with_label_values(&[&tenant_id, &timeline_id])
1134 1394 : .unwrap();
1135 1394 : let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
1136 1394 : .get_metric_with_label_values(&[&tenant_id, &timeline_id])
1137 1394 : .unwrap();
1138 1394 : let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
1139 1394 : .get_metric_with_label_values(&[&tenant_id, &timeline_id])
1140 1394 : .unwrap();
1141 1394 : let evictions = EVICTIONS
1142 1394 : .get_metric_with_label_values(&[&tenant_id, &timeline_id])
1143 1394 : .unwrap();
1144 1394 : let evictions_with_low_residence_duration =
1145 1394 : evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
1146 1394 :
1147 1394 : TimelineMetrics {
1148 1394 : tenant_id,
1149 1394 : timeline_id,
1150 1394 : flush_time_histo,
1151 1394 : compact_time_histo,
1152 1394 : create_images_time_histo,
1153 1394 : logical_size_histo,
1154 1394 : imitate_logical_size_histo,
1155 1394 : garbage_collect_histo,
1156 1394 : load_layer_map_histo,
1157 1394 : last_record_gauge,
1158 1394 : resident_physical_size_gauge,
1159 1394 : current_logical_size_gauge,
1160 1394 : num_persistent_files_created,
1161 1394 : persistent_bytes_written,
1162 1394 : evictions,
1163 1394 : evictions_with_low_residence_duration: std::sync::RwLock::new(
1164 1394 : evictions_with_low_residence_duration,
1165 1394 : ),
1166 1394 : }
1167 1394 : }
1168 : }
1169 :
1170 : impl Drop for TimelineMetrics {
1171 310 : fn drop(&mut self) {
1172 310 : let tenant_id = &self.tenant_id;
1173 310 : let timeline_id = &self.timeline_id;
1174 310 : let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
1175 310 : let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
1176 310 : let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
1177 310 : let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
1178 310 : let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
1179 310 : let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
1180 310 :
1181 310 : self.evictions_with_low_residence_duration
1182 310 : .write()
1183 310 : .unwrap()
1184 310 : .remove(tenant_id, timeline_id);
1185 :
1186 : // The following metrics are born outside of the TimelineMetrics lifecycle but still
1187 : // removed at the end of it. The idea is to have the metrics outlive the
1188 : // entity during which they're observed, e.g., the smgr metrics shall
1189 : // outlive an individual smgr connection, but not the timeline.
1190 :
1191 2790 : for op in StorageTimeOperation::VARIANTS {
1192 2480 : let _ =
1193 2480 : STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
1194 2480 : let _ =
1195 2480 : STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
1196 2480 : }
1197 :
1198 930 : for op in STORAGE_IO_SIZE_OPERATIONS {
1199 620 : let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
1200 620 : }
1201 :
1202 1550 : for op in SmgrQueryType::iter() {
1203 1240 : let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
1204 1240 : op.into(),
1205 1240 : tenant_id,
1206 1240 : timeline_id,
1207 1240 : ]);
1208 1240 : }
1209 310 : }
1210 : }
1211 :
1212 164 : pub fn remove_tenant_metrics(tenant_id: &TenantId) {
1213 164 : let tid = tenant_id.to_string();
1214 164 : let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
1215 164 : // we leave the BROKEN_TENANTS_SET entry if any
1216 164 : }
1217 :
1218 : use futures::Future;
1219 : use pin_project_lite::pin_project;
1220 : use std::collections::HashMap;
1221 : use std::pin::Pin;
1222 : use std::sync::{Arc, Mutex};
1223 : use std::task::{Context, Poll};
1224 : use std::time::{Duration, Instant};
1225 :
1226 : pub struct RemoteTimelineClientMetrics {
1227 : tenant_id: String,
1228 : timeline_id: String,
1229 : remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
1230 : calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
1231 : bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
1232 : bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
1233 : }
1234 :
1235 : impl RemoteTimelineClientMetrics {
1236 765 : pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
1237 765 : RemoteTimelineClientMetrics {
1238 765 : tenant_id: tenant_id.to_string(),
1239 765 : timeline_id: timeline_id.to_string(),
1240 765 : calls_unfinished_gauge: Mutex::new(HashMap::default()),
1241 765 : bytes_started_counter: Mutex::new(HashMap::default()),
1242 765 : bytes_finished_counter: Mutex::new(HashMap::default()),
1243 765 : remote_physical_size_gauge: Mutex::new(None),
1244 765 : }
1245 765 : }
1246 :
1247 5077 : pub fn remote_physical_size_gauge(&self) -> UIntGauge {
1248 5077 : let mut guard = self.remote_physical_size_gauge.lock().unwrap();
1249 5077 : guard
1250 5077 : .get_or_insert_with(|| {
1251 763 : REMOTE_PHYSICAL_SIZE
1252 763 : .get_metric_with_label_values(&[
1253 763 : &self.tenant_id.to_string(),
1254 763 : &self.timeline_id.to_string(),
1255 763 : ])
1256 763 : .unwrap()
1257 5077 : })
1258 5077 : .clone()
1259 5077 : }
1260 :
1261 30346 : pub fn remote_operation_time(
1262 30346 : &self,
1263 30346 : file_kind: &RemoteOpFileKind,
1264 30346 : op_kind: &RemoteOpKind,
1265 30346 : status: &'static str,
1266 30346 : ) -> Histogram {
1267 30346 : let key = (file_kind.as_str(), op_kind.as_str(), status);
1268 30346 : REMOTE_OPERATION_TIME
1269 30346 : .get_metric_with_label_values(&[key.0, key.1, key.2])
1270 30346 : .unwrap()
1271 30346 : }
1272 :
1273 50524 : fn calls_unfinished_gauge(
1274 50524 : &self,
1275 50524 : file_kind: &RemoteOpFileKind,
1276 50524 : op_kind: &RemoteOpKind,
1277 50524 : ) -> IntGauge {
1278 50524 : let mut guard = self.calls_unfinished_gauge.lock().unwrap();
1279 50524 : let key = (file_kind.as_str(), op_kind.as_str());
1280 50524 : let metric = guard.entry(key).or_insert_with(move || {
1281 1579 : REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
1282 1579 : .get_metric_with_label_values(&[
1283 1579 : &self.tenant_id.to_string(),
1284 1579 : &self.timeline_id.to_string(),
1285 1579 : key.0,
1286 1579 : key.1,
1287 1579 : ])
1288 1579 : .unwrap()
1289 50524 : });
1290 50524 : metric.clone()
1291 50524 : }
1292 :
1293 26723 : fn calls_started_hist(
1294 26723 : &self,
1295 26723 : file_kind: &RemoteOpFileKind,
1296 26723 : op_kind: &RemoteOpKind,
1297 26723 : ) -> Histogram {
1298 26723 : let key = (file_kind.as_str(), op_kind.as_str());
1299 26723 : REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
1300 26723 : .get_metric_with_label_values(&[key.0, key.1])
1301 26723 : .unwrap()
1302 26723 : }
1303 :
1304 12223 : fn bytes_started_counter(
1305 12223 : &self,
1306 12223 : file_kind: &RemoteOpFileKind,
1307 12223 : op_kind: &RemoteOpKind,
1308 12223 : ) -> IntCounter {
1309 12223 : let mut guard = self.bytes_started_counter.lock().unwrap();
1310 12223 : let key = (file_kind.as_str(), op_kind.as_str());
1311 12223 : let metric = guard.entry(key).or_insert_with(move || {
1312 510 : REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
1313 510 : .get_metric_with_label_values(&[
1314 510 : &self.tenant_id.to_string(),
1315 510 : &self.timeline_id.to_string(),
1316 510 : key.0,
1317 510 : key.1,
1318 510 : ])
1319 510 : .unwrap()
1320 12223 : });
1321 12223 : metric.clone()
1322 12223 : }
1323 :
1324 22973 : fn bytes_finished_counter(
1325 22973 : &self,
1326 22973 : file_kind: &RemoteOpFileKind,
1327 22973 : op_kind: &RemoteOpKind,
1328 22973 : ) -> IntCounter {
1329 22973 : let mut guard = self.bytes_finished_counter.lock().unwrap();
1330 22973 : let key = (file_kind.as_str(), op_kind.as_str());
1331 22973 : let metric = guard.entry(key).or_insert_with(move || {
1332 510 : REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
1333 510 : .get_metric_with_label_values(&[
1334 510 : &self.tenant_id.to_string(),
1335 510 : &self.timeline_id.to_string(),
1336 510 : key.0,
1337 510 : key.1,
1338 510 : ])
1339 510 : .unwrap()
1340 22973 : });
1341 22973 : metric.clone()
1342 22973 : }
1343 : }
1344 :
/// Test-only accessors for the cached byte counters.
#[cfg(test)]
impl RemoteTimelineClientMetrics {
    /// Current value of the "bytes started" counter for the given label pair,
    /// or `None` if that counter has not been created through this instance yet.
    pub fn get_bytes_started_counter_value(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Option<u64> {
        let counters = self.bytes_started_counter.lock().unwrap();
        let labels = (file_kind.as_str(), op_kind.as_str());
        let value = counters.get(&labels)?.get();
        Some(value)
    }

    /// Current value of the "bytes finished" counter for the given label pair,
    /// or `None` if that counter has not been created through this instance yet.
    pub fn get_bytes_finished_counter_value(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Option<u64> {
        let counters = self.bytes_finished_counter.lock().unwrap();
        let labels = (file_kind.as_str(), op_kind.as_str());
        let value = counters.get(&labels)?.get();
        Some(value)
    }
}
1367 :
1368 : /// See [`RemoteTimelineClientMetrics::call_begin`].
1369 : #[must_use]
1370 : pub(crate) struct RemoteTimelineClientCallMetricGuard {
1371 : /// Decremented on drop.
1372 : calls_unfinished_metric: Option<IntGauge>,
1373 : /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
1374 : bytes_finished: Option<(IntCounter, u64)>,
1375 : }
1376 :
1377 : impl RemoteTimelineClientCallMetricGuard {
1378 : /// Consume this guard object without performing the metric updates it would do on `drop()`.
1379 : /// The caller vouches to do the metric updates manually.
1380 25476 : pub fn will_decrement_manually(mut self) {
1381 25476 : let RemoteTimelineClientCallMetricGuard {
1382 25476 : calls_unfinished_metric,
1383 25476 : bytes_finished,
1384 25476 : } = &mut self;
1385 25476 : calls_unfinished_metric.take();
1386 25476 : bytes_finished.take();
1387 25476 : }
1388 : }
1389 :
1390 : impl Drop for RemoteTimelineClientCallMetricGuard {
1391 26720 : fn drop(&mut self) {
1392 26720 : let RemoteTimelineClientCallMetricGuard {
1393 26720 : calls_unfinished_metric,
1394 26720 : bytes_finished,
1395 26720 : } = self;
1396 26720 : if let Some(guard) = calls_unfinished_metric.take() {
1397 1244 : guard.dec();
1398 25476 : }
1399 26720 : if let Some((bytes_finished_metric, value)) = bytes_finished {
1400 0 : bytes_finished_metric.inc_by(*value);
1401 26720 : }
1402 26720 : }
1403 : }
1404 :
/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
/// track the byte size of this call in applicable metric(s).
///
/// Passed to both [`RemoteTimelineClientMetrics::call_begin`] and
/// [`RemoteTimelineClientMetrics::call_end`].
pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
    /// Do not account for this call's byte size in any metrics.
    /// The `reason` field is there to make the call sites self-documenting
    /// about why they don't need the metric.
    DontTrackSize { reason: &'static str },
    /// Track the byte size of the call in applicable metric(s).
    /// The payload is the number of bytes to add to the byte counters.
    Bytes(u64),
}
1415 :
1416 : impl RemoteTimelineClientMetrics {
1417 : /// Update the metrics that change when a call to the remote timeline client instance starts.
1418 : ///
1419 : /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
1420 : /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
1421 : /// is more suitable.
1422 : /// Never do both.
1423 26723 : pub(crate) fn call_begin(
1424 26723 : &self,
1425 26723 : file_kind: &RemoteOpFileKind,
1426 26723 : op_kind: &RemoteOpKind,
1427 26723 : size: RemoteTimelineClientMetricsCallTrackSize,
1428 26723 : ) -> RemoteTimelineClientCallMetricGuard {
1429 26723 : let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
1430 26723 : self.calls_started_hist(file_kind, op_kind)
1431 26723 : .observe(calls_unfinished_metric.get() as f64);
1432 26723 : calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
1433 :
1434 26723 : let bytes_finished = match size {
1435 14500 : RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
1436 14500 : // nothing to do
1437 14500 : None
1438 : }
1439 12223 : RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
1440 12223 : self.bytes_started_counter(file_kind, op_kind).inc_by(size);
1441 12223 : let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
1442 12223 : Some((finished_counter, size))
1443 : }
1444 : };
1445 26723 : RemoteTimelineClientCallMetricGuard {
1446 26723 : calls_unfinished_metric: Some(calls_unfinished_metric),
1447 26723 : bytes_finished,
1448 26723 : }
1449 26723 : }
1450 :
1451 : /// Manually udpate the metrics that track completions, instead of using the guard object.
1452 : /// Using the guard object is generally preferable.
1453 : /// See [`call_begin`](Self::call_begin) for more context.
1454 23801 : pub(crate) fn call_end(
1455 23801 : &self,
1456 23801 : file_kind: &RemoteOpFileKind,
1457 23801 : op_kind: &RemoteOpKind,
1458 23801 : size: RemoteTimelineClientMetricsCallTrackSize,
1459 23801 : ) {
1460 23801 : let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
1461 : debug_assert!(
1462 23801 : calls_unfinished_metric.get() > 0,
1463 0 : "begin and end should cancel out"
1464 : );
1465 23801 : calls_unfinished_metric.dec();
1466 23801 : match size {
1467 13051 : RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
1468 10750 : RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
1469 10750 : self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
1470 10750 : }
1471 : }
1472 23801 : }
1473 : }
1474 :
1475 : impl Drop for RemoteTimelineClientMetrics {
1476 225 : fn drop(&mut self) {
1477 225 : let RemoteTimelineClientMetrics {
1478 225 : tenant_id,
1479 225 : timeline_id,
1480 225 : remote_physical_size_gauge,
1481 225 : calls_unfinished_gauge,
1482 225 : bytes_started_counter,
1483 225 : bytes_finished_counter,
1484 225 : } = self;
1485 546 : for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
1486 546 : let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
1487 546 : tenant_id,
1488 546 : timeline_id,
1489 546 : a,
1490 546 : b,
1491 546 : ]);
1492 546 : }
1493 225 : for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
1494 153 : let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
1495 153 : tenant_id,
1496 153 : timeline_id,
1497 153 : a,
1498 153 : b,
1499 153 : ]);
1500 153 : }
1501 225 : for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
1502 153 : let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
1503 153 : tenant_id,
1504 153 : timeline_id,
1505 153 : a,
1506 153 : b,
1507 153 : ]);
1508 153 : }
1509 225 : {
1510 225 : let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
1511 225 : let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
1512 225 : }
1513 225 : }
1514 : }
1515 :
1516 : /// Wrapper future that measures the time spent by a remote storage operation,
1517 : /// and records the time and success/failure as a prometheus metric.
1518 : pub trait MeasureRemoteOp: Sized {
1519 30360 : fn measure_remote_op(
1520 30360 : self,
1521 30360 : tenant_id: TenantId,
1522 30360 : timeline_id: TimelineId,
1523 30360 : file_kind: RemoteOpFileKind,
1524 30360 : op: RemoteOpKind,
1525 30360 : metrics: Arc<RemoteTimelineClientMetrics>,
1526 30360 : ) -> MeasuredRemoteOp<Self> {
1527 30360 : let start = Instant::now();
1528 30360 : MeasuredRemoteOp {
1529 30360 : inner: self,
1530 30360 : tenant_id,
1531 30360 : timeline_id,
1532 30360 : file_kind,
1533 30360 : op,
1534 30360 : start,
1535 30360 : metrics,
1536 30360 : }
1537 30360 : }
1538 : }
1539 :
1540 : impl<T: Sized> MeasureRemoteOp for T {}
1541 :
1542 : pin_project! {
1543 : pub struct MeasuredRemoteOp<F>
1544 : {
1545 : #[pin]
1546 : inner: F,
1547 : tenant_id: TenantId,
1548 : timeline_id: TimelineId,
1549 : file_kind: RemoteOpFileKind,
1550 : op: RemoteOpKind,
1551 : start: Instant,
1552 : metrics: Arc<RemoteTimelineClientMetrics>,
1553 : }
1554 : }
1555 :
1556 : impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
1557 : type Output = Result<O, E>;
1558 :
1559 1778089 : fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
1560 1778089 : let this = self.project();
1561 1778089 : let poll_result = this.inner.poll(cx);
1562 1778089 : if let Poll::Ready(ref res) = poll_result {
1563 30346 : let duration = this.start.elapsed();
1564 30346 : let status = if res.is_ok() { &"success" } else { &"failure" };
1565 30346 : this.metrics
1566 30346 : .remote_operation_time(this.file_kind, this.op, status)
1567 30346 : .observe(duration.as_secs_f64());
1568 1747743 : }
1569 1778089 : poll_result
1570 1778089 : }
1571 : }
1572 :
1573 575 : pub fn preinitialize_metrics() {
1574 575 : // Python tests need these and on some we do alerting.
1575 575 : //
1576 575 : // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
1577 575 : // order:
1578 575 : // - global metrics reside in a Lazy<PageserverMetrics>
1579 575 : // - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
1580 575 : // - could move the statics into TimelineMetrics::new()?
1581 575 :
1582 575 : // counters
1583 575 : [
1584 575 : &MATERIALIZED_PAGE_CACHE_HIT,
1585 575 : &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
1586 575 : &UNEXPECTED_ONDEMAND_DOWNLOADS,
1587 575 : &WALRECEIVER_STARTED_CONNECTIONS,
1588 575 : &WALRECEIVER_BROKER_UPDATES,
1589 575 : &WALRECEIVER_CANDIDATES_ADDED,
1590 575 : &WALRECEIVER_CANDIDATES_REMOVED,
1591 575 : ]
1592 575 : .into_iter()
1593 4025 : .for_each(|c| {
1594 4025 : Lazy::force(c);
1595 4025 : });
1596 575 :
1597 575 : // countervecs
1598 575 : [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
1599 575 : .into_iter()
1600 575 : .for_each(|c| {
1601 575 : Lazy::force(c);
1602 575 : });
1603 575 :
1604 575 : // gauges
1605 575 : WALRECEIVER_ACTIVE_MANAGERS.get();
1606 575 :
1607 575 : // histograms
1608 575 : [
1609 575 : &READ_NUM_FS_LAYERS,
1610 575 : &RECONSTRUCT_TIME,
1611 575 : &WAIT_LSN_TIME,
1612 575 : &WAL_REDO_TIME,
1613 575 : &WAL_REDO_WAIT_TIME,
1614 575 : &WAL_REDO_RECORDS_HISTOGRAM,
1615 575 : &WAL_REDO_BYTES_HISTOGRAM,
1616 575 : ]
1617 575 : .into_iter()
1618 4025 : .for_each(|h| {
1619 4025 : Lazy::force(h);
1620 4025 : });
1621 575 : }
|