LCOV - differential code coverage report
Current view: top level - pageserver/src - metrics.rs (source / functions) Coverage Total Hit UBC GIC CBC ECB
Current: f6946e90941b557c917ac98cd5a7e9506d180f3e.info Lines: 97.2 % 1333 1296 37 1296
Current Date: 2023-10-19 02:04:12 Functions: 85.0 % 260 221 39 1 220 1
Baseline: c8637f37369098875162f194f92736355783b050.info
Baseline Date: 2023-10-18 20:25:20

           TLA  Line data    Source code
       1                 : use enum_map::EnumMap;
       2                 : use metrics::metric_vec_duration::DurationResultObserver;
       3                 : use metrics::{
       4                 :     register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
       5                 :     register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
       6                 :     register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
       7                 :     HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
       8                 : };
       9                 : use once_cell::sync::Lazy;
      10                 : use strum::{EnumCount, IntoEnumIterator, VariantNames};
      11                 : use strum_macros::{EnumVariantNames, IntoStaticStr};
      12                 : use utils::id::{TenantId, TimelineId};
      13                 : 
      14                 : /// Prometheus histogram buckets (in seconds) for operations in the critical
      15                 : /// path. In other words, operations that directly affect the latency of user
      16                 : /// queries.
      17                 : ///
      18                 : /// The buckets capture the majority of latencies in the microsecond and
      19                 : /// millisecond range but also extend far enough up to distinguish "bad" from
      20                 : /// "really bad".
      21                 : const CRITICAL_OP_BUCKETS: &[f64] = &[
      22                 :     0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
      23                 :     0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
      24                 :     1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
      25                 : ];
      26                 : 
      27                 : // Metrics collected on operations on the storage repository.
      28 CBC        9564 : #[derive(Debug, EnumVariantNames, IntoStaticStr)]
      29                 : #[strum(serialize_all = "kebab_case")]
      30                 : pub enum StorageTimeOperation {
      31                 :     #[strum(serialize = "layer flush")]
      32                 :     LayerFlush,
      33                 : 
      34                 :     #[strum(serialize = "compact")]
      35                 :     Compact,
      36                 : 
      37                 :     #[strum(serialize = "create images")]
      38                 :     CreateImages,
      39                 : 
      40                 :     #[strum(serialize = "logical size")]
      41                 :     LogicalSize,
      42                 : 
      43                 :     #[strum(serialize = "imitate logical size")]
      44                 :     ImitateLogicalSize,
      45                 : 
      46                 :     #[strum(serialize = "load layer map")]
      47                 :     LoadLayerMap,
      48                 : 
      49                 :     #[strum(serialize = "gc")]
      50                 :     Gc,
      51                 : 
      52                 :     #[strum(serialize = "create tenant")]
      53                 :     CreateTenant,
      54                 : }
      55                 : 
      56             535 : pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
      57             535 :     register_counter_vec!(
      58             535 :         "pageserver_storage_operations_seconds_sum",
      59             535 :         "Total time spent on storage operations with operation, tenant and timeline dimensions",
      60             535 :         &["operation", "tenant_id", "timeline_id"],
      61             535 :     )
      62             535 :     .expect("failed to define a metric")
      63             535 : });
      64                 : 
      65             535 : pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
      66             535 :     register_int_counter_vec!(
      67             535 :         "pageserver_storage_operations_seconds_count",
      68             535 :         "Count of storage operations with operation, tenant and timeline dimensions",
      69             535 :         &["operation", "tenant_id", "timeline_id"],
      70             535 :     )
      71             535 :     .expect("failed to define a metric")
      72             535 : });
      73                 : 
      74                 : // Buckets for background operations like compaction, GC, size calculation
      75                 : const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
      76                 : 
      77             535 : pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
      78             535 :     register_histogram_vec!(
      79             535 :         "pageserver_storage_operations_seconds_global",
      80             535 :         "Time spent on storage operations",
      81             535 :         &["operation"],
      82             535 :         STORAGE_OP_BUCKETS.into(),
      83             535 :     )
      84             535 :     .expect("failed to define a metric")
      85             535 : });
      86                 : 
      87             561 : pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
      88             561 :     register_histogram!(
      89             561 :         "pageserver_read_num_fs_layers",
      90             561 :         "Number of persistent layers accessed for processing a read request, including those in the cache",
      91             561 :         vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
      92             561 :     )
      93             561 :     .expect("failed to define a metric")
      94             561 : });
      95                 : 
      96                 : // Histograms of page reconstruction time, one per value of the `result` label.
      97                 : 
      98                 : pub(crate) struct ReconstructTimeMetrics {
      99                 :     ok: Histogram,
     100                 :     err: Histogram,
     101                 : }
     102                 : 
     103             561 : pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
     104             561 :     let inner = register_histogram_vec!(
     105             561 :         "pageserver_getpage_reconstruct_seconds",
     106             561 :         "Time spent in reconstruct_value (reconstruct a page from deltas)",
     107             561 :         &["result"],
     108             561 :         CRITICAL_OP_BUCKETS.into(),
     109             561 :     )
     110             561 :     .expect("failed to define a metric");
     111             561 :     ReconstructTimeMetrics {
     112             561 :         ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
     113             561 :         err: inner.get_metric_with_label_values(&["err"]).unwrap(),
     114             561 :     }
     115             561 : });
     116                 : 
     117                 : impl ReconstructTimeMetrics {
     118         6372249 :     pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
     119         6372249 :         match result {
     120         6372249 :             Ok(_) => &self.ok,
     121 UBC           0 :             Err(_) => &self.err,
     122                 :         }
     123 CBC     6372249 :     }
     124                 : }
     125                 : 
     126             560 : pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
     127             560 :     register_int_counter!(
     128             560 :         "pageserver_materialized_cache_hits_direct_total",
     129             560 :         "Number of cache hits from materialized page cache without redo",
     130             560 :     )
     131             560 :     .expect("failed to define a metric")
     132             560 : });
     133                 : 
     134             486 : pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
     135             486 :     register_histogram!(
     136             486 :         "pageserver_getpage_get_reconstruct_data_seconds",
     137             486 :         "Time spent in get_reconstruct_value_data",
     138             486 :         CRITICAL_OP_BUCKETS.into(),
     139             486 :     )
     140             486 :     .expect("failed to define a metric")
     141             486 : });
     142                 : 
     143             560 : pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
     144             560 :     register_int_counter!(
     145             560 :         "pageserver_materialized_cache_hits_total",
     146             560 :         "Number of cache hits from materialized page cache",
     147             560 :     )
     148             560 :     .expect("failed to define a metric")
     149             560 : });
     150                 : 
     151                 : pub struct PageCacheMetricsForTaskKind {
     152                 :     pub read_accesses_materialized_page: IntCounter,
     153                 :     pub read_accesses_immutable: IntCounter,
     154                 : 
     155                 :     pub read_hits_immutable: IntCounter,
     156                 :     pub read_hits_materialized_page_exact: IntCounter,
     157                 :     pub read_hits_materialized_page_older_lsn: IntCounter,
     158                 : }
     159                 : 
     160                 : pub struct PageCacheMetrics {
     161                 :     map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
     162                 : }
     163                 : 
     164             486 : static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
     165             486 :     register_int_counter_vec!(
     166             486 :         "pageserver_page_cache_read_hits_total",
     167             486 :         "Number of read accesses to the page cache that hit",
     168             486 :         &["task_kind", "key_kind", "content_kind", "hit_kind"]
     169             486 :     )
     170             486 :     .expect("failed to define a metric")
     171             486 : });
     172                 : 
     173             486 : static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
     174             486 :     register_int_counter_vec!(
     175             486 :         "pageserver_page_cache_read_accesses_total",
     176             486 :         "Number of read accesses to the page cache",
     177             486 :         &["task_kind", "key_kind", "content_kind"]
     178             486 :     )
     179             486 :     .expect("failed to define a metric")
     180             486 : });
     181                 : 
     182             486 : pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
     183           11665 :     map: EnumMap::from_array(std::array::from_fn(|task_kind| {
     184           11665 :         let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
     185           11665 :         let task_kind: &'static str = task_kind.into();
     186           69990 :         EnumMap::from_array(std::array::from_fn(|content_kind| {
     187           69990 :             let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
     188           69990 :             let content_kind: &'static str = content_kind.into();
     189           69990 :             PageCacheMetricsForTaskKind {
     190           69990 :                 read_accesses_materialized_page: {
     191           69990 :                     PAGE_CACHE_READ_ACCESSES
     192           69990 :                         .get_metric_with_label_values(&[
     193           69990 :                             task_kind,
     194           69990 :                             "materialized_page",
     195           69990 :                             content_kind,
     196           69990 :                         ])
     197           69990 :                         .unwrap()
     198           69990 :                 },
     199           69990 : 
     200           69990 :                 read_accesses_immutable: {
     201           69990 :                     PAGE_CACHE_READ_ACCESSES
     202           69990 :                         .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
     203           69990 :                         .unwrap()
     204           69990 :                 },
     205           69990 : 
     206           69990 :                 read_hits_immutable: {
     207           69990 :                     PAGE_CACHE_READ_HITS
     208           69990 :                         .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
     209           69990 :                         .unwrap()
     210           69990 :                 },
     211           69990 : 
     212           69990 :                 read_hits_materialized_page_exact: {
     213           69990 :                     PAGE_CACHE_READ_HITS
     214           69990 :                         .get_metric_with_label_values(&[
     215           69990 :                             task_kind,
     216           69990 :                             "materialized_page",
     217           69990 :                             content_kind,
     218           69990 :                             "exact",
     219           69990 :                         ])
     220           69990 :                         .unwrap()
     221           69990 :                 },
     222           69990 : 
     223           69990 :                 read_hits_materialized_page_older_lsn: {
     224           69990 :                     PAGE_CACHE_READ_HITS
     225           69990 :                         .get_metric_with_label_values(&[
     226           69990 :                             task_kind,
     227           69990 :                             "materialized_page",
     228           69990 :                             content_kind,
     229           69990 :                             "older_lsn",
     230           69990 :                         ])
     231           69990 :                         .unwrap()
     232           69990 :                 },
     233           69990 :             }
     234           69990 :         }))
     235           11665 :     })),
     236             486 : });
     237                 : 
     238                 : impl PageCacheMetrics {
     239       592211992 :     pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
     240       592211992 :         &self.map[ctx.task_kind()][ctx.page_content_kind()]
     241       592211992 :     }
     242                 : }
     243                 : 
     244                 : pub struct PageCacheSizeMetrics {
     245                 :     pub max_bytes: UIntGauge,
     246                 : 
     247                 :     pub current_bytes_ephemeral: UIntGauge,
     248                 :     pub current_bytes_immutable: UIntGauge,
     249                 :     pub current_bytes_materialized_page: UIntGauge,
     250                 : }
     251                 : 
     252             561 : static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
     253             561 :     register_uint_gauge_vec!(
     254             561 :         "pageserver_page_cache_size_current_bytes",
     255             561 :         "Current size of the page cache in bytes, by key kind",
     256             561 :         &["key_kind"]
     257             561 :     )
     258             561 :     .expect("failed to define a metric")
     259             561 : });
     260                 : 
     261             561 : pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
     262             561 :     max_bytes: {
     263             561 :         register_uint_gauge!(
     264             561 :             "pageserver_page_cache_size_max_bytes",
     265             561 :             "Maximum size of the page cache in bytes"
     266             561 :         )
     267             561 :         .expect("failed to define a metric")
     268             561 :     },
     269             561 : 
     270             561 :     current_bytes_ephemeral: {
     271             561 :         PAGE_CACHE_SIZE_CURRENT_BYTES
     272             561 :             .get_metric_with_label_values(&["ephemeral"])
     273             561 :             .unwrap()
     274             561 :     },
     275             561 :     current_bytes_immutable: {
     276             561 :         PAGE_CACHE_SIZE_CURRENT_BYTES
     277             561 :             .get_metric_with_label_values(&["immutable"])
     278             561 :             .unwrap()
     279             561 :     },
     280             561 :     current_bytes_materialized_page: {
     281             561 :         PAGE_CACHE_SIZE_CURRENT_BYTES
     282             561 :             .get_metric_with_label_values(&["materialized_page"])
     283             561 :             .unwrap()
     284             561 :     },
     285             561 : });
     286                 : 
     287             486 : pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
     288             486 :     register_histogram!(
     289             486 :         "pageserver_page_cache_acquire_pinned_slot_seconds",
     290             486 :         "Time spent acquiring a pinned slot in the page cache",
     291             486 :         CRITICAL_OP_BUCKETS.into(),
     292             486 :     )
     293             486 :     .expect("failed to define a metric")
     294             486 : });
     295                 : 
     296             486 : pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
     297             486 :     register_int_counter!(
     298             486 :         "pageserver_page_cache_find_victim_iters_total",
     299             486 :         "Counter for the number of iterations in the find_victim loop",
     300             486 :     )
     301             486 :     .expect("failed to define a metric")
     302             486 : });
     303                 : 
     304 UBC           0 : static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
     305               0 :     register_int_counter_vec!(
     306               0 :         "page_cache_errors_total",
     307               0 :         "Number of timeouts while acquiring a pinned slot in the page cache",
     308               0 :         &["error_kind"]
     309               0 :     )
     310               0 :     .expect("failed to define a metric")
     311               0 : });
     312                 : 
     313               0 : #[derive(IntoStaticStr)]
     314                 : #[strum(serialize_all = "kebab_case")]
     315                 : pub(crate) enum PageCacheErrorKind {
     316                 :     AcquirePinnedSlotTimeout,
     317                 :     EvictIterLimit,
     318                 : }
     319                 : 
     320               0 : pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
     321               0 :     PAGE_CACHE_ERRORS
     322               0 :         .get_metric_with_label_values(&[error_kind.into()])
     323               0 :         .unwrap()
     324               0 :         .inc();
     325               0 : }
     326                 : 
     327 CBC         561 : pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
     328             561 :     register_histogram!(
     329             561 :         "pageserver_wait_lsn_seconds",
     330             561 :         "Time spent waiting for WAL to arrive",
     331             561 :         CRITICAL_OP_BUCKETS.into(),
     332             561 :     )
     333             561 :     .expect("failed to define a metric")
     334             561 : });
     335                 : 
     336             535 : static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     337             535 :     register_int_gauge_vec!(
     338             535 :         "pageserver_last_record_lsn",
     339             535 :         "Last record LSN grouped by timeline",
     340             535 :         &["tenant_id", "timeline_id"]
     341             535 :     )
     342             535 :     .expect("failed to define a metric")
     343             535 : });
     344                 : 
     345             535 : static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     346             535 :     register_uint_gauge_vec!(
     347             535 :         "pageserver_resident_physical_size",
     348             535 :         "The size of the layer files present in the pageserver's filesystem.",
     349             535 :         &["tenant_id", "timeline_id"]
     350             535 :     )
     351             535 :     .expect("failed to define a metric")
     352             535 : });
     353                 : 
     354             535 : pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
     355             535 :     register_uint_gauge!(
     356             535 :         "pageserver_resident_physical_size_global",
     357             535 :         "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
     358             535 :     )
     359             535 :     .expect("failed to define a metric")
     360             535 : });
     361                 : 
     362             536 : static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     363             536 :     register_uint_gauge_vec!(
     364             536 :         "pageserver_remote_physical_size",
     365             536 :         "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
     366             536 :         // Corollary: If any files are missing from the index part, they won't be included here.
     367             536 :         &["tenant_id", "timeline_id"]
     368             536 :     )
     369             536 :     .expect("failed to define a metric")
     370             536 : });
     371                 : 
     372             535 : static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
     373             535 :     register_uint_gauge!(
     374             535 :         "pageserver_remote_physical_size_global",
     375             535 :         "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
     376             535 :     )
     377             535 :     .expect("failed to define a metric")
     378             535 : });
     379                 : 
     380              51 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
     381              51 :     register_int_counter!(
     382              51 :         "pageserver_remote_ondemand_downloaded_layers_total",
     383              51 :         "Total on-demand downloaded layers"
     384              51 :     )
     385              51 :     .unwrap()
     386              51 : });
     387                 : 
     388              51 : pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
     389              51 :     register_int_counter!(
     390              51 :         "pageserver_remote_ondemand_downloaded_bytes_total",
     391              51 :         "Total bytes of layers on-demand downloaded",
     392              51 :     )
     393              51 :     .unwrap()
     394              51 : });
     395                 : 
     396             535 : static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     397             535 :     register_uint_gauge_vec!(
     398             535 :         "pageserver_current_logical_size",
     399             535 :         "Current logical size grouped by timeline",
     400             535 :         &["tenant_id", "timeline_id"]
     401             535 :     )
     402             535 :     .expect("failed to define current logical size metric")
     403             535 : });
     404                 : 
     405             553 : pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
     406             553 :     register_uint_gauge_vec!(
     407             553 :         "pageserver_tenant_states_count",
     408             553 :         "Count of tenants per state",
     409             553 :         &["state"]
     410             553 :     )
     411             553 :     .expect("Failed to register pageserver_tenant_states_count metric")
     412             553 : });
     413                 : 
     414                 : /// A set of broken tenants.
     415                 : ///
     416                 : /// These are expected to be so rare that a set is fine. "Set" here means one new timeseries
     417                 : /// for each broken tenant.
     418             553 : pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
     419             553 :     register_uint_gauge_vec!(
     420             553 :         "pageserver_broken_tenants_count",
     421             553 :         "Set of broken tenants",
     422             553 :         &["tenant_id"]
     423             553 :     )
     424             553 :     .expect("Failed to register pageserver_tenant_states_count metric")
     425             553 : });
     426                 : 
     427              96 : pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
     428              96 :     register_uint_gauge_vec!(
     429              96 :         "pageserver_tenant_synthetic_cached_size_bytes",
     430              96 :         "Synthetic size of each tenant in bytes",
     431              96 :         &["tenant_id"]
     432              96 :     )
     433              96 :     .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
     434              96 : });
     435                 : 
     436                 : // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
     437                 : // or in testing they estimate how much we would upload if we did.
     438             535 : static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
     439             535 :     register_int_counter_vec!(
     440             535 :         "pageserver_created_persistent_files_total",
     441             535 :         "Number of files created that are meant to be uploaded to cloud storage",
     442             535 :         &["tenant_id", "timeline_id"]
     443             535 :     )
     444             535 :     .expect("failed to define a metric")
     445             535 : });
     446                 : 
     447             535 : static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
     448             535 :     register_int_counter_vec!(
     449             535 :         "pageserver_written_persistent_bytes_total",
     450             535 :         "Total bytes written that are meant to be uploaded to cloud storage",
     451             535 :         &["tenant_id", "timeline_id"]
     452             535 :     )
     453             535 :     .expect("failed to define a metric")
     454             535 : });
     455                 : 
     456               1 : pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
     457               1 :     register_histogram_vec!(
     458               1 :         "pageserver_eviction_iteration_duration_seconds_global",
     459               1 :         "Time spent on a single eviction iteration",
     460               1 :         &["period_secs", "threshold_secs"],
     461               1 :         STORAGE_OP_BUCKETS.into(),
     462               1 :     )
     463               1 :     .expect("failed to define a metric")
     464               1 : });
     465                 : 
     466             535 : static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
     467             535 :     register_int_counter_vec!(
     468             535 :         "pageserver_evictions",
     469             535 :         "Number of layers evicted from the pageserver",
     470             535 :         &["tenant_id", "timeline_id"]
     471             535 :     )
     472             535 :     .expect("failed to define a metric")
     473             535 : });
     474                 : 
     475             535 : static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
     476             535 :     register_int_counter_vec!(
     477             535 :         "pageserver_evictions_with_low_residence_duration",
     478             535 :         "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
     479             535 :          Residence duration is determined using the `residence_duration_data_source`.",
     480             535 :         &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
     481             535 :     )
     482             535 :     .expect("failed to define a metric")
     483             535 : });
     484                 : 
     485             560 : pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
     486             560 :     register_int_counter!(
     487             560 :         "pageserver_unexpected_ondemand_downloads_count",
     488             560 :         "Number of unexpected on-demand downloads. \
     489             560 :          We log more context for each increment, so, forgo any labels in this metric.",
     490             560 :     )
     491             560 :     .expect("failed to define a metric")
     492             560 : });
     493                 : 
     494                 : /// How long did we take to start up?  Broken down by labels to describe
     495                 : /// different phases of startup.
     496             560 : pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
     497             560 :     register_gauge_vec!(
     498             560 :         "pageserver_startup_duration_seconds",
     499             560 :         "Time taken by phases of pageserver startup, in seconds",
     500             560 :         &["phase"]
     501             560 :     )
     502             560 :     .expect("Failed to register pageserver_startup_duration_seconds metric")
     503             560 : });
     504                 : 
     505             560 : pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
     506             560 :     register_uint_gauge!(
     507             560 :         "pageserver_startup_is_loading",
     508             560 :         "1 while in initial startup load of tenants, 0 at other times"
     509             560 :     )
     510             560 :     .expect("Failed to register pageserver_startup_is_loading")
     511             560 : });
     512                 : 
     513                 : /// How long did tenants take to go from construction to active state?
     514             523 : pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
     515             523 :     register_histogram!(
     516             523 :         "pageserver_tenant_activation_seconds",
     517             523 :         "Time taken by tenants to activate, in seconds",
     518             523 :         CRITICAL_OP_BUCKETS.into()
     519             523 :     )
     520             523 :     .expect("Failed to register pageserver_tenant_activation_seconds metric")
     521             523 : });
     522                 : 
     523                 : /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
     524 UBC           0 : #[derive(Debug)]
     525                 : pub struct EvictionsWithLowResidenceDuration {
     526                 :     data_source: &'static str,
     527                 :     threshold: Duration,
     528                 :     counter: Option<IntCounter>,
     529                 : }
     530                 : 
     531                 : pub struct EvictionsWithLowResidenceDurationBuilder {
     532                 :     data_source: &'static str,
     533                 :     threshold: Duration,
     534                 : }
     535                 : 
     536                 : impl EvictionsWithLowResidenceDurationBuilder {
     537 CBC        1310 :     pub fn new(data_source: &'static str, threshold: Duration) -> Self {
     538            1310 :         Self {
     539            1310 :             data_source,
     540            1310 :             threshold,
     541            1310 :         }
     542            1310 :     }
     543                 : 
     544            1310 :     fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
     545            1310 :         let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
     546            1310 :             .get_metric_with_label_values(&[
     547            1310 :                 tenant_id,
     548            1310 :                 timeline_id,
     549            1310 :                 self.data_source,
     550            1310 :                 &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
     551            1310 :             ])
     552            1310 :             .unwrap();
     553            1310 :         EvictionsWithLowResidenceDuration {
     554            1310 :             data_source: self.data_source,
     555            1310 :             threshold: self.threshold,
     556            1310 :             counter: Some(counter),
     557            1310 :         }
     558            1310 :     }
     559                 : }
     560                 : 
     561                 : impl EvictionsWithLowResidenceDuration {
     562            1561 :     fn threshold_label_value(threshold: Duration) -> String {
     563            1561 :         format!("{}", threshold.as_secs())
     564            1561 :     }
     565                 : 
     566             590 :     pub fn observe(&self, observed_value: Duration) {
     567             590 :         if observed_value < self.threshold {
     568             590 :             self.counter
     569             590 :                 .as_ref()
     570             590 :                 .expect("nobody calls this function after `remove_from_vec`")
     571             590 :                 .inc();
     572             590 :         }
     573             590 :     }
     574                 : 
     575              29 :     pub fn change_threshold(
     576              29 :         &mut self,
     577              29 :         tenant_id: &str,
     578              29 :         timeline_id: &str,
     579              29 :         new_threshold: Duration,
     580              29 :     ) {
     581              29 :         if new_threshold == self.threshold {
     582              21 :             return;
     583               8 :         }
     584               8 :         let mut with_new =
     585               8 :             EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
     586               8 :                 .build(tenant_id, timeline_id);
     587               8 :         std::mem::swap(self, &mut with_new);
     588               8 :         with_new.remove(tenant_id, timeline_id);
     589              29 :     }
     590                 : 
     591                 :     // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
     592             251 :     fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
     593             251 :         let Some(_counter) = self.counter.take() else {
     594 UBC           0 :             return;
     595                 :         };
     596                 : 
     597 CBC         251 :         let threshold = Self::threshold_label_value(self.threshold);
     598             251 : 
     599             251 :         let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
     600             251 :             tenant_id,
     601             251 :             timeline_id,
     602             251 :             self.data_source,
     603             251 :             &threshold,
     604             251 :         ]);
     605             251 : 
     606             251 :         match removed {
     607 UBC           0 :             Err(e) => {
     608               0 :                 // this has been hit in staging as
     609               0 :                 // <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
     610               0 :                 // because we can be in the drop path already, don't risk:
     611               0 :                 // - "double-panic => illegal instruction" or
     612               0 :                 // - future "drop panick => abort"
     613               0 :                 //
     614               0 :                 // so just nag: (the error has the labels)
     615               0 :                 tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}");
     616                 :             }
     617                 :             Ok(()) => {
     618                 :                 // to help identify cases where we double-remove the same values, let's log all
     619                 :                 // deletions?
     620 CBC         251 :                 tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source);
     621                 :             }
     622                 :         }
     623             251 :     }
     624                 : }
     625                 : 
     626                 : // Metrics collected on disk IO operations
     627                 : //
     628                 : // Roughly logarithmic scale.
     629                 : const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
     630                 :     0.000030, // 30 usec
     631                 :     0.001000, // 1000 usec
     632                 :     0.030,    // 30 ms
     633                 :     1.000,    // 1000 ms
     634                 :     30.000,   // 30000 ms
     635                 : ];
     636                 : 
     637                 : /// VirtualFile fs operation variants.
     638                 : ///
     639                 : /// Operations:
     640                 : /// - open ([`std::fs::OpenOptions::open`])
     641                 : /// - close (dropping [`std::fs::File`])
     642                 : /// - close-by-replace (close by replacement algorithm)
     643                 : /// - read (`read_at`)
     644                 : /// - write (`write_at`)
     645                 : /// - seek (modify internal position or file length query)
     646                 : /// - fsync ([`std::fs::File::sync_all`])
     647                 : /// - metadata ([`std::fs::File::metadata`])
     648                 : #[derive(
     649            4432 :     Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
     650                 : )]
     651                 : pub(crate) enum StorageIoOperation {
     652                 :     Open,
     653                 :     Close,
     654                 :     CloseByReplace,
     655                 :     Read,
     656                 :     Write,
     657                 :     Seek,
     658                 :     Fsync,
     659                 :     Metadata,
     660                 : }
     661                 : 
     662                 : impl StorageIoOperation {
     663            4432 :     pub fn as_str(&self) -> &'static str {
     664            4432 :         match self {
     665             554 :             StorageIoOperation::Open => "open",
     666             554 :             StorageIoOperation::Close => "close",
     667             554 :             StorageIoOperation::CloseByReplace => "close-by-replace",
     668             554 :             StorageIoOperation::Read => "read",
     669             554 :             StorageIoOperation::Write => "write",
     670             554 :             StorageIoOperation::Seek => "seek",
     671             554 :             StorageIoOperation::Fsync => "fsync",
     672             554 :             StorageIoOperation::Metadata => "metadata",
     673                 :         }
     674            4432 :     }
     675                 : }
     676                 : 
     677                 : /// Tracks time taken by fs operations near VirtualFile.
     678 UBC           0 : #[derive(Debug)]
     679                 : pub(crate) struct StorageIoTime {
     680                 :     metrics: [Histogram; StorageIoOperation::COUNT],
     681                 : }
     682                 : 
     683                 : impl StorageIoTime {
     684 CBC         554 :     fn new() -> Self {
     685             554 :         let storage_io_histogram_vec = register_histogram_vec!(
     686             554 :             "pageserver_io_operations_seconds",
     687             554 :             "Time spent in IO operations",
     688             554 :             &["operation"],
     689             554 :             STORAGE_IO_TIME_BUCKETS.into()
     690             554 :         )
     691             554 :         .expect("failed to define a metric");
     692            4432 :         let metrics = std::array::from_fn(|i| {
     693            4432 :             let op = StorageIoOperation::from_repr(i).unwrap();
     694            4432 :             storage_io_histogram_vec
     695            4432 :                 .get_metric_with_label_values(&[op.as_str()])
     696            4432 :                 .unwrap()
     697            4432 :         });
     698             554 :         Self { metrics }
     699             554 :     }
     700                 : 
     701        16248653 :     pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
     702        16248653 :         &self.metrics[op as usize]
     703        16248653 :     }
     704                 : }
     705                 : 
     706                 : pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
     707                 : 
     708                 : const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
     709                 : 
     710                 : // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
     711             554 : pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     712             554 :     register_int_gauge_vec!(
     713             554 :         "pageserver_io_operations_bytes_total",
     714             554 :         "Total amount of bytes read/written in IO operations",
     715             554 :         &["operation", "tenant_id", "timeline_id"]
     716             554 :     )
     717             554 :     .expect("failed to define a metric")
     718             554 : });
     719                 : 
     720 UBC           0 : #[derive(Debug)]
     721                 : struct GlobalAndPerTimelineHistogram {
     722                 :     global: Histogram,
     723                 :     per_tenant_timeline: Histogram,
     724                 : }
     725                 : 
     726                 : impl GlobalAndPerTimelineHistogram {
     727 CBC     3779051 :     fn observe(&self, value: f64) {
     728         3779051 :         self.global.observe(value);
     729         3779051 :         self.per_tenant_timeline.observe(value);
     730         3779051 :     }
     731                 : }
     732                 : 
     733                 : struct GlobalAndPerTimelineHistogramTimer<'a> {
     734                 :     h: &'a GlobalAndPerTimelineHistogram,
     735                 :     start: std::time::Instant,
     736                 : }
     737                 : 
     738                 : impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
     739         3779051 :     fn drop(&mut self) {
     740         3779051 :         let elapsed = self.start.elapsed();
     741         3779051 :         self.h.observe(elapsed.as_secs_f64());
     742         3779051 :     }
     743                 : }
     744                 : 
     745                 : #[derive(
     746 UBC           0 :     Debug,
     747               0 :     Clone,
     748                 :     Copy,
     749 CBC       36216 :     IntoStaticStr,
     750                 :     strum_macros::EnumCount,
     751            2197 :     strum_macros::EnumIter,
     752           17620 :     strum_macros::FromRepr,
     753                 : )]
     754                 : #[strum(serialize_all = "snake_case")]
     755                 : pub enum SmgrQueryType {
     756                 :     GetRelExists,
     757                 :     GetRelSize,
     758                 :     GetPageAtLsn,
     759                 :     GetDbSize,
     760                 : }
     761                 : 
     762 UBC           0 : #[derive(Debug)]
     763                 : pub struct SmgrQueryTimePerTimeline {
     764                 :     metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
     765                 : }
     766                 : 
     767 CBC         458 : static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
     768             458 :     register_histogram_vec!(
     769             458 :         "pageserver_smgr_query_seconds",
     770             458 :         "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
     771             458 :         &["smgr_query_type", "tenant_id", "timeline_id"],
     772             458 :         CRITICAL_OP_BUCKETS.into(),
     773             458 :     )
     774             458 :     .expect("failed to define a metric")
     775             458 : });
     776                 : 
     777             404 : static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     778             404 :     register_histogram_vec!(
     779             404 :         "pageserver_smgr_query_seconds_global",
     780             404 :         "Time spent on smgr query handling, aggregated by query type.",
     781             404 :         &["smgr_query_type"],
     782             404 :         CRITICAL_OP_BUCKETS.into(),
     783             404 :     )
     784             404 :     .expect("failed to define a metric")
     785             404 : });
     786                 : 
     787                 : impl SmgrQueryTimePerTimeline {
     788            4405 :     pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
     789            4405 :         let tenant_id = tenant_id.to_string();
     790            4405 :         let timeline_id = timeline_id.to_string();
     791           17620 :         let metrics = std::array::from_fn(|i| {
     792           17620 :             let op = SmgrQueryType::from_repr(i).unwrap();
     793           17620 :             let global = SMGR_QUERY_TIME_GLOBAL
     794           17620 :                 .get_metric_with_label_values(&[op.into()])
     795           17620 :                 .unwrap();
     796           17620 :             let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
     797           17620 :                 .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
     798           17620 :                 .unwrap();
     799           17620 :             GlobalAndPerTimelineHistogram {
     800           17620 :                 global,
     801           17620 :                 per_tenant_timeline,
     802           17620 :             }
     803           17620 :         });
     804            4405 :         Self { metrics }
     805            4405 :     }
     806         3779082 :     pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
     807         3779082 :         let metric = &self.metrics[op as usize];
     808         3779082 :         GlobalAndPerTimelineHistogramTimer {
     809         3779082 :             h: metric,
     810         3779082 :             start: std::time::Instant::now(),
     811         3779082 :         }
     812         3779082 :     }
     813                 : }
     814                 : 
     815                 : #[cfg(test)]
     816                 : mod smgr_query_time_tests {
     817                 :     use strum::IntoEnumIterator;
     818                 :     use utils::id::{TenantId, TimelineId};
     819                 : 
     820                 :     // Regression test, we used hard-coded string constants before using an enum.
     821               1 :     #[test]
     822               1 :     fn op_label_name() {
     823               1 :         use super::SmgrQueryType::*;
     824               1 :         let expect: [(super::SmgrQueryType, &'static str); 4] = [
     825               1 :             (GetRelExists, "get_rel_exists"),
     826               1 :             (GetRelSize, "get_rel_size"),
     827               1 :             (GetPageAtLsn, "get_page_at_lsn"),
     828               1 :             (GetDbSize, "get_db_size"),
     829               1 :         ];
     830               5 :         for (op, expect) in expect {
     831               4 :             let actual: &'static str = op.into();
     832               4 :             assert_eq!(actual, expect);
     833                 :         }
     834               1 :     }
     835                 : 
     836               1 :     #[test]
     837               1 :     fn basic() {
     838               1 :         let ops: Vec<_> = super::SmgrQueryType::iter().collect();
     839                 : 
     840               5 :         for op in &ops {
     841               4 :             let tenant_id = TenantId::generate();
     842               4 :             let timeline_id = TimelineId::generate();
     843               4 :             let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
     844               4 : 
     845               8 :             let get_counts = || {
     846               8 :                 let global: u64 = ops
     847               8 :                     .iter()
     848              32 :                     .map(|op| metrics.metrics[*op as usize].global.get_sample_count())
     849               8 :                     .sum();
     850               8 :                 let per_tenant_timeline: u64 = ops
     851               8 :                     .iter()
     852              32 :                     .map(|op| {
     853              32 :                         metrics.metrics[*op as usize]
     854              32 :                             .per_tenant_timeline
     855              32 :                             .get_sample_count()
     856              32 :                     })
     857               8 :                     .sum();
     858               8 :                 (global, per_tenant_timeline)
     859               8 :             };
     860                 : 
     861               4 :             let (pre_global, pre_per_tenant_timeline) = get_counts();
     862               4 :             assert_eq!(pre_per_tenant_timeline, 0);
     863                 : 
     864               4 :             let timer = metrics.start_timer(*op);
     865               4 :             drop(timer);
     866               4 : 
     867               4 :             let (post_global, post_per_tenant_timeline) = get_counts();
     868               4 :             assert_eq!(post_per_tenant_timeline, 1);
     869               4 :             assert!(post_global > pre_global);
     870                 :         }
     871               1 :     }
     872                 : }
     873                 : 
     874                 : // keep in sync with control plane Go code so that we can validate
     875                 : // compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
     876             360 : static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
     877             360 :     // Go code uses milliseconds. Variable is called `computeStartupBuckets`
     878             360 :     [
     879             360 :         5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
     880             360 :         1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
     881             360 :     ]
     882           10080 :     .map(|ms| (ms as f64) / 1000.0)
     883             360 : });
     884                 : 
     885                 : pub struct BasebackupQueryTime(HistogramVec);
     886             360 : pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
     887             360 :     BasebackupQueryTime({
     888             360 :         register_histogram_vec!(
     889             360 :             "pageserver_basebackup_query_seconds",
     890             360 :             "Histogram of basebackup queries durations, by result type",
     891             360 :             &["result"],
     892             360 :             COMPUTE_STARTUP_BUCKETS.to_vec(),
     893             360 :         )
     894             360 :         .expect("failed to define a metric")
     895             360 :     })
     896             360 : });
     897                 : 
     898                 : impl DurationResultObserver for BasebackupQueryTime {
     899             638 :     fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
     900             638 :         let label_value = if res.is_ok() { "ok" } else { "error" };
     901             638 :         let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
     902             638 :         metric.observe(duration.as_secs_f64());
     903             638 :     }
     904                 : }
     905                 : 
     906             419 : pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
     907             419 :     register_int_gauge_vec!(
     908             419 :         "pageserver_live_connections",
     909             419 :         "Number of live network connections",
     910             419 :         &["pageserver_connection_kind"]
     911             419 :     )
     912             419 :     .expect("failed to define a metric")
     913             419 : });
     914                 : 
     915                 : // remote storage metrics
     916                 : 
     917                 : /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
     918             536 : static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
     919             536 :     register_int_gauge_vec!(
     920             536 :         "pageserver_remote_timeline_client_calls_unfinished",
     921             536 :         "Number of ongoing calls to remote timeline client. \
     922             536 :          Used to populate pageserver_remote_timeline_client_calls_started. \
     923             536 :          This metric is not useful for sampling from Prometheus, but useful in tests.",
     924             536 :         &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     925             536 :     )
     926             536 :     .expect("failed to define a metric")
     927             536 : });
     928                 : 
     929             536 : static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
     930             536 :     register_histogram_vec!(
     931             536 :         "pageserver_remote_timeline_client_calls_started",
     932             536 :         "When calling a remote timeline client method, we record the current value \
     933             536 :          of the calls_unfinished gauge in this histogram. Plot the histogram \
     934             536 :          over time in a heatmap to visualize how many operations were ongoing \
     935             536 :          at a given instant. It gives you a better idea of the queue depth \
     936             536 :          than plotting the gauge directly, since operations may complete faster \
     937             536 :          than the sampling interval.",
     938             536 :         &["file_kind", "op_kind"],
     939             536 :         // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
     940             536 :         vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
     941             536 :     )
     942             536 :     .expect("failed to define a metric")
     943             536 : });
     944                 : 
     945             393 : static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
     946             393 :     register_int_counter_vec!(
     947             393 :         "pageserver_remote_timeline_client_bytes_started",
     948             393 :         "Incremented by the number of bytes associated with a remote timeline client operation. \
     949             393 :          The increment happens when the operation is scheduled.",
     950             393 :         &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     951             393 :     )
     952             393 :     .expect("failed to define a metric")
     953             393 : });
     954                 : 
     955             393 : static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
     956             393 :     register_int_counter_vec!(
     957             393 :         "pageserver_remote_timeline_client_bytes_finished",
     958             393 :         "Incremented by the number of bytes associated with a remote timeline client operation. \
     959             393 :          The increment happens when the operation finishes (regardless of success/failure/shutdown).",
     960             393 :         &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     961             393 :     )
     962             393 :     .expect("failed to define a metric")
     963             393 : });
     964                 : 
     965                 : pub(crate) struct DeletionQueueMetrics {
     966                 :     pub(crate) keys_submitted: IntCounter,
     967                 :     pub(crate) keys_dropped: IntCounter,
     968                 :     pub(crate) keys_executed: IntCounter,
     969                 :     pub(crate) keys_validated: IntCounter,
     970                 :     pub(crate) dropped_lsn_updates: IntCounter,
     971                 :     pub(crate) unexpected_errors: IntCounter,
     972                 :     pub(crate) remote_errors: IntCounterVec,
     973                 : }
     974             561 : pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
     975             561 :     DeletionQueueMetrics{
     976             561 : 
     977             561 :     keys_submitted: register_int_counter!(
     978             561 :         "pageserver_deletion_queue_submitted_total",
     979             561 :         "Number of objects submitted for deletion"
     980             561 :     )
     981             561 :     .expect("failed to define a metric"),
     982             561 : 
     983             561 :     keys_dropped: register_int_counter!(
     984             561 :         "pageserver_deletion_queue_dropped_total",
     985             561 :         "Number of object deletions dropped due to stale generation."
     986             561 :     )
     987             561 :     .expect("failed to define a metric"),
     988             561 : 
     989             561 :     keys_executed: register_int_counter!(
     990             561 :         "pageserver_deletion_queue_executed_total",
     991             561 :         "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
     992             561 :     )
     993             561 :     .expect("failed to define a metric"),
     994             561 : 
     995             561 :     keys_validated: register_int_counter!(
     996             561 :         "pageserver_deletion_queue_validated_total",
     997             561 :         "Number of keys validated for deletion.  Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
     998             561 :     )
     999             561 :     .expect("failed to define a metric"),
    1000             561 : 
    1001             561 :     dropped_lsn_updates: register_int_counter!(
    1002             561 :         "pageserver_deletion_queue_dropped_lsn_updates_total",
    1003             561 :         "Updates to remote_consistent_lsn dropped due to stale generation number."
    1004             561 :     )
    1005             561 :     .expect("failed to define a metric"),
    1006             561 :     unexpected_errors: register_int_counter!(
    1007             561 :         "pageserver_deletion_queue_unexpected_errors_total",
    1008             561 :         "Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
    1009             561 :     )
    1010             561 :     .expect("failed to define a metric"),
    1011             561 :     remote_errors: register_int_counter_vec!(
    1012             561 :         "pageserver_deletion_queue_remote_errors_total",
    1013             561 :         "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
    1014             561 :         &["op_kind"],
    1015             561 :     )
    1016             561 :     .expect("failed to define a metric")
    1017             561 : }
    1018             561 : });
    1019                 : 
    1020 UBC           0 : #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    1021                 : pub enum RemoteOpKind {
    1022                 :     Upload,
    1023                 :     Download,
    1024                 :     Delete,
    1025                 : }
    1026                 : impl RemoteOpKind {
    1027 CBC      160914 :     pub fn as_str(&self) -> &'static str {
    1028          160914 :         match self {
    1029          153819 :             Self::Upload => "upload",
    1030            5193 :             Self::Download => "download",
    1031            1902 :             Self::Delete => "delete",
    1032                 :         }
    1033          160914 :     }
    1034                 : }
    1035                 : 
    1036 UBC           0 : #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
    1037                 : pub enum RemoteOpFileKind {
    1038                 :     Layer,
    1039                 :     Index,
    1040                 : }
    1041                 : impl RemoteOpFileKind {
    1042 CBC      160914 :     pub fn as_str(&self) -> &'static str {
    1043          160914 :         match self {
    1044          135003 :             Self::Layer => "layer",
    1045           25911 :             Self::Index => "index",
    1046                 :         }
    1047          160914 :     }
    1048                 : }
    1049                 : 
    1050             536 : pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    1051             536 :     register_histogram_vec!(
    1052             536 :         "pageserver_remote_operation_seconds",
    1053             536 :         "Time spent on remote storage operations. \
    1054             536 :         Grouped by tenant, timeline, operation_kind and status. \
    1055             536 :         Does not account for time spent waiting in remote timeline client's queues.",
    1056             536 :         &["file_kind", "op_kind", "status"]
    1057             536 :     )
    1058             536 :     .expect("failed to define a metric")
    1059             536 : });
    1060                 : 
    1061             481 : pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    1062             481 :     register_int_counter_vec!(
    1063             481 :         "pageserver_tenant_task_events",
    1064             481 :         "Number of task start/stop/fail events.",
    1065             481 :         &["event"],
    1066             481 :     )
    1067             481 :     .expect("Failed to register tenant_task_events metric")
    1068             481 : });
    1069                 : 
    1070                 : pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
    1071             243 :     Lazy::new(|| {
    1072             243 :         register_int_counter_vec!(
    1073             243 :             "pageserver_background_loop_semaphore_wait_start_count",
    1074             243 :             "Counter for background loop concurrency-limiting semaphore acquire calls started",
    1075             243 :             &["task"],
    1076             243 :         )
    1077             243 :         .unwrap()
    1078             243 :     });
    1079                 : 
    1080                 : pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
    1081             243 :     Lazy::new(|| {
    1082             243 :         register_int_counter_vec!(
    1083             243 :             "pageserver_background_loop_semaphore_wait_finish_count",
    1084             243 :             "Counter for background loop concurrency-limiting semaphore acquire calls finished",
    1085             243 :             &["task"],
    1086             243 :         )
    1087             243 :         .unwrap()
    1088             243 :     });
    1089                 : 
    1090             560 : pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    1091             560 :     register_int_counter_vec!(
    1092             560 :         "pageserver_background_loop_period_overrun_count",
    1093             560 :         "Incremented whenever warn_when_period_overrun() logs a warning.",
    1094             560 :         &["task", "period"],
    1095             560 :     )
    1096             560 :     .expect("failed to define a metric")
    1097             560 : });
    1098                 : 
    1099                 : // walreceiver metrics
    1100                 : 
    1101             560 : pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    1102             560 :     register_int_counter!(
    1103             560 :         "pageserver_walreceiver_started_connections_total",
    1104             560 :         "Number of started walreceiver connections"
    1105             560 :     )
    1106             560 :     .expect("failed to define a metric")
    1107             560 : });
    1108                 : 
    1109             560 : pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    1110             560 :     register_int_gauge!(
    1111             560 :         "pageserver_walreceiver_active_managers",
    1112             560 :         "Number of active walreceiver managers"
    1113             560 :     )
    1114             560 :     .expect("failed to define a metric")
    1115             560 : });
    1116                 : 
    1117             389 : pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    1118             389 :     register_int_counter_vec!(
    1119             389 :         "pageserver_walreceiver_switches_total",
    1120             389 :         "Number of walreceiver manager change_connection calls",
    1121             389 :         &["reason"]
    1122             389 :     )
    1123             389 :     .expect("failed to define a metric")
    1124             389 : });
    1125                 : 
    1126             560 : pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    1127             560 :     register_int_counter!(
    1128             560 :         "pageserver_walreceiver_broker_updates_total",
    1129             560 :         "Number of received broker updates in walreceiver"
    1130             560 :     )
    1131             560 :     .expect("failed to define a metric")
    1132             560 : });
    1133                 : 
    1134             561 : pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    1135             561 :     register_int_counter_vec!(
    1136             561 :         "pageserver_walreceiver_candidates_events_total",
    1137             561 :         "Number of walreceiver candidate events",
    1138             561 :         &["event"]
    1139             561 :     )
    1140             561 :     .expect("failed to define a metric")
    1141             561 : });
    1142                 : 
    1143                 : pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
    1144             560 :     Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
    1145                 : 
    1146                 : pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
    1147             561 :     Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
    1148                 : 
    1149                 : // Metrics collected on WAL redo operations
    1150                 : //
    1151                 : // We collect the time spent in actual WAL redo ('redo'), and time waiting
    1152                 : // for access to the postgres process ('wait') since there is only one for
    1153                 : // each tenant.
    1154                 : 
    1155                 : /// Time buckets are small because we want to be able to measure the
    1156                 : /// smallest redo processing times. These buckets allow us to measure down
    1157                 : /// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
    1158                 : /// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
    1159                 : ///
    1160                 : /// Values up to 1s are recorded because metrics show that we have redo
    1161                 : /// durations and lock times larger than 0.250s.
    1162                 : macro_rules! redo_histogram_time_buckets {
    1163                 :     () => {
    1164                 :         vec![
    1165                 :             0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
    1166                 :             0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
    1167                 :             1.000_000,
    1168                 :         ]
    1169                 :     };
    1170                 : }
    1171                 : 
    1172                 : /// While we're at it, also measure the number of records replayed in each
    1173                 : /// operation. We have a global 'total replayed' counter, but that's not
    1174                 : /// as useful as 'what is the skew for how many records we replay in one
    1175                 : /// operation'.
    1176                 : macro_rules! redo_histogram_count_buckets {
    1177                 :     () => {
    1178                 :         vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0]
    1179                 :     };
    1180                 : }
    1181                 : 
    1182                 : macro_rules! redo_bytes_histogram_count_buckets {
    1183                 :     () => {
    1184                 :         // powers of (2^.5), from 2^4.5 to 2^15 (22 buckets)
    1185                 :         // rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too.
    1186                 :         vec![
    1187                 :             24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0,
    1188                 :             2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0,
    1189                 :         ]
    1190                 :     };
    1191                 : }
    1192                 : 
    1193             561 : pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    1194             561 :     register_histogram!(
    1195             561 :         "pageserver_wal_redo_seconds",
    1196             561 :         "Time spent on WAL redo",
    1197             561 :         redo_histogram_time_buckets!()
    1198             561 :     )
    1199             561 :     .expect("failed to define a metric")
    1200             561 : });
    1201                 : 
    1202             561 : pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    1203             561 :     register_histogram!(
    1204             561 :         "pageserver_wal_redo_wait_seconds",
    1205             561 :         "Time spent waiting for access to the Postgres WAL redo process",
    1206             561 :         redo_histogram_time_buckets!(),
    1207             561 :     )
    1208             561 :     .expect("failed to define a metric")
    1209             561 : });
    1210                 : 
    1211             561 : pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    1212             561 :     register_histogram!(
    1213             561 :         "pageserver_wal_redo_records_histogram",
    1214             561 :         "Histogram of number of records replayed per redo in the Postgres WAL redo process",
    1215             561 :         redo_histogram_count_buckets!(),
    1216             561 :     )
    1217             561 :     .expect("failed to define a metric")
    1218             561 : });
    1219                 : 
    1220             561 : pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    1221             561 :     register_histogram!(
    1222             561 :         "pageserver_wal_redo_bytes_histogram",
    1223             561 :         "Histogram of number of records replayed per redo sent to Postgres",
    1224             561 :         redo_bytes_histogram_count_buckets!(),
    1225             561 :     )
    1226             561 :     .expect("failed to define a metric")
    1227             561 : });
    1228                 : 
    1229                 : // FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
    1230             357 : pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    1231             357 :     register_int_counter!(
    1232             357 :         "pageserver_replayed_wal_records_total",
    1233             357 :         "Number of WAL records replayed in WAL redo process"
    1234             357 :     )
    1235             357 :     .unwrap()
    1236             357 : });
    1237                 : 
    1238                 : /// Similar to `prometheus::HistogramTimer` but does not record on drop.
    1239                 : pub struct StorageTimeMetricsTimer {
    1240                 :     metrics: StorageTimeMetrics,
    1241                 :     start: Instant,
    1242                 : }
    1243                 : 
    1244                 : impl StorageTimeMetricsTimer {
    1245            8642 :     fn new(metrics: StorageTimeMetrics) -> Self {
    1246            8642 :         Self {
    1247            8642 :             metrics,
    1248            8642 :             start: Instant::now(),
    1249            8642 :         }
    1250            8642 :     }
    1251                 : 
    1252                 :     /// Record the time from creation to now.
    1253            8614 :     pub fn stop_and_record(self) {
    1254            8614 :         let duration = self.start.elapsed().as_secs_f64();
    1255            8614 :         self.metrics.timeline_sum.inc_by(duration);
    1256            8614 :         self.metrics.timeline_count.inc();
    1257            8614 :         self.metrics.global_histogram.observe(duration);
    1258            8614 :     }
    1259                 : }
    1260                 : 
    1261                 : /// Timing facilities for a globally histogrammed metric, which is supported by per tenant and
    1262                 : /// timeline total sum and count.
    1263            8642 : #[derive(Clone, Debug)]
    1264                 : pub struct StorageTimeMetrics {
    1265                 :     /// Sum of f64 seconds, per operation, tenant_id and timeline_id
    1266                 :     timeline_sum: Counter,
    1267                 :     /// Number of operations, per operation, tenant_id and timeline_id
    1268                 :     timeline_count: IntCounter,
    1269                 :     /// Global histogram having only the "operation" label.
    1270                 :     global_histogram: Histogram,
    1271                 : }
    1272                 : 
    1273                 : impl StorageTimeMetrics {
    1274            9114 :     pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
    1275            9114 :         let operation: &'static str = operation.into();
    1276            9114 : 
    1277            9114 :         let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
    1278            9114 :             .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
    1279            9114 :             .unwrap();
    1280            9114 :         let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
    1281            9114 :             .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
    1282            9114 :             .unwrap();
    1283            9114 :         let global_histogram = STORAGE_TIME_GLOBAL
    1284            9114 :             .get_metric_with_label_values(&[operation])
    1285            9114 :             .unwrap();
    1286            9114 : 
    1287            9114 :         StorageTimeMetrics {
    1288            9114 :             timeline_sum,
    1289            9114 :             timeline_count,
    1290            9114 :             global_histogram,
    1291            9114 :         }
    1292            9114 :     }
    1293                 : 
    1294                 :     /// Starts timing a new operation.
    1295                 :     ///
    1296                 :     /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
    1297            8642 :     pub fn start_timer(&self) -> StorageTimeMetricsTimer {
    1298            8642 :         StorageTimeMetricsTimer::new(self.clone())
    1299            8642 :     }
    1300                 : }
    1301                 : 
    1302 UBC           0 : #[derive(Debug)]
    1303                 : pub struct TimelineMetrics {
    1304                 :     tenant_id: String,
    1305                 :     timeline_id: String,
    1306                 :     pub flush_time_histo: StorageTimeMetrics,
    1307                 :     pub compact_time_histo: StorageTimeMetrics,
    1308                 :     pub create_images_time_histo: StorageTimeMetrics,
    1309                 :     pub logical_size_histo: StorageTimeMetrics,
    1310                 :     pub imitate_logical_size_histo: StorageTimeMetrics,
    1311                 :     pub load_layer_map_histo: StorageTimeMetrics,
    1312                 :     pub garbage_collect_histo: StorageTimeMetrics,
    1313                 :     pub last_record_gauge: IntGauge,
    1314                 :     resident_physical_size_gauge: UIntGauge,
    1315                 :     /// copy of LayeredTimeline.current_logical_size
    1316                 :     pub current_logical_size_gauge: UIntGauge,
    1317                 :     pub num_persistent_files_created: IntCounter,
    1318                 :     pub persistent_bytes_written: IntCounter,
    1319                 :     pub evictions: IntCounter,
    1320                 :     pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
    1321                 : }
    1322                 : 
    1323                 : impl TimelineMetrics {
    1324 CBC        1302 :     pub fn new(
    1325            1302 :         tenant_id: &TenantId,
    1326            1302 :         timeline_id: &TimelineId,
    1327            1302 :         evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
    1328            1302 :     ) -> Self {
    1329            1302 :         let tenant_id = tenant_id.to_string();
    1330            1302 :         let timeline_id = timeline_id.to_string();
    1331            1302 :         let flush_time_histo =
    1332            1302 :             StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
    1333            1302 :         let compact_time_histo =
    1334            1302 :             StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
    1335            1302 :         let create_images_time_histo =
    1336            1302 :             StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
    1337            1302 :         let logical_size_histo =
    1338            1302 :             StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
    1339            1302 :         let imitate_logical_size_histo = StorageTimeMetrics::new(
    1340            1302 :             StorageTimeOperation::ImitateLogicalSize,
    1341            1302 :             &tenant_id,
    1342            1302 :             &timeline_id,
    1343            1302 :         );
    1344            1302 :         let load_layer_map_histo =
    1345            1302 :             StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
    1346            1302 :         let garbage_collect_histo =
    1347            1302 :             StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
    1348            1302 :         let last_record_gauge = LAST_RECORD_LSN
    1349            1302 :             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
    1350            1302 :             .unwrap();
    1351            1302 :         let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
    1352            1302 :             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
    1353            1302 :             .unwrap();
    1354            1302 :         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
    1355            1302 :             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
    1356            1302 :             .unwrap();
    1357            1302 :         let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
    1358            1302 :             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
    1359            1302 :             .unwrap();
    1360            1302 :         let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
    1361            1302 :             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
    1362            1302 :             .unwrap();
    1363            1302 :         let evictions = EVICTIONS
    1364            1302 :             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
    1365            1302 :             .unwrap();
    1366            1302 :         let evictions_with_low_residence_duration =
    1367            1302 :             evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
    1368            1302 : 
    1369            1302 :         TimelineMetrics {
    1370            1302 :             tenant_id,
    1371            1302 :             timeline_id,
    1372            1302 :             flush_time_histo,
    1373            1302 :             compact_time_histo,
    1374            1302 :             create_images_time_histo,
    1375            1302 :             logical_size_histo,
    1376            1302 :             imitate_logical_size_histo,
    1377            1302 :             garbage_collect_histo,
    1378            1302 :             load_layer_map_histo,
    1379            1302 :             last_record_gauge,
    1380            1302 :             resident_physical_size_gauge,
    1381            1302 :             current_logical_size_gauge,
    1382            1302 :             num_persistent_files_created,
    1383            1302 :             persistent_bytes_written,
    1384            1302 :             evictions,
    1385            1302 :             evictions_with_low_residence_duration: std::sync::RwLock::new(
    1386            1302 :                 evictions_with_low_residence_duration,
    1387            1302 :             ),
    1388            1302 :         }
    1389            1302 :     }
    1390                 : 
    1391           18600 :     pub fn record_new_file_metrics(&self, sz: u64) {
    1392           18600 :         self.resident_physical_size_add(sz);
    1393           18600 :         self.num_persistent_files_created.inc_by(1);
    1394           18600 :         self.persistent_bytes_written.inc_by(sz);
    1395           18600 :     }
    1396                 : 
    1397            7441 :     pub fn resident_physical_size_sub(&self, sz: u64) {
    1398            7441 :         self.resident_physical_size_gauge.sub(sz);
    1399            7441 :         crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
    1400            7441 :     }
    1401                 : 
    1402           19970 :     pub fn resident_physical_size_add(&self, sz: u64) {
    1403           19970 :         self.resident_physical_size_gauge.add(sz);
    1404           19970 :         crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    1405           19970 :     }
    1406                 : 
    1407             314 :     pub fn resident_physical_size_set(&self, sz: u64) {
    1408             314 :         self.resident_physical_size_gauge.set(sz);
    1409             314 :         crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
    1410             314 :     }
    1411                 : 
    1412             255 :     pub fn resident_physical_size_get(&self) -> u64 {
    1413             255 :         self.resident_physical_size_gauge.get()
    1414             255 :     }
    1415                 : }
    1416                 : 
    1417                 : impl Drop for TimelineMetrics {
    1418             243 :     fn drop(&mut self) {
    1419             243 :         let tenant_id = &self.tenant_id;
    1420             243 :         let timeline_id = &self.timeline_id;
    1421             243 :         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
    1422             243 :         {
    1423             243 :             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
    1424             243 :             let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
    1425             243 :         }
    1426             243 :         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
    1427             243 :         let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
    1428             243 :         let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
    1429             243 :         let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
    1430             243 : 
    1431             243 :         self.evictions_with_low_residence_duration
    1432             243 :             .write()
    1433             243 :             .unwrap()
    1434             243 :             .remove(tenant_id, timeline_id);
    1435                 : 
    1436                 :         // The following metrics are born outside of the TimelineMetrics lifecycle but still
    1437                 :         // removed at the end of it. The idea is to have the metrics outlive the
    1438                 :         // entity during which they're observed, e.g., the smgr metrics shall
    1439                 :         // outlive an individual smgr connection, but not the timeline.
    1440                 : 
    1441            2187 :         for op in StorageTimeOperation::VARIANTS {
    1442            1944 :             let _ =
    1443            1944 :                 STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
    1444            1944 :             let _ =
    1445            1944 :                 STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
    1446            1944 :         }
    1447                 : 
    1448             729 :         for op in STORAGE_IO_SIZE_OPERATIONS {
    1449             486 :             let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
    1450             486 :         }
    1451                 : 
    1452            1215 :         for op in SmgrQueryType::iter() {
    1453             972 :             let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
    1454             972 :                 op.into(),
    1455             972 :                 tenant_id,
    1456             972 :                 timeline_id,
    1457             972 :             ]);
    1458             972 :         }
    1459             243 :     }
    1460                 : }
    1461                 : 
    1462             139 : pub fn remove_tenant_metrics(tenant_id: &TenantId) {
    1463             139 :     let tid = tenant_id.to_string();
    1464             139 :     let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
    1465             139 :     // we leave the BROKEN_TENANTS_SET entry if any
    1466             139 : }
    1467                 : 
    1468                 : use futures::Future;
    1469                 : use pin_project_lite::pin_project;
    1470                 : use std::collections::HashMap;
    1471                 : use std::pin::Pin;
    1472                 : use std::sync::{Arc, Mutex};
    1473                 : use std::task::{Context, Poll};
    1474                 : use std::time::{Duration, Instant};
    1475                 : 
    1476                 : use crate::context::{PageContentKind, RequestContext};
    1477                 : use crate::task_mgr::TaskKind;
    1478                 : 
    1479                 : /// Maintain a per timeline gauge in addition to the global gauge.
    1480                 : struct PerTimelineRemotePhysicalSizeGauge {
    1481                 :     last_set: u64,
    1482                 :     gauge: UIntGauge,
    1483                 : }
    1484                 : 
    1485                 : impl PerTimelineRemotePhysicalSizeGauge {
    1486            1302 :     fn new(per_timeline_gauge: UIntGauge) -> Self {
    1487            1302 :         Self {
    1488            1302 :             last_set: per_timeline_gauge.get(),
    1489            1302 :             gauge: per_timeline_gauge,
    1490            1302 :         }
    1491            1302 :     }
    1492            7290 :     fn set(&mut self, sz: u64) {
    1493            7290 :         self.gauge.set(sz);
    1494            7290 :         if sz < self.last_set {
    1495              73 :             REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
    1496            7217 :         } else {
    1497            7217 :             REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
    1498            7217 :         };
    1499            7290 :         self.last_set = sz;
    1500            7290 :     }
    1501              12 :     fn get(&self) -> u64 {
    1502              12 :         self.gauge.get()
    1503              12 :     }
    1504                 : }
    1505                 : 
    1506                 : impl Drop for PerTimelineRemotePhysicalSizeGauge {
    1507             243 :     fn drop(&mut self) {
    1508             243 :         REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
    1509             243 :     }
    1510                 : }
    1511                 : 
    1512                 : pub struct RemoteTimelineClientMetrics {
    1513                 :     tenant_id: String,
    1514                 :     timeline_id: String,
    1515                 :     remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
    1516                 :     calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
    1517                 :     bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    1518                 :     bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    1519                 : }
    1520                 : 
    1521                 : impl RemoteTimelineClientMetrics {
    1522            1600 :     pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
    1523            1600 :         RemoteTimelineClientMetrics {
    1524            1600 :             tenant_id: tenant_id.to_string(),
    1525            1600 :             timeline_id: timeline_id.to_string(),
    1526            1600 :             calls_unfinished_gauge: Mutex::new(HashMap::default()),
    1527            1600 :             bytes_started_counter: Mutex::new(HashMap::default()),
    1528            1600 :             bytes_finished_counter: Mutex::new(HashMap::default()),
    1529            1600 :             remote_physical_size_gauge: Mutex::new(None),
    1530            1600 :         }
    1531            1600 :     }
    1532                 : 
    1533            7290 :     pub(crate) fn remote_physical_size_set(&self, sz: u64) {
    1534            7290 :         let mut guard = self.remote_physical_size_gauge.lock().unwrap();
    1535            7290 :         let gauge = guard.get_or_insert_with(|| {
    1536            1302 :             PerTimelineRemotePhysicalSizeGauge::new(
    1537            1302 :                 REMOTE_PHYSICAL_SIZE
    1538            1302 :                     .get_metric_with_label_values(&[
    1539            1302 :                         &self.tenant_id.to_string(),
    1540            1302 :                         &self.timeline_id.to_string(),
    1541            1302 :                     ])
    1542            1302 :                     .unwrap(),
    1543            1302 :             )
    1544            7290 :         });
    1545            7290 :         gauge.set(sz);
    1546            7290 :     }
    1547                 : 
    1548              12 :     pub(crate) fn remote_physical_size_get(&self) -> u64 {
    1549              12 :         let guard = self.remote_physical_size_gauge.lock().unwrap();
    1550              12 :         guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
    1551              12 :     }
    1552                 : 
    1553           28743 :     pub fn remote_operation_time(
    1554           28743 :         &self,
    1555           28743 :         file_kind: &RemoteOpFileKind,
    1556           28743 :         op_kind: &RemoteOpKind,
    1557           28743 :         status: &'static str,
    1558           28743 :     ) -> Histogram {
    1559           28743 :         let key = (file_kind.as_str(), op_kind.as_str(), status);
    1560           28743 :         REMOTE_OPERATION_TIME
    1561           28743 :             .get_metric_with_label_values(&[key.0, key.1, key.2])
    1562           28743 :             .unwrap()
    1563           28743 :     }
    1564                 : 
    1565           50805 :     fn calls_unfinished_gauge(
    1566           50805 :         &self,
    1567           50805 :         file_kind: &RemoteOpFileKind,
    1568           50805 :         op_kind: &RemoteOpKind,
    1569           50805 :     ) -> IntGauge {
    1570           50805 :         let mut guard = self.calls_unfinished_gauge.lock().unwrap();
    1571           50805 :         let key = (file_kind.as_str(), op_kind.as_str());
    1572           50805 :         let metric = guard.entry(key).or_insert_with(move || {
    1573            2668 :             REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
    1574            2668 :                 .get_metric_with_label_values(&[
    1575            2668 :                     &self.tenant_id.to_string(),
    1576            2668 :                     &self.timeline_id.to_string(),
    1577            2668 :                     key.0,
    1578            2668 :                     key.1,
    1579            2668 :                 ])
    1580            2668 :                 .unwrap()
    1581           50805 :         });
    1582           50805 :         metric.clone()
    1583           50805 :     }
    1584                 : 
    1585           27030 :     fn calls_started_hist(
    1586           27030 :         &self,
    1587           27030 :         file_kind: &RemoteOpFileKind,
    1588           27030 :         op_kind: &RemoteOpKind,
    1589           27030 :     ) -> Histogram {
    1590           27030 :         let key = (file_kind.as_str(), op_kind.as_str());
    1591           27030 :         REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
    1592           27030 :             .get_metric_with_label_values(&[key.0, key.1])
    1593           27030 :             .unwrap()
    1594           27030 :     }
    1595                 : 
    1596           18600 :     fn bytes_started_counter(
    1597           18600 :         &self,
    1598           18600 :         file_kind: &RemoteOpFileKind,
    1599           18600 :         op_kind: &RemoteOpKind,
    1600           18600 :     ) -> IntCounter {
    1601           18600 :         let mut guard = self.bytes_started_counter.lock().unwrap();
    1602           18600 :         let key = (file_kind.as_str(), op_kind.as_str());
    1603           18600 :         let metric = guard.entry(key).or_insert_with(move || {
    1604             786 :             REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
    1605             786 :                 .get_metric_with_label_values(&[
    1606             786 :                     &self.tenant_id.to_string(),
    1607             786 :                     &self.timeline_id.to_string(),
    1608             786 :                     key.0,
    1609             786 :                     key.1,
    1610             786 :                 ])
    1611             786 :                 .unwrap()
    1612           18600 :         });
    1613           18600 :         metric.clone()
    1614           18600 :     }
    1615                 : 
    1616           35730 :     fn bytes_finished_counter(
    1617           35730 :         &self,
    1618           35730 :         file_kind: &RemoteOpFileKind,
    1619           35730 :         op_kind: &RemoteOpKind,
    1620           35730 :     ) -> IntCounter {
    1621           35730 :         let mut guard = self.bytes_finished_counter.lock().unwrap();
    1622           35730 :         let key = (file_kind.as_str(), op_kind.as_str());
    1623           35730 :         let metric = guard.entry(key).or_insert_with(move || {
    1624             786 :             REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
    1625             786 :                 .get_metric_with_label_values(&[
    1626             786 :                     &self.tenant_id.to_string(),
    1627             786 :                     &self.timeline_id.to_string(),
    1628             786 :                     key.0,
    1629             786 :                     key.1,
    1630             786 :                 ])
    1631             786 :                 .unwrap()
    1632           35730 :         });
    1633           35730 :         metric.clone()
    1634           35730 :     }
    1635                 : }
    1636                 : 
    1637                 : #[cfg(test)]
    1638                 : impl RemoteTimelineClientMetrics {
    1639               3 :     pub fn get_bytes_started_counter_value(
    1640               3 :         &self,
    1641               3 :         file_kind: &RemoteOpFileKind,
    1642               3 :         op_kind: &RemoteOpKind,
    1643               3 :     ) -> Option<u64> {
    1644               3 :         let guard = self.bytes_started_counter.lock().unwrap();
    1645               3 :         let key = (file_kind.as_str(), op_kind.as_str());
    1646               3 :         guard.get(&key).map(|counter| counter.get())
    1647               3 :     }
    1648                 : 
    1649               3 :     pub fn get_bytes_finished_counter_value(
    1650               3 :         &self,
    1651               3 :         file_kind: &RemoteOpFileKind,
    1652               3 :         op_kind: &RemoteOpKind,
    1653               3 :     ) -> Option<u64> {
    1654               3 :         let guard = self.bytes_finished_counter.lock().unwrap();
    1655               3 :         let key = (file_kind.as_str(), op_kind.as_str());
    1656               3 :         guard.get(&key).map(|counter| counter.get())
    1657               3 :     }
    1658                 : }
    1659                 : 
    1660                 : /// See [`RemoteTimelineClientMetrics::call_begin`].
    1661                 : #[must_use]
    1662                 : pub(crate) struct RemoteTimelineClientCallMetricGuard {
    1663                 :     /// Decremented on drop.
    1664                 :     calls_unfinished_metric: Option<IntGauge>,
    1665                 :     /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
    1666                 :     bytes_finished: Option<(IntCounter, u64)>,
    1667                 : }
    1668                 : 
    1669                 : impl RemoteTimelineClientCallMetricGuard {
    1670                 :     /// Consume this guard object without performing the metric updates it would do on `drop()`.
    1671                 :     /// The caller vouches to do the metric updates manually.
    1672           25298 :     pub fn will_decrement_manually(mut self) {
    1673           25298 :         let RemoteTimelineClientCallMetricGuard {
    1674           25298 :             calls_unfinished_metric,
    1675           25298 :             bytes_finished,
    1676           25298 :         } = &mut self;
    1677           25298 :         calls_unfinished_metric.take();
    1678           25298 :         bytes_finished.take();
    1679           25298 :     }
    1680                 : }
    1681                 : 
    1682                 : impl Drop for RemoteTimelineClientCallMetricGuard {
    1683           27027 :     fn drop(&mut self) {
    1684           27027 :         let RemoteTimelineClientCallMetricGuard {
    1685           27027 :             calls_unfinished_metric,
    1686           27027 :             bytes_finished,
    1687           27027 :         } = self;
    1688           27027 :         if let Some(guard) = calls_unfinished_metric.take() {
    1689            1729 :             guard.dec();
    1690           25298 :         }
    1691           27027 :         if let Some((bytes_finished_metric, value)) = bytes_finished {
    1692 UBC           0 :             bytes_finished_metric.inc_by(*value);
    1693 CBC       27027 :         }
    1694           27027 :     }
    1695                 : }
    1696                 : 
    1697                 : /// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
    1698                 : /// track the byte size of this call in applicable metric(s).
    1699                 : pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
    1700                 :     /// Do not account for this call's byte size in any metrics.
    1701                 :     /// The `reason` field is there to make the call sites self-documenting
    1702                 :     /// about why they don't need the metric.
    1703                 :     DontTrackSize { reason: &'static str },
    1704                 :     /// Track the byte size of the call in applicable metric(s).
    1705                 :     Bytes(u64),
    1706                 : }
    1707                 : 
    1708                 : impl RemoteTimelineClientMetrics {
    1709                 :     /// Update the metrics that change when a call to the remote timeline client instance starts.
    1710                 :     ///
    1711                 :     /// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
    1712                 :     /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
    1713                 :     /// is more suitable.
    1714                 :     /// Never do both.
    1715           27030 :     pub(crate) fn call_begin(
    1716           27030 :         &self,
    1717           27030 :         file_kind: &RemoteOpFileKind,
    1718           27030 :         op_kind: &RemoteOpKind,
    1719           27030 :         size: RemoteTimelineClientMetricsCallTrackSize,
    1720           27030 :     ) -> RemoteTimelineClientCallMetricGuard {
    1721           27030 :         let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
    1722           27030 :         self.calls_started_hist(file_kind, op_kind)
    1723           27030 :             .observe(calls_unfinished_metric.get() as f64);
    1724           27030 :         calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
    1725                 : 
    1726           27030 :         let bytes_finished = match size {
    1727            8430 :             RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
    1728            8430 :                 // nothing to do
    1729            8430 :                 None
    1730                 :             }
    1731           18600 :             RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
    1732           18600 :                 self.bytes_started_counter(file_kind, op_kind).inc_by(size);
    1733           18600 :                 let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
    1734           18600 :                 Some((finished_counter, size))
    1735                 :             }
    1736                 :         };
    1737           27030 :         RemoteTimelineClientCallMetricGuard {
    1738           27030 :             calls_unfinished_metric: Some(calls_unfinished_metric),
    1739           27030 :             bytes_finished,
    1740           27030 :         }
    1741           27030 :     }
    1742                 : 
    1743                 :     /// Manually update the metrics that track completions, instead of using the guard object.
    1744                 :     /// Using the guard object is generally preferable.
    1745                 :     /// See [`call_begin`](Self::call_begin) for more context.
    1746           23775 :     pub(crate) fn call_end(
    1747           23775 :         &self,
    1748           23775 :         file_kind: &RemoteOpFileKind,
    1749           23775 :         op_kind: &RemoteOpKind,
    1750           23775 :         size: RemoteTimelineClientMetricsCallTrackSize,
    1751           23775 :     ) {
    1752           23775 :         let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
    1753                 :         debug_assert!(
    1754           23775 :             calls_unfinished_metric.get() > 0,
    1755 UBC           0 :             "begin and end should cancel out"
    1756                 :         );
    1757 CBC       23775 :         calls_unfinished_metric.dec();
    1758           23775 :         match size {
    1759            6645 :             RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
    1760           17130 :             RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
    1761           17130 :                 self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
    1762           17130 :             }
    1763                 :         }
    1764           23775 :     }
    1765                 : }
    1766                 : 
    1767                 : impl Drop for RemoteTimelineClientMetrics {
    1768             541 :     fn drop(&mut self) {
    1769             541 :         let RemoteTimelineClientMetrics {
    1770             541 :             tenant_id,
    1771             541 :             timeline_id,
    1772             541 :             remote_physical_size_gauge,
    1773             541 :             calls_unfinished_gauge,
    1774             541 :             bytes_started_counter,
    1775             541 :             bytes_finished_counter,
    1776             541 :         } = self;
    1777             541 :         for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
    1778             520 :             let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
    1779             520 :                 tenant_id,
    1780             520 :                 timeline_id,
    1781             520 :                 a,
    1782             520 :                 b,
    1783             520 :             ]);
    1784             520 :         }
    1785             541 :         for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
    1786             161 :             let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
    1787             161 :                 tenant_id,
    1788             161 :                 timeline_id,
    1789             161 :                 a,
    1790             161 :                 b,
    1791             161 :             ]);
    1792             161 :         }
    1793             541 :         for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
    1794             161 :             let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
    1795             161 :                 tenant_id,
    1796             161 :                 timeline_id,
    1797             161 :                 a,
    1798             161 :                 b,
    1799             161 :             ]);
    1800             161 :         }
    1801             541 :         {
    1802             541 :             let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in destructuring above
    1803             541 :             let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
    1804             541 :         }
    1805             541 :     }
    1806                 : }
    1807                 : 
    1808                 : /// Wrapper future that measures the time spent by a remote storage operation,
    1809                 : /// and records the time and success/failure as a prometheus metric.
    1810                 : pub trait MeasureRemoteOp: Sized {
    1811           28827 :     fn measure_remote_op(
    1812           28827 :         self,
    1813           28827 :         tenant_id: TenantId,
    1814           28827 :         timeline_id: TimelineId,
    1815           28827 :         file_kind: RemoteOpFileKind,
    1816           28827 :         op: RemoteOpKind,
    1817           28827 :         metrics: Arc<RemoteTimelineClientMetrics>,
    1818           28827 :     ) -> MeasuredRemoteOp<Self> {
    1819           28827 :         let start = Instant::now();
    1820           28827 :         MeasuredRemoteOp {
    1821           28827 :             inner: self,
    1822           28827 :             tenant_id,
    1823           28827 :             timeline_id,
    1824           28827 :             file_kind,
    1825           28827 :             op,
    1826           28827 :             start,
    1827           28827 :             metrics,
    1828           28827 :         }
    1829           28827 :     }
    1830                 : }
    1831                 : 
    1832                 : impl<T: Sized> MeasureRemoteOp for T {}
    1833                 : 
    1834                 : pin_project! {
    1835                 :     pub struct MeasuredRemoteOp<F>
    1836                 :     {
    1837                 :         #[pin]
    1838                 :         inner: F,
    1839                 :         tenant_id: TenantId,
    1840                 :         timeline_id: TimelineId,
    1841                 :         file_kind: RemoteOpFileKind,
    1842                 :         op: RemoteOpKind,
    1843                 :         start: Instant,
    1844                 :         metrics: Arc<RemoteTimelineClientMetrics>,
    1845                 :     }
    1846                 : }
    1847                 : 
    1848                 : impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
    1849                 :     type Output = Result<O, E>;
    1850                 : 
    1851         3763127 :     fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
    1852         3763127 :         let this = self.project();
    1853         3763127 :         let poll_result = this.inner.poll(cx);
    1854         3763127 :         if let Poll::Ready(ref res) = poll_result {
    1855           28743 :             let duration = this.start.elapsed();
    1856           28743 :             let status = if res.is_ok() { &"success" } else { &"failure" };
    1857           28743 :             this.metrics
    1858           28743 :                 .remote_operation_time(this.file_kind, this.op, status)
    1859           28743 :                 .observe(duration.as_secs_f64());
    1860         3734384 :         }
    1861         3763127 :         poll_result
    1862         3763127 :     }
    1863                 : }
    1864                 : 
    1865             560 : pub fn preinitialize_metrics() {
    1866             560 :     // Python tests need these and on some we do alerting.
    1867             560 :     //
    1868             560 :     // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
    1869             560 :     // order:
    1870             560 :     // - global metrics reside in a Lazy<PageserverMetrics>
    1871             560 :     //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
    1872             560 :     // - could move the statics into TimelineMetrics::new()?
    1873             560 : 
    1874             560 :     // counters
    1875             560 :     [
    1876             560 :         &MATERIALIZED_PAGE_CACHE_HIT,
    1877             560 :         &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
    1878             560 :         &UNEXPECTED_ONDEMAND_DOWNLOADS,
    1879             560 :         &WALRECEIVER_STARTED_CONNECTIONS,
    1880             560 :         &WALRECEIVER_BROKER_UPDATES,
    1881             560 :         &WALRECEIVER_CANDIDATES_ADDED,
    1882             560 :         &WALRECEIVER_CANDIDATES_REMOVED,
    1883             560 :     ]
    1884             560 :     .into_iter()
    1885            3920 :     .for_each(|c| {
    1886            3920 :         Lazy::force(c);
    1887            3920 :     });
    1888             560 : 
    1889             560 :     // Deletion queue stats
    1890             560 :     Lazy::force(&DELETION_QUEUE);
    1891             560 : 
    1892             560 :     // countervecs
    1893             560 :     [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
    1894             560 :         .into_iter()
    1895             560 :         .for_each(|c| {
    1896             560 :             Lazy::force(c);
    1897             560 :         });
    1898             560 : 
    1899             560 :     // gauges
    1900             560 :     WALRECEIVER_ACTIVE_MANAGERS.get();
    1901             560 : 
    1902             560 :     // histograms
    1903             560 :     [
    1904             560 :         &READ_NUM_FS_LAYERS,
    1905             560 :         &WAIT_LSN_TIME,
    1906             560 :         &WAL_REDO_TIME,
    1907             560 :         &WAL_REDO_WAIT_TIME,
    1908             560 :         &WAL_REDO_RECORDS_HISTOGRAM,
    1909             560 :         &WAL_REDO_BYTES_HISTOGRAM,
    1910             560 :     ]
    1911             560 :     .into_iter()
    1912            3360 :     .for_each(|h| {
    1913            3360 :         Lazy::force(h);
    1914            3360 :     });
    1915             560 : 
    1916             560 :     // Custom
    1917             560 :     Lazy::force(&RECONSTRUCT_TIME);
    1918             560 : }
        

Generated by: LCOV version 2.1-beta