LCOV - code coverage report
Current view: top level - storage_controller/src - metrics.rs (source / functions) Coverage Total Hit
Test: 98683a8629f0f7f0031d02e04512998d589d76ea.info Lines: 27.6 % 76 21
Test Date: 2025-04-11 16:58:57 Functions: 73.5 % 34 25

            Line data    Source code
       1              : //!
       2              : //! This module provides metric definitions for the storage controller.
       3              : //!
       4              : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
       5              : //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
       6              : //! static.
       7              : //!
       8              : //! The rest of the code defines label group types and deals with converting outer types to labels.
       9              : //!
      10              : use std::sync::Mutex;
      11              : 
      12              : use bytes::Bytes;
      13              : use measured::label::LabelValue;
      14              : use measured::metric::histogram;
      15              : use measured::{FixedCardinalityLabel, MetricGroup};
      16              : use metrics::NeonMetrics;
      17              : use once_cell::sync::Lazy;
      18              : use strum::IntoEnumIterator;
      19              : 
      20              : use crate::persistence::{DatabaseError, DatabaseOperation};
      21              : use crate::service::LeadershipStatus;
      22              : 
      23              : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
      24              :     Lazy::new(StorageControllerMetrics::default);
      25              : 
      26            0 : pub fn preinitialize_metrics() {
      27            0 :     Lazy::force(&METRICS_REGISTRY);
      28            0 : }
      29              : 
      30              : pub(crate) struct StorageControllerMetrics {
      31              :     pub(crate) metrics_group: StorageControllerMetricGroup,
      32              :     encoder: Mutex<measured::text::BufferedTextEncoder>,
      33              : }
      34              : 
      35           17 : #[derive(measured::MetricGroup)]
      36              : #[metric(new())]
      37              : pub(crate) struct StorageControllerMetricGroup {
      38              :     /// Count of how many times we spawn a reconcile task
      39              :     pub(crate) storage_controller_reconcile_spawn: measured::Counter,
      40              : 
      41              :     /// Size of the in-memory map of tenant shards
      42              :     pub(crate) storage_controller_tenant_shards: measured::Gauge,
      43              : 
      44              :     /// Size of the in-memory map of pageserver_nodes
      45              :     pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
      46              : 
      47              :     /// Count of how many pageserver nodes from in-memory map have https configured
      48              :     pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge,
      49              : 
      50              :     /// Size of the in-memory map of safekeeper_nodes
      51              :     pub(crate) storage_controller_safekeeper_nodes: measured::Gauge,
      52              : 
      53              :     /// Count of how many safekeeper nodes from in-memory map have https configured
      54              :     pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge,
      55              : 
      56              :     /// Reconciler tasks completed, broken down by success/failure/cancelled
      57              :     pub(crate) storage_controller_reconcile_complete:
      58              :         measured::CounterVec<ReconcileCompleteLabelGroupSet>,
      59              : 
      60              :     /// Count of how many times we make an optimization change to a tenant's scheduling
      61              :     pub(crate) storage_controller_schedule_optimization: measured::Counter,
      62              : 
      63              :     /// How many shards are not scheduled into their preferred AZ
      64              :     pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
      65              : 
      66              :     /// How many shard locations (secondary or attached) on each node
      67              :     pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
      68              : 
      69              :     /// How many _attached_ shard locations on each node
      70              :     pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
      71              : 
      72              :     /// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
      73              :     /// preferred AZ)
      74              :     pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
      75              : 
      76              :     /// How many shards would like to reconcile but were blocked by concurrency limits
      77              :     pub(crate) storage_controller_pending_reconciles: measured::Gauge,
      78              : 
      79              :     /// HTTP request status counters for handled requests
      80              :     pub(crate) storage_controller_http_request_status:
      81              :         measured::CounterVec<HttpRequestStatusLabelGroupSet>,
      82              : 
      83              :     /// HTTP request handler latency across all status codes
      84              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      85              :     pub(crate) storage_controller_http_request_latency:
      86              :         measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
      87              : 
      88              :     /// HTTP rate limiting latency across all tenants and endpoints
      89              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 10.0))]
      90              :     pub(crate) storage_controller_http_request_rate_limited: measured::Histogram<10>,
      91              : 
      92              :     /// Count of HTTP requests to the pageserver that resulted in an error,
      93              :     /// broken down by the pageserver node id, request name and method
      94              :     pub(crate) storage_controller_pageserver_request_error:
      95              :         measured::CounterVec<PageserverRequestLabelGroupSet>,
      96              : 
      97              :     /// Count of HTTP requests to the safekeeper that resulted in an error,
      98              :     /// broken down by the safekeeper node id, request name and method
      99              :     pub(crate) storage_controller_safekeeper_request_error:
     100              :         measured::CounterVec<PageserverRequestLabelGroupSet>,
     101              : 
     102              :     /// Latency of HTTP requests to the pageserver, broken down by pageserver
     103              :     /// node id, request name and method. This includes both successful and unsuccessful
     104              :     /// requests.
     105              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
     106              :     pub(crate) storage_controller_pageserver_request_latency:
     107              :         measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
     108              : 
     109              :     /// Latency of HTTP requests to the safekeeper, broken down by safekeeper
     110              :     /// node id, request name and method. This includes both successful and unsuccessful
     111              :     /// requests.
     112              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
     113              :     pub(crate) storage_controller_safekeeper_request_latency:
     114              :         measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
     115              : 
     116              :     /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
     117              :     /// broken down by the pageserver node id, request name and method
     118              :     pub(crate) storage_controller_passthrough_request_error:
     119              :         measured::CounterVec<PageserverRequestLabelGroupSet>,
     120              : 
     121              :     /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
     122              :     /// node id, request name and method. This includes both successful and unsuccessful
     123              :     /// requests.
     124              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
     125              :     pub(crate) storage_controller_passthrough_request_latency:
     126              :         measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
     127              : 
     128              :     /// Count of errors in database queries, broken down by error type and operation.
     129              :     pub(crate) storage_controller_database_query_error:
     130              :         measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
     131              : 
     132              :     /// Latency of database queries, broken down by operation.
     133              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
     134              :     pub(crate) storage_controller_database_query_latency:
     135              :         measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
     136              : 
     137              :     pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
     138              : 
     139              :     /// Count of long-running reconciles, broken down by tenant id, shard number and sequence
     140              :     pub(crate) storage_controller_reconcile_long_running:
     141              :         measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
     142              : }
     143              : 
     144              : impl StorageControllerMetrics {
     145            0 :     pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
     146            0 :         let mut encoder = self.encoder.lock().unwrap();
     147            0 :         neon_metrics
     148            0 :             .collect_group_into(&mut *encoder)
     149            0 :             .unwrap_or_else(|infallible| match infallible {});
     150            0 :         self.metrics_group
     151            0 :             .collect_group_into(&mut *encoder)
     152            0 :             .unwrap_or_else(|infallible| match infallible {});
     153            0 :         encoder.finish()
     154            0 :     }
     155              : }
     156              : 
     157              : impl Default for StorageControllerMetrics {
     158           17 :     fn default() -> Self {
     159           17 :         let mut metrics_group = StorageControllerMetricGroup::new();
     160           17 :         metrics_group
     161           17 :             .storage_controller_reconcile_complete
     162           17 :             .init_all_dense();
     163           17 : 
     164           17 :         Self {
     165           17 :             metrics_group,
     166           17 :             encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
     167           17 :         }
     168           17 :     }
     169              : }
     170              : 
     171          102 : #[derive(measured::LabelGroup, Clone)]
     172              : #[label(set = NodeLabelGroupSet)]
     173              : pub(crate) struct NodeLabelGroup<'a> {
     174              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     175              :     pub(crate) az: &'a str,
     176              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     177              :     pub(crate) node_id: &'a str,
     178              : }
     179              : 
     180           51 : #[derive(measured::LabelGroup)]
     181              : #[label(set = ReconcileCompleteLabelGroupSet)]
     182              : pub(crate) struct ReconcileCompleteLabelGroup {
     183              :     pub(crate) status: ReconcileOutcome,
     184              : }
     185              : 
     186           34 : #[derive(measured::LabelGroup)]
     187              : #[label(set = HttpRequestStatusLabelGroupSet)]
     188              : pub(crate) struct HttpRequestStatusLabelGroup<'a> {
     189              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     190              :     pub(crate) path: &'a str,
     191              :     pub(crate) method: Method,
     192              :     pub(crate) status: StatusCode,
     193              : }
     194              : 
     195           34 : #[derive(measured::LabelGroup)]
     196              : #[label(set = HttpRequestLatencyLabelGroupSet)]
     197              : pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
     198              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     199              :     pub(crate) path: &'a str,
     200              :     pub(crate) method: Method,
     201              : }
     202              : 
     203          204 : #[derive(measured::LabelGroup, Clone)]
     204              : #[label(set = PageserverRequestLabelGroupSet)]
     205              : pub(crate) struct PageserverRequestLabelGroup<'a> {
     206              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     207              :     pub(crate) pageserver_id: &'a str,
     208              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     209              :     pub(crate) path: &'a str,
     210              :     pub(crate) method: Method,
     211              : }
     212              : 
     213           68 : #[derive(measured::LabelGroup)]
     214              : #[label(set = DatabaseQueryErrorLabelGroupSet)]
     215              : pub(crate) struct DatabaseQueryErrorLabelGroup {
     216              :     pub(crate) error_type: DatabaseErrorLabel,
     217              :     pub(crate) operation: DatabaseOperation,
     218              : }
     219              : 
     220           51 : #[derive(measured::LabelGroup)]
     221              : #[label(set = DatabaseQueryLatencyLabelGroupSet)]
     222              : pub(crate) struct DatabaseQueryLatencyLabelGroup {
     223              :     pub(crate) operation: DatabaseOperation,
     224              : }
     225              : 
     226           51 : #[derive(measured::LabelGroup)]
     227              : #[label(set = LeadershipStatusGroupSet)]
     228              : pub(crate) struct LeadershipStatusGroup {
     229              :     pub(crate) status: LeadershipStatus,
     230              : }
     231              : 
     232           34 : #[derive(measured::LabelGroup, Clone)]
     233              : #[label(set = ReconcileLongRunningLabelGroupSet)]
     234              : pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
     235              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     236              :     pub(crate) tenant_id: &'a str,
     237              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     238              :     pub(crate) shard_number: &'a str,
     239              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     240              :     pub(crate) sequence: &'a str,
     241              : }
     242              : 
     243              : #[derive(FixedCardinalityLabel, Clone, Copy)]
     244              : pub(crate) enum ReconcileOutcome {
     245              :     #[label(rename = "ok")]
     246              :     Success,
     247              :     Error,
     248              :     Cancel,
     249              : }
     250              : 
     251              : #[derive(FixedCardinalityLabel, Copy, Clone)]
     252              : pub(crate) enum Method {
     253              :     Get,
     254              :     Put,
     255              :     Post,
     256              :     Delete,
     257              :     Other,
     258              : }
     259              : 
     260              : impl From<hyper::Method> for Method {
     261            0 :     fn from(value: hyper::Method) -> Self {
     262            0 :         if value == hyper::Method::GET {
     263            0 :             Method::Get
     264            0 :         } else if value == hyper::Method::PUT {
     265            0 :             Method::Put
     266            0 :         } else if value == hyper::Method::POST {
     267            0 :             Method::Post
     268            0 :         } else if value == hyper::Method::DELETE {
     269            0 :             Method::Delete
     270              :         } else {
     271            0 :             Method::Other
     272              :         }
     273            0 :     }
     274              : }
     275              : 
     276              : #[derive(Clone, Copy)]
     277              : pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
     278              : 
     279              : impl LabelValue for StatusCode {
     280            0 :     fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
     281            0 :         v.write_int(self.0.as_u16() as i64)
     282            0 :     }
     283              : }
     284              : 
     285              : impl FixedCardinalityLabel for StatusCode {
     286            0 :     fn cardinality() -> usize {
     287            0 :         (100..1000).len()
     288            0 :     }
     289              : 
     290            0 :     fn encode(&self) -> usize {
     291            0 :         self.0.as_u16() as usize
     292            0 :     }
     293              : 
     294            0 :     fn decode(value: usize) -> Self {
     295            0 :         Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
     296            0 :     }
     297              : }
     298              : 
     299              : #[derive(FixedCardinalityLabel, Clone, Copy)]
     300              : pub(crate) enum DatabaseErrorLabel {
     301              :     Query,
     302              :     Connection,
     303              :     ConnectionPool,
     304              :     Logical,
     305              :     Migration,
     306              : }
     307              : 
     308              : impl DatabaseError {
     309            0 :     pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
     310            0 :         match self {
     311            0 :             Self::Query(_) => DatabaseErrorLabel::Query,
     312            0 :             Self::Connection(_) => DatabaseErrorLabel::Connection,
     313            0 :             Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
     314            0 :             Self::Logical(_) => DatabaseErrorLabel::Logical,
     315            0 :             Self::Migration(_) => DatabaseErrorLabel::Migration,
     316              :         }
     317            0 :     }
     318              : }
     319              : 
     320              : /// Update the leadership status metric gauges to reflect the requested status
     321            0 : pub(crate) fn update_leadership_status(status: LeadershipStatus) {
     322            0 :     let status_metric = &METRICS_REGISTRY
     323            0 :         .metrics_group
     324            0 :         .storage_controller_leadership_status;
     325              : 
     326            0 :     for s in LeadershipStatus::iter() {
     327            0 :         if s == status {
     328            0 :             status_metric.set(LeadershipStatusGroup { status: s }, 1);
     329            0 :         } else {
     330            0 :             status_metric.set(LeadershipStatusGroup { status: s }, 0);
     331            0 :         }
     332              :     }
     333            0 : }
        

Generated by: LCOV version 2.1-beta