LCOV - code coverage report
Current view: top level - storage_controller/src - metrics.rs (source / functions) Coverage Total Hit
Test: 7eb96e224e685167ad85f58f858387d8cf253f63.info Lines: 25.7 % 74 19
Test Date: 2024-09-23 21:23:07 Functions: 70.0 % 30 21

            Line data    Source code
       1              : //!
       2              : //! This module provides metric definitions for the storage controller.
       3              : //!
       4              : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
       5              : //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
       6              : //! static.
       7              : //!
       8              : //! The rest of the code defines label group types and deals with converting outer types to labels.
       9              : //!
      10              : use bytes::Bytes;
      11              : use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
      12              : use metrics::NeonMetrics;
      13              : use once_cell::sync::Lazy;
      14              : use std::sync::Mutex;
      15              : use strum::IntoEnumIterator;
      16              : 
      17              : use crate::{
      18              :     persistence::{DatabaseError, DatabaseOperation},
      19              :     service::LeadershipStatus,
      20              : };
      21              : 
      22              : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
      23              :     Lazy::new(StorageControllerMetrics::default);
      24              : 
      25            0 : pub fn preinitialize_metrics() {
      26            0 :     Lazy::force(&METRICS_REGISTRY);
      27            0 : }
      28              : 
      29              : pub(crate) struct StorageControllerMetrics {
      30              :     pub(crate) metrics_group: StorageControllerMetricGroup,
      31              :     encoder: Mutex<measured::text::BufferedTextEncoder>,
      32              : }
      33              : 
      34            3 : #[derive(measured::MetricGroup)]
      35              : #[metric(new())]
      36              : pub(crate) struct StorageControllerMetricGroup {
      37              :     /// Count of how many times we spawn a reconcile task
      38              :     pub(crate) storage_controller_reconcile_spawn: measured::Counter,
      39              : 
      40              :     /// Reconciler tasks completed, broken down by success/failure/cancelled
      41              :     pub(crate) storage_controller_reconcile_complete:
      42              :         measured::CounterVec<ReconcileCompleteLabelGroupSet>,
      43              : 
      44              :     /// Count of how many times we make an optimization change to a tenant's scheduling
      45              :     pub(crate) storage_controller_schedule_optimization: measured::Counter,
      46              : 
      47              :     /// HTTP request status counters for handled requests
      48              :     pub(crate) storage_controller_http_request_status:
      49              :         measured::CounterVec<HttpRequestStatusLabelGroupSet>,
      50              : 
      51              :     /// HTTP request handler latency across all status codes
      52              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      53              :     pub(crate) storage_controller_http_request_latency:
      54              :         measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
      55              : 
      56              :     /// Count of HTTP requests to the pageserver that resulted in an error,
      57              :     /// broken down by the pageserver node id, request name and method
      58              :     pub(crate) storage_controller_pageserver_request_error:
      59              :         measured::CounterVec<PageserverRequestLabelGroupSet>,
      60              : 
      61              :     /// Latency of HTTP requests to the pageserver, broken down by pageserver
      62              :     /// node id, request name and method. This includes both successful and unsuccessful
      63              :     /// requests.
      64              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      65              :     pub(crate) storage_controller_pageserver_request_latency:
      66              :         measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
      67              : 
      68              :     /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
      69              :     /// broken down by the pageserver node id, request name and method
      70              :     pub(crate) storage_controller_passthrough_request_error:
      71              :         measured::CounterVec<PageserverRequestLabelGroupSet>,
      72              : 
      73              :     /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
      74              :     /// node id, request name and method. This includes both successful and unsuccessful
      75              :     /// requests.
      76              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      77              :     pub(crate) storage_controller_passthrough_request_latency:
      78              :         measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
      79              : 
      80              :     /// Count of errors in database queries, broken down by error type and operation.
      81              :     pub(crate) storage_controller_database_query_error:
      82              :         measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
      83              : 
      84              :     /// Latency of database queries, broken down by operation.
      85              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      86              :     pub(crate) storage_controller_database_query_latency:
      87              :         measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
      88              : 
      89              :     pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
      90              : }
      91              : 
      92              : impl StorageControllerMetrics {
      93            0 :     pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
      94            0 :         let mut encoder = self.encoder.lock().unwrap();
      95            0 :         neon_metrics
      96            0 :             .collect_group_into(&mut *encoder)
      97            0 :             .unwrap_or_else(|infallible| match infallible {});
      98            0 :         self.metrics_group
      99            0 :             .collect_group_into(&mut *encoder)
     100            0 :             .unwrap_or_else(|infallible| match infallible {});
     101            0 :         encoder.finish()
     102            0 :     }
     103              : }
     104              : 
     105              : impl Default for StorageControllerMetrics {
     106            3 :     fn default() -> Self {
     107            3 :         let mut metrics_group = StorageControllerMetricGroup::new();
     108            3 :         metrics_group
     109            3 :             .storage_controller_reconcile_complete
     110            3 :             .init_all_dense();
     111            3 : 
     112            3 :         Self {
     113            3 :             metrics_group,
     114            3 :             encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
     115            3 :         }
     116            3 :     }
     117              : }
     118              : 
     119            9 : #[derive(measured::LabelGroup)]
     120              : #[label(set = ReconcileCompleteLabelGroupSet)]
     121              : pub(crate) struct ReconcileCompleteLabelGroup {
     122              :     pub(crate) status: ReconcileOutcome,
     123              : }
     124              : 
     125            6 : #[derive(measured::LabelGroup)]
     126              : #[label(set = HttpRequestStatusLabelGroupSet)]
     127              : pub(crate) struct HttpRequestStatusLabelGroup<'a> {
     128              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     129              :     pub(crate) path: &'a str,
     130              :     pub(crate) method: Method,
     131              :     pub(crate) status: StatusCode,
     132              : }
     133              : 
     134            6 : #[derive(measured::LabelGroup)]
     135              : #[label(set = HttpRequestLatencyLabelGroupSet)]
     136              : pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
     137              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     138              :     pub(crate) path: &'a str,
     139              :     pub(crate) method: Method,
     140              : }
     141              : 
     142           24 : #[derive(measured::LabelGroup, Clone)]
     143              : #[label(set = PageserverRequestLabelGroupSet)]
     144              : pub(crate) struct PageserverRequestLabelGroup<'a> {
     145              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     146              :     pub(crate) pageserver_id: &'a str,
     147              :     #[label(dynamic_with = lasso::ThreadedRodeo, default)]
     148              :     pub(crate) path: &'a str,
     149              :     pub(crate) method: Method,
     150              : }
     151              : 
     152           12 : #[derive(measured::LabelGroup)]
     153              : #[label(set = DatabaseQueryErrorLabelGroupSet)]
     154              : pub(crate) struct DatabaseQueryErrorLabelGroup {
     155              :     pub(crate) error_type: DatabaseErrorLabel,
     156              :     pub(crate) operation: DatabaseOperation,
     157              : }
     158              : 
     159            9 : #[derive(measured::LabelGroup)]
     160              : #[label(set = DatabaseQueryLatencyLabelGroupSet)]
     161              : pub(crate) struct DatabaseQueryLatencyLabelGroup {
     162              :     pub(crate) operation: DatabaseOperation,
     163              : }
     164              : 
     165            9 : #[derive(measured::LabelGroup)]
     166              : #[label(set = LeadershipStatusGroupSet)]
     167              : pub(crate) struct LeadershipStatusGroup {
     168              :     pub(crate) status: LeadershipStatus,
     169              : }
     170              : 
     171              : #[derive(FixedCardinalityLabel, Clone, Copy)]
     172              : pub(crate) enum ReconcileOutcome {
     173              :     #[label(rename = "ok")]
     174              :     Success,
     175              :     Error,
     176              :     Cancel,
     177              : }
     178              : 
     179              : #[derive(FixedCardinalityLabel, Copy, Clone)]
     180              : pub(crate) enum Method {
     181              :     Get,
     182              :     Put,
     183              :     Post,
     184              :     Delete,
     185              :     Other,
     186              : }
     187              : 
     188              : impl From<hyper::Method> for Method {
     189            0 :     fn from(value: hyper::Method) -> Self {
     190            0 :         if value == hyper::Method::GET {
     191            0 :             Method::Get
     192            0 :         } else if value == hyper::Method::PUT {
     193            0 :             Method::Put
     194            0 :         } else if value == hyper::Method::POST {
     195            0 :             Method::Post
     196            0 :         } else if value == hyper::Method::DELETE {
     197            0 :             Method::Delete
     198              :         } else {
     199            0 :             Method::Other
     200              :         }
     201            0 :     }
     202              : }
     203              : 
     204              : #[derive(Clone, Copy)]
     205              : pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
     206              : 
     207              : impl LabelValue for StatusCode {
     208            0 :     fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
     209            0 :         v.write_int(self.0.as_u16() as i64)
     210            0 :     }
     211              : }
     212              : 
     213              : impl FixedCardinalityLabel for StatusCode {
     214            0 :     fn cardinality() -> usize {
     215            0 :         (100..1000).len()
     216            0 :     }
     217              : 
     218            0 :     fn encode(&self) -> usize {
     219            0 :         self.0.as_u16() as usize
     220            0 :     }
     221              : 
     222            0 :     fn decode(value: usize) -> Self {
     223            0 :         Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
     224            0 :     }
     225              : }
     226              : 
     227              : #[derive(FixedCardinalityLabel, Clone, Copy)]
     228              : pub(crate) enum DatabaseErrorLabel {
     229              :     Query,
     230              :     Connection,
     231              :     ConnectionPool,
     232              :     Logical,
     233              :     Migration,
     234              : }
     235              : 
     236              : impl DatabaseError {
     237            0 :     pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
     238            0 :         match self {
     239            0 :             Self::Query(_) => DatabaseErrorLabel::Query,
     240            0 :             Self::Connection(_) => DatabaseErrorLabel::Connection,
     241            0 :             Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
     242            0 :             Self::Logical(_) => DatabaseErrorLabel::Logical,
     243            0 :             Self::Migration(_) => DatabaseErrorLabel::Migration,
     244              :         }
     245            0 :     }
     246              : }
     247              : 
     248              : /// Update the leadership status metric gauges to reflect the requested status
     249            0 : pub(crate) fn update_leadership_status(status: LeadershipStatus) {
     250            0 :     let status_metric = &METRICS_REGISTRY
     251            0 :         .metrics_group
     252            0 :         .storage_controller_leadership_status;
     253              : 
     254            0 :     for s in LeadershipStatus::iter() {
     255            0 :         if s == status {
     256            0 :             status_metric.set(LeadershipStatusGroup { status: s }, 1);
     257            0 :         } else {
     258            0 :             status_metric.set(LeadershipStatusGroup { status: s }, 0);
     259            0 :         }
     260              :     }
     261            0 : }
        

Generated by: LCOV version 2.1-beta