LCOV - code coverage report
Current view: top level - storage_controller/src - metrics.rs (source / functions) Coverage Total Hit
Test: b4ae4c4857f9ef3e144e982a35ee23bc84c71983.info Lines: 26.7 % 75 20
Test Date: 2024-10-22 22:13:45 Functions: 71.9 % 32 23

            Line data    Source code
       1              : //!
       2              : //! This module provides metric definitions for the storage controller.
       3              : //!
       4              : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
       5              : //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
       6              : //! constant.
       7              : //!
       8              : //! The rest of the code defines label group types and deals with converting outer types to labels.
       9              : //!
      10              : use bytes::Bytes;
      11              : use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
      12              : use metrics::NeonMetrics;
      13              : use once_cell::sync::Lazy;
      14              : use std::sync::Mutex;
      15              : use strum::IntoEnumIterator;
      16              : 
      17              : use crate::{
      18              :     persistence::{DatabaseError, DatabaseOperation},
      19              :     service::LeadershipStatus,
      20              : };
      21              : 
      22              : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
      23              :     Lazy::new(StorageControllerMetrics::default);
      24              : 
      25            0 : pub fn preinitialize_metrics() {
      26            0 :     Lazy::force(&METRICS_REGISTRY);
      27            0 : }
      28              : 
/// Bundles the storage controller metric group with the text encoder used to serialise it.
pub(crate) struct StorageControllerMetrics {
    /// Every metric defined by this module.
    pub(crate) metrics_group: StorageControllerMetricGroup,
    /// Buffered text encoder; wrapped in a mutex because `encode` only has `&self` but needs
    /// mutable access to it.
    encoder: Mutex<measured::text::BufferedTextEncoder>,
}
      33              : 
      34            3 : #[derive(measured::MetricGroup)]
      35              : #[metric(new())]
      36              : pub(crate) struct StorageControllerMetricGroup {
      37              :     /// Count of how many times we spawn a reconcile task
      38              :     pub(crate) storage_controller_reconcile_spawn: measured::Counter,
      39              : 
      40              :     /// Reconciler tasks completed, broken down by success/failure/cancelled
      41              :     pub(crate) storage_controller_reconcile_complete:
      42              :         measured::CounterVec<ReconcileCompleteLabelGroupSet>,
      43              : 
      44              :     /// Count of how many times we make an optimization change to a tenant's scheduling
      45              :     pub(crate) storage_controller_schedule_optimization: measured::Counter,
      46              : 
      47              :     /// HTTP request status counters for handled requests
      48              :     pub(crate) storage_controller_http_request_status:
      49              :         measured::CounterVec<HttpRequestStatusLabelGroupSet>,
      50              : 
      51              :     /// HTTP request handler latency across all status codes
      52              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      53              :     pub(crate) storage_controller_http_request_latency:
      54              :         measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
      55              : 
      56              :     /// Count of HTTP requests to the pageserver that resulted in an error,
      57              :     /// broken down by the pageserver node id, request name and method
      58              :     pub(crate) storage_controller_pageserver_request_error:
      59              :         measured::CounterVec<PageserverRequestLabelGroupSet>,
      60              : 
      61              :     /// Latency of HTTP requests to the pageserver, broken down by pageserver
      62              :     /// node id, request name and method. This include both successful and unsuccessful
      63              :     /// requests.
      64              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      65              :     pub(crate) storage_controller_pageserver_request_latency:
      66              :         measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
      67              : 
      68              :     /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
      69              :     /// broken down by the pageserver node id, request name and method
      70              :     pub(crate) storage_controller_passthrough_request_error:
      71              :         measured::CounterVec<PageserverRequestLabelGroupSet>,
      72              : 
      73              :     /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
      74              :     /// node id, request name and method. This include both successful and unsuccessful
      75              :     /// requests.
      76              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      77              :     pub(crate) storage_controller_passthrough_request_latency:
      78              :         measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
      79              : 
      80              :     /// Count of errors in database queries, broken down by error type and operation.
      81              :     pub(crate) storage_controller_database_query_error:
      82              :         measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
      83              : 
      84              :     /// Latency of database queries, broken down by operation.
      85              :     #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
      86              :     pub(crate) storage_controller_database_query_latency:
      87              :         measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
      88              : 
      89              :     pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
      90              : 
      91              :     /// HTTP request status counters for handled requests
      92              :     pub(crate) storage_controller_reconcile_long_running:
      93              :         measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
      94              : }
      95              : 
      96              : impl StorageControllerMetrics {
      97            0 :     pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
      98            0 :         let mut encoder = self.encoder.lock().unwrap();
      99            0 :         neon_metrics
     100            0 :             .collect_group_into(&mut *encoder)
     101            0 :             .unwrap_or_else(|infallible| match infallible {});
     102            0 :         self.metrics_group
     103            0 :             .collect_group_into(&mut *encoder)
     104            0 :             .unwrap_or_else(|infallible| match infallible {});
     105            0 :         encoder.finish()
     106            0 :     }
     107              : }
     108              : 
     109              : impl Default for StorageControllerMetrics {
     110            3 :     fn default() -> Self {
     111            3 :         let mut metrics_group = StorageControllerMetricGroup::new();
     112            3 :         metrics_group
     113            3 :             .storage_controller_reconcile_complete
     114            3 :             .init_all_dense();
     115            3 : 
     116            3 :         Self {
     117            3 :             metrics_group,
     118            3 :             encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
     119            3 :         }
     120            3 :     }
     121              : }
     122              : 
/// Labels for `storage_controller_reconcile_complete`.
#[derive(measured::LabelGroup)]
#[label(set = ReconcileCompleteLabelGroupSet)]
pub(crate) struct ReconcileCompleteLabelGroup {
    /// How the reconcile task finished.
    pub(crate) status: ReconcileOutcome,
}
     128              : 
/// Labels for `storage_controller_http_request_status`.
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
    /// Request path; a dynamic label backed by a `lasso::ThreadedRodeo` per the attribute below.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
    /// HTTP status code attached to the request.
    pub(crate) status: StatusCode,
}
     137              : 
/// Labels for `storage_controller_http_request_latency`.
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
    /// Request path; a dynamic label backed by a `lasso::ThreadedRodeo` per the attribute below.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
}
     145              : 
/// Labels shared by the pageserver and pass-through request error counters and latency
/// histograms.
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
    /// Pageserver node id, passed as a string; dynamic label backed by a
    /// `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) pageserver_id: &'a str,
    /// Request path; dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
}
     155              : 
/// Labels for `storage_controller_database_query_error`.
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
    /// Category of the database error (see `DatabaseError::error_label`).
    pub(crate) error_type: DatabaseErrorLabel,
    /// Database operation that failed.
    pub(crate) operation: DatabaseOperation,
}
     162              : 
/// Labels for `storage_controller_database_query_latency`.
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryLatencyLabelGroupSet)]
pub(crate) struct DatabaseQueryLatencyLabelGroup {
    /// Database operation being timed.
    pub(crate) operation: DatabaseOperation,
}
     168              : 
/// Labels for `storage_controller_leadership_status`.
#[derive(measured::LabelGroup)]
#[label(set = LeadershipStatusGroupSet)]
pub(crate) struct LeadershipStatusGroup {
    /// Leadership status this gauge series stands for.
    pub(crate) status: LeadershipStatus,
}
     174              : 
/// Labels for `storage_controller_reconcile_long_running`.
///
/// All three labels are dynamic string labels backed by a `lasso::ThreadedRodeo`.
#[derive(measured::LabelGroup, Clone)]
#[label(set = ReconcileLongRunningLabelGroupSet)]
pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
    /// Tenant id, passed as a string.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) tenant_id: &'a str,
    /// Shard number, passed as a string.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) shard_number: &'a str,
    /// Sequence, passed as a string. NOTE(review): presumably the reconcile sequence
    /// number; confirm against the caller.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) sequence: &'a str,
}
     185              : 
/// Outcome of a reconcile task, used as the `status` label of
/// `storage_controller_reconcile_complete`.
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum ReconcileOutcome {
    /// Reconcile succeeded; the label value is renamed to `"ok"`.
    #[label(rename = "ok")]
    Success,
    /// Reconcile failed.
    Error,
    /// Reconcile was cancelled.
    Cancel,
}
     193              : 
/// HTTP method label with a fixed set of values.
///
/// Converted from `hyper::Method` via the `From` implementation in this module; any method
/// that is not GET, PUT, POST or DELETE maps to [`Method::Other`].
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub(crate) enum Method {
    Get,
    Put,
    Post,
    Delete,
    /// Any method not listed above.
    Other,
}
     202              : 
     203              : impl From<hyper::Method> for Method {
     204            0 :     fn from(value: hyper::Method) -> Self {
     205            0 :         if value == hyper::Method::GET {
     206            0 :             Method::Get
     207            0 :         } else if value == hyper::Method::PUT {
     208            0 :             Method::Put
     209            0 :         } else if value == hyper::Method::POST {
     210            0 :             Method::Post
     211            0 :         } else if value == hyper::Method::DELETE {
     212            0 :             Method::Delete
     213              :         } else {
     214            0 :             Method::Other
     215              :         }
     216            0 :     }
     217              : }
     218              : 
/// Newtype around `hyper::http::StatusCode` so that the label traits of `measured`
/// (`LabelValue`, `FixedCardinalityLabel`) can be implemented for it in this module.
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
     221              : 
     222              : impl LabelValue for StatusCode {
     223            0 :     fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
     224            0 :         v.write_int(self.0.as_u16() as i64)
     225            0 :     }
     226              : }
     227              : 
     228              : impl FixedCardinalityLabel for StatusCode {
     229            0 :     fn cardinality() -> usize {
     230            0 :         (100..1000).len()
     231            0 :     }
     232              : 
     233            0 :     fn encode(&self) -> usize {
     234            0 :         self.0.as_u16() as usize
     235            0 :     }
     236              : 
     237            0 :     fn decode(value: usize) -> Self {
     238            0 :         Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
     239            0 :     }
     240              : }
     241              : 
/// Fixed-cardinality label naming the kind of a `DatabaseError`.
///
/// Each variant corresponds to the `DatabaseError` variant of the same name; the mapping
/// lives in `DatabaseError::error_label`.
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum DatabaseErrorLabel {
    Query,
    Connection,
    ConnectionPool,
    Logical,
    Migration,
}
     250              : 
     251              : impl DatabaseError {
     252            0 :     pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
     253            0 :         match self {
     254            0 :             Self::Query(_) => DatabaseErrorLabel::Query,
     255            0 :             Self::Connection(_) => DatabaseErrorLabel::Connection,
     256            0 :             Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
     257            0 :             Self::Logical(_) => DatabaseErrorLabel::Logical,
     258            0 :             Self::Migration(_) => DatabaseErrorLabel::Migration,
     259              :         }
     260            0 :     }
     261              : }
     262              : 
     263              : /// Update the leadership status metric gauges to reflect the requested status
     264            0 : pub(crate) fn update_leadership_status(status: LeadershipStatus) {
     265            0 :     let status_metric = &METRICS_REGISTRY
     266            0 :         .metrics_group
     267            0 :         .storage_controller_leadership_status;
     268              : 
     269            0 :     for s in LeadershipStatus::iter() {
     270            0 :         if s == status {
     271            0 :             status_metric.set(LeadershipStatusGroup { status: s }, 1);
     272            0 :         } else {
     273            0 :             status_metric.set(LeadershipStatusGroup { status: s }, 0);
     274            0 :         }
     275              :     }
     276            0 : }
        

Generated by: LCOV version 2.1-beta