LCOV - code coverage report
Current view: top level - compute_tools/src - hadron_metrics.rs (source / functions) Coverage Total Hit
Test: c8f8d331b83562868d9054d9e0e68f866772aeaa.info Lines: 0.0 % 30 0
Test Date: 2025-07-26 17:20:05 Functions: 0.0 % 5 0

            Line data    Source code
       1              : use metrics::{
       2              :     IntCounter, IntGaugeVec, core::Collector, proto::MetricFamily, register_int_counter,
       3              :     register_int_gauge_vec,
       4              : };
       5              : use once_cell::sync::Lazy;
       6              : 
       7              : // Counter keeping track of the number of PageStream request errors reported by Postgres.
       8              : // An error is registered every time Postgres calls compute_ctl's /refresh_configuration API.
       9              : // Postgres will invoke this API if it detects trouble with PageStream requests (get_page@lsn,
      10              : // get_base_backup, etc.) it sends to any pageserver. An increase in this counter value typically
      11              : // indicates Postgres downtime, as PageStream requests are critical for Postgres to function.
      12            0 : pub static POSTGRES_PAGESTREAM_REQUEST_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
      13            0 :     register_int_counter!(
      14              :         "pg_cctl_pagestream_request_errors_total",
      15              :         "Number of PageStream request errors reported by the postgres process"
      16              :     )
      17            0 :     .expect("failed to define a metric") // Panics when the Lazy is first initialized if registration returns an error.
      18            0 : });
      19              : 
      20              : // Counter keeping track of the number of compute configuration errors due to Postgres statement
      21              : // timeouts. An error is registered every time `ComputeNode::reconfigure()` fails due to Postgres
      22              : // error code 57014 (query cancelled). This statement timeout typically occurs when Postgres is
      23              : // stuck in a problematic retry loop while the PS is rejecting its connection requests (usually due
      24              : // to PG pointing at the wrong PS). We should investigate the root cause when this counter value
      25              : // increases by checking PG and PS logs.
      26            0 : pub static COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
      27            0 :     register_int_counter!(
      28              :         "pg_cctl_configure_statement_timeout_errors_total",
      29              :         "Number of compute configuration errors due to Postgres statement timeouts."
      30              :     )
      31            0 :     .expect("failed to define a metric") // Panics when the Lazy is first initialized if registration returns an error.
      32            0 : });
      33              : 
      34            0 : pub static COMPUTE_ATTACHED: Lazy<IntGaugeVec> = Lazy::new(|| { // Gauge vector reporting the compute node's attached status; built lazily on first access.
      35            0 :     register_int_gauge_vec!(
      36              :         "pg_cctl_attached",
      37              :         "Compute node attached status (1 if attached)",
      38            0 :         &[ // Label names attached to every sample of this gauge vector.
      39            0 :             "pg_compute_id",
      40            0 :             "pg_instance_id",
      41            0 :             "tenant_id",
      42            0 :             "timeline_id"
      43            0 :         ]
      44              :     )
      45            0 :     .expect("failed to define a metric") // Panics when the Lazy is first initialized if registration returns an error.
      46            0 : });
      47              : 
      48            0 : pub fn collect() -> Vec<MetricFamily> { // Gathers the metric families of every metric defined in this file into one Vec.
      49            0 :     let mut metrics = Vec::new();
      50            0 :     metrics.extend(POSTGRES_PAGESTREAM_REQUEST_ERRORS.collect()); // Output order follows declaration order: the two counters first...
      51            0 :     metrics.extend(COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.collect());
      52            0 :     metrics.extend(COMPUTE_ATTACHED.collect()); // ...then the attached-status gauge vector.
      53            0 :     metrics
      54            0 : }
      55              : 
      56            0 : pub fn initialize_metrics() { // Eagerly runs the initializer of every Lazy metric in this file, so each registration happens here rather than on first use.
      57            0 :     Lazy::force(&POSTGRES_PAGESTREAM_REQUEST_ERRORS); // NOTE(review): presumably so the counters are exported at 0 before any error occurs - confirm.
      58            0 :     Lazy::force(&COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS);
      59            0 :     Lazy::force(&COMPUTE_ATTACHED);
      60            0 : }
        

Generated by: LCOV version 2.1-beta