Line data Source code
1 : use metrics::{
2 : IntCounter, IntGaugeVec, core::Collector, proto::MetricFamily, register_int_counter,
3 : register_int_gauge_vec,
4 : };
5 : use once_cell::sync::Lazy;
6 :
7 : // Counter keeping track of the number of PageStream request errors reported by Postgres.
8 : // An error is registered every time Postgres calls compute_ctl's /refresh_configuration API.
9 : // Postgres will invoke this API if it detected trouble with PageStream requests (get_page@lsn,
10 : // get_base_backup, etc.) it sends to any pageserver. An increase in this counter value typically
11 : // indicates Postgres downtime, as PageStream requests are critical for Postgres to function.
12 0 : pub static POSTGRES_PAGESTREAM_REQUEST_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
13 0 : register_int_counter!(
14 : "pg_cctl_pagestream_request_errors_total",
15 : "Number of PageStream request errors reported by the postgres process"
16 : )
17 0 : .expect("failed to define a metric")
18 0 : });
19 :
20 : // Counter keeping track of the number of compute configuration errors due to Postgres statement
21 : // timeouts. An error is registered every time `ComputeNode::reconfigure()` fails due to Postgres
22 : // error code 57014 (query cancelled). This statement timeout typically occurs when postgres is
23 : // stuck in a problematic retry loop when the PS is reject its connection requests (usually due
24 : // to PG pointing at the wrong PS). We should investigate the root cause when this counter value
25 : // increases by checking PG and PS logs.
26 0 : pub static COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
27 0 : register_int_counter!(
28 : "pg_cctl_configure_statement_timeout_errors_total",
29 : "Number of compute configuration errors due to Postgres statement timeouts."
30 : )
31 0 : .expect("failed to define a metric")
32 0 : });
33 :
34 0 : pub static COMPUTE_ATTACHED: Lazy<IntGaugeVec> = Lazy::new(|| {
35 0 : register_int_gauge_vec!(
36 : "pg_cctl_attached",
37 : "Compute node attached status (1 if attached)",
38 0 : &[
39 0 : "pg_compute_id",
40 0 : "pg_instance_id",
41 0 : "tenant_id",
42 0 : "timeline_id"
43 0 : ]
44 : )
45 0 : .expect("failed to define a metric")
46 0 : });
47 :
48 0 : pub fn collect() -> Vec<MetricFamily> {
49 0 : let mut metrics = Vec::new();
50 0 : metrics.extend(POSTGRES_PAGESTREAM_REQUEST_ERRORS.collect());
51 0 : metrics.extend(COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.collect());
52 0 : metrics.extend(COMPUTE_ATTACHED.collect());
53 0 : metrics
54 0 : }
55 :
56 0 : pub fn initialize_metrics() {
57 0 : Lazy::force(&POSTGRES_PAGESTREAM_REQUEST_ERRORS);
58 0 : Lazy::force(&COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS);
59 0 : Lazy::force(&COMPUTE_ATTACHED);
60 0 : }
|