Line data Source code
1 : use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge};
2 : use metrics::proto::MetricFamily;
3 : use metrics::{
4 : IntCounter, IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter,
5 : register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec,
6 : };
7 : use once_cell::sync::Lazy;
8 :
9 0 : pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
10 0 : register_uint_gauge_vec!(
11 : "compute_installed_extensions",
12 : "Number of databases where the version of extension is installed",
13 0 : &["extension_name", "version", "owned_by_superuser"]
14 : )
15 0 : .expect("failed to define a metric")
16 0 : });
17 :
/// Control plane API call identifier.
///
/// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH,
/// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
/// And it's fair to call it an 'RPC' (Remote Procedure Call).
pub enum CPlaneRequestRPC {
    /// The `GetConfig` call.
    GetConfig,
}

impl CPlaneRequestRPC {
    /// Returns the name of this RPC as a string literal.
    ///
    /// The returned string is `'static`, so it does not borrow from `self` and
    /// may outlive the enum value it was obtained from.
    // NOTE(review): presumably this is the value used for the `rpc` metric label;
    // confirm against callers.
    pub fn as_str(&self) -> &'static str {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self {
            Self::GetConfig => "GetConfig",
        }
    }
}
32 :
/// The literal string `"unknown"`.
// NOTE(review): presumably the `http_status` label value used when no HTTP
// status is available; confirm against callers.
pub const UNKNOWN_HTTP_STATUS: &str = "unknown";
34 :
35 0 : pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
36 0 : register_int_counter_vec!(
37 : "compute_ctl_cplane_requests_total",
38 : "Total number of control plane requests made by compute_ctl by status",
39 0 : &["rpc", "http_status"]
40 : )
41 0 : .expect("failed to define a metric")
42 0 : });
43 :
44 : /// Total number of failed database migrations. Per-compute, this is actually a boolean metric,
45 : /// either empty or with a single value (1, migration_id) because we stop at the first failure.
46 : /// Yet, the sum over the fleet will provide the total number of failures.
47 0 : pub(crate) static DB_MIGRATION_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
48 0 : register_int_counter_vec!(
49 : "compute_ctl_db_migration_failed_total",
50 : "Total number of failed database migrations",
51 0 : &["migration_id"]
52 : )
53 0 : .expect("failed to define a metric")
54 0 : });
55 :
56 0 : pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
57 0 : register_int_counter_vec!(
58 : "compute_ctl_remote_ext_requests_total",
59 : "Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
60 0 : &["http_status", "filename"]
61 : )
62 0 : .expect("failed to define a metric")
63 0 : });
64 :
65 : // Size of audit log directory in bytes
66 0 : pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
67 0 : register_gauge!(
68 : "compute_audit_log_dir_size",
69 : "Size of audit log directory in bytes",
70 : )
71 0 : .expect("failed to define a metric")
72 0 : });
73 :
74 : // Report that `compute_ctl` is up and what's the current compute status.
75 0 : pub(crate) static COMPUTE_CTL_UP: Lazy<IntGaugeVec> = Lazy::new(|| {
76 0 : register_int_gauge_vec!(
77 : "compute_ctl_up",
78 : "Whether compute_ctl is running",
79 0 : &["build_tag", "status"]
80 : )
81 0 : .expect("failed to define a metric")
82 0 : });
83 :
84 0 : pub(crate) static PG_CURR_DOWNTIME_MS: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
85 0 : register_gauge!(
86 : "compute_pg_current_downtime_ms",
87 : "Non-cumulative duration of Postgres downtime in ms; resets after successful check",
88 : )
89 0 : .expect("failed to define a metric")
90 0 : });
91 :
92 0 : pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy<GenericCounter<AtomicU64>> = Lazy::new(|| {
93 0 : register_int_counter!(
94 : "compute_pg_downtime_ms_total",
95 : "Cumulative duration of Postgres downtime in ms",
96 : )
97 0 : .expect("failed to define a metric")
98 0 : });
99 :
100 0 : pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
101 0 : register_int_counter!(
102 : "compute_ctl_lfc_prewarms_total",
103 : "Total number of LFC prewarms requested by compute_ctl or autoprewarm option",
104 : )
105 0 : .expect("failed to define a metric")
106 0 : });
107 :
108 0 : pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
109 0 : register_int_counter!(
110 : "compute_ctl_lfc_prewarm_errors_total",
111 : "Total number of LFC prewarm errors",
112 : )
113 0 : .expect("failed to define a metric")
114 0 : });
115 :
116 0 : pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
117 0 : register_int_counter!(
118 : "compute_ctl_lfc_offloads_total",
119 : "Total number of LFC offloads requested by compute_ctl or lfc_offload_period_seconds option",
120 : )
121 0 : .expect("failed to define a metric")
122 0 : });
123 :
124 0 : pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
125 0 : register_int_counter!(
126 : "compute_ctl_lfc_offload_errors_total",
127 : "Total number of LFC offload errors",
128 : )
129 0 : .expect("failed to define a metric")
130 0 : });
131 :
132 0 : pub fn collect() -> Vec<MetricFamily> {
133 0 : let mut metrics = COMPUTE_CTL_UP.collect();
134 0 : metrics.extend(INSTALLED_EXTENSIONS.collect());
135 0 : metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
136 0 : metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
137 0 : metrics.extend(DB_MIGRATION_FAILED.collect());
138 0 : metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
139 0 : metrics.extend(PG_CURR_DOWNTIME_MS.collect());
140 0 : metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
141 0 : metrics.extend(LFC_PREWARMS.collect());
142 0 : metrics.extend(LFC_PREWARM_ERRORS.collect());
143 0 : metrics.extend(LFC_OFFLOADS.collect());
144 0 : metrics.extend(LFC_OFFLOAD_ERRORS.collect());
145 0 : metrics
146 0 : }
|