Line data Source code
1 : //!
2 : //! This module provides metric definitions for the storage controller.
3 : //!
4 : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
//! static.
7 : //!
8 : //! The rest of the code defines label group types and deals with converting outer types to labels.
9 : //!
10 : use bytes::Bytes;
11 : use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
12 : use metrics::NeonMetrics;
13 : use once_cell::sync::Lazy;
14 : use std::sync::Mutex;
15 : use strum::IntoEnumIterator;
16 :
17 : use crate::{
18 : persistence::{DatabaseError, DatabaseOperation},
19 : service::LeadershipStatus,
20 : };
21 :
22 : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
23 : Lazy::new(StorageControllerMetrics::default);
24 :
25 0 : pub fn preinitialize_metrics() {
26 0 : Lazy::force(&METRICS_REGISTRY);
27 0 : }
28 :
29 : pub(crate) struct StorageControllerMetrics {
30 : pub(crate) metrics_group: StorageControllerMetricGroup,
31 : encoder: Mutex<measured::text::BufferedTextEncoder>,
32 : }
33 :
34 3 : #[derive(measured::MetricGroup)]
35 : #[metric(new())]
36 : pub(crate) struct StorageControllerMetricGroup {
37 : /// Count of how many times we spawn a reconcile task
38 : pub(crate) storage_controller_reconcile_spawn: measured::Counter,
39 :
40 : /// Reconciler tasks completed, broken down by success/failure/cancelled
41 : pub(crate) storage_controller_reconcile_complete:
42 : measured::CounterVec<ReconcileCompleteLabelGroupSet>,
43 :
44 : /// Count of how many times we make an optimization change to a tenant's scheduling
45 : pub(crate) storage_controller_schedule_optimization: measured::Counter,
46 :
47 : /// HTTP request status counters for handled requests
48 : pub(crate) storage_controller_http_request_status:
49 : measured::CounterVec<HttpRequestStatusLabelGroupSet>,
50 :
51 : /// HTTP request handler latency across all status codes
52 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
53 : pub(crate) storage_controller_http_request_latency:
54 : measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
55 :
56 : /// Count of HTTP requests to the pageserver that resulted in an error,
57 : /// broken down by the pageserver node id, request name and method
58 : pub(crate) storage_controller_pageserver_request_error:
59 : measured::CounterVec<PageserverRequestLabelGroupSet>,
60 :
61 : /// Latency of HTTP requests to the pageserver, broken down by pageserver
62 : /// node id, request name and method. This include both successful and unsuccessful
63 : /// requests.
64 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
65 : pub(crate) storage_controller_pageserver_request_latency:
66 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
67 :
68 : /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
69 : /// broken down by the pageserver node id, request name and method
70 : pub(crate) storage_controller_passthrough_request_error:
71 : measured::CounterVec<PageserverRequestLabelGroupSet>,
72 :
73 : /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
74 : /// node id, request name and method. This include both successful and unsuccessful
75 : /// requests.
76 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
77 : pub(crate) storage_controller_passthrough_request_latency:
78 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
79 :
80 : /// Count of errors in database queries, broken down by error type and operation.
81 : pub(crate) storage_controller_database_query_error:
82 : measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
83 :
84 : /// Latency of database queries, broken down by operation.
85 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
86 : pub(crate) storage_controller_database_query_latency:
87 : measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
88 :
89 : pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
90 : }
91 :
92 : impl StorageControllerMetrics {
93 0 : pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
94 0 : let mut encoder = self.encoder.lock().unwrap();
95 0 : neon_metrics
96 0 : .collect_group_into(&mut *encoder)
97 0 : .unwrap_or_else(|infallible| match infallible {});
98 0 : self.metrics_group
99 0 : .collect_group_into(&mut *encoder)
100 0 : .unwrap_or_else(|infallible| match infallible {});
101 0 : encoder.finish()
102 0 : }
103 : }
104 :
105 : impl Default for StorageControllerMetrics {
106 3 : fn default() -> Self {
107 3 : let mut metrics_group = StorageControllerMetricGroup::new();
108 3 : metrics_group
109 3 : .storage_controller_reconcile_complete
110 3 : .init_all_dense();
111 3 :
112 3 : Self {
113 3 : metrics_group,
114 3 : encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
115 3 : }
116 3 : }
117 : }
118 :
119 9 : #[derive(measured::LabelGroup)]
120 : #[label(set = ReconcileCompleteLabelGroupSet)]
121 : pub(crate) struct ReconcileCompleteLabelGroup {
122 : pub(crate) status: ReconcileOutcome,
123 : }
124 :
125 6 : #[derive(measured::LabelGroup)]
126 : #[label(set = HttpRequestStatusLabelGroupSet)]
127 : pub(crate) struct HttpRequestStatusLabelGroup<'a> {
128 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
129 : pub(crate) path: &'a str,
130 : pub(crate) method: Method,
131 : pub(crate) status: StatusCode,
132 : }
133 :
134 6 : #[derive(measured::LabelGroup)]
135 : #[label(set = HttpRequestLatencyLabelGroupSet)]
136 : pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
137 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
138 : pub(crate) path: &'a str,
139 : pub(crate) method: Method,
140 : }
141 :
142 24 : #[derive(measured::LabelGroup, Clone)]
143 : #[label(set = PageserverRequestLabelGroupSet)]
144 : pub(crate) struct PageserverRequestLabelGroup<'a> {
145 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
146 : pub(crate) pageserver_id: &'a str,
147 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
148 : pub(crate) path: &'a str,
149 : pub(crate) method: Method,
150 : }
151 :
152 12 : #[derive(measured::LabelGroup)]
153 : #[label(set = DatabaseQueryErrorLabelGroupSet)]
154 : pub(crate) struct DatabaseQueryErrorLabelGroup {
155 : pub(crate) error_type: DatabaseErrorLabel,
156 : pub(crate) operation: DatabaseOperation,
157 : }
158 :
159 9 : #[derive(measured::LabelGroup)]
160 : #[label(set = DatabaseQueryLatencyLabelGroupSet)]
161 : pub(crate) struct DatabaseQueryLatencyLabelGroup {
162 : pub(crate) operation: DatabaseOperation,
163 : }
164 :
165 9 : #[derive(measured::LabelGroup)]
166 : #[label(set = LeadershipStatusGroupSet)]
167 : pub(crate) struct LeadershipStatusGroup {
168 : pub(crate) status: LeadershipStatus,
169 : }
170 :
171 : #[derive(FixedCardinalityLabel, Clone, Copy)]
172 : pub(crate) enum ReconcileOutcome {
173 : #[label(rename = "ok")]
174 : Success,
175 : Error,
176 : Cancel,
177 : }
178 :
179 : #[derive(FixedCardinalityLabel, Copy, Clone)]
180 : pub(crate) enum Method {
181 : Get,
182 : Put,
183 : Post,
184 : Delete,
185 : Other,
186 : }
187 :
188 : impl From<hyper::Method> for Method {
189 0 : fn from(value: hyper::Method) -> Self {
190 0 : if value == hyper::Method::GET {
191 0 : Method::Get
192 0 : } else if value == hyper::Method::PUT {
193 0 : Method::Put
194 0 : } else if value == hyper::Method::POST {
195 0 : Method::Post
196 0 : } else if value == hyper::Method::DELETE {
197 0 : Method::Delete
198 : } else {
199 0 : Method::Other
200 : }
201 0 : }
202 : }
203 :
204 : #[derive(Clone, Copy)]
205 : pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
206 :
207 : impl LabelValue for StatusCode {
208 0 : fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
209 0 : v.write_int(self.0.as_u16() as i64)
210 0 : }
211 : }
212 :
213 : impl FixedCardinalityLabel for StatusCode {
214 0 : fn cardinality() -> usize {
215 0 : (100..1000).len()
216 0 : }
217 :
218 0 : fn encode(&self) -> usize {
219 0 : self.0.as_u16() as usize
220 0 : }
221 :
222 0 : fn decode(value: usize) -> Self {
223 0 : Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
224 0 : }
225 : }
226 :
227 : #[derive(FixedCardinalityLabel, Clone, Copy)]
228 : pub(crate) enum DatabaseErrorLabel {
229 : Query,
230 : Connection,
231 : ConnectionPool,
232 : Logical,
233 : Migration,
234 : }
235 :
236 : impl DatabaseError {
237 0 : pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
238 0 : match self {
239 0 : Self::Query(_) => DatabaseErrorLabel::Query,
240 0 : Self::Connection(_) => DatabaseErrorLabel::Connection,
241 0 : Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
242 0 : Self::Logical(_) => DatabaseErrorLabel::Logical,
243 0 : Self::Migration(_) => DatabaseErrorLabel::Migration,
244 : }
245 0 : }
246 : }
247 :
248 : /// Update the leadership status metric gauges to reflect the requested status
249 0 : pub(crate) fn update_leadership_status(status: LeadershipStatus) {
250 0 : let status_metric = &METRICS_REGISTRY
251 0 : .metrics_group
252 0 : .storage_controller_leadership_status;
253 :
254 0 : for s in LeadershipStatus::iter() {
255 0 : if s == status {
256 0 : status_metric.set(LeadershipStatusGroup { status: s }, 1);
257 0 : } else {
258 0 : status_metric.set(LeadershipStatusGroup { status: s }, 0);
259 0 : }
260 : }
261 0 : }
|