Line data Source code
1 : //!
2 : //! This module provides metric definitions for the storage controller.
3 : //!
4 : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
5 : //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
6 : //! static.
7 : //!
8 : //! The rest of the code defines label group types and deals with converting outer types to labels.
9 : //!
10 : use std::sync::Mutex;
11 :
12 : use bytes::Bytes;
13 : use measured::label::LabelValue;
14 : use measured::metric::histogram;
15 : use measured::{FixedCardinalityLabel, MetricGroup};
16 : use metrics::NeonMetrics;
17 : use once_cell::sync::Lazy;
18 : use strum::IntoEnumIterator;
19 :
20 : use crate::persistence::{DatabaseError, DatabaseOperation};
21 : use crate::service::LeadershipStatus;
22 :
23 : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
24 : Lazy::new(StorageControllerMetrics::default);
25 :
26 0 : pub fn preinitialize_metrics() {
27 0 : Lazy::force(&METRICS_REGISTRY);
28 0 : }
29 :
30 : pub(crate) struct StorageControllerMetrics {
31 : pub(crate) metrics_group: StorageControllerMetricGroup,
32 : encoder: Mutex<measured::text::BufferedTextEncoder>,
33 : }
34 :
35 17 : #[derive(measured::MetricGroup)]
36 : #[metric(new())]
37 : pub(crate) struct StorageControllerMetricGroup {
38 : /// Count of how many times we spawn a reconcile task
39 : pub(crate) storage_controller_reconcile_spawn: measured::Counter,
40 :
41 : /// Size of the in-memory map of tenant shards
42 : pub(crate) storage_controller_tenant_shards: measured::Gauge,
43 :
44 : /// Size of the in-memory map of pageserver_nodes
45 : pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
46 :
47 : /// Count of how many pageserver nodes from in-memory map have https configured
48 : pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge,
49 :
50 : /// Size of the in-memory map of safekeeper_nodes
51 : pub(crate) storage_controller_safekeeper_nodes: measured::Gauge,
52 :
53 : /// Count of how many safekeeper nodes from in-memory map have https configured
54 : pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge,
55 :
56 : /// Reconciler tasks completed, broken down by success/failure/cancelled
57 : pub(crate) storage_controller_reconcile_complete:
58 : measured::CounterVec<ReconcileCompleteLabelGroupSet>,
59 :
60 : /// Count of how many times we make an optimization change to a tenant's scheduling
61 : pub(crate) storage_controller_schedule_optimization: measured::Counter,
62 :
63 : /// How many shards are not scheduled into their preferred AZ
64 : pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
65 :
66 : /// How many shard locations (secondary or attached) on each node
67 : pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
68 :
69 : /// How many _attached_ shard locations on each node
70 : pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
71 :
72 : /// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
73 : /// preferred AZ)
74 : pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
75 :
76 : /// How many shards would like to reconcile but were blocked by concurrency limits
77 : pub(crate) storage_controller_pending_reconciles: measured::Gauge,
78 :
79 : /// HTTP request status counters for handled requests
80 : pub(crate) storage_controller_http_request_status:
81 : measured::CounterVec<HttpRequestStatusLabelGroupSet>,
82 :
83 : /// HTTP request handler latency across all status codes
84 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
85 : pub(crate) storage_controller_http_request_latency:
86 : measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
87 :
88 : /// HTTP rate limiting latency across all tenants and endpoints
89 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 10.0))]
90 : pub(crate) storage_controller_http_request_rate_limited: measured::Histogram<10>,
91 :
92 : /// Count of HTTP requests to the pageserver that resulted in an error,
93 : /// broken down by the pageserver node id, request name and method
94 : pub(crate) storage_controller_pageserver_request_error:
95 : measured::CounterVec<PageserverRequestLabelGroupSet>,
96 :
97 : /// Count of HTTP requests to the safekeeper that resulted in an error,
98 : /// broken down by the safekeeper node id, request name and method
99 : pub(crate) storage_controller_safekeeper_request_error:
100 : measured::CounterVec<PageserverRequestLabelGroupSet>,
101 :
102 : /// Latency of HTTP requests to the pageserver, broken down by pageserver
103 : /// node id, request name and method. This include both successful and unsuccessful
104 : /// requests.
105 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
106 : pub(crate) storage_controller_pageserver_request_latency:
107 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
108 :
109 : /// Latency of HTTP requests to the safekeeper, broken down by safekeeper
110 : /// node id, request name and method. This include both successful and unsuccessful
111 : /// requests.
112 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
113 : pub(crate) storage_controller_safekeeper_request_latency:
114 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
115 :
116 : /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
117 : /// broken down by the pageserver node id, request name and method
118 : pub(crate) storage_controller_passthrough_request_error:
119 : measured::CounterVec<PageserverRequestLabelGroupSet>,
120 :
121 : /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
122 : /// node id, request name and method. This include both successful and unsuccessful
123 : /// requests.
124 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
125 : pub(crate) storage_controller_passthrough_request_latency:
126 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
127 :
128 : /// Count of errors in database queries, broken down by error type and operation.
129 : pub(crate) storage_controller_database_query_error:
130 : measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
131 :
132 : /// Latency of database queries, broken down by operation.
133 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
134 : pub(crate) storage_controller_database_query_latency:
135 : measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
136 :
137 : pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
138 :
139 : /// HTTP request status counters for handled requests
140 : pub(crate) storage_controller_reconcile_long_running:
141 : measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
142 : }
143 :
144 : impl StorageControllerMetrics {
145 0 : pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
146 0 : let mut encoder = self.encoder.lock().unwrap();
147 0 : neon_metrics
148 0 : .collect_group_into(&mut *encoder)
149 0 : .unwrap_or_else(|infallible| match infallible {});
150 0 : self.metrics_group
151 0 : .collect_group_into(&mut *encoder)
152 0 : .unwrap_or_else(|infallible| match infallible {});
153 0 : encoder.finish()
154 0 : }
155 : }
156 :
157 : impl Default for StorageControllerMetrics {
158 17 : fn default() -> Self {
159 17 : let mut metrics_group = StorageControllerMetricGroup::new();
160 17 : metrics_group
161 17 : .storage_controller_reconcile_complete
162 17 : .init_all_dense();
163 17 :
164 17 : Self {
165 17 : metrics_group,
166 17 : encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
167 17 : }
168 17 : }
169 : }
170 :
171 102 : #[derive(measured::LabelGroup, Clone)]
172 : #[label(set = NodeLabelGroupSet)]
173 : pub(crate) struct NodeLabelGroup<'a> {
174 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
175 : pub(crate) az: &'a str,
176 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
177 : pub(crate) node_id: &'a str,
178 : }
179 :
180 51 : #[derive(measured::LabelGroup)]
181 : #[label(set = ReconcileCompleteLabelGroupSet)]
182 : pub(crate) struct ReconcileCompleteLabelGroup {
183 : pub(crate) status: ReconcileOutcome,
184 : }
185 :
186 34 : #[derive(measured::LabelGroup)]
187 : #[label(set = HttpRequestStatusLabelGroupSet)]
188 : pub(crate) struct HttpRequestStatusLabelGroup<'a> {
189 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
190 : pub(crate) path: &'a str,
191 : pub(crate) method: Method,
192 : pub(crate) status: StatusCode,
193 : }
194 :
195 34 : #[derive(measured::LabelGroup)]
196 : #[label(set = HttpRequestLatencyLabelGroupSet)]
197 : pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
198 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
199 : pub(crate) path: &'a str,
200 : pub(crate) method: Method,
201 : }
202 :
203 204 : #[derive(measured::LabelGroup, Clone)]
204 : #[label(set = PageserverRequestLabelGroupSet)]
205 : pub(crate) struct PageserverRequestLabelGroup<'a> {
206 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
207 : pub(crate) pageserver_id: &'a str,
208 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
209 : pub(crate) path: &'a str,
210 : pub(crate) method: Method,
211 : }
212 :
213 68 : #[derive(measured::LabelGroup)]
214 : #[label(set = DatabaseQueryErrorLabelGroupSet)]
215 : pub(crate) struct DatabaseQueryErrorLabelGroup {
216 : pub(crate) error_type: DatabaseErrorLabel,
217 : pub(crate) operation: DatabaseOperation,
218 : }
219 :
220 51 : #[derive(measured::LabelGroup)]
221 : #[label(set = DatabaseQueryLatencyLabelGroupSet)]
222 : pub(crate) struct DatabaseQueryLatencyLabelGroup {
223 : pub(crate) operation: DatabaseOperation,
224 : }
225 :
226 51 : #[derive(measured::LabelGroup)]
227 : #[label(set = LeadershipStatusGroupSet)]
228 : pub(crate) struct LeadershipStatusGroup {
229 : pub(crate) status: LeadershipStatus,
230 : }
231 :
232 34 : #[derive(measured::LabelGroup, Clone)]
233 : #[label(set = ReconcileLongRunningLabelGroupSet)]
234 : pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
235 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
236 : pub(crate) tenant_id: &'a str,
237 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
238 : pub(crate) shard_number: &'a str,
239 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
240 : pub(crate) sequence: &'a str,
241 : }
242 :
243 : #[derive(FixedCardinalityLabel, Clone, Copy)]
244 : pub(crate) enum ReconcileOutcome {
245 : #[label(rename = "ok")]
246 : Success,
247 : Error,
248 : Cancel,
249 : }
250 :
251 : #[derive(FixedCardinalityLabel, Copy, Clone)]
252 : pub(crate) enum Method {
253 : Get,
254 : Put,
255 : Post,
256 : Delete,
257 : Other,
258 : }
259 :
260 : impl From<hyper::Method> for Method {
261 0 : fn from(value: hyper::Method) -> Self {
262 0 : if value == hyper::Method::GET {
263 0 : Method::Get
264 0 : } else if value == hyper::Method::PUT {
265 0 : Method::Put
266 0 : } else if value == hyper::Method::POST {
267 0 : Method::Post
268 0 : } else if value == hyper::Method::DELETE {
269 0 : Method::Delete
270 : } else {
271 0 : Method::Other
272 : }
273 0 : }
274 : }
275 :
276 : #[derive(Clone, Copy)]
277 : pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
278 :
279 : impl LabelValue for StatusCode {
280 0 : fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
281 0 : v.write_int(self.0.as_u16() as i64)
282 0 : }
283 : }
284 :
285 : impl FixedCardinalityLabel for StatusCode {
286 0 : fn cardinality() -> usize {
287 0 : (100..1000).len()
288 0 : }
289 :
290 0 : fn encode(&self) -> usize {
291 0 : self.0.as_u16() as usize
292 0 : }
293 :
294 0 : fn decode(value: usize) -> Self {
295 0 : Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
296 0 : }
297 : }
298 :
299 : #[derive(FixedCardinalityLabel, Clone, Copy)]
300 : pub(crate) enum DatabaseErrorLabel {
301 : Query,
302 : Connection,
303 : ConnectionPool,
304 : Logical,
305 : Migration,
306 : }
307 :
308 : impl DatabaseError {
309 0 : pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
310 0 : match self {
311 0 : Self::Query(_) => DatabaseErrorLabel::Query,
312 0 : Self::Connection(_) => DatabaseErrorLabel::Connection,
313 0 : Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
314 0 : Self::Logical(_) => DatabaseErrorLabel::Logical,
315 0 : Self::Migration(_) => DatabaseErrorLabel::Migration,
316 : }
317 0 : }
318 : }
319 :
320 : /// Update the leadership status metric gauges to reflect the requested status
321 0 : pub(crate) fn update_leadership_status(status: LeadershipStatus) {
322 0 : let status_metric = &METRICS_REGISTRY
323 0 : .metrics_group
324 0 : .storage_controller_leadership_status;
325 :
326 0 : for s in LeadershipStatus::iter() {
327 0 : if s == status {
328 0 : status_metric.set(LeadershipStatusGroup { status: s }, 1);
329 0 : } else {
330 0 : status_metric.set(LeadershipStatusGroup { status: s }, 0);
331 0 : }
332 : }
333 0 : }
|