Line data Source code
1 : //!
2 : //! This module provides metric definitions for the storage controller.
3 : //!
4 : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
5 : //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
6 : //! constant.
7 : //!
8 : //! The rest of the code defines label group types and deals with converting outer types to labels.
9 : //!
10 : use std::sync::Mutex;
11 :
12 : use bytes::Bytes;
13 : use measured::label::LabelValue;
14 : use measured::metric::histogram;
15 : use measured::{FixedCardinalityLabel, MetricGroup};
16 : use metrics::NeonMetrics;
17 : use once_cell::sync::Lazy;
18 : use strum::IntoEnumIterator;
19 :
20 : use crate::persistence::{DatabaseError, DatabaseOperation};
21 : use crate::service::LeadershipStatus;
22 :
23 : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
24 : Lazy::new(StorageControllerMetrics::default);
25 :
26 0 : pub fn preinitialize_metrics() {
27 0 : Lazy::force(&METRICS_REGISTRY);
28 0 : }
29 :
30 : pub(crate) struct StorageControllerMetrics {
31 : pub(crate) metrics_group: StorageControllerMetricGroup,
32 : encoder: Mutex<measured::text::BufferedTextEncoder>,
33 : }
34 :
35 15 : #[derive(measured::MetricGroup)]
36 : #[metric(new())]
37 : pub(crate) struct StorageControllerMetricGroup {
38 : /// Count of how many times we spawn a reconcile task
39 : pub(crate) storage_controller_reconcile_spawn: measured::Counter,
40 :
41 : /// Size of the in-memory map of tenant shards
42 : pub(crate) storage_controller_tenant_shards: measured::Gauge,
43 :
44 : /// Size of the in-memory map of pageserver_nodes
45 : pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
46 :
47 : /// Reconciler tasks completed, broken down by success/failure/cancelled
48 : pub(crate) storage_controller_reconcile_complete:
49 : measured::CounterVec<ReconcileCompleteLabelGroupSet>,
50 :
51 : /// Count of how many times we make an optimization change to a tenant's scheduling
52 : pub(crate) storage_controller_schedule_optimization: measured::Counter,
53 :
54 : /// How many shards are not scheduled into their preferred AZ
55 : pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
56 :
57 : /// How many shard locations (secondary or attached) on each node
58 : pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
59 :
60 : /// How many _attached_ shard locations on each node
61 : pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
62 :
63 : /// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
64 : /// preferred AZ)
65 : pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
66 :
67 : /// How many shards would like to reconcile but were blocked by concurrency limits
68 : pub(crate) storage_controller_pending_reconciles: measured::Gauge,
69 :
70 : /// HTTP request status counters for handled requests
71 : pub(crate) storage_controller_http_request_status:
72 : measured::CounterVec<HttpRequestStatusLabelGroupSet>,
73 :
74 : /// HTTP request handler latency across all status codes
75 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
76 : pub(crate) storage_controller_http_request_latency:
77 : measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
78 :
79 : /// Count of HTTP requests to the pageserver that resulted in an error,
80 : /// broken down by the pageserver node id, request name and method
81 : pub(crate) storage_controller_pageserver_request_error:
82 : measured::CounterVec<PageserverRequestLabelGroupSet>,
83 :
84 : /// Count of HTTP requests to the safekeeper that resulted in an error,
85 : /// broken down by the safekeeper node id, request name and method
86 : pub(crate) storage_controller_safekeeper_request_error:
87 : measured::CounterVec<PageserverRequestLabelGroupSet>,
88 :
89 : /// Latency of HTTP requests to the pageserver, broken down by pageserver
90 : /// node id, request name and method. This include both successful and unsuccessful
91 : /// requests.
92 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
93 : pub(crate) storage_controller_pageserver_request_latency:
94 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
95 :
96 : /// Latency of HTTP requests to the safekeeper, broken down by safekeeper
97 : /// node id, request name and method. This include both successful and unsuccessful
98 : /// requests.
99 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
100 : pub(crate) storage_controller_safekeeper_request_latency:
101 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
102 :
103 : /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
104 : /// broken down by the pageserver node id, request name and method
105 : pub(crate) storage_controller_passthrough_request_error:
106 : measured::CounterVec<PageserverRequestLabelGroupSet>,
107 :
108 : /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
109 : /// node id, request name and method. This include both successful and unsuccessful
110 : /// requests.
111 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
112 : pub(crate) storage_controller_passthrough_request_latency:
113 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
114 :
115 : /// Count of errors in database queries, broken down by error type and operation.
116 : pub(crate) storage_controller_database_query_error:
117 : measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
118 :
119 : /// Latency of database queries, broken down by operation.
120 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
121 : pub(crate) storage_controller_database_query_latency:
122 : measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
123 :
124 : pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
125 :
126 : /// HTTP request status counters for handled requests
127 : pub(crate) storage_controller_reconcile_long_running:
128 : measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
129 : }
130 :
131 : impl StorageControllerMetrics {
132 0 : pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
133 0 : let mut encoder = self.encoder.lock().unwrap();
134 0 : neon_metrics
135 0 : .collect_group_into(&mut *encoder)
136 0 : .unwrap_or_else(|infallible| match infallible {});
137 0 : self.metrics_group
138 0 : .collect_group_into(&mut *encoder)
139 0 : .unwrap_or_else(|infallible| match infallible {});
140 0 : encoder.finish()
141 0 : }
142 : }
143 :
144 : impl Default for StorageControllerMetrics {
145 15 : fn default() -> Self {
146 15 : let mut metrics_group = StorageControllerMetricGroup::new();
147 15 : metrics_group
148 15 : .storage_controller_reconcile_complete
149 15 : .init_all_dense();
150 15 :
151 15 : Self {
152 15 : metrics_group,
153 15 : encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
154 15 : }
155 15 : }
156 : }
157 :
158 90 : #[derive(measured::LabelGroup, Clone)]
159 : #[label(set = NodeLabelGroupSet)]
160 : pub(crate) struct NodeLabelGroup<'a> {
161 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
162 : pub(crate) az: &'a str,
163 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
164 : pub(crate) node_id: &'a str,
165 : }
166 :
167 45 : #[derive(measured::LabelGroup)]
168 : #[label(set = ReconcileCompleteLabelGroupSet)]
169 : pub(crate) struct ReconcileCompleteLabelGroup {
170 : pub(crate) status: ReconcileOutcome,
171 : }
172 :
173 30 : #[derive(measured::LabelGroup)]
174 : #[label(set = HttpRequestStatusLabelGroupSet)]
175 : pub(crate) struct HttpRequestStatusLabelGroup<'a> {
176 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
177 : pub(crate) path: &'a str,
178 : pub(crate) method: Method,
179 : pub(crate) status: StatusCode,
180 : }
181 :
182 30 : #[derive(measured::LabelGroup)]
183 : #[label(set = HttpRequestLatencyLabelGroupSet)]
184 : pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
185 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
186 : pub(crate) path: &'a str,
187 : pub(crate) method: Method,
188 : }
189 :
190 180 : #[derive(measured::LabelGroup, Clone)]
191 : #[label(set = PageserverRequestLabelGroupSet)]
192 : pub(crate) struct PageserverRequestLabelGroup<'a> {
193 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
194 : pub(crate) pageserver_id: &'a str,
195 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
196 : pub(crate) path: &'a str,
197 : pub(crate) method: Method,
198 : }
199 :
200 60 : #[derive(measured::LabelGroup)]
201 : #[label(set = DatabaseQueryErrorLabelGroupSet)]
202 : pub(crate) struct DatabaseQueryErrorLabelGroup {
203 : pub(crate) error_type: DatabaseErrorLabel,
204 : pub(crate) operation: DatabaseOperation,
205 : }
206 :
207 45 : #[derive(measured::LabelGroup)]
208 : #[label(set = DatabaseQueryLatencyLabelGroupSet)]
209 : pub(crate) struct DatabaseQueryLatencyLabelGroup {
210 : pub(crate) operation: DatabaseOperation,
211 : }
212 :
213 45 : #[derive(measured::LabelGroup)]
214 : #[label(set = LeadershipStatusGroupSet)]
215 : pub(crate) struct LeadershipStatusGroup {
216 : pub(crate) status: LeadershipStatus,
217 : }
218 :
219 30 : #[derive(measured::LabelGroup, Clone)]
220 : #[label(set = ReconcileLongRunningLabelGroupSet)]
221 : pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
222 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
223 : pub(crate) tenant_id: &'a str,
224 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
225 : pub(crate) shard_number: &'a str,
226 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
227 : pub(crate) sequence: &'a str,
228 : }
229 :
230 : #[derive(FixedCardinalityLabel, Clone, Copy)]
231 : pub(crate) enum ReconcileOutcome {
232 : #[label(rename = "ok")]
233 : Success,
234 : Error,
235 : Cancel,
236 : }
237 :
238 : #[derive(FixedCardinalityLabel, Copy, Clone)]
239 : pub(crate) enum Method {
240 : Get,
241 : Put,
242 : Post,
243 : Delete,
244 : Other,
245 : }
246 :
247 : impl From<hyper::Method> for Method {
248 0 : fn from(value: hyper::Method) -> Self {
249 0 : if value == hyper::Method::GET {
250 0 : Method::Get
251 0 : } else if value == hyper::Method::PUT {
252 0 : Method::Put
253 0 : } else if value == hyper::Method::POST {
254 0 : Method::Post
255 0 : } else if value == hyper::Method::DELETE {
256 0 : Method::Delete
257 : } else {
258 0 : Method::Other
259 : }
260 0 : }
261 : }
262 :
263 : #[derive(Clone, Copy)]
264 : pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
265 :
266 : impl LabelValue for StatusCode {
267 0 : fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
268 0 : v.write_int(self.0.as_u16() as i64)
269 0 : }
270 : }
271 :
272 : impl FixedCardinalityLabel for StatusCode {
273 0 : fn cardinality() -> usize {
274 0 : (100..1000).len()
275 0 : }
276 :
277 0 : fn encode(&self) -> usize {
278 0 : self.0.as_u16() as usize
279 0 : }
280 :
281 0 : fn decode(value: usize) -> Self {
282 0 : Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
283 0 : }
284 : }
285 :
286 : #[derive(FixedCardinalityLabel, Clone, Copy)]
287 : pub(crate) enum DatabaseErrorLabel {
288 : Query,
289 : Connection,
290 : ConnectionPool,
291 : Logical,
292 : Migration,
293 : }
294 :
295 : impl DatabaseError {
296 0 : pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
297 0 : match self {
298 0 : Self::Query(_) => DatabaseErrorLabel::Query,
299 0 : Self::Connection(_) => DatabaseErrorLabel::Connection,
300 0 : Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
301 0 : Self::Logical(_) => DatabaseErrorLabel::Logical,
302 0 : Self::Migration(_) => DatabaseErrorLabel::Migration,
303 : }
304 0 : }
305 : }
306 :
307 : /// Update the leadership status metric gauges to reflect the requested status
308 0 : pub(crate) fn update_leadership_status(status: LeadershipStatus) {
309 0 : let status_metric = &METRICS_REGISTRY
310 0 : .metrics_group
311 0 : .storage_controller_leadership_status;
312 :
313 0 : for s in LeadershipStatus::iter() {
314 0 : if s == status {
315 0 : status_metric.set(LeadershipStatusGroup { status: s }, 1);
316 0 : } else {
317 0 : status_metric.set(LeadershipStatusGroup { status: s }, 0);
318 0 : }
319 : }
320 0 : }
|