Line data Source code
1 : //!
2 : //! This module provides metric definitions for the storage controller.
3 : //!
4 : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
5 : //! these metrics together with their text encoder. It is globally available via the
6 : //! [`METRICS_REGISTRY`] static.
7 : //!
8 : //! The rest of the code defines label group types and deals with converting outer types to labels.
9 : //!
10 : use bytes::Bytes;
11 : use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
12 : use metrics::NeonMetrics;
13 : use once_cell::sync::Lazy;
14 : use std::sync::Mutex;
15 : use strum::IntoEnumIterator;
16 :
17 : use crate::{
18 : persistence::{DatabaseError, DatabaseOperation},
19 : service::LeadershipStatus,
20 : };
21 :
22 : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
23 : Lazy::new(StorageControllerMetrics::default);
24 :
25 0 : pub fn preinitialize_metrics() {
26 0 : Lazy::force(&METRICS_REGISTRY);
27 0 : }
28 :
/// Bundles the storage controller metric group with the text encoder used to render it.
pub(crate) struct StorageControllerMetrics {
    /// All metric definitions; callers record observations through this field.
    pub(crate) metrics_group: StorageControllerMetricGroup,
    /// Buffered text encoder reused by `encode`; guarded by a mutex because `encode` only
    /// has `&self`.
    encoder: Mutex<measured::text::BufferedTextEncoder>,
}
33 :
34 9 : #[derive(measured::MetricGroup)]
35 : #[metric(new())]
36 : pub(crate) struct StorageControllerMetricGroup {
37 : /// Count of how many times we spawn a reconcile task
38 : pub(crate) storage_controller_reconcile_spawn: measured::Counter,
39 :
40 : /// Size of the in-memory map of tenant shards
41 : pub(crate) storage_controller_tenant_shards: measured::Gauge,
42 :
43 : /// Size of the in-memory map of pageserver_nodes
44 : pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
45 :
46 : /// Reconciler tasks completed, broken down by success/failure/cancelled
47 : pub(crate) storage_controller_reconcile_complete:
48 : measured::CounterVec<ReconcileCompleteLabelGroupSet>,
49 :
50 : /// Count of how many times we make an optimization change to a tenant's scheduling
51 : pub(crate) storage_controller_schedule_optimization: measured::Counter,
52 :
53 : /// How many shards are not scheduled into their preferred AZ
54 : pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
55 :
56 : /// How many shards would like to reconcile but were blocked by concurrency limits
57 : pub(crate) storage_controller_pending_reconciles: measured::Gauge,
58 :
59 : /// HTTP request status counters for handled requests
60 : pub(crate) storage_controller_http_request_status:
61 : measured::CounterVec<HttpRequestStatusLabelGroupSet>,
62 :
63 : /// HTTP request handler latency across all status codes
64 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
65 : pub(crate) storage_controller_http_request_latency:
66 : measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
67 :
68 : /// Count of HTTP requests to the pageserver that resulted in an error,
69 : /// broken down by the pageserver node id, request name and method
70 : pub(crate) storage_controller_pageserver_request_error:
71 : measured::CounterVec<PageserverRequestLabelGroupSet>,
72 :
73 : /// Latency of HTTP requests to the pageserver, broken down by pageserver
74 : /// node id, request name and method. This include both successful and unsuccessful
75 : /// requests.
76 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
77 : pub(crate) storage_controller_pageserver_request_latency:
78 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
79 :
80 : /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
81 : /// broken down by the pageserver node id, request name and method
82 : pub(crate) storage_controller_passthrough_request_error:
83 : measured::CounterVec<PageserverRequestLabelGroupSet>,
84 :
85 : /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
86 : /// node id, request name and method. This include both successful and unsuccessful
87 : /// requests.
88 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
89 : pub(crate) storage_controller_passthrough_request_latency:
90 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
91 :
92 : /// Count of errors in database queries, broken down by error type and operation.
93 : pub(crate) storage_controller_database_query_error:
94 : measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
95 :
96 : /// Latency of database queries, broken down by operation.
97 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
98 : pub(crate) storage_controller_database_query_latency:
99 : measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
100 :
101 : pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
102 :
103 : /// HTTP request status counters for handled requests
104 : pub(crate) storage_controller_reconcile_long_running:
105 : measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
106 : }
107 :
108 : impl StorageControllerMetrics {
109 0 : pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
110 0 : let mut encoder = self.encoder.lock().unwrap();
111 0 : neon_metrics
112 0 : .collect_group_into(&mut *encoder)
113 0 : .unwrap_or_else(|infallible| match infallible {});
114 0 : self.metrics_group
115 0 : .collect_group_into(&mut *encoder)
116 0 : .unwrap_or_else(|infallible| match infallible {});
117 0 : encoder.finish()
118 0 : }
119 : }
120 :
121 : impl Default for StorageControllerMetrics {
122 9 : fn default() -> Self {
123 9 : let mut metrics_group = StorageControllerMetricGroup::new();
124 9 : metrics_group
125 9 : .storage_controller_reconcile_complete
126 9 : .init_all_dense();
127 9 :
128 9 : Self {
129 9 : metrics_group,
130 9 : encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
131 9 : }
132 9 : }
133 : }
134 :
/// Labels for `storage_controller_reconcile_complete`.
#[derive(measured::LabelGroup)]
#[label(set = ReconcileCompleteLabelGroupSet)]
pub(crate) struct ReconcileCompleteLabelGroup {
    /// Outcome of the completed reconcile task.
    pub(crate) status: ReconcileOutcome,
}
140 :
/// Labels for `storage_controller_http_request_status`.
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
    /// Request path; a dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
    /// HTTP status code of the response.
    pub(crate) status: StatusCode,
}
149 :
/// Labels for `storage_controller_http_request_latency`.
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
    /// Request path; a dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
}
157 :
/// Labels shared by the pageserver request and pass-through request error/latency metrics.
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
    /// Pageserver node id as a string; a dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) pageserver_id: &'a str,
    /// Request path; a dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
}
167 :
/// Labels for `storage_controller_database_query_error`.
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
    /// Category of the database error (see `DatabaseError::error_label`).
    pub(crate) error_type: DatabaseErrorLabel,
    /// Database operation that failed.
    pub(crate) operation: DatabaseOperation,
}
174 :
/// Labels for `storage_controller_database_query_latency`.
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryLatencyLabelGroupSet)]
pub(crate) struct DatabaseQueryLatencyLabelGroup {
    /// Database operation whose latency is being recorded.
    pub(crate) operation: DatabaseOperation,
}
180 :
/// Labels for `storage_controller_leadership_status`.
#[derive(measured::LabelGroup)]
#[label(set = LeadershipStatusGroupSet)]
pub(crate) struct LeadershipStatusGroup {
    /// Leadership status that the gauge series refers to.
    pub(crate) status: LeadershipStatus,
}
186 :
/// Labels for `storage_controller_reconcile_long_running`.
///
/// All three labels are dynamic string labels backed by a `lasso::ThreadedRodeo`.
#[derive(measured::LabelGroup, Clone)]
#[label(set = ReconcileLongRunningLabelGroupSet)]
pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
    /// Tenant id, rendered as a string by the caller.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) tenant_id: &'a str,
    /// Shard number, rendered as a string by the caller.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) shard_number: &'a str,
    /// Sequence value, rendered as a string by the caller.
    // NOTE(review): presumably the reconcile sequence number — confirm against the call site.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) sequence: &'a str,
}
197 :
/// Outcome of a reconcile task, used as the `status` label of
/// `storage_controller_reconcile_complete`.
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum ReconcileOutcome {
    // Exported under the label value "ok" rather than a name derived from the variant.
    #[label(rename = "ok")]
    Success,
    Error,
    Cancel,
}
205 :
/// HTTP method label with a fixed set of values.
///
/// Converted from `hyper::Method` via `From`; any method other than GET, PUT, POST or DELETE
/// is reported as `Other`.
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub(crate) enum Method {
    Get,
    Put,
    Post,
    Delete,
    // Catch-all for every method not listed above.
    Other,
}
214 :
215 : impl From<hyper::Method> for Method {
216 0 : fn from(value: hyper::Method) -> Self {
217 0 : if value == hyper::Method::GET {
218 0 : Method::Get
219 0 : } else if value == hyper::Method::PUT {
220 0 : Method::Put
221 0 : } else if value == hyper::Method::POST {
222 0 : Method::Post
223 0 : } else if value == hyper::Method::DELETE {
224 0 : Method::Delete
225 : } else {
226 0 : Method::Other
227 : }
228 0 : }
229 : }
230 :
/// Newtype around `hyper::http::StatusCode` so that the label traits from `measured` can be
/// implemented for it in this module.
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
233 :
234 : impl LabelValue for StatusCode {
235 0 : fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
236 0 : v.write_int(self.0.as_u16() as i64)
237 0 : }
238 : }
239 :
240 : impl FixedCardinalityLabel for StatusCode {
241 0 : fn cardinality() -> usize {
242 0 : (100..1000).len()
243 0 : }
244 :
245 0 : fn encode(&self) -> usize {
246 0 : self.0.as_u16() as usize
247 0 : }
248 :
249 0 : fn decode(value: usize) -> Self {
250 0 : Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
251 0 : }
252 : }
253 :
/// Category of a database error, used as the `error_type` label of
/// `storage_controller_database_query_error`.
///
/// Each variant corresponds to the `DatabaseError` variant of the same name
/// (see `DatabaseError::error_label`).
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum DatabaseErrorLabel {
    Query,
    Connection,
    ConnectionPool,
    Logical,
    Migration,
}
262 :
263 : impl DatabaseError {
264 0 : pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
265 0 : match self {
266 0 : Self::Query(_) => DatabaseErrorLabel::Query,
267 0 : Self::Connection(_) => DatabaseErrorLabel::Connection,
268 0 : Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
269 0 : Self::Logical(_) => DatabaseErrorLabel::Logical,
270 0 : Self::Migration(_) => DatabaseErrorLabel::Migration,
271 : }
272 0 : }
273 : }
274 :
275 : /// Update the leadership status metric gauges to reflect the requested status
276 0 : pub(crate) fn update_leadership_status(status: LeadershipStatus) {
277 0 : let status_metric = &METRICS_REGISTRY
278 0 : .metrics_group
279 0 : .storage_controller_leadership_status;
280 :
281 0 : for s in LeadershipStatus::iter() {
282 0 : if s == status {
283 0 : status_metric.set(LeadershipStatusGroup { status: s }, 1);
284 0 : } else {
285 0 : status_metric.set(LeadershipStatusGroup { status: s }, 0);
286 0 : }
287 : }
288 0 : }
|