Line data Source code
1 : //!
2 : //! This module provides metric definitions for the storage controller.
3 : //!
4 : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
5 : //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
6 : //! constant.
7 : //!
8 : //! The rest of the code defines label group types and deals with converting outer types to labels.
9 : //!
10 : use bytes::Bytes;
11 : use measured::{
12 : label::{LabelValue, StaticLabelSet},
13 : FixedCardinalityLabel, MetricGroup,
14 : };
15 : use once_cell::sync::Lazy;
16 : use std::sync::Mutex;
17 :
18 : use crate::persistence::{DatabaseError, DatabaseOperation};
19 :
20 : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
21 : Lazy::new(StorageControllerMetrics::default);
22 :
23 0 : pub fn preinitialize_metrics() {
24 0 : Lazy::force(&METRICS_REGISTRY);
25 0 : }
26 :
27 : pub(crate) struct StorageControllerMetrics {
28 : pub(crate) metrics_group: StorageControllerMetricGroup,
29 : encoder: Mutex<measured::text::TextEncoder>,
30 : }
31 :
32 : #[derive(measured::MetricGroup)]
33 : pub(crate) struct StorageControllerMetricGroup {
34 : /// Count of how many times we spawn a reconcile task
35 : pub(crate) storage_controller_reconcile_spawn: measured::Counter,
36 : /// Reconciler tasks completed, broken down by success/failure/cancelled
37 : pub(crate) storage_controller_reconcile_complete:
38 : measured::CounterVec<ReconcileCompleteLabelGroupSet>,
39 :
40 : /// Count of how many times we make an optimization change to a tenant's scheduling
41 : pub(crate) storage_controller_schedule_optimization: measured::Counter,
42 :
43 : /// HTTP request status counters for handled requests
44 : pub(crate) storage_controller_http_request_status:
45 : measured::CounterVec<HttpRequestStatusLabelGroupSet>,
46 : /// HTTP request handler latency across all status codes
47 : pub(crate) storage_controller_http_request_latency:
48 : measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
49 :
50 : /// Count of HTTP requests to the pageserver that resulted in an error,
51 : /// broken down by the pageserver node id, request name and method
52 : pub(crate) storage_controller_pageserver_request_error:
53 : measured::CounterVec<PageserverRequestLabelGroupSet>,
54 :
55 : /// Latency of HTTP requests to the pageserver, broken down by pageserver
56 : /// node id, request name and method. This include both successful and unsuccessful
57 : /// requests.
58 : pub(crate) storage_controller_pageserver_request_latency:
59 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
60 :
61 : /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
62 : /// broken down by the pageserver node id, request name and method
63 : pub(crate) storage_controller_passthrough_request_error:
64 : measured::CounterVec<PageserverRequestLabelGroupSet>,
65 :
66 : /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
67 : /// node id, request name and method. This include both successful and unsuccessful
68 : /// requests.
69 : pub(crate) storage_controller_passthrough_request_latency:
70 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
71 :
72 : /// Count of errors in database queries, broken down by error type and operation.
73 : pub(crate) storage_controller_database_query_error:
74 : measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
75 :
76 : /// Latency of database queries, broken down by operation.
77 : pub(crate) storage_controller_database_query_latency:
78 : measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
79 : }
80 :
81 : impl StorageControllerMetrics {
82 0 : pub(crate) fn encode(&self) -> Bytes {
83 0 : let mut encoder = self.encoder.lock().unwrap();
84 0 : self.metrics_group.collect_into(&mut *encoder);
85 0 : encoder.finish()
86 0 : }
87 : }
88 :
89 : impl Default for StorageControllerMetrics {
90 6 : fn default() -> Self {
91 6 : Self {
92 6 : metrics_group: StorageControllerMetricGroup::new(),
93 6 : encoder: Mutex::new(measured::text::TextEncoder::new()),
94 6 : }
95 6 : }
96 : }
97 :
98 : impl StorageControllerMetricGroup {
99 6 : pub(crate) fn new() -> Self {
100 6 : Self {
101 6 : storage_controller_reconcile_spawn: measured::Counter::new(),
102 6 : storage_controller_reconcile_complete: measured::CounterVec::new(
103 6 : ReconcileCompleteLabelGroupSet {
104 6 : status: StaticLabelSet::new(),
105 6 : },
106 6 : ),
107 6 : storage_controller_schedule_optimization: measured::Counter::new(),
108 6 : storage_controller_http_request_status: measured::CounterVec::new(
109 6 : HttpRequestStatusLabelGroupSet {
110 6 : path: lasso::ThreadedRodeo::new(),
111 6 : method: StaticLabelSet::new(),
112 6 : status: StaticLabelSet::new(),
113 6 : },
114 6 : ),
115 6 : storage_controller_http_request_latency: measured::HistogramVec::new(
116 6 : measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
117 6 : ),
118 6 : storage_controller_pageserver_request_error: measured::CounterVec::new(
119 6 : PageserverRequestLabelGroupSet {
120 6 : pageserver_id: lasso::ThreadedRodeo::new(),
121 6 : path: lasso::ThreadedRodeo::new(),
122 6 : method: StaticLabelSet::new(),
123 6 : },
124 6 : ),
125 6 : storage_controller_pageserver_request_latency: measured::HistogramVec::new(
126 6 : measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
127 6 : ),
128 6 : storage_controller_passthrough_request_error: measured::CounterVec::new(
129 6 : PageserverRequestLabelGroupSet {
130 6 : pageserver_id: lasso::ThreadedRodeo::new(),
131 6 : path: lasso::ThreadedRodeo::new(),
132 6 : method: StaticLabelSet::new(),
133 6 : },
134 6 : ),
135 6 : storage_controller_passthrough_request_latency: measured::HistogramVec::new(
136 6 : measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
137 6 : ),
138 6 : storage_controller_database_query_error: measured::CounterVec::new(
139 6 : DatabaseQueryErrorLabelGroupSet {
140 6 : operation: StaticLabelSet::new(),
141 6 : error_type: StaticLabelSet::new(),
142 6 : },
143 6 : ),
144 6 : storage_controller_database_query_latency: measured::HistogramVec::new(
145 6 : measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
146 6 : ),
147 6 : }
148 6 : }
149 : }
150 :
151 6 : #[derive(measured::LabelGroup)]
152 : #[label(set = ReconcileCompleteLabelGroupSet)]
153 : pub(crate) struct ReconcileCompleteLabelGroup {
154 : pub(crate) status: ReconcileOutcome,
155 : }
156 :
157 : #[derive(measured::LabelGroup)]
158 : #[label(set = HttpRequestStatusLabelGroupSet)]
159 : pub(crate) struct HttpRequestStatusLabelGroup<'a> {
160 : #[label(dynamic_with = lasso::ThreadedRodeo)]
161 : pub(crate) path: &'a str,
162 : pub(crate) method: Method,
163 : pub(crate) status: StatusCode,
164 : }
165 :
166 : #[derive(measured::LabelGroup)]
167 : #[label(set = HttpRequestLatencyLabelGroupSet)]
168 : pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
169 : #[label(dynamic_with = lasso::ThreadedRodeo)]
170 : pub(crate) path: &'a str,
171 : pub(crate) method: Method,
172 : }
173 :
174 : impl Default for HttpRequestLatencyLabelGroupSet {
175 6 : fn default() -> Self {
176 6 : Self {
177 6 : path: lasso::ThreadedRodeo::new(),
178 6 : method: StaticLabelSet::new(),
179 6 : }
180 6 : }
181 : }
182 :
183 : #[derive(measured::LabelGroup, Clone)]
184 : #[label(set = PageserverRequestLabelGroupSet)]
185 : pub(crate) struct PageserverRequestLabelGroup<'a> {
186 : #[label(dynamic_with = lasso::ThreadedRodeo)]
187 : pub(crate) pageserver_id: &'a str,
188 : #[label(dynamic_with = lasso::ThreadedRodeo)]
189 : pub(crate) path: &'a str,
190 : pub(crate) method: Method,
191 : }
192 :
193 : impl Default for PageserverRequestLabelGroupSet {
194 12 : fn default() -> Self {
195 12 : Self {
196 12 : pageserver_id: lasso::ThreadedRodeo::new(),
197 12 : path: lasso::ThreadedRodeo::new(),
198 12 : method: StaticLabelSet::new(),
199 12 : }
200 12 : }
201 : }
202 :
203 12 : #[derive(measured::LabelGroup)]
204 : #[label(set = DatabaseQueryErrorLabelGroupSet)]
205 : pub(crate) struct DatabaseQueryErrorLabelGroup {
206 : pub(crate) error_type: DatabaseErrorLabel,
207 : pub(crate) operation: DatabaseOperation,
208 : }
209 :
210 12 : #[derive(measured::LabelGroup)]
211 : #[label(set = DatabaseQueryLatencyLabelGroupSet)]
212 : pub(crate) struct DatabaseQueryLatencyLabelGroup {
213 : pub(crate) operation: DatabaseOperation,
214 : }
215 :
216 : #[derive(FixedCardinalityLabel)]
217 : pub(crate) enum ReconcileOutcome {
218 : #[label(rename = "ok")]
219 : Success,
220 : Error,
221 : Cancel,
222 : }
223 :
224 : #[derive(FixedCardinalityLabel, Clone)]
225 : pub(crate) enum Method {
226 : Get,
227 : Put,
228 : Post,
229 : Delete,
230 : Other,
231 : }
232 :
233 : impl From<hyper::Method> for Method {
234 0 : fn from(value: hyper::Method) -> Self {
235 0 : if value == hyper::Method::GET {
236 0 : Method::Get
237 0 : } else if value == hyper::Method::PUT {
238 0 : Method::Put
239 0 : } else if value == hyper::Method::POST {
240 0 : Method::Post
241 0 : } else if value == hyper::Method::DELETE {
242 0 : Method::Delete
243 : } else {
244 0 : Method::Other
245 : }
246 0 : }
247 : }
248 :
249 : pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
250 :
251 : impl LabelValue for StatusCode {
252 0 : fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
253 0 : v.write_int(self.0.as_u16() as u64)
254 0 : }
255 : }
256 :
257 : impl FixedCardinalityLabel for StatusCode {
258 0 : fn cardinality() -> usize {
259 0 : (100..1000).len()
260 0 : }
261 :
262 0 : fn encode(&self) -> usize {
263 0 : self.0.as_u16() as usize
264 0 : }
265 :
266 0 : fn decode(value: usize) -> Self {
267 0 : Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
268 0 : }
269 : }
270 :
271 : #[derive(FixedCardinalityLabel)]
272 : pub(crate) enum DatabaseErrorLabel {
273 : Query,
274 : Connection,
275 : ConnectionPool,
276 : Logical,
277 : }
278 :
279 : impl DatabaseError {
280 0 : pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
281 0 : match self {
282 0 : Self::Query(_) => DatabaseErrorLabel::Query,
283 0 : Self::Connection(_) => DatabaseErrorLabel::Connection,
284 0 : Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
285 0 : Self::Logical(_) => DatabaseErrorLabel::Logical,
286 : }
287 0 : }
288 : }
|