Line data Source code
1 : //!
2 : //! This module provides metric definitions for the storage controller.
3 : //!
4 : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
5 : //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
6 : //! static.
7 : //!
8 : //! The rest of the code defines label group types and deals with converting outer types to labels.
9 : //!
10 : use bytes::Bytes;
11 : use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
12 : use metrics::NeonMetrics;
13 : use once_cell::sync::Lazy;
14 : use std::sync::Mutex;
15 :
16 : use crate::persistence::{DatabaseError, DatabaseOperation};
17 :
18 : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
19 : Lazy::new(StorageControllerMetrics::default);
20 :
21 0 : pub fn preinitialize_metrics() {
22 0 : Lazy::force(&METRICS_REGISTRY);
23 0 : }
24 :
/// Storage controller metrics together with the encoder used to render them.
///
/// `metrics_group` holds the metric values; `encoder` is the buffered text
/// encoder that `encode` collects the metrics into.
25 : pub(crate) struct StorageControllerMetrics {
26 : pub(crate) metrics_group: StorageControllerMetricGroup,
// Wrapped in a `Mutex` so that `encode(&self)` can obtain mutable access to the encoder.
27 : encoder: Mutex<measured::text::BufferedTextEncoder>,
28 : }
29 :
// All storage controller metrics, collected as one `measured` metric group.
//
// Counters are plain `Counter`s or `CounterVec`s keyed by one of the label group
// sets defined further down in this file; latencies are `HistogramVec`s with
// 5 buckets whose thresholds come from `exponential_buckets(0.1, 2.0)`.
//
// NOTE(review): the per-field doc comments are presumably picked up by the derive
// as metric help text -- verify before rewording them. For that reason the
// "This include both" wording on two fields below (should read "includes") is
// left untouched here.
// NOTE(review): the unit of the histogram thresholds is not visible in this file;
// presumably seconds -- confirm at the observation sites.
30 6 : #[derive(measured::MetricGroup)]
31 : #[metric(new())]
32 : pub(crate) struct StorageControllerMetricGroup {
33 : /// Count of how many times we spawn a reconcile task
34 : pub(crate) storage_controller_reconcile_spawn: measured::Counter,
35 :
36 : /// Reconciler tasks completed, broken down by success/failure/cancelled
37 : pub(crate) storage_controller_reconcile_complete:
38 : measured::CounterVec<ReconcileCompleteLabelGroupSet>,
39 :
40 : /// Count of how many times we make an optimization change to a tenant's scheduling
41 : pub(crate) storage_controller_schedule_optimization: measured::Counter,
42 :
43 : /// HTTP request status counters for handled requests
44 : pub(crate) storage_controller_http_request_status:
45 : measured::CounterVec<HttpRequestStatusLabelGroupSet>,
46 :
47 : /// HTTP request handler latency across all status codes
48 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
49 : pub(crate) storage_controller_http_request_latency:
50 : measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
51 :
52 : /// Count of HTTP requests to the pageserver that resulted in an error,
53 : /// broken down by the pageserver node id, request name and method
54 : pub(crate) storage_controller_pageserver_request_error:
55 : measured::CounterVec<PageserverRequestLabelGroupSet>,
56 :
57 : /// Latency of HTTP requests to the pageserver, broken down by pageserver
58 : /// node id, request name and method. This include both successful and unsuccessful
59 : /// requests.
60 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
61 : pub(crate) storage_controller_pageserver_request_latency:
62 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
63 :
64 : /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
65 : /// broken down by the pageserver node id, request name and method
66 : pub(crate) storage_controller_passthrough_request_error:
67 : measured::CounterVec<PageserverRequestLabelGroupSet>,
68 :
69 : /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
70 : /// node id, request name and method. This include both successful and unsuccessful
71 : /// requests.
72 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
73 : pub(crate) storage_controller_passthrough_request_latency:
74 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
75 :
76 : /// Count of errors in database queries, broken down by error type and operation.
77 : pub(crate) storage_controller_database_query_error:
78 : measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
79 :
80 : /// Latency of database queries, broken down by operation.
81 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
82 : pub(crate) storage_controller_database_query_latency:
83 : measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
84 : }
85 :
86 : impl StorageControllerMetrics {
87 0 : pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
88 0 : let mut encoder = self.encoder.lock().unwrap();
89 0 : neon_metrics
90 0 : .collect_group_into(&mut *encoder)
91 0 : .unwrap_or_else(|infallible| match infallible {});
92 0 : self.metrics_group
93 0 : .collect_group_into(&mut *encoder)
94 0 : .unwrap_or_else(|infallible| match infallible {});
95 0 : encoder.finish()
96 0 : }
97 : }
98 :
99 : impl Default for StorageControllerMetrics {
100 6 : fn default() -> Self {
101 6 : let mut metrics_group = StorageControllerMetricGroup::new();
102 6 : metrics_group
103 6 : .storage_controller_reconcile_complete
104 6 : .init_all_dense();
105 6 :
106 6 : Self {
107 6 : metrics_group,
108 6 : encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
109 6 : }
110 6 : }
111 : }
112 :
/// Labels for `storage_controller_reconcile_complete`: the outcome of the
/// finished reconcile task.
113 18 : #[derive(measured::LabelGroup)]
114 : #[label(set = ReconcileCompleteLabelGroupSet)]
115 : pub(crate) struct ReconcileCompleteLabelGroup {
116 : pub(crate) status: ReconcileOutcome,
117 : }
118 :
/// Labels for `storage_controller_http_request_status`: request path, HTTP
/// method and response status code.
///
/// `path` is a dynamic (string) label configured with `lasso::ThreadedRodeo`;
/// `method` and `status` are fixed-cardinality labels.
119 12 : #[derive(measured::LabelGroup)]
120 : #[label(set = HttpRequestStatusLabelGroupSet)]
121 : pub(crate) struct HttpRequestStatusLabelGroup<'a> {
122 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
123 : pub(crate) path: &'a str,
124 : pub(crate) method: Method,
125 : pub(crate) status: StatusCode,
126 : }
127 :
/// Labels for `storage_controller_http_request_latency`: request path and HTTP
/// method (no status code, the histogram covers all statuses).
///
/// `path` is a dynamic (string) label configured with `lasso::ThreadedRodeo`.
128 12 : #[derive(measured::LabelGroup)]
129 : #[label(set = HttpRequestLatencyLabelGroupSet)]
130 : pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
131 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
132 : pub(crate) path: &'a str,
133 : pub(crate) method: Method,
134 : }
135 :
/// Labels shared by the pageserver and pass-through request metrics (error
/// counters and latency histograms): pageserver id, request path and HTTP method.
///
/// `pageserver_id` and `path` are dynamic (string) labels configured with
/// `lasso::ThreadedRodeo`; `method` is a fixed-cardinality label.
136 48 : #[derive(measured::LabelGroup, Clone)]
137 : #[label(set = PageserverRequestLabelGroupSet)]
138 : pub(crate) struct PageserverRequestLabelGroup<'a> {
139 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
140 : pub(crate) pageserver_id: &'a str,
141 : #[label(dynamic_with = lasso::ThreadedRodeo, default)]
142 : pub(crate) path: &'a str,
143 : pub(crate) method: Method,
144 : }
145 :
/// Labels for `storage_controller_database_query_error`: the kind of database
/// error and the operation that failed.
146 24 : #[derive(measured::LabelGroup)]
147 : #[label(set = DatabaseQueryErrorLabelGroupSet)]
148 : pub(crate) struct DatabaseQueryErrorLabelGroup {
149 : pub(crate) error_type: DatabaseErrorLabel,
150 : pub(crate) operation: DatabaseOperation,
151 : }
152 :
/// Labels for `storage_controller_database_query_latency`: the database
/// operation being timed.
153 18 : #[derive(measured::LabelGroup)]
154 : #[label(set = DatabaseQueryLatencyLabelGroupSet)]
155 : pub(crate) struct DatabaseQueryLatencyLabelGroup {
156 : pub(crate) operation: DatabaseOperation,
157 : }
158 :
/// Outcome of a completed reconcile task, used as the `status` label of
/// [`ReconcileCompleteLabelGroup`].
///
/// `Success` carries `#[label(rename = "ok")]`; the other variants use the
/// derive's default label naming.
159 : #[derive(FixedCardinalityLabel, Clone, Copy)]
160 : pub(crate) enum ReconcileOutcome {
161 : #[label(rename = "ok")]
162 : Success,
163 : Error,
164 : Cancel,
165 : }
166 :
/// HTTP method label with a fixed set of values.
///
/// Built from `hyper::Method` via the `From` impl in this file: GET, PUT, POST
/// and DELETE map to their own variants, every other method maps to `Other`.
167 : #[derive(FixedCardinalityLabel, Copy, Clone)]
168 : pub(crate) enum Method {
169 : Get,
170 : Put,
171 : Post,
172 : Delete,
173 : Other,
174 : }
175 :
176 : impl From<hyper::Method> for Method {
177 0 : fn from(value: hyper::Method) -> Self {
178 0 : if value == hyper::Method::GET {
179 0 : Method::Get
180 0 : } else if value == hyper::Method::PUT {
181 0 : Method::Put
182 0 : } else if value == hyper::Method::POST {
183 0 : Method::Post
184 0 : } else if value == hyper::Method::DELETE {
185 0 : Method::Delete
186 : } else {
187 0 : Method::Other
188 : }
189 0 : }
190 : }
191 :
/// Newtype around `hyper::http::StatusCode` so that the `measured` label traits
/// (`LabelValue`, `FixedCardinalityLabel`) can be implemented for it in this file.
192 : #[derive(Clone, Copy)]
193 : pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
194 :
195 : impl LabelValue for StatusCode {
196 0 : fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
197 0 : v.write_int(self.0.as_u16() as i64)
198 0 : }
199 : }
200 :
201 : impl FixedCardinalityLabel for StatusCode {
202 0 : fn cardinality() -> usize {
203 0 : (100..1000).len()
204 0 : }
205 :
206 0 : fn encode(&self) -> usize {
207 0 : self.0.as_u16() as usize
208 0 : }
209 :
210 0 : fn decode(value: usize) -> Self {
211 0 : Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
212 0 : }
213 : }
214 :
/// Fixed-cardinality label naming the kind of database error.
///
/// The variants correspond one-to-one to the `DatabaseError` variants matched
/// in `DatabaseError::error_label` in this file.
215 : #[derive(FixedCardinalityLabel, Clone, Copy)]
216 : pub(crate) enum DatabaseErrorLabel {
217 : Query,
218 : Connection,
219 : ConnectionPool,
220 : Logical,
221 : }
222 :
223 : impl DatabaseError {
224 0 : pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
225 0 : match self {
226 0 : Self::Query(_) => DatabaseErrorLabel::Query,
227 0 : Self::Connection(_) => DatabaseErrorLabel::Connection,
228 0 : Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
229 0 : Self::Logical(_) => DatabaseErrorLabel::Logical,
230 : }
231 0 : }
232 : }
|