Line data Source code
1 : //!
2 : //! This module provides metric definitions for the storage controller.
3 : //!
4 : //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
5 : //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
6 : //! constant.
7 : //!
8 : //! The rest of the code defines label group types and deals with converting outer types to labels.
9 : //!
10 : use bytes::Bytes;
11 : use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
12 : use metrics::NeonMetrics;
13 : use once_cell::sync::Lazy;
14 : use std::sync::Mutex;
15 :
16 : use crate::{
17 : persistence::{DatabaseError, DatabaseOperation},
18 : service::LeadershipStatus,
19 : };
20 :
21 : pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
22 : Lazy::new(StorageControllerMetrics::default);
23 :
24 0 : pub fn preinitialize_metrics() {
25 0 : Lazy::force(&METRICS_REGISTRY);
26 0 : }
27 :
/// Bundles the storage controller metric group with the text encoder used to export it.
pub(crate) struct StorageControllerMetrics {
    /// The metrics themselves; callers update them through this field.
    pub(crate) metrics_group: StorageControllerMetricGroup,
    /// Text encoder shared by all `encode` calls, guarded by a mutex.
    encoder: Mutex<measured::text::BufferedTextEncoder>,
}
32 :
33 6 : #[derive(measured::MetricGroup)]
34 : #[metric(new())]
35 : pub(crate) struct StorageControllerMetricGroup {
36 : /// Count of how many times we spawn a reconcile task
37 : pub(crate) storage_controller_reconcile_spawn: measured::Counter,
38 :
39 : /// Reconciler tasks completed, broken down by success/failure/cancelled
40 : pub(crate) storage_controller_reconcile_complete:
41 : measured::CounterVec<ReconcileCompleteLabelGroupSet>,
42 :
43 : /// Count of how many times we make an optimization change to a tenant's scheduling
44 : pub(crate) storage_controller_schedule_optimization: measured::Counter,
45 :
46 : /// HTTP request status counters for handled requests
47 : pub(crate) storage_controller_http_request_status:
48 : measured::CounterVec<HttpRequestStatusLabelGroupSet>,
49 :
50 : /// HTTP request handler latency across all status codes
51 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
52 : pub(crate) storage_controller_http_request_latency:
53 : measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
54 :
55 : /// Count of HTTP requests to the pageserver that resulted in an error,
56 : /// broken down by the pageserver node id, request name and method
57 : pub(crate) storage_controller_pageserver_request_error:
58 : measured::CounterVec<PageserverRequestLabelGroupSet>,
59 :
60 : /// Latency of HTTP requests to the pageserver, broken down by pageserver
61 : /// node id, request name and method. This include both successful and unsuccessful
62 : /// requests.
63 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
64 : pub(crate) storage_controller_pageserver_request_latency:
65 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
66 :
67 : /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
68 : /// broken down by the pageserver node id, request name and method
69 : pub(crate) storage_controller_passthrough_request_error:
70 : measured::CounterVec<PageserverRequestLabelGroupSet>,
71 :
72 : /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
73 : /// node id, request name and method. This include both successful and unsuccessful
74 : /// requests.
75 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
76 : pub(crate) storage_controller_passthrough_request_latency:
77 : measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
78 :
79 : /// Count of errors in database queries, broken down by error type and operation.
80 : pub(crate) storage_controller_database_query_error:
81 : measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
82 :
83 : /// Latency of database queries, broken down by operation.
84 : #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
85 : pub(crate) storage_controller_database_query_latency:
86 : measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
87 :
88 : pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
89 : }
90 :
91 : impl StorageControllerMetrics {
92 0 : pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
93 0 : let mut encoder = self.encoder.lock().unwrap();
94 0 : neon_metrics
95 0 : .collect_group_into(&mut *encoder)
96 0 : .unwrap_or_else(|infallible| match infallible {});
97 0 : self.metrics_group
98 0 : .collect_group_into(&mut *encoder)
99 0 : .unwrap_or_else(|infallible| match infallible {});
100 0 : encoder.finish()
101 0 : }
102 : }
103 :
104 : impl Default for StorageControllerMetrics {
105 6 : fn default() -> Self {
106 6 : let mut metrics_group = StorageControllerMetricGroup::new();
107 6 : metrics_group
108 6 : .storage_controller_reconcile_complete
109 6 : .init_all_dense();
110 6 :
111 6 : Self {
112 6 : metrics_group,
113 6 : encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
114 6 : }
115 6 : }
116 : }
117 :
/// Labels for `storage_controller_reconcile_complete`.
#[derive(measured::LabelGroup)]
#[label(set = ReconcileCompleteLabelGroupSet)]
pub(crate) struct ReconcileCompleteLabelGroup {
    /// Outcome of the reconcile task.
    pub(crate) status: ReconcileOutcome,
}
123 :
/// Labels for `storage_controller_http_request_status`.
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
    /// Request path; a dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
    /// HTTP status code label.
    pub(crate) status: StatusCode,
}
132 :
/// Labels for `storage_controller_http_request_latency`.
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
    /// Request path; a dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
}
140 :
/// Labels shared by the pageserver and pass-through request error counters and
/// latency histograms.
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
    /// Pageserver node id as a string; a dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) pageserver_id: &'a str,
    /// Request path; a dynamic label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
}
150 :
/// Labels for `storage_controller_database_query_error`.
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
    /// Category of the database error.
    pub(crate) error_type: DatabaseErrorLabel,
    /// Database operation the error is attributed to.
    pub(crate) operation: DatabaseOperation,
}
157 :
/// Labels for `storage_controller_database_query_latency`.
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryLatencyLabelGroupSet)]
pub(crate) struct DatabaseQueryLatencyLabelGroup {
    /// Database operation being timed.
    pub(crate) operation: DatabaseOperation,
}
163 :
/// Labels for `storage_controller_leadership_status`.
#[derive(measured::LabelGroup)]
#[label(set = LeadershipStatusGroupSet)]
pub(crate) struct LeadershipStatusGroup {
    /// Leadership status value that selects the gauge series.
    pub(crate) status: LeadershipStatus,
}
169 :
/// Outcome of a reconcile task, used as the `status` label of
/// [`ReconcileCompleteLabelGroup`].
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum ReconcileOutcome {
    // Explicitly renamed: this variant is exported with the label value "ok".
    #[label(rename = "ok")]
    Success,
    Error,
    Cancel,
}
177 :
/// HTTP method label with a fixed set of values.
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub(crate) enum Method {
    Get,
    Put,
    Post,
    Delete,
    // Every method that is not GET, PUT, POST or DELETE (see the `From<hyper::Method>` impl).
    Other,
}
186 :
187 : impl From<hyper::Method> for Method {
188 0 : fn from(value: hyper::Method) -> Self {
189 0 : if value == hyper::Method::GET {
190 0 : Method::Get
191 0 : } else if value == hyper::Method::PUT {
192 0 : Method::Put
193 0 : } else if value == hyper::Method::POST {
194 0 : Method::Post
195 0 : } else if value == hyper::Method::DELETE {
196 0 : Method::Delete
197 : } else {
198 0 : Method::Other
199 : }
200 0 : }
201 : }
202 :
/// Newtype around `hyper::http::StatusCode` so that label traits can be implemented for it.
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
205 :
206 : impl LabelValue for StatusCode {
207 0 : fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
208 0 : v.write_int(self.0.as_u16() as i64)
209 0 : }
210 : }
211 :
212 : impl FixedCardinalityLabel for StatusCode {
213 0 : fn cardinality() -> usize {
214 0 : (100..1000).len()
215 0 : }
216 :
217 0 : fn encode(&self) -> usize {
218 0 : self.0.as_u16() as usize
219 0 : }
220 :
221 0 : fn decode(value: usize) -> Self {
222 0 : Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
223 0 : }
224 : }
225 :
/// Category of a [`DatabaseError`], used as the `error_type` label of
/// [`DatabaseQueryErrorLabelGroup`]. Each variant mirrors the `DatabaseError`
/// variant of the same name (see `DatabaseError::error_label`).
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum DatabaseErrorLabel {
    Query,
    Connection,
    ConnectionPool,
    Logical,
}
233 :
234 : impl DatabaseError {
235 0 : pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
236 0 : match self {
237 0 : Self::Query(_) => DatabaseErrorLabel::Query,
238 0 : Self::Connection(_) => DatabaseErrorLabel::Connection,
239 0 : Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
240 0 : Self::Logical(_) => DatabaseErrorLabel::Logical,
241 : }
242 0 : }
243 : }
|