Line data Source code
1 : use std::future::Future;
2 : use std::str::FromStr;
3 : use std::time::Duration;
4 :
5 : use anyhow::Context;
6 : use metrics::{IntCounter, IntCounterVec};
7 : use once_cell::sync::Lazy;
8 : use strum_macros::{EnumString, VariantNames};
9 : use tokio::time::Instant;
10 : use tracing::warn;
11 :
12 : /// Logs a critical error, similarly to `tracing::error!`. This will:
13 : ///
14 : /// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
15 : /// * Trigger a pageable alert (via the metric below).
16 : /// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
17 : /// * In debug builds, panic the process.
18 : ///
19 : /// When including errors in the message, please use {err:?} to include the error cause and original
20 : /// backtrace.
21 : #[macro_export]
22 : macro_rules! critical {
23 : ($($arg:tt)*) => {{
24 : if cfg!(debug_assertions) {
25 : panic!($($arg)*);
26 : }
27 : $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
28 : let backtrace = std::backtrace::Backtrace::capture();
29 : tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
30 : }};
31 : }
32 :
33 0 : #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
34 : #[strum(serialize_all = "snake_case")]
35 : pub enum LogFormat {
36 : Plain,
37 : Json,
38 : Test,
39 : }
40 :
41 : impl LogFormat {
42 0 : pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
43 : use strum::VariantNames;
44 0 : LogFormat::from_str(s).with_context(|| {
45 0 : format!(
46 0 : "Unrecognized log format. Please specify one of: {:?}",
47 0 : LogFormat::VARIANTS
48 0 : )
49 0 : })
50 0 : }
51 : }
52 :
53 : pub struct TracingEventCountMetric {
54 : /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro,
55 : /// and also emit it as a regular error. These are thus double-counted, but that seems fine.
56 : critical: IntCounter,
57 : error: IntCounter,
58 : warn: IntCounter,
59 : info: IntCounter,
60 : debug: IntCounter,
61 : trace: IntCounter,
62 : }
63 :
64 541 : pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
65 541 : let vec = metrics::register_int_counter_vec!(
66 541 : "libmetrics_tracing_event_count",
67 541 : "Number of tracing events, by level",
68 541 : &["level"]
69 541 : )
70 541 : .expect("failed to define metric");
71 541 : TracingEventCountMetric::new(vec)
72 541 : });
73 :
74 : impl TracingEventCountMetric {
75 542 : fn new(vec: IntCounterVec) -> Self {
76 542 : Self {
77 542 : critical: vec.with_label_values(&["critical"]),
78 542 : error: vec.with_label_values(&["error"]),
79 542 : warn: vec.with_label_values(&["warn"]),
80 542 : info: vec.with_label_values(&["info"]),
81 542 : debug: vec.with_label_values(&["debug"]),
82 542 : trace: vec.with_label_values(&["trace"]),
83 542 : }
84 542 : }
85 :
86 : // Allow public access from `critical!` macro.
87 0 : pub fn inc_critical(&self) {
88 0 : self.critical.inc();
89 0 : }
90 :
91 49490 : fn inc_for_level(&self, level: tracing::Level) {
92 49490 : let counter = match level {
93 85 : tracing::Level::ERROR => &self.error,
94 451 : tracing::Level::WARN => &self.warn,
95 48952 : tracing::Level::INFO => &self.info,
96 1 : tracing::Level::DEBUG => &self.debug,
97 1 : tracing::Level::TRACE => &self.trace,
98 : };
99 49490 : counter.inc();
100 49490 : }
101 : }
102 :
103 : struct TracingEventCountLayer(&'static TracingEventCountMetric);
104 :
105 : impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
106 : where
107 : S: tracing::Subscriber,
108 : {
109 49490 : fn on_event(
110 49490 : &self,
111 49490 : event: &tracing::Event<'_>,
112 49490 : _ctx: tracing_subscriber::layer::Context<'_, S>,
113 49490 : ) {
114 49490 : self.0.inc_for_level(*event.metadata().level());
115 49490 : }
116 : }
117 :
118 : /// Whether to add the `tracing_error` crate's `ErrorLayer`
119 : /// to the global tracing subscriber.
120 : ///
121 : pub enum TracingErrorLayerEnablement {
122 : /// Do not add the `ErrorLayer`.
123 : Disabled,
124 : /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
125 : EnableWithRustLogFilter,
126 : }
127 :
128 : /// Where the logging should output to.
129 : #[derive(Clone, Copy)]
130 : pub enum Output {
131 : Stdout,
132 : Stderr,
133 : }
134 :
135 541 : pub fn init(
136 541 : log_format: LogFormat,
137 541 : tracing_error_layer_enablement: TracingErrorLayerEnablement,
138 541 : output: Output,
139 541 : ) -> anyhow::Result<()> {
140 541 : // We fall back to printing all spans at info-level or above if
141 541 : // the RUST_LOG environment variable is not set.
142 1553 : let rust_log_env_filter = || {
143 1553 : tracing_subscriber::EnvFilter::try_from_default_env()
144 1553 : .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
145 1553 : };
146 :
147 : // NB: the order of the with() calls does not matter.
148 : // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
149 : use tracing_subscriber::prelude::*;
150 541 : let r = tracing_subscriber::registry();
151 541 : let r = r.with({
152 541 : let log_layer = tracing_subscriber::fmt::layer()
153 541 : .with_target(false)
154 541 : .with_ansi(false)
155 541 : .with_writer(move || -> Box<dyn std::io::Write> {
156 0 : match output {
157 0 : Output::Stdout => Box::new(std::io::stdout()),
158 0 : Output::Stderr => Box::new(std::io::stderr()),
159 : }
160 541 : });
161 541 : let log_layer = match log_format {
162 0 : LogFormat::Json => log_layer.json().boxed(),
163 0 : LogFormat::Plain => log_layer.boxed(),
164 541 : LogFormat::Test => log_layer.with_test_writer().boxed(),
165 : };
166 541 : log_layer.with_filter(rust_log_env_filter())
167 541 : });
168 541 : let r = r.with(
169 541 : TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
170 541 : );
171 541 : match tracing_error_layer_enablement {
172 471 : TracingErrorLayerEnablement::EnableWithRustLogFilter => r
173 471 : .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
174 471 : .init(),
175 70 : TracingErrorLayerEnablement::Disabled => r.init(),
176 : }
177 :
178 541 : Ok(())
179 541 : }
180 :
181 : /// Disable the default rust panic hook by using `set_hook`.
182 : ///
183 : /// For neon binaries, the assumption is that tracing is configured before with [`init`], after
184 : /// that sentry is configured (if needed). sentry will install it's own on top of this, always
185 : /// processing the panic before we log it.
186 : ///
187 : /// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
188 : /// If the assumptions about the initialization order are not held, use
189 : /// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
190 : /// lost.
191 : #[must_use]
192 8 : pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
193 8 : std::panic::set_hook(Box::new(tracing_panic_hook));
194 8 : TracingPanicHookGuard::new()
195 8 : }
196 :
197 : /// Drop guard which restores the std panic hook on drop.
198 : ///
199 : /// Tracing should not be used when it's not configured, but we cannot really latch on to any
200 : /// imaginary lifetime of tracing.
201 : pub struct TracingPanicHookGuard {
202 : act: bool,
203 : }
204 :
205 : impl TracingPanicHookGuard {
206 8 : fn new() -> Self {
207 8 : TracingPanicHookGuard { act: true }
208 8 : }
209 :
210 : /// Make this hook guard not do anything when dropped.
211 8 : pub fn forget(&mut self) {
212 8 : self.act = false;
213 8 : }
214 : }
215 :
216 : impl Drop for TracingPanicHookGuard {
217 8 : fn drop(&mut self) {
218 8 : if self.act {
219 0 : let _ = std::panic::take_hook();
220 8 : }
221 8 : }
222 : }
223 :
224 : /// Named symbol for our panic hook, which logs the panic.
225 12 : fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
226 12 : // following rust 1.66.1 std implementation:
227 12 : // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
228 12 : let location = info.location();
229 :
230 12 : let msg = match info.payload().downcast_ref::<&'static str>() {
231 0 : Some(s) => *s,
232 12 : None => match info.payload().downcast_ref::<String>() {
233 12 : Some(s) => &s[..],
234 0 : None => "Box<dyn Any>",
235 : },
236 : };
237 :
238 12 : let thread = std::thread::current();
239 12 : let thread = thread.name().unwrap_or("<unnamed>");
240 12 : let backtrace = std::backtrace::Backtrace::capture();
241 :
242 12 : let _entered = if let Some(location) = location {
243 12 : tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
244 : } else {
245 : // very unlikely to hit here, but the guarantees of std could change
246 0 : tracing::error_span!("panic", %thread)
247 : }
248 12 : .entered();
249 12 :
250 12 : if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
251 : // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
252 : // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
253 : // string, maybe even to a TLS one but tracing already does that.
254 12 : tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
255 : } else {
256 0 : tracing::error!("{msg}");
257 : }
258 :
259 : // ensure that we log something on the panic if this hook is left after tracing has been
260 : // unconfigured. worst case when teardown is racing the panic is to log the panic twice.
261 12 : tracing::dispatcher::get_default(|d| {
262 12 : if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
263 0 : let location = location.map(PrettyLocation);
264 0 : log_panic_to_stderr(thread, msg, location, &backtrace);
265 12 : }
266 12 : });
267 12 : }
268 :
269 : #[cold]
270 0 : fn log_panic_to_stderr(
271 0 : thread: &str,
272 0 : msg: &str,
273 0 : location: Option<PrettyLocation<'_, '_>>,
274 0 : backtrace: &std::backtrace::Backtrace,
275 0 : ) {
276 0 : eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
277 0 : }
278 :
279 : struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
280 :
281 : impl std::fmt::Display for PrettyLocation<'_, '_> {
282 12 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
283 12 : write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
284 12 : }
285 : }
286 :
287 : impl std::fmt::Debug for PrettyLocation<'_, '_> {
288 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
289 0 : <Self as std::fmt::Display>::fmt(self, f)
290 0 : }
291 : }
292 :
293 : /// When you will store a secret but want to make sure it won't
294 : /// be accidentally logged, wrap it in a SecretString, whose Debug
295 : /// implementation does not expose the contents.
296 : #[derive(Clone, Eq, PartialEq)]
297 : pub struct SecretString(String);
298 :
299 : impl SecretString {
300 0 : pub fn get_contents(&self) -> &str {
301 0 : self.0.as_str()
302 0 : }
303 : }
304 :
305 : impl From<String> for SecretString {
306 0 : fn from(s: String) -> Self {
307 0 : Self(s)
308 0 : }
309 : }
310 :
311 : impl FromStr for SecretString {
312 : type Err = std::convert::Infallible;
313 :
314 0 : fn from_str(s: &str) -> Result<Self, Self::Err> {
315 0 : Ok(Self(s.to_string()))
316 0 : }
317 : }
318 :
319 : impl std::fmt::Debug for SecretString {
320 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
321 0 : write!(f, "[SECRET]")
322 0 : }
323 : }
324 :
325 : /// Logs a periodic warning if a future is slow to complete.
326 : ///
327 : /// This is performance-sensitive as it's used on the GetPage read path.
328 : #[inline]
329 0 : pub async fn warn_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
330 0 : // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
331 0 : // won't fit on the stack.
332 0 : let mut f = Box::pin(f);
333 0 :
334 0 : let started = Instant::now();
335 0 : let mut attempt = 1;
336 :
337 : loop {
338 : // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common
339 : // case where the timeout doesn't fire.
340 0 : let deadline = started + attempt * threshold;
341 0 : if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await {
342 : // NB: we check if we exceeded the threshold even if the timeout never fired, because
343 : // scheduling or execution delays may cause the future to succeed even if it exceeds the
344 : // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid
345 : // false negatives.
346 0 : let elapsed = started.elapsed();
347 0 : if elapsed >= threshold {
348 0 : warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
349 0 : }
350 0 : return output;
351 0 : }
352 0 :
353 0 : let elapsed = started.elapsed().as_secs_f64();
354 0 : warn!("slow {name} still running after {elapsed:.3}s",);
355 :
356 0 : attempt += 1;
357 : }
358 0 : }
359 :
360 : #[cfg(test)]
361 : mod tests {
362 : use metrics::{core::Opts, IntCounterVec};
363 :
364 : use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
365 :
366 : #[test]
367 1 : fn tracing_event_count_metric() {
368 1 : let counter_vec =
369 1 : IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
370 1 : let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
371 1 : let layer = TracingEventCountLayer(metric);
372 : use tracing_subscriber::prelude::*;
373 :
374 1 : tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
375 1 : tracing::trace!("foo");
376 1 : tracing::debug!("foo");
377 1 : tracing::info!("foo");
378 1 : tracing::warn!("foo");
379 1 : tracing::error!("foo");
380 1 : });
381 1 :
382 1 : assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1);
383 1 : assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1);
384 1 : assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1);
385 1 : assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1);
386 1 : assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1);
387 1 : }
388 : }
|