Line data Source code
1 : use std::future::Future;
2 : use std::str::FromStr;
3 : use std::time::Duration;
4 :
5 : use anyhow::Context;
6 : use metrics::{IntCounter, IntCounterVec};
7 : use once_cell::sync::Lazy;
8 : use strum_macros::{EnumString, VariantNames};
9 : use tokio::time::Instant;
10 : use tracing::info;
11 :
12 : /// Logs a critical error, similarly to `tracing::error!`. This will:
13 : ///
14 : /// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
15 : /// * Trigger a pageable alert (via the metric below).
16 : /// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
17 : /// * In debug builds, panic the process.
18 : ///
19 : /// When including errors in the message, please use {err:?} to include the error cause and original
20 : /// backtrace.
21 : #[macro_export]
22 : macro_rules! critical {
23 : ($($arg:tt)*) => {{
24 : if cfg!(debug_assertions) {
25 : panic!($($arg)*);
26 : }
27 : $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
28 : let backtrace = std::backtrace::Backtrace::capture();
29 : tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
30 : }};
31 : }
32 :
33 0 : #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
34 : #[strum(serialize_all = "snake_case")]
35 : pub enum LogFormat {
36 : Plain,
37 : Json,
38 : Test,
39 : }
40 :
41 : impl LogFormat {
42 0 : pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
43 : use strum::VariantNames;
44 0 : LogFormat::from_str(s).with_context(|| {
45 0 : format!(
46 0 : "Unrecognized log format. Please specify one of: {:?}",
47 0 : LogFormat::VARIANTS
48 0 : )
49 0 : })
50 0 : }
51 : }
52 :
53 : pub struct TracingEventCountMetric {
54 : /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro,
55 : /// and also emit it as a regular error. These are thus double-counted, but that seems fine.
56 : critical: IntCounter,
57 : error: IntCounter,
58 : warn: IntCounter,
59 : info: IntCounter,
60 : debug: IntCounter,
61 : trace: IntCounter,
62 : }
63 :
64 548 : pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
65 548 : let vec = metrics::register_int_counter_vec!(
66 548 : "libmetrics_tracing_event_count",
67 548 : "Number of tracing events, by level",
68 548 : &["level"]
69 548 : )
70 548 : .expect("failed to define metric");
71 548 : TracingEventCountMetric::new(vec)
72 548 : });
73 :
74 : impl TracingEventCountMetric {
75 549 : fn new(vec: IntCounterVec) -> Self {
76 549 : Self {
77 549 : critical: vec.with_label_values(&["critical"]),
78 549 : error: vec.with_label_values(&["error"]),
79 549 : warn: vec.with_label_values(&["warn"]),
80 549 : info: vec.with_label_values(&["info"]),
81 549 : debug: vec.with_label_values(&["debug"]),
82 549 : trace: vec.with_label_values(&["trace"]),
83 549 : }
84 549 : }
85 :
86 : // Allow public access from `critical!` macro.
87 0 : pub fn inc_critical(&self) {
88 0 : self.critical.inc();
89 0 : }
90 :
91 49664 : fn inc_for_level(&self, level: tracing::Level) {
92 49664 : let counter = match level {
93 69 : tracing::Level::ERROR => &self.error,
94 451 : tracing::Level::WARN => &self.warn,
95 49142 : tracing::Level::INFO => &self.info,
96 1 : tracing::Level::DEBUG => &self.debug,
97 1 : tracing::Level::TRACE => &self.trace,
98 : };
99 49664 : counter.inc();
100 49664 : }
101 : }
102 :
103 : struct TracingEventCountLayer(&'static TracingEventCountMetric);
104 :
105 : impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
106 : where
107 : S: tracing::Subscriber,
108 : {
109 49664 : fn on_event(
110 49664 : &self,
111 49664 : event: &tracing::Event<'_>,
112 49664 : _ctx: tracing_subscriber::layer::Context<'_, S>,
113 49664 : ) {
114 49664 : self.0.inc_for_level(*event.metadata().level());
115 49664 : }
116 : }
117 :
118 : /// Whether to add the `tracing_error` crate's `ErrorLayer`
119 : /// to the global tracing subscriber.
120 : ///
121 : pub enum TracingErrorLayerEnablement {
122 : /// Do not add the `ErrorLayer`.
123 : Disabled,
124 : /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
125 : EnableWithRustLogFilter,
126 : }
127 :
128 : /// Where the logging should output to.
129 : #[derive(Clone, Copy)]
130 : pub enum Output {
131 : Stdout,
132 : Stderr,
133 : }
134 :
135 548 : pub fn init(
136 548 : log_format: LogFormat,
137 548 : tracing_error_layer_enablement: TracingErrorLayerEnablement,
138 548 : output: Output,
139 548 : ) -> anyhow::Result<()> {
140 548 : // We fall back to printing all spans at info-level or above if
141 548 : // the RUST_LOG environment variable is not set.
142 1574 : let rust_log_env_filter = || {
143 1574 : tracing_subscriber::EnvFilter::try_from_default_env()
144 1574 : .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
145 1574 : };
146 :
147 : // NB: the order of the with() calls does not matter.
148 : // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
149 : use tracing_subscriber::prelude::*;
150 548 : let r = tracing_subscriber::registry();
151 548 : let r = r.with({
152 548 : let log_layer = tracing_subscriber::fmt::layer()
153 548 : .with_target(false)
154 548 : .with_ansi(false)
155 548 : .with_writer(move || -> Box<dyn std::io::Write> {
156 0 : match output {
157 0 : Output::Stdout => Box::new(std::io::stdout()),
158 0 : Output::Stderr => Box::new(std::io::stderr()),
159 : }
160 548 : });
161 548 : let log_layer = match log_format {
162 0 : LogFormat::Json => log_layer.json().boxed(),
163 0 : LogFormat::Plain => log_layer.boxed(),
164 548 : LogFormat::Test => log_layer.with_test_writer().boxed(),
165 : };
166 548 : log_layer.with_filter(rust_log_env_filter())
167 548 : });
168 548 :
169 548 : let r = r.with(
170 548 : TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
171 548 : );
172 548 : match tracing_error_layer_enablement {
173 478 : TracingErrorLayerEnablement::EnableWithRustLogFilter => r
174 478 : .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
175 478 : .init(),
176 70 : TracingErrorLayerEnablement::Disabled => r.init(),
177 : }
178 :
179 548 : Ok(())
180 548 : }
181 :
182 : /// Disable the default rust panic hook by using `set_hook`.
183 : ///
184 : /// For neon binaries, the assumption is that tracing is configured before with [`init`], after
185 : /// that sentry is configured (if needed). sentry will install it's own on top of this, always
186 : /// processing the panic before we log it.
187 : ///
188 : /// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
189 : /// If the assumptions about the initialization order are not held, use
190 : /// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
191 : /// lost.
192 : #[must_use]
193 8 : pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
194 8 : std::panic::set_hook(Box::new(tracing_panic_hook));
195 8 : TracingPanicHookGuard::new()
196 8 : }
197 :
198 : /// Drop guard which restores the std panic hook on drop.
199 : ///
200 : /// Tracing should not be used when it's not configured, but we cannot really latch on to any
201 : /// imaginary lifetime of tracing.
202 : pub struct TracingPanicHookGuard {
203 : act: bool,
204 : }
205 :
206 : impl TracingPanicHookGuard {
207 8 : fn new() -> Self {
208 8 : TracingPanicHookGuard { act: true }
209 8 : }
210 :
211 : /// Make this hook guard not do anything when dropped.
212 8 : pub fn forget(&mut self) {
213 8 : self.act = false;
214 8 : }
215 : }
216 :
217 : impl Drop for TracingPanicHookGuard {
218 8 : fn drop(&mut self) {
219 8 : if self.act {
220 0 : let _ = std::panic::take_hook();
221 8 : }
222 8 : }
223 : }
224 :
225 : /// Named symbol for our panic hook, which logs the panic.
226 12 : fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
227 12 : // following rust 1.66.1 std implementation:
228 12 : // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
229 12 : let location = info.location();
230 :
231 12 : let msg = match info.payload().downcast_ref::<&'static str>() {
232 0 : Some(s) => *s,
233 12 : None => match info.payload().downcast_ref::<String>() {
234 12 : Some(s) => &s[..],
235 0 : None => "Box<dyn Any>",
236 : },
237 : };
238 :
239 12 : let thread = std::thread::current();
240 12 : let thread = thread.name().unwrap_or("<unnamed>");
241 12 : let backtrace = std::backtrace::Backtrace::capture();
242 :
243 12 : let _entered = if let Some(location) = location {
244 12 : tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
245 : } else {
246 : // very unlikely to hit here, but the guarantees of std could change
247 0 : tracing::error_span!("panic", %thread)
248 : }
249 12 : .entered();
250 12 :
251 12 : if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
252 : // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
253 : // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
254 : // string, maybe even to a TLS one but tracing already does that.
255 12 : tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
256 : } else {
257 0 : tracing::error!("{msg}");
258 : }
259 :
260 : // ensure that we log something on the panic if this hook is left after tracing has been
261 : // unconfigured. worst case when teardown is racing the panic is to log the panic twice.
262 12 : tracing::dispatcher::get_default(|d| {
263 12 : if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
264 0 : let location = location.map(PrettyLocation);
265 0 : log_panic_to_stderr(thread, msg, location, &backtrace);
266 12 : }
267 12 : });
268 12 : }
269 :
270 : #[cold]
271 0 : fn log_panic_to_stderr(
272 0 : thread: &str,
273 0 : msg: &str,
274 0 : location: Option<PrettyLocation<'_, '_>>,
275 0 : backtrace: &std::backtrace::Backtrace,
276 0 : ) {
277 0 : eprintln!(
278 0 : "panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"
279 0 : );
280 0 : }
281 :
282 : struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
283 :
284 : impl std::fmt::Display for PrettyLocation<'_, '_> {
285 12 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
286 12 : write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
287 12 : }
288 : }
289 :
290 : impl std::fmt::Debug for PrettyLocation<'_, '_> {
291 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
292 0 : <Self as std::fmt::Display>::fmt(self, f)
293 0 : }
294 : }
295 :
296 : /// When you will store a secret but want to make sure it won't
297 : /// be accidentally logged, wrap it in a SecretString, whose Debug
298 : /// implementation does not expose the contents.
299 : #[derive(Clone, Eq, PartialEq)]
300 : pub struct SecretString(String);
301 :
302 : impl SecretString {
303 0 : pub fn get_contents(&self) -> &str {
304 0 : self.0.as_str()
305 0 : }
306 : }
307 :
308 : impl From<String> for SecretString {
309 0 : fn from(s: String) -> Self {
310 0 : Self(s)
311 0 : }
312 : }
313 :
314 : impl FromStr for SecretString {
315 : type Err = std::convert::Infallible;
316 :
317 0 : fn from_str(s: &str) -> Result<Self, Self::Err> {
318 0 : Ok(Self(s.to_string()))
319 0 : }
320 : }
321 :
322 : impl std::fmt::Debug for SecretString {
323 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
324 0 : write!(f, "[SECRET]")
325 0 : }
326 : }
327 :
328 : /// Logs a periodic message if a future is slow to complete.
329 : ///
330 : /// This is performance-sensitive as it's used on the GetPage read path.
331 : ///
332 : /// TODO: consider upgrading this to a warning, but currently it fires too often.
333 : #[inline]
334 0 : pub async fn log_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
335 0 : // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
336 0 : // won't fit on the stack.
337 0 : let mut f = Box::pin(f);
338 0 :
339 0 : let started = Instant::now();
340 0 : let mut attempt = 1;
341 :
342 : loop {
343 : // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common
344 : // case where the timeout doesn't fire.
345 0 : let deadline = started + attempt * threshold;
346 0 : if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await {
347 : // NB: we check if we exceeded the threshold even if the timeout never fired, because
348 : // scheduling or execution delays may cause the future to succeed even if it exceeds the
349 : // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid
350 : // false negatives.
351 0 : let elapsed = started.elapsed();
352 0 : if elapsed >= threshold {
353 0 : info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
354 0 : }
355 0 : return output;
356 0 : }
357 0 :
358 0 : let elapsed = started.elapsed().as_secs_f64();
359 0 : info!("slow {name} still running after {elapsed:.3}s",);
360 :
361 0 : attempt += 1;
362 : }
363 0 : }
364 :
365 : #[cfg(test)]
366 : mod tests {
367 : use metrics::IntCounterVec;
368 : use metrics::core::Opts;
369 :
370 : use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
371 :
372 : #[test]
373 1 : fn tracing_event_count_metric() {
374 1 : let counter_vec =
375 1 : IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
376 1 : let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
377 1 : let layer = TracingEventCountLayer(metric);
378 : use tracing_subscriber::prelude::*;
379 :
380 1 : tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
381 1 : tracing::trace!("foo");
382 1 : tracing::debug!("foo");
383 1 : tracing::info!("foo");
384 1 : tracing::warn!("foo");
385 1 : tracing::error!("foo");
386 1 : });
387 1 :
388 1 : assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1);
389 1 : assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1);
390 1 : assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1);
391 1 : assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1);
392 1 : assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1);
393 1 : }
394 : }
|