Line data Source code
1 : use std::future::Future;
2 : use std::str::FromStr;
3 : use std::time::Duration;
4 :
5 : use anyhow::Context;
6 : use metrics::{IntCounter, IntCounterVec};
7 : use once_cell::sync::Lazy;
8 : use strum_macros::{EnumString, VariantNames};
9 : use tokio::time::Instant;
10 : use tracing::info;
11 :
12 : /// Logs a critical error, similarly to `tracing::error!`. This will:
13 : ///
14 : /// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
15 : /// * Trigger a pageable alert (via the metric below).
16 : /// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
17 : /// * In debug builds, panic the process.
18 : ///
19 : /// When including errors in the message, please use {err:?} to include the error cause and original
20 : /// backtrace.
21 : #[macro_export]
22 : macro_rules! critical {
23 : ($($arg:tt)*) => {{
24 : if cfg!(debug_assertions) {
25 : panic!($($arg)*);
26 : }
27 : $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
28 : let backtrace = std::backtrace::Backtrace::capture();
29 : tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
30 : }};
31 : }
32 :
33 0 : #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
34 : #[strum(serialize_all = "snake_case")]
35 : pub enum LogFormat {
36 : Plain,
37 : Json,
38 : Test,
39 : }
40 :
41 : impl LogFormat {
42 0 : pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
43 : use strum::VariantNames;
44 0 : LogFormat::from_str(s).with_context(|| {
45 0 : format!(
46 0 : "Unrecognized log format. Please specify one of: {:?}",
47 0 : LogFormat::VARIANTS
48 0 : )
49 0 : })
50 0 : }
51 : }
52 :
53 : pub struct TracingEventCountMetric {
54 : /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro,
55 : /// and also emit it as a regular error. These are thus double-counted, but that seems fine.
56 : critical: IntCounter,
57 : error: IntCounter,
58 : warn: IntCounter,
59 : info: IntCounter,
60 : debug: IntCounter,
61 : trace: IntCounter,
62 : }
63 :
64 548 : pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
65 548 : let vec = metrics::register_int_counter_vec!(
66 548 : "libmetrics_tracing_event_count",
67 548 : "Number of tracing events, by level",
68 548 : &["level"]
69 548 : )
70 548 : .expect("failed to define metric");
71 548 : TracingEventCountMetric::new(vec)
72 548 : });
73 :
74 : impl TracingEventCountMetric {
75 549 : fn new(vec: IntCounterVec) -> Self {
76 549 : Self {
77 549 : critical: vec.with_label_values(&["critical"]),
78 549 : error: vec.with_label_values(&["error"]),
79 549 : warn: vec.with_label_values(&["warn"]),
80 549 : info: vec.with_label_values(&["info"]),
81 549 : debug: vec.with_label_values(&["debug"]),
82 549 : trace: vec.with_label_values(&["trace"]),
83 549 : }
84 549 : }
85 :
86 : // Allow public access from `critical!` macro.
87 0 : pub fn inc_critical(&self) {
88 0 : self.critical.inc();
89 0 : }
90 :
91 49900 : fn inc_for_level(&self, level: tracing::Level) {
92 49900 : let counter = match level {
93 77 : tracing::Level::ERROR => &self.error,
94 451 : tracing::Level::WARN => &self.warn,
95 49370 : tracing::Level::INFO => &self.info,
96 1 : tracing::Level::DEBUG => &self.debug,
97 1 : tracing::Level::TRACE => &self.trace,
98 : };
99 49900 : counter.inc();
100 49900 : }
101 : }
102 :
103 : struct TracingEventCountLayer(&'static TracingEventCountMetric);
104 :
105 : impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
106 : where
107 : S: tracing::Subscriber,
108 : {
109 49900 : fn on_event(
110 49900 : &self,
111 49900 : event: &tracing::Event<'_>,
112 49900 : _ctx: tracing_subscriber::layer::Context<'_, S>,
113 49900 : ) {
114 49900 : self.0.inc_for_level(*event.metadata().level());
115 49900 : }
116 : }
117 :
118 : /// Whether to add the `tracing_error` crate's `ErrorLayer`
119 : /// to the global tracing subscriber.
120 : ///
121 : pub enum TracingErrorLayerEnablement {
122 : /// Do not add the `ErrorLayer`.
123 : Disabled,
124 : /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
125 : EnableWithRustLogFilter,
126 : }
127 :
128 : /// Where the logging should output to.
129 : #[derive(Clone, Copy)]
130 : pub enum Output {
131 : Stdout,
132 : Stderr,
133 : }
134 :
135 548 : pub fn init(
136 548 : log_format: LogFormat,
137 548 : tracing_error_layer_enablement: TracingErrorLayerEnablement,
138 548 : output: Output,
139 548 : ) -> anyhow::Result<()> {
140 548 : // We fall back to printing all spans at info-level or above if
141 548 : // the RUST_LOG environment variable is not set.
142 1574 : let rust_log_env_filter = || {
143 1574 : tracing_subscriber::EnvFilter::try_from_default_env()
144 1574 : .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
145 1574 : };
146 :
147 : // NB: the order of the with() calls does not matter.
148 : // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
149 : use tracing_subscriber::prelude::*;
150 548 : let r = tracing_subscriber::registry();
151 548 : let r = r.with({
152 548 : let log_layer = tracing_subscriber::fmt::layer()
153 548 : .with_target(false)
154 548 : .with_ansi(false)
155 548 : .with_writer(move || -> Box<dyn std::io::Write> {
156 0 : match output {
157 0 : Output::Stdout => Box::new(std::io::stdout()),
158 0 : Output::Stderr => Box::new(std::io::stderr()),
159 : }
160 548 : });
161 548 : let log_layer = match log_format {
162 0 : LogFormat::Json => log_layer.json().boxed(),
163 0 : LogFormat::Plain => log_layer.boxed(),
164 548 : LogFormat::Test => log_layer.with_test_writer().boxed(),
165 : };
166 548 : log_layer.with_filter(rust_log_env_filter())
167 548 : });
168 548 : let r = r.with(
169 548 : TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
170 548 : );
171 548 : match tracing_error_layer_enablement {
172 478 : TracingErrorLayerEnablement::EnableWithRustLogFilter => r
173 478 : .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
174 478 : .init(),
175 70 : TracingErrorLayerEnablement::Disabled => r.init(),
176 : }
177 :
178 548 : Ok(())
179 548 : }
180 :
181 : /// Disable the default rust panic hook by using `set_hook`.
182 : ///
183 : /// For neon binaries, the assumption is that tracing is configured before with [`init`], after
184 : /// that sentry is configured (if needed). sentry will install it's own on top of this, always
185 : /// processing the panic before we log it.
186 : ///
187 : /// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
188 : /// If the assumptions about the initialization order are not held, use
189 : /// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
190 : /// lost.
191 : #[must_use]
192 8 : pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
193 8 : std::panic::set_hook(Box::new(tracing_panic_hook));
194 8 : TracingPanicHookGuard::new()
195 8 : }
196 :
197 : /// Drop guard which restores the std panic hook on drop.
198 : ///
199 : /// Tracing should not be used when it's not configured, but we cannot really latch on to any
200 : /// imaginary lifetime of tracing.
201 : pub struct TracingPanicHookGuard {
202 : act: bool,
203 : }
204 :
205 : impl TracingPanicHookGuard {
206 8 : fn new() -> Self {
207 8 : TracingPanicHookGuard { act: true }
208 8 : }
209 :
210 : /// Make this hook guard not do anything when dropped.
211 8 : pub fn forget(&mut self) {
212 8 : self.act = false;
213 8 : }
214 : }
215 :
216 : impl Drop for TracingPanicHookGuard {
217 8 : fn drop(&mut self) {
218 8 : if self.act {
219 0 : let _ = std::panic::take_hook();
220 8 : }
221 8 : }
222 : }
223 :
224 : /// Named symbol for our panic hook, which logs the panic.
225 12 : fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
226 12 : // following rust 1.66.1 std implementation:
227 12 : // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
228 12 : let location = info.location();
229 :
230 12 : let msg = match info.payload().downcast_ref::<&'static str>() {
231 0 : Some(s) => *s,
232 12 : None => match info.payload().downcast_ref::<String>() {
233 12 : Some(s) => &s[..],
234 0 : None => "Box<dyn Any>",
235 : },
236 : };
237 :
238 12 : let thread = std::thread::current();
239 12 : let thread = thread.name().unwrap_or("<unnamed>");
240 12 : let backtrace = std::backtrace::Backtrace::capture();
241 :
242 12 : let _entered = if let Some(location) = location {
243 12 : tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
244 : } else {
245 : // very unlikely to hit here, but the guarantees of std could change
246 0 : tracing::error_span!("panic", %thread)
247 : }
248 12 : .entered();
249 12 :
250 12 : if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
251 : // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
252 : // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
253 : // string, maybe even to a TLS one but tracing already does that.
254 12 : tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
255 : } else {
256 0 : tracing::error!("{msg}");
257 : }
258 :
259 : // ensure that we log something on the panic if this hook is left after tracing has been
260 : // unconfigured. worst case when teardown is racing the panic is to log the panic twice.
261 12 : tracing::dispatcher::get_default(|d| {
262 12 : if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
263 0 : let location = location.map(PrettyLocation);
264 0 : log_panic_to_stderr(thread, msg, location, &backtrace);
265 12 : }
266 12 : });
267 12 : }
268 :
269 : #[cold]
270 0 : fn log_panic_to_stderr(
271 0 : thread: &str,
272 0 : msg: &str,
273 0 : location: Option<PrettyLocation<'_, '_>>,
274 0 : backtrace: &std::backtrace::Backtrace,
275 0 : ) {
276 0 : eprintln!(
277 0 : "panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"
278 0 : );
279 0 : }
280 :
281 : struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
282 :
283 : impl std::fmt::Display for PrettyLocation<'_, '_> {
284 12 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285 12 : write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
286 12 : }
287 : }
288 :
289 : impl std::fmt::Debug for PrettyLocation<'_, '_> {
290 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
291 0 : <Self as std::fmt::Display>::fmt(self, f)
292 0 : }
293 : }
294 :
295 : /// When you will store a secret but want to make sure it won't
296 : /// be accidentally logged, wrap it in a SecretString, whose Debug
297 : /// implementation does not expose the contents.
298 : #[derive(Clone, Eq, PartialEq)]
299 : pub struct SecretString(String);
300 :
301 : impl SecretString {
302 0 : pub fn get_contents(&self) -> &str {
303 0 : self.0.as_str()
304 0 : }
305 : }
306 :
307 : impl From<String> for SecretString {
308 0 : fn from(s: String) -> Self {
309 0 : Self(s)
310 0 : }
311 : }
312 :
313 : impl FromStr for SecretString {
314 : type Err = std::convert::Infallible;
315 :
316 0 : fn from_str(s: &str) -> Result<Self, Self::Err> {
317 0 : Ok(Self(s.to_string()))
318 0 : }
319 : }
320 :
321 : impl std::fmt::Debug for SecretString {
322 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
323 0 : write!(f, "[SECRET]")
324 0 : }
325 : }
326 :
327 : /// Logs a periodic message if a future is slow to complete.
328 : ///
329 : /// This is performance-sensitive as it's used on the GetPage read path.
330 : ///
331 : /// TODO: consider upgrading this to a warning, but currently it fires too often.
332 : #[inline]
333 0 : pub async fn log_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
334 0 : // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
335 0 : // won't fit on the stack.
336 0 : let mut f = Box::pin(f);
337 0 :
338 0 : let started = Instant::now();
339 0 : let mut attempt = 1;
340 :
341 : loop {
342 : // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common
343 : // case where the timeout doesn't fire.
344 0 : let deadline = started + attempt * threshold;
345 0 : if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await {
346 : // NB: we check if we exceeded the threshold even if the timeout never fired, because
347 : // scheduling or execution delays may cause the future to succeed even if it exceeds the
348 : // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid
349 : // false negatives.
350 0 : let elapsed = started.elapsed();
351 0 : if elapsed >= threshold {
352 0 : info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
353 0 : }
354 0 : return output;
355 0 : }
356 0 :
357 0 : let elapsed = started.elapsed().as_secs_f64();
358 0 : info!("slow {name} still running after {elapsed:.3}s",);
359 :
360 0 : attempt += 1;
361 : }
362 0 : }
363 :
364 : #[cfg(test)]
365 : mod tests {
366 : use metrics::IntCounterVec;
367 : use metrics::core::Opts;
368 :
369 : use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
370 :
371 : #[test]
372 1 : fn tracing_event_count_metric() {
373 1 : let counter_vec =
374 1 : IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
375 1 : let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
376 1 : let layer = TracingEventCountLayer(metric);
377 : use tracing_subscriber::prelude::*;
378 :
379 1 : tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
380 1 : tracing::trace!("foo");
381 1 : tracing::debug!("foo");
382 1 : tracing::info!("foo");
383 1 : tracing::warn!("foo");
384 1 : tracing::error!("foo");
385 1 : });
386 1 :
387 1 : assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1);
388 1 : assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1);
389 1 : assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1);
390 1 : assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1);
391 1 : assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1);
392 1 : }
393 : }
|