Line data Source code
1 : use std::time::Duration;
2 :
3 : use futures::future::pending;
4 : use tokio::time::Instant;
5 : use tracing::{error, info, warn};
6 :
7 : use utils::backoff::exponential_backoff_duration;
8 :
/// A retry handler for Pageserver gRPC requests.
///
/// This is used instead of backoff::retry for better control and observability.
///
/// The fields are plain configuration values; the retry loop itself lives in the `with`
/// method of this type.
#[derive(Debug, Clone)]
pub struct Retry {
    /// Timeout across all retry attempts. If None, retries forever.
    pub timeout: Option<Duration>,
    /// The initial backoff duration. The first retry does not use a backoff.
    pub base_backoff: Duration,
    /// The maximum backoff duration.
    pub max_backoff: Duration,
}
20 :
21 : impl Retry {
22 : /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
23 : /// using the current tracing span for context.
24 : ///
25 : /// Only certain gRPC status codes are retried, see [`Self::should_retry`].
26 0 : pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
27 0 : where
28 0 : F: FnMut(usize) -> O, // pass attempt number, starting at 0
29 0 : O: Future<Output = tonic::Result<T>>,
30 0 : {
31 0 : let started = Instant::now();
32 0 : let deadline = self.timeout.map(|timeout| started + timeout);
33 0 : let mut last_error = None;
34 0 : let mut retries = 0;
35 : loop {
36 : // Set up a future to wait for the backoff, if any, and run the closure.
37 0 : let backoff_and_try = async {
38 : // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
39 : // https://github.com/tokio-rs/tokio/issues/6866
40 0 : if let Some(backoff) = self.backoff_duration(retries) {
41 0 : tokio::time::sleep(backoff).await;
42 0 : }
43 :
44 0 : f(retries).await
45 0 : };
46 :
47 : // Set up a future for the timeout, if any.
48 0 : let timeout = async {
49 0 : match deadline {
50 0 : Some(deadline) => tokio::time::sleep_until(deadline).await,
51 0 : None => pending().await,
52 : }
53 0 : };
54 :
55 : // Wait for the backoff and request, or bail out if the timeout is exceeded.
56 0 : let result = tokio::select! {
57 0 : result = backoff_and_try => result,
58 :
59 0 : _ = timeout => {
60 0 : let last_error = last_error.unwrap_or_else(|| {
61 0 : tonic::Status::deadline_exceeded(format!(
62 0 : "request timed out after {:.3}s",
63 0 : started.elapsed().as_secs_f64()
64 : ))
65 0 : });
66 0 : error!(
67 0 : "giving up after {:.3}s and {retries} retries, last error {:?}: {}",
68 0 : started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
69 : );
70 0 : return Err(last_error);
71 : }
72 : };
73 :
74 0 : match result {
75 : // Success, return the result.
76 0 : Ok(result) => {
77 0 : if retries > 0 {
78 0 : info!(
79 0 : "request succeeded after {retries} retries in {:.3}s",
80 0 : started.elapsed().as_secs_f64(),
81 : );
82 0 : }
83 :
84 0 : return Ok(result);
85 : }
86 :
87 : // Error, retry or bail out.
88 0 : Err(status) => {
89 0 : let (code, message) = (status.code(), status.message());
90 0 : let attempt = retries + 1;
91 :
92 0 : if !Self::should_retry(code) {
93 : // NB: include the attempt here too. This isn't necessarily the first
94 : // attempt, because the error may change between attempts.
95 0 : error!(
96 0 : "request failed with {code:?}: {message}, not retrying (attempt {attempt})"
97 : );
98 0 : return Err(status);
99 0 : }
100 :
101 0 : warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");
102 :
103 0 : retries += 1;
104 0 : last_error = Some(status);
105 : }
106 : }
107 : }
108 0 : }
109 :
110 : /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
111 : /// attempt and first retry never backs off, so this returns None for 0 and 1 retries.
112 0 : fn backoff_duration(&self, retries: usize) -> Option<Duration> {
113 0 : let backoff = exponential_backoff_duration(
114 0 : (retries as u32).saturating_sub(1), // first retry does not back off
115 0 : self.base_backoff.as_secs_f64(),
116 0 : self.max_backoff.as_secs_f64(),
117 : );
118 0 : (!backoff.is_zero()).then_some(backoff)
119 0 : }
120 :
121 : /// Returns true if the given status code should be retries.
122 0 : fn should_retry(code: tonic::Code) -> bool {
123 0 : match code {
124 0 : tonic::Code::Ok => panic!("unexpected Ok status code"),
125 :
126 : // These codes are transient, so retry them.
127 0 : tonic::Code::Aborted => true,
128 0 : tonic::Code::Cancelled => true,
129 0 : tonic::Code::DeadlineExceeded => true, // maybe transient slowness
130 0 : tonic::Code::ResourceExhausted => true,
131 0 : tonic::Code::Unavailable => true,
132 :
133 : // The following codes will like continue to fail, so don't retry.
134 0 : tonic::Code::AlreadyExists => false,
135 0 : tonic::Code::DataLoss => false,
136 0 : tonic::Code::FailedPrecondition => false,
137 : // NB: don't retry Internal. It is intended for serious errors such as invariant
138 : // violations, and is also used for client-side invariant checks that would otherwise
139 : // result in retry loops.
140 0 : tonic::Code::Internal => false,
141 0 : tonic::Code::InvalidArgument => false,
142 0 : tonic::Code::NotFound => false,
143 0 : tonic::Code::OutOfRange => false,
144 0 : tonic::Code::PermissionDenied => false,
145 0 : tonic::Code::Unauthenticated => false,
146 0 : tonic::Code::Unimplemented => false,
147 0 : tonic::Code::Unknown => false,
148 : }
149 0 : }
150 : }
|