LCOV - code coverage report
Current view: top level - pageserver/client_grpc/src - retry.rs (source / functions) Coverage Total Hit
Test: 1e20c4f2b28aa592527961bb32170ebbd2c9172f.info Lines: 0.0 % 79 0
Test Date: 2025-07-16 12:29:03 Functions: 0.0 % 32 0

            Line data    Source code
       1              : use std::time::Duration;
       2              : 
       3              : use futures::future::pending;
       4              : use tokio::time::Instant;
       5              : use tracing::{error, info, warn};
       6              : 
       7              : use utils::backoff::exponential_backoff_duration;
       8              : 
       9              : /// A retry handler for Pageserver gRPC requests.
      10              : ///
      11              : /// This is used instead of backoff::retry for better control and observability.
      12              : pub struct Retry {
      13              :     /// Timeout across all retry attempts. If None, retries forever.
      14              :     pub timeout: Option<Duration>,
      15              :     /// The initial backoff duration. The first retry does not use a backoff.
      16              :     pub base_backoff: Duration,
      17              :     /// The maximum backoff duration.
      18              :     pub max_backoff: Duration,
      19              : }
      20              : 
      21              : impl Retry {
      22              :     /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
      23              :     /// using the current tracing span for context.
      24              :     ///
      25              :     /// Only certain gRPC status codes are retried, see [`Self::should_retry`].
      26            0 :     pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
      27            0 :     where
      28            0 :         F: FnMut(usize) -> O, // pass attempt number, starting at 0
      29            0 :         O: Future<Output = tonic::Result<T>>,
      30            0 :     {
      31            0 :         let started = Instant::now();
      32            0 :         let deadline = self.timeout.map(|timeout| started + timeout);
      33            0 :         let mut last_error = None;
      34            0 :         let mut retries = 0;
      35              :         loop {
      36              :             // Set up a future to wait for the backoff, if any, and run the closure.
      37            0 :             let backoff_and_try = async {
      38              :                 // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
      39              :                 // https://github.com/tokio-rs/tokio/issues/6866
      40            0 :                 if let Some(backoff) = self.backoff_duration(retries) {
      41            0 :                     tokio::time::sleep(backoff).await;
      42            0 :                 }
      43              : 
      44            0 :                 f(retries).await
      45            0 :             };
      46              : 
      47              :             // Set up a future for the timeout, if any.
      48            0 :             let timeout = async {
      49            0 :                 match deadline {
      50            0 :                     Some(deadline) => tokio::time::sleep_until(deadline).await,
      51            0 :                     None => pending().await,
      52              :                 }
      53            0 :             };
      54              : 
      55              :             // Wait for the backoff and request, or bail out if the timeout is exceeded.
      56            0 :             let result = tokio::select! {
      57            0 :                 result = backoff_and_try => result,
      58              : 
      59            0 :                 _ = timeout => {
      60            0 :                     let last_error = last_error.unwrap_or_else(|| {
      61            0 :                         tonic::Status::deadline_exceeded(format!(
      62            0 :                             "request timed out after {:.3}s",
      63            0 :                             started.elapsed().as_secs_f64()
      64              :                         ))
      65            0 :                     });
      66            0 :                     error!(
      67            0 :                         "giving up after {:.3}s and {retries} retries, last error {:?}: {}",
      68            0 :                         started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
      69              :                     );
      70            0 :                     return Err(last_error);
      71              :                 }
      72              :             };
      73              : 
      74            0 :             match result {
      75              :                 // Success, return the result.
      76            0 :                 Ok(result) => {
      77            0 :                     if retries > 0 {
      78            0 :                         info!(
      79            0 :                             "request succeeded after {retries} retries in {:.3}s",
      80            0 :                             started.elapsed().as_secs_f64(),
      81              :                         );
      82            0 :                     }
      83              : 
      84            0 :                     return Ok(result);
      85              :                 }
      86              : 
      87              :                 // Error, retry or bail out.
      88            0 :                 Err(status) => {
      89            0 :                     let (code, message) = (status.code(), status.message());
      90            0 :                     let attempt = retries + 1;
      91              : 
      92            0 :                     if !Self::should_retry(code) {
      93              :                         // NB: include the attempt here too. This isn't necessarily the first
      94              :                         // attempt, because the error may change between attempts.
      95            0 :                         error!(
      96            0 :                             "request failed with {code:?}: {message}, not retrying (attempt {attempt})"
      97              :                         );
      98            0 :                         return Err(status);
      99            0 :                     }
     100              : 
     101            0 :                     warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");
     102              : 
     103            0 :                     retries += 1;
     104            0 :                     last_error = Some(status);
     105              :                 }
     106              :             }
     107              :         }
     108            0 :     }
     109              : 
     110              :     /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
     111              :     /// attempt and first retry never back off, so this returns None for 0 and 1 retries.
     112            0 :     fn backoff_duration(&self, retries: usize) -> Option<Duration> {
     113            0 :         let backoff = exponential_backoff_duration(
     114            0 :             (retries as u32).saturating_sub(1), // first retry does not back off
     115            0 :             self.base_backoff.as_secs_f64(),
     116            0 :             self.max_backoff.as_secs_f64(),
     117              :         );
     118            0 :         (!backoff.is_zero()).then_some(backoff)
     119            0 :     }
     120              : 
     121              :     /// Returns true if the given status code should be retried.
     122            0 :     fn should_retry(code: tonic::Code) -> bool {
     123            0 :         match code {
     124            0 :             tonic::Code::Ok => panic!("unexpected Ok status code"),
     125              : 
     126              :             // These codes are transient, so retry them.
     127            0 :             tonic::Code::Aborted => true,
     128            0 :             tonic::Code::Cancelled => true,
     129            0 :             tonic::Code::DeadlineExceeded => true, // maybe transient slowness
     130            0 :             tonic::Code::ResourceExhausted => true,
     131            0 :             tonic::Code::Unavailable => true,
     132              : 
     133              :             // The following codes will likely continue to fail, so don't retry.
     134            0 :             tonic::Code::AlreadyExists => false,
     135            0 :             tonic::Code::DataLoss => false,
     136            0 :             tonic::Code::FailedPrecondition => false,
     137              :             // NB: don't retry Internal. It is intended for serious errors such as invariant
     138              :             // violations, and is also used for client-side invariant checks that would otherwise
     139              :             // result in retry loops.
     140            0 :             tonic::Code::Internal => false,
     141            0 :             tonic::Code::InvalidArgument => false,
     142            0 :             tonic::Code::NotFound => false,
     143            0 :             tonic::Code::OutOfRange => false,
     144            0 :             tonic::Code::PermissionDenied => false,
     145            0 :             tonic::Code::Unauthenticated => false,
     146            0 :             tonic::Code::Unimplemented => false,
     147            0 :             tonic::Code::Unknown => false,
     148              :         }
     149            0 :     }
     150              : }
        

Generated by: LCOV version 2.1-beta