//! AWS S3 storage wrapper around the `aws-sdk-s3` library.
//!
//! Respects the `prefix_in_bucket` property from [`S3Config`],
//! allowing multiple API users to independently work with the same S3 bucket, as
//! long as their bucket prefixes are both specified and different.
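//!
//! A minimal construction sketch; the bucket name, region, prefix, and limits
//! below are placeholder values (the `relative_path` test at the bottom of this
//! file shows another full [`S3Config`]):
//!
//! ```ignore
//! use std::{num::NonZeroUsize, time::Duration};
//!
//! let config = S3Config {
//!     bucket_name: "my-bucket".to_owned(),
//!     bucket_region: "eu-central-1".to_owned(),
//!     prefix_in_bucket: Some("pageserver/".to_owned()),
//!     endpoint: None,
//!     concurrency_limit: NonZeroUsize::new(100).unwrap(),
//!     max_keys_per_list_response: Some(1000),
//! };
//! let storage = S3Bucket::new(&config, Duration::from_secs(120))?;
//! ```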

use std::{
    borrow::Cow,
    collections::HashMap,
    num::NonZeroU32,
    pin::Pin,
    sync::Arc,
    task::{Context, Poll},
    time::{Duration, SystemTime},
};

use anyhow::{anyhow, Context as _};
use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider,
    meta::credentials::CredentialsProviderChain,
    profile::ProfileFileCredentialsProvider,
    provider_config::ProviderConfig,
    retry::{RetryConfigBuilder, RetryMode},
    web_identity_token::WebIdentityTokenCredentialsProvider,
    BehaviorVersion,
};
use aws_credential_types::provider::SharedCredentialsProvider;
use aws_sdk_s3::{
    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
    Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;

use aws_smithy_types::{body::SdkBody, DateTime};
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use utils::backoff;

use super::StorageMetadata;
use crate::{
    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};

pub(super) mod metrics;

use self::metrics::AttemptOutcome;
pub(super) use self::metrics::RequestKind;

/// AWS S3 storage.
pub struct S3Bucket {
    client: Client,
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
    concurrency_limiter: ConcurrencyLimiter,
    // Per-request timeout. Accessible for tests.
    pub timeout: Duration,
}

struct GetObjectRequest {
    bucket: String,
    key: String,
    range: Option<String>,
}

impl S3Bucket {
    /// Creates the S3 storage. Errors if the provided AWS S3 configuration is incorrect.
    pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
        );

        let region = Some(Region::new(aws_config.bucket_region.clone()));

        let provider_conf = ProviderConfig::without_region().with_region(region.clone());

        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
            CredentialsProviderChain::first_try(
                "env",
                EnvironmentVariableCredentialsProvider::new(),
            )
            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
            .or_else(
                "profile-sso",
                ProfileFileCredentialsProvider::builder()
                    .configure(&provider_conf)
                    .build(),
            )
            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
            // needed to access remote extensions bucket
            .or_else(
                "token",
                WebIdentityTokenCredentialsProvider::builder()
                    .configure(&provider_conf)
                    .build(),
            )
            // uses imds v2
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

        // The AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());

        // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
        // responses (e.g. 429 on too many ListObjectsV2 requests), we must provide a retry config. We set it to use at most one
        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
        let mut retry_config = RetryConfigBuilder::new();
        retry_config
            .set_max_attempts(Some(1))
            .set_mode(Some(RetryMode::Adaptive));

        let mut config_builder = Builder::default()
            .behavior_version(BehaviorVersion::v2023_11_09())
            .region(region)
            .identity_cache(IdentityCache::lazy().build())
            .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
            .retry_config(retry_config.build())
            .sleep_impl(SharedAsyncSleep::from(sleep_impl));

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
                .endpoint_url(custom_endpoint)
                .force_path_style(true);
        }

        let client = Client::from_conf(config_builder.build());

        // Normalize the prefix: strip all leading and trailing separators,
        // so that e.g. "/wal/" is stored as "wal".
        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
            let mut prefix = prefix;
            while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                prefix = &prefix[1..]
            }

            let mut prefix = prefix.to_string();
            while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                prefix.pop();
            }
            prefix
        });
        Ok(Self {
            client,
            bucket_name: aws_config.bucket_name.clone(),
            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
            timeout,
        })
    }

    fn s3_object_to_relative_path(&self, key: &str) -> RemotePath {
        let relative_path =
            match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) {
                Some(stripped) => stripped,
                // we rely on AWS to return properly prefixed paths
                // for requests with a certain prefix
                None => panic!(
                    "Key {} does not start with bucket prefix {:?}",
                    key, self.prefix_in_bucket
                ),
            };
        RemotePath(
            relative_path
                .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
                .collect(),
        )
    }

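    /// Maps a remote path to the full S3 key by prepending `prefix_in_bucket`
    /// (if any): e.g. prefix `test/prefix` and path `some/path` produce the key
    /// `test/prefix/some/path`; see the `relative_path` test below.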
    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
        let path_string = path
            .get_path()
            .as_str()
            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
        match &self.prefix_in_bucket {
            Some(prefix) => prefix.clone() + "/" + path_string,
            None => path_string.to_string(),
        }
    }

    async fn permit(
        &self,
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
        let started_at = start_counting_cancelled_wait(kind);
        let acquire = self.concurrency_limiter.acquire(kind);

        let permit = tokio::select! {
            permit = acquire => permit.expect("semaphore is never closed"),
            _ = cancel.cancelled() => return Err(Cancelled),
        };

        let started_at = ScopeGuard::into_inner(started_at);
        metrics::BUCKET_METRICS
            .wait_seconds
            .observe_elapsed(kind, started_at);

        Ok(permit)
    }

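    /// Like [`Self::permit`], but returns an owned permit that is not tied to
    /// `&self`, so [`Self::download_object`] can move it into the returned
    /// download stream (see [`PermitCarrying`]).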
    async fn owned_permit(
        &self,
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
        let started_at = start_counting_cancelled_wait(kind);
        let acquire = self.concurrency_limiter.acquire_owned(kind);

        let permit = tokio::select! {
            permit = acquire => permit.expect("semaphore is never closed"),
            _ = cancel.cancelled() => return Err(Cancelled),
        };

        let started_at = ScopeGuard::into_inner(started_at);
        metrics::BUCKET_METRICS
            .wait_seconds
            .observe_elapsed(kind, started_at);
        Ok(permit)
    }

    async fn download_object(
        &self,
        request: GetObjectRequest,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        let kind = RequestKind::Get;

        let permit = self.owned_permit(kind, cancel).await?;

        let started_at = start_measuring_requests(kind);

        let get_object = self
            .client
            .get_object()
            .bucket(request.bucket)
            .key(request.key)
            .set_range(request.range)
            .send();

        let get_object = tokio::select! {
            res = get_object => res,
            _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
            _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
        };

        let started_at = ScopeGuard::into_inner(started_at);

        let object_output = match get_object {
            Ok(object_output) => object_output,
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                    kind,
                    AttemptOutcome::Ok,
                    started_at,
                );
                return Err(DownloadError::NotFound);
            }
            Err(e) => {
                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                    kind,
                    AttemptOutcome::Err,
                    started_at,
                );

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("download s3 object"),
                ));
            }
        };

        // Even if we have no timeout left, continue anyway: the caller can decide
        // whether to ignore errors caused by timeouts or cancellation.
        let remaining = self.timeout.saturating_sub(started_at.elapsed());

        let metadata = object_output.metadata().cloned().map(StorageMetadata);
        let etag = object_output
            .e_tag
            .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
            .into();
        let last_modified = object_output
            .last_modified
            .ok_or(DownloadError::Other(anyhow::anyhow!(
                "Missing LastModified header"
            )))?
            .try_into()
            .map_err(|e: ConversionError| DownloadError::Other(e.into()))?;

        let body = object_output.body;
        let body = ByteStreamAsStream::from(body);
        let body = PermitCarrying::new(permit, body);
        let body = TimedDownload::new(started_at, body);

        let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone());
        let body = crate::support::DownloadStream::new(cancel_or_timeout, body);

        Ok(Download {
            metadata,
            etag,
            last_modified,
            download_stream: Box::pin(body),
        })
    }

    async fn delete_oids(
        &self,
        _permit: &tokio::sync::SemaphorePermit<'_>,
        delete_objects: &[ObjectIdentifier],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Delete;
        let mut cancel = std::pin::pin!(cancel.cancelled());

        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
            let started_at = start_measuring_requests(kind);

            let req = self
                .client
                .delete_objects()
                .bucket(self.bucket_name.clone())
                .delete(
                    Delete::builder()
                        .set_objects(Some(chunk.to_vec()))
                        .build()
                        .context("build request")?,
                )
                .send();

            let resp = tokio::select! {
                resp = req => resp,
                _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()),
                _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
            };

            let started_at = ScopeGuard::into_inner(started_at);
            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, &resp, started_at);

            let resp = resp.context("request deletion")?;
            metrics::BUCKET_METRICS
                .deleted_objects_total
                .inc_by(chunk.len() as u64);

            if let Some(errors) = resp.errors {
                // Log a bounded number of the errors within the response:
                // these requests can carry 1000 keys so logging each one
                // would be too verbose, especially as errors may lead us
                // to retry repeatedly.
                const LOG_UP_TO_N_ERRORS: usize = 10;
                for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
                    tracing::warn!(
                        "DeleteObjects key {} failed: {}: {}",
                        e.key.as_ref().map(Cow::from).unwrap_or("".into()),
                        e.code.as_ref().map(Cow::from).unwrap_or("".into()),
                        e.message.as_ref().map(Cow::from).unwrap_or("".into())
                    );
                }

                return Err(anyhow::anyhow!(
                    "Failed to delete {}/{} objects",
                    errors.len(),
                    chunk.len(),
                ));
            }
        }
        Ok(())
    }
}

pin_project_lite::pin_project! {
    /// Adapter exposing the SDK's `ByteStream` as a `futures::Stream` of `Bytes`.
    struct ByteStreamAsStream {
        #[pin]
        inner: aws_smithy_types::byte_stream::ByteStream
    }
}

impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
        ByteStreamAsStream { inner }
    }
}

impl Stream for ByteStreamAsStream {
    type Item = std::io::Result<Bytes>;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        // this does the std::io::ErrorKind::Other conversion
        self.project().inner.poll_next(cx).map_err(|x| x.into())
    }

    // We cannot implement size_hint: the inner size_hint reports the remaining
    // size in bytes, while Stream::size_hint counts the remaining items.
}

pin_project_lite::pin_project! {
    /// Times and tracks the outcome of the request.
    struct TimedDownload<S> {
        started_at: std::time::Instant,
        outcome: metrics::AttemptOutcome,
        #[pin]
        inner: S
    }

    impl<S> PinnedDrop for TimedDownload<S> {
        fn drop(mut this: Pin<&mut Self>) {
            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
        }
    }
}

impl<S> TimedDownload<S> {
    fn new(started_at: std::time::Instant, inner: S) -> Self {
        TimedDownload {
            started_at,
            // Until the stream either ends (Ok) or errors, consider the download cancelled.
            outcome: metrics::AttemptOutcome::Cancelled,
            inner,
        }
    }
}

impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
    type Item = <S as Stream>::Item;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        use std::task::ready;

        let this = self.project();

        let res = ready!(this.inner.poll_next(cx));
        match &res {
            Some(Ok(_)) => {}
            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
            None => *this.outcome = metrics::AttemptOutcome::Ok,
        }

        Poll::Ready(res)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}

impl RemoteStorage for S3Bucket {
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
        // the S3 SDK wants an i32
        let mut max_keys = max_keys.map(|mk| mk.get() as i32);
        let mut result = Listing::default();

        // get the passed prefix, or if it is not set use the prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone())
            .map(|mut p| {
                // required to end with a separator,
                // otherwise the request will return only the entry of the prefix itself
                if matches!(mode, ListingMode::WithDelimiter)
                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
                {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

        let _permit = self.permit(kind, cancel).await?;

        let mut continuation_token = None;

        loop {
            let started_at = start_measuring_requests(kind);

            // Min of two Options, returning Some if one is Some and the other is
            // None (None is smaller than anything, so a plain min doesn't work).
            let request_max_keys = self
                .max_keys_per_list_response
                .into_iter()
                .chain(max_keys.into_iter())
                .min();
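            // (E.g. `Some(1000)` vs `None` yields `Some(1000)`, two `Some`s yield the
            // smaller one, and two `None`s yield `None`, i.e. no limit is sent.)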
            let mut request = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(list_prefix.clone())
                .set_continuation_token(continuation_token)
                .set_max_keys(request_max_keys);

            if let ListingMode::WithDelimiter = mode {
                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
            }

            let request = request.send();

            let response = tokio::select! {
                res = request => res,
                _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
                _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
            };

            let response = response
                .context("Failed to list S3 prefixes")
                .map_err(DownloadError::Other);

            let started_at = ScopeGuard::into_inner(started_at);

            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, &response, started_at);

            let response = response?;

            let keys = response.contents();
            let empty = Vec::new();
            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);

            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

            for object in keys {
                let object_path = object.key().expect("response does not contain a key");
                let remote_path = self.s3_object_to_relative_path(object_path);
                result.keys.push(remote_path);
                if let Some(mut mk) = max_keys {
                    assert!(mk > 0);
                    mk -= 1;
                    if mk == 0 {
                        return Ok(result); // limit reached
                    }
                    max_keys = Some(mk);
                }
            }

            result.prefixes.extend(
                prefixes
                    .iter()
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
                None => break,
            };
        }

        Ok(result)
    }

    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(from);
        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));

        let upload = self
            .client
            .put_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
            .set_metadata(metadata.map(|m| m.0))
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send();

        let upload = tokio::time::timeout(self.timeout, upload);

        let res = tokio::select! {
            res = upload => res,
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

        if let Ok(inner) = &res {
            // Do not count timeouts as errors in the metrics; on the timeout path the
            // scope guard below is never disarmed, so they are recorded as cancellations.
            let started_at = ScopeGuard::into_inner(started_at);
            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, inner, started_at);
        }

        match res {
            Ok(Ok(_put)) => Ok(()),
            Ok(Err(sdk)) => Err(sdk.into()),
            Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
        }
    }

    async fn copy(
        &self,
        from: &RemotePath,
        to: &RemotePath,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Copy;
        let _permit = self.permit(kind, cancel).await?;

        let timeout = tokio::time::sleep(self.timeout);

        let started_at = start_measuring_requests(kind);

        // we need to specify the bucket_name as a prefix
        let copy_source = format!(
            "{}/{}",
            self.bucket_name,
            self.relative_path_to_s3_object(from)
        );

        let op = self
            .client
            .copy_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
            .copy_source(copy_source)
            .send();

        let res = tokio::select! {
            res = op => res,
            _ = timeout => return Err(TimeoutOrCancel::Timeout.into()),
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

        let started_at = ScopeGuard::into_inner(started_at);
        metrics::BUCKET_METRICS
            .req_seconds
            .observe_elapsed(kind, &res, started_at);

        res?;

        Ok(())
    }

    async fn download(
        &self,
        from: &RemotePath,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        // If a prefix is set, download the file at `prefix/from`;
        // otherwise download the file at `from`.
        self.download_object(
            GetObjectRequest {
                bucket: self.bucket_name.clone(),
                key: self.relative_path_to_s3_object(from),
                range: None,
            },
            cancel,
        )
        .await
    }

    async fn download_byte_range(
        &self,
        from: &RemotePath,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
        // and needs both ends to be inclusive
        let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
        let range = Some(match end_inclusive {
            Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"),
            None => format!("bytes={start_inclusive}-"),
        });
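        // E.g. `start_inclusive = 0, end_exclusive = Some(1024)` produces "bytes=0-1023";
        // `start_inclusive = 0, end_exclusive = None` produces the open-ended "bytes=0-".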

        self.download_object(
            GetObjectRequest {
                bucket: self.bucket_name.clone(),
                key: self.relative_path_to_s3_object(from),
                range,
            },
            cancel,
        )
        .await
    }

    async fn delete_objects<'a>(
        &self,
        paths: &'a [RemotePath],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Delete;
        let permit = self.permit(kind, cancel).await?;
        let mut delete_objects = Vec::with_capacity(paths.len());
        for path in paths {
            let obj_id = ObjectIdentifier::builder()
                .set_key(Some(self.relative_path_to_s3_object(path)))
                .build()
                .context("convert path to oid")?;
            delete_objects.push(obj_id);
        }

        self.delete_oids(&permit, &delete_objects, cancel).await
    }

    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
        let paths = std::array::from_ref(path);
        self.delete_objects(paths, cancel).await
    }

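    /// Restores the keys under `prefix` to their state at `timestamp`: for each key,
    /// the newest version strictly older than `timestamp` is copied back on top, and
    /// keys that did not exist (or were delete-marked) at that time are deleted.
    /// Keys whose latest change is after `done_if_after` are left untouched.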
    async fn time_travel_recover(
        &self,
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: &CancellationToken,
    ) -> Result<(), TimeTravelError> {
        let kind = RequestKind::TimeTravel;
        let permit = self.permit(kind, cancel).await?;

        let timestamp = DateTime::from(timestamp);
        let done_if_after = DateTime::from(done_if_after);

        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");

        // get the passed prefix, or if it is not set use the prefix_in_bucket value
        let prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());

        let warn_threshold = 3;
        let max_retries = 10;
        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);

        let mut key_marker = None;
        let mut version_id_marker = None;
        let mut versions_and_deletes = Vec::new();

        loop {
            let response = backoff::retry(
                || async {
                    let op = self
                        .client
                        .list_object_versions()
                        .bucket(self.bucket_name.clone())
                        .set_prefix(prefix.clone())
                        .set_key_marker(key_marker.clone())
                        .set_version_id_marker(version_id_marker.clone())
                        .send();

                    tokio::select! {
                        res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
                        _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
                    }
                },
                is_permanent,
                warn_threshold,
                max_retries,
                "listing object versions for time_travel_recover",
                cancel,
            )
            .await
            .ok_or_else(|| TimeTravelError::Cancelled)
            .and_then(|x| x)?;

            tracing::trace!(
                " Got List response version_id_marker={:?}, key_marker={:?}",
                response.version_id_marker,
                response.key_marker
            );
            let versions = response
                .versions
                .unwrap_or_default()
                .into_iter()
                .map(VerOrDelete::from_version);
            let deletes = response
                .delete_markers
                .unwrap_or_default()
                .into_iter()
                .map(VerOrDelete::from_delete_marker);
            itertools::process_results(versions.chain(deletes), |n_vds| {
                versions_and_deletes.extend(n_vds)
            })
            .map_err(TimeTravelError::Other)?;
            fn none_if_empty(v: Option<String>) -> Option<String> {
                v.filter(|v| !v.is_empty())
            }
            version_id_marker = none_if_empty(response.next_version_id_marker);
            key_marker = none_if_empty(response.next_key_marker);
            if version_id_marker.is_none() {
                // The final response is not supposed to be truncated
                if response.is_truncated.unwrap_or_default() {
                    return Err(TimeTravelError::Other(anyhow::anyhow!(
                        "Received truncated ListObjectVersions response for prefix={prefix:?}"
                    )));
                }
                break;
            }
            // Limit the number of versions and deletions we accumulate, mostly so that
            // we don't keep requesting forever if the list is too long, as we'd hold
            // the whole list in RAM.
            // Building a list of 100k entries that reaches the limit roughly takes
            // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
            const COMPLEXITY_LIMIT: usize = 100_000;
            if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
                return Err(TimeTravelError::TooManyVersions);
            }
        }

        tracing::info!(
            "Built list for time travel with {} versions and deletions",
            versions_and_deletes.len()
        );

        // Work on the list of references instead of the objects directly,
        // otherwise we get lifetime errors in the sort_by_key call below.
        let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();

        versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));

        let mut vds_for_key = HashMap::<_, Vec<_>>::new();

        for vd in &versions_and_deletes {
            let VerOrDelete {
                version_id, key, ..
            } = &vd;
            if version_id == "null" {
                return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \
                    indicating either disabled versioning, or legacy objects with null version id values")));
            }
            tracing::trace!(
                "Parsing version key={key} version_id={version_id} kind={:?}",
                vd.kind
            );

            vds_for_key.entry(key).or_default().push(vd);
        }
        for (key, versions) in vds_for_key {
            let last_vd = versions.last().unwrap();
            if last_vd.last_modified > done_if_after {
                tracing::trace!("Key {key} has version later than done_if_after, skipping");
                continue;
            }
            // The version we want to restore to: the binary search returns either the
            // position of a version modified exactly at `timestamp`, or the position
            // where such a version would be inserted, i.e. in both cases the index of
            // the first version at or after `timestamp`.
            let version_to_restore_to =
                match versions.binary_search_by_key(&timestamp, |tpl| tpl.last_modified) {
                    Ok(v) => v,
                    Err(e) => e,
                };
            if version_to_restore_to == versions.len() {
                tracing::trace!("Key {key} has no changes since timestamp, skipping");
                continue;
            }
            let mut do_delete = false;
            if version_to_restore_to == 0 {
                // All versions are more recent, so the key didn't exist at the specified time point.
                tracing::trace!(
                    "All {} versions more recent for {key}, deleting",
                    versions.len()
                );
                do_delete = true;
            } else {
                match &versions[version_to_restore_to - 1] {
                    VerOrDelete {
                        kind: VerOrDeleteKind::Version,
                        version_id,
                        ..
                    } => {
                        tracing::trace!("Copying old version {version_id} for {key}...");
                        // Restore the state to the last version by copying
                        let source_id =
                            format!("{}/{key}?versionId={version_id}", self.bucket_name);

                        backoff::retry(
                            || async {
                                let op = self
                                    .client
                                    .copy_object()
                                    .bucket(self.bucket_name.clone())
                                    .key(key)
                                    .copy_source(&source_id)
                                    .send();

                                tokio::select! {
                                    res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
                                    _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
                                }
                            },
                            is_permanent,
                            warn_threshold,
                            max_retries,
                            "copying object version for time_travel_recover",
                            cancel,
                        )
                        .await
                        .ok_or_else(|| TimeTravelError::Cancelled)
                        .and_then(|x| x)?;
                        tracing::info!(%version_id, %key, "Copied old version in S3");
                    }
                    VerOrDelete {
                        kind: VerOrDeleteKind::DeleteMarker,
                        ..
                    } => {
                        do_delete = true;
                    }
                }
            };
            if do_delete {
                if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
                    // Key has since been deleted (but there was some history), no need to do anything
                    tracing::trace!("Key {key} already deleted, skipping.");
                } else {
                    tracing::trace!("Deleting {key}...");

                    let oid = ObjectIdentifier::builder()
                        .key(key.to_owned())
                        .build()
                        .map_err(|e| TimeTravelError::Other(e.into()))?;

                    self.delete_oids(&permit, &[oid], cancel)
                        .await
                        .map_err(|e| {
                            // delete_oids will use TimeoutOrCancel
                            if TimeoutOrCancel::caused_by_cancel(&e) {
                                TimeTravelError::Cancelled
                            } else {
                                TimeTravelError::Other(e)
                            }
                        })?;
                }
            }
        }
        Ok(())
    }
}

/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
fn start_counting_cancelled_wait(
    kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
    })
}

/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
fn start_measuring_requests(
    kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
            kind,
            AttemptOutcome::Cancelled,
            started_at,
        )
    })
}

// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
struct VerOrDelete {
    kind: VerOrDeleteKind,
    last_modified: DateTime,
    version_id: String,
    key: String,
}

#[derive(Debug)]
enum VerOrDeleteKind {
    Version,
    DeleteMarker,
}

impl VerOrDelete {
    fn with_kind(
        kind: VerOrDeleteKind,
        last_modified: Option<DateTime>,
        version_id: Option<String>,
        key: Option<String>,
    ) -> anyhow::Result<Self> {
        let lvk = (last_modified, version_id, key);
        let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
            anyhow::bail!(
                "One (or more) of last_modified, key, and id is None. \
                Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
                lvk.0,
                lvk.1,
                lvk.2,
            );
        };
        Ok(Self {
            kind,
            last_modified,
            version_id,
            key,
        })
    }
    fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
        Self::with_kind(
            VerOrDeleteKind::Version,
            v.last_modified,
            v.version_id,
            v.key,
        )
    }
    fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
        Self::with_kind(
            VerOrDeleteKind::DeleteMarker,
            v.last_modified,
            v.version_id,
            v.key,
        )
    }
}

#[cfg(test)]
mod tests {
    use camino::Utf8Path;
    use std::num::NonZeroUsize;

    use crate::{RemotePath, S3Bucket, S3Config};

    #[test]
    fn relative_path() {
        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
            .map(|x| RemotePath::new(Utf8Path::new(x)).expect("bad path"))
            .collect();
        let prefixes = [
            None,
            Some(""),
            Some("test/prefix"),
            Some("test/prefix/"),
            Some("/test/prefix/"),
        ];
        let expected_outputs = [
            vec!["", "some/path", "some/path"],
            vec!["/", "/some/path", "/some/path"],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
                "test/prefix/some/path",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
                "test/prefix/some/path",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
                "test/prefix/some/path",
            ],
        ];

        for (prefix_idx, prefix) in prefixes.iter().enumerate() {
            let config = S3Config {
                bucket_name: "bucket".to_owned(),
                bucket_region: "region".to_owned(),
                prefix_in_bucket: prefix.map(str::to_string),
                endpoint: None,
                concurrency_limit: NonZeroUsize::new(100).unwrap(),
                max_keys_per_list_response: Some(5),
            };
            let storage =
                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
                let result = storage.relative_path_to_s3_object(test_path);
                let expected = expected_outputs[prefix_idx][test_path_idx];
                assert_eq!(result, expected);
            }
        }
    }
}