Line data Source code
1 : //! AWS S3 storage wrapper around the AWS SDK for Rust (`aws-sdk-s3`).
2 : //!
3 : //! Respects `prefix_in_bucket` property from [`S3Config`],
4 : //! allowing multiple api users to independently work with the same S3 bucket, if
5 : //! their bucket prefixes are both specified and different.
6 :
7 : use std::{
8 : borrow::Cow,
9 : collections::HashMap,
10 : num::NonZeroU32,
11 : pin::Pin,
12 : sync::Arc,
13 : task::{Context, Poll},
14 : time::{Duration, SystemTime},
15 : };
16 :
17 : use anyhow::{anyhow, Context as _};
18 : use aws_config::{
19 : environment::credentials::EnvironmentVariableCredentialsProvider,
20 : imds::credentials::ImdsCredentialsProvider,
21 : meta::credentials::CredentialsProviderChain,
22 : profile::ProfileFileCredentialsProvider,
23 : provider_config::ProviderConfig,
24 : retry::{RetryConfigBuilder, RetryMode},
25 : web_identity_token::WebIdentityTokenCredentialsProvider,
26 : BehaviorVersion,
27 : };
28 : use aws_credential_types::provider::SharedCredentialsProvider;
29 : use aws_sdk_s3::{
30 : config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
31 : error::SdkError,
32 : operation::get_object::GetObjectError,
33 : types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
34 : Client,
35 : };
36 : use aws_smithy_async::rt::sleep::TokioSleep;
37 :
38 : use aws_smithy_types::byte_stream::ByteStream;
39 : use aws_smithy_types::{body::SdkBody, DateTime};
40 : use bytes::Bytes;
41 : use futures::stream::Stream;
42 : use hyper::Body;
43 : use scopeguard::ScopeGuard;
44 : use tokio_util::sync::CancellationToken;
45 : use utils::backoff;
46 :
47 : use super::StorageMetadata;
48 : use crate::{
49 : error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
50 : Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
51 : MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
52 : };
53 :
54 : pub(super) mod metrics;
55 :
56 : use self::metrics::AttemptOutcome;
57 : pub(super) use self::metrics::RequestKind;
58 :
59 : /// AWS S3 storage.
60 : pub struct S3Bucket {
61 : client: Client,
62 : bucket_name: String,
63 : prefix_in_bucket: Option<String>,
64 : max_keys_per_list_response: Option<i32>,
65 : concurrency_limiter: ConcurrencyLimiter,
66 : // Per-request timeout. Accessible for tests.
67 : pub timeout: Duration,
68 : }
69 :
/// Parameters of a single GetObject call, as consumed by `download_object`.
struct GetObjectRequest {
    /// Bucket to read from.
    bucket: String,
    /// Full object key, bucket prefix included.
    key: String,
    /// Optional HTTP `Range` header value, e.g. `bytes=0-99`.
    range: Option<String>,
}
75 : impl S3Bucket {
76 : /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
77 28 : pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
78 28 : tracing::debug!(
79 0 : "Creating s3 remote storage for S3 bucket {}",
80 0 : aws_config.bucket_name
81 0 : );
82 :
83 28 : let region = Some(Region::new(aws_config.bucket_region.clone()));
84 28 :
85 28 : let provider_conf = ProviderConfig::without_region().with_region(region.clone());
86 28 :
87 28 : let credentials_provider = {
88 28 : // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
89 28 : CredentialsProviderChain::first_try(
90 28 : "env",
91 28 : EnvironmentVariableCredentialsProvider::new(),
92 28 : )
93 28 : // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
94 28 : .or_else(
95 28 : "profile-sso",
96 28 : ProfileFileCredentialsProvider::builder()
97 28 : .configure(&provider_conf)
98 28 : .build(),
99 28 : )
100 28 : // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
101 28 : // needed to access remote extensions bucket
102 28 : .or_else(
103 28 : "token",
104 28 : WebIdentityTokenCredentialsProvider::builder()
105 28 : .configure(&provider_conf)
106 28 : .build(),
107 28 : )
108 28 : // uses imds v2
109 28 : .or_else("imds", ImdsCredentialsProvider::builder().build())
110 28 : };
111 28 :
112 28 : // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
113 28 : let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
114 28 :
115 28 : // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
116 28 : // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
117 28 : // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
118 28 : let mut retry_config = RetryConfigBuilder::new();
119 28 : retry_config
120 28 : .set_max_attempts(Some(1))
121 28 : .set_mode(Some(RetryMode::Adaptive));
122 28 :
123 28 : let mut config_builder = Builder::default()
124 28 : .behavior_version(BehaviorVersion::v2023_11_09())
125 28 : .region(region)
126 28 : .identity_cache(IdentityCache::lazy().build())
127 28 : .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
128 28 : .retry_config(retry_config.build())
129 28 : .sleep_impl(SharedAsyncSleep::from(sleep_impl));
130 :
131 28 : if let Some(custom_endpoint) = aws_config.endpoint.clone() {
132 0 : config_builder = config_builder
133 0 : .endpoint_url(custom_endpoint)
134 0 : .force_path_style(true);
135 28 : }
136 :
137 28 : let client = Client::from_conf(config_builder.build());
138 28 :
139 28 : let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
140 26 : let mut prefix = prefix;
141 28 : while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
142 2 : prefix = &prefix[1..]
143 : }
144 :
145 26 : let mut prefix = prefix.to_string();
146 48 : while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
147 22 : prefix.pop();
148 22 : }
149 26 : prefix
150 28 : });
151 28 : Ok(Self {
152 28 : client,
153 28 : bucket_name: aws_config.bucket_name.clone(),
154 28 : max_keys_per_list_response: aws_config.max_keys_per_list_response,
155 28 : prefix_in_bucket,
156 28 : concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
157 28 : timeout,
158 28 : })
159 28 : }
160 :
161 126 : fn s3_object_to_relative_path(&self, key: &str) -> RemotePath {
162 126 : let relative_path =
163 126 : match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) {
164 126 : Some(stripped) => stripped,
165 : // we rely on AWS to return properly prefixed paths
166 : // for requests with a certain prefix
167 0 : None => panic!(
168 0 : "Key {} does not start with bucket prefix {:?}",
169 0 : key, self.prefix_in_bucket
170 0 : ),
171 : };
172 126 : RemotePath(
173 126 : relative_path
174 126 : .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
175 126 : .collect(),
176 126 : )
177 126 : }
178 :
179 280 : pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
180 280 : assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
181 280 : let path_string = path
182 280 : .get_path()
183 280 : .as_str()
184 280 : .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
185 280 : match &self.prefix_in_bucket {
186 274 : Some(prefix) => prefix.clone() + "/" + path_string,
187 6 : None => path_string.to_string(),
188 : }
189 280 : }
190 :
191 241 : async fn permit(
192 241 : &self,
193 241 : kind: RequestKind,
194 241 : cancel: &CancellationToken,
195 241 : ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
196 0 : let started_at = start_counting_cancelled_wait(kind);
197 0 : let acquire = self.concurrency_limiter.acquire(kind);
198 :
199 0 : let permit = tokio::select! {
200 0 : permit = acquire => permit.expect("semaphore is never closed"),
201 : _ = cancel.cancelled() => return Err(Cancelled),
202 : };
203 :
204 0 : let started_at = ScopeGuard::into_inner(started_at);
205 0 : metrics::BUCKET_METRICS
206 0 : .wait_seconds
207 0 : .observe_elapsed(kind, started_at);
208 0 :
209 0 : Ok(permit)
210 0 : }
211 :
212 25 : async fn owned_permit(
213 25 : &self,
214 25 : kind: RequestKind,
215 25 : cancel: &CancellationToken,
216 25 : ) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
217 0 : let started_at = start_counting_cancelled_wait(kind);
218 0 : let acquire = self.concurrency_limiter.acquire_owned(kind);
219 :
220 0 : let permit = tokio::select! {
221 0 : permit = acquire => permit.expect("semaphore is never closed"),
222 : _ = cancel.cancelled() => return Err(Cancelled),
223 : };
224 :
225 0 : let started_at = ScopeGuard::into_inner(started_at);
226 0 : metrics::BUCKET_METRICS
227 0 : .wait_seconds
228 0 : .observe_elapsed(kind, started_at);
229 0 : Ok(permit)
230 0 : }
231 :
232 25 : async fn download_object(
233 25 : &self,
234 25 : request: GetObjectRequest,
235 25 : cancel: &CancellationToken,
236 25 : ) -> Result<Download, DownloadError> {
237 0 : let kind = RequestKind::Get;
238 :
239 0 : let permit = self.owned_permit(kind, cancel).await?;
240 :
241 0 : let started_at = start_measuring_requests(kind);
242 0 :
243 0 : let get_object = self
244 0 : .client
245 0 : .get_object()
246 0 : .bucket(request.bucket)
247 0 : .key(request.key)
248 0 : .set_range(request.range)
249 0 : .send();
250 :
251 0 : let get_object = tokio::select! {
252 0 : res = get_object => res,
253 : _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
254 : _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
255 : };
256 :
257 0 : let started_at = ScopeGuard::into_inner(started_at);
258 :
259 0 : let object_output = match get_object {
260 0 : Ok(object_output) => object_output,
261 0 : Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
262 : // Count this in the AttemptOutcome::Ok bucket, because 404 is not
263 : // an error: we expect to sometimes fetch an object and find it missing,
264 : // e.g. when probing for timeline indices.
265 0 : metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
266 0 : kind,
267 0 : AttemptOutcome::Ok,
268 0 : started_at,
269 0 : );
270 0 : return Err(DownloadError::NotFound);
271 : }
272 0 : Err(e) => {
273 0 : metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
274 0 : kind,
275 0 : AttemptOutcome::Err,
276 0 : started_at,
277 0 : );
278 0 :
279 0 : return Err(DownloadError::Other(
280 0 : anyhow::Error::new(e).context("download s3 object"),
281 0 : ));
282 : }
283 : };
284 :
285 : // even if we would have no timeout left, continue anyways. the caller can decide to ignore
286 : // the errors considering timeouts and cancellation.
287 0 : let remaining = self.timeout.saturating_sub(started_at.elapsed());
288 0 :
289 0 : let metadata = object_output.metadata().cloned().map(StorageMetadata);
290 0 : let etag = object_output.e_tag;
291 0 : let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
292 0 :
293 0 : let body = object_output.body;
294 0 : let body = ByteStreamAsStream::from(body);
295 0 : let body = PermitCarrying::new(permit, body);
296 0 : let body = TimedDownload::new(started_at, body);
297 0 :
298 0 : let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone());
299 0 : let body = crate::support::DownloadStream::new(cancel_or_timeout, body);
300 0 :
301 0 : Ok(Download {
302 0 : metadata,
303 0 : etag,
304 0 : last_modified,
305 0 : download_stream: Box::pin(body),
306 0 : })
307 0 : }
308 :
309 106 : async fn delete_oids(
310 106 : &self,
311 106 : _permit: &tokio::sync::SemaphorePermit<'_>,
312 106 : delete_objects: &[ObjectIdentifier],
313 106 : cancel: &CancellationToken,
314 106 : ) -> anyhow::Result<()> {
315 0 : let kind = RequestKind::Delete;
316 0 : let mut cancel = std::pin::pin!(cancel.cancelled());
317 :
318 0 : for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
319 0 : let started_at = start_measuring_requests(kind);
320 :
321 0 : let req = self
322 0 : .client
323 0 : .delete_objects()
324 0 : .bucket(self.bucket_name.clone())
325 0 : .delete(
326 0 : Delete::builder()
327 0 : .set_objects(Some(chunk.to_vec()))
328 0 : .build()
329 0 : .context("build request")?,
330 : )
331 0 : .send();
332 :
333 0 : let resp = tokio::select! {
334 0 : resp = req => resp,
335 : _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()),
336 : _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
337 : };
338 :
339 0 : let started_at = ScopeGuard::into_inner(started_at);
340 0 : metrics::BUCKET_METRICS
341 0 : .req_seconds
342 0 : .observe_elapsed(kind, &resp, started_at);
343 :
344 0 : let resp = resp.context("request deletion")?;
345 0 : metrics::BUCKET_METRICS
346 0 : .deleted_objects_total
347 0 : .inc_by(chunk.len() as u64);
348 :
349 0 : if let Some(errors) = resp.errors {
350 : // Log a bounded number of the errors within the response:
351 : // these requests can carry 1000 keys so logging each one
352 : // would be too verbose, especially as errors may lead us
353 : // to retry repeatedly.
354 : const LOG_UP_TO_N_ERRORS: usize = 10;
355 0 : for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
356 0 : tracing::warn!(
357 0 : "DeleteObjects key {} failed: {}: {}",
358 0 : e.key.as_ref().map(Cow::from).unwrap_or("".into()),
359 0 : e.code.as_ref().map(Cow::from).unwrap_or("".into()),
360 0 : e.message.as_ref().map(Cow::from).unwrap_or("".into())
361 0 : );
362 : }
363 :
364 0 : return Err(anyhow::anyhow!(
365 0 : "Failed to delete {}/{} objects",
366 0 : errors.len(),
367 0 : chunk.len(),
368 0 : ));
369 0 : }
370 : }
371 0 : Ok(())
372 0 : }
373 : }
374 :
375 : pin_project_lite::pin_project! {
376 : struct ByteStreamAsStream {
377 : #[pin]
378 : inner: aws_smithy_types::byte_stream::ByteStream
379 : }
380 : }
381 :
382 : impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
383 24 : fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
384 24 : ByteStreamAsStream { inner }
385 24 : }
386 : }
387 :
388 : impl Stream for ByteStreamAsStream {
389 : type Item = std::io::Result<Bytes>;
390 :
391 49 : fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
392 49 : // this does the std::io::ErrorKind::Other conversion
393 49 : self.project().inner.poll_next(cx).map_err(|x| x.into())
394 49 : }
395 :
396 : // cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
397 : // sense and Stream::size_hint does not really
398 : }
399 :
400 : pin_project_lite::pin_project! {
401 : /// Times and tracks the outcome of the request.
402 : struct TimedDownload<S> {
403 : started_at: std::time::Instant,
404 : outcome: metrics::AttemptOutcome,
405 : #[pin]
406 : inner: S
407 : }
408 :
409 : impl<S> PinnedDrop for TimedDownload<S> {
410 : fn drop(mut this: Pin<&mut Self>) {
411 : metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
412 : }
413 : }
414 : }
415 :
416 : impl<S> TimedDownload<S> {
417 0 : fn new(started_at: std::time::Instant, inner: S) -> Self {
418 0 : TimedDownload {
419 0 : started_at,
420 0 : outcome: metrics::AttemptOutcome::Cancelled,
421 0 : inner,
422 0 : }
423 0 : }
424 : }
425 :
426 : impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
427 : type Item = <S as Stream>::Item;
428 :
429 0 : fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
430 0 : use std::task::ready;
431 0 :
432 0 : let this = self.project();
433 :
434 0 : let res = ready!(this.inner.poll_next(cx));
435 0 : match &res {
436 0 : Some(Ok(_)) => {}
437 0 : Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
438 0 : None => *this.outcome = metrics::AttemptOutcome::Ok,
439 : }
440 :
441 0 : Poll::Ready(res)
442 0 : }
443 :
444 0 : fn size_hint(&self) -> (usize, Option<usize>) {
445 0 : self.inner.size_hint()
446 0 : }
447 : }
448 :
449 : impl RemoteStorage for S3Bucket {
450 24 : async fn list(
451 24 : &self,
452 24 : prefix: Option<&RemotePath>,
453 24 : mode: ListingMode,
454 24 : max_keys: Option<NonZeroU32>,
455 24 : cancel: &CancellationToken,
456 24 : ) -> Result<Listing, DownloadError> {
457 0 : let kind = RequestKind::List;
458 0 : // s3 sdk wants i32
459 0 : let mut max_keys = max_keys.map(|mk| mk.get() as i32);
460 0 : let mut result = Listing::default();
461 0 :
462 0 : // get the passed prefix or if it is not set use prefix_in_bucket value
463 0 : let list_prefix = prefix
464 0 : .map(|p| self.relative_path_to_s3_object(p))
465 0 : .or_else(|| self.prefix_in_bucket.clone())
466 0 : .map(|mut p| {
467 : // required to end with a separator
468 : // otherwise request will return only the entry of a prefix
469 0 : if matches!(mode, ListingMode::WithDelimiter)
470 0 : && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
471 0 : {
472 0 : p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
473 0 : }
474 0 : p
475 0 : });
476 :
477 0 : let _permit = self.permit(kind, cancel).await?;
478 :
479 0 : let mut continuation_token = None;
480 :
481 : loop {
482 0 : let started_at = start_measuring_requests(kind);
483 0 :
484 0 : // min of two Options, returning Some if one is value and another is
485 0 : // None (None is smaller than anything, so plain min doesn't work).
486 0 : let request_max_keys = self
487 0 : .max_keys_per_list_response
488 0 : .into_iter()
489 0 : .chain(max_keys.into_iter())
490 0 : .min();
491 0 : let mut request = self
492 0 : .client
493 0 : .list_objects_v2()
494 0 : .bucket(self.bucket_name.clone())
495 0 : .set_prefix(list_prefix.clone())
496 0 : .set_continuation_token(continuation_token)
497 0 : .set_max_keys(request_max_keys);
498 0 :
499 0 : if let ListingMode::WithDelimiter = mode {
500 0 : request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
501 0 : }
502 :
503 0 : let request = request.send();
504 :
505 0 : let response = tokio::select! {
506 0 : res = request => res,
507 : _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
508 : _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
509 : };
510 :
511 0 : let response = response
512 0 : .context("Failed to list S3 prefixes")
513 0 : .map_err(DownloadError::Other);
514 0 :
515 0 : let started_at = ScopeGuard::into_inner(started_at);
516 0 :
517 0 : metrics::BUCKET_METRICS
518 0 : .req_seconds
519 0 : .observe_elapsed(kind, &response, started_at);
520 :
521 0 : let response = response?;
522 :
523 0 : let keys = response.contents();
524 0 : let empty = Vec::new();
525 0 : let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
526 :
527 0 : tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
528 :
529 0 : for object in keys {
530 0 : let object_path = object.key().expect("response does not contain a key");
531 0 : let remote_path = self.s3_object_to_relative_path(object_path);
532 0 : result.keys.push(remote_path);
533 0 : if let Some(mut mk) = max_keys {
534 0 : assert!(mk > 0);
535 0 : mk -= 1;
536 0 : if mk == 0 {
537 0 : return Ok(result); // limit reached
538 0 : }
539 0 : max_keys = Some(mk);
540 0 : }
541 : }
542 :
543 0 : result.prefixes.extend(
544 0 : prefixes
545 0 : .iter()
546 0 : .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
547 0 : );
548 :
549 0 : continuation_token = match response.next_continuation_token {
550 0 : Some(new_token) => Some(new_token),
551 0 : None => break,
552 0 : };
553 0 : }
554 0 :
555 0 : Ok(result)
556 0 : }
557 :
558 0 : async fn upload(
559 0 : &self,
560 0 : from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
561 0 : from_size_bytes: usize,
562 0 : to: &RemotePath,
563 0 : metadata: Option<StorageMetadata>,
564 0 : cancel: &CancellationToken,
565 0 : ) -> anyhow::Result<()> {
566 0 : let kind = RequestKind::Put;
567 0 : let _permit = self.permit(kind, cancel).await?;
568 :
569 0 : let started_at = start_measuring_requests(kind);
570 0 :
571 0 : let body = Body::wrap_stream(from);
572 0 : let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
573 :
574 0 : let upload = self
575 0 : .client
576 0 : .put_object()
577 0 : .bucket(self.bucket_name.clone())
578 0 : .key(self.relative_path_to_s3_object(to))
579 0 : .set_metadata(metadata.map(|m| m.0))
580 0 : .content_length(from_size_bytes.try_into()?)
581 0 : .body(bytes_stream)
582 0 : .send();
583 0 :
584 0 : let upload = tokio::time::timeout(self.timeout, upload);
585 :
586 0 : let res = tokio::select! {
587 0 : res = upload => res,
588 : _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
589 : };
590 :
591 0 : if let Ok(inner) = &res {
592 0 : // do not incl. timeouts as errors in metrics but cancellations
593 0 : let started_at = ScopeGuard::into_inner(started_at);
594 0 : metrics::BUCKET_METRICS
595 0 : .req_seconds
596 0 : .observe_elapsed(kind, inner, started_at);
597 0 : }
598 :
599 0 : match res {
600 0 : Ok(Ok(_put)) => Ok(()),
601 0 : Ok(Err(sdk)) => Err(sdk.into()),
602 0 : Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
603 : }
604 0 : }
605 :
606 2 : async fn copy(
607 2 : &self,
608 2 : from: &RemotePath,
609 2 : to: &RemotePath,
610 2 : cancel: &CancellationToken,
611 2 : ) -> anyhow::Result<()> {
612 0 : let kind = RequestKind::Copy;
613 0 : let _permit = self.permit(kind, cancel).await?;
614 :
615 0 : let timeout = tokio::time::sleep(self.timeout);
616 0 :
617 0 : let started_at = start_measuring_requests(kind);
618 0 :
619 0 : // we need to specify bucket_name as a prefix
620 0 : let copy_source = format!(
621 0 : "{}/{}",
622 0 : self.bucket_name,
623 0 : self.relative_path_to_s3_object(from)
624 0 : );
625 0 :
626 0 : let op = self
627 0 : .client
628 0 : .copy_object()
629 0 : .bucket(self.bucket_name.clone())
630 0 : .key(self.relative_path_to_s3_object(to))
631 0 : .copy_source(copy_source)
632 0 : .send();
633 :
634 0 : let res = tokio::select! {
635 0 : res = op => res,
636 : _ = timeout => return Err(TimeoutOrCancel::Timeout.into()),
637 : _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
638 : };
639 :
640 0 : let started_at = ScopeGuard::into_inner(started_at);
641 0 : metrics::BUCKET_METRICS
642 0 : .req_seconds
643 0 : .observe_elapsed(kind, &res, started_at);
644 0 :
645 0 : res?;
646 :
647 0 : Ok(())
648 0 : }
649 :
650 15 : async fn download(
651 15 : &self,
652 15 : from: &RemotePath,
653 15 : cancel: &CancellationToken,
654 15 : ) -> Result<Download, DownloadError> {
655 0 : // if prefix is not none then download file `prefix/from`
656 0 : // if prefix is none then download file `from`
657 0 : self.download_object(
658 0 : GetObjectRequest {
659 0 : bucket: self.bucket_name.clone(),
660 0 : key: self.relative_path_to_s3_object(from),
661 0 : range: None,
662 0 : },
663 0 : cancel,
664 0 : )
665 0 : .await
666 0 : }
667 :
668 10 : async fn download_byte_range(
669 10 : &self,
670 10 : from: &RemotePath,
671 10 : start_inclusive: u64,
672 10 : end_exclusive: Option<u64>,
673 10 : cancel: &CancellationToken,
674 10 : ) -> Result<Download, DownloadError> {
675 0 : // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
676 0 : // and needs both ends to be exclusive
677 0 : let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
678 0 : let range = Some(match end_inclusive {
679 0 : Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"),
680 0 : None => format!("bytes={start_inclusive}-"),
681 : });
682 :
683 0 : self.download_object(
684 0 : GetObjectRequest {
685 0 : bucket: self.bucket_name.clone(),
686 0 : key: self.relative_path_to_s3_object(from),
687 0 : range,
688 0 : },
689 0 : cancel,
690 0 : )
691 0 : .await
692 0 : }
693 :
694 102 : async fn delete_objects<'a>(
695 102 : &self,
696 102 : paths: &'a [RemotePath],
697 102 : cancel: &CancellationToken,
698 102 : ) -> anyhow::Result<()> {
699 0 : let kind = RequestKind::Delete;
700 0 : let permit = self.permit(kind, cancel).await?;
701 0 : let mut delete_objects = Vec::with_capacity(paths.len());
702 0 : for path in paths {
703 0 : let obj_id = ObjectIdentifier::builder()
704 0 : .set_key(Some(self.relative_path_to_s3_object(path)))
705 0 : .build()
706 0 : .context("convert path to oid")?;
707 0 : delete_objects.push(obj_id);
708 : }
709 :
710 0 : self.delete_oids(&permit, &delete_objects, cancel).await
711 0 : }
712 :
713 90 : async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
714 0 : let paths = std::array::from_ref(path);
715 0 : self.delete_objects(paths, cancel).await
716 0 : }
717 :
718 6 : async fn time_travel_recover(
719 6 : &self,
720 6 : prefix: Option<&RemotePath>,
721 6 : timestamp: SystemTime,
722 6 : done_if_after: SystemTime,
723 6 : cancel: &CancellationToken,
724 6 : ) -> Result<(), TimeTravelError> {
725 0 : let kind = RequestKind::TimeTravel;
726 0 : let permit = self.permit(kind, cancel).await?;
727 :
728 0 : let timestamp = DateTime::from(timestamp);
729 0 : let done_if_after = DateTime::from(done_if_after);
730 :
731 0 : tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
732 :
733 : // get the passed prefix or if it is not set use prefix_in_bucket value
734 0 : let prefix = prefix
735 0 : .map(|p| self.relative_path_to_s3_object(p))
736 0 : .or_else(|| self.prefix_in_bucket.clone());
737 0 :
738 0 : let warn_threshold = 3;
739 0 : let max_retries = 10;
740 0 : let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
741 :
742 0 : let mut key_marker = None;
743 0 : let mut version_id_marker = None;
744 0 : let mut versions_and_deletes = Vec::new();
745 :
746 : loop {
747 0 : let response = backoff::retry(
748 0 : || async {
749 0 : let op = self
750 0 : .client
751 0 : .list_object_versions()
752 0 : .bucket(self.bucket_name.clone())
753 0 : .set_prefix(prefix.clone())
754 0 : .set_key_marker(key_marker.clone())
755 0 : .set_version_id_marker(version_id_marker.clone())
756 0 : .send();
757 0 :
758 0 : tokio::select! {
759 0 : res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
760 : _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
761 : }
762 0 : },
763 0 : is_permanent,
764 0 : warn_threshold,
765 0 : max_retries,
766 0 : "listing object versions for time_travel_recover",
767 0 : cancel,
768 0 : )
769 0 : .await
770 0 : .ok_or_else(|| TimeTravelError::Cancelled)
771 0 : .and_then(|x| x)?;
772 :
773 0 : tracing::trace!(
774 0 : " Got List response version_id_marker={:?}, key_marker={:?}",
775 0 : response.version_id_marker,
776 0 : response.key_marker
777 0 : );
778 0 : let versions = response
779 0 : .versions
780 0 : .unwrap_or_default()
781 0 : .into_iter()
782 0 : .map(VerOrDelete::from_version);
783 0 : let deletes = response
784 0 : .delete_markers
785 0 : .unwrap_or_default()
786 0 : .into_iter()
787 0 : .map(VerOrDelete::from_delete_marker);
788 0 : itertools::process_results(versions.chain(deletes), |n_vds| {
789 0 : versions_and_deletes.extend(n_vds)
790 0 : })
791 0 : .map_err(TimeTravelError::Other)?;
792 12 : fn none_if_empty(v: Option<String>) -> Option<String> {
793 12 : v.filter(|v| !v.is_empty())
794 12 : }
795 0 : version_id_marker = none_if_empty(response.next_version_id_marker);
796 0 : key_marker = none_if_empty(response.next_key_marker);
797 0 : if version_id_marker.is_none() {
798 : // The final response is not supposed to be truncated
799 0 : if response.is_truncated.unwrap_or_default() {
800 0 : return Err(TimeTravelError::Other(anyhow::anyhow!(
801 0 : "Received truncated ListObjectVersions response for prefix={prefix:?}"
802 0 : )));
803 0 : }
804 0 : break;
805 0 : }
806 0 : // Limit the number of versions deletions, mostly so that we don't
807 0 : // keep requesting forever if the list is too long, as we'd put the
808 0 : // list in RAM.
809 0 : // Building a list of 100k entries that reaches the limit roughly takes
810 0 : // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
811 0 : const COMPLEXITY_LIMIT: usize = 100_000;
812 0 : if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
813 0 : return Err(TimeTravelError::TooManyVersions);
814 0 : }
815 : }
816 :
817 0 : tracing::info!(
818 0 : "Built list for time travel with {} versions and deletions",
819 0 : versions_and_deletes.len()
820 0 : );
821 :
822 : // Work on the list of references instead of the objects directly,
823 : // otherwise we get lifetime errors in the sort_by_key call below.
824 0 : let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
825 0 :
826 0 : versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
827 0 :
828 0 : let mut vds_for_key = HashMap::<_, Vec<_>>::new();
829 :
830 0 : for vd in &versions_and_deletes {
831 : let VerOrDelete {
832 0 : version_id, key, ..
833 0 : } = &vd;
834 0 : if version_id == "null" {
835 0 : return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \
836 0 : indicating either disabled versioning, or legacy objects with null version id values")));
837 0 : }
838 0 : tracing::trace!(
839 0 : "Parsing version key={key} version_id={version_id} kind={:?}",
840 0 : vd.kind
841 0 : );
842 :
843 0 : vds_for_key.entry(key).or_default().push(vd);
844 : }
845 0 : for (key, versions) in vds_for_key {
846 0 : let last_vd = versions.last().unwrap();
847 0 : if last_vd.last_modified > done_if_after {
848 0 : tracing::trace!("Key {key} has version later than done_if_after, skipping");
849 0 : continue;
850 0 : }
851 : // the version we want to restore to.
852 0 : let version_to_restore_to =
853 0 : match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) {
854 0 : Ok(v) => v,
855 0 : Err(e) => e,
856 : };
857 0 : if version_to_restore_to == versions.len() {
858 0 : tracing::trace!("Key {key} has no changes since timestamp, skipping");
859 0 : continue;
860 0 : }
861 0 : let mut do_delete = false;
862 0 : if version_to_restore_to == 0 {
863 : // All versions more recent, so the key didn't exist at the specified time point.
864 0 : tracing::trace!(
865 0 : "All {} versions more recent for {key}, deleting",
866 0 : versions.len()
867 0 : );
868 0 : do_delete = true;
869 : } else {
870 0 : match &versions[version_to_restore_to - 1] {
871 : VerOrDelete {
872 : kind: VerOrDeleteKind::Version,
873 0 : version_id,
874 : ..
875 : } => {
876 0 : tracing::trace!("Copying old version {version_id} for {key}...");
877 : // Restore the state to the last version by copying
878 0 : let source_id =
879 0 : format!("{}/{key}?versionId={version_id}", self.bucket_name);
880 0 :
881 0 : backoff::retry(
882 0 : || async {
883 0 : let op = self
884 0 : .client
885 0 : .copy_object()
886 0 : .bucket(self.bucket_name.clone())
887 0 : .key(key)
888 0 : .copy_source(&source_id)
889 0 : .send();
890 0 :
891 0 : tokio::select! {
892 0 : res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
893 : _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
894 : }
895 0 : },
896 0 : is_permanent,
897 0 : warn_threshold,
898 0 : max_retries,
899 0 : "copying object version for time_travel_recover",
900 0 : cancel,
901 0 : )
902 0 : .await
903 0 : .ok_or_else(|| TimeTravelError::Cancelled)
904 0 : .and_then(|x| x)?;
905 0 : tracing::info!(%version_id, %key, "Copied old version in S3");
906 : }
907 : VerOrDelete {
908 : kind: VerOrDeleteKind::DeleteMarker,
909 : ..
910 0 : } => {
911 0 : do_delete = true;
912 0 : }
913 : }
914 : };
915 0 : if do_delete {
916 0 : if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
917 : // Key has since been deleted (but there was some history), no need to do anything
918 0 : tracing::trace!("Key {key} already deleted, skipping.");
919 : } else {
920 0 : tracing::trace!("Deleting {key}...");
921 :
922 0 : let oid = ObjectIdentifier::builder()
923 0 : .key(key.to_owned())
924 0 : .build()
925 0 : .map_err(|e| TimeTravelError::Other(e.into()))?;
926 :
927 0 : self.delete_oids(&permit, &[oid], cancel)
928 0 : .await
929 0 : .map_err(|e| {
930 0 : // delete_oid0 will use TimeoutOrCancel
931 0 : if TimeoutOrCancel::caused_by_cancel(&e) {
932 0 : TimeTravelError::Cancelled
933 : } else {
934 0 : TimeTravelError::Other(e)
935 : }
936 0 : })?;
937 : }
938 0 : }
939 : }
940 0 : Ok(())
941 0 : }
942 : }
943 :
944 : /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
945 266 : fn start_counting_cancelled_wait(
946 266 : kind: RequestKind,
947 266 : ) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
948 266 : scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
949 0 : metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
950 266 : })
951 266 : }
952 :
953 : /// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
954 272 : fn start_measuring_requests(
955 272 : kind: RequestKind,
956 272 : ) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
957 272 : scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
958 0 : metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
959 0 : kind,
960 0 : AttemptOutcome::Cancelled,
961 0 : started_at,
962 0 : )
963 272 : })
964 272 : }
965 :
966 : // Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
967 : struct VerOrDelete {
968 : kind: VerOrDeleteKind,
969 : last_modified: DateTime,
970 : version_id: String,
971 : key: String,
972 : }
973 :
/// Discriminates the two kinds of entries a [`VerOrDelete`] can describe.
#[derive(Debug)]
enum VerOrDeleteKind {
    /// Entry built from an `ObjectVersion`.
    Version,
    /// Entry built from a `DeleteMarkerEntry`.
    DeleteMarker,
}
979 :
980 : impl VerOrDelete {
981 36 : fn with_kind(
982 36 : kind: VerOrDeleteKind,
983 36 : last_modified: Option<DateTime>,
984 36 : version_id: Option<String>,
985 36 : key: Option<String>,
986 36 : ) -> anyhow::Result<Self> {
987 36 : let lvk = (last_modified, version_id, key);
988 36 : let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
989 0 : anyhow::bail!(
990 0 : "One (or more) of last_modified, key, and id is None. \
991 0 : Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
992 0 : lvk.0,
993 0 : lvk.1,
994 0 : lvk.2,
995 0 : );
996 : };
997 36 : Ok(Self {
998 36 : kind,
999 36 : last_modified,
1000 36 : version_id,
1001 36 : key,
1002 36 : })
1003 36 : }
1004 28 : fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
1005 28 : Self::with_kind(
1006 28 : VerOrDeleteKind::Version,
1007 28 : v.last_modified,
1008 28 : v.version_id,
1009 28 : v.key,
1010 28 : )
1011 28 : }
1012 8 : fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
1013 8 : Self::with_kind(
1014 8 : VerOrDeleteKind::DeleteMarker,
1015 8 : v.last_modified,
1016 8 : v.version_id,
1017 8 : v.key,
1018 8 : )
1019 8 : }
1020 : }
1021 :
#[cfg(test)]
mod tests {
    use camino::Utf8Path;
    use std::num::NonZeroUsize;

    use crate::{RemotePath, S3Bucket, S3Config};

    /// Checks `relative_path_to_s3_object` for every combination of bucket prefix and
    /// remote path listed below.
    #[test]
    fn relative_path() {
        let remote_paths: Vec<RemotePath> = ["", "some/path", "some/path/"]
            .iter()
            .map(|raw| RemotePath::new(Utf8Path::new(raw)).expect("bad path"))
            .collect();

        // Each case pairs a `prefix_in_bucket` value with the expected S3 object key for
        // every entry of `remote_paths`, in the same order.
        let cases: [(Option<&str>, [&str; 3]); 5] = [
            (None, ["", "some/path", "some/path"]),
            (Some(""), ["/", "/some/path", "/some/path"]),
            (
                Some("test/prefix"),
                [
                    "test/prefix/",
                    "test/prefix/some/path",
                    "test/prefix/some/path",
                ],
            ),
            (
                Some("test/prefix/"),
                [
                    "test/prefix/",
                    "test/prefix/some/path",
                    "test/prefix/some/path",
                ],
            ),
            (
                Some("/test/prefix/"),
                [
                    "test/prefix/",
                    "test/prefix/some/path",
                    "test/prefix/some/path",
                ],
            ),
        ];

        for (prefix, expected_keys) in cases {
            let config = S3Config {
                bucket_name: "bucket".to_owned(),
                bucket_region: "region".to_owned(),
                prefix_in_bucket: prefix.map(str::to_string),
                endpoint: None,
                concurrency_limit: NonZeroUsize::new(100).unwrap(),
                max_keys_per_list_response: Some(5),
            };
            let storage =
                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");

            for (remote_path, expected) in remote_paths.iter().zip(expected_keys) {
                let result = storage.relative_path_to_s3_object(remote_path);
                assert_eq!(result, expected);
            }
        }
    }
}
|