Line data Source code
1 : //! A set of generic storage abstractions for the page server to use when backing up and restoring its state from external storage.
2 : //! No other modules from this tree are supposed to be used directly by external code.
3 : //!
4 : //! The [`RemoteStorage`] trait is a CRUD-like generic abstraction for adapting external storages, with a few implementations:
5 : //! * [`local_fs`] uses the local file system as external storage
6 : //! * [`s3_bucket`] uses an AWS S3 bucket as external storage
7 : //! * [`azure_blob`] uses Azure Blob Storage as external storage
8 : //!
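//! A minimal usage sketch (illustrative only: the local path and the error handling are
//! assumptions, not a prescribed setup), to be run inside an async context:
//!
//! ```ignore
//! use camino::Utf8PathBuf;
//! use tokio_util::sync::CancellationToken;
//!
//! // Build a local-filesystem-backed storage from a config...
//! let config = RemoteStorageConfig {
//!     storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from("/tmp/remote_storage")),
//!     timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
//! };
//! let storage = GenericRemoteStorage::from_config(&config)?;
//!
//! // ...and list every key under the root prefix.
//! let cancel = CancellationToken::new();
//! let keys = storage.list_files(None, None, &cancel).await?;
//! ```
//!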
9 : #![deny(unsafe_code)]
10 : #![deny(clippy::undocumented_unsafe_blocks)]
11 :
12 : mod azure_blob;
13 : mod error;
14 : mod local_fs;
15 : mod s3_bucket;
16 : mod simulate_failures;
17 : mod support;
18 :
19 : use std::{
20 : collections::HashMap,
21 : fmt::Debug,
22 : num::{NonZeroU32, NonZeroUsize},
23 : pin::Pin,
24 : sync::Arc,
25 : time::{Duration, SystemTime},
26 : };
27 :
28 : use anyhow::{bail, Context};
29 : use camino::{Utf8Path, Utf8PathBuf};
30 :
31 : use bytes::Bytes;
32 : use futures::stream::Stream;
33 : use serde::{Deserialize, Serialize};
34 : use tokio::sync::Semaphore;
35 : use tokio_util::sync::CancellationToken;
36 : use toml_edit::Item;
37 : use tracing::info;
38 :
39 : pub use self::{
40 : azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket,
41 : simulate_failures::UnreliableWrapper,
42 : };
43 : use s3_bucket::RequestKind;
44 :
45 : /// The Azure SDK's ETag type is a simple String wrapper: we re-export it instead of duplicating it here.
46 : pub use azure_core::Etag;
47 :
48 : pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
49 :
50 : /// Currently, sync happens with AWS S3, which has two limits on requests per second:
51 : /// ~200 RPS for IAM services
52 : /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
53 : /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
54 : /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
55 : pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
56 : /// We set this a bit low because we currently buffer the entire file into RAM.
57 : ///
58 : /// Here, a limit of at most 20k concurrent connections was noted:
59 : /// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
60 : pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
61 : /// No limit on the client side, which currently means 1000 for AWS S3.
62 : /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
63 : pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
64 :
65 : /// As defined in the S3 docs.
66 : pub const MAX_KEYS_PER_DELETE: usize = 1000;
67 :
68 : const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
69 :
70 : /// Path on the remote storage, relative to some inner prefix.
71 : /// The prefix is an implementation detail that allows representing local paths
72 : /// as remote ones, stripping the local storage prefix away.
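/// A small illustrative sketch (the paths below are made up):
///
/// ```ignore
/// let path = RemotePath::new(Utf8Path::new("tenants/123/timelines"))?; // must be relative
/// assert_eq!(
///     path.with_base(Utf8Path::new("/srv/pageserver")),
///     Utf8PathBuf::from("/srv/pageserver/tenants/123/timelines"),
/// );
/// ```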
73 : #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
74 : pub struct RemotePath(Utf8PathBuf);
75 :
76 : impl Serialize for RemotePath {
77 0 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
78 0 : where
79 0 : S: serde::Serializer,
80 0 : {
81 0 : serializer.collect_str(self)
82 0 : }
83 : }
84 :
85 : impl<'de> Deserialize<'de> for RemotePath {
86 0 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
87 0 : where
88 0 : D: serde::Deserializer<'de>,
89 0 : {
90 0 : let str = String::deserialize(deserializer)?;
91 0 : Ok(Self(Utf8PathBuf::from(&str)))
92 0 : }
93 : }
94 :
95 : impl std::fmt::Display for RemotePath {
96 16 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
97 16 : std::fmt::Display::fmt(&self.0, f)
98 16 : }
99 : }
100 :
101 : impl RemotePath {
102 2340 : pub fn new(relative_path: &Utf8Path) -> anyhow::Result<Self> {
103 2340 : anyhow::ensure!(
104 2340 : relative_path.is_relative(),
105 4 : "Path {relative_path:?} is not relative"
106 : );
107 2336 : Ok(Self(relative_path.to_path_buf()))
108 2340 : }
109 :
110 1248 : pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
111 1248 : Self::new(Utf8Path::new(relative_path))
112 1248 : }
113 :
114 2098 : pub fn with_base(&self, base_path: &Utf8Path) -> Utf8PathBuf {
115 2098 : base_path.join(&self.0)
116 2098 : }
117 :
118 14 : pub fn object_name(&self) -> Option<&str> {
119 14 : self.0.file_name()
120 14 : }
121 :
122 99 : pub fn join(&self, segment: &Utf8Path) -> Self {
123 99 : Self(self.0.join(segment))
124 99 : }
125 :
126 493 : pub fn get_path(&self) -> &Utf8PathBuf {
127 493 : &self.0
128 493 : }
129 :
130 0 : pub fn extension(&self) -> Option<&str> {
131 0 : self.0.extension()
132 0 : }
133 :
134 12 : pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
135 12 : self.0.strip_prefix(&p.0)
136 12 : }
137 : }
138 :
139 : /// We don't need callers to be able to pass arbitrary delimiters: just control
140 : /// whether listings will use a '/' separator or not.
141 : ///
142 : /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
143 : /// NoDelimiter mode will only populate `keys`.
144 : pub enum ListingMode {
145 : WithDelimiter,
146 : NoDelimiter,
147 : }
148 :
149 : #[derive(Default)]
150 : pub struct Listing {
151 : pub prefixes: Vec<RemotePath>,
152 : pub keys: Vec<RemotePath>,
153 : }
154 :
155 : /// Storage (potentially remote) API to manage its state.
156 : /// This storage tries to be unaware of any layered repository context,
157 : /// providing basic CRUD operations for storage files.
158 : #[allow(async_fn_in_trait)]
159 : pub trait RemoteStorage: Send + Sync + 'static {
160 : /// Lists all top-level subdirectories for a given prefix.
161 : /// Note: here we assume that if a prefix is passed, it was obtained via `remote_object_id`,
162 : /// which already takes into account any kind of global prefix (`prefix_in_bucket` for S3 or `storage_root` for LocalFs),
163 : /// so this method doesn't need to.
164 9 : async fn list_prefixes(
165 9 : &self,
166 9 : prefix: Option<&RemotePath>,
167 9 : cancel: &CancellationToken,
168 9 : ) -> Result<Vec<RemotePath>, DownloadError> {
169 0 : let result = self
170 0 : .list(prefix, ListingMode::WithDelimiter, None, cancel)
171 0 : .await?
172 : .prefixes;
173 0 : Ok(result)
174 0 : }
175 : /// Lists all files in the directory "recursively"
176 : /// (not really recursively, because AWS has a flat namespace).
177 : /// Note: this is subtly different from `list_prefixes`,
178 : /// because it lists files instead of listing
179 : /// names sharing common prefixes.
180 : /// For example,
181 : /// list_files("foo/bar") = ["foo/bar/cat123.txt",
182 : /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
183 : /// whereas,
184 : /// list_prefixes("foo/bar/") = ["cat", "dog"]
185 : /// See `test_real_s3.rs` for more details.
186 : ///
187 : /// max_keys limits max number of keys returned; None means unlimited.
188 27 : async fn list_files(
189 27 : &self,
190 27 : prefix: Option<&RemotePath>,
191 27 : max_keys: Option<NonZeroU32>,
192 27 : cancel: &CancellationToken,
193 27 : ) -> Result<Vec<RemotePath>, DownloadError> {
194 6 : let result = self
195 6 : .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
196 12 : .await?
197 : .keys;
198 6 : Ok(result)
199 6 : }
200 :
201 : async fn list(
202 : &self,
203 : prefix: Option<&RemotePath>,
204 : _mode: ListingMode,
205 : max_keys: Option<NonZeroU32>,
206 : cancel: &CancellationToken,
207 : ) -> Result<Listing, DownloadError>;
208 :
209 : /// Streams the local file contents into the remote storage entry.
210 : ///
211 : /// If the operation fails because of timeout or cancellation, the root cause of the error will be
212 : /// set to `TimeoutOrCancel`.
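///
/// A sketch of how a caller might build the `from` stream for a small in-memory payload
/// (illustrative only; `storage`, `remote_path` and `cancel` are assumed to exist):
///
/// ```ignore
/// let body = bytes::Bytes::from_static(b"hello");
/// let len = body.len();
/// let stream = futures::stream::once(futures::future::ready(Ok::<bytes::Bytes, std::io::Error>(body)));
/// storage.upload(stream, len, &remote_path, None, &cancel).await?;
/// ```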
213 : async fn upload(
214 : &self,
215 : from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
216 : // S3 PUT request requires the content length to be specified,
217 : // otherwise it starts to fail as the concurrent connection count increases.
218 : data_size_bytes: usize,
219 : to: &RemotePath,
220 : metadata: Option<StorageMetadata>,
221 : cancel: &CancellationToken,
222 : ) -> anyhow::Result<()>;
223 :
224 : /// Streams the remote storage entry contents.
225 : ///
226 : /// The returned download stream will obey initial timeout and cancellation signal by erroring
227 : /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
228 : /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
229 : ///
230 : /// Returns the metadata, if any was stored with the file previously.
231 : async fn download(
232 : &self,
233 : from: &RemotePath,
234 : cancel: &CancellationToken,
235 : ) -> Result<Download, DownloadError>;
236 :
237 : /// Streams a given byte range of the remote storage entry contents.
238 : ///
239 : /// The returned download stream will obey initial timeout and cancellation signal by erroring
240 : /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
241 : /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
242 : ///
243 : /// Returns the metadata, if any was stored with the file previously.
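///
/// An illustrative sketch: fetch only the first kibibyte of an object
/// (`storage`, `remote_path` and `cancel` are assumed to exist):
///
/// ```ignore
/// let download = storage
///     .download_byte_range(&remote_path, 0, Some(1024), &cancel)
///     .await?;
/// ```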
244 : async fn download_byte_range(
245 : &self,
246 : from: &RemotePath,
247 : start_inclusive: u64,
248 : end_exclusive: Option<u64>,
249 : cancel: &CancellationToken,
250 : ) -> Result<Download, DownloadError>;
251 :
252 : /// Delete a single path from remote storage.
253 : ///
254 : /// If the operation fails because of timeout or cancellation, the root cause of the error will be
255 : /// set to `TimeoutOrCancel`. In such a situation it is unknown whether the deletion went through.
256 : async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>;
257 :
258 : /// Delete multiple paths from remote storage.
259 : ///
260 : /// If the operation fails because of timeout or cancellation, the root cause of the error will be
261 : /// set to `TimeoutOrCancel`. In such a situation it is unknown which deletions, if any, went
262 : /// through.
263 : async fn delete_objects<'a>(
264 : &self,
265 : paths: &'a [RemotePath],
266 : cancel: &CancellationToken,
267 : ) -> anyhow::Result<()>;
268 :
269 : /// Copy a remote object inside a bucket from one path to another.
270 : async fn copy(
271 : &self,
272 : from: &RemotePath,
273 : to: &RemotePath,
274 : cancel: &CancellationToken,
275 : ) -> anyhow::Result<()>;
276 :
277 : /// Resets the content of everything with the given prefix to the given state
278 : async fn time_travel_recover(
279 : &self,
280 : prefix: Option<&RemotePath>,
281 : timestamp: SystemTime,
282 : done_if_after: SystemTime,
283 : cancel: &CancellationToken,
284 : ) -> Result<(), TimeTravelError>;
285 : }
286 :
287 : /// DownloadStream is sensitive to the timeout and cancellation used with the original
288 : /// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
289 : /// with `tokio::io::copy_buf`.
290 : // This has 'static because safekeepers do not use cancellation tokens (yet)
291 : pub type DownloadStream =
292 : Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>>;
293 :
294 : pub struct Download {
295 : pub download_stream: DownloadStream,
296 : /// The last time the file was modified (`last-modified` HTTP header)
297 : pub last_modified: SystemTime,
298 : /// A way to identify this specific version of the resource (`etag` HTTP header)
299 : pub etag: Etag,
300 : /// Extra key-value data, associated with the current remote file.
301 : pub metadata: Option<StorageMetadata>,
302 : }
303 :
304 : impl Debug for Download {
305 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
306 0 : f.debug_struct("Download")
307 0 : .field("metadata", &self.metadata)
308 0 : .finish()
309 0 : }
310 : }
311 :
312 : /// Every storage kind currently supported.
313 : /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
314 : #[derive(Clone)]
315 : // Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
316 : pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
317 : LocalFs(LocalFs),
318 : AwsS3(Arc<S3Bucket>),
319 : AzureBlob(Arc<AzureBlobStorage>),
320 : Unreliable(Other),
321 : }
322 :
323 : impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
324 108 : pub async fn list(
325 108 : &self,
326 108 : prefix: Option<&RemotePath>,
327 108 : mode: ListingMode,
328 108 : max_keys: Option<NonZeroU32>,
329 108 : cancel: &CancellationToken,
330 108 : ) -> anyhow::Result<Listing, DownloadError> {
331 108 : match self {
332 108 : Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
333 0 : Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
334 0 : Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await,
335 0 : Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await,
336 : }
337 108 : }
338 :
339 : // A function for listing all the files in a "directory"
340 : // Example:
341 : // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
342 : //
343 : // max_keys limits max number of keys returned; None means unlimited.
344 6 : pub async fn list_files(
345 6 : &self,
346 6 : folder: Option<&RemotePath>,
347 6 : max_keys: Option<NonZeroU32>,
348 6 : cancel: &CancellationToken,
349 6 : ) -> Result<Vec<RemotePath>, DownloadError> {
350 6 : match self {
351 12 : Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
352 0 : Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
353 0 : Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
354 0 : Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
355 : }
356 6 : }
357 :
358 : // Lists common *prefixes* of files, if any.
359 : // Example:
360 : // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
361 0 : pub async fn list_prefixes(
362 0 : &self,
363 0 : prefix: Option<&RemotePath>,
364 0 : cancel: &CancellationToken,
365 0 : ) -> Result<Vec<RemotePath>, DownloadError> {
366 0 : match self {
367 0 : Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
368 0 : Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
369 0 : Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
370 0 : Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
371 : }
372 0 : }
373 :
374 : /// See [`RemoteStorage::upload`]
375 1888 : pub async fn upload(
376 1888 : &self,
377 1888 : from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
378 1888 : data_size_bytes: usize,
379 1888 : to: &RemotePath,
380 1888 : metadata: Option<StorageMetadata>,
381 1888 : cancel: &CancellationToken,
382 1888 : ) -> anyhow::Result<()> {
383 1888 : match self {
384 26152 : Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
385 0 : Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
386 0 : Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
387 76 : Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
388 : }
389 1781 : }
390 :
391 42 : pub async fn download(
392 42 : &self,
393 42 : from: &RemotePath,
394 42 : cancel: &CancellationToken,
395 42 : ) -> Result<Download, DownloadError> {
396 42 : match self {
397 70 : Self::LocalFs(s) => s.download(from, cancel).await,
398 0 : Self::AwsS3(s) => s.download(from, cancel).await,
399 0 : Self::AzureBlob(s) => s.download(from, cancel).await,
400 0 : Self::Unreliable(s) => s.download(from, cancel).await,
401 : }
402 42 : }
403 :
404 0 : pub async fn download_byte_range(
405 0 : &self,
406 0 : from: &RemotePath,
407 0 : start_inclusive: u64,
408 0 : end_exclusive: Option<u64>,
409 0 : cancel: &CancellationToken,
410 0 : ) -> Result<Download, DownloadError> {
411 0 : match self {
412 0 : Self::LocalFs(s) => {
413 0 : s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
414 0 : .await
415 : }
416 0 : Self::AwsS3(s) => {
417 0 : s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
418 0 : .await
419 : }
420 0 : Self::AzureBlob(s) => {
421 0 : s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
422 0 : .await
423 : }
424 0 : Self::Unreliable(s) => {
425 0 : s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
426 0 : .await
427 : }
428 : }
429 0 : }
430 :
431 : /// See [`RemoteStorage::delete`]
432 2 : pub async fn delete(
433 2 : &self,
434 2 : path: &RemotePath,
435 2 : cancel: &CancellationToken,
436 2 : ) -> anyhow::Result<()> {
437 2 : match self {
438 2 : Self::LocalFs(s) => s.delete(path, cancel).await,
439 0 : Self::AwsS3(s) => s.delete(path, cancel).await,
440 0 : Self::AzureBlob(s) => s.delete(path, cancel).await,
441 0 : Self::Unreliable(s) => s.delete(path, cancel).await,
442 : }
443 2 : }
444 :
445 : /// See [`RemoteStorage::delete_objects`]
446 6 : pub async fn delete_objects(
447 6 : &self,
448 6 : paths: &[RemotePath],
449 6 : cancel: &CancellationToken,
450 6 : ) -> anyhow::Result<()> {
451 6 : match self {
452 6 : Self::LocalFs(s) => s.delete_objects(paths, cancel).await,
453 0 : Self::AwsS3(s) => s.delete_objects(paths, cancel).await,
454 0 : Self::AzureBlob(s) => s.delete_objects(paths, cancel).await,
455 0 : Self::Unreliable(s) => s.delete_objects(paths, cancel).await,
456 : }
457 6 : }
458 :
459 : /// See [`RemoteStorage::copy`]
460 0 : pub async fn copy_object(
461 0 : &self,
462 0 : from: &RemotePath,
463 0 : to: &RemotePath,
464 0 : cancel: &CancellationToken,
465 0 : ) -> anyhow::Result<()> {
466 0 : match self {
467 0 : Self::LocalFs(s) => s.copy(from, to, cancel).await,
468 0 : Self::AwsS3(s) => s.copy(from, to, cancel).await,
469 0 : Self::AzureBlob(s) => s.copy(from, to, cancel).await,
470 0 : Self::Unreliable(s) => s.copy(from, to, cancel).await,
471 : }
472 0 : }
473 :
474 : /// See [`RemoteStorage::time_travel_recover`].
475 0 : pub async fn time_travel_recover(
476 0 : &self,
477 0 : prefix: Option<&RemotePath>,
478 0 : timestamp: SystemTime,
479 0 : done_if_after: SystemTime,
480 0 : cancel: &CancellationToken,
481 0 : ) -> Result<(), TimeTravelError> {
482 0 : match self {
483 0 : Self::LocalFs(s) => {
484 0 : s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
485 0 : .await
486 : }
487 0 : Self::AwsS3(s) => {
488 0 : s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
489 0 : .await
490 : }
491 0 : Self::AzureBlob(s) => {
492 0 : s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
493 0 : .await
494 : }
495 0 : Self::Unreliable(s) => {
496 0 : s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
497 0 : .await
498 : }
499 : }
500 0 : }
501 : }
502 :
503 : impl GenericRemoteStorage {
504 148 : pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
505 148 : let timeout = storage_config.timeout;
506 148 : Ok(match &storage_config.storage {
507 124 : RemoteStorageKind::LocalFs(path) => {
508 124 : info!("Using fs root '{path}' as a remote storage");
509 124 : Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
510 : }
511 18 : RemoteStorageKind::AwsS3(s3_config) => {
512 18 : // The profile and access key id are only printed here for debugging purposes,
513 18 : // their values don't indicate the eventually taken choice for auth.
514 18 : let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "<none>".into());
515 18 : let access_key_id =
516 18 : std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
517 18 : info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
518 18 : s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
519 18 : Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
520 : }
521 6 : RemoteStorageKind::AzureContainer(azure_config) => {
522 6 : info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
523 6 : azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
524 6 : Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
525 : }
526 : })
527 148 : }
528 :
529 4 : pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
530 4 : Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
531 4 : }
532 :
533 : /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
534 932 : pub async fn upload_storage_object(
535 932 : &self,
536 932 : from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
537 932 : from_size_bytes: usize,
538 932 : to: &RemotePath,
539 932 : cancel: &CancellationToken,
540 932 : ) -> anyhow::Result<()> {
541 932 : self.upload(from, from_size_bytes, to, None, cancel)
542 2847 : .await
543 927 : .with_context(|| {
544 0 : format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
545 927 : })
546 927 : }
547 :
548 : /// Downloads the storage object.
549 : /// `byte_range` can be specified to download only a part of the file, if needed.
550 0 : pub async fn download_storage_object(
551 0 : &self,
552 0 : byte_range: Option<(u64, Option<u64>)>,
553 0 : from: &RemotePath,
554 0 : cancel: &CancellationToken,
555 0 : ) -> Result<Download, DownloadError> {
556 0 : match byte_range {
557 0 : Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
558 0 : None => self.download(from, cancel).await,
559 : }
560 0 : }
561 : }
562 :
563 : /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
564 : /// Immutable, cannot be changed once the file is created.
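/// A tiny illustrative example (the keys and values below are made up):
///
/// ```ignore
/// let metadata = StorageMetadata::from([("generation", "7"), ("compressed", "true")]);
/// ```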
565 : #[derive(Debug, Clone, PartialEq, Eq)]
566 : pub struct StorageMetadata(HashMap<String, String>);
567 :
568 : impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
569 0 : fn from(arr: [(&str, &str); N]) -> Self {
570 0 : let map: HashMap<String, String> = arr
571 0 : .iter()
572 0 : .map(|(k, v)| (k.to_string(), v.to_string()))
573 0 : .collect();
574 0 : Self(map)
575 0 : }
576 : }
577 :
578 : /// External backup storage configuration, enough for creating a client for that storage.
579 : #[derive(Debug, Clone, PartialEq, Eq)]
580 : pub struct RemoteStorageConfig {
581 : /// The storage connection configuration.
582 : pub storage: RemoteStorageKind,
583 : /// A common timeout enforced for all requests after concurrency limiter permit has been
584 : /// acquired.
585 : pub timeout: Duration,
586 : }
587 :
588 : /// A kind of a remote storage to connect to, with its connection configuration.
589 : #[derive(Debug, Clone, PartialEq, Eq)]
590 : pub enum RemoteStorageKind {
591 : /// Storage based on local file system.
592 : /// Specify a root folder to place all stored files into.
593 : LocalFs(Utf8PathBuf),
594 : /// AWS S3 based storage, storing all files in the S3 bucket
595 : /// specified by the config
596 : AwsS3(S3Config),
597 : /// Azure Blob based storage, storing all files in the container
598 : /// specified by the config
599 : AzureContainer(AzureConfig),
600 : }
601 :
602 : /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
603 : #[derive(Clone, PartialEq, Eq)]
604 : pub struct S3Config {
605 : /// Name of the bucket to connect to.
606 : pub bucket_name: String,
607 : /// The region where the bucket is located.
608 : pub bucket_region: String,
609 : /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
610 : pub prefix_in_bucket: Option<String>,
611 : /// A base URL to send S3 requests to.
612 : /// By default, the endpoint is derived from the region name, assuming it is
613 : /// an AWS S3 region name, and erroring on an invalid one.
614 : /// Endpoint provides a way to support other S3 flavors and their regions.
615 : ///
616 : /// Example: `http://127.0.0.1:5000`
617 : pub endpoint: Option<String>,
618 : /// AWS S3 has various limits on its API calls; we must not exceed them.
619 : /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
620 : pub concurrency_limit: NonZeroUsize,
621 : pub max_keys_per_list_response: Option<i32>,
622 : }
623 :
624 : impl Debug for S3Config {
625 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
626 0 : f.debug_struct("S3Config")
627 0 : .field("bucket_name", &self.bucket_name)
628 0 : .field("bucket_region", &self.bucket_region)
629 0 : .field("prefix_in_bucket", &self.prefix_in_bucket)
630 0 : .field("concurrency_limit", &self.concurrency_limit)
631 0 : .field(
632 0 : "max_keys_per_list_response",
633 0 : &self.max_keys_per_list_response,
634 0 : )
635 0 : .finish()
636 0 : }
637 : }
638 :
639 : /// Azure container coordinates and access credentials to manage the container contents (read and write).
640 : #[derive(Clone, PartialEq, Eq)]
641 : pub struct AzureConfig {
642 : /// Name of the container to connect to.
643 : pub container_name: String,
644 : /// The region where the container is located.
645 : pub container_region: String,
646 : /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
647 : pub prefix_in_container: Option<String>,
648 : /// Azure has various limits on its API calls; we must not exceed them.
649 : /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
650 : pub concurrency_limit: NonZeroUsize,
651 : pub max_keys_per_list_response: Option<i32>,
652 : }
653 :
654 : impl Debug for AzureConfig {
655 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
656 0 : f.debug_struct("AzureConfig")
657 0 : .field("bucket_name", &self.container_name)
658 0 : .field("bucket_region", &self.container_region)
659 0 : .field("prefix_in_bucket", &self.prefix_in_container)
660 0 : .field("concurrency_limit", &self.concurrency_limit)
661 0 : .field(
662 0 : "max_keys_per_list_response",
663 0 : &self.max_keys_per_list_response,
664 0 : )
665 0 : .finish()
666 0 : }
667 : }
668 :
669 : impl RemoteStorageConfig {
670 : pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
671 :
672 22 : pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
673 22 : let local_path = toml.get("local_path");
674 22 : let bucket_name = toml.get("bucket_name");
675 22 : let bucket_region = toml.get("bucket_region");
676 22 : let container_name = toml.get("container_name");
677 22 : let container_region = toml.get("container_region");
678 :
679 22 : let use_azure = container_name.is_some() && container_region.is_some();
680 :
681 22 : let default_concurrency_limit = if use_azure {
682 0 : DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
683 : } else {
684 22 : DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
685 : };
686 22 : let concurrency_limit = NonZeroUsize::new(
687 22 : parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
688 22 : )
689 22 : .context("Failed to parse 'concurrency_limit' as a positive integer")?;
690 :
691 22 : let max_keys_per_list_response =
692 22 : parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
693 22 : .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
694 22 : .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
695 :
696 22 : let endpoint = toml
697 22 : .get("endpoint")
698 22 : .map(|endpoint| parse_toml_string("endpoint", endpoint))
699 22 : .transpose()?;
700 :
701 22 : let timeout = toml
702 22 : .get("timeout")
703 22 : .map(|timeout| {
704 2 : timeout
705 2 : .as_str()
706 2 : .ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
707 22 : })
708 22 : .transpose()
709 22 : .and_then(|timeout| {
710 22 : timeout
711 22 : .map(humantime::parse_duration)
712 22 : .transpose()
713 22 : .map_err(anyhow::Error::new)
714 22 : })
715 22 : .context("parse timeout")?
716 22 : .unwrap_or(Self::DEFAULT_TIMEOUT);
717 22 :
718 22 : if timeout < Duration::from_secs(1) {
719 0 : bail!("timeout was specified as {timeout:?} which is too low");
720 22 : }
721 :
722 12 : let storage = match (
723 22 : local_path,
724 22 : bucket_name,
725 22 : bucket_region,
726 22 : container_name,
727 22 : container_region,
728 : ) {
729 : // neither 'local_path' nor 'bucket_name' options are provided, so consider this remote storage disabled
730 10 : (None, None, None, None, None) => return Ok(None),
731 : (_, Some(_), None, ..) => {
732 0 : bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
733 : }
734 : (_, None, Some(_), ..) => {
735 0 : bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
736 : }
737 6 : (None, Some(bucket_name), Some(bucket_region), ..) => {
738 6 : RemoteStorageKind::AwsS3(S3Config {
739 6 : bucket_name: parse_toml_string("bucket_name", bucket_name)?,
740 6 : bucket_region: parse_toml_string("bucket_region", bucket_region)?,
741 6 : prefix_in_bucket: toml
742 6 : .get("prefix_in_bucket")
743 6 : .map(|prefix_in_bucket| {
744 6 : parse_toml_string("prefix_in_bucket", prefix_in_bucket)
745 6 : })
746 6 : .transpose()?,
747 6 : endpoint,
748 6 : concurrency_limit,
749 6 : max_keys_per_list_response,
750 : })
751 : }
752 : (_, _, _, Some(_), None) => {
753 0 : bail!("'container_name' option is mandatory if 'container_region' is given ")
754 : }
755 : (_, _, _, None, Some(_)) => {
756 0 : bail!("'container_name' option is mandatory if 'container_region' is given ")
757 : }
758 0 : (None, None, None, Some(container_name), Some(container_region)) => {
759 0 : RemoteStorageKind::AzureContainer(AzureConfig {
760 0 : container_name: parse_toml_string("container_name", container_name)?,
761 0 : container_region: parse_toml_string("container_region", container_region)?,
762 0 : prefix_in_container: toml
763 0 : .get("prefix_in_container")
764 0 : .map(|prefix_in_container| {
765 0 : parse_toml_string("prefix_in_container", prefix_in_container)
766 0 : })
767 0 : .transpose()?,
768 0 : concurrency_limit,
769 0 : max_keys_per_list_response,
770 : })
771 : }
772 6 : (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
773 6 : Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
774 : ),
775 : (Some(_), Some(_), ..) => {
776 0 : bail!("'local_path' and 'bucket_name' are mutually exclusive")
777 : }
778 : (Some(_), _, _, Some(_), Some(_)) => {
779 0 : bail!("local_path and 'container_name' are mutually exclusive")
780 : }
781 : };
782 :
783 12 : Ok(Some(RemoteStorageConfig { storage, timeout }))
784 22 : }
785 : }
786 :
787 : // Helper functions to parse a toml Item
788 44 : fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
789 44 : where
790 44 : I: TryFrom<i64, Error = E>,
791 44 : E: std::error::Error + Send + Sync + 'static,
792 44 : {
793 44 : let toml_integer = match item.get(name) {
794 4 : Some(item) => item
795 4 : .as_integer()
796 4 : .with_context(|| format!("configure option {name} is not an integer"))?,
797 40 : None => return Ok(None),
798 : };
799 :
800 4 : I::try_from(toml_integer)
801 4 : .map(Some)
802 4 : .with_context(|| format!("configure option {name} is too large"))
803 44 : }
804 :
805 30 : fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
806 30 : let s = item
807 30 : .as_str()
808 30 : .with_context(|| format!("configure option {name} is not a string"))?;
809 30 : Ok(s.to_string())
810 30 : }
811 :
812 : struct ConcurrencyLimiter {
813 : // Every request to S3 can be throttled or cancelled if a certain number of requests per second is exceeded.
814 : // The same goes for IAM, which is queried before every S3 request, if enabled. IAM has an even lower RPS threshold.
815 : // This limiter helps ensure we don't exceed those thresholds.
816 : write: Arc<Semaphore>,
817 : read: Arc<Semaphore>,
818 : }
819 :
820 : impl ConcurrencyLimiter {
821 373 : fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
822 373 : match kind {
823 32 : RequestKind::Get => &self.read,
824 153 : RequestKind::Put => &self.write,
825 30 : RequestKind::List => &self.read,
826 149 : RequestKind::Delete => &self.write,
827 3 : RequestKind::Copy => &self.write,
828 6 : RequestKind::TimeTravel => &self.write,
829 : }
830 373 : }
831 :
832 348 : async fn acquire(
833 348 : &self,
834 348 : kind: RequestKind,
835 348 : ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
836 0 : self.for_kind(kind).acquire().await
837 0 : }
838 :
839 25 : async fn acquire_owned(
840 25 : &self,
841 25 : kind: RequestKind,
842 25 : ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
843 0 : Arc::clone(self.for_kind(kind)).acquire_owned().await
844 0 : }
845 :
846 34 : fn new(limit: usize) -> ConcurrencyLimiter {
847 34 : Self {
848 34 : read: Arc::new(Semaphore::new(limit)),
849 34 : write: Arc::new(Semaphore::new(limit)),
850 34 : }
851 34 : }
852 : }
853 :
854 : #[cfg(test)]
855 : mod tests {
856 : use super::*;
857 :
858 : #[test]
859 2 : fn test_object_name() {
860 2 : let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
861 2 : assert_eq!(k.object_name(), Some("c"));
862 :
863 2 : let k = RemotePath::new(Utf8Path::new("a/b/c/")).unwrap();
864 2 : assert_eq!(k.object_name(), Some("c"));
865 :
866 2 : let k = RemotePath::new(Utf8Path::new("a/")).unwrap();
867 2 : assert_eq!(k.object_name(), Some("a"));
868 :
869 : // XXX is it impossible to have an empty key?
870 2 : let k = RemotePath::new(Utf8Path::new("")).unwrap();
871 2 : assert_eq!(k.object_name(), None);
872 2 : }
873 :
874 : #[test]
875 2 : fn remote_path_cannot_be_created_from_absolute_ones() {
876 2 : let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
877 2 : assert_eq!(err.to_string(), "Path \"/\" is not relative");
878 2 : }
879 :
880 : #[test]
881 2 : fn parse_localfs_config_with_timeout() {
882 2 : let input = "local_path = '.'
883 2 : timeout = '5s'";
884 2 :
885 2 : let toml = input.parse::<toml_edit::Document>().unwrap();
886 2 :
887 2 : let config = RemoteStorageConfig::from_toml(toml.as_item())
888 2 : .unwrap()
889 2 : .expect("it exists");
890 2 :
891 2 : assert_eq!(
892 2 : config,
893 2 : RemoteStorageConfig {
894 2 : storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
895 2 : timeout: Duration::from_secs(5)
896 2 : }
897 2 : );
898 2 : }
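
    // A hedged sketch of parsing an S3 configuration via `RemoteStorageConfig::from_toml`:
    // the bucket name and region below are illustrative values, not taken from a real deployment.
    #[test]
    fn parse_s3_config_with_defaults() {
        let input = "bucket_name = 'some-bucket'
        bucket_region = 'eu-central-1'";

        let toml = input.parse::<toml_edit::Document>().unwrap();

        let config = RemoteStorageConfig::from_toml(toml.as_item())
            .unwrap()
            .expect("it exists");

        assert_eq!(config.timeout, RemoteStorageConfig::DEFAULT_TIMEOUT);
        match config.storage {
            RemoteStorageKind::AwsS3(s3) => {
                assert_eq!(s3.bucket_name, "some-bucket");
                assert_eq!(s3.bucket_region, "eu-central-1");
                assert_eq!(s3.prefix_in_bucket, None);
                assert_eq!(s3.endpoint, None);
                assert_eq!(
                    s3.concurrency_limit.get(),
                    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
                );
                assert_eq!(
                    s3.max_keys_per_list_response,
                    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
                );
            }
            other => panic!("expected an AwsS3 config, got {other:?}"),
        }
    }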
899 : }