LCOV - code coverage report
Current view: top level - pageserver/src/tenant - upload_queue.rs (source / functions) Coverage Total Hit
Test: 07bee600374ccd486c69370d0972d9035964fe68.info Lines: 93.9 % 857 805
Test Date: 2025-02-20 13:11:02 Functions: 92.6 % 81 75

            Line data    Source code
       1              : use std::collections::{HashMap, HashSet, VecDeque};
       2              : use std::fmt::Debug;
       3              : use std::sync::atomic::AtomicU32;
       4              : use std::sync::Arc;
       5              : 
       6              : use super::remote_timeline_client::is_same_remote_layer_path;
       7              : use super::storage_layer::AsLayerDesc as _;
       8              : use super::storage_layer::LayerName;
       9              : use super::storage_layer::ResidentLayer;
      10              : use crate::tenant::metadata::TimelineMetadata;
      11              : use crate::tenant::remote_timeline_client::index::IndexPart;
      12              : use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
      13              : use utils::generation::Generation;
      14              : use utils::lsn::{AtomicLsn, Lsn};
      15              : 
      16              : use chrono::NaiveDateTime;
      17              : use once_cell::sync::Lazy;
      18              : use tracing::info;
      19              : 
      20              : /// Kill switch for upload queue reordering in case it causes problems.
      21              : /// TODO: remove this once we have confidence in it.
      22              : static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy<bool> =
      23          388 :     Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true"));
      24              : 
      25              : /// Kill switch for index upload coalescing in case it causes problems.
      26              : /// TODO: remove this once we have confidence in it.
      27              : static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy<bool> =
      28           15 :     Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true"));
      29              : 
      30              : // clippy warns that Uninitialized is much smaller than Initialized, which wastes
      31              : // memory for Uninitialized variants. Doesn't matter in practice, there are not
      32              : // that many upload queues in a running pageserver, and most of them are initialized
      33              : // anyway.
      34              : #[allow(clippy::large_enum_variant)]
      35              : pub enum UploadQueue {
      36              :     Uninitialized,
      37              :     Initialized(UploadQueueInitialized),
      38              :     Stopped(UploadQueueStopped),
      39              : }
      40              : 
      41              : impl UploadQueue {
      42            0 :     pub fn as_str(&self) -> &'static str {
      43            0 :         match self {
      44            0 :             UploadQueue::Uninitialized => "Uninitialized",
      45            0 :             UploadQueue::Initialized(_) => "Initialized",
      46            0 :             UploadQueue::Stopped(_) => "Stopped",
      47              :         }
      48            0 :     }
      49              : }
      50              : 
      51              : #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
      52              : pub enum OpType {
      53              :     MayReorder,
      54              :     FlushDeletion,
      55              : }
      56              : 
      57              : /// This keeps track of queued and in-progress tasks.
      58              : pub struct UploadQueueInitialized {
      59              :     /// Maximum number of inprogress tasks to schedule. 0 is no limit.
      60              :     pub(crate) inprogress_limit: usize,
      61              : 
      62              :     /// Counter to assign task IDs
      63              :     pub(crate) task_counter: u64,
      64              : 
      65              :     /// The next uploaded index_part.json; assumed to be dirty.
      66              :     ///
      67              :     /// Should not be read directly, except for layer file updates. Instead you should add a
      68              :     /// projected field.
      69              :     pub(crate) dirty: IndexPart,
      70              : 
      71              :     /// The latest remote persisted IndexPart.
      72              :     ///
      73              :     /// Each completed metadata upload will update this. The second item is the task_id which last
      74              :     /// updated the value, used to ensure we never store an older value over a newer one.
      75              :     pub(crate) clean: (IndexPart, Option<u64>),
      76              : 
      77              :     /// How many file uploads or deletions have been scheduled since the
      78              :     /// last (scheduling of) metadata index upload?
      79              :     pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
      80              : 
      81              :     /// The Lsn is only updated after our generation has been validated with
      82              :     /// the control plane (unless a timeline's generation is None, in which case
      83              :     /// we skip validation)
      84              :     pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
      85              : 
      86              :     /// Tasks that are currently in-progress. In-progress means that a tokio Task
      87              :     /// has been launched for it. An in-progress task can be busy uploading, but it can
      88              :     /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
      89              :     /// be waiting for retry in `exponential_backoff`.
      90              :     pub inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
      91              : 
      92              :     /// Queued operations that have not been launched yet. They might depend on previous
      93              :     /// tasks to finish. For example, metadata upload cannot be performed before all
      94              :     /// preceding layer file uploads have completed.
      95              :     pub queued_operations: VecDeque<UploadOp>,
      96              : 
      97              :     /// Files which have been unlinked but have not yet had a deletion scheduled. Only kept around
      98              :     /// for error logging.
      99              :     ///
     100              :     /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a
     101              :     /// bug causing leaks, then it's better to not leave this enabled for production builds.
     102              :     #[cfg(feature = "testing")]
     103              :     pub(crate) dangling_files: HashMap<LayerName, Generation>,
     104              : 
     105              :     /// Ensure we order file operations correctly.
     106              :     pub(crate) recently_deleted: HashSet<(LayerName, Generation)>,
     107              : 
     108              :     /// Deletions that are blocked by the tenant configuration
     109              :     pub(crate) blocked_deletions: Vec<Delete>,
     110              : 
     111              :     /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
     112              :     pub(crate) shutting_down: bool,
     113              : 
     114              :     /// Permitless semaphore on which any number of `RemoteTimelineClient::shutdown` futures can
     115              :     /// wait until one of them stops the queue. The semaphore is closed when
     116              :     /// `RemoteTimelineClient::launch_queued_tasks` encounters `UploadOp::Shutdown`.
     117              :     pub(crate) shutdown_ready: Arc<tokio::sync::Semaphore>,
     118              : }
     119              : 
     120              : impl UploadQueueInitialized {
     121           16 :     pub(super) fn no_pending_work(&self) -> bool {
     122           16 :         self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
     123           16 :     }
     124              : 
     125            0 :     pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
     126            0 :         self.visible_remote_consistent_lsn.load()
     127            0 :     }
     128              : 
     129            0 :     pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
     130            0 :         let lsn = self.clean.0.metadata.disk_consistent_lsn();
     131            0 :         self.clean.1.map(|_| lsn)
     132            0 :     }
     133              : 
     134              :     /// Returns and removes the next ready operation from the queue, if any. This isn't necessarily
     135              :     /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump
     136              :     /// the queue if it doesn't conflict with operations ahead of it.
     137              :     ///
     138              :     /// Also returns any operations that were coalesced into this one, e.g. multiple index uploads.
     139              :     ///
     140              :     /// None may be returned even if the queue isn't empty, if no operations are ready yet.
     141              :     ///
     142              :     /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit.
     143        26827 :     pub fn next_ready(&mut self) -> Option<(UploadOp, Vec<UploadOp>)> {
     144        26827 :         // If inprogress_tasks is already at limit, don't schedule anything more.
     145        26827 :         if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit {
     146           12 :             return None;
     147        26815 :         }
     148              : 
     149        51127 :         for (i, candidate) in self.queued_operations.iter().enumerate() {
     150              :             // If this candidate is ready, go for it. Otherwise, try the next one.
     151        51127 :             if self.is_ready(i) {
     152              :                 // Shutdown operations are left at the head of the queue, to prevent further
     153              :                 // operations from starting. Signal that we're ready to shut down.
     154        10162 :                 if matches!(candidate, UploadOp::Shutdown) {
     155           20 :                     assert!(self.inprogress_tasks.is_empty(), "shutdown with tasks");
     156           20 :                     assert_eq!(i, 0, "shutdown not at head of queue");
     157           20 :                     self.shutdown_ready.close();
     158           20 :                     return None;
     159        10142 :                 }
     160        10142 : 
     161        10142 :                 let mut op = self.queued_operations.remove(i).expect("i can't disappear");
     162        10142 : 
     163        10142 :                 // Coalesce any back-to-back index uploads by only uploading the newest one that's
     164        10142 :                 // ready. This typically happens with layer/index/layer/index/... sequences, where
     165        10142 :                 // the layers bypass the indexes, leaving the indexes queued.
     166        10142 :                 //
     167        10142 :                 // If other operations are interleaved between index uploads we don't try to
     168        10142 :                 // coalesce them, since we may as well update the index concurrently with them.
     169        10142 :                 // This keeps the index fresh and avoids starvation.
     170        10142 :                 //
     171        10142 :                 // NB: we assume that all uploaded indexes have the same remote path. This
     172        10142 :                 // is true at the time of writing: the path only depends on the tenant,
     173        10142 :                 // timeline and generation, all of which are static for a timeline instance.
     174        10142 :                 // Otherwise, we must be careful not to coalesce different paths.
     175        10142 :                 let mut coalesced_ops = Vec::new();
     176        10142 :                 if matches!(op, UploadOp::UploadMetadata { .. }) {
     177         3071 :                     while let Some(UploadOp::UploadMetadata { .. }) = self.queued_operations.get(i)
     178              :                     {
     179           31 :                         if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING {
     180            0 :                             break;
     181           31 :                         }
     182           31 :                         if !self.is_ready(i) {
     183           11 :                             break;
     184           20 :                         }
     185           20 :                         coalesced_ops.push(op);
     186           20 :                         op = self.queued_operations.remove(i).expect("i can't disappear");
     187              :                     }
     188         7091 :                 }
     189              : 
     190        10142 :                 return Some((op, coalesced_ops));
     191        40965 :             }
     192              : 
     193              :             // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up.
     194        40965 :             if matches!(candidate, UploadOp::Barrier(_) | UploadOp::Shutdown) {
     195         4779 :                 return None;
     196        36186 :             }
     197        36186 : 
     198        36186 :             // If upload queue reordering is disabled, bail out after the first operation.
     199        36186 :             if *DISABLE_UPLOAD_QUEUE_REORDERING {
     200            0 :                 return None;
     201        36186 :             }
     202              :         }
     203        11874 :         None
     204        26827 :     }
     205              : 
     206              :     /// Returns true if the queued operation at the given position is ready to be uploaded, i.e. if
     207              :     /// it doesn't conflict with any in-progress or queued operations ahead of it. Operations are
     208              :     /// allowed to skip the queue when it's safe to do so, to increase parallelism.
     209              :     ///
     210              :     /// The position must be valid for the queue size.
     211        51158 :     fn is_ready(&self, pos: usize) -> bool {
     212        51158 :         let candidate = self.queued_operations.get(pos).expect("invalid position");
     213        51158 :         self
     214        51158 :             // Look at in-progress operations, in random order.
     215        51158 :             .inprogress_tasks
     216        51158 :             .values()
     217      2334420 :             .map(|task| &task.op)
     218        51158 :             // Then queued operations ahead of the candidate, front-to-back.
     219        51158 :             .chain(self.queued_operations.iter().take(pos))
     220        51158 :             // Keep track of the active index ahead of each operation. This is used to ensure that
     221        51158 :             // an upload doesn't skip the queue too far, such that it modifies a layer that's
     222        51158 :             // referenced by an active index.
     223        51158 :             //
     224        51158 :             // It's okay that in-progress operations are emitted in random order above, since at
     225        51158 :             // most one of them can be an index upload (enforced by can_bypass).
     226      2362423 :             .scan(&self.clean.0, |next_active_index, op| {
     227      2362423 :                 let active_index = *next_active_index;
     228      2362423 :                 if let UploadOp::UploadMetadata { ref uploaded } = op {
     229        32105 :                     *next_active_index = uploaded; // stash index for next operation after this
     230      2330318 :                 }
     231      2362423 :                 Some((op, active_index))
     232      2362423 :             })
     233        51158 :             // Check if the candidate can bypass all of them.
     234      2362423 :             .all(|(op, active_index)| candidate.can_bypass(op, active_index))
     235        51158 :     }
     236              : 
     237              :     /// Returns the number of in-progress deletion operations.
     238              :     #[cfg(test)]
     239            4 :     pub(crate) fn num_inprogress_deletions(&self) -> usize {
     240            4 :         self.inprogress_tasks
     241            4 :             .iter()
     242            4 :             .filter(|(_, t)| matches!(t.op, UploadOp::Delete(_)))
     243            4 :             .count()
     244            4 :     }
     245              : 
     246              :     /// Returns the number of in-progress layer uploads.
     247              :     #[cfg(test)]
     248            8 :     pub(crate) fn num_inprogress_layer_uploads(&self) -> usize {
     249            8 :         self.inprogress_tasks
     250            8 :             .iter()
     251           12 :             .filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _, _)))
     252            8 :             .count()
     253            8 :     }
     254              : 
     255              :     /// Test helper that schedules all ready operations into inprogress_tasks, and returns
     256              :     /// references to them.
     257              :     ///
     258              :     /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into
     259              :     /// UploadQueue, so we can use the same code path.
     260              :     #[cfg(test)]
     261          156 :     fn schedule_ready(&mut self) -> Vec<Arc<UploadTask>> {
     262          156 :         let mut tasks = Vec::new();
     263              :         // NB: schedule operations one by one, to handle conflicts with inprogress_tasks.
     264          344 :         while let Some((op, coalesced_ops)) = self.next_ready() {
     265          188 :             self.task_counter += 1;
     266          188 :             let task = Arc::new(UploadTask {
     267          188 :                 task_id: self.task_counter,
     268          188 :                 op,
     269          188 :                 coalesced_ops,
     270          188 :                 retries: 0.into(),
     271          188 :             });
     272          188 :             self.inprogress_tasks.insert(task.task_id, task.clone());
     273          188 :             tasks.push(task);
     274          188 :         }
     275          156 :         tasks
     276          156 :     }
     277              : 
     278              :     /// Test helper that marks an operation as completed, removing it from inprogress_tasks.
     279              :     ///
     280              :     /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into
     281              :     /// UploadQueue, so we can use the same code path.
     282              :     #[cfg(test)]
     283          116 :     fn complete(&mut self, task_id: u64) {
     284          116 :         let Some(task) = self.inprogress_tasks.remove(&task_id) else {
     285            0 :             return;
     286              :         };
     287              :         // Update the clean index on uploads.
     288          116 :         if let UploadOp::UploadMetadata { ref uploaded } = task.op {
     289           32 :             if task.task_id > self.clean.1.unwrap_or_default() {
     290           32 :                 self.clean = (*uploaded.clone(), Some(task.task_id));
     291           32 :             }
     292           84 :         }
     293          116 :     }
     294              : }
     295              : 
     296              : #[derive(Clone, Copy)]
     297              : pub(super) enum SetDeletedFlagProgress {
     298              :     NotRunning,
     299              :     InProgress(NaiveDateTime),
     300              :     Successful(NaiveDateTime),
     301              : }
     302              : 
     303              : pub struct UploadQueueStoppedDeletable {
     304              :     pub(super) upload_queue_for_deletion: UploadQueueInitialized,
     305              :     pub(super) deleted_at: SetDeletedFlagProgress,
     306              : }
     307              : 
     308              : pub enum UploadQueueStopped {
     309              :     Deletable(UploadQueueStoppedDeletable),
     310              :     Uninitialized,
     311              : }
     312              : 
     313              : #[derive(thiserror::Error, Debug)]
     314              : pub enum NotInitialized {
     315              :     #[error("queue is in state Uninitialized")]
     316              :     Uninitialized,
     317              :     #[error("queue is in state Stopped")]
     318              :     Stopped,
     319              :     #[error("queue is shutting down")]
     320              :     ShuttingDown,
     321              : }
     322              : 
     323              : impl NotInitialized {
     324            0 :     pub(crate) fn is_stopping(&self) -> bool {
     325              :         use NotInitialized::*;
     326            0 :         match self {
     327            0 :             Uninitialized => false,
     328            0 :             Stopped => true,
     329            0 :             ShuttingDown => true,
     330              :         }
     331            0 :     }
     332              : }
     333              : 
     334              : impl UploadQueue {
     335          900 :     pub fn initialize_empty_remote(
     336          900 :         &mut self,
     337          900 :         metadata: &TimelineMetadata,
     338          900 :         inprogress_limit: usize,
     339          900 :     ) -> anyhow::Result<&mut UploadQueueInitialized> {
     340          900 :         match self {
     341          900 :             UploadQueue::Uninitialized => (),
     342              :             UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
     343            0 :                 anyhow::bail!("already initialized, state {}", self.as_str())
     344              :             }
     345              :         }
     346              : 
     347          900 :         info!("initializing upload queue for empty remote");
     348              : 
     349          900 :         let index_part = IndexPart::empty(metadata.clone());
     350          900 : 
     351          900 :         let state = UploadQueueInitialized {
     352          900 :             inprogress_limit,
     353          900 :             dirty: index_part.clone(),
     354          900 :             clean: (index_part, None),
     355          900 :             latest_files_changes_since_metadata_upload_scheduled: 0,
     356          900 :             visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
     357          900 :             // what follows are boring default initializations
     358          900 :             task_counter: 0,
     359          900 :             inprogress_tasks: HashMap::new(),
     360          900 :             queued_operations: VecDeque::new(),
     361          900 :             #[cfg(feature = "testing")]
     362          900 :             dangling_files: HashMap::new(),
     363          900 :             recently_deleted: HashSet::new(),
     364          900 :             blocked_deletions: Vec::new(),
     365          900 :             shutting_down: false,
     366          900 :             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
     367          900 :         };
     368          900 : 
     369          900 :         *self = UploadQueue::Initialized(state);
     370          900 :         Ok(self.initialized_mut().expect("we just set it"))
     371          900 :     }
     372              : 
     373           44 :     pub fn initialize_with_current_remote_index_part(
     374           44 :         &mut self,
     375           44 :         index_part: &IndexPart,
     376           44 :         inprogress_limit: usize,
     377           44 :     ) -> anyhow::Result<&mut UploadQueueInitialized> {
     378           44 :         match self {
     379           44 :             UploadQueue::Uninitialized => (),
     380              :             UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
     381            0 :                 anyhow::bail!("already initialized, state {}", self.as_str())
     382              :             }
     383              :         }
     384              : 
     385           44 :         info!(
     386            0 :             "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
     387            0 :             index_part.metadata.disk_consistent_lsn()
     388              :         );
     389              : 
     390           44 :         let state = UploadQueueInitialized {
     391           44 :             inprogress_limit,
     392           44 :             dirty: index_part.clone(),
     393           44 :             clean: (index_part.clone(), None),
     394           44 :             latest_files_changes_since_metadata_upload_scheduled: 0,
     395           44 :             visible_remote_consistent_lsn: Arc::new(
     396           44 :                 index_part.metadata.disk_consistent_lsn().into(),
     397           44 :             ),
     398           44 :             // what follows are boring default initializations
     399           44 :             task_counter: 0,
     400           44 :             inprogress_tasks: HashMap::new(),
     401           44 :             queued_operations: VecDeque::new(),
     402           44 :             #[cfg(feature = "testing")]
     403           44 :             dangling_files: HashMap::new(),
     404           44 :             recently_deleted: HashSet::new(),
     405           44 :             blocked_deletions: Vec::new(),
     406           44 :             shutting_down: false,
     407           44 :             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
     408           44 :         };
     409           44 : 
     410           44 :         *self = UploadQueue::Initialized(state);
     411           44 :         Ok(self.initialized_mut().expect("we just set it"))
     412           44 :     }
     413              : 
     414        18320 :     pub fn initialized_mut(&mut self) -> Result<&mut UploadQueueInitialized, NotInitialized> {
     415              :         use UploadQueue::*;
     416        18320 :         match self {
     417            0 :             Uninitialized => Err(NotInitialized::Uninitialized),
     418        18320 :             Initialized(x) => {
     419        18320 :                 if x.shutting_down {
     420            0 :                     Err(NotInitialized::ShuttingDown)
     421              :                 } else {
     422        18320 :                     Ok(x)
     423              :                 }
     424              :             }
     425            0 :             Stopped(_) => Err(NotInitialized::Stopped),
     426              :         }
     427        18320 :     }
     428              : 
     429            4 :     pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> {
     430            4 :         match self {
     431              :             UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
     432            0 :                 anyhow::bail!("queue is in state {}", self.as_str())
     433              :             }
     434              :             UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => {
     435            0 :                 anyhow::bail!("queue is in state Stopped(Uninitialized)")
     436              :             }
     437            4 :             UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable),
     438              :         }
     439            4 :     }
     440              : }
     441              : 
     442              : /// An in-progress upload or delete task.
     443              : #[derive(Debug)]
     444              : pub struct UploadTask {
     445              :     /// Unique ID of this task. Used as the key in `inprogress_tasks` above.
     446              :     pub task_id: u64,
     447              :     /// Number of task retries.
     448              :     pub retries: AtomicU32,
     449              :     /// The upload operation.
     450              :     pub op: UploadOp,
     451              :     /// Any upload operations that were coalesced into this operation. This typically happens with
     452              :     /// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`.
     453              :     pub coalesced_ops: Vec<UploadOp>,
     454              : }
     455              : 
     456              : /// A deletion of some layers within the lifetime of a timeline.  This is not used
     457              : /// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
     458              : #[derive(Debug, Clone)]
     459              : pub struct Delete {
     460              :     pub layers: Vec<(LayerName, LayerFileMetadata)>,
     461              : }
     462              : 
     463              : #[derive(Clone, Debug)]
     464              : pub enum UploadOp {
     465              :     /// Upload a layer file. The last field indicates the last operation for this file.
     466              :     UploadLayer(ResidentLayer, LayerFileMetadata, Option<OpType>),
     467              : 
     468              :     /// Upload an index_part.json file
     469              :     UploadMetadata {
     470              :         /// The next [`UploadQueueInitialized::clean`] after this upload succeeds.
     471              :         uploaded: Box<IndexPart>,
     472              :     },
     473              : 
     474              :     /// Delete layer files
     475              :     Delete(Delete),
     476              : 
     477              :     /// Barrier. When the barrier operation is reached, the channel is closed.
     478              :     Barrier(tokio::sync::watch::Sender<()>),
     479              : 
     480              :     /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise
     481              :     /// this is the same as a Barrier.
     482              :     Shutdown,
     483              : }
     484              : 
     485              : impl std::fmt::Display for UploadOp {
     486            0 :     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
     487            0 :         match self {
     488            0 :             UploadOp::UploadLayer(layer, metadata, mode) => {
     489            0 :                 write!(
     490            0 :                     f,
     491            0 :                     "UploadLayer({}, size={:?}, gen={:?}, mode={:?})",
     492            0 :                     layer, metadata.file_size, metadata.generation, mode
     493            0 :                 )
     494              :             }
     495            0 :             UploadOp::UploadMetadata { uploaded, .. } => {
     496            0 :                 write!(
     497            0 :                     f,
     498            0 :                     "UploadMetadata(lsn: {})",
     499            0 :                     uploaded.metadata.disk_consistent_lsn()
     500            0 :                 )
     501              :             }
     502            0 :             UploadOp::Delete(delete) => {
     503            0 :                 write!(f, "Delete({} layers)", delete.layers.len())
     504              :             }
     505            0 :             UploadOp::Barrier(_) => write!(f, "Barrier"),
     506            0 :             UploadOp::Shutdown => write!(f, "Shutdown"),
     507              :         }
     508            0 :     }
     509              : }
     510              : 
     511              : impl UploadOp {
     512              :     /// Returns true if self can bypass other, i.e. if the operations don't conflict. index is the
     513              :     /// active index when other would be uploaded -- if we allow self to bypass other, this would
     514              :     /// be the active index when self is uploaded.
     515      2362519 :     pub fn can_bypass(&self, other: &UploadOp, index: &IndexPart) -> bool {
     516      2362519 :         match (self, other) {
     517              :             // Nothing can bypass a barrier or shutdown, and they can't bypass anything.
     518         4795 :             (UploadOp::Barrier(_), _) | (_, UploadOp::Barrier(_)) => false,
     519           16 :             (UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false,
     520              : 
     521              :             // Uploads and deletes can bypass each other unless they're for the same file.
     522        40047 :             (UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => {
     523        40047 :                 let aname = &a.layer_desc().layer_name();
     524        40047 :                 let bname = &b.layer_desc().layer_name();
     525        40047 :                 !is_same_remote_layer_path(aname, ameta, bname, bmeta)
     526              :             }
     527           84 :             (UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d))
     528      2275000 :             | (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => {
     529      2275114 :                 d.layers.iter().all(|(dname, dmeta)| {
     530      2275114 :                     !is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta)
     531      2275114 :                 })
     532              :             }
     533              : 
     534              :             // Deletes are idempotent and can always bypass each other.
     535         5914 :             (UploadOp::Delete(_), UploadOp::Delete(_)) => true,
     536              : 
     537              :             // Uploads and deletes can bypass an index upload as long as neither the uploaded index
     538              :             // nor the active index below it references the file. A layer can't be modified or
     539              :             // deleted while referenced by an index.
     540              :             //
     541              :             // Similarly, index uploads can bypass uploads and deletes as long as neither the
     542              :             // uploaded index nor the active index references the file (the latter would be
     543              :             // incorrect use by the caller).
     544          181 :             (UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i })
     545         6870 :             | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => {
     546         7051 :                 let uname = u.layer_desc().layer_name();
     547         7051 :                 !i.references(&uname, umeta) && !index.references(&uname, umeta)
     548              :             }
     549        29286 :             (UploadOp::Delete(d), UploadOp::UploadMetadata { uploaded: i })
     550           38 :             | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::Delete(d)) => {
     551        29324 :                 d.layers.iter().all(|(dname, dmeta)| {
     552        29324 :                     !i.references(dname, dmeta) && !index.references(dname, dmeta)
     553        29324 :                 })
     554              :             }
     555              : 
     556              :             // Indexes can never bypass each other. They can coalesce though, and
     557              :             // `UploadQueue::next_ready()` currently does this when possible.
     558          288 :             (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => false,
     559              :         }
     560      2362519 :     }
     561              : }
     562              : 
     563              : #[cfg(test)]
     564              : mod tests {
     565              :     use super::*;
     566              :     use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
     567              :     use crate::tenant::storage_layer::layer::local_layer_path;
     568              :     use crate::tenant::storage_layer::Layer;
     569              :     use crate::tenant::Timeline;
     570              :     use crate::DEFAULT_PG_VERSION;
     571              :     use itertools::Itertools as _;
     572              :     use std::str::FromStr as _;
     573              :     use utils::shard::{ShardCount, ShardIndex, ShardNumber};
     574              : 
     575              :     /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq.
     576              :     #[track_caller]
     577          196 :     fn assert_same_op(a: &UploadOp, b: &UploadOp) {
     578              :         use UploadOp::*;
     579          196 :         match (a, b) {
     580           88 :             (UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => {
     581           88 :                 assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name());
     582           88 :                 assert_eq!(ameta, bmeta);
     583           88 :                 assert_eq!(atype, btype);
     584              :             }
     585           44 :             (Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers),
     586           56 :             (UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b),
     587            8 :             (Barrier(_), Barrier(_)) => {}
     588            0 :             (Shutdown, Shutdown) => {}
     589            0 :             (a, b) => panic!("{a:?} != {b:?}"),
     590              :         }
     591          196 :     }
     592              : 
     593              :     /// Test helper which asserts that two sets of operations are the same.
     594              :     #[track_caller]
     595           44 :     fn assert_same_ops<'a>(
     596           44 :         a: impl IntoIterator<Item = &'a UploadOp>,
     597           44 :         b: impl IntoIterator<Item = &'a UploadOp>,
     598           44 :     ) {
     599           44 :         a.into_iter()
     600           44 :             .zip_eq(b)
     601          116 :             .for_each(|(a, b)| assert_same_op(a, b))
     602           44 :     }
     603              : 
     604              :     /// Test helper to construct a test timeline.
     605              :     ///
     606              :     /// TODO: it really shouldn't be necessary to construct an entire tenant and timeline just to
     607              :     /// test the upload queue -- decouple ResidentLayer from Timeline.
     608              :     ///
     609              :     /// TODO: the upload queue uses TimelineMetadata::example() instead, because there's no way to
     610              :     /// obtain a TimelineMetadata from a Timeline.
     611           48 :     fn make_timeline() -> Arc<Timeline> {
     612           48 :         // Grab the current test name from the current thread name.
     613           48 :         // TODO: TenantHarness shouldn't take a &'static str, but just leak the test name for now.
     614           48 :         let test_name = std::thread::current().name().unwrap().to_string();
     615           48 :         let test_name = Box::leak(test_name.into_boxed_str());
     616           48 : 
     617           48 :         let runtime = tokio::runtime::Builder::new_current_thread()
     618           48 :             .enable_all()
     619           48 :             .build()
     620           48 :             .expect("failed to create runtime");
     621           48 : 
     622           48 :         runtime
     623           48 :             .block_on(async {
     624           48 :                 let harness = TenantHarness::create(test_name).await?;
     625           48 :                 let (tenant, ctx) = harness.load().await;
     626           48 :                 tenant
     627           48 :                     .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
     628           48 :                     .await
     629           48 :             })
     630           48 :             .expect("failed to create timeline")
     631           48 :     }
     632              : 
     633              :     /// Test helper to construct an (empty) resident layer.
     634          120 :     fn make_layer(timeline: &Arc<Timeline>, name: &str) -> ResidentLayer {
     635          120 :         make_layer_with_size(timeline, name, 0)
     636          120 :     }
     637              : 
     638              :     /// Test helper to construct a resident layer with the given size.
     639          132 :     fn make_layer_with_size(timeline: &Arc<Timeline>, name: &str, size: usize) -> ResidentLayer {
     640          132 :         let metadata = LayerFileMetadata {
     641          132 :             generation: timeline.generation,
     642          132 :             shard: timeline.get_shard_index(),
     643          132 :             file_size: size as u64,
     644          132 :         };
     645          132 :         make_layer_with_metadata(timeline, name, metadata)
     646          132 :     }
     647              : 
     648              :     /// Test helper to construct a layer with the given metadata.
     649          196 :     fn make_layer_with_metadata(
     650          196 :         timeline: &Arc<Timeline>,
     651          196 :         name: &str,
     652          196 :         metadata: LayerFileMetadata,
     653          196 :     ) -> ResidentLayer {
     654          196 :         let name = LayerName::from_str(name).expect("invalid name");
     655          196 :         let local_path = local_layer_path(
     656          196 :             timeline.conf,
     657          196 :             &timeline.tenant_shard_id,
     658          196 :             &timeline.timeline_id,
     659          196 :             &name,
     660          196 :             &metadata.generation,
     661          196 :         );
     662          196 :         std::fs::write(&local_path, vec![0; metadata.file_size as usize])
     663          196 :             .expect("failed to write file");
     664          196 :         Layer::for_resident(timeline.conf, timeline, local_path, name, metadata)
     665          196 :     }
     666              : 
     667              :     /// Test helper to add a layer to an index and return a new index.
     668           24 :     fn index_with(index: &IndexPart, layer: &ResidentLayer) -> Box<IndexPart> {
     669           24 :         let mut index = index.clone();
     670           24 :         index
     671           24 :             .layer_metadata
     672           24 :             .insert(layer.layer_desc().layer_name(), layer.metadata());
     673           24 :         Box::new(index)
     674           24 :     }
     675              : 
     676              :     /// Test helper to remove a layer from an index and return a new index.
     677            8 :     fn index_without(index: &IndexPart, layer: &ResidentLayer) -> Box<IndexPart> {
     678            8 :         let mut index = index.clone();
     679            8 :         index
     680            8 :             .layer_metadata
     681            8 :             .remove(&layer.layer_desc().layer_name());
     682            8 :         Box::new(index)
     683            8 :     }
     684              : 
     685              :     /// Nothing can bypass a barrier, and it can't bypass inprogress tasks.
     686              :     #[test]
     687            4 :     fn schedule_barrier() -> anyhow::Result<()> {
     688            4 :         let mut queue = UploadQueue::Uninitialized;
     689            4 :         let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
     690            4 :         let tli = make_timeline();
     691            4 : 
     692            4 :         let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter
     693            4 :         let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     694            4 :         let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     695            4 :         let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     696            4 :         let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     697            4 :         let (barrier, _) = tokio::sync::watch::channel(());
     698            4 : 
     699            4 :         // Enqueue non-conflicting upload, delete, and index before and after a barrier.
     700            4 :         let ops = [
     701            4 :             UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
     702            4 :             UploadOp::Delete(Delete {
     703            4 :                 layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
     704            4 :             }),
     705            4 :             UploadOp::UploadMetadata {
     706            4 :                 uploaded: index.clone(),
     707            4 :             },
     708            4 :             UploadOp::Barrier(barrier),
     709            4 :             UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
     710            4 :             UploadOp::Delete(Delete {
     711            4 :                 layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
     712            4 :             }),
     713            4 :             UploadOp::UploadMetadata {
     714            4 :                 uploaded: index.clone(),
     715            4 :             },
     716            4 :         ];
     717            4 : 
     718            4 :         queue.queued_operations.extend(ops.clone());
     719            4 : 
     720            4 :         // Schedule the initial operations ahead of the barrier.
     721            4 :         let tasks = queue.schedule_ready();
     722            4 : 
     723           12 :         assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]);
     724            4 :         assert!(matches!(
     725            4 :             queue.queued_operations.front(),
     726              :             Some(&UploadOp::Barrier(_))
     727              :         ));
     728              : 
     729              :         // Complete the initial operations. The barrier isn't scheduled while they're pending.
     730           16 :         for task in tasks {
     731           12 :             assert!(queue.schedule_ready().is_empty());
     732           12 :             queue.complete(task.task_id);
     733              :         }
     734              : 
     735              :         // Schedule the barrier. The later tasks won't schedule until it completes.
     736            4 :         let tasks = queue.schedule_ready();
     737            4 : 
     738            4 :         assert_eq!(tasks.len(), 1);
     739            4 :         assert!(matches!(tasks[0].op, UploadOp::Barrier(_)));
     740            4 :         assert_eq!(queue.queued_operations.len(), 3);
     741              : 
     742              :         // Complete the barrier. The rest of the tasks schedule immediately.
     743            4 :         queue.complete(tasks[0].task_id);
     744            4 : 
     745            4 :         let tasks = queue.schedule_ready();
     746           12 :         assert_same_ops(tasks.iter().map(|t| &t.op), &ops[4..]);
     747            4 :         assert!(queue.queued_operations.is_empty());
     748              : 
     749            4 :         Ok(())
     750            4 :     }
     751              : 
     752              :     /// Deletes can be scheduled in parallel, even if they're for the same file.
     753              :     #[test]
     754            4 :     fn schedule_delete_parallel() -> anyhow::Result<()> {
     755            4 :         let mut queue = UploadQueue::Uninitialized;
     756            4 :         let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
     757            4 :         let tli = make_timeline();
     758            4 : 
     759            4 :         // Enqueue a bunch of deletes, some with conflicting names.
     760            4 :         let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     761            4 :         let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     762            4 :         let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     763            4 :         let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     764            4 : 
     765            4 :         let ops = [
     766            4 :             UploadOp::Delete(Delete {
     767            4 :                 layers: vec![(layer0.layer_desc().layer_name(), layer0.metadata())],
     768            4 :             }),
     769            4 :             UploadOp::Delete(Delete {
     770            4 :                 layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
     771            4 :             }),
     772            4 :             UploadOp::Delete(Delete {
     773            4 :                 layers: vec![
     774            4 :                     (layer1.layer_desc().layer_name(), layer1.metadata()),
     775            4 :                     (layer2.layer_desc().layer_name(), layer2.metadata()),
     776            4 :                 ],
     777            4 :             }),
     778            4 :             UploadOp::Delete(Delete {
     779            4 :                 layers: vec![(layer2.layer_desc().layer_name(), layer2.metadata())],
     780            4 :             }),
     781            4 :             UploadOp::Delete(Delete {
     782            4 :                 layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
     783            4 :             }),
     784            4 :         ];
     785            4 : 
     786            4 :         queue.queued_operations.extend(ops.clone());
     787            4 : 
     788            4 :         // Schedule all ready operations. Since deletes don't conflict, they're all scheduled.
     789            4 :         let tasks = queue.schedule_ready();
     790            4 : 
     791           20 :         assert_same_ops(tasks.iter().map(|t| &t.op), &ops);
     792            4 :         assert!(queue.queued_operations.is_empty());
     793              : 
     794            4 :         Ok(())
     795            4 :     }
     796              : 
     797              :     /// Conflicting uploads are serialized.
     798              :     #[test]
     799            4 :     fn schedule_upload_conflicts() -> anyhow::Result<()> {
     800            4 :         let mut queue = UploadQueue::Uninitialized;
     801            4 :         let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
     802            4 :         let tli = make_timeline();
     803            4 : 
     804            4 :         // Enqueue three versions of the same layer, with different file sizes.
     805            4 :         let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1);
     806            4 :         let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2);
     807            4 :         let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3);
     808            4 : 
     809            4 :         let ops = [
     810            4 :             UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None),
     811            4 :             UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None),
     812            4 :             UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None),
     813            4 :         ];
     814            4 : 
     815            4 :         queue.queued_operations.extend(ops.clone());
     816              : 
     817              :         // Only one version should be scheduled and uploaded at a time.
     818           16 :         for op in ops {
     819           12 :             let tasks = queue.schedule_ready();
     820           12 :             assert_eq!(tasks.len(), 1);
     821           12 :             assert_same_op(&tasks[0].op, &op);
     822           12 :             queue.complete(tasks[0].task_id);
     823              :         }
     824            4 :         assert!(queue.schedule_ready().is_empty());
     825            4 :         assert!(queue.queued_operations.is_empty());
     826              : 
     827            4 :         Ok(())
     828            4 :     }
     829              : 
     830              :     /// Conflicting uploads and deletes are serialized.
     831              :     #[test]
     832            4 :     fn schedule_upload_delete_conflicts() -> anyhow::Result<()> {
     833            4 :         let mut queue = UploadQueue::Uninitialized;
     834            4 :         let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
     835            4 :         let tli = make_timeline();
     836            4 : 
     837            4 :         // Enqueue two layer uploads, with a delete of both layers in between them. These should be
     838            4 :         // scheduled one at a time, since deletes can't bypass uploads and vice versa.
     839            4 :         let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     840            4 :         let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     841            4 : 
     842            4 :         let ops = [
     843            4 :             UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
     844            4 :             UploadOp::Delete(Delete {
     845            4 :                 layers: vec![
     846            4 :                     (layer0.layer_desc().layer_name(), layer0.metadata()),
     847            4 :                     (layer1.layer_desc().layer_name(), layer1.metadata()),
     848            4 :                 ],
     849            4 :             }),
     850            4 :             UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
     851            4 :         ];
     852            4 : 
     853            4 :         queue.queued_operations.extend(ops.clone());
     854              : 
     855              :         // Only one operation should be scheduled and completed at a time.
     856           16 :         for op in ops {
     857           12 :             let tasks = queue.schedule_ready();
     858           12 :             assert_eq!(tasks.len(), 1);
     859           12 :             assert_same_op(&tasks[0].op, &op);
     860           12 :             queue.complete(tasks[0].task_id);
     861              :         }
     862            4 :         assert!(queue.schedule_ready().is_empty());
     863            4 :         assert!(queue.queued_operations.is_empty());
     864              : 
     865            4 :         Ok(())
     866            4 :     }
     867              : 
     868              :     /// Non-conflicting uploads and deletes can bypass the queue, avoiding the conflicting
     869              :     /// delete/upload operations at the head of the queue.
     870              :     #[test]
     871            4 :     fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> {
     872            4 :         let mut queue = UploadQueue::Uninitialized;
     873            4 :         let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
     874            4 :         let tli = make_timeline();
     875            4 : 
     876            4 :         // Enqueue two layer uploads, with a delete of both layers in between them. These should be
     877            4 :         // scheduled one at a time, since deletes can't bypass uploads and vice versa.
     878            4 :         //
     879            4 :         // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue
     880            4 :         // and run immediately.
     881            4 :         let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     882            4 :         let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     883            4 :         let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     884            4 :         let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     885            4 : 
     886            4 :         let ops = [
     887            4 :             UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
     888            4 :             UploadOp::Delete(Delete {
     889            4 :                 layers: vec![
     890            4 :                     (layer0.layer_desc().layer_name(), layer0.metadata()),
     891            4 :                     (layer1.layer_desc().layer_name(), layer1.metadata()),
     892            4 :                 ],
     893            4 :             }),
     894            4 :             UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
     895            4 :             UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
     896            4 :             UploadOp::Delete(Delete {
     897            4 :                 layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
     898            4 :             }),
     899            4 :         ];
     900            4 : 
     901            4 :         queue.queued_operations.extend(ops.clone());
     902            4 : 
     903            4 :         // Operations 0, 3, and 4 are scheduled immediately.
     904            4 :         let tasks = queue.schedule_ready();
     905           12 :         assert_same_ops(tasks.iter().map(|t| &t.op), [&ops[0], &ops[3], &ops[4]]);
     906            4 :         assert_eq!(queue.queued_operations.len(), 2);
     907              : 
     908            4 :         Ok(())
     909            4 :     }
     910              : 
     911              :     /// Non-conflicting uploads are parallelized.
     912              :     #[test]
     913            4 :     fn schedule_upload_parallel() -> anyhow::Result<()> {
     914            4 :         let mut queue = UploadQueue::Uninitialized;
     915            4 :         let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
     916            4 :         let tli = make_timeline();
     917            4 : 
     918            4 :         // Enqueue three different layer uploads.
     919            4 :         let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     920            4 :         let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     921            4 :         let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     922            4 : 
     923            4 :         let ops = [
     924            4 :             UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
     925            4 :             UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
     926            4 :             UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
     927            4 :         ];
     928            4 : 
     929            4 :         queue.queued_operations.extend(ops.clone());
     930            4 : 
     931            4 :         // All uploads should be scheduled concurrently.
     932            4 :         let tasks = queue.schedule_ready();
     933            4 : 
     934           12 :         assert_same_ops(tasks.iter().map(|t| &t.op), &ops);
     935            4 :         assert!(queue.queued_operations.is_empty());
     936              : 
     937            4 :         Ok(())
     938            4 :     }
     939              : 
     940              :     /// Index uploads are coalesced: consecutive queued index uploads are scheduled as one task.
     941              :     #[test]
     942            4 :     fn schedule_index_coalesce() -> anyhow::Result<()> {
     943            4 :         let mut queue = UploadQueue::Uninitialized;
     944            4 :         let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
     945              : 
     946              :         // Enqueue three identical uploads of the queue's current clean index.
     947            4 :         let index = Box::new(queue.clean.0.clone());
     948            4 : 
     949            4 :         let ops = [
     950            4 :             UploadOp::UploadMetadata {
     951            4 :                 uploaded: index.clone(),
     952            4 :             },
     953            4 :             UploadOp::UploadMetadata {
     954            4 :                 uploaded: index.clone(),
     955            4 :             },
     956            4 :             UploadOp::UploadMetadata {
     957            4 :                 uploaded: index.clone(),
     958            4 :             },
     959            4 :         ];
     960            4 : 
     961            4 :         queue.queued_operations.extend(ops.clone());
     962            4 : 
     963            4 :         // The index uploads are coalesced into a single task: its op is the last queued index, and the two earlier ones are kept in coalesced_ops.
     964            4 :         let tasks = queue.schedule_ready();
     965            4 :         assert_eq!(tasks.len(), 1);
     966            4 :         assert_same_op(&tasks[0].op, &ops[2]);
     967            4 :         assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]);
     968            4 : 
     969            4 :         assert!(queue.queued_operations.is_empty()); // all three ops were consumed from the queue
     970              : 
     971            4 :         Ok(())
     972            4 :     }
     973              : 
     974              :     /// Chains of upload/index operations lead to parallel layer uploads and serial index uploads.
     975              :     /// This is the common case with layer flushes. Indexes that become ready together are coalesced.
     976              :     #[test]
     977            4 :     fn schedule_index_upload_chain() -> anyhow::Result<()> {
     978            4 :         let mut queue = UploadQueue::Uninitialized;
     979            4 :         let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
     980            4 :         let tli = make_timeline();
     981            4 : 
     982            4 :         // Create three layers and a chain of indexes, each built with index_with() from the previous index and the next layer.
     983            4 :         let index = Box::new(queue.clean.0.clone());
     984            4 :         let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     985            4 :         let index0 = index_with(&index, &layer0);
     986            4 :         let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     987            4 :         let index1 = index_with(&index0, &layer1);
     988            4 :         let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
     989            4 :         let index2 = index_with(&index1, &layer2);
     990            4 : 
     991            4 :         let ops = [
     992            4 :             UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
     993            4 :             UploadOp::UploadMetadata {
     994            4 :                 uploaded: index0.clone(),
     995            4 :             },
     996            4 :             UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
     997            4 :             UploadOp::UploadMetadata {
     998            4 :                 uploaded: index1.clone(),
     999            4 :             },
    1000            4 :             UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
    1001            4 :             UploadOp::UploadMetadata {
    1002            4 :                 uploaded: index2.clone(),
    1003            4 :             },
    1004            4 :         ];
    1005            4 : 
    1006            4 :         queue.queued_operations.extend(ops.clone());
    1007            4 : 
    1008            4 :         // The three layer uploads (ops 0, 2, 4) should be scheduled immediately. The indexes must wait.
    1009            4 :         let upload_tasks = queue.schedule_ready();
    1010            4 :         assert_same_ops(
    1011           12 :             upload_tasks.iter().map(|t| &t.op),
    1012            4 :             [&ops[0], &ops[2], &ops[4]],
    1013            4 :         );
    1014            4 : 
    1015            4 :         // layer2 completes first. None of the indexes can upload yet (layer0 and layer1 are still in progress).
    1016            4 :         queue.complete(upload_tasks[2].task_id);
    1017            4 :         assert!(queue.schedule_ready().is_empty());
    1018              : 
    1019              :         // layer0 completes. index0 (ops[1]) can upload. It completes.
    1020            4 :         queue.complete(upload_tasks[0].task_id);
    1021            4 :         let index_tasks = queue.schedule_ready();
    1022            4 :         assert_eq!(index_tasks.len(), 1);
    1023            4 :         assert_same_op(&index_tasks[0].op, &ops[1]);
    1024            4 :         queue.complete(index_tasks[0].task_id);
    1025            4 : 
    1026            4 :         // layer 1 completes. This unblocks index 1 and 2, which coalesce into
    1027            4 :         // a single upload for index 2 (ops[5]), with index 1 (ops[3]) recorded as coalesced.
    1028            4 :         queue.complete(upload_tasks[1].task_id);
    1029            4 : 
    1030            4 :         let index_tasks = queue.schedule_ready();
    1031            4 :         assert_eq!(index_tasks.len(), 1);
    1032            4 :         assert_same_op(&index_tasks[0].op, &ops[5]);
    1033            4 :         assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]);
    1034            4 : 
    1035            4 :         assert!(queue.queued_operations.is_empty());
    1036              : 
    1037            4 :         Ok(())
    1038            4 :     }
    1039              : 
    1040              :     /// A delete can't bypass an index upload if an index ahead of it still references the deleted layer.
    1041              :     #[test]
    1042            4 :     fn schedule_index_delete_dereferenced() -> anyhow::Result<()> {
    1043            4 :         let mut queue = UploadQueue::Uninitialized;
    1044            4 :         let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    1045            4 :         let tli = make_timeline();
    1046            4 : 
    1047            4 :         // Create a layer to upload, and an index (built with index_with) for that upload.
    1048            4 :         let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1049            4 :         let index_upload = index_with(&queue.clean.0, &layer);
    1050            4 : 
    1051            4 :         // Remove the layer reference in a new index (built with index_without), then delete the layer.
    1052            4 :         let index_deref = index_without(&index_upload, &layer);
    1053            4 : 
    1054            4 :         let ops = [
    1055            4 :             // Initial upload, with a barrier to prevent index coalescing.
    1056            4 :             UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
    1057            4 :             UploadOp::UploadMetadata {
    1058            4 :                 uploaded: index_upload.clone(),
    1059            4 :             },
    1060            4 :             UploadOp::Barrier(tokio::sync::watch::channel(()).0),
    1061            4 :             // Dereference the layer and delete it.
    1062            4 :             UploadOp::UploadMetadata {
    1063            4 :                 uploaded: index_deref.clone(),
    1064            4 :             },
    1065            4 :             UploadOp::Delete(Delete {
    1066            4 :                 layers: vec![(layer.layer_desc().layer_name(), layer.metadata())],
    1067            4 :             }),
    1068            4 :         ];
    1069            4 : 
    1070            4 :         queue.queued_operations.extend(ops.clone());
    1071              : 
    1072              :         // Operations are serialized: each schedule_ready() call must yield exactly the next op in queue order, which is completed before polling again.
    1073           24 :         for op in ops {
    1074           20 :             let tasks = queue.schedule_ready();
    1075           20 :             assert_eq!(tasks.len(), 1);
    1076           20 :             assert_same_op(&tasks[0].op, &op);
    1077           20 :             queue.complete(tasks[0].task_id);
    1078              :         }
    1079            4 :         assert!(queue.queued_operations.is_empty());
    1080              : 
    1081            4 :         Ok(())
    1082            4 :     }
    1083              : 
    1084              :     /// An upload with a reused layer name doesn't clobber the previous layer. Specifically, a
    1085              :     /// dereference/upload/reference cycle can't allow the second upload to bypass the dereferencing index.
    1086              :     #[test]
    1087            4 :     fn schedule_index_upload_dereferenced() -> anyhow::Result<()> {
    1088            4 :         let mut queue = UploadQueue::Uninitialized;
    1089            4 :         let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
    1090            4 :         let tli = make_timeline();
    1091            4 : 
    1092            4 :         // Create a layer to upload. The same layer (same name and metadata) is uploaded twice below.
    1093            4 :         let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1094            4 : 
    1095            4 :         // Upload the layer. Then dereference the layer, and upload/reference it again.
    1096            4 :         let index_upload = index_with(&queue.clean.0, &layer);
    1097            4 :         let index_deref = index_without(&index_upload, &layer);
    1098            4 :         let index_ref = index_with(&index_deref, &layer);
    1099            4 : 
    1100            4 :         let ops = [
    1101            4 :             // Initial upload, with a barrier to prevent index coalescing.
    1102            4 :             UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
    1103            4 :             UploadOp::UploadMetadata {
    1104            4 :                 uploaded: index_upload.clone(),
    1105            4 :             },
    1106            4 :             UploadOp::Barrier(tokio::sync::watch::channel(()).0),
    1107            4 :             // Dereference the layer.
    1108            4 :             UploadOp::UploadMetadata {
    1109            4 :                 uploaded: index_deref.clone(),
    1110            4 :             },
    1111            4 :             // Replace and reference the layer.
    1112            4 :             UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
    1113            4 :             UploadOp::UploadMetadata {
    1114            4 :                 uploaded: index_ref.clone(),
    1115            4 :             },
    1116            4 :         ];
    1117            4 : 
    1118            4 :         queue.queued_operations.extend(ops.clone());
    1119              : 
    1120              :         // Operations are serialized: each schedule_ready() call must yield exactly the next op in queue order, which is completed before polling again.
    1121           28 :         for op in ops {
    1122           24 :             let tasks = queue.schedule_ready();
    1123           24 :             assert_eq!(tasks.len(), 1);
    1124           24 :             assert_same_op(&tasks[0].op, &op);
    1125           24 :             queue.complete(tasks[0].task_id);
    1126              :         }
    1127            4 :         assert!(queue.queued_operations.is_empty());
    1128              : 
    1129            4 :         Ok(())
    1130            4 :     }
    1131              : 
    1132              :     /// Nothing can bypass a shutdown, and it waits for inprogress tasks. It's never returned from
    1133              :     /// next_ready(), but is left at the head of the queue; shutdown_ready is closed once it triggers.
    1134              :     #[test]
    1135            4 :     fn schedule_shutdown() -> anyhow::Result<()> {
    1136            4 :         let mut queue = UploadQueue::Uninitialized;
    1137            4 :         let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
    1138            4 :         let tli = make_timeline();
    1139            4 : 
    1140            4 :         let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter
    1141            4 :         let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1142            4 :         let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1143            4 :         let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1144            4 :         let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1145            4 : 
    1146            4 :         // Enqueue non-conflicting upload, delete, and index both before (ops 0..3) and after (ops 4..7) a shutdown (ops[3]).
    1147            4 :         let ops = [
    1148            4 :             UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
    1149            4 :             UploadOp::Delete(Delete {
    1150            4 :                 layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
    1151            4 :             }),
    1152            4 :             UploadOp::UploadMetadata {
    1153            4 :                 uploaded: index.clone(),
    1154            4 :             },
    1155            4 :             UploadOp::Shutdown,
    1156            4 :             UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
    1157            4 :             UploadOp::Delete(Delete {
    1158            4 :                 layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
    1159            4 :             }),
    1160            4 :             UploadOp::UploadMetadata {
    1161            4 :                 uploaded: index.clone(),
    1162            4 :             },
    1163            4 :         ];
    1164            4 : 
    1165            4 :         queue.queued_operations.extend(ops.clone());
    1166            4 : 
    1167            4 :         // Schedule the initial operations ahead of the shutdown. The Shutdown op must remain at the head of the queue.
    1168            4 :         let tasks = queue.schedule_ready();
    1169            4 : 
    1170           12 :         assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]);
    1171            4 :         assert!(matches!(
    1172            4 :             queue.queued_operations.front(),
    1173              :             Some(&UploadOp::Shutdown)
    1174              :         ));
    1175              : 
    1176              :         // Complete the initial operations. While any are pending, nothing behind the shutdown is scheduled.
    1177           16 :         for task in tasks {
    1178           12 :             assert!(queue.schedule_ready().is_empty());
    1179           12 :             queue.complete(task.task_id);
    1180              :         }
    1181              : 
    1182              :         // The shutdown is triggered the next time we try to pull an operation. It isn't returned,
    1183              :         // but is left in the queue. The trigger is observed as shutdown_ready going from open to closed.
    1184            4 :         assert!(!queue.shutdown_ready.is_closed());
    1185            4 :         assert!(queue.next_ready().is_none());
    1186            4 :         assert!(queue.shutdown_ready.is_closed());
    1187              : 
    1188            4 :         Ok(())
    1189            4 :     }
    1190              : 
    1191              :     /// Scheduling respects inprogress_limit: no more than that many tasks are handed out at once.
    1192              :     #[test]
    1193            4 :     fn schedule_inprogress_limit() -> anyhow::Result<()> {
    1194            4 :         // Create a queue with inprogress_limit=2.
    1195            4 :         let mut queue = UploadQueue::Uninitialized;
    1196            4 :         let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 2)?;
    1197            4 :         let tli = make_timeline();
    1198            4 : 
    1199            4 :         // Enqueue four layer uploads with distinct layer names.
    1200            4 :         let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1201            4 :         let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1202            4 :         let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1203            4 :         let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
    1204            4 : 
    1205            4 :         let ops = [
    1206            4 :             UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
    1207            4 :             UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
    1208            4 :             UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
    1209            4 :             UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None),
    1210            4 :         ];
    1211            4 : 
    1212            4 :         queue.queued_operations.extend(ops.clone());
    1213            4 : 
    1214            4 :         // Schedule all ready operations. Only the first 2 are scheduled, and nothing more is ready afterwards.
    1215            4 :         let tasks = queue.schedule_ready();
    1216            8 :         assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..2]);
    1217            4 :         assert!(queue.next_ready().is_none());
    1218              : 
    1219              :         // When one completes, exactly one more (ops[2]) is scheduled.
    1220            4 :         queue.complete(tasks[0].task_id);
    1221            4 :         let tasks = queue.schedule_ready();
    1222            4 :         assert_same_ops(tasks.iter().map(|t| &t.op), &ops[2..3]);
    1223            4 : 
    1224            4 :         Ok(())
    1225            4 :     }
    1226              : 
    1227              :     /// Tests that can_bypass takes name, generation and shard index into account, comparing operations of the same kind.
    1228              :     #[test]
    1229            4 :     fn can_bypass_path() -> anyhow::Result<()> {
    1230            4 :         let tli = make_timeline();
    1231            4 : 
    1232            4 :         let name0 = &"000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51";
    1233            4 :         let name1 = &"100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51";
    1234              : 
    1235              :         // Asserts that layers a and b either can or can't bypass each other. Note that zip() pairs each operation kind from
    1236              :         // make_ops only with the same kind (mixed kinds are not compared); Delete and UploadMetadata pairs are special-cased.
    1237              :         #[track_caller]
    1238           32 :         fn assert_can_bypass(a: ResidentLayer, b: ResidentLayer, can_bypass: bool) {
    1239           32 :             let index = IndexPart::empty(TimelineMetadata::example());
    1240           96 :             for (a, b) in make_ops(a).into_iter().zip(make_ops(b)) {
    1241           96 :                 match (&a, &b) {
    1242              :                     // Deletes can always bypass each other.
    1243           32 :                     (UploadOp::Delete(_), UploadOp::Delete(_)) => assert!(a.can_bypass(&b, &index)),
    1244              :                     // Indexes can never bypass each other.
    1245              :                     (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => {
    1246           32 :                         assert!(!a.can_bypass(&b, &index))
    1247              :                     }
    1248              :                     // For other operations (here: UploadLayer vs UploadLayer), assert as requested.
    1249           32 :                     (a, b) => assert_eq!(a.can_bypass(b, &index), can_bypass),
    1250              :                 }
    1251              :             }
    1252           32 :         }
    1253              : 
    1254           64 :         fn make_ops(layer: ResidentLayer) -> Vec<UploadOp> { // one UploadLayer, Delete and UploadMetadata op for `layer`, in that order
    1255           64 :             let mut index = IndexPart::empty(TimelineMetadata::example());
    1256           64 :             index
    1257           64 :                 .layer_metadata
    1258           64 :                 .insert(layer.layer_desc().layer_name(), layer.metadata());
    1259           64 :             vec![
    1260           64 :                 UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
    1261           64 :                 UploadOp::Delete(Delete {
    1262           64 :                     layers: vec![(layer.layer_desc().layer_name(), layer.metadata())],
    1263           64 :                 }),
    1264           64 :                 UploadOp::UploadMetadata {
    1265           64 :                     uploaded: Box::new(index),
    1266           64 :                 },
    1267           64 :             ]
    1268           64 :         }
    1269              : 
    1270              :         // Makes a ResidentLayer with the given name and generation. shard: None = unsharded, Some(n) = shard n of 8. file_size is always 0.
    1271           64 :         let layer = |name: &'static str, shard: Option<u8>, generation: u32| -> ResidentLayer {
    1272           64 :             let shard = shard
    1273           64 :                 .map(|n| ShardIndex::new(ShardNumber(n), ShardCount(8)))
    1274           64 :                 .unwrap_or(ShardIndex::unsharded());
    1275           64 :             let metadata = LayerFileMetadata {
    1276           64 :                 shard,
    1277           64 :                 generation: Generation::Valid(generation),
    1278           64 :                 file_size: 0,
    1279           64 :             };
    1280           64 :             make_layer_with_metadata(&tli, name, metadata)
    1281           64 :         };
    1282              : 
    1283              :         // Same name and metadata can't bypass. This goes both for unsharded and sharded, as well as
    1284              :         // 0 or >0 generation.
    1285            4 :         assert_can_bypass(layer(name0, None, 0), layer(name0, None, 0), false);
    1286            4 :         assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(0), 0), false);
    1287            4 :         assert_can_bypass(layer(name0, None, 1), layer(name0, None, 1), false);
    1288            4 : 
    1289            4 :         // Different names can bypass.
    1290            4 :         assert_can_bypass(layer(name0, None, 0), layer(name1, None, 0), true);
    1291            4 : 
    1292            4 :         // Different shards can bypass. Shard 0 is different from unsharded.
    1293            4 :         assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(1), 0), true);
    1294            4 :         assert_can_bypass(layer(name0, Some(0), 0), layer(name0, None, 0), true);
    1295            4 : 
    1296            4 :         // Different generations can bypass, both sharded and unsharded.
    1297            4 :         assert_can_bypass(layer(name0, None, 0), layer(name0, None, 1), true);
    1298            4 :         assert_can_bypass(layer(name0, Some(1), 0), layer(name0, Some(1), 1), true);
    1299            4 : 
    1300            4 :         Ok(())
    1301            4 :     }
    1302              : }
        

Generated by: LCOV version 2.1-beta