LCOV - code coverage report
Current view: top level - pageserver/src - deletion_queue.rs (source / functions) Coverage Total Hit
Test: 1e20c4f2b28aa592527961bb32170ebbd2c9172f.info Lines: 84.7 % 687 582
Test Date: 2025-07-16 12:29:03 Functions: 64.6 % 99 64

            Line data    Source code
       1              : mod deleter;
       2              : mod list_writer;
       3              : mod validator;
       4              : 
       5              : use std::collections::HashMap;
       6              : use std::sync::Arc;
       7              : use std::time::Duration;
       8              : 
       9              : use anyhow::Context;
      10              : use camino::Utf8PathBuf;
      11              : use deleter::DeleterMessage;
      12              : use list_writer::ListWriterQueueMessage;
      13              : use pageserver_api::shard::TenantShardId;
      14              : use remote_storage::{GenericRemoteStorage, RemotePath};
      15              : use serde::{Deserialize, Serialize};
      16              : use thiserror::Error;
      17              : use tokio_util::sync::CancellationToken;
      18              : use tracing::{Instrument, debug, error};
      19              : use utils::crashsafe::path_with_suffix_extension;
      20              : use utils::generation::Generation;
      21              : use utils::id::TimelineId;
      22              : use utils::lsn::{AtomicLsn, Lsn};
      23              : use validator::ValidatorQueueMessage;
      24              : 
      25              : use self::deleter::Deleter;
      26              : use self::list_writer::{DeletionOp, ListWriter, RecoverOp};
      27              : use self::validator::Validator;
      28              : use crate::config::PageServerConf;
      29              : use crate::controller_upcall_client::StorageControllerUpcallApi;
      30              : use crate::metrics;
      31              : use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path};
      32              : use crate::tenant::storage_layer::LayerName;
      33              : use crate::virtual_file::{MaybeFatalIo, VirtualFile};
      34              : 
      35              : // TODO: configurable for how long to wait before executing deletions
      36              : 
      37              : /// We aggregate object deletions from many tenants in one place, for several reasons:
      38              : /// - Coalesce deletions into fewer DeleteObjects calls
      39              : /// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes
      40              : ///   to flush any outstanding deletions.
      41              : /// - Globally control throughput of deletions, as these are a low priority task: do
      42              : ///   not compete with the same S3 clients/connections used for higher priority uploads.
      43              : /// - Enable gating deletions on validation of a tenant's generation number, to make
      44              : ///   it safe to multi-attach tenants (see docs/rfcs/025-generation-numbers.md)
      45              : ///
      46              : /// There are two kinds of deletion: deferred and immediate.  A deferred deletion
      47              : /// may be intentionally delayed to protect passive readers of S3 data, and is
      48              : /// subject to a generation number validation step.  An immediate deletion is
      49              : /// ready to execute immediately, and is only queued up so that it can be coalesced
      50              : /// with other deletions in flight.
      51              : ///
      52              : /// Deferred deletions pass through three steps:
      53              : /// - ListWriter: accumulate deletion requests from Timelines, and batch them up into
      54              : ///   DeletionLists, which are persisted to disk.
      55              : /// - Validator: accumulate deletion lists, and validate them en-masse prior to passing
      56              : ///   the keys in the list onward for actual deletion.  Also validate remote_consistent_lsn
      57              : ///   updates for running timelines.
      58              : /// - Deleter: accumulate object keys that the validator has validated, and execute them in
      59              : ///   batches of 1000 keys via DeleteObjects.
      60              : ///
      61              : /// Non-deferred deletions, such as during timeline deletion, bypass the first
      62              : /// two stages and are passed straight into the Deleter.
      63              : ///
      64              : /// Internally, each stage is joined by a channel to the next.  On disk, there is only
      65              : /// one queue (of DeletionLists), which is written by the frontend and consumed
      66              : /// by the backend.
      67              : #[derive(Clone)]
      68              : pub struct DeletionQueue {
      69              :     client: DeletionQueueClient,
      70              : 
      71              :     // Parent cancellation token for the tokens passed into background workers
      72              :     cancel: CancellationToken,
      73              : }
      74              : 
      75              : /// Opaque wrapper around individual worker tasks, to avoid making the
      76              : /// worker objects themselves public
      77              : pub struct DeletionQueueWorkers<C>
      78              : where
      79              :     C: StorageControllerUpcallApi + Send + Sync,
      80              : {
      81              :     frontend: ListWriter,
      82              :     backend: Validator<C>,
      83              :     executor: Deleter,
      84              : }
      85              : 
      86              : impl<C> DeletionQueueWorkers<C>
      87              : where
      88              :     C: StorageControllerUpcallApi + Send + Sync + 'static,
      89              : {
      90            4 :     pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> {
      91            4 :         let jh_frontend = runtime.spawn(async move {
      92            4 :             self.frontend
      93            4 :                 .background()
      94            4 :                 .instrument(tracing::info_span!(parent:None, "deletion frontend"))
      95            4 :                 .await
      96            1 :         });
      97            4 :         let jh_backend = runtime.spawn(async move {
      98            4 :             self.backend
      99            4 :                 .background()
     100            4 :                 .instrument(tracing::info_span!(parent:None, "deletion backend"))
     101            4 :                 .await
     102            1 :         });
     103            4 :         let jh_executor = runtime.spawn(async move {
     104            4 :             self.executor
     105            4 :                 .background()
     106            4 :                 .instrument(tracing::info_span!(parent:None, "deletion executor"))
     107            4 :                 .await
     108            1 :         });
     109              : 
     110            4 :         runtime.spawn({
     111            4 :             async move {
     112            4 :                 jh_frontend.await.expect("error joining frontend worker");
     113            1 :                 jh_backend.await.expect("error joining backend worker");
     114            1 :                 drop(jh_executor.await.expect("error joining executor worker"));
     115            1 :             }
     116              :         })
     117            4 :     }
     118              : }
     119              : 
     120              : /// A FlushOp is just a oneshot channel, where we send the transmit side down
     121              : /// another channel, and the receive side will receive a message when the channel
     122              : /// we're flushing has reached the FlushOp we sent into it.
     123              : ///
     124              : /// The only extra behavior beyond the channel is that the notify() method does not
     125              : /// return an error when the receive side has been dropped, because in this use case
     126              : /// it is harmless (the code that initiated the flush no longer cares about the result).
     127              : #[derive(Debug)]
     128              : struct FlushOp {
     129              :     tx: tokio::sync::oneshot::Sender<()>,
     130              : }
     131              : 
     132              : impl FlushOp {
     133           21 :     fn new() -> (Self, tokio::sync::oneshot::Receiver<()>) {
     134           21 :         let (tx, rx) = tokio::sync::oneshot::channel::<()>();
     135           21 :         (Self { tx }, rx)
     136           21 :     }
     137              : 
     138           22 :     fn notify(self) {
     139           22 :         if self.tx.send(()).is_err() {
     140              :             // oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush.
     141            0 :             debug!("deletion queue flush from dropped client");
     142           22 :         };
     143           22 :     }
     144              : }
     145              : 
     146              : #[derive(Clone, Debug)]
     147              : pub struct DeletionQueueClient {
     148              :     tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
     149              :     executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
     150              : 
     151              :     lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
     152              : }
     153              : 
     154            0 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
     155              : struct TenantDeletionList {
     156              :     /// For each Timeline, a list of key fragments to append to the timeline remote path
     157              :     /// when reconstructing a full key
     158              :     timelines: HashMap<TimelineId, Vec<String>>,
     159              : 
     160              :     /// The generation in which this deletion was emitted: note that this may not be the
     161              :     /// same as the generation of any layers being deleted.  The generation of the layer
     162              :     /// has already been absorbed into the keys in `timelines`
     163              :     generation: Generation,
     164              : }
     165              : 
     166              : impl TenantDeletionList {
     167            5 :     pub(crate) fn len(&self) -> usize {
     168            5 :         self.timelines.values().map(|v| v.len()).sum()
     169            5 :     }
     170              : }
     171              : 
     172              : /// Files ending with this suffix will be ignored and erased
     173              : /// during recovery at startup.
     174              : const TEMP_SUFFIX: &str = "tmp";
     175              : 
     176            0 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
     177              : struct DeletionList {
     178              :     /// Serialization version, for future use
     179              :     version: u8,
     180              : 
     181              :     /// Used for constructing a unique key for each deletion list we write out.
     182              :     sequence: u64,
     183              : 
     184              :     /// To avoid repeating tenant/timeline IDs in every key, we store keys in
     185              :     /// nested HashMaps by TenantTimelineID.  Each Tenant only appears once
     186              :     /// with one unique generation ID: if someone tries to push a second generation
     187              :     /// ID for the same tenant, we will start a new DeletionList.
     188              :     tenants: HashMap<TenantShardId, TenantDeletionList>,
     189              : 
     190              :     /// Avoid having to walk `tenants` to calculate the number of keys in
     191              :     /// the nested deletion lists
     192              :     size: usize,
     193              : 
     194              :     /// Set to true when the list has undergone validation with the control
     195              :     /// plane and the remaining contents of `tenants` are valid.  A list may
     196              :     /// also be implicitly marked valid by DeletionHeader.validated_sequence
     197              :     /// advancing to >= DeletionList.sequence
     198              :     #[serde(default)]
     199              :     #[serde(skip_serializing_if = "std::ops::Not::not")]
     200              :     validated: bool,
     201              : }
     202              : 
     203            0 : #[derive(Debug, Serialize, Deserialize)]
     204              : struct DeletionHeader {
     205              :     /// Serialization version, for future use
     206              :     version: u8,
     207              : 
     208              :     /// The highest sequence number (inclusive) that has been validated.  All deletion
     209              :     /// lists on disk with a sequence <= this value are safe to execute.
     210              :     validated_sequence: u64,
     211              : }
     212              : 
     213              : impl DeletionHeader {
     214              :     const VERSION_LATEST: u8 = 1;
     215              : 
     216            4 :     fn new(validated_sequence: u64) -> Self {
     217            4 :         Self {
     218            4 :             version: Self::VERSION_LATEST,
     219            4 :             validated_sequence,
     220            4 :         }
     221            4 :     }
     222              : 
     223            4 :     async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
     224            4 :         debug!("Saving deletion list header {:?}", self);
     225            4 :         let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
     226            4 :         let header_path = conf.deletion_header_path();
     227            4 :         let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
     228            4 :         VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes)
     229            4 :             .await
     230            4 :             .maybe_fatal_err("save deletion header")?;
     231              : 
     232            4 :         Ok(())
     233            4 :     }
     234              : }
     235              : 
     236              : impl DeletionList {
     237              :     const VERSION_LATEST: u8 = 1;
     238           10 :     fn new(sequence: u64) -> Self {
     239           10 :         Self {
     240           10 :             version: Self::VERSION_LATEST,
     241           10 :             sequence,
     242           10 :             tenants: HashMap::new(),
     243           10 :             size: 0,
     244           10 :             validated: false,
     245           10 :         }
     246           10 :     }
     247              : 
     248           13 :     fn is_empty(&self) -> bool {
     249           13 :         self.tenants.is_empty()
     250           13 :     }
     251              : 
     252           30 :     fn len(&self) -> usize {
     253           30 :         self.size
     254           30 :     }
     255              : 
     256              :     /// Returns true if the push was accepted, false if the caller must start a new
     257              :     /// deletion list.
     258            7 :     fn push(
     259            7 :         &mut self,
     260            7 :         tenant: &TenantShardId,
     261            7 :         timeline: &TimelineId,
     262            7 :         generation: Generation,
     263            7 :         objects: &mut Vec<RemotePath>,
     264            7 :     ) -> bool {
     265            7 :         if objects.is_empty() {
     266              :             // Avoid inserting an empty TenantDeletionList: this preserves the property
     267              :             // that if we have no keys, then self.tenants is empty (used in Self::is_empty)
     268            0 :             return true;
     269            7 :         }
     270              : 
     271            7 :         let tenant_entry = self
     272            7 :             .tenants
     273            7 :             .entry(*tenant)
     274            7 :             .or_insert_with(|| TenantDeletionList {
     275            6 :                 timelines: HashMap::new(),
     276            6 :                 generation,
     277            6 :             });
     278              : 
     279            7 :         if tenant_entry.generation != generation {
     280              :             // Only one generation per tenant per list: signal to
     281              :             // caller to start a new list.
     282            1 :             return false;
     283            6 :         }
     284              : 
     285            6 :         let timeline_entry = tenant_entry.timelines.entry(*timeline).or_default();
     286              : 
     287            6 :         let timeline_remote_path = remote_timeline_path(tenant, timeline);
     288              : 
     289            6 :         self.size += objects.len();
     290            6 :         timeline_entry.extend(objects.drain(..).map(|p| {
     291            6 :             p.strip_prefix(&timeline_remote_path)
     292            6 :                 .expect("Timeline paths always start with the timeline prefix")
     293            6 :                 .to_string()
     294            6 :         }));
     295            6 :         true
     296            7 :     }
     297              : 
     298            5 :     fn into_remote_paths(self) -> Vec<RemotePath> {
     299            5 :         let mut result = Vec::new();
     300            5 :         for (tenant, tenant_deletions) in self.tenants.into_iter() {
     301            3 :             for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
     302            3 :                 let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
     303            3 :                 result.extend(
     304            3 :                     timeline_layers
     305            3 :                         .into_iter()
     306            3 :                         .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
     307              :                 );
     308              :             }
     309              :         }
     310              : 
     311            5 :         result
     312            5 :     }
     313              : 
     314            7 :     async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
     315            7 :         let path = conf.deletion_list_path(self.sequence);
     316            7 :         let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
     317              : 
     318            7 :         let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
     319              : 
     320            7 :         VirtualFile::crashsafe_overwrite(path, temp_path, bytes)
     321            7 :             .await
     322            7 :             .maybe_fatal_err("save deletion list")
     323            7 :             .map_err(Into::into)
     324            7 :     }
     325              : }
     326              : 
     327              : impl std::fmt::Display for DeletionList {
     328            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     329            0 :         write!(
     330            0 :             f,
     331            0 :             "DeletionList<seq={}, tenants={}, keys={}>",
     332              :             self.sequence,
     333            0 :             self.tenants.len(),
     334              :             self.size
     335              :         )
     336            0 :     }
     337              : }
     338              : 
     339              : struct PendingLsn {
     340              :     projected: Lsn,
     341              :     result_slot: Arc<AtomicLsn>,
     342              : }
     343              : 
     344              : struct TenantLsnState {
     345              :     timelines: HashMap<TimelineId, PendingLsn>,
     346              : 
     347              :     // In what generation was the most recent update proposed?
     348              :     generation: Generation,
     349              : }
     350              : 
     351              : #[derive(Default)]
     352              : struct VisibleLsnUpdates {
     353              :     tenants: HashMap<TenantShardId, TenantLsnState>,
     354              : }
     355              : 
     356              : impl VisibleLsnUpdates {
     357          123 :     fn new() -> Self {
     358          123 :         Self {
     359          123 :             tenants: HashMap::new(),
     360          123 :         }
     361          123 :     }
     362              : }
     363              : 
     364              : impl std::fmt::Debug for VisibleLsnUpdates {
     365            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     366            0 :         write!(f, "VisibleLsnUpdates({} tenants)", self.tenants.len())
     367            0 :     }
     368              : }
     369              : 
     370              : #[derive(Error, Debug)]
     371              : pub enum DeletionQueueError {
     372              :     #[error("Deletion queue unavailable during shutdown")]
     373              :     ShuttingDown,
     374              : }
     375              : 
     376              : impl DeletionQueueClient {
     377              :     /// This is synchronous, so it is trivially cancel-safe: there is no future to drop, and the
     378              :     /// message is either pushed or not.  In the context of the deletion queue it doesn't matter:
     379              :     /// once we decide to do a deletion the decision is always final.
     380          228 :     fn do_push<T>(
     381          228 :         &self,
     382          228 :         queue: &tokio::sync::mpsc::UnboundedSender<T>,
     383          228 :         msg: T,
     384          228 :     ) -> Result<(), DeletionQueueError> {
     385          228 :         match queue.send(msg) {
     386          228 :             Ok(_) => Ok(()),
     387            0 :             Err(e) => {
     388              :                 // This shouldn't happen, we should shut down all tenants before
     389              :                 // we shut down the global delete queue.  If we encounter a bug like this,
     390              :                 // we may leak objects as deletions won't be processed.
     391            0 :                 error!("Deletion queue closed while pushing, shutting down? ({e})");
     392            0 :                 Err(DeletionQueueError::ShuttingDown)
     393              :             }
     394              :         }
     395          228 :     }
     396              : 
     397            4 :     pub(crate) fn recover(
     398            4 :         &self,
     399            4 :         attached_tenants: HashMap<TenantShardId, Generation>,
     400            4 :     ) -> Result<(), DeletionQueueError> {
     401            4 :         self.do_push(
     402            4 :             &self.tx,
     403            4 :             ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
     404              :         )
     405            4 :     }
     406              : 
     407              :     /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
     408              :     /// world, it must validate its generation number before doing so.  Rather than do this synchronously,
     409              :     /// we allow the timeline to publish updates at will via this API, and then read back what LSN was most
     410              :     /// recently validated separately.
     411              :     ///
     412              :     /// In this function we publish the LSN to the `projected` field of the timeline's entry in the VisibleLsnUpdates.  The
     413              :     /// backend will later wake up and notice that the tenant's generation requires validation.
     414          770 :     pub(crate) async fn update_remote_consistent_lsn(
     415          770 :         &self,
     416          770 :         tenant_shard_id: TenantShardId,
     417          770 :         timeline_id: TimelineId,
     418          770 :         current_generation: Generation,
     419          770 :         lsn: Lsn,
     420          770 :         result_slot: Arc<AtomicLsn>,
     421          770 :     ) {
     422          770 :         let mut locked = self
     423          770 :             .lsn_table
     424          770 :             .write()
     425          770 :             .expect("Lock should never be poisoned");
     426              : 
     427          770 :         let tenant_entry = locked
     428          770 :             .tenants
     429          770 :             .entry(tenant_shard_id)
     430          770 :             .or_insert(TenantLsnState {
     431          770 :                 timelines: HashMap::new(),
     432          770 :                 generation: current_generation,
     433          770 :             });
     434              : 
     435          770 :         if tenant_entry.generation != current_generation {
     436            0 :             // Generation might have changed if we were detached and then re-attached: in this case,
     437            0 :             // state from the previous generation cannot be trusted.
     438            0 :             tenant_entry.timelines.clear();
     439            0 :             tenant_entry.generation = current_generation;
     440          770 :         }
     441              : 
     442          770 :         tenant_entry.timelines.insert(
     443          770 :             timeline_id,
     444          770 :             PendingLsn {
     445          770 :                 projected: lsn,
     446          770 :                 result_slot,
     447          770 :             },
     448              :         );
     449          770 :     }
     450              : 
     451              :     /// Submit a list of layers for deletion: this function will return before the deletion is
     452              :     /// persistent, but it may be executed at any time after this function enters: do not push
     453              :     /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
     454              :     /// references them).
     455              :     ///
     456              :     /// The `current_generation` is the generation of this pageserver's current attachment.  The
     457              :     /// generations in `layers` are the generations in which those layers were written.
     458          212 :     pub(crate) fn push_layers(
     459          212 :         &self,
     460          212 :         tenant_shard_id: TenantShardId,
     461          212 :         timeline_id: TimelineId,
     462          212 :         current_generation: Generation,
     463          212 :         layers: Vec<(LayerName, LayerFileMetadata)>,
     464          212 :     ) -> Result<(), DeletionQueueError> {
     465              :         // None generations are not valid for attached tenants: they must always be attached in
     466              :         // a known generation.  None generations are still permitted for layers in the index because
     467              :         // they may be historical.
     468          212 :         assert!(!current_generation.is_none());
     469              : 
     470          212 :         metrics::DELETION_QUEUE
     471          212 :             .keys_submitted
     472          212 :             .inc_by(layers.len() as u64);
     473          212 :         self.do_push(
     474          212 :             &self.tx,
     475          212 :             ListWriterQueueMessage::Delete(DeletionOp {
     476          212 :                 tenant_shard_id,
     477          212 :                 timeline_id,
     478          212 :                 layers,
     479          212 :                 generation: current_generation,
     480          212 :                 objects: Vec::new(),
     481          212 :             }),
     482              :         )
     483          212 :     }
     484              : 
     485              :     /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
     486           12 :     async fn do_flush<T>(
     487           12 :         &self,
     488           12 :         queue: &tokio::sync::mpsc::UnboundedSender<T>,
     489           12 :         msg: T,
     490           12 :         rx: tokio::sync::oneshot::Receiver<()>,
     491           12 :     ) -> Result<(), DeletionQueueError> {
     492           12 :         self.do_push(queue, msg)?;
     493           12 :         if rx.await.is_err() {
     494              :             // This shouldn't happen if tenants are shut down before deletion queue.  If we
     495              :             // encounter a bug like this, then a flusher will incorrectly believe it has flushed
     496              :             // when it hasn't, possibly leading to leaking objects.
     497            0 :             error!("Deletion queue dropped flush op while client was still waiting");
     498            0 :             Err(DeletionQueueError::ShuttingDown)
     499              :         } else {
     500           12 :             Ok(())
     501              :         }
     502           12 :     }
     503              : 
     504              :     /// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
     505              :     ///
     506              :     /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
     507            7 :     pub async fn flush(&self) -> Result<(), DeletionQueueError> {
     508            7 :         let (flush_op, rx) = FlushOp::new();
     509            7 :         self.do_flush(&self.tx, ListWriterQueueMessage::Flush(flush_op), rx)
     510            7 :             .await
     511            7 :     }
     512              : 
     513              :     /// Issue a flush without waiting for it to complete.  This is useful on advisory flushes where
     514              :     /// the caller wants to avoid the risk of waiting for lots of enqueued work, such as on tenant
     515              :     /// detach where flushing is nice but not necessary.
     516              :     ///
     517              :     /// This function provides no guarantees of work being done.
     518            0 :     pub fn flush_advisory(&self) {
     519            0 :         let (flush_op, _) = FlushOp::new();
     520              : 
     521              :         // Transmit the flush message, ignoring any result (such as a closed channel during shutdown).
     522            0 :         drop(self.tx.send(ListWriterQueueMessage::FlushExecute(flush_op)));
     523            0 :     }
     524              : 
     525              :     // Wait until all previous deletions are executed
     526            5 :     pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
     527            5 :         debug!("flush_execute: flushing to deletion lists...");
     528              :         // Flush any buffered work to deletion lists
     529            5 :         self.flush().await?;
     530              : 
     531              :         // Flush the backend into the executor of deletion lists
     532            5 :         let (flush_op, rx) = FlushOp::new();
     533            5 :         debug!("flush_execute: flushing backend...");
     534            5 :         self.do_flush(&self.tx, ListWriterQueueMessage::FlushExecute(flush_op), rx)
     535            5 :             .await?;
     536            5 :         debug!("flush_execute: finished flushing backend...");
     537              : 
     538              :         // Flush any immediate-mode deletions (the above backend flush will only flush
     539              :         // the executor if deletions had flowed through the backend)
     540            5 :         debug!("flush_execute: flushing execution...");
     541            5 :         self.flush_immediate().await?;
     542            5 :         debug!("flush_execute: finished flushing execution...");
     543            5 :         Ok(())
     544            5 :     }
     545              : 
     546              :     /// This interface bypasses the persistent deletion queue, and any validation
     547              :     /// that this pageserver is still elegible to execute the deletions.  It is for
     548              :     /// use in timeline deletions, where the control plane is telling us we may
     549              :     /// delete everything in the timeline.
     550              :     ///
     551              :     /// DO NOT USE THIS FROM GC OR COMPACTION CODE.  Use the regular `push_layers`.
     552            0 :     pub(crate) async fn push_immediate(
     553            0 :         &self,
     554            0 :         objects: Vec<RemotePath>,
     555            0 :     ) -> Result<(), DeletionQueueError> {
     556            0 :         metrics::DELETION_QUEUE
     557            0 :             .keys_submitted
     558            0 :             .inc_by(objects.len() as u64);
     559            0 :         self.executor_tx
     560            0 :             .send(DeleterMessage::Delete(objects))
     561            0 :             .await
     562            0 :             .map_err(|_| DeletionQueueError::ShuttingDown)
     563            0 :     }
     564              : 
     565              :     /// Companion to push_immediate.  When this returns Ok, all prior objects sent
     566              :     /// into push_immediate have been deleted from remote storage.
     567            5 :     pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
     568            5 :         let (flush_op, rx) = FlushOp::new();
     569            5 :         self.executor_tx
     570            5 :             .send(DeleterMessage::Flush(flush_op))
     571            5 :             .await
     572            5 :             .map_err(|_| DeletionQueueError::ShuttingDown)?;
     573              : 
     574            5 :         rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
     575            5 :     }
     576              : }
     577              : 
     578              : impl DeletionQueue {
     579            4 :     pub fn new_client(&self) -> DeletionQueueClient {
     580            4 :         self.client.clone()
     581            4 :     }
     582              : 
     583              :     /// Caller may use the returned object to construct clients with new_client.
     584              :     /// Caller should tokio::spawn the background() members of the two worker objects returned:
     585              :     /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
     586            4 :     pub fn new<C>(
     587            4 :         remote_storage: GenericRemoteStorage,
     588            4 :         controller_upcall_client: C,
     589            4 :         conf: &'static PageServerConf,
     590            4 :     ) -> (Self, DeletionQueueWorkers<C>)
     591            4 :     where
     592            4 :         C: StorageControllerUpcallApi + Send + Sync,
     593              :     {
     594              :         // Unbounded channel: enables non-async functions to submit deletions.  The actual length is
     595              :         // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
     596              :         // enough to avoid this taking pathologically large amount of memory.
     597            4 :         let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
     598              : 
     599              :         // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
     600            4 :         let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
     601              : 
     602              :         // Shallow channel: it carries lists of paths, and we expect the main queueing to
     603              :         // happen in the backend (persistent), not in this queue.
     604            4 :         let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);
     605              : 
     606            4 :         let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new()));
     607              : 
     608              :         // The deletion queue has an independent cancellation token to
     609              :         // the general pageserver shutdown token, because it stays alive a bit
     610              :         // longer to flush after Tenants have all been torn down.
     611            4 :         let cancel = CancellationToken::new();
     612              : 
     613            4 :         (
     614            4 :             Self {
     615            4 :                 client: DeletionQueueClient {
     616            4 :                     tx,
     617            4 :                     executor_tx: executor_tx.clone(),
     618            4 :                     lsn_table: lsn_table.clone(),
     619            4 :                 },
     620            4 :                 cancel: cancel.clone(),
     621            4 :             },
     622            4 :             DeletionQueueWorkers {
     623            4 :                 frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()),
     624            4 :                 backend: Validator::new(
     625            4 :                     conf,
     626            4 :                     backend_rx,
     627            4 :                     executor_tx,
     628            4 :                     controller_upcall_client,
     629            4 :                     lsn_table.clone(),
     630            4 :                     cancel.clone(),
     631            4 :                 ),
     632            4 :                 executor: Deleter::new(remote_storage, executor_rx, cancel.clone()),
     633            4 :             },
     634            4 :         )
     635            4 :     }
     636              : 
     637            0 :     pub async fn shutdown(&mut self, timeout: Duration) {
     638            0 :         match tokio::time::timeout(timeout, self.client.flush()).await {
     639              :             Ok(Ok(())) => {
     640            0 :                 tracing::info!("Deletion queue flushed successfully on shutdown")
     641              :             }
     642              :             Ok(Err(DeletionQueueError::ShuttingDown)) => {
     643              :                 // This is not harmful for correctness, but is unexpected: the deletion
     644              :                 // queue's workers should stay alive as long as there are any client handles instantiated.
     645            0 :                 tracing::warn!("Deletion queue stopped prematurely");
     646              :             }
     647            0 :             Err(_timeout) => {
     648            0 :                 tracing::warn!("Timed out flushing deletion queue on shutdown")
     649              :             }
     650              :         }
     651              : 
     652              :         // We only cancel _after_ flushing: otherwise we would be shutting down the
     653              :         // components that do the flush.
     654            0 :         self.cancel.cancel();
     655            0 :     }
     656              : }
     657              : 
     658              : #[cfg(test)]
     659              : mod test {
     660              :     use std::io::ErrorKind;
     661              :     use std::time::Duration;
     662              : 
     663              :     use camino::Utf8Path;
     664              :     use hex_literal::hex;
     665              :     use pageserver_api::key::Key;
     666              :     use pageserver_api::models::ShardImportStatus;
     667              :     use pageserver_api::shard::ShardIndex;
     668              :     use pageserver_api::upcall_api::ReAttachResponseTenant;
     669              :     use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
     670              :     use tokio::task::JoinHandle;
     671              :     use tracing::info;
     672              : 
     673              :     use super::*;
     674              :     use crate::controller_upcall_client::RetryForeverError;
     675              :     use crate::tenant::harness::TenantHarness;
     676              :     use crate::tenant::storage_layer::DeltaLayerName;
     677              :     pub const TIMELINE_ID: TimelineId =
     678              :         TimelineId::from_array(hex!("11223344556677881122334455667788"));
     679              : 
     680              :     pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
     681              :         key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
     682              :         lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
     683              :     });
     684              : 
     685              :     // When you need a second layer in a test.
     686              :     pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
     687              :         key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
     688              :         lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
     689              :     });
     690              : 
    /// Everything a deletion queue test needs, as assembled by `setup`.
    struct TestSetup {
        /// Tenant harness; the tests read its `conf` and `tenant_shard_id`.
        harness: TenantHarness,
        /// Canonicalized local directory backing the remote storage.
        remote_fs_dir: Utf8PathBuf,
        /// Remote storage handle configured as `LocalFs` over `remote_fs_dir`.
        storage: GenericRemoteStorage,
        /// Mock controller whose per-tenant latest generation the tests can set.
        mock_control_plane: MockStorageController,
        /// The deletion queue under test.
        deletion_queue: DeletionQueue,
        /// Join handle returned when the queue's workers were spawned.
        worker_join: JoinHandle<()>,
    }
     699              : 
     700              :     impl TestSetup {
     701              :         /// Simulate a pageserver restart by destroying and recreating the deletion queue
     702            1 :         async fn restart(&mut self) {
     703            1 :             let (deletion_queue, workers) = DeletionQueue::new(
     704            1 :                 self.storage.clone(),
     705            1 :                 self.mock_control_plane.clone(),
     706            1 :                 self.harness.conf,
     707            1 :             );
     708              : 
     709            1 :             tracing::debug!("Spawning worker for new queue queue");
     710            1 :             let worker_join = workers.spawn_with(&tokio::runtime::Handle::current());
     711              : 
     712            1 :             let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join);
     713            1 :             let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue);
     714              : 
     715            1 :             tracing::debug!("Joining worker from previous queue");
     716            1 :             old_deletion_queue.cancel.cancel();
     717            1 :             old_worker_join
     718            1 :                 .await
     719            1 :                 .expect("Failed to join workers for previous deletion queue");
     720            1 :         }
     721              : 
     722            3 :         fn set_latest_generation(&self, gen_: Generation) {
     723            3 :             let tenant_shard_id = self.harness.tenant_shard_id;
     724            3 :             self.mock_control_plane
     725            3 :                 .latest_generation
     726            3 :                 .lock()
     727            3 :                 .unwrap()
     728            3 :                 .insert(tenant_shard_id, gen_);
     729            3 :         }
     730              : 
     731              :         /// Returns remote layer file name, suitable for use in assert_remote_files
     732            3 :         fn write_remote_layer(
     733            3 :             &self,
     734            3 :             file_name: LayerName,
     735            3 :             gen_: Generation,
     736            3 :         ) -> anyhow::Result<String> {
     737            3 :             let tenant_shard_id = self.harness.tenant_shard_id;
     738            3 :             let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
     739            3 :             let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
     740            3 :             std::fs::create_dir_all(&remote_timeline_path)?;
     741            3 :             let remote_layer_file_name = format!("{}{}", file_name, gen_.get_suffix());
     742              : 
     743            3 :             let content: Vec<u8> = format!("placeholder contents of {file_name}").into();
     744              : 
     745            3 :             std::fs::write(
     746            3 :                 remote_timeline_path.join(remote_layer_file_name.clone()),
     747            3 :                 content,
     748            0 :             )?;
     749              : 
     750            3 :             Ok(remote_layer_file_name)
     751            3 :         }
     752              :     }
     753              : 
     754              :     #[derive(Debug, Clone)]
     755              :     struct MockStorageController {
     756              :         pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantShardId, Generation>>>,
     757              :     }
     758              : 
     759              :     impl MockStorageController {
     760            3 :         fn new() -> Self {
     761            3 :             Self {
     762            3 :                 latest_generation: Arc::default(),
     763            3 :             }
     764            3 :         }
     765              :     }
     766              : 
     767              :     impl StorageControllerUpcallApi for MockStorageController {
     768            0 :         async fn re_attach(
     769            0 :             &self,
     770            0 :             _conf: &PageServerConf,
     771            0 :         ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
     772            0 :             unimplemented!()
     773              :         }
     774              : 
     775            4 :         async fn validate(
     776            4 :             &self,
     777            4 :             tenants: Vec<(TenantShardId, Generation)>,
     778            4 :         ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
     779            4 :             let mut result = HashMap::new();
     780              : 
     781            4 :             let latest_generation = self.latest_generation.lock().unwrap();
     782              : 
     783            8 :             for (tenant_shard_id, generation) in tenants {
     784            4 :                 if let Some(latest) = latest_generation.get(&tenant_shard_id) {
     785            4 :                     result.insert(tenant_shard_id, *latest == generation);
     786            4 :                 }
     787              :             }
     788              : 
     789            4 :             Ok(result)
     790            4 :         }
     791              : 
     792            0 :         async fn put_timeline_import_status(
     793            0 :             &self,
     794            0 :             _tenant_shard_id: TenantShardId,
     795            0 :             _timeline_id: TimelineId,
     796            0 :             _generation: Generation,
     797            0 :             _status: pageserver_api::models::ShardImportStatus,
     798            0 :         ) -> Result<(), RetryForeverError> {
     799            0 :             unimplemented!()
     800              :         }
     801              : 
     802            0 :         async fn get_timeline_import_status(
     803            0 :             &self,
     804            0 :             _tenant_shard_id: TenantShardId,
     805            0 :             _timeline_id: TimelineId,
     806            0 :             _generation: Generation,
     807            0 :         ) -> Result<ShardImportStatus, RetryForeverError> {
     808            0 :             unimplemented!()
     809              :         }
     810              :     }
     811              : 
     812            3 :     async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
     813            3 :         let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
     814            3 :         let harness = TenantHarness::create(test_name).await?;
     815              : 
     816              :         // We do not load() the harness: we only need its config and remote_storage
     817              : 
     818              :         // Set up a GenericRemoteStorage targetting a directory
     819            3 :         let remote_fs_dir = harness.conf.workdir.join("remote_fs");
     820            3 :         std::fs::create_dir_all(remote_fs_dir)?;
     821            3 :         let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
     822            3 :         let storage_config = RemoteStorageConfig {
     823            3 :             storage: RemoteStorageKind::LocalFs {
     824            3 :                 local_path: remote_fs_dir.clone(),
     825            3 :             },
     826            3 :             timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
     827            3 :             small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
     828            3 :         };
     829            3 :         let storage = GenericRemoteStorage::from_config(&storage_config)
     830            3 :             .await
     831            3 :             .unwrap();
     832              : 
     833            3 :         let mock_control_plane = MockStorageController::new();
     834              : 
     835            3 :         let (deletion_queue, worker) =
     836            3 :             DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf);
     837              : 
     838            3 :         let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());
     839              : 
     840            3 :         Ok(TestSetup {
     841            3 :             harness,
     842            3 :             remote_fs_dir,
     843            3 :             storage,
     844            3 :             mock_control_plane,
     845            3 :             deletion_queue,
     846            3 :             worker_join,
     847            3 :         })
     848            3 :     }
     849              : 
     850              :     // TODO: put this in a common location so that we can share with remote_timeline_client's tests
     851            9 :     fn assert_remote_files(expected: &[&str], remote_path: &Utf8Path) {
     852            9 :         let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
     853            9 :         expected.sort();
     854              : 
     855            9 :         let mut found: Vec<String> = Vec::new();
     856            9 :         let dir = match std::fs::read_dir(remote_path) {
     857            9 :             Ok(d) => d,
     858            0 :             Err(e) => {
     859            0 :                 if e.kind() == ErrorKind::NotFound {
     860            0 :                     if expected.is_empty() {
     861              :                         // We are asserting prefix is empty: it is expected that the dir is missing
     862            0 :                         return;
     863              :                     } else {
     864            0 :                         assert_eq!(expected, Vec::<String>::new());
     865            0 :                         unreachable!();
     866              :                     }
     867              :                 } else {
     868            0 :                     panic!("Unexpected error listing {remote_path}: {e}");
     869              :                 }
     870              :             }
     871              :         };
     872              : 
     873            9 :         for entry in dir.flatten() {
     874            8 :             let entry_name = entry.file_name();
     875            8 :             let fname = entry_name.to_str().unwrap();
     876            8 :             found.push(String::from(fname));
     877            8 :         }
     878            9 :         found.sort();
     879              : 
     880            9 :         assert_eq!(expected, found);
     881            9 :     }
     882              : 
     883            5 :     fn assert_local_files(expected: &[&str], directory: &Utf8Path) {
     884            5 :         let dir = match std::fs::read_dir(directory) {
     885            4 :             Ok(d) => d,
     886              :             Err(_) => {
     887            1 :                 assert_eq!(expected, &Vec::<String>::new());
     888            1 :                 return;
     889              :             }
     890              :         };
     891            4 :         let mut found = Vec::new();
     892            9 :         for dentry in dir {
     893            5 :             let dentry = dentry.unwrap();
     894            5 :             let file_name = dentry.file_name();
     895            5 :             let file_name_str = file_name.to_string_lossy();
     896            5 :             found.push(file_name_str.to_string());
     897            5 :         }
     898            4 :         found.sort();
     899            4 :         assert_eq!(expected, found);
     900            5 :     }
     901              : 
     902              :     #[tokio::test]
     903            1 :     async fn deletion_queue_smoke() -> anyhow::Result<()> {
     904              :         // Basic test that the deletion queue processes the deletions we pass into it
     905            1 :         let ctx = setup("deletion_queue_smoke")
     906            1 :             .await
     907            1 :             .expect("Failed test setup");
     908            1 :         let client = ctx.deletion_queue.new_client();
     909            1 :         client.recover(HashMap::new())?;
     910              : 
     911            1 :         let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
     912            1 :         let tenant_shard_id = ctx.harness.tenant_shard_id;
     913              : 
     914            1 :         let content: Vec<u8> = "victim1 contents".into();
     915            1 :         let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
     916            1 :         let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
     917            1 :         let deletion_prefix = ctx.harness.conf.deletion_prefix();
     918              : 
     919              :         // Exercise the distinction between the generation of the layers
     920              :         // we delete, and the generation of the running Tenant.
     921            1 :         let layer_generation = Generation::new(0xdeadbeef);
     922            1 :         let now_generation = Generation::new(0xfeedbeef);
     923            1 :         let layer_metadata =
     924            1 :             LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
     925              : 
     926            1 :         let remote_layer_file_name_1 =
     927            1 :             format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
     928              : 
     929              :         // Set mock control plane state to valid for our generation
     930            1 :         ctx.set_latest_generation(now_generation);
     931              : 
     932              :         // Inject a victim file to remote storage
     933            1 :         info!("Writing");
     934            1 :         std::fs::create_dir_all(&remote_timeline_path)?;
     935            1 :         std::fs::write(
     936            1 :             remote_timeline_path.join(remote_layer_file_name_1.clone()),
     937            1 :             content,
     938            0 :         )?;
     939            1 :         assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
     940              : 
     941              :         // File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
     942            1 :         info!("Pushing");
     943            1 :         client.push_layers(
     944            1 :             tenant_shard_id,
     945              :             TIMELINE_ID,
     946            1 :             now_generation,
     947            1 :             [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
     948            0 :         )?;
     949            1 :         assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
     950              : 
     951            1 :         assert_local_files(&[], &deletion_prefix);
     952              : 
     953              :         // File should still be there after we write a deletion list (we haven't pushed enough to execute anything)
     954            1 :         info!("Flushing");
     955            1 :         client.flush().await?;
     956            1 :         assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
     957            1 :         assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
     958              : 
     959              :         // File should go away when we execute
     960            1 :         info!("Flush-executing");
     961            1 :         client.flush_execute().await?;
     962            1 :         assert_remote_files(&[], &remote_timeline_path);
     963            1 :         assert_local_files(&["header-01"], &deletion_prefix);
     964              : 
     965              :         // Flushing on an empty queue should succeed immediately, and not write any lists
     966            1 :         info!("Flush-executing on empty");
     967            1 :         client.flush_execute().await?;
     968            1 :         assert_local_files(&["header-01"], &deletion_prefix);
     969              : 
     970            2 :         Ok(())
     971            1 :     }
     972              : 
     973              :     #[tokio::test]
     974            1 :     async fn deletion_queue_validation() -> anyhow::Result<()> {
     975            1 :         let ctx = setup("deletion_queue_validation")
     976            1 :             .await
     977            1 :             .expect("Failed test setup");
     978            1 :         let client = ctx.deletion_queue.new_client();
     979            1 :         client.recover(HashMap::new())?;
     980              : 
     981              :         // Generation that the control plane thinks is current
     982            1 :         let latest_generation = Generation::new(0xdeadbeef);
     983              :         // Generation that our DeletionQueue thinks the tenant is running with
     984            1 :         let stale_generation = latest_generation.previous();
     985              :         // Generation that our example layer file was written with
     986            1 :         let layer_generation = stale_generation.previous();
     987            1 :         let layer_metadata =
     988            1 :             LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
     989              : 
     990            1 :         ctx.set_latest_generation(latest_generation);
     991              : 
     992            1 :         let tenant_shard_id = ctx.harness.tenant_shard_id;
     993            1 :         let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
     994            1 :         let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
     995              : 
     996              :         // Initial state: a remote layer exists
     997            1 :         let remote_layer_name = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
     998            1 :         assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
     999              : 
    1000            1 :         tracing::debug!("Pushing...");
    1001            1 :         client.push_layers(
    1002            1 :             tenant_shard_id,
    1003              :             TIMELINE_ID,
    1004            1 :             stale_generation,
    1005            1 :             [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
    1006            0 :         )?;
    1007              : 
    1008              :         // We enqueued the operation in a stale generation: it should have failed validation
    1009            1 :         tracing::debug!("Flushing...");
    1010            1 :         tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
    1011            1 :         assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
    1012              : 
    1013            1 :         tracing::debug!("Pushing...");
    1014            1 :         client.push_layers(
    1015            1 :             tenant_shard_id,
    1016              :             TIMELINE_ID,
    1017            1 :             latest_generation,
    1018            1 :             [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
    1019            0 :         )?;
    1020              : 
    1021              :         // We enqueued the operation in a fresh generation: it should have passed validation
    1022            1 :         tracing::debug!("Flushing...");
    1023            1 :         tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
    1024            1 :         assert_remote_files(&[], &remote_timeline_path);
    1025              : 
    1026            2 :         Ok(())
    1027            1 :     }
    1028              : 
    1029              :     #[tokio::test]
    1030            1 :     async fn deletion_queue_recovery() -> anyhow::Result<()> {
    1031              :         // Test recovery across a simulated restart: a deletion enqueued in the generation
                                // immediately before the post-restart generation is executed, while a deletion
                                // enqueued in an older generation is left alone.
    1032            1 :         let mut ctx = setup("deletion_queue_recovery")
    1033            1 :             .await
    1034            1 :             .expect("Failed test setup");
    1035            1 :         let client = ctx.deletion_queue.new_client();
    1036            1 :         client.recover(HashMap::new())?;
    1037              : 
    1038            1 :         let tenant_shard_id = ctx.harness.tenant_shard_id;
    1039              : 
    1040            1 :         let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
    1041            1 :         let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
    1042            1 :         let deletion_prefix = ctx.harness.conf.deletion_prefix();
    1043              : 
    1044            1 :         let layer_generation = Generation::new(0xdeadbeef);
    1045            1 :         let now_generation = Generation::new(0xfeedbeef);
    1046            1 :         let layer_metadata =
    1047            1 :             LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
    1048              : 
    1049              :         // Inject a deletion in the generation before now_generation: after restart,
    1050              :         // this deletion should _not_ get executed (only the immediately previous
    1051              :         // generation gets that treatment)
    1052            1 :         let remote_layer_file_name_historical =
    1053            1 :             ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
    1054            1 :         client.push_layers(
    1055            1 :             tenant_shard_id,
    1056              :             TIMELINE_ID,
    1057            1 :             now_generation.previous(),
    1058            1 :             [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
    1059            0 :         )?;
    1060              : 
    1061              :         // Inject a deletion in now_generation itself: the restart below advances the
    1062              :         // latest generation by one, so this deletion should get executed, because we
    1063              :         // execute deletions in the immediately previous generation on the same node.
    1064            1 :         let remote_layer_file_name_previous =
    1065            1 :             ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
    1066            1 :         client.push_layers(
    1067            1 :             tenant_shard_id,
    1068              :             TIMELINE_ID,
    1069            1 :             now_generation,
    1070            1 :             [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
    1071            0 :         )?;
    1072              : 
                                // flush() (as opposed to flush_execute()) must not have deleted anything yet:
                                // both remote layer files are still expected to be present.
    1073            1 :         client.flush().await?;
    1074            1 :         assert_remote_files(
    1075            1 :             &[
    1076            1 :                 &remote_layer_file_name_historical,
    1077            1 :                 &remote_layer_file_name_previous,
    1078            1 :             ],
    1079            1 :             &remote_timeline_path,
    1080              :         );
    1081              : 
    1082              :         // Different generations for the same tenant will cause two separate
    1083              :         // deletion lists to be emitted.
    1084            1 :         assert_local_files(
    1085            1 :             &["0000000000000001-01.list", "0000000000000002-01.list"],
    1086            1 :             &deletion_prefix,
    1087              :         );
    1088              : 
    1089              :         // Simulate a node restart: the latest generation advances
    1090            1 :         let now_generation = now_generation.next();
    1091            1 :         ctx.set_latest_generation(now_generation);
    1092              : 
    1093              :         // Restart the deletion queue
    1094            1 :         drop(client);
    1095            1 :         ctx.restart().await;
    1096            1 :         let client = ctx.deletion_queue.new_client();
                                // This time recovery is told which generation the tenant is now attached in.
    1097            1 :         client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;
    1098              : 
    1099            1 :         info!("Flush-executing");
    1100            1 :         client.flush_execute().await?;
    1101              :         // The deletion from immediately prior generation was executed, the one from
    1102              :         // an older generation was not.
    1103            1 :         assert_remote_files(&[&remote_layer_file_name_historical], &remote_timeline_path);
    1104            2 :         Ok(())
    1105            1 :     }
    1106              : }
    1107              : 
    1108              : /// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
    1109              : /// or coalescing: a background task spawned by `new()` executes deletions inline as they are received. Call pump() to wait until that task has handled a flush message.
    1110              : #[cfg(test)]
    1111              : pub(crate) mod mock {
    1112              :     use std::sync::atomic::{AtomicUsize, Ordering};
    1113              : 
    1114              :     use tracing::info;
    1115              : 
    1116              :     use super::*;
    1117              :     use crate::tenant::remote_timeline_client::remote_layer_path;
    1118              : 
                            /// Receiving half of the mock queue: owns both channel receivers and is driven by
                            /// `consume()`.
    1119              :     pub struct ConsumerState {
                                /// Messages sent by clients to the front end of the queue.
    1120              :         rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
                                /// Messages sent directly to the executor (deleter) stage.
    1121              :         executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
                                /// Token handed to every `remote_storage.delete()` call.
    1122              :         cancel: CancellationToken,
                                /// Number of deletions attempted so far; incremented whether or not the
                                /// individual delete succeeded.
    1123              :         executed: Arc<AtomicUsize>,
    1124              :     }
    1125              : 
    1126              :     impl ConsumerState {
                                /// Drain both channels, executing every requested deletion inline against
                                /// `remote_storage` and acknowledging every flush request immediately.
                                ///
                                /// Returns once either channel reports that it is closed. Failed deletions
                                /// are logged and otherwise ignored; they still count towards `executed`.
    1127          119 :         async fn consume(&mut self, remote_storage: &GenericRemoteStorage) {
    1128          119 :             info!("Executing all pending deletions");
    1129              : 
    1130              :             // Wait on both channels at once and tag each message with the channel it came from
    1131              :             loop {
    1132              :                 use either::Either;
    1133          293 :                 let msg = tokio::select! {
    1134          293 :                     left = self.executor_rx.recv() => Either::Left(left),
    1135          293 :                     right = self.rx.recv() => Either::Right(right),
    1136              :                 };
    1137            1 :                 match msg {
                                        // `None` means the channel is closed: stop consuming.
    1138            0 :                     Either::Left(None) => break,
    1139            0 :                     Either::Right(None) => break,
                                        // Executor-level delete: the object paths are already resolved.
    1140            0 :                     Either::Left(Some(DeleterMessage::Delete(objects))) => {
    1141            0 :                         for path in objects {
    1142            0 :                             match remote_storage.delete(&path, &self.cancel).await {
    1143              :                                 Ok(_) => {
    1144            0 :                                     debug!("Deleted {path}");
    1145              :                                 }
    1146            0 :                                 Err(e) => {
    1147            0 :                                     error!("Failed to delete {path}, leaking object! ({e})");
    1148              :                                 }
    1149              :                             }
                                            // Counted even if the delete failed.
    1150            0 :                             self.executed.fetch_add(1, Ordering::Relaxed);
    1151              :                         }
    1152              :                     }
    1153            1 :                     Either::Left(Some(DeleterMessage::Flush(flush_op))) => {
    1154            1 :                         flush_op.notify();
    1155            1 :                     }
                                        // Front-end delete: layers are first turned into remote object paths
                                        // and appended to the explicitly listed objects.
    1156          174 :                     Either::Right(Some(ListWriterQueueMessage::Delete(op))) => {
    1157          174 :                         let mut objects = op.objects;
    1158          348 :                         for (layer, meta) in op.layers {
    1159          174 :                             objects.push(remote_layer_path(
    1160          174 :                                 &op.tenant_shard_id.tenant_id,
    1161          174 :                                 &op.timeline_id,
    1162          174 :                                 meta.shard,
    1163          174 :                                 &layer,
    1164          174 :                                 meta.generation,
    1165          174 :                             ));
    1166          174 :                         }
    1167              : 
    1168          347 :                         for path in objects {
    1169          174 :                             info!("Executing deletion {path}");
    1170          174 :                             match remote_storage.delete(&path, &self.cancel).await {
    1171              :                                 Ok(_) => {
    1172          173 :                                     debug!("Deleted {path}");
    1173              :                                 }
    1174            0 :                                 Err(e) => {
    1175            0 :                                     error!("Failed to delete {path}, leaking object! ({e})");
    1176              :                                 }
    1177              :                             }
                                            // Counted even if the delete failed.
    1178          173 :                             self.executed.fetch_add(1, Ordering::Relaxed);
    1179              :                         }
    1180              :                     }
    1181            0 :                     Either::Right(Some(ListWriterQueueMessage::Flush(op))) => {
    1182            0 :                         op.notify();
    1183            0 :                     }
    1184            0 :                     Either::Right(Some(ListWriterQueueMessage::FlushExecute(op))) => {
    1185            0 :                         // We have already executed all prior deletions because mock does them inline
    1186            0 :                         op.notify();
    1187            0 :                     }
    1188            0 :                     Either::Right(Some(ListWriterQueueMessage::Recover(_))) => {
    1189            0 :                         // no-op in mock
    1190            0 :                     }
    1191              :                 }
    1192              :             }
    1193            0 :         }
    1194              :     }
    1195              : 
                            /// Sending half of the mock queue: holds the channel senders and the LSN table
                            /// that are cloned into every client created by `new_client()`.
    1196              :     pub struct MockDeletionQueue {
                                /// Sender for front-end (`ListWriterQueueMessage`) messages.
    1197              :         tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
                                /// Sender for executor (`DeleterMessage`) messages; also used by `pump()`.
    1198              :         executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
                                /// Shared table of visible-LSN updates, starting out empty.
    1199              :         lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
    1200              :     }
    1201              : 
    1202              :     impl MockDeletionQueue {
                                /// Create the queue and spawn a background task that consumes it.
                                ///
                                /// Uses `tokio::spawn`, so this is expected to be called from within a Tokio
                                /// runtime. If `remote_storage` is `None` the spawned task finishes straight
                                /// away and drops the consumer (and with it both receivers), so nothing sent
                                /// by clients is ever processed.
    1203          119 :         pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
    1204          119 :             let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
    1205          119 :             let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);
    1206              : 
                                    // The consumer keeps the only handle to this counter: the clone goes into
                                    // `ConsumerState` and the original is dropped when `new` returns.
    1207          119 :             let executed = Arc::new(AtomicUsize::new(0));
    1208              : 
    1209          119 :             let mut consumer = ConsumerState {
    1210          119 :                 rx,
    1211          119 :                 executor_rx,
    1212          119 :                 cancel: CancellationToken::new(),
    1213          119 :                 executed: executed.clone(),
    1214          119 :             };
    1215              : 
                                    // The task is detached: its JoinHandle is discarded.
    1216          119 :             tokio::spawn(async move {
    1217          119 :                 if let Some(remote_storage) = &remote_storage {
    1218          119 :                     consumer.consume(remote_storage).await;
    1219            0 :                 }
    1220            0 :             });
    1221              : 
    1222          119 :             Self {
    1223          119 :                 tx,
    1224          119 :                 executor_tx,
    1225          119 :                 lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
    1226          119 :             }
    1227          119 :         }
    1228              : 
                                /// Send a flush message down the executor channel and wait for the consumer
                                /// task to acknowledge it.
                                ///
                                /// Panics if the flush message cannot be sent (e.g. the consumer has gone
                                /// away). A dropped acknowledgement is silently ignored.
                                // NOTE(review): no lock guard is visibly held across the awaits below, so
                                // this `allow` may be stale — confirm before removing.
    1229              :         #[allow(clippy::await_holding_lock)]
    1230            1 :         pub async fn pump(&self) {
    1231            1 :             let (tx, rx) = tokio::sync::oneshot::channel();
    1232            1 :             self.executor_tx
    1233            1 :                 .send(DeleterMessage::Flush(FlushOp { tx }))
    1234            1 :                 .await
    1235            1 :                 .expect("Failed to send flush message");
    1236            1 :             rx.await.ok();
    1237            1 :         }
    1238              : 
                                /// Build a `DeletionQueueClient` that shares this queue's senders and LSN table.
    1239          125 :         pub(crate) fn new_client(&self) -> DeletionQueueClient {
    1240          125 :             DeletionQueueClient {
    1241          125 :                 tx: self.tx.clone(),
    1242          125 :                 executor_tx: self.executor_tx.clone(),
    1243          125 :                 lsn_table: self.lsn_table.clone(),
    1244          125 :             }
    1245          125 :         }
    1246              :     }
    1247              : 
    1248              :     /// Test round-trip serialization/deserialization, and test stability of the format
    1249              :     /// vs. a static expected string for the serialized version.
    1250              :     #[test]
    1251            1 :     fn deletion_list_serialization() -> anyhow::Result<()> {
                                // Note: despite the variable name, this is parsed as a `TenantShardId`.
    1252            1 :         let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
    1253            1 :             .to_string()
    1254            1 :             .parse::<TenantShardId>()?;
    1255            1 :         let timeline_id = "be322c834ed9e709e63b5c9698691910"
    1256            1 :             .to_string()
    1257            1 :             .parse::<TimelineId>()?;
    1258            1 :         let generation = Generation::new(123);
    1259              : 
                                // A single object under the tenant/timeline prefix.
    1260            1 :         let object =
    1261            1 :             RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
    1262            1 :         let mut objects = [object].to_vec();
    1263              : 
                                // NOTE(review): the argument `1` presumably is the sequence number that shows
                                // up as "sequence":1 in the expected JSON — confirm against `DeletionList::new`.
    1264            1 :         let mut example = DeletionList::new(1);
    1265            1 :         example.push(&tenant_id, &timeline_id, generation, &mut objects);
    1266              : 
    1267            1 :         let encoded = serde_json::to_string(&example)?;
    1268              : 
                                // Pinned on-disk format: changing this string means changing the format.
    1269            1 :         let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
    1270            1 :         assert_eq!(encoded, expected);
    1271              : 
                                // Decoding the encoded form must give back an equal `DeletionList`.
    1272            1 :         let decoded = serde_json::from_str::<DeletionList>(&encoded)?;
    1273            1 :         assert_eq!(example, decoded);
    1274              : 
    1275            1 :         Ok(())
    1276            1 :     }
    1277              : }
        

Generated by: LCOV version 2.1-beta