LCOV - code coverage report
Current view: top level - pageserver/src/tenant/timeline - delete.rs (source / functions) Coverage Total Hit
Test: 8ac049b474321fdc72ddcb56d7165153a1a900e8.info Lines: 88.7 % 301 267
Test Date: 2023-09-06 10:18:01 Functions: 75.0 % 60 45

            Line data    Source code
       1              : use std::{
       2              :     ops::{Deref, DerefMut},
       3              :     sync::Arc,
       4              : };
       5              : 
       6              : use anyhow::Context;
       7              : use pageserver_api::models::TimelineState;
       8              : use tokio::sync::OwnedMutexGuard;
       9              : use tracing::{debug, error, info, instrument, warn, Instrument, Span};
      10              : use utils::{
      11              :     crashsafe, fs_ext,
      12              :     id::{TenantId, TimelineId},
      13              : };
      14              : 
      15              : use crate::{
      16              :     config::PageServerConf,
      17              :     task_mgr::{self, TaskKind},
      18              :     tenant::{
      19              :         metadata::TimelineMetadata,
      20              :         remote_timeline_client::{
      21              :             self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
      22              :         },
      23              :         CreateTimelineCause, DeleteTimelineError, Tenant,
      24              :     },
      25              :     InitializationOrder,
      26              : };
      27              : 
      28              : use super::{Timeline, TimelineResources};
      29              : 
      30              : /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
      31          302 : async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
      32              :     // Stop the walreceiver first.
      33            0 :     debug!("waiting for wal receiver to shutdown");
      34          302 :     let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
      35          302 :     if let Some(walreceiver) = maybe_started_walreceiver {
      36          236 :         walreceiver.stop().await;
      37           66 :     }
      38            0 :     debug!("wal receiver shutdown confirmed");
      39              : 
      40              :     // Prevent new uploads from starting.
      41          302 :     if let Some(remote_client) = timeline.remote_client.as_ref() {
      42          214 :         let res = remote_client.stop();
      43          214 :         match res {
      44          214 :             Ok(()) => {}
      45            0 :             Err(e) => match e {
      46            0 :                 remote_timeline_client::StopError::QueueUninitialized => {
      47            0 :                     // This case shouldn't happen currently because the
      48            0 :                     // load and attach code bails out if _any_ of the timelines fails to fetch its IndexPart.
      49            0 :                     // That is, before we declare the Tenant as Active.
      50            0 :                     // But we only allow calls to delete_timeline on Active tenants.
      51            0 :                     return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
      52              :                 }
      53              :             },
      54              :         }
      55           88 :     }
      56              : 
      57              :     // Stop & wait for the remaining timeline tasks, including upload tasks.
      58              :     // NB: This and other delete_timeline calls do not run as a task_mgr task,
      59              :     //     so, they are not affected by this shutdown_tasks() call.
      60          302 :     info!("waiting for timeline tasks to shutdown");
      61          315 :     task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
      62              : 
      63          302 :     fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
      64           12 :         Err(anyhow::anyhow!(
      65           12 :             "failpoint: timeline-delete-before-index-deleted-at"
      66           12 :         ))?
      67          302 :     });
      68          290 :     Ok(())
      69          302 : }
      70              : 
      71              : /// Mark timeline as deleted in S3 so we won't pick it up next time
      72              : /// during attach or pageserver restart.
      73              : /// See comment in persist_index_part_with_deleted_flag.
      74          290 : async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
      75          290 :     if let Some(remote_client) = timeline.remote_client.as_ref() {
      76          685 :         match remote_client.persist_index_part_with_deleted_flag().await {
      77              :             // If we (now, or already) marked it successfully as deleted, we can proceed
      78          206 :             Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
      79              :             // Bail out otherwise
      80              :             //
      81              :             // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
      82              :             // two tasks from performing the deletion at the same time. The first task
      83              :             // that starts deletion should run it to completion.
      84            0 :             Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
      85            0 :             | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
      86            0 :                 return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
      87              :             }
      88              :         }
      89           84 :     }
      90          290 :     Ok(())
      91          290 : }
      92              : 
      93              : // We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
      94              : // This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
      95              : // gets deleted there is no way to distinguish between "this timeline is good, we just didn't upload it to remote"
      96              : // and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
      97              : // After index part is deleted presence of this mark file identifies that it was a deletion intention.
      98              : // So we can just remove the mark file.
      99          316 : async fn create_delete_mark(
     100          316 :     conf: &PageServerConf,
     101          316 :     tenant_id: TenantId,
     102          316 :     timeline_id: TimelineId,
     103          316 : ) -> Result<(), DeleteTimelineError> {
     104          316 :     fail::fail_point!("timeline-delete-before-delete-mark", |_| {
     105            0 :         Err(anyhow::anyhow!(
     106            0 :             "failpoint: timeline-delete-before-delete-mark"
     107            0 :         ))?
     108          316 :     });
     109          316 :     let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
     110          316 : 
     111          316 :     // Note: we're ok to replace existing file.
     112          316 :     let _ = std::fs::OpenOptions::new()
     113          316 :         .write(true)
     114          316 :         .create(true)
     115          316 :         .open(&marker_path)
     116          316 :         .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
     117              : 
     118          316 :     crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
     119          316 :     Ok(())
     120          316 : }
     121              : 
     122              : /// Grab the layer_removal_cs lock, and actually perform the deletion.
     123              : ///
     124              : /// This lock prevents GC or compaction from running at the same time.
     125              : /// The GC task doesn't register itself with the timeline it's operating on,
     126              : /// so it might still be running even though we called `shutdown_tasks`.
     127              : ///
     128              : /// Note that there are still other race conditions between
     129              : /// GC, compaction and timeline deletion. See
     130              : /// <https://github.com/neondatabase/neon/issues/2671>
     131              : ///
     132              : /// No timeout here, GC & Compaction should be responsive to the
     133              : /// `TimelineState::Stopping` change.
     134          310 : async fn delete_local_layer_files(
     135          310 :     conf: &PageServerConf,
     136          310 :     tenant_id: TenantId,
     137          310 :     timeline: &Timeline,
     138          310 : ) -> anyhow::Result<()> {
     139          310 :     info!("waiting for layer_removal_cs.lock()");
     140          310 :     let layer_removal_guard = timeline.layer_removal_cs.lock().await;
     141          310 :     info!("got layer_removal_cs.lock(), deleting layer files");
     142              : 
     143              :     // NB: storage_sync upload tasks that reference these layers have been cancelled
     144              :     //     by the caller.
     145              : 
     146          310 :     let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
     147          310 : 
     148          310 :     fail::fail_point!("timeline-delete-before-rm", |_| {
     149           13 :         Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
     150          310 :     });
     151              : 
     152              :     // NB: This need not be atomic because the deleted flag in the IndexPart
     153              :     // will be observed during tenant/timeline load. The deletion will be resumed there.
     154              :     //
     155              :     // For configurations without remote storage, we guarantee crash-safety by persisting delete mark file.
     156              :     //
     157              :     // Note that here we do not bail out on std::io::ErrorKind::NotFound.
     158              :     // This can happen if we're called a second time, e.g.,
     159              :     // because of a previous failure/cancellation at/after
     160              :     // failpoint timeline-delete-after-rm.
     161              :     //
     162              :     // It can also happen if we race with tenant detach, because,
     163              :     // it doesn't grab the layer_removal_cs lock.
     164              :     //
     165              :     // For now, log and continue.
     166              :     // warn! level is technically not appropriate for the
     167              :     // first case because we should expect retries to happen.
     168              :     // But the error is so rare, it seems better to get attention if it happens.
     169              :     //
     170              :     // Note that metadata removal is skipped, this is not technically needed,
     171              :     // but allows to reuse timeline loading code during resumed deletion.
     172              :     // (we always expect that metadata is in place when timeline is being loaded)
     173              : 
     174              :     #[cfg(feature = "testing")]
     175          297 :     let mut counter = 0;
     176          297 : 
     177          297 :     // Timeline directory may not exist if we failed to delete mark file and request was retried.
     178          297 :     if !local_timeline_directory.exists() {
     179           12 :         return Ok(());
     180          285 :     }
     181          285 : 
     182          285 :     let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
     183              : 
     184         6358 :     for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
     185              :         #[cfg(feature = "testing")]
     186              :         {
     187         6358 :             counter += 1;
     188         6358 :             if counter == 2 {
     189          282 :                 fail::fail_point!("timeline-delete-during-rm", |_| {
     190            9 :                     Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
     191          282 :                 });
     192         6076 :             }
     193              :         }
     194              : 
     195         6349 :         let entry = entry?;
     196         6349 :         if entry.path() == metadata_path {
     197            0 :             debug!("found metadata, skipping");
     198          273 :             continue;
     199         6076 :         }
     200         6076 : 
     201         6076 :         if entry.path() == local_timeline_directory {
     202              :             // Keeping directory because metadata file is still there
     203            0 :             debug!("found timeline dir itself, skipping");
     204          276 :             continue;
     205         5800 :         }
     206              : 
     207         5800 :         let metadata = match entry.metadata() {
     208         5800 :             Ok(metadata) => metadata,
     209            0 :             Err(e) => {
     210            0 :                 if crate::is_walkdir_io_not_found(&e) {
     211            0 :                     warn!(
     212            0 :                         timeline_dir=?local_timeline_directory,
     213            0 :                         path=?entry.path().display(),
     214            0 :                         "got not found err while removing timeline dir, proceeding anyway"
     215            0 :                     );
     216            0 :                     continue;
     217            0 :                 }
     218            0 :                 anyhow::bail!(e);
     219              :             }
     220              :         };
     221              : 
     222         5800 :         if metadata.is_dir() {
     223            0 :             warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
     224            0 :             tokio::fs::remove_dir(entry.path()).await
     225              :         } else {
     226         5800 :             tokio::fs::remove_file(entry.path()).await
     227              :         }
     228         5800 :         .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
     229              :     }
     230              : 
     231          276 :     info!("finished deleting layer files, releasing layer_removal_cs.lock()");
     232          276 :     drop(layer_removal_guard);
     233          276 : 
     234          276 :     fail::fail_point!("timeline-delete-after-rm", |_| {
     235            6 :         Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
     236          276 :     });
     237              : 
     238          270 :     Ok(())
     239          310 : }
     240              : 
     241              : /// Removes remote layers and an index file after them.
     242          282 : async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
     243          282 :     if let Some(remote_client) = &timeline.remote_client {
     244         2734 :         remote_client.delete_all().await.context("delete_all")?
     245           79 :     };
     246              : 
     247          266 :     Ok(())
     248          282 : }
     249              : 
     250              : // This function removes remaining traces of a timeline on disk.
     251              : // Namely: metadata file, timeline directory, delete mark.
     252              : // Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
     253              : // delete mark should be present because it is the last step during deletion.
     254              : // (nothing can fail after its deletion)
     255          277 : async fn cleanup_remaining_timeline_fs_traces(
     256          277 :     conf: &PageServerConf,
     257          277 :     tenant_id: TenantId,
     258          277 :     timeline_id: TimelineId,
     259          277 : ) -> anyhow::Result<()> {
     260              :     // Remove local metadata
     261          277 :     tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
     262          276 :         .await
     263          277 :         .or_else(fs_ext::ignore_not_found)
     264          277 :         .context("remove metadata")?;
     265              : 
     266          277 :     fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
     267            6 :         Err(anyhow::anyhow!(
     268            6 :             "failpoint: timeline-delete-after-rm-metadata"
     269            6 :         ))?
     270          277 :     });
     271              : 
     272              :     // Remove timeline dir
     273          271 :     tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
     274          269 :         .await
     275          271 :         .or_else(fs_ext::ignore_not_found)
     276          271 :         .context("timeline dir")?;
     277              : 
     278          271 :     fail::fail_point!("timeline-delete-after-rm-dir", |_| {
     279           12 :         Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
     280          271 :     });
     281              : 
     282              :     // Make sure previous deletions are ordered before mark removal.
     283              :     // Otherwise there is no guarantee that they reach the disk before mark deletion.
     284              :     // So it's possible for mark to reach disk first and for other deletions
     285              :     // to be reordered later and thus missed if a crash occurs.
     286              :     // Note that we don't need to sync after mark file is removed
     287              :     // because we can tolerate the case when mark file reappears on startup.
     288          259 :     let timeline_path = conf.timelines_path(&tenant_id);
     289          259 :     crashsafe::fsync_async(timeline_path)
     290          515 :         .await
     291          259 :         .context("fsync_pre_mark_remove")?;
     292              : 
     293              :     // Remove delete mark
     294          259 :     tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
     295          259 :         .await
     296          259 :         .context("remove delete mark")
     297          277 : }
     298              : 
     299              : /// It is important that this gets called when DeletionGuard is being held.
     300              : /// For more context see comments in [`DeleteTimelineFlow::prepare`]
     301          248 : async fn remove_timeline_from_tenant(
     302          248 :     tenant: &Tenant,
     303          248 :     timeline_id: TimelineId,
     304          248 :     _: &DeletionGuard, // using it as a witness
     305          248 : ) -> anyhow::Result<()> {
     306          248 :     // Remove the timeline from the map.
     307          248 :     let mut timelines = tenant.timelines.lock().unwrap();
     308          248 :     let children_exist = timelines
     309          248 :         .iter()
     310          417 :         .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
     311          248 :     // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
     312          248 :     // We already deleted the layer files, so it's probably best to panic.
     313          248 :     // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
     314          248 :     if children_exist {
     315            0 :         panic!("Timeline grew children while we removed layer files");
     316          248 :     }
     317          248 : 
     318          248 :     timelines
     319          248 :         .remove(&timeline_id)
     320          248 :         .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
     321          248 : 
     322          248 :     drop(timelines);
     323          248 : 
     324          248 :     Ok(())
     325          248 : }
     326              : 
     327              : /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
     328              : /// and deletes its data from both disk and s3.
     329              : /// The sequence of steps:
     330              : /// 1. Set deleted_at in remote index part.
     331              : /// 2. Create local mark file.
     332              : /// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
     333              : /// 4. Delete remote layers
     334              : /// 5. Delete index part
     335              : /// 6. Delete meta, timeline directory
     336              : /// 7. Delete mark file
     337              : /// It is resumable from any step in case a crash/restart occurs.
     338              : /// There are three entrypoints to the process:
     339              : /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
     340              : /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
     341              : /// and we possibly need to continue deletion of remote files.
     342              : /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
     343              : /// index but still have local metadata, timeline directory and delete mark.
     344              : /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
     345         1394 : #[derive(Default)]
     346              : pub enum DeleteTimelineFlow {
     347              :     #[default]
     348              :     NotStarted,
     349              :     InProgress,
     350              :     Finished,
     351              : }
     352              : 
     353              : impl DeleteTimelineFlow {
     354              :     // These steps are run in the context of management api request handler.
     355              :     // Long running steps are continued to run in the background.
     356              :     // NB: If this fails half-way through, and is retried, the retry will go through
     357              :     // all the same steps again. Make sure the code here is idempotent, and don't
     358              :     // error out if some of the shutdown tasks have already been completed!
     359          605 :     #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
     360              :     pub async fn run(
     361              :         tenant: &Arc<Tenant>,
     362              :         timeline_id: TimelineId,
     363              :         inplace: bool,
     364              :     ) -> Result<(), DeleteTimelineError> {
     365              :         let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
     366              : 
     367              :         guard.mark_in_progress()?;
     368              : 
     369              :         stop_tasks(&timeline).await?;
     370              : 
     371              :         set_deleted_in_remote_index(&timeline).await?;
     372              : 
     373              :         create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
     374              : 
     375              :         fail::fail_point!("timeline-delete-before-schedule", |_| {
     376            6 :             Err(anyhow::anyhow!(
     377            6 :                 "failpoint: timeline-delete-before-schedule"
     378            6 :             ))?
     379            6 :         });
     380              : 
     381              :         if inplace {
     382              :             Self::background(guard, tenant.conf, tenant, &timeline).await?
     383              :         } else {
     384              :             Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
     385              :         }
     386              : 
     387              :         Ok(())
     388              :     }
     389              : 
     390          328 :     fn mark_in_progress(&mut self) -> anyhow::Result<()> {
     391          328 :         match self {
     392            0 :             Self::Finished => anyhow::bail!("Bug. Is in finished state"),
     393           36 :             Self::InProgress { .. } => { /* We're in a retry */ }
     394          292 :             Self::NotStarted => { /* Fresh start */ }
     395              :         }
     396              : 
     397          328 :         *self = Self::InProgress;
     398          328 : 
     399          328 :         Ok(())
     400          328 :     }
     401              : 
     402              :     /// Shortcut to create Timeline in stopping state and spawn deletion task.
     403              :     /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
     404          104 :     #[instrument(skip_all, fields(%timeline_id))]
     405              :     pub async fn resume_deletion(
     406              :         tenant: Arc<Tenant>,
     407              :         timeline_id: TimelineId,
     408              :         local_metadata: &TimelineMetadata,
     409              :         remote_client: Option<RemoteTimelineClient>,
     410              :         init_order: Option<&InitializationOrder>,
     411              :     ) -> anyhow::Result<()> {
     412              :         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
     413              :         // RemoteTimelineClient is the only functioning part.
     414              :         let timeline = tenant
     415              :             .create_timeline_struct(
     416              :                 timeline_id,
     417              :                 local_metadata,
     418              :                 None, // Ancestor is not needed for deletion.
     419              :                 TimelineResources { remote_client },
     420              :                 init_order,
     421              :                 // Important. We don't pass ancestor above because it can be missing.
     422              :                 // Thus we need to skip the validation here.
     423              :                 CreateTimelineCause::Delete,
     424              :             )
     425              :             .context("create_timeline_struct")?;
     426              : 
     427              :         let mut guard = DeletionGuard(
     428              :             Arc::clone(&timeline.delete_progress)
     429              :                 .try_lock_owned()
     430              :                 .expect("cannot happen because we're the only owner"),
     431              :         );
     432              : 
     433              :         // We need to do this because when console retries delete request we shouldn't answer with 404
     434              :         // because 404 means successful deletion.
     435              :         {
     436              :             let mut locked = tenant.timelines.lock().unwrap();
     437              :             locked.insert(timeline_id, Arc::clone(&timeline));
     438              :         }
     439              : 
     440              :         guard.mark_in_progress()?;
     441              : 
     442              :         // Note that delete mark can be missing on resume
     443              :         // because we create delete mark after we set deleted_at in the index part.
     444              :         create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
     445              : 
     446              :         Self::schedule_background(guard, tenant.conf, tenant, timeline);
     447              : 
     448              :         Ok(())
     449              :     }
     450              : 
     451           33 :     #[instrument(skip_all, fields(%timeline_id))]
     452              :     pub async fn cleanup_remaining_timeline_fs_traces(
     453              :         tenant: &Tenant,
     454              :         timeline_id: TimelineId,
     455              :     ) -> anyhow::Result<()> {
     456              :         let r =
     457              :             cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
     458           11 :         info!("Done");
     459              :         r
     460              :     }
     461              : 
     462          315 :     fn prepare(
     463          315 :         tenant: &Tenant,
     464          315 :         timeline_id: TimelineId,
     465          315 :     ) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
     466          315 :         // Note the interaction between this guard and deletion guard.
     467          315 :         // Here we attempt to lock deletion guard when we're holding a lock on timelines.
     468          315 :         // This is important because when you take into account `remove_timeline_from_tenant`
     469          315 :         // we remove timeline from memory when we still hold the deletion guard.
     470          315 :         // So here when timeline deletion is finished timeline won't be present in timelines map at all
     471          315 :         // which makes the following sequence impossible:
     472          315 :         // T1: get preempted right before the try_lock on `Timeline::delete_progress`
     473          315 :         // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
     474          315 :         // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
     475          315 :         // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
     476          315 :         let timelines = tenant.timelines.lock().unwrap();
     477              : 
     478          315 :         let timeline = match timelines.get(&timeline_id) {
     479          310 :             Some(t) => t,
     480            5 :             None => return Err(DeleteTimelineError::NotFound),
     481              :         };
     482              : 
     483              :         // Ensure that there are no child timelines **attached to that pageserver**,
     484              :         // because detach removes files, which will break child branches
     485          310 :         let children: Vec<TimelineId> = timelines
     486          310 :             .iter()
     487          551 :             .filter_map(|(id, entry)| {
     488          551 :                 if entry.get_ancestor_timeline_id() == Some(timeline_id) {
     489            1 :                     Some(*id)
     490              :                 } else {
     491          550 :                     None
     492              :                 }
     493          551 :             })
     494          310 :             .collect();
     495          310 : 
     496          310 :         if !children.is_empty() {
     497            1 :             return Err(DeleteTimelineError::HasChildren(children));
     498          309 :         }
     499          309 : 
     500          309 :         // Note that using try_lock here is important to avoid a deadlock.
     501          309 :         // Here we take lock on timelines and then the deletion guard.
     502          309 :         // At the end of the operation we're holding the guard and need to lock timelines map
     503          309 :         // to remove the timeline from it.
     504          309 :         // Whenever two locks are taken in different orders, a deadlock can result.
     505          309 : 
     506          309 :         let delete_progress = Arc::clone(&timeline.delete_progress);
     507          309 :         let delete_lock_guard = match delete_progress.try_lock_owned() {
     508          302 :             Ok(guard) => DeletionGuard(guard),
     509              :             Err(_) => {
     510              :                 // Unfortunately, if `try_lock_owned` fails the Arc is consumed, so we clone it again.
     511            7 :                 return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
     512            7 :                     &timeline.delete_progress,
     513            7 :                 )));
     514              :             }
     515              :         };
     516              : 
     517          302 :         timeline.set_state(TimelineState::Stopping);
     518          302 : 
     519          302 :         Ok((Arc::clone(timeline), delete_lock_guard))
     520          315 :     }
     521              : 
     522          129 :     fn schedule_background(
     523          129 :         guard: DeletionGuard,
     524          129 :         conf: &'static PageServerConf,
     525          129 :         tenant: Arc<Tenant>,
     526          129 :         timeline: Arc<Timeline>,
     527          129 :     ) {
     528          129 :         let tenant_id = timeline.tenant_id;
     529          129 :         let timeline_id = timeline.timeline_id;
     530          129 : 
     531          129 :         task_mgr::spawn(
     532          129 :             task_mgr::BACKGROUND_RUNTIME.handle(),
     533          129 :             TaskKind::TimelineDeletionWorker,
     534          129 :             Some(tenant_id),
     535          129 :             Some(timeline_id),
     536          129 :             "timeline_delete",
     537              :             false,
     538          129 :             async move {
     539         4251 :                 if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
     540           43 :                     error!("Error: {err:#}");
     541           43 :                     timeline.set_broken(format!("{err:#}"))
     542           86 :                 };
     543          129 :                 Ok(())
     544          129 :             }
     545          129 :             .instrument({
     546          129 :                 let span =
     547          129 :                     tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
     548          129 :                 span.follows_from(Span::current());
     549          129 :                 span
     550          129 :             }),
     551          129 :         );
     552          129 :     }
     553              : 
     554          310 :     async fn background(
     555          310 :         mut guard: DeletionGuard,
     556          310 :         conf: &PageServerConf,
     557          310 :         tenant: &Tenant,
     558          310 :         timeline: &Timeline,
     559          310 :     ) -> Result<(), DeleteTimelineError> {
     560         5782 :         delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
     561              : 
     562         2734 :         delete_remote_layers_and_index(timeline).await?;
     563              : 
     564          266 :         pausable_failpoint!("in_progress_delete");
     565              : 
     566         1264 :         cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
     567              : 
     568          248 :         remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
     569              : 
     570          248 :         *guard = Self::Finished;
     571          248 : 
     572          248 :         Ok(())
     573          310 :     }
     574              : 
     575            4 :     pub(crate) fn is_finished(&self) -> bool {
     576            4 :         matches!(self, Self::Finished)
     577            4 :     }
     578              : }
     579              : 
     580              : struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
     581              : 
     582              : impl Deref for DeletionGuard {
     583              :     type Target = DeleteTimelineFlow;
     584              : 
     585            0 :     fn deref(&self) -> &Self::Target {
     586            0 :         &self.0
     587            0 :     }
     588              : }
     589              : 
     590              : impl DerefMut for DeletionGuard {
     591          576 :     fn deref_mut(&mut self) -> &mut Self::Target {
     592          576 :         &mut self.0
     593          576 :     }
     594              : }
        

Generated by: LCOV version 2.1-beta