LCOV - differential code coverage report
Current view: top level - pageserver/src/tenant/timeline - delete.rs (source / functions) Coverage Total Hit UBC CBC
Current: cd44433dd675caa99df17a61b18949c8387e2242.info Lines: 86.6 % 305 264 41 264
Current Date: 2024-01-09 02:06:09 Functions: 67.9 % 56 38 18 38
Baseline: 66c52a629a0f4a503e193045e0df4c77139e344b.info
Baseline Date: 2024-01-08 15:34:46

           TLA  Line data    Source code
       1                 : use std::{
       2                 :     ops::{Deref, DerefMut},
       3                 :     sync::Arc,
       4                 : };
       5                 : 
       6                 : use anyhow::Context;
       7                 : use pageserver_api::{models::TimelineState, shard::TenantShardId};
       8                 : use tokio::sync::OwnedMutexGuard;
       9                 : use tracing::{debug, error, info, instrument, warn, Instrument, Span};
      10                 : use utils::{crashsafe, fs_ext, id::TimelineId};
      11                 : 
      12                 : use crate::{
      13                 :     config::PageServerConf,
      14                 :     deletion_queue::DeletionQueueClient,
      15                 :     task_mgr::{self, TaskKind},
      16                 :     tenant::{
      17                 :         debug_assert_current_span_has_tenant_and_timeline_id,
      18                 :         metadata::TimelineMetadata,
      19                 :         remote_timeline_client::{
      20                 :             self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
      21                 :         },
      22                 :         CreateTimelineCause, DeleteTimelineError, Tenant,
      23                 :     },
      24                 : };
      25                 : 
      26                 : use super::{Timeline, TimelineResources};
      27                 : 
      28                 : /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
      29 CBC         179 : async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
      30             179 :     debug_assert_current_span_has_tenant_and_timeline_id();
      31                 :     // Notify any timeline work to drop out of loops/requests
      32 UBC           0 :     tracing::debug!("Cancelling CancellationToken");
      33 CBC         179 :     timeline.cancel.cancel();
      34                 : 
      35                 :     // Stop the walreceiver first.
      36 UBC           0 :     debug!("waiting for wal receiver to shutdown");
      37 CBC         179 :     let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
      38             179 :     if let Some(walreceiver) = maybe_started_walreceiver {
      39             142 :         walreceiver.stop().await;
      40              37 :     }
      41 UBC           0 :     debug!("wal receiver shutdown confirmed");
      42                 : 
      43                 :     // Shut down the layer flush task before the remote client, as one depends on the other
      44 CBC         179 :     task_mgr::shutdown_tasks(
      45             179 :         Some(TaskKind::LayerFlushTask),
      46             179 :         Some(timeline.tenant_shard_id),
      47             179 :         Some(timeline.timeline_id),
      48             179 :     )
      49              22 :     .await;
      50                 : 
      51                 :     // Prevent new uploads from starting.
      52             179 :     if let Some(remote_client) = timeline.remote_client.as_ref() {
      53             179 :         let res = remote_client.stop();
      54             179 :         match res {
      55             179 :             Ok(()) => {}
      56 UBC           0 :             Err(e) => match e {
      57               0 :                 remote_timeline_client::StopError::QueueUninitialized => {
      58               0 :                     // This case shouldn't happen currently because the
      59               0 :                     // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
      60               0 :                     // That is, before we declare the Tenant as Active.
      61               0 :                     // But we only allow calls to delete_timeline on Active tenants.
      62               0 :                     return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
      63                 :                 }
      64                 :             },
      65                 :         }
      66               0 :     }
      67                 : 
      68                 :     // Stop & wait for the remaining timeline tasks, including upload tasks.
      69                 :     // NB: This and other delete_timeline calls do not run as a task_mgr task,
      70                 :     //     so, they are not affected by this shutdown_tasks() call.
      71 CBC         179 :     info!("waiting for timeline tasks to shutdown");
      72             179 :     task_mgr::shutdown_tasks(
      73             179 :         None,
      74             179 :         Some(timeline.tenant_shard_id),
      75             179 :         Some(timeline.timeline_id),
      76             179 :     )
      77             102 :     .await;
      78                 : 
      79             179 :     fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
      80               6 :         Err(anyhow::anyhow!(
      81               6 :             "failpoint: timeline-delete-before-index-deleted-at"
      82               6 :         ))?
      83             179 :     });
      84                 : 
      85 UBC           0 :     tracing::debug!("Waiting for gate...");
      86 CBC         173 :     timeline.gate.close().await;
      87 UBC           0 :     tracing::debug!("Shutdown complete");
      88                 : 
      89 CBC         173 :     Ok(())
      90             179 : }
      91                 : 
      92                 : /// Mark timeline as deleted in S3 so we won't pick it up next time
      93                 : /// during attach or pageserver restart.
      94                 : /// See comment in persist_index_part_with_deleted_flag.
      95             173 : async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
      96             173 :     if let Some(remote_client) = timeline.remote_client.as_ref() {
      97             734 :         match remote_client.persist_index_part_with_deleted_flag().await {
      98                 :             // If we (now, or already) marked it successfully as deleted, we can proceed
      99             173 :             Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
     100                 :             // Bail out otherwise
     101                 :             //
     102                 :             // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
     103                 :             // two tasks from performing the deletion at the same time. The first task
     104                 :             // that starts deletion should run it to completion.
     105 UBC           0 :             Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
     106               0 :             | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
     107               0 :                 return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
     108                 :             }
     109                 :         }
     110               0 :     }
     111 CBC         173 :     Ok(())
     112             173 : }
     113                 : 
     114                 : /// Grab the compaction and gc locks, and actually perform the deletion.
     115                 : ///
     116                 : /// The locks prevent GC or compaction from running at the same time. The background tasks do not
     117                 : /// register themselves with the timeline they're operating on, so they might still be running even
     118                 : /// though we called `shutdown_tasks`.
     119                 : ///
     120                 : /// Note that there are still other race conditions between
     121                 : /// GC, compaction and timeline deletion. See
     122                 : /// <https://github.com/neondatabase/neon/issues/2671>
     123                 : ///
     124                 : /// No timeout here, GC & Compaction should be responsive to the
     125                 : /// `TimelineState::Stopping` change.
     126                 : // pub(super): documentation link
     127             183 : pub(super) async fn delete_local_layer_files(
     128             183 :     conf: &PageServerConf,
     129             183 :     tenant_shard_id: TenantShardId,
     130             183 :     timeline: &Timeline,
     131             183 : ) -> anyhow::Result<()> {
     132             183 :     let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
     133             183 :     let guards = crate::timed(
     134             183 :         guards,
     135             183 :         "acquire gc and compaction locks",
     136             183 :         std::time::Duration::from_secs(5),
     137             183 :     )
     138 UBC           0 :     .await;
     139                 : 
     140                 :     // NB: storage_sync upload tasks that reference these layers have been cancelled
     141                 :     //     by the caller.
     142                 : 
     143 CBC         183 :     let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
     144             183 : 
     145             183 :     fail::fail_point!("timeline-delete-before-rm", |_| {
     146               7 :         Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
     147             183 :     });
     148                 : 
     149                 :     // NB: This need not be atomic because the deleted flag in the IndexPart
     150                 :     // will be observed during tenant/timeline load. The deletion will be resumed there.
     151                 :     //
     152                 :     // For configurations without remote storage, we guarantee crash-safety by persisting delete mark file.
     153                 :     //
     154                 :     // Note that here we do not bail out on std::io::ErrorKind::NotFound.
     155                 :     // This can happen if we're called a second time, e.g.,
     156                 :     // because of a previous failure/cancellation at/after
     157                 :     // failpoint timeline-delete-after-rm.
     158                 :     //
     159                 :     // ErrorKind::NotFound can also happen if we race with tenant detach, because,
     160                 :     // no locks are shared.
     161                 :     //
     162                 :     // For now, log and continue.
     163                 :     // warn! level is technically not appropriate for the
     164                 :     // first case because we should expect retries to happen.
     165                 :     // But the error is so rare, it seems better to get attention if it happens.
     166                 :     //
     167                 :     // Note that metadata removal is skipped, this is not technically needed,
     168                 :     // but allows to reuse timeline loading code during resumed deletion.
     169                 :     // (we always expect that metadata is in place when timeline is being loaded)
     170                 : 
     171                 :     #[cfg(feature = "testing")]
     172             176 :     let mut counter = 0;
     173             176 : 
     174             176 :     // Timeline directory may not exist if we failed to delete mark file and request was retried.
     175             176 :     if !local_timeline_directory.exists() {
     176               5 :         return Ok(());
     177             171 :     }
     178             171 : 
     179             171 :     let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);
     180                 : 
     181            4457 :     for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
     182                 :         #[cfg(feature = "testing")]
     183                 :         {
     184            4457 :             counter += 1;
     185            4457 :             if counter == 2 {
     186             170 :                 fail::fail_point!("timeline-delete-during-rm", |_| {
     187               3 :                     Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
     188             170 :                 });
     189            4287 :             }
     190                 :         }
     191                 : 
     192            4454 :         let entry = entry?;
     193            4454 :         if entry.path() == metadata_path {
     194 UBC           0 :             debug!("found metadata, skipping");
     195 CBC         167 :             continue;
     196            4287 :         }
     197            4287 : 
     198            4287 :         if entry.path() == local_timeline_directory {
     199                 :             // Keeping directory because metadata file is still there
     200 UBC           0 :             debug!("found timeline dir itself, skipping");
     201 CBC         168 :             continue;
     202            4119 :         }
     203                 : 
     204            4119 :         let metadata = match entry.metadata() {
     205            4119 :             Ok(metadata) => metadata,
     206 UBC           0 :             Err(e) => {
     207               0 :                 if crate::is_walkdir_io_not_found(&e) {
     208               0 :                     warn!(
     209               0 :                         timeline_dir=?local_timeline_directory,
     210               0 :                         path=?entry.path().display(),
     211               0 :                         "got not found err while removing timeline dir, proceeding anyway"
     212               0 :                     );
     213               0 :                     continue;
     214               0 :                 }
     215               0 :                 anyhow::bail!(e);
     216                 :             }
     217                 :         };
     218                 : 
     219 CBC        4119 :         if metadata.is_dir() {
     220 UBC           0 :             warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
     221               0 :             tokio::fs::remove_dir(entry.path()).await
     222                 :         } else {
     223 CBC        4119 :             tokio::fs::remove_file(entry.path()).await
     224                 :         }
     225            4119 :         .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
     226                 :     }
     227                 : 
     228             168 :     info!("finished deleting layer files, releasing locks");
     229             168 :     drop(guards);
     230             168 : 
     231             168 :     fail::fail_point!("timeline-delete-after-rm", |_| {
     232               2 :         Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
     233             168 :     });
     234                 : 
     235             166 :     Ok(())
     236             183 : }
     237                 : 
     238                 : /// Removes remote layers and an index file after them.
     239             171 : async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
     240             171 :     if let Some(remote_client) = &timeline.remote_client {
     241            1080 :         remote_client.delete_all().await.context("delete_all")?
     242 UBC           0 :     };
     243                 : 
     244 CBC         161 :     Ok(())
     245             171 : }
     246                 : 
     247                 : // This function removes remaining traces of a timeline on disk.
     248                 : // Namely: metadata file, timeline directory, delete mark.
     249                 : // Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
     250                 : // delete mark should be present because it is the last step during deletion.
     251                 : // (nothing can fail after its deletion)
     252             161 : async fn cleanup_remaining_timeline_fs_traces(
     253             161 :     conf: &PageServerConf,
     254             161 :     tenant_shard_id: TenantShardId,
     255             161 :     timeline_id: TimelineId,
     256             161 : ) -> anyhow::Result<()> {
     257             161 :     // Remove local metadata
     258             161 :     tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
     259             161 :         .await
     260             161 :         .or_else(fs_ext::ignore_not_found)
     261             161 :         .context("remove metadata")?;
     262                 : 
     263             161 :     fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
     264               2 :         Err(anyhow::anyhow!(
     265               2 :             "failpoint: timeline-delete-after-rm-metadata"
     266               2 :         ))?
     267             161 :     });
     268                 : 
     269                 :     // Remove timeline dir
     270             159 :     tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
     271             159 :         .await
     272             159 :         .or_else(fs_ext::ignore_not_found)
     273             159 :         .context("timeline dir")?;
     274                 : 
     275             159 :     fail::fail_point!("timeline-delete-after-rm-dir", |_| {
     276               6 :         Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
     277             159 :     });
     278                 : 
     279                 :     // Make sure previous deletions are ordered before mark removal.
     280                 :     // Otherwise there is no guarantee that they reach the disk before mark deletion.
     281                 :     // So it's possible for mark to reach disk first and for other deletions
     282                 :     // to be reordered later and thus missed if a crash occurs.
     283                 :     // Note that we don't need to sync after mark file is removed
     284                 :     // because we can tolerate the case when mark file reappears on startup.
     285             153 :     let timeline_path = conf.timelines_path(&tenant_shard_id);
     286             153 :     crashsafe::fsync_async(timeline_path)
     287             305 :         .await
     288             153 :         .context("fsync_pre_mark_remove")?;
     289                 : 
     290                 :     // Remove delete mark
     291                 :     // TODO: once we are confident that no more exist in the field, remove this
     292                 :     // line.  It cleans up a legacy marker file that might in rare cases be present.
     293             153 :     tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
     294             152 :         .await
     295             153 :         .or_else(fs_ext::ignore_not_found)
     296             153 :         .context("remove delete mark")
     297             161 : }
     298                 : 
     299                 : /// It is important that this gets called when DeletionGuard is being held.
     300                 : /// For more context see comments in [`DeleteTimelineFlow::prepare`]
     301             153 : async fn remove_timeline_from_tenant(
     302             153 :     tenant: &Tenant,
     303             153 :     timeline_id: TimelineId,
     304             153 :     _: &DeletionGuard, // using it as a witness
     305             153 : ) -> anyhow::Result<()> {
     306             153 :     // Remove the timeline from the map.
     307             153 :     let mut timelines = tenant.timelines.lock().unwrap();
     308             153 :     let children_exist = timelines
     309             153 :         .iter()
     310             247 :         .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
     311             153 :     // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
     312             153 :     // We already deleted the layer files, so it's probably best to panic.
     313             153 :     // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
     314             153 :     if children_exist {
     315 UBC           0 :         panic!("Timeline grew children while we removed layer files");
     316 CBC         153 :     }
     317             153 : 
     318             153 :     timelines
     319             153 :         .remove(&timeline_id)
     320             153 :         .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
     321             153 : 
     322             153 :     drop(timelines);
     323             153 : 
     324             153 :     Ok(())
     325             153 : }
     326                 : 
     327                 : /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
     328                 : /// and deletes its data from both disk and s3.
     329                 : /// The sequence of steps:
     330                 : /// 1. Set deleted_at in remote index part.
     331                 : /// 2. Create local mark file.
     332                 : /// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
     333                 : /// 4. Delete remote layers
     334                 : /// 5. Delete index part
     335                 : /// 6. Delete meta, timeline directory
     336                 : /// 7. Delete mark file
     337                 : /// It is resumable from any step in case a crash/restart occurs.
     338                 : /// There are three entrypoints to the process:
     339                 : /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
     340                 : /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
     341                 : /// and we possibly need to continue deletion of remote files.
     342                 : /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
     343                 : /// index but still have local metadata, timeline directory and delete mark.
     344                 : /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
     345            1290 : #[derive(Default)]
     346                 : pub enum DeleteTimelineFlow {
     347                 :     #[default]
     348                 :     NotStarted,
     349                 :     InProgress,
     350                 :     Finished,
     351                 : }
     352                 : 
     353                 : impl DeleteTimelineFlow {
     354                 :     // These steps are run in the context of management api request handler.
     355                 :     // Long running steps are continued to run in the background.
     356                 :     // NB: If this fails half-way through, and is retried, the retry will go through
     357                 :     // all the same steps again. Make sure the code here is idempotent, and don't
     358                 :     // error out if some of the shutdown tasks have already been completed!
     359 UBC           0 :     #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
     360                 :     pub async fn run(
     361                 :         tenant: &Arc<Tenant>,
     362                 :         timeline_id: TimelineId,
     363                 :         inplace: bool,
     364                 :     ) -> Result<(), DeleteTimelineError> {
     365                 :         let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
     366                 : 
     367                 :         guard.mark_in_progress()?;
     368                 : 
     369                 :         stop_tasks(&timeline).await?;
     370                 : 
     371                 :         set_deleted_in_remote_index(&timeline).await?;
     372                 : 
     373 CBC           2 :         fail::fail_point!("timeline-delete-before-schedule", |_| {
     374               2 :             Err(anyhow::anyhow!(
     375               2 :                 "failpoint: timeline-delete-before-schedule"
     376               2 :             ))?
     377               2 :         });
     378                 : 
     379                 :         if inplace {
     380                 :             Self::background(guard, tenant.conf, tenant, &timeline).await?
     381                 :         } else {
     382                 :             Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
     383                 :         }
     384                 : 
     385                 :         Ok(())
     386                 :     }
     387                 : 
     388             191 :     fn mark_in_progress(&mut self) -> anyhow::Result<()> {
     389             191 :         match self {
     390 UBC           0 :             Self::Finished => anyhow::bail!("Bug. Is in finished state"),
     391 CBC          17 :             Self::InProgress { .. } => { /* We're in a retry */ }
     392             174 :             Self::NotStarted => { /* Fresh start */ }
     393                 :         }
     394                 : 
     395             191 :         *self = Self::InProgress;
     396             191 : 
     397             191 :         Ok(())
     398             191 :     }
     399                 : 
     400                 :     /// Shortcut to create Timeline in stopping state and spawn deletion task.
     401                 :     /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
     402              12 :     #[instrument(skip_all, fields(%timeline_id))]
     403                 :     pub async fn resume_deletion(
     404                 :         tenant: Arc<Tenant>,
     405                 :         timeline_id: TimelineId,
     406                 :         local_metadata: &TimelineMetadata,
     407                 :         remote_client: Option<RemoteTimelineClient>,
     408                 :         deletion_queue_client: DeletionQueueClient,
     409                 :     ) -> anyhow::Result<()> {
     410                 :         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
     411                 :         // RemoteTimelineClient is the only functioning part.
     412                 :         let timeline = tenant
     413                 :             .create_timeline_struct(
     414                 :                 timeline_id,
     415                 :                 local_metadata,
     416                 :                 None, // Ancestor is not needed for deletion.
     417                 :                 TimelineResources {
     418                 :                     remote_client,
     419                 :                     deletion_queue_client,
     420                 :                 },
     421                 :                 // Important. We don't pass ancestor above because it can be missing.
     422                 :                 // Thus we need to skip the validation here.
     423                 :                 CreateTimelineCause::Delete,
     424                 :             )
     425                 :             .context("create_timeline_struct")?;
     426                 : 
     427                 :         let mut guard = DeletionGuard(
     428                 :             Arc::clone(&timeline.delete_progress)
     429                 :                 .try_lock_owned()
     430                 :                 .expect("cannot happen because we're the only owner"),
     431                 :         );
     432                 : 
     433                 :         // We need to do this because when the console retries a delete request we shouldn't answer with 404
     434                 :         // because 404 means successful deletion.
     435                 :         {
     436                 :             let mut locked = tenant.timelines.lock().unwrap();
     437                 :             locked.insert(timeline_id, Arc::clone(&timeline));
     438                 :         }
     439                 : 
     440                 :         guard.mark_in_progress()?;
     441                 : 
     442                 :         Self::schedule_background(guard, tenant.conf, tenant, timeline);
     443                 : 
     444                 :         Ok(())
     445                 :     }
     446                 : 
     447 UBC           0 :     #[instrument(skip_all, fields(%timeline_id))]
     448                 :     pub async fn cleanup_remaining_timeline_fs_traces(
     449                 :         tenant: &Tenant,
     450                 :         timeline_id: TimelineId,
     451                 :     ) -> anyhow::Result<()> {
     452                 :         let r =
     453                 :             cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
     454                 :                 .await;
     455               0 :         info!("Done");
     456                 :         r
     457                 :     }
     458                 : 
     459 CBC         189 :     fn prepare(
     460             189 :         tenant: &Tenant,
     461             189 :         timeline_id: TimelineId,
     462             189 :     ) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
     463             189 :         // Note the interaction between this guard and deletion guard.
     464             189 :         // Here we attempt to lock deletion guard when we're holding a lock on timelines.
     465             189 :         // This is important because when you take into account `remove_timeline_from_tenant`
     466             189 :         // we remove timeline from memory when we still hold the deletion guard.
     467             189 :         // So here when timeline deletion is finished timeline won't be present in timelines map at all
     468             189 :         // which makes the following sequence impossible:
     469             189 :         // T1: get preempted right before the try_lock on `Timeline::delete_progress`
     470             189 :         // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
     471             189 :         // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
     472             189 :         // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
     473             189 :         let timelines = tenant.timelines.lock().unwrap();
     474                 : 
     475             189 :         let timeline = match timelines.get(&timeline_id) {
     476             187 :             Some(t) => t,
     477               2 :             None => return Err(DeleteTimelineError::NotFound),
     478                 :         };
     479                 : 
     480                 :         // Ensure that there are no child timelines **attached to that pageserver**,
     481                 :         // because detach removes files, which will break child branches
     482             187 :         let children: Vec<TimelineId> = timelines
     483             187 :             .iter()
     484             323 :             .filter_map(|(id, entry)| {
     485             323 :                 if entry.get_ancestor_timeline_id() == Some(timeline_id) {
     486               1 :                     Some(*id)
     487                 :                 } else {
     488             322 :                     None
     489                 :                 }
     490             323 :             })
     491             187 :             .collect();
     492             187 : 
     493             187 :         if !children.is_empty() {
     494               1 :             return Err(DeleteTimelineError::HasChildren(children));
     495             186 :         }
     496             186 : 
     497             186 :         // Note that using try_lock here is important to avoid a deadlock.
     498             186 :         // Here we take lock on timelines and then the deletion guard.
     499             186 :         // At the end of the operation we're holding the guard and need to lock timelines map
     500             186 :         // to remove the timeline from it.
     501             186 :         // Always if you have two locks that are taken in different order this can result in a deadlock.
     502             186 : 
     503             186 :         let delete_progress = Arc::clone(&timeline.delete_progress);
     504             186 :         let delete_lock_guard = match delete_progress.try_lock_owned() {
     505             179 :             Ok(guard) => DeletionGuard(guard),
     506                 :             Err(_) => {
     507                 :                 // Unfortunately if lock fails arc is consumed.
     508               7 :                 return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
     509               7 :                     &timeline.delete_progress,
     510               7 :                 )));
     511                 :             }
     512                 :         };
     513                 : 
     514             179 :         timeline.set_state(TimelineState::Stopping);
     515             179 : 
     516             179 :         Ok((Arc::clone(timeline), delete_lock_guard))
     517             189 :     }
     518                 : 
     519              63 :     fn schedule_background(
     520              63 :         guard: DeletionGuard,
     521              63 :         conf: &'static PageServerConf,
     522              63 :         tenant: Arc<Tenant>,
     523              63 :         timeline: Arc<Timeline>,
     524              63 :     ) {
     525              63 :         let tenant_shard_id = timeline.tenant_shard_id;
     526              63 :         let timeline_id = timeline.timeline_id;
     527              63 : 
     528              63 :         task_mgr::spawn(
     529              63 :             task_mgr::BACKGROUND_RUNTIME.handle(),
     530              63 :             TaskKind::TimelineDeletionWorker,
     531              63 :             Some(tenant_shard_id),
     532              63 :             Some(timeline_id),
     533              63 :             "timeline_delete",
     534                 :             false,
     535              63 :             async move {
     536            2062 :                 if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
     537              17 :                     error!("Error: {err:#}");
     538              17 :                     timeline.set_broken(format!("{err:#}"))
     539              46 :                 };
     540              63 :                 Ok(())
     541              63 :             }
     542              63 :             .instrument({
     543              63 :                 let span =
     544              63 :                     tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
     545              63 :                 span.follows_from(Span::current());
     546              63 :                 span
     547              63 :             }),
     548              63 :         );
     549              63 :     }
     550                 : 
     551             183 :     async fn background(
     552             183 :         mut guard: DeletionGuard,
     553             183 :         conf: &PageServerConf,
     554             183 :         tenant: &Tenant,
     555             183 :         timeline: &Timeline,
     556             183 :     ) -> Result<(), DeleteTimelineError> {
     557            4104 :         delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;
     558                 : 
     559            1080 :         delete_remote_layers_and_index(timeline).await?;
     560                 : 
     561             161 :         pausable_failpoint!("in_progress_delete");
     562                 : 
     563             161 :         cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
     564             777 :             .await?;
     565                 : 
     566             153 :         remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
     567                 : 
     568             153 :         *guard = Self::Finished;
     569             153 : 
     570             153 :         Ok(())
     571             183 :     }
     572                 : 
     573               4 :     pub(crate) fn is_finished(&self) -> bool {
     574               4 :         matches!(self, Self::Finished)
     575               4 :     }
     576                 : }
     577                 : 
     578                 : struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
     579                 : 
     580                 : impl Deref for DeletionGuard {
     581                 :     type Target = DeleteTimelineFlow;
     582                 : 
     583 UBC           0 :     fn deref(&self) -> &Self::Target {
     584               0 :         &self.0
     585               0 :     }
     586                 : }
     587                 : 
     588                 : impl DerefMut for DeletionGuard {
     589 CBC         344 :     fn deref_mut(&mut self) -> &mut Self::Target {
     590             344 :         &mut self.0
     591             344 :     }
     592                 : }
        

Generated by: LCOV version 2.1-beta