LCOV - code coverage report
Current view: top level - pageserver/src/tenant/timeline - (source / functions) Coverage Total Hit
Test: Lines: 36.9 % 233 86
Test Date: 2024-11-20 01:36:58 Functions: 15.4 % 26 4

            Line data    Source code
       1              : use std::{
       2              :     ops::{Deref, DerefMut},
       3              :     sync::Arc,
       4              : };
       5              : 
       6              : use anyhow::Context;
       7              : use pageserver_api::{models::TimelineState, shard::TenantShardId};
       8              : use remote_storage::DownloadError;
       9              : use tokio::sync::OwnedMutexGuard;
      10              : use tracing::{error, info, info_span, instrument, Instrument};
      11              : use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
      12              : 
      13              : use crate::{
      14              :     config::PageServerConf,
      15              :     task_mgr::{self, TaskKind},
      16              :     tenant::{
      17              :         metadata::TimelineMetadata,
      18              :         remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
      19              :         CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
      20              :         TenantManifestError, TimelineOrOffloaded,
      21              :     },
      22              :     virtual_file::MaybeFatalIo,
      23              : };
      24              : 
      25              : use super::{Timeline, TimelineResources};
      26              : 
      27              : /// Mark timeline as deleted in S3 so we won't pick it up next time
      28              : /// during attach or pageserver restart.
      29              : /// See comment in persist_index_part_with_deleted_flag.
      30            0 : async fn set_deleted_in_remote_index(
      31            0 :     remote_client: &Arc<RemoteTimelineClient>,
      32            0 : ) -> Result<(), DeleteTimelineError> {
      33            0 :     let res = remote_client.persist_index_part_with_deleted_flag().await;
      34            0 :     match res {
      35              :         // If we (now, or already) marked it successfully as deleted, we can proceed
      36            0 :         Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
      37              :         // Bail out otherwise
      38              :         //
      39              :         // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
      40              :         // two tasks from performing the deletion at the same time. The first task
      41              :         // that starts deletion should run it to completion.
      42            0 :         Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
      43            0 :         | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
      44            0 :             return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
      45              :         }
      46              :     }
      47            0 :     Ok(())
      48            0 : }
      49              : 
      50              : /// Grab the compaction and gc locks, and actually perform the deletion.
      51              : ///
      52              : /// The locks prevent GC or compaction from running at the same time. The background tasks do not
      53              : /// register themselves with the timeline they're operating on, so they might still be running even
      54              : /// though we called `shutdown_tasks`.
      55              : ///
      56              : /// Note that there are still other race conditions between
      57              : /// GC, compaction and timeline deletion. See
      58              : /// <>
      59              : ///
      60              : /// No timeout here, GC & Compaction should be responsive to the
      61              : /// `TimelineState::Stopping` change.
      62              : // pub(super): documentation link
      63            2 : pub(super) async fn delete_local_timeline_directory(
      64            2 :     conf: &PageServerConf,
      65            2 :     tenant_shard_id: TenantShardId,
      66            2 :     timeline: &Timeline,
      67            2 : ) {
      68            2 :     // Always ensure the lock order is compaction -> gc.
      69            2 :     let compaction_lock = timeline.compaction_lock.lock();
      70            2 :     let _compaction_lock = crate::timed(
      71            2 :         compaction_lock,
      72            2 :         "acquires compaction lock",
      73            2 :         std::time::Duration::from_secs(5),
      74            2 :     )
      75            0 :     .await;
      76              : 
      77            2 :     let gc_lock = timeline.gc_lock.lock();
      78            2 :     let _gc_lock = crate::timed(
      79            2 :         gc_lock,
      80            2 :         "acquires gc lock",
      81            2 :         std::time::Duration::from_secs(5),
      82            2 :     )
      83            0 :     .await;
      84              : 
      85              :     // NB: storage_sync upload tasks that reference these layers have been cancelled
      86              :     //     by the caller.
      87              : 
      88            2 :     let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
      89            2 : 
      90            2 :     // NB: This need not be atomic because the deleted flag in the IndexPart
      91            2 :     // will be observed during tenant/timeline load. The deletion will be resumed there.
      92            2 :     //
      93            2 :     // ErrorKind::NotFound can happen e.g. if we race with tenant detach, because
      94            2 :     // no locks are shared.
      95            2 :     tokio::fs::remove_dir_all(local_timeline_directory)
      96            2 :         .await
      97            2 :         .or_else(fs_ext::ignore_not_found)
      98            2 :         .fatal_err("removing timeline directory");
      99            2 : 
     100            2 :     // Make sure previous deletions are ordered before mark removal.
     101            2 :     // Otherwise there is no guarantee that they reach the disk before mark deletion.
     102            2 :     // So it's possible for the mark to reach disk first and for other deletions
     103            2 :     // to be reordered later and thus missed if a crash occurs.
     104            2 :     // Note that we don't need to sync after the mark file is removed
     105            2 :     // because we can tolerate the case when the mark file reappears on startup.
     106            2 :     let timeline_path = conf.timelines_path(&tenant_shard_id);
     107            2 :     crashsafe::fsync_async(timeline_path)
     108            4 :         .await
     109            2 :         .fatal_err("fsync after removing timeline directory");
     110            2 : 
     111            2 :     info!("finished deleting layer files, releasing locks");
     112            2 : }
     113              : 
     114              : /// It is important that this gets called when DeletionGuard is being held.
     115              : /// For more context see comments in [`DeleteTimelineFlow::prepare`]
     116            0 : async fn remove_maybe_offloaded_timeline_from_tenant(
     117            0 :     tenant: &Tenant,
     118            0 :     timeline: &TimelineOrOffloaded,
     119            0 :     _: &DeletionGuard, // using it as a witness
     120            0 : ) -> anyhow::Result<()> {
     121            0 :     // Remove the timeline from the map.
     122            0 :     // This observes the locking order between timelines and timelines_offloaded
     123            0 :     let mut timelines = tenant.timelines.lock().unwrap();
     124            0 :     let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
     125            0 :     let offloaded_children_exist = timelines_offloaded
     126            0 :         .iter()
     127            0 :         .any(|(_, entry)| entry.ancestor_timeline_id == Some(timeline.timeline_id()));
     128            0 :     let children_exist = timelines
     129            0 :         .iter()
     130            0 :         .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id()));
     131            0 :     // XXX this can happen because of race conditions with branch creation.
     132            0 :     // We already deleted the remote layer files, so it's probably best to panic.
     133            0 :     if children_exist || offloaded_children_exist {
     134            0 :         panic!("Timeline grew children while we removed layer files");
     135            0 :     }
     136            0 : 
     137            0 :     match timeline {
     138            0 :         TimelineOrOffloaded::Timeline(timeline) => {
     139            0 :             timelines.remove(&timeline.timeline_id).expect(
     140            0 :                 "timeline that we were deleting was concurrently removed from 'timelines' map",
     141            0 :             );
     142            0 :         }
     143            0 :         TimelineOrOffloaded::Offloaded(timeline) => {
     144            0 :             let offloaded_timeline = timelines_offloaded
     145            0 :                 .remove(&timeline.timeline_id)
     146            0 :                 .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map");
     147            0 :             offloaded_timeline.delete_from_ancestor_with_timelines(&timelines);
     148            0 :         }
     149              :     }
     150              : 
     151            0 :     drop(timelines_offloaded);
     152            0 :     drop(timelines);
     153            0 : 
     154            0 :     Ok(())
     155            0 : }
     156              : 
     157              : /// Orchestrates the shutdown of all timeline tasks, removes the timeline's in-memory structures,
     158              : /// and deletes its data from both disk and s3.
     159              : /// The sequence of steps:
     160              : /// 1. Set deleted_at in remote index part.
     161              : /// 2. Create local mark file.
     162              : /// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
     163              : /// 4. Delete remote layers
     164              : /// 5. Delete index part
     165              : /// 6. Delete meta, timeline directory
     166              : /// 7. Delete mark file
     167              : ///
     168              : /// It is resumable from any step in case a crash/restart occurs.
     169              : /// There are two entrypoints to the process:
     170              : /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
     171              : /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
     172              : ///    and we possibly need to continue deletion of remote files.
     173              : ///
     174              : /// Note the only other place that messes around with the timeline delete mark is the logic that scans the directory with timelines during tenant load.
     175              : #[derive(Default)]
     176              : pub enum DeleteTimelineFlow {
     177              :     #[default]
     178              :     NotStarted,
     179              :     InProgress,
     180              :     Finished,
     181              : }
     182              : 
     183              : impl DeleteTimelineFlow {
     184              :     // These steps are run in the context of management api request handler.
     185              :     // Long running steps are continued to run in the background.
     186              :     // NB: If this fails half-way through, and is retried, the retry will go through
     187              :     // all the same steps again. Make sure the code here is idempotent, and don't
     188              :     // error out if some of the shutdown tasks have already been completed!
     189            0 :     #[instrument(skip_all)]
     190              :     pub async fn run(
     191              :         tenant: &Arc<Tenant>,
     192              :         timeline_id: TimelineId,
     193              :     ) -> Result<(), DeleteTimelineError> {
     194              :         super::debug_assert_current_span_has_tenant_and_timeline_id();
     195              : 
     196              :         let allow_offloaded_children = false;
     197              :         let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
     198              : 
     199              :         guard.mark_in_progress()?;
     200              : 
     201              :         // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
     202              :         if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
     203              :             timeline.shutdown(super::ShutdownMode::Hard).await;
     204              :         }
     205              : 
     206              :         tenant.gc_block.before_delete(&timeline.timeline_id());
     207              : 
     208            0 :         fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
     209            0 :             Err(anyhow::anyhow!(
     210            0 :                 "failpoint: timeline-delete-before-index-deleted-at"
     211            0 :             ))?
     212            0 :         });
     213              : 
     214              :         let remote_client = match timeline.maybe_remote_client() {
     215              :             Some(remote_client) => remote_client,
     216              :             None => {
     217              :                 let remote_client = tenant
     218              :                     .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
     219              :                 let result = match remote_client
     220              :                     .download_index_file(&tenant.cancel)
     221              :                     .instrument(info_span!("download_index_file"))
     222              :                     .await
     223              :                 {
     224              :                     Ok(r) => r,
     225              :                     Err(DownloadError::NotFound) => {
     226              :                         // Deletion is already complete
     227              :                         tracing::info!("Timeline already deleted in remote storage");
     228              :                         return Ok(());
     229              :                     }
     230              :                     Err(e) => {
     231              :                         return Err(DeleteTimelineError::Other(anyhow::anyhow!(
     232              :                             "error: {:?}",
     233              :                             e
     234              :                         )));
     235              :                     }
     236              :                 };
     237              :                 let index_part = match result {
     238              :                     MaybeDeletedIndexPart::Deleted(p) => {
     239              :                         tracing::info!("Timeline already set as deleted in remote index");
     240              :                         p
     241              :                     }
     242              :                     MaybeDeletedIndexPart::IndexPart(p) => p,
     243              :                 };
     244              :                 let remote_client = Arc::new(remote_client);
     245              : 
     246              :                 remote_client
     247              :                     .init_upload_queue(&index_part)
     248              :                     .map_err(DeleteTimelineError::Other)?;
     249              :                 remote_client.shutdown().await;
     250              :                 remote_client
     251              :             }
     252              :         };
     253              :         set_deleted_in_remote_index(&remote_client).await?;
     254              : 
     255            0 :         fail::fail_point!("timeline-delete-before-schedule", |_| {
     256            0 :             Err(anyhow::anyhow!(
     257            0 :                 "failpoint: timeline-delete-before-schedule"
     258            0 :             ))?
     259            0 :         });
     260              : 
     261              :         Self::schedule_background(
     262              :             guard,
     263              :             tenant.conf,
     264              :             Arc::clone(tenant),
     265              :             timeline,
     266              :             remote_client,
     267              :         );
     268              : 
     269              :         Ok(())
     270              :     }
     271              : 
     272            0 :     fn mark_in_progress(&mut self) -> anyhow::Result<()> {
     273            0 :         match self {
     274            0 :             Self::Finished => anyhow::bail!("Bug. Is in finished state"),
     275            0 :             Self::InProgress { .. } => { /* We're in a retry */ }
     276            0 :             Self::NotStarted => { /* Fresh start */ }
     277              :         }
     278              : 
     279            0 :         *self = Self::InProgress;
     280            0 : 
     281            0 :         Ok(())
     282            0 :     }
     283              : 
     284              :     /// Shortcut to create Timeline in stopping state and spawn deletion task.
     285            0 :     #[instrument(skip_all, fields(%timeline_id))]
     286              :     pub async fn resume_deletion(
     287              :         tenant: Arc<Tenant>,
     288              :         timeline_id: TimelineId,
     289              :         local_metadata: &TimelineMetadata,
     290              :         remote_client: RemoteTimelineClient,
     291              :     ) -> anyhow::Result<()> {
     292              :         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
     293              :         // RemoteTimelineClient is the only functioning part.
     294              :         let timeline = tenant
     295              :             .create_timeline_struct(
     296              :                 timeline_id,
     297              :                 local_metadata,
     298              :                 None, // Ancestor is not needed for deletion.
     299              :                 TimelineResources {
     300              :                     remote_client,
     301              :                     timeline_get_throttle: tenant.timeline_get_throttle.clone(),
     302              :                     l0_flush_global_state: tenant.l0_flush_global_state.clone(),
     303              :                 },
     304              :                 // Important. We don't pass ancestor above because it can be missing.
     305              :                 // Thus we need to skip the validation here.
     306              :                 CreateTimelineCause::Delete,
     307              :                 crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
     308              :             )
     309              :             .context("create_timeline_struct")?;
     310              : 
     311              :         let mut guard = DeletionGuard(
     312              :             Arc::clone(&timeline.delete_progress)
     313              :                 .try_lock_owned()
     314              :                 .expect("cannot happen because we're the only owner"),
     315              :         );
     316              : 
     317              :         // We need to do this because when the console retries a delete request we shouldn't answer with 404,
     318              :         // because 404 means successful deletion.
     319              :         {
     320              :             let mut locked = tenant.timelines.lock().unwrap();
     321              :             locked.insert(timeline_id, Arc::clone(&timeline));
     322              :         }
     323              : 
     324              :         guard.mark_in_progress()?;
     325              : 
     326              :         let remote_client = timeline.remote_client.clone();
     327              :         let timeline = TimelineOrOffloaded::Timeline(timeline);
     328              :         Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client);
     329              : 
     330              :         Ok(())
     331              :     }
     332              : 
     333            2 :     pub(super) fn prepare(
     334            2 :         tenant: &Tenant,
     335            2 :         timeline_id: TimelineId,
     336            2 :         allow_offloaded_children: bool,
     337            2 :     ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
     338            2 :         // Note the interaction between this guard and deletion guard.
     339            2 :         // Here we attempt to lock deletion guard when we're holding a lock on timelines.
     340            2 :         // This is important because when you take into account `remove_maybe_offloaded_timeline_from_tenant`
     341            2 :         // we remove the timeline from memory while we still hold the deletion guard.
     342            2 :         // So here when timeline deletion is finished the timeline won't be present in the timelines map at all
     343            2 :         // which makes the following sequence impossible:
     344            2 :         // T1: get preempted right before the try_lock on `Timeline::delete_progress`
     345            2 :         // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
     346            2 :         // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
     347            2 :         // For more context see this discussion: ``
     348            2 :         let timelines = tenant.timelines.lock().unwrap();
     349            2 :         let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
     350              : 
     351            2 :         let timeline = match timelines.get(&timeline_id) {
     352            2 :             Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
     353            0 :             None => match timelines_offloaded.get(&timeline_id) {
     354            0 :                 Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
     355            0 :                 None => return Err(DeleteTimelineError::NotFound),
     356              :             },
     357              :         };
     358              : 
     359              :         // Ensure that there are no child timelines, because we are about to remove files,
     360              :         // which will break child branches
     361            2 :         let mut children = Vec::new();
     362            2 :         if !allow_offloaded_children {
     363            0 :             children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
     364            0 :                 (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
     365            0 :             }));
     366            2 :         }
     367            4 :         children.extend(timelines.iter().filter_map(|(id, entry)| {
     368            4 :             (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
     369            4 :         }));
     370            2 : 
     371            2 :         if !children.is_empty() {
     372            0 :             return Err(DeleteTimelineError::HasChildren(children));
     373            2 :         }
     374            2 : 
     375            2 :         // Note that using try_lock here is important to avoid a deadlock.
     376            2 :         // Here we take lock on timelines and then the deletion guard.
     377            2 :         // At the end of the operation we're holding the guard and need to lock timelines map
     378            2 :         // to remove the timeline from it.
     379            2 :         // Whenever two locks are taken in different orders, this can result in a deadlock.
     380            2 : 
     381            2 :         let delete_progress = Arc::clone(timeline.delete_progress());
     382            2 :         let delete_lock_guard = match delete_progress.try_lock_owned() {
     383            2 :             Ok(guard) => DeletionGuard(guard),
     384              :             Err(_) => {
     385              :                 // Unfortunately, if the lock fails, the Arc is consumed.
     386            0 :                 return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
     387            0 :                     timeline.delete_progress(),
     388            0 :                 )));
     389              :             }
     390              :         };
     391              : 
     392            2 :         if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
     393            2 :             timeline.set_state(TimelineState::Stopping);
     394            2 :         }
     395              : 
     396            2 :         Ok((timeline, delete_lock_guard))
     397            2 :     }
     398              : 
     399            0 :     fn schedule_background(
     400            0 :         guard: DeletionGuard,
     401            0 :         conf: &'static PageServerConf,
     402            0 :         tenant: Arc<Tenant>,
     403            0 :         timeline: TimelineOrOffloaded,
     404            0 :         remote_client: Arc<RemoteTimelineClient>,
     405            0 :     ) {
     406            0 :         let tenant_shard_id = timeline.tenant_shard_id();
     407            0 :         let timeline_id = timeline.timeline_id();
     408            0 : 
     409            0 :         task_mgr::spawn(
     410            0 :             task_mgr::BACKGROUND_RUNTIME.handle(),
     411            0 :             TaskKind::TimelineDeletionWorker,
     412            0 :             tenant_shard_id,
     413            0 :             Some(timeline_id),
     414            0 :             "timeline_delete",
     415            0 :             async move {
     416            0 :                 if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
     417              :                     // Only log as an error if it's not a cancellation.
     418            0 :                     if matches!(err, DeleteTimelineError::Cancelled) {
     419            0 :                         info!("Shutdown during timeline deletion");
     420              :                     }else {
     421            0 :                         error!("Error: {err:#}");
     422              :                     }
     423            0 :                     if let TimelineOrOffloaded::Timeline(timeline) = timeline {
     424            0 :                         timeline.set_broken(format!("{err:#}"))
     425            0 :                     }
     426            0 :                 };
     427            0 :                 Ok(())
     428            0 :             }
     429            0 :             .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)),
     430              :         );
     431            0 :     }
     432              : 
     433            0 :     async fn background(
     434            0 :         mut guard: DeletionGuard,
     435            0 :         conf: &PageServerConf,
     436            0 :         tenant: &Tenant,
     437            0 :         timeline: &TimelineOrOffloaded,
     438            0 :         remote_client: Arc<RemoteTimelineClient>,
     439            0 :     ) -> Result<(), DeleteTimelineError> {
     440            0 :         fail::fail_point!("timeline-delete-before-rm", |_| {
     441            0 :             Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
     442            0 :         });
     443              : 
     444              :         // Offloaded timelines have no local state
     445              :         // TODO: once we persist offloaded information, delete the timeline from there, too
     446            0 :         if let TimelineOrOffloaded::Timeline(timeline) = timeline {
     447            0 :             delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
     448            0 :         }
     449              : 
     450            0 :         fail::fail_point!("timeline-delete-after-rm", |_| {
     451            0 :             Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
     452            0 :         });
     453              : 
     454            0 :         remote_client.delete_all().await?;
     455              : 
     456            0 :         pausable_failpoint!("in_progress_delete");
     457              : 
     458            0 :         remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
     459              : 
     460              :         // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
     461              :         // between the deletion of the index-part.json and reaching of this code.
     462              :         // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
     463              :         // However, we handle this case in tenant loading code so the next time we attach, the issue is
     464              :         // resolved.
     465            0 :         tenant.store_tenant_manifest().await.map_err(|e| match e {
     466            0 :             TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
     467            0 :             _ => DeleteTimelineError::Other(e.into()),
     468            0 :         })?;
     469              : 
     470            0 :         *guard = Self::Finished;
     471            0 : 
     472            0 :         Ok(())
     473            0 :     }
     474              : 
     475            0 :     pub(crate) fn is_not_started(&self) -> bool {
     476            0 :         matches!(self, Self::NotStarted)
     477            0 :     }
     478              : }
     479              : 
     480              : pub(super) struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
     481              : 
     482              : impl Deref for DeletionGuard {
     483              :     type Target = DeleteTimelineFlow;
     484              : 
     485            0 :     fn deref(&self) -> &Self::Target {
     486            0 :         &self.0
     487            0 :     }
     488              : }
     489              : 
     490              : impl DerefMut for DeletionGuard {
     491            0 :     fn deref_mut(&mut self) -> &mut Self::Target {
     492            0 :         &mut self.0
     493            0 :     }
     494              : }

Generated by: LCOV version 2.1-beta