Line data Source code
1 : use std::{
2 : ops::{Deref, DerefMut},
3 : sync::Arc,
4 : };
5 :
6 : use anyhow::Context;
7 : use pageserver_api::{models::TimelineState, shard::TenantShardId};
8 : use remote_storage::DownloadError;
9 : use tokio::sync::OwnedMutexGuard;
10 : use tracing::{error, info, info_span, instrument, Instrument};
11 : use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
12 :
13 : use crate::{
14 : config::PageServerConf,
15 : task_mgr::{self, TaskKind},
16 : tenant::{
17 : metadata::TimelineMetadata,
18 : remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
19 : CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
20 : TenantManifestError, TimelineOrOffloaded,
21 : },
22 : virtual_file::MaybeFatalIo,
23 : };
24 :
25 : use super::{Timeline, TimelineResources};
26 :
27 : /// Mark the timeline as deleted in S3 so that we won't pick it up next time
28 : /// during attach or pageserver restart.
29 : /// See the comment in `persist_index_part_with_deleted_flag`.
30 0 : async fn set_deleted_in_remote_index(
31 0 : remote_client: &Arc<RemoteTimelineClient>,
32 0 : ) -> Result<(), DeleteTimelineError> {
33 0 : let res = remote_client.persist_index_part_with_deleted_flag().await;
34 0 : match res {
35 : // If we (now, or already) marked it successfully as deleted, we can proceed
36 0 : Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
37 : // Bail out otherwise
38 : //
39 : // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
40 : // two tasks from performing the deletion at the same time. The first task
41 : // that starts deletion should run it to completion.
42 0 : Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
43 0 : | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
44 0 : return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
45 : }
46 : }
47 0 : Ok(())
48 0 : }
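// Illustrative sketch (not part of this module): because `AlreadyDeleted` is treated as
// success above, a retried deletion that reaches this step again is still expected to
// succeed, e.g.
//
//     set_deleted_in_remote_index(&remote_client).await?;   // first attempt
//     set_deleted_in_remote_index(&remote_client).await?;   // retry: also returns Ok(())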
49 :
50 : /// Grab the compaction and gc locks, and actually perform the deletion.
51 : ///
52 : /// The locks prevent GC or compaction from running at the same time. The background tasks do not
53 : /// register themselves with the timeline they operate on, so they might still be running even
54 : /// though we called `shutdown_tasks`.
55 : ///
56 : /// Note that there are still other race conditions between
57 : /// GC, compaction and timeline deletion. See
58 : /// <https://github.com/neondatabase/neon/issues/2671>
59 : ///
60 : /// No timeout here, GC & Compaction should be responsive to the
61 : /// `TimelineState::Stopping` change.
62 : // pub(super): documentation link
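// A minimal sketch of the lock-ordering rule described above (illustrative only, with
// hypothetical `tokio::sync::Mutex` fields): both deletion and the background loops must
// acquire compaction before gc, so the two sides can never deadlock on each other.
//
//     async fn with_both_locks(compaction: &tokio::sync::Mutex<()>, gc: &tokio::sync::Mutex<()>) {
//         let _compaction = compaction.lock().await; // always first
//         let _gc = gc.lock().await;                 // always second
//         // ... exclusive section: no GC or compaction can run here ...
//     }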
63 2 : pub(super) async fn delete_local_timeline_directory(
64 2 : conf: &PageServerConf,
65 2 : tenant_shard_id: TenantShardId,
66 2 : timeline: &Timeline,
67 2 : ) {
68 2 : // Always ensure the lock order is compaction -> gc.
69 2 : let compaction_lock = timeline.compaction_lock.lock();
70 2 : let _compaction_lock = crate::timed(
71 2 : compaction_lock,
72 2 : "acquires compaction lock",
73 2 : std::time::Duration::from_secs(5),
74 2 : )
75 0 : .await;
76 :
77 2 : let gc_lock = timeline.gc_lock.lock();
78 2 : let _gc_lock = crate::timed(
79 2 : gc_lock,
80 2 : "acquires gc lock",
81 2 : std::time::Duration::from_secs(5),
82 2 : )
83 0 : .await;
84 :
85 : // NB: storage_sync upload tasks that reference these layers have been cancelled
86 : // by the caller.
87 :
88 2 : let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
89 2 :
90 2 : // NB: This need not be atomic because the deleted flag in the IndexPart
91 2 : // will be observed during tenant/timeline load. The deletion will be resumed there.
92 2 : //
93 2 : // ErrorKind::NotFound can happen e.g. if we race with tenant detach, because
94 2 : // no locks are shared.
95 2 : tokio::fs::remove_dir_all(local_timeline_directory)
96 1 : .await
97 2 : .or_else(fs_ext::ignore_not_found)
98 2 : .fatal_err("removing timeline directory");
99 2 :
100 2 : // Make sure previous deletions are ordered before the mark removal.
101 2 : // Otherwise there is no guarantee that they reach the disk before the mark deletion,
102 2 : // so it would be possible for the mark removal to reach the disk first and for the other
103 2 : // deletions to be reordered later and thus be missed if a crash occurs.
104 2 : // Note that we don't need to sync after the mark file is removed
105 2 : // because we can tolerate the case when the mark file reappears on startup.
106 2 : let timeline_path = conf.timelines_path(&tenant_shard_id);
107 2 : crashsafe::fsync_async(timeline_path)
108 3 : .await
109 2 : .fatal_err("fsync after removing timeline directory");
110 2 :
111 2 : info!("finished deleting layer files, releasing locks");
112 2 : }
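// A minimal sketch of the crash-safety pattern above (illustrative only, hypothetical paths;
// on Linux a directory can be opened and fsynced like a file): after removing the timeline
// directory, the parent directory is synced so the removal is durable before later deletion
// steps depend on it.
//
//     use std::fs::File;
//
//     tokio::fs::remove_dir_all("/data/timelines/1234").await?;
//     File::open("/data/timelines")?.sync_all()?; // fsync the parent directory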
113 :
114 : /// It is important that this gets called while the `DeletionGuard` is held.
115 : /// For more context see the comments in [`DeleteTimelineFlow::prepare`].
116 0 : async fn remove_maybe_offloaded_timeline_from_tenant(
117 0 : tenant: &Tenant,
118 0 : timeline: &TimelineOrOffloaded,
119 0 : _: &DeletionGuard, // using it as a witness
120 0 : ) -> anyhow::Result<()> {
121 0 : // Remove the timeline from the map.
122 0 : // This observes the locking order between timelines and timelines_offloaded
123 0 : let mut timelines = tenant.timelines.lock().unwrap();
124 0 : let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
125 0 : let offloaded_children_exist = timelines_offloaded
126 0 : .iter()
127 0 : .any(|(_, entry)| entry.ancestor_timeline_id == Some(timeline.timeline_id()));
128 0 : let children_exist = timelines
129 0 : .iter()
130 0 : .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id()));
131 0 : // XXX this can happen because of race conditions with branch creation.
132 0 : // We already deleted the remote layer files, so it's probably best to panic.
133 0 : if children_exist || offloaded_children_exist {
134 0 : panic!("Timeline grew children while we removed layer files");
135 0 : }
136 0 :
137 0 : match timeline {
138 0 : TimelineOrOffloaded::Timeline(timeline) => {
139 0 : timelines.remove(&timeline.timeline_id).expect(
140 0 : "timeline that we were deleting was concurrently removed from 'timelines' map",
141 0 : );
142 0 : }
143 0 : TimelineOrOffloaded::Offloaded(timeline) => {
144 0 : let offloaded_timeline = timelines_offloaded
145 0 : .remove(&timeline.timeline_id)
146 0 : .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map");
147 0 : offloaded_timeline.delete_from_ancestor_with_timelines(&timelines);
148 0 : }
149 : }
150 :
151 0 : drop(timelines_offloaded);
152 0 : drop(timelines);
153 0 :
154 0 : Ok(())
155 0 : }
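// Illustrative sketch (hypothetical types) of the "witness" parameter used above: taking a
// reference to the guard forces every caller to prove at compile time that it holds the
// deletion lock, even though the function body never touches the guard.
//
//     struct ConfigLockGuard;                                // hypothetical guard type
//
//     fn mutate_config(new_value: u32, _witness: &ConfigLockGuard) {
//         let _ = new_value; // safe: a ConfigLockGuard exists for the duration of the call
//     }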
156 :
157 : /// Orchestrates the shutdown of all timeline tasks, removes the timeline's in-memory structures,
158 : /// and deletes its data from both disk and S3.
159 : /// The sequence of steps:
160 : /// 1. Set deleted_at in remote index part.
161 : /// 2. Create local mark file.
162 : /// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata).
163 : /// 4. Delete remote layers.
164 : /// 5. Delete index part.
165 : /// 6. Delete metadata and timeline directory.
166 : /// 7. Delete mark file.
167 : ///
168 : /// The flow is resumable from any step in case a crash/restart occurs.
169 : /// There are two entrypoints to the process:
170 : /// 1. [`DeleteTimelineFlow::run`] is the main one, called by a management api handler.
171 : /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
172 : /// and we possibly need to continue deletion of remote files.
173 : ///
174 : /// Note that the only other place that messes around with the timeline delete mark is the logic that scans the directory with timelines during tenant load.
175 : #[derive(Default)]
176 : pub enum DeleteTimelineFlow {
177 : #[default]
178 : NotStarted,
179 : InProgress,
180 : Finished,
181 : }
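// Illustrative sketch (not part of this module) of the two entrypoints listed in the doc
// comment above; `tenant`, `timeline_id`, `local_metadata` and `remote_client` are assumed
// to be in scope at the call sites.
//
//     // 1. Management API handler path:
//     DeleteTimelineFlow::run(&tenant, timeline_id).await?;
//
//     // 2. Restart path, when local metadata is still present:
//     DeleteTimelineFlow::resume_deletion(tenant, timeline_id, &local_metadata, remote_client)
//         .await?;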
182 :
183 : impl DeleteTimelineFlow {
184 : // These steps are run in the context of the management api request handler.
185 : // Long-running steps continue to run in the background.
186 : // NB: If this fails half-way through, and is retried, the retry will go through
187 : // all the same steps again. Make sure the code here is idempotent, and don't
188 : // error out if some of the shutdown tasks have already been completed!
189 0 : #[instrument(skip_all)]
190 : pub async fn run(
191 : tenant: &Arc<Tenant>,
192 : timeline_id: TimelineId,
193 : ) -> Result<(), DeleteTimelineError> {
194 : super::debug_assert_current_span_has_tenant_and_timeline_id();
195 :
196 : let allow_offloaded_children = false;
197 : let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
198 :
199 : guard.mark_in_progress()?;
200 :
201 : // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
202 : if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
203 : timeline.shutdown(super::ShutdownMode::Hard).await;
204 : }
205 :
206 : tenant.gc_block.before_delete(&timeline.timeline_id());
207 :
208 0 : fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
209 0 : Err(anyhow::anyhow!(
210 0 : "failpoint: timeline-delete-before-index-deleted-at"
211 0 : ))?
212 0 : });
213 :
214 : let remote_client = match timeline.maybe_remote_client() {
215 : Some(remote_client) => remote_client,
216 : None => {
217 : let remote_client = tenant
218 : .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
219 : let result = match remote_client
220 : .download_index_file(&tenant.cancel)
221 : .instrument(info_span!("download_index_file"))
222 : .await
223 : {
224 : Ok(r) => r,
225 : Err(DownloadError::NotFound) => {
226 : // Deletion is already complete
227 : tracing::info!("Timeline already deleted in remote storage");
228 : return Ok(());
229 : }
230 : Err(e) => {
231 : return Err(DeleteTimelineError::Other(anyhow::anyhow!(
232 : "error: {:?}",
233 : e
234 : )));
235 : }
236 : };
237 : let index_part = match result {
238 : MaybeDeletedIndexPart::Deleted(p) => {
239 : tracing::info!("Timeline already set as deleted in remote index");
240 : p
241 : }
242 : MaybeDeletedIndexPart::IndexPart(p) => p,
243 : };
244 : let remote_client = Arc::new(remote_client);
245 :
246 : remote_client
247 : .init_upload_queue(&index_part)
248 : .map_err(DeleteTimelineError::Other)?;
249 : remote_client.shutdown().await;
250 : remote_client
251 : }
252 : };
253 : set_deleted_in_remote_index(&remote_client).await?;
254 :
255 0 : fail::fail_point!("timeline-delete-before-schedule", |_| {
256 0 : Err(anyhow::anyhow!(
257 0 : "failpoint: timeline-delete-before-schedule"
258 0 : ))?
259 0 : });
260 :
261 : Self::schedule_background(
262 : guard,
263 : tenant.conf,
264 : Arc::clone(tenant),
265 : timeline,
266 : remote_client,
267 : );
268 :
269 : Ok(())
270 : }
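// Illustrative sketch (not part of this module): how a caller might map the error variants
// surfaced by `run` above to HTTP responses; the status codes are assumptions, not the
// actual handler's behaviour.
//
//     let status = match DeleteTimelineFlow::run(&tenant, timeline_id).await {
//         Ok(()) => 202,                                            // deletion scheduled
//         Err(DeleteTimelineError::NotFound) => 404,
//         Err(DeleteTimelineError::HasChildren(_)) => 412,
//         Err(DeleteTimelineError::AlreadyInProgress(_)) => 409,
//         Err(_) => 500,
//     };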
271 :
272 0 : fn mark_in_progress(&mut self) -> anyhow::Result<()> {
273 0 : match self {
274 0 : Self::Finished => anyhow::bail!("Bug. Is in finished state"),
275 0 : Self::InProgress { .. } => { /* We're in a retry */ }
276 0 : Self::NotStarted => { /* Fresh start */ }
277 : }
278 :
279 0 : *self = Self::InProgress;
280 0 :
281 0 : Ok(())
282 0 : }
283 :
284 : /// Shortcut to create Timeline in stopping state and spawn deletion task.
285 0 : #[instrument(skip_all, fields(%timeline_id))]
286 : pub(crate) async fn resume_deletion(
287 : tenant: Arc<Tenant>,
288 : timeline_id: TimelineId,
289 : local_metadata: &TimelineMetadata,
290 : remote_client: RemoteTimelineClient,
291 : ) -> anyhow::Result<()> {
292 : // Note: here we even skip populating the layer map. The timeline is essentially uninitialized.
293 : // The RemoteTimelineClient is the only functioning part.
294 : let timeline = tenant
295 : .create_timeline_struct(
296 : timeline_id,
297 : local_metadata,
298 : None, // Ancestor is not needed for deletion.
299 : TimelineResources {
300 : remote_client,
301 : timeline_get_throttle: tenant.timeline_get_throttle.clone(),
302 : l0_flush_global_state: tenant.l0_flush_global_state.clone(),
303 : },
304 : // Important. We don't pass the ancestor above because it can be missing.
305 : // Thus we need to skip the validation here.
306 : CreateTimelineCause::Delete,
307 : crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
308 : )
309 : .context("create_timeline_struct")?;
310 :
311 : let mut guard = DeletionGuard(
312 : Arc::clone(&timeline.delete_progress)
313 : .try_lock_owned()
314 : .expect("cannot happen because we're the only owner"),
315 : );
316 :
317 : // We need to do this because when the console retries the delete request we shouldn't answer with 404,
318 : // because 404 means successful deletion.
319 : {
320 : let mut locked = tenant.timelines.lock().unwrap();
321 : locked.insert(timeline_id, Arc::clone(&timeline));
322 : }
323 :
324 : guard.mark_in_progress()?;
325 :
326 : let remote_client = timeline.remote_client.clone();
327 : let timeline = TimelineOrOffloaded::Timeline(timeline);
328 : Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client);
329 :
330 : Ok(())
331 : }
332 :
333 2 : pub(super) fn prepare(
334 2 : tenant: &Tenant,
335 2 : timeline_id: TimelineId,
336 2 : allow_offloaded_children: bool,
337 2 : ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
338 2 : // Note the interaction between this guard and the deletion guard.
339 2 : // Here we attempt to lock the deletion guard while we're holding a lock on timelines.
340 2 : // This is important because, once you take `remove_maybe_offloaded_timeline_from_tenant` into account,
341 2 : // we remove the timeline from memory while we still hold the deletion guard.
342 2 : // So once timeline deletion is finished, the timeline won't be present in the timelines map at all,
343 2 : // which makes the following sequence impossible:
344 2 : // T1: get preempted right before the try_lock on `Timeline::delete_progress`
345 2 : // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
346 2 : // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
347 2 : // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
348 2 : let timelines = tenant.timelines.lock().unwrap();
349 2 : let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
350 :
351 2 : let timeline = match timelines.get(&timeline_id) {
352 2 : Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
353 0 : None => match timelines_offloaded.get(&timeline_id) {
354 0 : Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
355 0 : None => return Err(DeleteTimelineError::NotFound),
356 : },
357 : };
358 :
359 : // Ensure that there are no child timelines, because we are about to remove files,
360 : // which will break child branches
361 2 : let mut children = Vec::new();
362 2 : if !allow_offloaded_children {
363 0 : children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
364 0 : (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
365 0 : }));
366 2 : }
367 4 : children.extend(timelines.iter().filter_map(|(id, entry)| {
368 4 : (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
369 4 : }));
370 2 :
371 2 : if !children.is_empty() {
372 0 : return Err(DeleteTimelineError::HasChildren(children));
373 2 : }
374 2 :
375 2 : // Note that using try_lock here is important to avoid a deadlock.
376 2 : // Here we take the lock on timelines and then the deletion guard.
377 2 : // At the end of the operation we're holding the guard and need to lock timelines map
378 2 : // to remove the timeline from it.
379 2 : // Whenever two locks are taken in different orders, this can result in a deadlock.
380 2 :
381 2 : let delete_progress = Arc::clone(timeline.delete_progress());
382 2 : let delete_lock_guard = match delete_progress.try_lock_owned() {
383 2 : Ok(guard) => DeletionGuard(guard),
384 : Err(_) => {
385 : // Unfortunately if lock fails arc is consumed.
386 0 : return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
387 0 : timeline.delete_progress(),
388 0 : )));
389 : }
390 : };
391 :
392 2 : if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
393 2 : timeline.set_state(TimelineState::Stopping);
394 2 : }
395 :
396 2 : Ok((timeline, delete_lock_guard))
397 2 : }
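// A minimal sketch of the non-blocking lock acquisition used above (illustrative only):
// `try_lock_owned` on an `Arc<tokio::sync::Mutex<_>>` either yields an owned guard
// immediately or fails without waiting, so we never block on the deletion lock while the
// timelines lock is held.
//
//     use std::sync::Arc;
//     use tokio::sync::Mutex;
//
//     let progress = Arc::new(Mutex::new(0u32));
//     match Arc::clone(&progress).try_lock_owned() {
//         Ok(guard) => drop(guard),  // we own the lock: proceed with deletion
//         Err(_busy) => {}           // someone else is deleting: report AlreadyInProgress
//     }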
398 :
399 0 : fn schedule_background(
400 0 : guard: DeletionGuard,
401 0 : conf: &'static PageServerConf,
402 0 : tenant: Arc<Tenant>,
403 0 : timeline: TimelineOrOffloaded,
404 0 : remote_client: Arc<RemoteTimelineClient>,
405 0 : ) {
406 0 : let tenant_shard_id = timeline.tenant_shard_id();
407 0 : let timeline_id = timeline.timeline_id();
408 0 :
409 0 : task_mgr::spawn(
410 0 : task_mgr::BACKGROUND_RUNTIME.handle(),
411 0 : TaskKind::TimelineDeletionWorker,
412 0 : tenant_shard_id,
413 0 : Some(timeline_id),
414 0 : "timeline_delete",
415 0 : async move {
416 0 : if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
417 : // Only log as an error if it's not a cancellation.
418 0 : if matches!(err, DeleteTimelineError::Cancelled) {
419 0 : info!("Shutdown during timeline deletion");
420 : }else {
421 0 : error!("Error: {err:#}");
422 : }
423 0 : if let TimelineOrOffloaded::Timeline(timeline) = timeline {
424 0 : timeline.set_broken(format!("{err:#}"))
425 0 : }
426 0 : };
427 0 : Ok(())
428 0 : }
429 0 : .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)),
430 : );
431 0 : }
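// Illustrative sketch (plain `tokio::spawn` instead of the crate's `task_mgr`): the
// `parent: None` above detaches the background work from the request span, so the
// deletion keeps its own root span after the HTTP handler has returned.
//
//     use tracing::{info_span, Instrument};
//
//     tokio::spawn(
//         async move {
//             // long-running deletion work goes here
//         }
//         .instrument(info_span!(parent: None, "delete_timeline")),
//     );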
432 :
433 0 : async fn background(
434 0 : mut guard: DeletionGuard,
435 0 : conf: &PageServerConf,
436 0 : tenant: &Tenant,
437 0 : timeline: &TimelineOrOffloaded,
438 0 : remote_client: Arc<RemoteTimelineClient>,
439 0 : ) -> Result<(), DeleteTimelineError> {
440 0 : fail::fail_point!("timeline-delete-before-rm", |_| {
441 0 : Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
442 0 : });
443 :
444 : // Offloaded timelines have no local state
445 : // TODO: once we persist offloaded information, delete the timeline from there, too
446 0 : if let TimelineOrOffloaded::Timeline(timeline) = timeline {
447 0 : delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
448 0 : }
449 :
450 0 : fail::fail_point!("timeline-delete-after-rm", |_| {
451 0 : Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
452 0 : });
453 :
454 0 : remote_client.delete_all().await?;
455 :
456 0 : pausable_failpoint!("in_progress_delete");
457 :
458 0 : remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
459 :
460 : // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
461 : // between the deletion of the index-part.json and reaching this code.
462 : // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
463 : // However, we handle this case in tenant loading code so the next time we attach, the issue is
464 : // resolved.
465 0 : tenant.store_tenant_manifest().await.map_err(|e| match e {
466 0 : TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
467 0 : _ => DeleteTimelineError::Other(e.into()),
468 0 : })?;
469 :
470 0 : *guard = Self::Finished;
471 0 :
472 0 : Ok(())
473 0 : }
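// Illustrative sketch (assuming the `fail` crate's standard API; tests may instead arm
// failpoints through the management API): aborting deletion at a chosen step so the
// resume path can be exercised.
//
//     fail::cfg("timeline-delete-after-rm", "return").unwrap();
//     // ... request a deletion: it now fails right after removing local files ...
//     fail::remove("timeline-delete-after-rm");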
474 :
475 0 : pub(crate) fn is_not_started(&self) -> bool {
476 0 : matches!(self, Self::NotStarted)
477 0 : }
478 : }
479 :
480 : pub(super) struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
481 :
482 : impl Deref for DeletionGuard {
483 : type Target = DeleteTimelineFlow;
484 :
485 0 : fn deref(&self) -> &Self::Target {
486 0 : &self.0
487 0 : }
488 : }
489 :
490 : impl DerefMut for DeletionGuard {
491 0 : fn deref_mut(&mut self) -> &mut Self::Target {
492 0 : &mut self.0
493 0 : }
494 : }
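// Illustrative sketch (hypothetical wrapped type) of the newtype-guard pattern above:
// wrapping `OwnedMutexGuard` and forwarding `Deref`/`DerefMut` lets callers treat the
// guard as the protected state itself, as in `*guard = Self::Finished` in `background`.
//
//     use std::ops::{Deref, DerefMut};
//     use tokio::sync::OwnedMutexGuard;
//
//     struct StateGuard(OwnedMutexGuard<u32>);
//
//     impl Deref for StateGuard {
//         type Target = u32;
//         fn deref(&self) -> &u32 { &self.0 }
//     }
//
//     impl DerefMut for StateGuard {
//         fn deref_mut(&mut self) -> &mut u32 { &mut self.0 }
//     }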