use std::ops::{Deref, DerefMut};
use std::sync::Arc;

use anyhow::Context;
use pageserver_api::models::TimelineState;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use tokio::sync::OwnedMutexGuard;
use tracing::{Instrument, error, info, info_span, instrument};
use utils::id::TimelineId;
use utils::{crashsafe, fs_ext, pausable_failpoint};

use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::{
    PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
};
use crate::tenant::{
    CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, TenantManifestError,
    TenantShard, Timeline, TimelineOrOffloaded,
};
use crate::virtual_file::MaybeFatalIo;

/// Mark the timeline as deleted in S3 so we won't pick it up next time
/// during attach or pageserver restart.
/// See the comment in `persist_index_part_with_deleted_flag`.
async fn set_deleted_in_remote_index(
    remote_client: &Arc<RemoteTimelineClient>,
) -> Result<(), DeleteTimelineError> {
    let res = remote_client.persist_index_part_with_deleted_flag().await;
    match res {
        // If we (now, or already) marked it successfully as deleted, we can proceed
        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
        // Bail out otherwise
        //
        // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
        // two tasks from performing the deletion at the same time. The first task
        // that starts deletion should run it to completion.
        Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
        | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
            return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
        }
    }
    Ok(())
}

/// Grab the compaction and gc locks, and actually perform the deletion.
///
/// The locks prevent GC or compaction from running at the same time. The background tasks do not
/// register themselves with the timeline they operate on, so they might still be running even
/// though we called `shutdown_tasks`.
///
/// Note that there are still other race conditions between
/// GC, compaction and timeline deletion. See
/// <https://github.com/neondatabase/neon/issues/2671>
///
/// No timeout here: GC & compaction should be responsive to the
/// `TimelineState::Stopping` change.
// pub(super): documentation link
pub(super) async fn delete_local_timeline_directory(
    conf: &PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
) {
    // Always ensure the lock order is compaction -> gc.
    let compaction_lock = timeline.compaction_lock.lock();
    let _compaction_lock = crate::timed(
        compaction_lock,
        "acquires compaction lock",
        std::time::Duration::from_secs(5),
    )
    .await;

    let gc_lock = timeline.gc_lock.lock();
    let _gc_lock = crate::timed(
        gc_lock,
        "acquires gc lock",
        std::time::Duration::from_secs(5),
    )
    .await;

    // NB: storage_sync upload tasks that reference these layers have been cancelled
    // by the caller.

    let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);

    // NB: This need not be atomic because the deleted flag in the IndexPart
    // will be observed during tenant/timeline load. The deletion will be resumed there.
    //
    // ErrorKind::NotFound can happen e.g. if we race with tenant detach, because
    // no locks are shared.
    tokio::fs::remove_dir_all(local_timeline_directory)
        .await
        .or_else(fs_ext::ignore_not_found)
        .fatal_err("removing timeline directory");

    // Make sure previous deletions are ordered before mark removal.
    // Otherwise there is no guarantee that they reach the disk before mark deletion,
    // so it's possible for the mark to reach the disk first and for other deletions
    // to be reordered later and thus missed if a crash occurs.
    // Note that we don't need to sync after the mark file is removed,
    // because we can tolerate the case where the mark file reappears on startup.
    let timeline_path = conf.timelines_path(&tenant_shard_id);
    crashsafe::fsync_async(timeline_path)
        .await
        .fatal_err("fsync after removing timeline directory");

    info!("finished deleting layer files, releasing locks");
}

/// It is important that this gets called while the DeletionGuard is being held.
/// For more context see comments in [`make_timeline_delete_guard`].
async fn remove_maybe_offloaded_timeline_from_tenant(
    tenant: &TenantShard,
    timeline: &TimelineOrOffloaded,
    _: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
    // Remove the timeline from the map.
    // This observes the locking order between timelines and timelines_offloaded.
    let mut timelines = tenant.timelines.lock().unwrap();
    let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
    let mut timelines_importing = tenant.timelines_importing.lock().unwrap();
    let offloaded_children_exist = timelines_offloaded
        .iter()
        .any(|(_, entry)| entry.ancestor_timeline_id == Some(timeline.timeline_id()));
    let children_exist = timelines
        .iter()
        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id()));
    // XXX this can happen because of race conditions with branch creation.
    // We already deleted the remote layer files, so it's probably best to panic.
    if children_exist || offloaded_children_exist {
        panic!("Timeline grew children while we removed layer files");
    }

    match timeline {
        TimelineOrOffloaded::Timeline(timeline) => {
            timelines.remove(&timeline.timeline_id).expect(
                "timeline that we were deleting was concurrently removed from 'timelines' map",
            );
            tenant
                .scheduled_compaction_tasks
                .lock()
                .unwrap()
                .remove(&timeline.timeline_id);
        }
        TimelineOrOffloaded::Offloaded(timeline) => {
            let offloaded_timeline = timelines_offloaded
                .remove(&timeline.timeline_id)
                .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map");
            offloaded_timeline.delete_from_ancestor_with_timelines(&timelines);
        }
        TimelineOrOffloaded::Importing(importing) => {
            timelines_importing.remove(&importing.timeline.timeline_id);
        }
    }

    drop(timelines_importing);
    drop(timelines_offloaded);
    drop(timelines);

    Ok(())
}

/// Orchestrates the shutdown of all timeline tasks, removes the timeline's in-memory structures,
/// and deletes its data from both disk and S3.
/// The sequence of steps:
/// 1. Set deleted_at in remote index part.
/// 2. Create local mark file.
/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
/// 4. Delete remote layers
/// 5. Delete index part
/// 6. Delete meta, timeline directory
/// 7. Delete mark file
///
/// It is resumable from any step in case a crash/restart occurs.
/// There are two entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] is the main one, called by a management API handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
///    and we possibly need to continue deletion of remote files.
///
/// Note the only other place that messes around with the timeline delete mark is the logic that scans the directory with timelines during tenant load.
#[derive(Default)]
pub enum DeleteTimelineFlow {
    #[default]
    NotStarted,
    InProgress,
    Finished,
}

impl DeleteTimelineFlow {
    // These steps are run in the context of a management API request handler.
    // Long-running steps continue to run in the background.
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
    #[instrument(skip_all)]
    pub async fn run(
        tenant: &Arc<TenantShard>,
        timeline_id: TimelineId,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

        let (timeline, mut guard) =
            make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?;

        guard.mark_in_progress()?;

        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
        // TODO(vlad): shut down imported timeline here
        match &timeline {
            TimelineOrOffloaded::Timeline(timeline) => {
                timeline.shutdown(super::ShutdownMode::Hard).await;
            }
            TimelineOrOffloaded::Importing(importing) => {
                importing.shutdown().await;
            }
            TimelineOrOffloaded::Offloaded(_offloaded) => {
                // Nothing to shut down in this case
            }
        }

        tenant.gc_block.before_delete(&timeline.timeline_id());

        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-index-deleted-at"
            ))?
        });

        let remote_client = match timeline.maybe_remote_client() {
            Some(remote_client) => remote_client,
            None => {
                let remote_client = tenant
                    .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
                let result = match remote_client
                    .download_index_file(&tenant.cancel)
                    .instrument(info_span!("download_index_file"))
                    .await
                {
                    Ok(r) => r,
                    Err(DownloadError::NotFound) => {
                        // Deletion is already complete.
                        // Since we got here, we still need to remove the timeline from the tenant, though.
                        tracing::info!("Timeline already deleted in remote storage");
                        if let TimelineOrOffloaded::Offloaded(_) = &timeline {
                            // We only support this for offloaded timelines, as we don't know which state non-offloaded timelines are in.
                            tracing::info!(
                                "Timeline with gone index part is offloaded timeline. Removing from tenant."
                            );
                            remove_maybe_offloaded_timeline_from_tenant(tenant, &timeline, &guard)
                                .await?;
                        }
                        return Ok(());
                    }
                    Err(e) => {
                        return Err(DeleteTimelineError::Other(anyhow::anyhow!(
                            "error: {:?}",
                            e
                        )));
                    }
                };
                let index_part = match result {
                    MaybeDeletedIndexPart::Deleted(p) => {
                        tracing::info!("Timeline already set as deleted in remote index");
                        p
                    }
                    MaybeDeletedIndexPart::IndexPart(p) => p,
                };
                let remote_client = Arc::new(remote_client);

                remote_client
                    .init_upload_queue(&index_part)
                    .map_err(DeleteTimelineError::Other)?;
                remote_client.shutdown().await;
                remote_client
            }
        };
        set_deleted_in_remote_index(&remote_client).await?;

        fail::fail_point!("timeline-delete-before-schedule", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-schedule"
            ))?
        });

        Self::schedule_background(
            guard,
            tenant.conf,
            Arc::clone(tenant),
            timeline,
            remote_client,
        );

        Ok(())
    }

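    /// Transition the flow to `InProgress`.
    ///
    /// Re-entering `InProgress` is fine (we may be retrying a half-finished deletion),
    /// but a `Finished` flow must never be restarted.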
    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
        match self {
            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
            Self::InProgress { .. } => { /* We're in a retry */ }
            Self::NotStarted => { /* Fresh start */ }
        }

        *self = Self::InProgress;

        Ok(())
    }

    /// Shortcut to create a Timeline in stopping state and spawn the deletion task.
    #[instrument(skip_all, fields(%timeline_id))]
    pub(crate) async fn resume_deletion(
        tenant: Arc<TenantShard>,
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: RemoteTimelineClient,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating the layer map. The Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
        let (timeline, _timeline_ctx) = tenant
            .create_timeline_struct(
                timeline_id,
                local_metadata,
                None, // Ancestor is not needed for deletion.
                None, // Previous heatmap is not needed for deletion
                tenant.get_timeline_resources_for(remote_client),
                // Important. We don't pass the ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
                crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
                None, // doesn't matter what we put here
                None, // doesn't matter what we put here
                ctx,
            )
            .context("create_timeline_struct")?;

        let mut guard = DeletionGuard(
            Arc::clone(&timeline.delete_progress)
                .try_lock_owned()
                .expect("cannot happen because we're the only owner"),
        );

        // We need to do this because when the console retries the delete request we shouldn't
        // answer with 404, because 404 means successful deletion.
        {
            let mut locked = tenant.timelines.lock().unwrap();
            locked.insert(timeline_id, Arc::clone(&timeline));
        }

        guard.mark_in_progress()?;

        let remote_client = timeline.remote_client.clone();
        let timeline = TimelineOrOffloaded::Timeline(timeline);
        Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client);

        Ok(())
    }

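    /// Spawn the remaining deletion steps ([`Self::background`]) onto the background runtime.
    ///
    /// Holds a tenant gate guard for the lifetime of the task, since deletion needs the tenant
    /// to update its manifest. If the tenant is already shutting down, this is a no-op: the
    /// timeline is already durably marked for deletion, so the flow resumes on the next start.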
    fn schedule_background(
        guard: DeletionGuard,
        conf: &'static PageServerConf,
        tenant: Arc<TenantShard>,
        timeline: TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) {
        let tenant_shard_id = timeline.tenant_shard_id();
        let timeline_id = timeline.timeline_id();

        // Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest.
        let Ok(tenant_guard) = tenant.gate.enter() else {
            // It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion.
            info!("Tenant is shutting down, timeline deletion will be resumed when it next starts");
            return;
        };

        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
            tenant_shard_id,
            Some(timeline_id),
            "timeline_delete",
            async move {
                let _guard = tenant_guard;

                if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
                    // Only log as an error if it's not a cancellation.
                    if matches!(err, DeleteTimelineError::Cancelled) {
                        info!("Shutdown during timeline deletion");
                    } else {
                        error!("Error: {err:#}");
                    }
                    if let TimelineOrOffloaded::Timeline(timeline) = timeline {
                        timeline.set_broken(format!("{err:#}"))
                    }
                };
                Ok(())
            }
            .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id)),
        );
    }

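    /// The background part of deletion: remove the local timeline directory (offloaded
    /// timelines have none), delete all remote data, drop the timeline from the tenant's
    /// in-memory maps, re-upload the tenant manifest, and mark the flow as `Finished`.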
    async fn background(
        mut guard: DeletionGuard,
        conf: &PageServerConf,
        tenant: &TenantShard,
        timeline: &TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) -> Result<(), DeleteTimelineError> {
        fail::fail_point!("timeline-delete-before-rm", |_| {
            Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
        });

        match timeline {
            TimelineOrOffloaded::Timeline(timeline) => {
                delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
            }
            TimelineOrOffloaded::Importing(importing) => {
                delete_local_timeline_directory(conf, tenant.tenant_shard_id, &importing.timeline)
                    .await;
            }
            TimelineOrOffloaded::Offloaded(_offloaded) => {
                // Offloaded timelines have no local state
                // TODO: once we persist offloaded information, delete the timeline from there, too
            }
        }

        fail::fail_point!("timeline-delete-after-rm", |_| {
            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
        });

        remote_client.delete_all().await?;

        pausable_failpoint!("in_progress_delete");

        remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;

        // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
        // between the deletion of the index-part.json and reaching this code.
        // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
        // However, we handle this case in the tenant loading code, so the next time we attach, the issue is
        // resolved.
        tenant
            .maybe_upload_tenant_manifest()
            .await
            .map_err(|err| match err {
                TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
                err => DeleteTimelineError::Other(err.into()),
            })?;

        *guard = Self::Finished;

        Ok(())
    }

    pub(crate) fn is_not_started(&self) -> bool {
        matches!(self, Self::NotStarted)
    }
}

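/// The purpose a timeline delete guard is taken for. Both kinds require that the timeline has
/// no active children; `Delete` additionally rejects offloaded children and transitions an
/// active timeline to `TimelineState::Stopping`.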
#[derive(Copy, Clone, PartialEq, Eq)]
pub(super) enum TimelineDeleteGuardKind {
    Offload,
    Delete,
}

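/// Look up the timeline (active, offloaded, or importing), verify it has no child branches that
/// the operation would break, and take the exclusive deletion guard.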
pub(super) fn make_timeline_delete_guard(
    tenant: &TenantShard,
    timeline_id: TimelineId,
    guard_kind: TimelineDeleteGuardKind,
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
    // Note the interaction between this guard and the deletion guard.
    // Here we attempt to lock the deletion guard while we're holding a lock on timelines.
    // This is important because, when you take into account `remove_timeline_from_tenant`,
    // we remove the timeline from memory while we still hold the deletion guard.
    // So when timeline deletion is finished, the timeline won't be present in the timelines map at all,
    // which makes the following sequence impossible:
    // T1: get preempted right before the try_lock on `Timeline::delete_progress`
    // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
    // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
    // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
    let timelines = tenant.timelines.lock().unwrap();
    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
    let timelines_importing = tenant.timelines_importing.lock().unwrap();

    let timeline = match timelines.get(&timeline_id) {
        Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
        None => match timelines_offloaded.get(&timeline_id) {
            Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
            None => match timelines_importing.get(&timeline_id) {
                Some(t) => TimelineOrOffloaded::Importing(Arc::clone(t)),
                None => return Err(DeleteTimelineError::NotFound),
            },
        },
    };

    // Ensure that there are no child timelines, because we are about to remove files,
    // which would break child branches.
    let mut children = Vec::new();
    if guard_kind == TimelineDeleteGuardKind::Delete {
        children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
            (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
        }));
    }
    children.extend(timelines.iter().filter_map(|(id, entry)| {
        (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
    }));

    if !children.is_empty() {
        return Err(DeleteTimelineError::HasChildren(children));
    }

    // Note that using try_lock here is important to avoid a deadlock.
    // Here we take the lock on timelines and then the deletion guard.
    // At the end of the operation we're holding the guard and need to lock the timelines map
    // to remove the timeline from it.
    // Whenever two locks are taken in different orders, a deadlock can result.

    let delete_progress = Arc::clone(timeline.delete_progress());
    let delete_lock_guard = match delete_progress.try_lock_owned() {
        Ok(guard) => DeletionGuard(guard),
        Err(_) => {
            // Unfortunately, if the lock fails, the Arc is consumed.
            return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
                timeline.delete_progress(),
            )));
        }
    };

    if guard_kind == TimelineDeleteGuardKind::Delete {
        if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
            timeline.set_state(TimelineState::Stopping);
        }
    }

    Ok((timeline, delete_lock_guard))
}

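/// Owned lock over a timeline's `delete_progress`. Holding it ensures that only one deletion
/// (or offload) flow operates on the timeline at a time; it dereferences to the underlying
/// [`DeleteTimelineFlow`] state.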
pub(super) struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);

impl Deref for DeletionGuard {
    type Target = DeleteTimelineFlow;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for DeletionGuard {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}