use std::{
    ops::{Deref, DerefMut},
    sync::Arc,
};

use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use tokio::sync::OwnedMutexGuard;
use tracing::{debug, error, info, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId};

use crate::{
    config::PageServerConf,
    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
        debug_assert_current_span_has_tenant_and_timeline_id,
        metadata::TimelineMetadata,
        remote_timeline_client::{
            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
        },
        CreateTimelineCause, DeleteTimelineError, Tenant,
    },
};

use super::{Timeline, TimelineResources};

/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
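/// Shutdown order, as implemented below: cancel the timeline's CancellationToken, stop the
/// walreceiver, shut down the layer flush task, stop the remote client's upload queue, wait for
/// the remaining per-timeline tasks, and finally wait for the timeline gate to close.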
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    debug_assert_current_span_has_tenant_and_timeline_id();
    // Notify any timeline work to drop out of loops/requests
    tracing::debug!("Cancelling CancellationToken");
    timeline.cancel.cancel();

    // Stop the walreceiver first.
    debug!("waiting for wal receiver to shutdown");
    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
    if let Some(walreceiver) = maybe_started_walreceiver {
        walreceiver.stop().await;
    }
    debug!("wal receiver shutdown confirmed");

    // Shut down the layer flush task before the remote client, as one depends on the other.
    task_mgr::shutdown_tasks(
        Some(TaskKind::LayerFlushTask),
        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
    )
    .await;

    // Prevent new uploads from starting.
    if let Some(remote_client) = timeline.remote_client.as_ref() {
        let res = remote_client.stop();
        match res {
            Ok(()) => {}
            Err(e) => match e {
                remote_timeline_client::StopError::QueueUninitialized => {
                    // This case shouldn't happen currently, because the load and attach code
                    // bails out if _any_ of the timelines fails to fetch its IndexPart,
                    // i.e., before we declare the Tenant as Active.
                    // And we only allow calls to delete_timeline on Active tenants.
                    return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
                }
            },
        }
    }

    // Stop & wait for the remaining timeline tasks, including upload tasks.
    // NB: This and other delete_timeline calls do not run as a task_mgr task,
    // so they are not affected by this shutdown_tasks() call.
    info!("waiting for timeline tasks to shutdown");
    task_mgr::shutdown_tasks(
        None,
        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
    )
    .await;

    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
        Err(anyhow::anyhow!(
            "failpoint: timeline-delete-before-index-deleted-at"
        ))?
    });

    tracing::debug!("Waiting for gate...");
    timeline.gate.close().await;
    tracing::debug!("Shutdown complete");

    Ok(())
}

/// Mark timeline as deleted in S3 so we won't pick it up next time
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    if let Some(remote_client) = timeline.remote_client.as_ref() {
        match remote_client.persist_index_part_with_deleted_flag().await {
            // If we (now, or already) marked it successfully as deleted, we can proceed
            Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
            // Bail out otherwise
            //
            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
            // two tasks from performing the deletion at the same time. The first task
            // that starts deletion should run it to completion.
            Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
                return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
            }
        }
    }
    Ok(())
}

/// Grab the compaction and gc locks, and actually perform the deletion.
///
/// The locks prevent GC or compaction from running at the same time. The background tasks do not
/// register themselves with the timeline they operate on, so they might still be running even
/// though we called `shutdown_tasks`.
///
/// Note that there are still other race conditions between
/// GC, compaction and timeline deletion. See
/// <https://github.com/neondatabase/neon/issues/2671>
///
/// No timeout here, GC & Compaction should be responsive to the
/// `TimelineState::Stopping` change.
// pub(super): documentation link
pub(super) async fn delete_local_timeline_directory(
    conf: &PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
) -> anyhow::Result<()> {
    let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
    let guards = crate::timed(
        guards,
        "acquire gc and compaction locks",
        std::time::Duration::from_secs(5),
    )
    .await;

    // NB: storage_sync upload tasks that reference these layers have been cancelled
    // by the caller.

    let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);

    fail::fail_point!("timeline-delete-before-rm", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
    });

    // NB: This need not be atomic because the deleted flag in the IndexPart
    // will be observed during tenant/timeline load. The deletion will be resumed there.
    //
    // Note that here we do not bail out on std::io::ErrorKind::NotFound.
    // This can happen if we're called a second time, e.g.,
    // because of a previous failure/cancellation at/after
    // failpoint timeline-delete-after-rm.
    //
    // ErrorKind::NotFound can also happen if we race with tenant detach, because
    // no locks are shared.
    tokio::fs::remove_dir_all(local_timeline_directory)
        .await
        .or_else(fs_ext::ignore_not_found)
        .context("remove local timeline directory")?;

    // Make sure previous deletions are ordered before mark removal.
    // Otherwise there is no guarantee that they reach the disk before mark deletion.
    // So it's possible for the mark to reach disk first and for the other deletions
    // to be reordered later and thus missed if a crash occurs.
    // Note that we don't need to sync after the mark file is removed
    // because we can tolerate the case when the mark file reappears on startup.
    let timeline_path = conf.timelines_path(&tenant_shard_id);
    crashsafe::fsync_async(timeline_path)
        .await
        .context("fsync_pre_mark_remove")?;

    info!("finished deleting layer files, releasing locks");
    drop(guards);

    fail::fail_point!("timeline-delete-after-rm", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
    });

    Ok(())
}

/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
    if let Some(remote_client) = &timeline.remote_client {
        remote_client.delete_all().await.context("delete_all")?
    };

    Ok(())
}

// This function removes remaining traces of a timeline on disk.
// Namely: metadata file, timeline directory, delete mark.
// Note: io::ErrorKind::NotFound is ignored for the metadata file and the timeline dir.
// The delete mark should be present because it is the last step during deletion.
// (nothing can fail after its deletion)
async fn cleanup_remaining_timeline_fs_traces(
    conf: &PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
) -> anyhow::Result<()> {
    // Remove delete mark
    // TODO: once we are confident that no more exist in the field, remove this
    // line. It cleans up a legacy marker file that might in rare cases be present.
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
        .await
        .or_else(fs_ext::ignore_not_found)
        .context("remove delete mark")
}

/// It is important that this gets called while the DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_timeline_from_tenant(
    tenant: &Tenant,
    timeline_id: TimelineId,
    _: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
    // Remove the timeline from the map.
    let mut timelines = tenant.timelines.lock().unwrap();
    let children_exist = timelines
        .iter()
        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
    // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
    // We already deleted the layer files, so it's probably best to panic.
    // (Ideally, the remove_dir_all above would be atomic so we don't see this timeline after a restart.)
    if children_exist {
        panic!("Timeline grew children while we removed layer files");
    }

    timelines
        .remove(&timeline_id)
        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");

    drop(timelines);

    Ok(())
}

/// Orchestrates the shutdown of all timeline tasks, removes the timeline's in-memory structures,
/// and deletes its data from both disk and S3.
/// The sequence of steps:
/// 1. Set deleted_at in remote index part.
/// 2. Create local mark file.
/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
/// 4. Delete remote layers
/// 5. Delete index part
/// 6. Delete meta, timeline directory
/// 7. Delete mark file
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
///    and we possibly need to continue deletion of remote files.
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
///    index but still have local metadata, timeline directory and delete mark.
/// Note the only other place that messes around with the timeline delete mark is the logic that scans the directory with timelines during tenant load.
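///
/// A minimal usage sketch of the main entrypoint (the handler wiring around it is an
/// assumption; only the `run` signature below is taken from this file):
/// ```ignore
/// // In a management API handler, with `tenant: Arc<Tenant>` and `timeline_id: TimelineId`:
/// // `inplace = false` runs the remaining steps on the background runtime.
/// DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
/// ```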
#[derive(Default)]
pub enum DeleteTimelineFlow {
    #[default]
    NotStarted,
    InProgress,
    Finished,
}

impl DeleteTimelineFlow {
    // These steps are run in the context of a management api request handler.
    // Long-running steps continue to run in the background.
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
    #[instrument(skip_all, fields(%inplace))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;

        guard.mark_in_progress()?;

        stop_tasks(&timeline).await?;

        set_deleted_in_remote_index(&timeline).await?;

        fail::fail_point!("timeline-delete-before-schedule", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-schedule"
            ))?
        });

        if inplace {
            Self::background(guard, tenant.conf, tenant, &timeline).await?
        } else {
            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
        }

        Ok(())
    }

    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
        match self {
            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
            Self::InProgress { .. } => { /* We're in a retry */ }
            Self::NotStarted => { /* Fresh start */ }
        }

        *self = Self::InProgress;

        Ok(())
    }

    /// Shortcut to create a Timeline in stopping state and spawn the deletion task.
    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn resume_deletion(
        tenant: Arc<Tenant>,
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: Option<RemoteTimelineClient>,
        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating the layer map. The Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
        let timeline = tenant
            .create_timeline_struct(
                timeline_id,
                local_metadata,
                None, // Ancestor is not needed for deletion.
                TimelineResources {
                    remote_client,
                    deletion_queue_client,
                    timeline_get_throttle: tenant.timeline_get_throttle.clone(),
                },
                // Important. We don't pass the ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
            )
            .context("create_timeline_struct")?;

        let mut guard = DeletionGuard(
            Arc::clone(&timeline.delete_progress)
                .try_lock_owned()
                .expect("cannot happen because we're the only owner"),
        );

        // We need to do this because when the console retries the delete request we shouldn't
        // answer with 404, because 404 means successful deletion.
        {
            let mut locked = tenant.timelines.lock().unwrap();
            locked.insert(timeline_id, Arc::clone(&timeline));
        }

        guard.mark_in_progress()?;

        Self::schedule_background(guard, tenant.conf, tenant, timeline);

        Ok(())
    }

    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn cleanup_remaining_timeline_fs_traces(
        tenant: &Tenant,
        timeline_id: TimelineId,
    ) -> anyhow::Result<()> {
        let r =
            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
                .await;
        info!("Done");
        r
    }

    fn prepare(
        tenant: &Tenant,
        timeline_id: TimelineId,
    ) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
        // Note the interaction between this guard and the deletion guard.
        // Here we attempt to lock the deletion guard while holding a lock on the timelines map.
        // This is important because, taking `remove_timeline_from_tenant` into account,
        // we remove the timeline from memory while still holding the deletion guard.
        // So when timeline deletion is finished, the timeline won't be present in the timelines map at all,
        // which makes the following sequence impossible:
        // T1: get preempted right before the try_lock on `Timeline::delete_progress`
        // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
        // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
        // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
        let timelines = tenant.timelines.lock().unwrap();

        let timeline = match timelines.get(&timeline_id) {
            Some(t) => t,
            None => return Err(DeleteTimelineError::NotFound),
        };

        // Ensure that there are no child timelines **attached to that pageserver**,
        // because detach removes files, which will break child branches
        let children: Vec<TimelineId> = timelines
            .iter()
            .filter_map(|(id, entry)| {
                if entry.get_ancestor_timeline_id() == Some(timeline_id) {
                    Some(*id)
                } else {
                    None
                }
            })
            .collect();

        if !children.is_empty() {
            return Err(DeleteTimelineError::HasChildren(children));
        }

        // Note that using try_lock here is important to avoid a deadlock.
        // Here we take the lock on the timelines map and then the deletion guard.
        // At the end of the operation we're holding the guard and need to lock the timelines map
        // to remove the timeline from it.
        // Whenever two locks are taken in different orders, a deadlock can result.

        let delete_progress = Arc::clone(&timeline.delete_progress);
        let delete_lock_guard = match delete_progress.try_lock_owned() {
            Ok(guard) => DeletionGuard(guard),
            Err(_) => {
                // Unfortunately, if the lock attempt fails, the Arc is consumed.
                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
                    &timeline.delete_progress,
                )));
            }
        };

        timeline.set_state(TimelineState::Stopping);

        Ok((Arc::clone(timeline), delete_lock_guard))
    }

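    /// Spawns the remaining deletion steps ([`Self::background`]) onto the background runtime so
    /// the caller can return early. If the background task fails, the timeline is marked Broken.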
    fn schedule_background(
        guard: DeletionGuard,
        conf: &'static PageServerConf,
        tenant: Arc<Tenant>,
        timeline: Arc<Timeline>,
    ) {
        let tenant_shard_id = timeline.tenant_shard_id;
        let timeline_id = timeline.timeline_id;

        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
            Some(tenant_shard_id),
            Some(timeline_id),
            "timeline_delete",
            false,
            async move {
                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
                    error!("Error: {err:#}");
                    timeline.set_broken(format!("{err:#}"))
                };
                Ok(())
            }
            .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id)),
        );
    }

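    /// The long-running part of the deletion: removes the local timeline directory, then the
    /// remote layers and index, drops the timeline from the tenant's in-memory map, and finally
    /// marks the flow as Finished.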
    async fn background(
        mut guard: DeletionGuard,
        conf: &PageServerConf,
        tenant: &Tenant,
        timeline: &Timeline,
    ) -> Result<(), DeleteTimelineError> {
        delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;

        delete_remote_layers_and_index(timeline).await?;

        pausable_failpoint!("in_progress_delete");

        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

        *guard = Self::Finished;

        Ok(())
    }

    pub(crate) fn is_finished(&self) -> bool {
        matches!(self, Self::Finished)
    }
}

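/// Owned guard over `Timeline::delete_progress`. Holding it marks the current task as the sole
/// owner of the deletion flow; `remove_timeline_from_tenant` takes it as a witness.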
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);

impl Deref for DeletionGuard {
    type Target = DeleteTimelineFlow;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for DeletionGuard {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}