Line data Source code
1 : use std::{
2 : ops::{Deref, DerefMut},
3 : sync::Arc,
4 : };
5 :
6 : use anyhow::Context;
7 : use pageserver_api::{models::TimelineState, shard::TenantShardId};
8 : use tokio::sync::OwnedMutexGuard;
9 : use tracing::{error, info, instrument, Instrument};
10 : use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
11 :
12 : use crate::{
13 : config::PageServerConf,
14 : task_mgr::{self, TaskKind},
15 : tenant::{
16 : metadata::TimelineMetadata,
17 : remote_timeline_client::{
18 : self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
19 : },
20 : CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
21 : },
22 : };
23 :
24 : use super::{Timeline, TimelineResources};
25 :
26 : /// Mark the timeline as deleted in S3 so we won't pick it up again
27 : /// during attach or pageserver restart.
28 : /// See the comment in `persist_index_part_with_deleted_flag`.
29 0 : async fn set_deleted_in_remote_index(
30 0 : remote_client: &Arc<RemoteTimelineClient>,
31 0 : ) -> Result<(), DeleteTimelineError> {
32 0 : let res = remote_client.persist_index_part_with_deleted_flag().await;
33 0 : match res {
34 : // If we (now, or already) marked it successfully as deleted, we can proceed
35 0 : Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
36 : // Bail out otherwise
37 : //
38 : // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
39 : // two tasks from performing the deletion at the same time. The first task
40 : // that starts deletion should run it to completion.
41 0 : Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
42 0 : | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
43 0 : return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
44 : }
45 : }
46 0 : Ok(())
47 0 : }
48 :
49 : /// Grab the compaction and gc locks, and actually perform the deletion.
50 : ///
51 : /// The locks prevent GC or compaction from running at the same time. The background tasks do not
52 : /// register themselves with the timeline they operate on, so they might still be running even
53 : /// though we called `shutdown_tasks`.
54 : ///
55 : /// Note that there are still other race conditions between
56 : /// GC, compaction and timeline deletion. See
57 : /// <https://github.com/neondatabase/neon/issues/2671>
58 : ///
59 : /// No timeout here: GC and compaction should be responsive to the
60 : /// `TimelineState::Stopping` change.
61 : // pub(super): documentation link
62 0 : pub(super) async fn delete_local_timeline_directory(
63 0 : conf: &PageServerConf,
64 0 : tenant_shard_id: TenantShardId,
65 0 : timeline: &Timeline,
66 0 : ) -> anyhow::Result<()> {
67 0 : // Always ensure the lock order is compaction -> gc.
68 0 : let compaction_lock = timeline.compaction_lock.lock();
69 0 : let compaction_lock = crate::timed(
70 0 : compaction_lock,
71 0 : "acquires compaction lock",
72 0 : std::time::Duration::from_secs(5),
73 0 : )
74 0 : .await;
75 :
76 0 : let gc_lock = timeline.gc_lock.lock();
77 0 : let gc_lock = crate::timed(
78 0 : gc_lock,
79 0 : "acquires gc lock",
80 0 : std::time::Duration::from_secs(5),
81 0 : )
82 0 : .await;
83 :
84 : // NB: storage_sync upload tasks that reference these layers have been cancelled
85 : // by the caller.
86 :
87 0 : let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
88 0 :
89 0 : fail::fail_point!("timeline-delete-before-rm", |_| {
90 0 : Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
91 0 : });
92 :
93 : // NB: This need not be atomic because the deleted flag in the IndexPart
94 : // will be observed during tenant/timeline load. The deletion will be resumed there.
95 : //
96 : // Note that here we do not bail out on std::io::ErrorKind::NotFound.
97 : // This can happen if we're called a second time, e.g.,
98 : // because of a previous failure/cancellation at/after
99 : // failpoint timeline-delete-after-rm.
100 : //
101 : // ErrorKind::NotFound can also happen if we race with tenant detach, because
102 : // no locks are shared between the two operations.
103 0 : tokio::fs::remove_dir_all(local_timeline_directory)
104 0 : .await
105 0 : .or_else(fs_ext::ignore_not_found)
106 0 : .context("remove local timeline directory")?;
107 :
108 : // Make sure previous deletions are ordered before the mark removal.
109 : // Otherwise there is no guarantee that they reach the disk before the mark deletion,
110 : // so it's possible for the mark to reach disk first and for the other deletions
111 : // to be reordered later and thus missed if a crash occurs.
112 : // Note that we don't need to sync after the mark file is removed,
113 : // because we can tolerate the case where the mark file reappears on startup.
114 0 : let timeline_path = conf.timelines_path(&tenant_shard_id);
115 0 : crashsafe::fsync_async(timeline_path)
116 0 : .await
117 0 : .context("fsync_pre_mark_remove")?;
118 :
119 0 : info!("finished deleting layer files, releasing locks");
120 0 : drop(gc_lock);
121 0 : drop(compaction_lock);
122 0 :
123 0 : fail::fail_point!("timeline-delete-after-rm", |_| {
124 0 : Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
125 0 : });
126 :
127 0 : Ok(())
128 0 : }
129 :
130 : /// Removes the remote layers and then the index file.
131 0 : async fn delete_remote_layers_and_index(
132 0 : remote_client: &Arc<RemoteTimelineClient>,
133 0 : ) -> anyhow::Result<()> {
134 0 : remote_client.delete_all().await.context("delete_all")
135 0 : }
136 :
137 : /// It is important that this gets called while the `DeletionGuard` is held.
138 : /// For more context see comments in [`DeleteTimelineFlow::prepare`]
139 0 : async fn remove_maybe_offloaded_timeline_from_tenant(
140 0 : tenant: &Tenant,
141 0 : timeline: &TimelineOrOffloaded,
142 0 : _: &DeletionGuard, // using it as a witness
143 0 : ) -> anyhow::Result<()> {
144 0 : // Remove the timeline from the map.
145 0 : // This observes the locking order between timelines and timelines_offloaded
146 0 : let mut timelines = tenant.timelines.lock().unwrap();
147 0 : let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
148 0 : let offloaded_children_exist = timelines_offloaded
149 0 : .iter()
150 0 : .any(|(_, entry)| entry.ancestor_timeline_id == Some(timeline.timeline_id()));
151 0 : let children_exist = timelines
152 0 : .iter()
153 0 : .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id()));
154 0 : // XXX this can happen because of race conditions with branch creation.
155 0 : // We already deleted the remote layer files, so it's probably best to panic.
156 0 : if children_exist || offloaded_children_exist {
157 0 : panic!("Timeline grew children while we removed layer files");
158 0 : }
159 0 :
160 0 : match timeline {
161 0 : TimelineOrOffloaded::Timeline(timeline) => {
162 0 : timelines.remove(&timeline.timeline_id).expect(
163 0 : "timeline that we were deleting was concurrently removed from 'timelines' map",
164 0 : );
165 0 : }
166 0 : TimelineOrOffloaded::Offloaded(timeline) => {
167 0 : timelines_offloaded
168 0 : .remove(&timeline.timeline_id)
169 0 : .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map");
170 0 : }
171 : }
172 :
173 0 : drop(timelines_offloaded);
174 0 : drop(timelines);
175 0 :
176 0 : Ok(())
177 0 : }
178 :
179 : /// It is important that this gets called while the `DeletionGuard` is held.
180 : /// For more context see comments in [`DeleteTimelineFlow::prepare`]
181 0 : async fn upload_new_tenant_manifest(
182 0 : tenant: &Tenant,
183 0 : _: &DeletionGuard, // using it as a witness
184 0 : ) -> anyhow::Result<()> {
185 0 : // This is susceptible to race conditions: we won't continue deletions if there is a crash
186 0 : // between the deletion of the index-part.json and reaching this code.
187 0 : // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
188 0 : // However, we handle this case in the tenant loading code, so the next time we attach the issue is
189 0 : // resolved.
190 0 : let manifest = tenant.tenant_manifest();
191 0 : // TODO: generation support
192 0 : let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
193 0 : remote_timeline_client::upload_tenant_manifest(
194 0 : &tenant.remote_storage,
195 0 : &tenant.tenant_shard_id,
196 0 : generation,
197 0 : &manifest,
198 0 : &tenant.cancel,
199 0 : )
200 0 : .await?;
201 :
202 0 : Ok(())
203 0 : }
204 :
205 : /// Orchestrates timeline deletion: shuts down all timeline tasks, removes the timeline's in-memory structures,
206 : /// and deletes its data from both disk and S3.
207 : /// The sequence of steps:
208 : /// 1. Set deleted_at in remote index part.
209 : /// 2. Create local mark file.
210 : /// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
211 : /// 4. Delete remote layers
212 : /// 5. Delete index part
213 : /// 6. Delete meta, timeline directory
214 : /// 7. Delete mark file
215 : ///
216 : /// It is resumable from any step in case a crash/restart occurs.
217 : /// There are two entrypoints to the process:
218 : /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
219 : /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
220 : /// and we may need to continue deleting remote files.
221 : ///
222 : /// Note that the only other place that touches the timeline delete mark is the logic that scans the timelines directory during tenant load.
223 : #[derive(Default)]
224 : pub enum DeleteTimelineFlow {
225 : #[default]
226 : NotStarted,
227 : InProgress,
228 : Finished,
229 : }
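// Illustrative sketch (not part of the original file): how a management API
// handler might drive this flow. The function name here is hypothetical; only
// `DeleteTimelineFlow::run` and its signature come from the code below.
#[allow(dead_code)]
async fn handle_timeline_delete_sketch(
    tenant: Arc<Tenant>,
    timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
    // NB: the real handler invokes this inside a span that carries the tenant
    // and timeline ids (see debug_assert_current_span_has_tenant_and_timeline_id).
    //
    // `run` marks the timeline as Stopping, sets `deleted_at` in the remote
    // index, and schedules the remaining deletion steps on a background task.
    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
    // Retrying after the flow has fully finished yields
    // `DeleteTimelineError::NotFound`, which callers treat as successful deletion.
    Ok(())
}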
230 :
231 : impl DeleteTimelineFlow {
232 : // These steps are run in the context of the management API request handler.
233 : // Long-running steps continue to run in the background.
234 : // NB: If this fails half-way through, and is retried, the retry will go through
235 : // all the same steps again. Make sure the code here is idempotent, and don't
236 : // error out if some of the shutdown tasks have already been completed!
237 0 : #[instrument(skip_all)]
238 : pub async fn run(
239 : tenant: &Arc<Tenant>,
240 : timeline_id: TimelineId,
241 : ) -> Result<(), DeleteTimelineError> {
242 : super::debug_assert_current_span_has_tenant_and_timeline_id();
243 :
244 : let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
245 :
246 : guard.mark_in_progress()?;
247 :
248 : // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
249 : if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
250 : timeline.shutdown(super::ShutdownMode::Hard).await;
251 : }
252 :
253 : tenant.gc_block.before_delete(&timeline.timeline_id());
254 :
255 0 : fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
256 0 : Err(anyhow::anyhow!(
257 0 : "failpoint: timeline-delete-before-index-deleted-at"
258 0 : ))?
259 0 : });
260 :
261 : let remote_client = timeline.remote_client_maybe_construct(tenant);
262 : set_deleted_in_remote_index(&remote_client).await?;
263 :
264 0 : fail::fail_point!("timeline-delete-before-schedule", |_| {
265 0 : Err(anyhow::anyhow!(
266 0 : "failpoint: timeline-delete-before-schedule"
267 0 : ))?
268 0 : });
269 :
270 : Self::schedule_background(
271 : guard,
272 : tenant.conf,
273 : Arc::clone(tenant),
274 : timeline,
275 : remote_client,
276 : );
277 :
278 : Ok(())
279 : }
280 :
281 0 : fn mark_in_progress(&mut self) -> anyhow::Result<()> {
282 0 : match self {
283 0 : Self::Finished => anyhow::bail!("Bug. Is in finished state"),
284 0 : Self::InProgress { .. } => { /* We're in a retry */ }
285 0 : Self::NotStarted => { /* Fresh start */ }
286 : }
287 :
288 0 : *self = Self::InProgress;
289 0 :
290 0 : Ok(())
291 0 : }
292 :
293 : /// Shortcut to create a Timeline in the Stopping state and spawn the deletion task.
294 0 : #[instrument(skip_all, fields(%timeline_id))]
295 : pub async fn resume_deletion(
296 : tenant: Arc<Tenant>,
297 : timeline_id: TimelineId,
298 : local_metadata: &TimelineMetadata,
299 : remote_client: RemoteTimelineClient,
300 : ) -> anyhow::Result<()> {
301 : // Note: here we even skip populating the layer map. The Timeline is essentially uninitialized;
302 : // the RemoteTimelineClient is the only functioning part.
303 : let timeline = tenant
304 : .create_timeline_struct(
305 : timeline_id,
306 : local_metadata,
307 : None, // Ancestor is not needed for deletion.
308 : TimelineResources {
309 : remote_client,
310 : timeline_get_throttle: tenant.timeline_get_throttle.clone(),
311 : l0_flush_global_state: tenant.l0_flush_global_state.clone(),
312 : },
313 : // Important: we don't pass the ancestor above because it can be missing.
314 : // Thus we need to skip the validation here.
315 : CreateTimelineCause::Delete,
316 : )
317 : .context("create_timeline_struct")?;
318 :
319 : let mut guard = DeletionGuard(
320 : Arc::clone(&timeline.delete_progress)
321 : .try_lock_owned()
322 : .expect("cannot happen because we're the only owner"),
323 : );
324 :
325 : // We need to do this because when the console retries the delete request we shouldn't answer with 404,
326 : // since 404 means the deletion succeeded.
327 : {
328 : let mut locked = tenant.timelines.lock().unwrap();
329 : locked.insert(timeline_id, Arc::clone(&timeline));
330 : }
331 :
332 : guard.mark_in_progress()?;
333 :
334 : let remote_client = timeline.remote_client.clone();
335 : let timeline = TimelineOrOffloaded::Timeline(timeline);
336 : Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client);
337 :
338 : Ok(())
339 : }
340 :
341 0 : pub(super) fn prepare(
342 0 : tenant: &Tenant,
343 0 : timeline_id: TimelineId,
344 0 : ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
345 0 : // Note the interaction between the timelines lock and the deletion guard.
346 0 : // Here we attempt to take the deletion guard while we're holding the lock on timelines.
347 0 : // This is important because, as `remove_maybe_offloaded_timeline_from_tenant` shows,
348 0 : // we remove the timeline from memory while we still hold the deletion guard.
349 0 : // So when timeline deletion is finished, the timeline won't be present in the timelines map at all,
350 0 : // which makes the following sequence impossible:
351 0 : // T1: get preempted right before the try_lock on `Timeline::delete_progress`
352 0 : // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
353 0 : // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
354 0 : // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
355 0 : let timelines = tenant.timelines.lock().unwrap();
356 :
357 0 : let timeline = match timelines.get(&timeline_id) {
358 0 : Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
359 : None => {
360 0 : let offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
361 0 : match offloaded_timelines.get(&timeline_id) {
362 0 : Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
363 0 : None => return Err(DeleteTimelineError::NotFound),
364 : }
365 : }
366 : };
367 :
368 : // Ensure that there are no child timelines **attached to that pageserver**,
369 : // because detach removes files, which will break child branches
370 0 : let children: Vec<TimelineId> = timelines
371 0 : .iter()
372 0 : .filter_map(|(id, entry)| {
373 0 : if entry.get_ancestor_timeline_id() == Some(timeline_id) {
374 0 : Some(*id)
375 : } else {
376 0 : None
377 : }
378 0 : })
379 0 : .collect();
380 0 :
381 0 : if !children.is_empty() {
382 0 : return Err(DeleteTimelineError::HasChildren(children));
383 0 : }
384 0 :
385 0 : // Note that using try_lock here is important to avoid a deadlock.
386 0 : // Here we take the lock on timelines and then the deletion guard.
387 0 : // At the end of the operation we're holding the guard and need to lock the timelines map
388 0 : // to remove the timeline from it.
389 0 : // Whenever two locks are taken in different orders, a deadlock can result.
390 0 :
391 0 : let delete_progress = Arc::clone(timeline.delete_progress());
392 0 : let delete_lock_guard = match delete_progress.try_lock_owned() {
393 0 : Ok(guard) => DeletionGuard(guard),
394 : Err(_) => {
395 : // Unfortunately, if the lock attempt fails, the Arc is consumed.
396 0 : return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
397 0 : timeline.delete_progress(),
398 0 : )));
399 : }
400 : };
401 :
402 0 : if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
403 0 : timeline.set_state(TimelineState::Stopping);
404 0 : }
405 :
406 0 : Ok((timeline, delete_lock_guard))
407 0 : }
408 :
409 0 : fn schedule_background(
410 0 : guard: DeletionGuard,
411 0 : conf: &'static PageServerConf,
412 0 : tenant: Arc<Tenant>,
413 0 : timeline: TimelineOrOffloaded,
414 0 : remote_client: Arc<RemoteTimelineClient>,
415 0 : ) {
416 0 : let tenant_shard_id = timeline.tenant_shard_id();
417 0 : let timeline_id = timeline.timeline_id();
418 0 :
419 0 : task_mgr::spawn(
420 0 : task_mgr::BACKGROUND_RUNTIME.handle(),
421 0 : TaskKind::TimelineDeletionWorker,
422 0 : tenant_shard_id,
423 0 : Some(timeline_id),
424 0 : "timeline_delete",
425 0 : async move {
426 0 : if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
427 0 : error!("Error: {err:#}");
428 0 : if let TimelineOrOffloaded::Timeline(timeline) = timeline {
429 0 : timeline.set_broken(format!("{err:#}"))
430 0 : }
431 0 : };
432 0 : Ok(())
433 0 : }
434 0 : .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)),
435 : );
436 0 : }
437 :
438 0 : async fn background(
439 0 : mut guard: DeletionGuard,
440 0 : conf: &PageServerConf,
441 0 : tenant: &Tenant,
442 0 : timeline: &TimelineOrOffloaded,
443 0 : remote_client: Arc<RemoteTimelineClient>,
444 0 : ) -> Result<(), DeleteTimelineError> {
445 : // Offloaded timelines have no local state
446 : // TODO: once we persist offloaded information, delete the timeline from there, too
447 0 : if let TimelineOrOffloaded::Timeline(timeline) = timeline {
448 0 : delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
449 0 : }
450 :
451 0 : delete_remote_layers_and_index(&remote_client).await?;
452 :
453 0 : pausable_failpoint!("in_progress_delete");
454 :
455 0 : remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
456 :
457 0 : upload_new_tenant_manifest(tenant, &guard).await?;
458 :
459 0 : *guard = Self::Finished;
460 0 :
461 0 : Ok(())
462 0 : }
463 :
464 0 : pub(crate) fn is_not_started(&self) -> bool {
465 0 : matches!(self, Self::NotStarted)
466 0 : }
467 : }
468 :
469 : pub(super) struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
470 :
471 : impl Deref for DeletionGuard {
472 : type Target = DeleteTimelineFlow;
473 :
474 0 : fn deref(&self) -> &Self::Target {
475 0 : &self.0
476 0 : }
477 : }
478 :
479 : impl DerefMut for DeletionGuard {
480 0 : fn deref_mut(&mut self) -> &mut Self::Target {
481 0 : &mut self.0
482 0 : }
483 : }