Line data Source code
1 : use std::{
2 : ops::{Deref, DerefMut},
3 : sync::Arc,
4 : };
5 :
6 : use anyhow::Context;
7 : use pageserver_api::{models::TimelineState, shard::TenantShardId};
8 : use tokio::sync::OwnedMutexGuard;
9 : use tracing::{error, info, instrument, Instrument};
10 : use utils::{crashsafe, fs_ext, id::TimelineId};
11 :
12 : use crate::{
13 : config::PageServerConf,
14 : deletion_queue::DeletionQueueClient,
15 : task_mgr::{self, TaskKind},
16 : tenant::{
17 : metadata::TimelineMetadata,
18 : remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
19 : CreateTimelineCause, DeleteTimelineError, Tenant,
20 : },
21 : };
22 :
23 : use super::{Timeline, TimelineResources};
24 :
25 : /// Mark the timeline as deleted in S3 so we won't pick it up next time
26 : /// during attach or pageserver restart.
27 : /// See the comment in `persist_index_part_with_deleted_flag`.
28 0 : async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
29 0 : match timeline
30 0 : .remote_client
31 0 : .persist_index_part_with_deleted_flag()
32 0 : .await
33 : {
34 : // If we (now, or already) marked it successfully as deleted, we can proceed
35 0 : Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
36 : // Bail out otherwise
37 : //
38 : // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
39 : // two tasks from performing the deletion at the same time. The first task
40 : // that starts deletion should run it to completion.
41 0 : Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
42 0 : | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
43 0 : return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
44 : }
45 : }
46 0 : Ok(())
47 0 : }
48 :
49 : /// Grab the compaction and gc locks, and actually perform the deletion.
50 : ///
51 : /// The locks prevent GC or compaction from running at the same time. The background tasks do not
52 : /// register themselves with the timeline they operate on, so they might still be running even
53 : /// though we called `shutdown_tasks`.
54 : ///
55 : /// Note that there are still other race conditions between
56 : /// GC, compaction and timeline deletion. See
57 : /// <https://github.com/neondatabase/neon/issues/2671>
58 : ///
59 : /// No timeout here; GC & Compaction should be responsive to the
60 : /// `TimelineState::Stopping` change.
61 : // pub(super) so that documentation links to this function resolve.
62 0 : pub(super) async fn delete_local_timeline_directory(
63 0 : conf: &PageServerConf,
64 0 : tenant_shard_id: TenantShardId,
65 0 : timeline: &Timeline,
66 0 : ) -> anyhow::Result<()> {
67 0 : let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
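// NB: `crate::timed` presumably just reports slow acquisition of the two locks; per the
// doc comment above, the 5-second value is a reporting threshold, not a timeout.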
68 0 : let guards = crate::timed(
69 0 : guards,
70 0 : "acquire gc and compaction locks",
71 0 : std::time::Duration::from_secs(5),
72 0 : )
73 0 : .await;
74 :
75 : // NB: storage_sync upload tasks that reference these layers have been cancelled
76 : // by the caller.
77 :
78 0 : let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
79 0 :
80 0 : fail::fail_point!("timeline-delete-before-rm", |_| {
81 0 : Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
82 0 : });
83 :
84 : // NB: This need not be atomic because the deleted flag in the IndexPart
85 : // will be observed during tenant/timeline load. The deletion will be resumed there.
86 : //
87 : // Note that here we do not bail out on std::io::ErrorKind::NotFound.
88 : // This can happen if we're called a second time, e.g.,
89 : // because of a previous failure/cancellation at/after
90 : // failpoint timeline-delete-after-rm.
91 : //
92 : // ErrorKind::NotFound can also happen if we race with tenant detach, because
93 : // no locks are shared.
94 0 : tokio::fs::remove_dir_all(local_timeline_directory)
95 0 : .await
96 0 : .or_else(fs_ext::ignore_not_found)
97 0 : .context("remove local timeline directory")?;
98 :
99 : // Make sure previous deletions are ordered before mark removal.
100 : // Otherwise there is no guarantee that they reach the disk before mark deletion.
101 : // So it's possible for the mark to reach disk first and for other deletions
102 : // to be reordered later and thus missed if a crash occurs.
103 : // Note that we don't need to sync after the mark file is removed,
104 : // because we can tolerate the case when the mark file reappears on startup.
105 0 : let timeline_path = conf.timelines_path(&tenant_shard_id);
106 0 : crashsafe::fsync_async(timeline_path)
107 0 : .await
108 0 : .context("fsync_pre_mark_remove")?;
109 :
110 0 : info!("finished deleting layer files, releasing locks");
111 0 : drop(guards);
112 0 :
113 0 : fail::fail_point!("timeline-delete-after-rm", |_| {
114 0 : Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
115 0 : });
116 :
117 0 : Ok(())
118 0 : }
119 :
120 : /// Removes the remote layers and, after them, the index file.
121 0 : async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
122 0 : timeline
123 0 : .remote_client
124 0 : .delete_all()
125 0 : .await
126 0 : .context("delete_all")
127 0 : }
128 :
129 : // This function removes remaining traces of a timeline on disk.
130 : // Namely: metadata file, timeline directory, delete mark.
131 : // Note: io::ErrorKind::NotFound is ignored for the metadata file and timeline dir.
132 : // The delete mark should be present, because removing it is the last step during deletion
133 : // (nothing can fail after its deletion).
134 0 : async fn cleanup_remaining_timeline_fs_traces(
135 0 : conf: &PageServerConf,
136 0 : tenant_shard_id: TenantShardId,
137 0 : timeline_id: TimelineId,
138 0 : ) -> anyhow::Result<()> {
139 0 : // Remove delete mark
140 0 : // TODO: once we are confident that no more exist in the field, remove this
141 0 : // line. It cleans up a legacy marker file that might in rare cases be present.
142 0 : tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
143 0 : .await
144 0 : .or_else(fs_ext::ignore_not_found)
145 0 : .context("remove delete mark")
146 0 : }
147 :
148 : /// It is important that this gets called while the DeletionGuard is being held.
149 : /// For more context see comments in [`DeleteTimelineFlow::prepare`].
150 0 : async fn remove_timeline_from_tenant(
151 0 : tenant: &Tenant,
152 0 : timeline_id: TimelineId,
153 0 : _: &DeletionGuard, // using it as a witness
154 0 : ) -> anyhow::Result<()> {
155 0 : // Remove the timeline from the map.
156 0 : let mut timelines = tenant.timelines.lock().unwrap();
157 0 : let children_exist = timelines
158 0 : .iter()
159 0 : .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
160 0 : // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
161 0 : // We already deleted the layer files, so it's probably best to panic.
162 0 : // (Ideally, the remove_dir_all above would be atomic, so we wouldn't see this timeline after a restart.)
163 0 : if children_exist {
164 0 : panic!("Timeline grew children while we removed layer files");
165 0 : }
166 0 :
167 0 : timelines
168 0 : .remove(&timeline_id)
169 0 : .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
170 0 :
171 0 : drop(timelines);
172 0 :
173 0 : Ok(())
174 0 : }
175 :
176 : /// Orchestrates the shutdown of all timeline tasks, removes the timeline's in-memory structures,
177 : /// and deletes its data from both disk and S3.
178 : /// The sequence of steps:
179 : /// 1. Set deleted_at in the remote index part.
180 : /// 2. Create the local mark file.
181 : /// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata).
182 : /// 4. Delete remote layers.
183 : /// 5. Delete the index part.
184 : /// 6. Delete the metadata and timeline directory.
185 : /// 7. Delete the mark file.
186 : /// It is resumable from any step in case a crash/restart occurs.
187 : /// There are three entrypoints to the process:
188 : /// 1. [`DeleteTimelineFlow::run`] is the main one, called by a management api handler.
189 : /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
190 : /// and we possibly need to continue deletion of remote files.
191 : /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted the remote
192 : /// index but still have the local metadata, timeline directory and delete mark.
193 : /// Note that the only other place that messes with the timeline delete mark is the logic that scans the timelines directory during tenant load.
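///
/// A minimal usage sketch (hypothetical caller; only the `run` signature is taken from
/// this module, the rest is assumed):
///
/// ```ignore
/// // From a management API handler: start deletion and let the long-running steps
/// // continue on the background runtime (`inplace = false`). Retries are safe, since
/// // `run` is idempotent.
/// DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
/// ```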
194 : #[derive(Default)]
195 : pub enum DeleteTimelineFlow {
196 : #[default]
197 : NotStarted,
198 : InProgress,
199 : Finished,
200 : }
201 :
202 : impl DeleteTimelineFlow {
203 : // These steps are run in the context of a management api request handler.
204 : // Long-running steps continue to run in the background.
205 : // NB: If this fails half-way through, and is retried, the retry will go through
206 : // all the same steps again. Make sure the code here is idempotent, and don't
207 : // error out if some of the shutdown tasks have already been completed!
208 0 : #[instrument(skip_all, fields(%inplace))]
209 : pub async fn run(
210 : tenant: &Arc<Tenant>,
211 : timeline_id: TimelineId,
212 : inplace: bool,
213 : ) -> Result<(), DeleteTimelineError> {
214 : super::debug_assert_current_span_has_tenant_and_timeline_id();
215 :
216 : let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
217 :
218 : guard.mark_in_progress()?;
219 :
220 : // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
221 : timeline.shutdown(super::ShutdownMode::Hard).await;
222 :
223 0 : fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
224 0 : Err(anyhow::anyhow!(
225 0 : "failpoint: timeline-delete-before-index-deleted-at"
226 0 : ))?
227 0 : });
228 :
229 : set_deleted_in_remote_index(&timeline).await?;
230 :
231 0 : fail::fail_point!("timeline-delete-before-schedule", |_| {
232 0 : Err(anyhow::anyhow!(
233 0 : "failpoint: timeline-delete-before-schedule"
234 0 : ))?
235 0 : });
236 :
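// `inplace` runs the remaining deletion steps in the request context; otherwise they
// are handed off to a background task via `schedule_background`.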
237 : if inplace {
238 : Self::background(guard, tenant.conf, tenant, &timeline).await?
239 : } else {
240 : Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
241 : }
242 :
243 : Ok(())
244 : }
245 :
246 0 : fn mark_in_progress(&mut self) -> anyhow::Result<()> {
247 0 : match self {
248 0 : Self::Finished => anyhow::bail!("Bug. Is in finished state"),
249 0 : Self::InProgress { .. } => { /* We're in a retry */ }
250 0 : Self::NotStarted => { /* Fresh start */ }
251 : }
252 :
253 0 : *self = Self::InProgress;
254 0 :
255 0 : Ok(())
256 0 : }
257 :
258 : /// Shortcut to create a Timeline in Stopping state and spawn the deletion task.
259 : /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
260 0 : #[instrument(skip_all, fields(%timeline_id))]
261 : pub async fn resume_deletion(
262 : tenant: Arc<Tenant>,
263 : timeline_id: TimelineId,
264 : local_metadata: &TimelineMetadata,
265 : remote_client: RemoteTimelineClient,
266 : deletion_queue_client: DeletionQueueClient,
267 : ) -> anyhow::Result<()> {
268 : // Note: here we even skip populating the layer map. The Timeline is essentially uninitialized.
269 : // RemoteTimelineClient is the only functioning part.
270 : let timeline = tenant
271 : .create_timeline_struct(
272 : timeline_id,
273 : local_metadata,
274 : None, // Ancestor is not needed for deletion.
275 : TimelineResources {
276 : remote_client,
277 : deletion_queue_client,
278 : timeline_get_throttle: tenant.timeline_get_throttle.clone(),
279 : },
280 : // Important. We don't pass the ancestor above because it can be missing.
281 : // Thus we need to skip the validation here.
282 : CreateTimelineCause::Delete,
283 : // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
284 : None,
285 : )
286 : .context("create_timeline_struct")?;
287 :
288 : let mut guard = DeletionGuard(
289 : Arc::clone(&timeline.delete_progress)
290 : .try_lock_owned()
291 : .expect("cannot happen because we're the only owner"),
292 : );
293 :
294 : // We need to do this because when the console retries the delete request we shouldn't answer with 404,
295 : // because 404 means successful deletion.
296 : {
297 : let mut locked = tenant.timelines.lock().unwrap();
298 : locked.insert(timeline_id, Arc::clone(&timeline));
299 : }
300 :
301 : guard.mark_in_progress()?;
302 :
303 : Self::schedule_background(guard, tenant.conf, tenant, timeline);
304 :
305 : Ok(())
306 : }
307 :
308 0 : #[instrument(skip_all, fields(%timeline_id))]
309 : pub async fn cleanup_remaining_timeline_fs_traces(
310 : tenant: &Tenant,
311 : timeline_id: TimelineId,
312 : ) -> anyhow::Result<()> {
313 : let r =
314 : cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
315 : .await;
316 : info!("Done");
317 : r
318 : }
319 :
320 0 : fn prepare(
321 0 : tenant: &Tenant,
322 0 : timeline_id: TimelineId,
323 0 : ) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
324 0 : // Note the interaction between this guard and the deletion guard.
325 0 : // Here we attempt to lock the deletion guard while we're holding a lock on timelines.
326 0 : // This is important because, when you take `remove_timeline_from_tenant` into account,
327 0 : // we remove the timeline from memory while we still hold the deletion guard.
328 0 : // So by the time timeline deletion is finished, the timeline won't be present in the timelines map at all,
329 0 : // which makes the following sequence impossible:
330 0 : // T1: get preempted right before the try_lock on `Timeline::delete_progress`
331 0 : // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
332 0 : // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
333 0 : // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
334 0 : let timelines = tenant.timelines.lock().unwrap();
335 :
336 0 : let timeline = match timelines.get(&timeline_id) {
337 0 : Some(t) => t,
338 0 : None => return Err(DeleteTimelineError::NotFound),
339 : };
340 :
341 : // Ensure that there are no child timelines **attached to that pageserver**,
342 : // because detach removes files, which will break child branches
343 0 : let children: Vec<TimelineId> = timelines
344 0 : .iter()
345 0 : .filter_map(|(id, entry)| {
346 0 : if entry.get_ancestor_timeline_id() == Some(timeline_id) {
347 0 : Some(*id)
348 : } else {
349 0 : None
350 : }
351 0 : })
352 0 : .collect();
353 0 :
354 0 : if !children.is_empty() {
355 0 : return Err(DeleteTimelineError::HasChildren(children));
356 0 : }
357 0 :
358 0 : // Note that using try_lock here is important to avoid a deadlock.
359 0 : // Here we take the lock on timelines and then the deletion guard.
360 0 : // At the end of the operation we're holding the guard and need to lock the timelines map
361 0 : // to remove the timeline from it.
362 0 : // Whenever two locks are taken in a different order, this can result in a deadlock.
363 0 :
364 0 : let delete_progress = Arc::clone(&timeline.delete_progress);
365 0 : let delete_lock_guard = match delete_progress.try_lock_owned() {
366 0 : Ok(guard) => DeletionGuard(guard),
367 : Err(_) => {
368 : // Unfortunately, if the lock attempt fails, the Arc is consumed.
369 0 : return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
370 0 : &timeline.delete_progress,
371 0 : )));
372 : }
373 : };
374 :
375 0 : timeline.set_state(TimelineState::Stopping);
376 0 :
377 0 : Ok((Arc::clone(timeline), delete_lock_guard))
378 0 : }
379 :
380 0 : fn schedule_background(
381 0 : guard: DeletionGuard,
382 0 : conf: &'static PageServerConf,
383 0 : tenant: Arc<Tenant>,
384 0 : timeline: Arc<Timeline>,
385 0 : ) {
386 0 : let tenant_shard_id = timeline.tenant_shard_id;
387 0 : let timeline_id = timeline.timeline_id;
388 0 :
389 0 : task_mgr::spawn(
390 0 : task_mgr::BACKGROUND_RUNTIME.handle(),
391 0 : TaskKind::TimelineDeletionWorker,
392 0 : Some(tenant_shard_id),
393 0 : Some(timeline_id),
394 0 : "timeline_delete",
395 : false,
396 0 : async move {
397 0 : if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
398 0 : error!("Error: {err:#}");
399 0 : timeline.set_broken(format!("{err:#}"))
400 0 : };
401 0 : Ok(())
402 0 : }
403 0 : .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)),
404 : );
405 0 : }
406 :
407 0 : async fn background(
408 0 : mut guard: DeletionGuard,
409 0 : conf: &PageServerConf,
410 0 : tenant: &Tenant,
411 0 : timeline: &Timeline,
412 0 : ) -> Result<(), DeleteTimelineError> {
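// The long-running part of deletion: remove the local timeline directory, then the
// remote layers and index, and finally drop the in-memory entry while the deletion
// guard is still held (see `remove_timeline_from_tenant`).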
413 0 : delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
414 :
415 0 : delete_remote_layers_and_index(timeline).await?;
416 :
417 : pausable_failpoint!("in_progress_delete");
418 :
419 0 : remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
420 :
421 0 : *guard = Self::Finished;
422 0 :
423 0 : Ok(())
424 0 : }
425 :
426 0 : pub(crate) fn is_finished(&self) -> bool {
427 0 : matches!(self, Self::Finished)
428 0 : }
429 :
430 0 : pub(crate) fn is_not_started(&self) -> bool {
431 0 : matches!(self, Self::NotStarted)
432 0 : }
433 : }
434 :
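/// Owned guard over `Timeline::delete_progress`. Holding it marks the current task as
/// the sole driver of the deletion; `remove_timeline_from_tenant` takes it as a witness.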
435 : struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
436 :
437 : impl Deref for DeletionGuard {
438 : type Target = DeleteTimelineFlow;
439 :
440 0 : fn deref(&self) -> &Self::Target {
441 0 : &self.0
442 0 : }
443 : }
444 :
445 : impl DerefMut for DeletionGuard {
446 0 : fn deref_mut(&mut self) -> &mut Self::Target {
447 0 : &mut self.0
448 0 : }
449 : }