Line data Source code
1 : use std::sync::Arc;
2 :
3 : use anyhow::Context;
4 : use camino::{Utf8Path, Utf8PathBuf};
5 : use pageserver_api::{models::TenantState, shard::TenantShardId};
6 : use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
7 : use tokio::sync::OwnedMutexGuard;
8 : use tokio_util::sync::CancellationToken;
9 : use tracing::{error, instrument, Instrument};
10 :
11 : use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
12 :
13 : use crate::{
14 : config::PageServerConf,
15 : context::RequestContext,
16 : task_mgr::{self, TaskKind},
17 : tenant::{
18 : mgr::{TenantSlot, TenantsMapRemoveResult},
19 : remote_timeline_client::remote_heatmap_path,
20 : timeline::ShutdownMode,
21 : },
22 : };
23 :
24 : use super::{
25 : mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
26 : remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
27 : span,
28 : timeline::delete::DeleteTimelineFlow,
29 : tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
30 : };
31 :
32 0 : #[derive(Debug, thiserror::Error)]
33 : pub(crate) enum DeleteTenantError {
34 : #[error("GetTenant {0}")]
35 : Get(#[from] GetTenantError),
36 :
37 : #[error("Tenant not attached")]
38 : NotAttached,
39 :
40 : #[error("Invalid state {0}. Expected Active or Broken")]
41 : InvalidState(TenantState),
42 :
43 : #[error("Tenant deletion is already in progress")]
44 : AlreadyInProgress,
45 :
46 : #[error("Tenant map slot error {0}")]
47 : SlotError(#[from] TenantSlotError),
48 :
49 : #[error("Tenant map slot upsert error {0}")]
50 : SlotUpsertError(#[from] TenantSlotUpsertError),
51 :
52 : #[error("Timeline {0}")]
53 : Timeline(#[from] DeleteTimelineError),
54 :
55 : #[error("Cancelled")]
56 : Cancelled,
57 :
58 : #[error(transparent)]
59 : Other(#[from] anyhow::Error),
60 : }
61 :
62 : type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
63 :
64 0 : fn remote_tenant_delete_mark_path(
65 0 : conf: &PageServerConf,
66 0 : tenant_shard_id: &TenantShardId,
67 0 : ) -> anyhow::Result<RemotePath> {
68 0 : let tenant_remote_path = conf
69 0 : .tenant_path(tenant_shard_id)
70 0 : .strip_prefix(&conf.workdir)
71 0 : .context("Failed to strip workdir prefix")
72 0 : .and_then(RemotePath::new)
73 0 : .context("tenant path")?;
74 0 : Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
75 0 : }
76 :
77 0 : async fn create_remote_delete_mark(
78 0 : conf: &PageServerConf,
79 0 : remote_storage: &GenericRemoteStorage,
80 0 : tenant_shard_id: &TenantShardId,
81 0 : cancel: &CancellationToken,
82 0 : ) -> Result<(), DeleteTenantError> {
83 0 : let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
84 :
85 0 : let data: &[u8] = &[];
86 0 : backoff::retry(
87 0 : || async {
88 0 : let data = bytes::Bytes::from_static(data);
89 0 : let stream = futures::stream::once(futures::future::ready(Ok(data)));
90 0 : remote_storage
91 0 : .upload(stream, 0, &remote_mark_path, None, cancel)
92 0 : .await
93 0 : },
94 0 : TimeoutOrCancel::caused_by_cancel,
95 0 : FAILED_UPLOAD_WARN_THRESHOLD,
96 0 : FAILED_REMOTE_OP_RETRIES,
97 0 : "mark_upload",
98 0 : cancel,
99 0 : )
100 0 : .await
101 0 : .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
102 0 : .and_then(|x| x)
103 0 : .context("mark_upload")?;
104 :
105 0 : Ok(())
106 0 : }
107 :
108 0 : async fn create_local_delete_mark(
109 0 : conf: &PageServerConf,
110 0 : tenant_shard_id: &TenantShardId,
111 0 : ) -> Result<(), DeleteTenantError> {
112 0 : let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
113 0 :
114 0 : // Note: we're ok to replace existing file.
115 0 : let _ = std::fs::OpenOptions::new()
116 0 : .write(true)
117 0 : .create(true)
118 0 : .truncate(true)
119 0 : .open(&marker_path)
120 0 : .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
121 :
122 0 : crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
123 :
124 0 : Ok(())
125 0 : }
126 :
127 0 : async fn schedule_ordered_timeline_deletions(
128 0 : tenant: &Arc<Tenant>,
129 0 : ) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
130 0 : // Tenant is stopping at this point. We know it will be deleted.
131 0 : // No new timelines should be created.
132 0 : // Tree sort timelines to delete from leafs to the root.
133 0 : // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
134 0 : // can complete and remove timeline from the map in between our call to clone
135 0 : // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
136 0 : // timelines.lock is currently synchronous so we cant hold it across await point.
137 0 : // So just ignore NotFound error if we get it from `run`.
138 0 : // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
139 0 : let timelines = tenant.timelines.lock().unwrap().clone();
140 0 : let sorted =
141 0 : tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
142 :
143 0 : let mut already_running_deletions = vec![];
144 :
145 0 : for (timeline_id, _) in sorted.into_iter().rev() {
146 0 : let span = tracing::info_span!("timeline_delete", %timeline_id);
147 0 : let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
148 0 : .instrument(span)
149 0 : .await;
150 0 : if let Err(e) = res {
151 0 : match e {
152 : DeleteTimelineError::NotFound => {
153 : // Timeline deletion finished after call to clone above but before call
154 : // to `DeleteTimelineFlow::run` and removed timeline from the map.
155 0 : continue;
156 : }
157 0 : DeleteTimelineError::AlreadyInProgress(guard) => {
158 0 : already_running_deletions.push((guard, timeline_id));
159 0 : continue;
160 : }
161 0 : e => return Err(DeleteTenantError::Timeline(e)),
162 : }
163 0 : }
164 : }
165 :
166 0 : Ok(already_running_deletions)
167 0 : }
168 :
169 0 : async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
170 0 : // Assert timelines dir is empty.
171 0 : if !fs_ext::is_directory_empty(timelines_path).await? {
172 : // Display first 10 items in directory
173 0 : let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
174 0 : let list = &list.into_iter().take(10).collect::<Vec<_>>();
175 0 : return Err(DeleteTenantError::Other(anyhow::anyhow!(
176 0 : "Timelines directory is not empty after all timelines deletion: {list:?}"
177 0 : )));
178 0 : }
179 0 :
180 0 : Ok(())
181 0 : }
182 :
183 0 : async fn remove_tenant_remote_delete_mark(
184 0 : conf: &PageServerConf,
185 0 : remote_storage: &GenericRemoteStorage,
186 0 : tenant_shard_id: &TenantShardId,
187 0 : cancel: &CancellationToken,
188 0 : ) -> Result<(), DeleteTenantError> {
189 0 : let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
190 0 : backoff::retry(
191 0 : || async { remote_storage.delete(&path, cancel).await },
192 0 : TimeoutOrCancel::caused_by_cancel,
193 0 : FAILED_UPLOAD_WARN_THRESHOLD,
194 0 : FAILED_REMOTE_OP_RETRIES,
195 0 : "remove_tenant_remote_delete_mark",
196 0 : cancel,
197 0 : )
198 0 : .await
199 0 : .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
200 0 : .and_then(|x| x)
201 0 : .context("remove_tenant_remote_delete_mark")?;
202 0 : Ok(())
203 0 : }
204 :
205 : // Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
206 0 : async fn cleanup_remaining_fs_traces(
207 0 : conf: &PageServerConf,
208 0 : tenant_shard_id: &TenantShardId,
209 0 : ) -> Result<(), DeleteTenantError> {
210 0 : let rm = |p: Utf8PathBuf, is_dir: bool| async move {
211 0 : if is_dir {
212 0 : tokio::fs::remove_dir(&p).await
213 0 : } else {
214 0 : tokio::fs::remove_file(&p).await
215 0 : }
216 0 : .or_else(fs_ext::ignore_not_found)
217 0 : .with_context(|| format!("failed to delete {p}"))
218 0 : };
219 :
220 0 : rm(conf.tenant_config_path(tenant_shard_id), false).await?;
221 0 : rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
222 :
223 0 : fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
224 0 : Err(anyhow::anyhow!(
225 0 : "failpoint: tenant-delete-before-remove-timelines-dir"
226 0 : ))?
227 0 : });
228 :
229 0 : rm(conf.timelines_path(tenant_shard_id), true).await?;
230 :
231 0 : fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
232 0 : Err(anyhow::anyhow!(
233 0 : "failpoint: tenant-delete-before-remove-deleted-mark"
234 0 : ))?
235 0 : });
236 :
237 : // Make sure previous deletions are ordered before mark removal.
238 : // Otherwise there is no guarantee that they reach the disk before mark deletion.
239 : // So its possible for mark to reach disk first and for other deletions
240 : // to be reordered later and thus missed if a crash occurs.
241 : // Note that we dont need to sync after mark file is removed
242 : // because we can tolerate the case when mark file reappears on startup.
243 0 : let tenant_path = &conf.tenant_path(tenant_shard_id);
244 0 : if tenant_path.exists() {
245 0 : crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
246 0 : .await
247 0 : .context("fsync_pre_mark_remove")?;
248 0 : }
249 :
250 0 : rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
251 :
252 0 : rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
253 :
254 0 : fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
255 0 : Err(anyhow::anyhow!(
256 0 : "failpoint: tenant-delete-before-remove-tenant-dir"
257 0 : ))?
258 0 : });
259 :
260 0 : rm(conf.tenant_path(tenant_shard_id), true).await?;
261 :
262 0 : Ok(())
263 0 : }
264 :
/// Orchestrates shutting a tenant down, removing its in-memory structures, and deleting
/// its data from both local disk and remote storage.
///
/// The sequence of steps, as implemented by the methods on this type:
/// 1. Check the tenant state, take the deletion lock and shut the tenant down.
/// 2. Upload the remote deletion mark.
/// 3. Create the local mark file.
/// 4. Run ordered timeline deletions.
/// 5. Wait for timeline deletions that were already running when tenant deletion was
///    requested.
/// 6. Remove the remote heatmap (failures are only logged) and check that the timelines
///    directory is empty.
/// 7. Remove the remote mark.
/// 8. Clean up remaining fs traces: configs, timelines dir, local delete mark, heatmap,
///    tenant dir.
/// 9. Remove the tenant's slot from the tenants map.
///
/// It is meant to be resumable from any step in case a crash/restart occurs.
///
/// There are two entrypoints to the process:
/// 1. [`DeleteTenantFlow::run`] is the main one, called by a management api handler.
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when the tenant is found to be
///    deleted during the attach process and deletion has to be resumed.
///
/// Note the only other place that messes around timeline delete mark is the
/// `Tenant::spawn_load` function.
#[derive(Default)]
pub enum DeleteTenantFlow {
    /// No deletion has been started yet.
    #[default]
    NotStarted,
    /// A deletion has been started (or is being retried).
    InProgress,
    /// The deletion ran to completion.
    Finished,
}
287 :
288 : impl DeleteTenantFlow {
289 : // These steps are run in the context of management api request handler.
290 : // Long running steps are continued to run in the background.
291 : // NB: If this fails half-way through, and is retried, the retry will go through
292 : // all the same steps again. Make sure the code here is idempotent, and don't
293 : // error out if some of the shutdown tasks have already been completed!
294 : // NOTE: static needed for background part.
295 : // We assume that calling code sets up the span with tenant_id.
296 0 : #[instrument(skip_all)]
297 : pub(crate) async fn run(
298 : conf: &'static PageServerConf,
299 : remote_storage: GenericRemoteStorage,
300 : tenants: &'static std::sync::RwLock<TenantsMap>,
301 : tenant: Arc<Tenant>,
302 : cancel: &CancellationToken,
303 : ) -> Result<(), DeleteTenantError> {
304 : span::debug_assert_current_span_has_tenant_id();
305 :
306 : pausable_failpoint!("tenant-delete-before-run");
307 :
308 : let mut guard = Self::prepare(&tenant).await?;
309 :
310 : if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
311 : tenant.set_broken(format!("{e:#}")).await;
312 : return Err(e);
313 : }
314 :
315 : Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
316 :
317 : Ok(())
318 : }
319 :
320 : // Helper function needed to be able to match once on returned error and transition tenant into broken state.
321 : // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
322 : // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
323 : // So the solution is to set tenant state to broken.
324 0 : async fn run_inner(
325 0 : guard: &mut OwnedMutexGuard<Self>,
326 0 : conf: &'static PageServerConf,
327 0 : remote_storage: &GenericRemoteStorage,
328 0 : tenant: &Tenant,
329 0 : cancel: &CancellationToken,
330 0 : ) -> Result<(), DeleteTenantError> {
331 0 : guard.mark_in_progress()?;
332 :
333 0 : fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
334 0 : Err(anyhow::anyhow!(
335 0 : "failpoint: tenant-delete-before-create-remote-mark"
336 0 : ))?
337 0 : });
338 :
339 0 : create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
340 0 : .await
341 0 : .context("remote_mark")?;
342 :
343 0 : fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
344 0 : Err(anyhow::anyhow!(
345 0 : "failpoint: tenant-delete-before-create-local-mark"
346 0 : ))?
347 0 : });
348 :
349 0 : create_local_delete_mark(conf, &tenant.tenant_shard_id)
350 0 : .await
351 0 : .context("local delete mark")?;
352 :
353 0 : fail::fail_point!("tenant-delete-before-background", |_| {
354 0 : Err(anyhow::anyhow!(
355 0 : "failpoint: tenant-delete-before-background"
356 0 : ))?
357 0 : });
358 :
359 0 : Ok(())
360 0 : }
361 :
362 0 : fn mark_in_progress(&mut self) -> anyhow::Result<()> {
363 0 : match self {
364 0 : Self::Finished => anyhow::bail!("Bug. Is in finished state"),
365 0 : Self::InProgress { .. } => { /* We're in a retry */ }
366 0 : Self::NotStarted => { /* Fresh start */ }
367 : }
368 :
369 0 : *self = Self::InProgress;
370 0 :
371 0 : Ok(())
372 0 : }
373 :
374 0 : pub(crate) async fn should_resume_deletion(
375 0 : conf: &'static PageServerConf,
376 0 : remote_mark_exists: bool,
377 0 : tenant: &Tenant,
378 0 : ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
379 0 : let acquire = |t: &Tenant| {
380 0 : Some(
381 0 : Arc::clone(&t.delete_progress)
382 0 : .try_lock_owned()
383 0 : .expect("we're the only owner during init"),
384 0 : )
385 0 : };
386 :
387 0 : if remote_mark_exists {
388 0 : return Ok(acquire(tenant));
389 0 : }
390 0 :
391 0 : // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
392 0 : if conf
393 0 : .tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
394 0 : .exists()
395 : {
396 0 : Ok(acquire(tenant))
397 : } else {
398 0 : Ok(None)
399 : }
400 0 : }
401 :
402 0 : pub(crate) async fn resume_from_attach(
403 0 : guard: DeletionGuard,
404 0 : tenant: &Arc<Tenant>,
405 0 : preload: Option<TenantPreload>,
406 0 : tenants: &'static std::sync::RwLock<TenantsMap>,
407 0 : ctx: &RequestContext,
408 0 : ) -> Result<(), DeleteTenantError> {
409 0 : let (_, progress) = completion::channel();
410 0 :
411 0 : tenant
412 0 : .set_stopping(progress, false, true)
413 0 : .await
414 0 : .expect("cant be stopping or broken");
415 0 :
416 0 : tenant
417 0 : .attach(preload, super::SpawnMode::Eager, ctx)
418 0 : .await
419 0 : .context("attach")?;
420 :
421 0 : Self::background(
422 0 : guard,
423 0 : tenant.conf,
424 0 : tenant.remote_storage.clone(),
425 0 : tenants,
426 0 : tenant,
427 0 : )
428 0 : .await
429 0 : }
430 :
431 : /// Check whether background deletion of this tenant is currently in progress
432 0 : pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
433 0 : tenant.delete_progress.try_lock().is_err()
434 0 : }
435 :
436 0 : async fn prepare(
437 0 : tenant: &Arc<Tenant>,
438 0 : ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
439 : // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
440 : // so at least for now allow deletions only for active tenants. TODO recheck
441 : // Broken and Stopping is needed for retries.
442 0 : if !matches!(
443 0 : tenant.current_state(),
444 : TenantState::Active | TenantState::Broken { .. }
445 : ) {
446 0 : return Err(DeleteTenantError::InvalidState(tenant.current_state()));
447 0 : }
448 :
449 0 : let guard = Arc::clone(&tenant.delete_progress)
450 0 : .try_lock_owned()
451 0 : .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
452 :
453 0 : fail::fail_point!("tenant-delete-before-shutdown", |_| {
454 0 : Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
455 0 : });
456 :
457 : // make pageserver shutdown not to wait for our completion
458 0 : let (_, progress) = completion::channel();
459 0 :
460 0 : // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
461 0 : // i e it is an error to do:
462 0 : // tenant.set_stopping
463 0 : // tenant.shutdown
464 0 : // Its also bad that we're holding tenants.read here.
465 0 : // TODO relax set_stopping to be idempotent?
466 0 : if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
467 0 : return Err(DeleteTenantError::Other(anyhow::anyhow!(
468 0 : "tenant shutdown is already in progress"
469 0 : )));
470 0 : }
471 0 :
472 0 : Ok(guard)
473 0 : }
474 :
475 0 : fn schedule_background(
476 0 : guard: OwnedMutexGuard<Self>,
477 0 : conf: &'static PageServerConf,
478 0 : remote_storage: GenericRemoteStorage,
479 0 : tenants: &'static std::sync::RwLock<TenantsMap>,
480 0 : tenant: Arc<Tenant>,
481 0 : ) {
482 0 : let tenant_shard_id = tenant.tenant_shard_id;
483 0 :
484 0 : task_mgr::spawn(
485 0 : task_mgr::BACKGROUND_RUNTIME.handle(),
486 0 : TaskKind::TimelineDeletionWorker,
487 0 : Some(tenant_shard_id),
488 0 : None,
489 0 : "tenant_delete",
490 : false,
491 0 : async move {
492 0 : if let Err(err) =
493 0 : Self::background(guard, conf, remote_storage, tenants, &tenant).await
494 : {
495 0 : error!("Error: {err:#}");
496 0 : tenant.set_broken(format!("{err:#}")).await;
497 0 : };
498 0 : Ok(())
499 0 : }
500 0 : .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
501 : );
502 0 : }
503 :
504 0 : async fn background(
505 0 : mut guard: OwnedMutexGuard<Self>,
506 0 : conf: &PageServerConf,
507 0 : remote_storage: GenericRemoteStorage,
508 0 : tenants: &'static std::sync::RwLock<TenantsMap>,
509 0 : tenant: &Arc<Tenant>,
510 0 : ) -> Result<(), DeleteTenantError> {
511 : // Tree sort timelines, schedule delete for them. Mention retries from the console side.
512 : // Note that if deletion fails we dont mark timelines as broken,
513 : // the whole tenant will become broken as by `Self::schedule_background` logic
514 0 : let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
515 0 : .await
516 0 : .context("schedule_ordered_timeline_deletions")?;
517 :
518 0 : fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
519 0 : Err(anyhow::anyhow!(
520 0 : "failpoint: tenant-delete-before-polling-ongoing-deletions"
521 0 : ))?
522 0 : });
523 :
524 : // Wait for deletions that were already running at the moment when tenant deletion was requested.
525 : // When we can lock deletion guard it means that corresponding timeline deletion finished.
526 0 : for (guard, timeline_id) in already_running_timeline_deletions {
527 0 : let flow = guard.lock().await;
528 0 : if !flow.is_finished() {
529 0 : return Err(DeleteTenantError::Other(anyhow::anyhow!(
530 0 : "already running timeline deletion failed: {timeline_id}"
531 0 : )));
532 0 : }
533 : }
534 :
535 : // Remove top-level tenant objects that don't belong to a timeline, such as heatmap
536 0 : let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
537 0 : if let Some(Err(e)) = backoff::retry(
538 0 : || async {
539 0 : remote_storage
540 0 : .delete(&heatmap_path, &task_mgr::shutdown_token())
541 0 : .await
542 0 : },
543 0 : TimeoutOrCancel::caused_by_cancel,
544 0 : FAILED_UPLOAD_WARN_THRESHOLD,
545 0 : FAILED_REMOTE_OP_RETRIES,
546 0 : "remove_remote_tenant_heatmap",
547 0 : &task_mgr::shutdown_token(),
548 0 : )
549 0 : .await
550 : {
551 0 : tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
552 0 : }
553 :
554 0 : let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
555 0 : // May not exist if we fail in cleanup_remaining_fs_traces after removing it
556 0 : if timelines_path.exists() {
557 : // sanity check to guard against layout changes
558 0 : ensure_timelines_dir_empty(&timelines_path)
559 0 : .await
560 0 : .context("timelines dir not empty")?;
561 0 : }
562 :
563 0 : remove_tenant_remote_delete_mark(
564 0 : conf,
565 0 : &remote_storage,
566 0 : &tenant.tenant_shard_id,
567 0 : &task_mgr::shutdown_token(),
568 0 : )
569 0 : .await?;
570 :
571 : pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
572 0 : fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
573 0 : Err(anyhow::anyhow!(
574 0 : "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
575 0 : ))?
576 0 : });
577 :
578 0 : cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
579 0 : .await
580 0 : .context("cleanup_remaining_fs_traces")?;
581 :
582 : {
583 : pausable_failpoint!("tenant-delete-before-map-remove");
584 :
585 : // This block is simply removing the TenantSlot for this tenant. It requires a loop because
586 : // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
587 : //
588 : // This complexity will go away when we simplify how deletion works:
589 : // https://github.com/neondatabase/neon/issues/5080
590 : loop {
591 : // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
592 : // we encounter an InProgress marker, yield the barrier it contains and wait on it.
593 0 : let barrier = {
594 0 : let mut locked = tenants.write().unwrap();
595 0 : let removed = locked.remove(tenant.tenant_shard_id);
596 0 :
597 0 : // FIXME: we should not be modifying this from outside of mgr.rs.
598 0 : // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
599 0 :
600 0 : // Update stats
601 0 : match &removed {
602 0 : TenantsMapRemoveResult::Occupied(slot) => {
603 0 : crate::metrics::TENANT_MANAGER.slot_removed(slot);
604 0 : }
605 0 : TenantsMapRemoveResult::InProgress(barrier) => {
606 0 : crate::metrics::TENANT_MANAGER
607 0 : .slot_removed(&TenantSlot::InProgress(barrier.clone()));
608 0 : }
609 0 : TenantsMapRemoveResult::Vacant => {
610 0 : // Nothing changed in map, no metric update
611 0 : }
612 : }
613 :
614 0 : match removed {
615 0 : TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
616 0 : match tenant.current_state() {
617 0 : TenantState::Stopping { .. } | TenantState::Broken { .. } => {
618 0 : // Expected: we put the tenant into stopping state before we start deleting it
619 0 : }
620 0 : state => {
621 0 : // Unexpected state
622 0 : tracing::warn!(
623 0 : "Tenant in unexpected state {state} after deletion"
624 : );
625 : }
626 : }
627 0 : break;
628 : }
629 : TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
630 : // This is unexpected: this secondary tenants should not have been created, and we
631 : // are not in a position to shut it down from here.
632 0 : tracing::warn!("Tenant transitioned to secondary mode while deleting!");
633 0 : break;
634 : }
635 : TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
636 0 : unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
637 : }
638 : TenantsMapRemoveResult::Vacant => {
639 0 : tracing::warn!(
640 0 : "Tenant removed from TenantsMap before deletion completed"
641 : );
642 0 : break;
643 : }
644 0 : TenantsMapRemoveResult::InProgress(barrier) => {
645 0 : // An InProgress entry was found, we must wait on its barrier
646 0 : barrier
647 0 : }
648 0 : }
649 0 : };
650 0 :
651 0 : tracing::info!(
652 0 : "Waiting for competing operation to complete before deleting state for tenant"
653 : );
654 0 : barrier.wait().await;
655 : }
656 : }
657 :
658 0 : *guard = Self::Finished;
659 0 :
660 0 : Ok(())
661 0 : }
662 : }
|