use std::sync::Arc;

use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::{models::TenantState, shard::TenantShardId};
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, Instrument, Span};

use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};

use crate::{
    config::PageServerConf,
    context::RequestContext,
    task_mgr::{self, TaskKind},
    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
};

use super::{
    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
};

#[derive(Debug, thiserror::Error)]
pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

    #[error("Tenant not attached")]
    NotAttached,

    #[error("Invalid state {0}. Expected Active or Broken")]
    InvalidState(TenantState),

    #[error("Tenant deletion is already in progress")]
    AlreadyInProgress,

    #[error("Tenant map slot error {0}")]
    SlotError(#[from] TenantSlotError),

    #[error("Tenant map slot upsert error {0}")]
    SlotUpsertError(#[from] TenantSlotUpsertError),

    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),

    #[error("Cancelled")]
    Cancelled,

    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;

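/// Path of the tenant deletion mark in remote storage: the tenant's remote
/// prefix (its local path relative to the workdir) joined with
/// `timelines/deleted`.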
fn remote_tenant_delete_mark_path(
    conf: &PageServerConf,
    tenant_shard_id: &TenantShardId,
) -> anyhow::Result<RemotePath> {
    let tenant_remote_path = conf
        .tenant_path(tenant_shard_id)
        .strip_prefix(&conf.workdir)
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
        .context("tenant path")?;
    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
}

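/// Upload an empty object at the remote deletion mark path, retrying with
/// backoff. The remote mark is what makes deletion resumable after a restart
/// or reattach (see [`DeleteTenantFlow::should_resume_deletion`]).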
async fn create_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;

    let data: &[u8] = &[];
    backoff::retry(
        || async {
            let data = bytes::Bytes::from_static(data);
            let stream = futures::stream::once(futures::future::ready(Ok(data)));
            remote_storage
                .upload(stream, 0, &remote_mark_path, None)
                .await
        },
        |_e| false,
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
    )
    .await
    .context("mark_upload")?;

    Ok(())
}

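/// Create (or truncate) the local deletion mark file, then fsync both the file
/// and its parent directory so the mark survives a crash.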
async fn create_local_delete_mark(
    conf: &PageServerConf,
    tenant_shard_id: &TenantShardId,
) -> Result<(), DeleteTenantError> {
    let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);

    // Note: we're ok to replace an existing file.
    let _ = std::fs::OpenOptions::new()
        .write(true)
        .create(true)
        .open(&marker_path)
        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;

    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;

    Ok(())
}

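/// Schedule deletion for every timeline of the tenant, children before
/// ancestors. Returns the guards of timeline deletions that were already
/// running when tenant deletion started, so the caller can wait for them.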
async fn schedule_ordered_timeline_deletions(
    tenant: &Arc<Tenant>,
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
    // The tenant is stopping at this point. We know it will be deleted,
    // and no new timelines can be created.
    // Tree-sort the timelines so that we delete from the leaves towards the root.
    // NOTE: by calling clone we release the mutex, which creates a possibility for a race: a pending
    // deletion can complete and remove a timeline from the map between our call to clone
    // and `DeleteTimelineFlow::run`, so `run` won't find the timeline in the `timelines` map.
    // The timelines lock is currently synchronous, so we can't hold it across an await point.
    // So just ignore a NotFound error if we get it from `run`.
    // Beware: if the lock becomes async and we try to hold it here, `run` also locks it, which can deadlock.
    let timelines = tenant.timelines.lock().unwrap().clone();
    let sorted =
        tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;

    let mut already_running_deletions = vec![];

    for (timeline_id, _) in sorted.into_iter().rev() {
        if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
            match e {
                DeleteTimelineError::NotFound => {
                    // The timeline deletion finished after the clone above but before the call
                    // to `DeleteTimelineFlow::run`, and removed the timeline from the map.
                    continue;
                }
                DeleteTimelineError::AlreadyInProgress(guard) => {
                    already_running_deletions.push((guard, timeline_id));
                    continue;
                }
                e => return Err(DeleteTenantError::Timeline(e)),
            }
        }
    }

    Ok(already_running_deletions)
}

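/// Sanity check before the timelines directory is removed: after all timeline
/// deletions have completed it must be empty.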
async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
    // Assert timelines dir is empty.
    if !fs_ext::is_directory_empty(timelines_path).await? {
        // Display first 10 items in directory
        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
        let list = &list.into_iter().take(10).collect::<Vec<_>>();
        return Err(DeleteTenantError::Other(anyhow::anyhow!(
            "Timelines directory is not empty after all timelines deletion: {list:?}"
        )));
    }

    Ok(())
}

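/// Delete the remote deletion mark, retrying with backoff. A no-op when
/// remote storage is not configured.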
async fn remove_tenant_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: Option<&GenericRemoteStorage>,
    tenant_shard_id: &TenantShardId,
    cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
    if let Some(remote_storage) = remote_storage {
        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
        backoff::retry(
            || async { remote_storage.delete(&path).await },
            |_e| false,
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
        .await
        .context("remove_tenant_remote_delete_mark")?;
    }
    Ok(())
}

// Clean up fs traces: tenant config, timelines dir, local delete mark, tenant dir
async fn cleanup_remaining_fs_traces(
    conf: &PageServerConf,
    tenant_shard_id: &TenantShardId,
) -> Result<(), DeleteTenantError> {
    let rm = |p: Utf8PathBuf, is_dir: bool| async move {
        if is_dir {
            tokio::fs::remove_dir(&p).await
        } else {
            tokio::fs::remove_file(&p).await
        }
        .or_else(fs_ext::ignore_not_found)
        .with_context(|| format!("failed to delete {p}"))
    };

    rm(conf.tenant_config_path(tenant_shard_id), false).await?;
    rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;

    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-remove-timelines-dir"
        ))?
    });

    rm(conf.timelines_path(tenant_shard_id), true).await?;

    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-remove-deleted-mark"
        ))?
    });

    // Make sure the previous deletions are ordered before the mark removal.
    // Otherwise there is no guarantee that they reach the disk before the mark is deleted.
    // It would then be possible for the mark to reach the disk first, with the other deletions
    // reordered after it and thus missed if a crash occurs.
    // Note that we don't need to sync after the mark file is removed,
    // because we can tolerate the mark file reappearing on startup.
    let tenant_path = &conf.tenant_path(tenant_shard_id);
    if tenant_path.exists() {
        crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
            .await
            .context("fsync_pre_mark_remove")?;
    }

    rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;

    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-remove-tenant-dir"
        ))?
    });

    rm(conf.tenant_path(tenant_shard_id), true).await?;

    Ok(())
}

/// Orchestrates the shutdown of all tenant tasks, removes the tenant's in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
/// 1. Upload the remote deletion mark.
/// 2. Create the local mark file.
/// 3. Shut down tasks.
/// 4. Run ordered timeline deletions.
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested.
/// 6. Remove the remote mark.
/// 7. Clean up remaining fs traces: tenant dir, config, timelines dir, local delete mark.
/// It is resumable from any step in case a crash/restart occurs.
/// There are two entrypoints to the process:
/// 1. [`DeleteTenantFlow::run`] is the main one, called by a management api handler.
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed because the
///    tenant was found to be deleted during the attach process.
/// Note that the only other place that messes around with the timeline delete mark is the `Tenant::spawn_load` function.
#[derive(Default)]
pub enum DeleteTenantFlow {
    #[default]
    NotStarted,
    InProgress,
    Finished,
}

impl DeleteTenantFlow {
    // These steps are run in the context of a management api request handler.
    // Long-running steps continue to run in the background.
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
    // NOTE: 'static is needed for the background part.
    // We assume that the calling code sets up the span with the tenant_id.
    #[instrument(skip_all)]
    pub(crate) async fn run(
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();

        pausable_failpoint!("tenant-delete-before-run");

        let mut guard = Self::prepare(&tenant).await?;

        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
            tenant.set_broken(format!("{e:#}")).await;
            return Err(e);
        }

        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);

        Ok(())
    }

    // Helper function needed to be able to match once on the returned error and transition the
    // tenant into the broken state. This is needed because `tenant.shutdown` is not idempotent:
    // if the tenant state is already Stopping, another call to `tenant.shutdown` will result in
    // an error, but here we need to be able to retry shutdown when tenant deletion is retried.
    // So the solution is to set the tenant state to broken.
    async fn run_inner(
        guard: &mut OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
    ) -> Result<(), DeleteTenantError> {
        guard.mark_in_progress()?;

        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
            Err(anyhow::anyhow!(
                "failpoint: tenant-delete-before-create-remote-mark"
            ))?
        });

        // IDEA: implement detach as delete without remote storage. Then they would use the same
        // lock (deletion_progress), so they wouldn't contend. Though it sounds scary; use a
        // different mark name?
        // Detach currently uses remove_dir_all, so in case of a crash we can end up in a weird state.
        if let Some(remote_storage) = &remote_storage {
            create_remote_delete_mark(
                conf,
                remote_storage,
                &tenant.tenant_shard_id,
                // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
                &CancellationToken::new(),
            )
            .await
            .context("remote_mark")?
        }

        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
            Err(anyhow::anyhow!(
                "failpoint: tenant-delete-before-create-local-mark"
            ))?
        });

        create_local_delete_mark(conf, &tenant.tenant_shard_id)
            .await
            .context("local delete mark")?;

        fail::fail_point!("tenant-delete-before-background", |_| {
            Err(anyhow::anyhow!(
                "failpoint: tenant-delete-before-background"
            ))?
        });

        Ok(())
    }

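    /// Move the flow into `InProgress`, from either `NotStarted` (fresh start) or
    /// `InProgress` (retry). Reaching this in the `Finished` state is a bug.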
    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
        match self {
            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
            Self::InProgress { .. } => { /* We're in a retry */ }
            Self::NotStarted => { /* Fresh start */ }
        }

        *self = Self::InProgress;

        Ok(())
    }

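    /// Decide during startup whether a tenant deletion must be resumed. If either the remote
    /// or the local deletion mark exists, acquire and return the deletion guard.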
    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_mark_exists: bool,
        tenant: &Tenant,
    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
        let acquire = |t: &Tenant| {
            Some(
                Arc::clone(&t.delete_progress)
                    .try_lock_owned()
                    .expect("we're the only owner during init"),
            )
        };

        if remote_mark_exists {
            return Ok(acquire(tenant));
        }

        // Check the local mark first; if it's there, there is no need to go to s3 to check
        // whether the remote one exists.
        if conf
            .tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
            .exists()
        {
            Ok(acquire(tenant))
        } else {
            Ok(None)
        }
    }

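    /// Resume deletion for a tenant that was found to be deleted during attach: put the tenant
    /// into the stopping state, attach it so that its timelines can be deleted, then run the
    /// background part of the flow to completion.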
    pub(crate) async fn resume_from_attach(
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();

        tenant
            .set_stopping(progress, false, true)
            .await
            .expect("cant be stopping or broken");

        tenant.attach(preload, ctx).await.context("attach")?;

        Self::background(
            guard,
            tenant.conf,
            tenant.remote_storage.clone(),
            tenants,
            tenant,
        )
        .await
    }

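    /// First phase of deletion: check that the tenant is in a deletable state, take the
    /// deletion lock, and shut the tenant down.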
    async fn prepare(
        tenant: &Arc<Tenant>,
    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
        // so at least for now allow deletions only for active tenants. TODO recheck
        // Broken and Stopping is needed for retries.
        if !matches!(
            tenant.current_state(),
            TenantState::Active | TenantState::Broken { .. }
        ) {
            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
        }

        let guard = Arc::clone(&tenant.delete_progress)
            .try_lock_owned()
            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;

        fail::fail_point!("tenant-delete-before-shutdown", |_| {
            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
        });

        // make pageserver shutdown not wait for our completion
        let (_, progress) = completion::channel();

        // It would be good to only set stopping here and continue the shutdown in the background,
        // but shutdown is not idempotent, i.e. it is an error to do:
        //   tenant.set_stopping
        //   tenant.shutdown
        // It's also bad that we're holding tenants.read here.
        // TODO: relax set_stopping to be idempotent?
        if tenant.shutdown(progress, false).await.is_err() {
            return Err(DeleteTenantError::Other(anyhow::anyhow!(
                "tenant shutdown is already in progress"
            )));
        }

        Ok(guard)
    }

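    /// Spawn the remaining deletion steps ([`Self::background`]) as a background task;
    /// if they fail, the tenant is transitioned to the Broken state.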
    fn schedule_background(
        guard: OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) {
        let tenant_shard_id = tenant.tenant_shard_id;

        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
            Some(tenant_shard_id),
            None,
            "tenant_delete",
            false,
            async move {
                if let Err(err) =
                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
                {
                    error!("Error: {err:#}");
                    tenant.set_broken(format!("{err:#}")).await;
                };
                Ok(())
            }
            .instrument({
                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
                span.follows_from(Span::current());
                span
            }),
        );
    }

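    /// Background part of the flow: runs the ordered timeline deletions, waits for timeline
    /// deletions that were already in flight, removes the remote mark, cleans up local fs
    /// traces, and finally removes the tenant from the tenants map.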
    async fn background(
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: &Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        // Tree-sort the timelines and schedule deletion for them. Mention retries from the console side.
        // Note that if a deletion fails we don't mark the timelines as broken;
        // the whole tenant will become broken via the `Self::schedule_background` logic.
        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
            .await
            .context("schedule_ordered_timeline_deletions")?;

        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
            Err(anyhow::anyhow!(
                "failpoint: tenant-delete-before-polling-ongoing-deletions"
            ))?
        });

        // Wait for the deletions that were already running at the moment when tenant deletion was requested.
        // When we can lock a deletion guard, it means the corresponding timeline deletion has finished.
        for (guard, timeline_id) in already_running_timeline_deletions {
            let flow = guard.lock().await;
            if !flow.is_finished() {
                return Err(DeleteTenantError::Other(anyhow::anyhow!(
                    "already running timeline deletion failed: {timeline_id}"
                )));
            }
        }

        let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
        if timelines_path.exists() {
            // sanity check to guard against layout changes
            ensure_timelines_dir_empty(&timelines_path)
                .await
                .context("timelines dir not empty")?;
        }

        remove_tenant_remote_delete_mark(
            conf,
            remote_storage.as_ref(),
            &tenant.tenant_shard_id,
            // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
            &CancellationToken::new(),
        )
        .await?;

        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
            Err(anyhow::anyhow!(
                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
            ))?
        });

        cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
            .await
            .context("cleanup_remaining_fs_traces")?;

        {
            pausable_failpoint!("tenant-delete-before-map-remove");

            // This block simply removes the TenantSlot for this tenant. It requires a loop because
            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
            //
            // This complexity will go away when we simplify how deletion works:
            // https://github.com/neondatabase/neon/issues/5080
            loop {
                // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
                // we encounter an InProgress marker, yield the barrier it contains and wait on it.
                let barrier = {
                    let mut locked = tenants.write().unwrap();
                    let removed = locked.remove(tenant.tenant_shard_id);

                    // FIXME: we should not be modifying this from outside of mgr.rs.
                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
                    crate::metrics::TENANT_MANAGER
                        .tenant_slots
                        .set(locked.len() as u64);

                    match removed {
                        TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
                            match tenant.current_state() {
                                TenantState::Stopping { .. } | TenantState::Broken { .. } => {
                                    // Expected: we put the tenant into stopping state before we start deleting it
                                }
                                state => {
                                    // Unexpected state
                                    tracing::warn!(
                                        "Tenant in unexpected state {state} after deletion"
                                    );
                                }
                            }
                            break;
                        }
                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
                            // This is unexpected: this secondary tenant should not have been created, and we
                            // are not in a position to shut it down from here.
                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
                            break;
                        }
                        TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
                            unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
                        }
                        TenantsMapRemoveResult::Vacant => {
                            tracing::warn!(
                                "Tenant removed from TenantsMap before deletion completed"
                            );
                            break;
                        }
                        TenantsMapRemoveResult::InProgress(barrier) => {
                            // An InProgress entry was found; we must wait on its barrier
                            barrier
                        }
                    }
                };

                tracing::info!(
                    "Waiting for competing operation to complete before deleting state for tenant"
                );
                barrier.wait().await;
            }
        }

        *guard = Self::Finished;

        Ok(())
    }
}