Line data Source code
1 : use std::sync::Arc;
2 :
3 : use anyhow::Context;
4 : use camino::{Utf8Path, Utf8PathBuf};
5 : use pageserver_api::{models::TenantState, shard::TenantShardId};
6 : use remote_storage::{GenericRemoteStorage, RemotePath};
7 : use tokio::sync::OwnedMutexGuard;
8 : use tokio_util::sync::CancellationToken;
9 : use tracing::{error, instrument, Instrument, Span};
10 :
11 : use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
12 :
13 : use crate::{
14 : config::PageServerConf,
15 : context::RequestContext,
16 : task_mgr::{self, TaskKind},
17 : tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
18 : };
19 :
20 : use super::{
21 : mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
22 : remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
23 : span,
24 : timeline::delete::DeleteTimelineFlow,
25 : tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
26 : };
27 :
28 204 : #[derive(Debug, thiserror::Error)]
29 : pub(crate) enum DeleteTenantError {
30 : #[error("GetTenant {0}")]
31 : Get(#[from] GetTenantError),
32 :
33 : #[error("Tenant not attached")]
34 : NotAttached,
35 :
36 : #[error("Invalid state {0}. Expected Active or Broken")]
37 : InvalidState(TenantState),
38 :
39 : #[error("Tenant deletion is already in progress")]
40 : AlreadyInProgress,
41 :
42 : #[error("Tenant map slot error {0}")]
43 : SlotError(#[from] TenantSlotError),
44 :
45 : #[error("Tenant map slot upsert error {0}")]
46 : SlotUpsertError(#[from] TenantSlotUpsertError),
47 :
48 : #[error("Timeline {0}")]
49 : Timeline(#[from] DeleteTimelineError),
50 :
51 : #[error("Cancelled")]
52 : Cancelled,
53 :
54 : #[error(transparent)]
55 : Other(#[from] anyhow::Error),
56 : }
57 :
58 : type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
59 :
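 : // Builds the remote path of the tenant deletion mark: the tenant's path relative
 : // to the workdir, with `timelines/deleted` appended.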
60 180 : fn remote_tenant_delete_mark_path(
61 180 : conf: &PageServerConf,
62 180 : tenant_shard_id: &TenantShardId,
63 180 : ) -> anyhow::Result<RemotePath> {
64 180 : let tenant_remote_path = conf
65 180 : .tenant_path(tenant_shard_id)
66 180 : .strip_prefix(&conf.workdir)
67 180 : .context("Failed to strip workdir prefix")
68 180 : .and_then(RemotePath::new)
69 180 : .context("tenant path")?;
70 180 : Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
71 180 : }
72 :
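 : // Uploads an empty object at the remote deletion mark path, retrying with backoff,
 : // so that the deletion intent survives a pageserver crash or restart.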
73 94 : async fn create_remote_delete_mark(
74 94 : conf: &PageServerConf,
75 94 : remote_storage: &GenericRemoteStorage,
76 94 : tenant_shard_id: &TenantShardId,
77 94 : cancel: &CancellationToken,
78 94 : ) -> Result<(), DeleteTenantError> {
79 94 : let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
80 :
81 94 : let data: &[u8] = &[];
82 94 : backoff::retry(
83 132 : || async {
84 132 : let data = bytes::Bytes::from_static(data);
85 132 : let stream = futures::stream::once(futures::future::ready(Ok(data)));
86 132 : remote_storage
87 132 : .upload(stream, 0, &remote_mark_path, None)
88 195 : .await
89 132 : },
90 94 : |_e| false,
91 94 : FAILED_UPLOAD_WARN_THRESHOLD,
92 94 : FAILED_REMOTE_OP_RETRIES,
93 94 : "mark_upload",
94 94 : cancel,
95 94 : )
96 195 : .await
97 94 : .ok_or_else(|| anyhow::anyhow!("Cancelled"))
98 94 : .and_then(|x| x)
99 94 : .context("mark_upload")?;
100 :
101 94 : Ok(())
102 94 : }
103 :
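 : // Creates an empty deletion marker file on local disk and fsyncs it together with
 : // its parent directory so the mark survives a crash.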
104 90 : async fn create_local_delete_mark(
105 90 : conf: &PageServerConf,
106 90 : tenant_shard_id: &TenantShardId,
107 90 : ) -> Result<(), DeleteTenantError> {
108 90 : let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
109 90 :
110 90 : // Note: we're ok with replacing an existing file.
111 90 : let _ = std::fs::OpenOptions::new()
112 90 : .write(true)
113 90 : .create(true)
114 90 : .open(&marker_path)
115 90 : .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
116 :
117 90 : crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
118 :
119 90 : Ok(())
120 90 : }
121 :
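 : // Runs `DeleteTimelineFlow` for every timeline of the tenant, children before ancestors,
 : // and returns the deletions that were already in progress so the caller can await them.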
122 107 : async fn schedule_ordered_timeline_deletions(
123 107 : tenant: &Arc<Tenant>,
124 107 : ) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
125 107 : // Tenant is stopping at this point. We know it will be deleted.
126 107 : // No new timelines should be created.
127 107 : // Tree-sort the timelines to delete them from the leaves to the root.
128 107 : // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
129 107 : // can complete and remove timeline from the map in between our call to clone
130 107 : // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
131 107 : // timelines.lock is currently synchronous so we cant hold it across await point.
132 107 : // So just ignore NotFound error if we get it from `run`.
133 107 : // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
134 107 : let timelines = tenant.timelines.lock().unwrap().clone();
135 107 : let sorted =
136 163 : tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
137 :
138 107 : let mut already_running_deletions = vec![];
139 :
140 145 : for (timeline_id, _) in sorted.into_iter().rev() {
141 145 : let span = tracing::info_span!("timeline_delete", %timeline_id);
142 145 : let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
143 145 : .instrument(span)
144 4908 : .await;
145 145 : if let Err(e) = res {
146 22 : match e {
147 : DeleteTimelineError::NotFound => {
148 : // Timeline deletion finished after call to clone above but before call
149 : // to `DeleteTimelineFlow::run` and removed timeline from the map.
150 1 : continue;
151 : }
152 4 : DeleteTimelineError::AlreadyInProgress(guard) => {
153 4 : already_running_deletions.push((guard, timeline_id));
154 4 : continue;
155 : }
156 17 : e => return Err(DeleteTenantError::Timeline(e)),
157 : }
158 123 : }
159 : }
160 :
161 90 : Ok(already_running_deletions)
162 107 : }
163 :
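 : // Sanity check: fails with a sample of up to 10 leftover entries if the timelines
 : // directory is not empty after all timeline deletions.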
164 80 : async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
165 80 : // Assert timelines dir is empty.
166 80 : if !fs_ext::is_directory_empty(timelines_path).await? {
167 : // Display first 10 items in directory
168 0 : let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
169 0 : let list = &list.into_iter().take(10).collect::<Vec<_>>();
170 0 : return Err(DeleteTenantError::Other(anyhow::anyhow!(
171 0 : "Timelines directory is not empty after all timelines deletion: {list:?}"
172 0 : )));
173 80 : }
174 80 :
175 80 : Ok(())
176 80 : }
177 :
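 : // Deletes the remote deletion mark (when remote storage is configured), retrying with backoff.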
178 86 : async fn remove_tenant_remote_delete_mark(
179 86 : conf: &PageServerConf,
180 86 : remote_storage: Option<&GenericRemoteStorage>,
181 86 : tenant_shard_id: &TenantShardId,
182 86 : cancel: &CancellationToken,
183 86 : ) -> Result<(), DeleteTenantError> {
184 86 : if let Some(remote_storage) = remote_storage {
185 86 : let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
186 86 : backoff::retry(
187 308 : || async { remote_storage.delete(&path).await },
188 86 : |_e| false,
189 86 : FAILED_UPLOAD_WARN_THRESHOLD,
190 86 : FAILED_REMOTE_OP_RETRIES,
191 86 : "remove_tenant_remote_delete_mark",
192 86 : cancel,
193 86 : )
194 308 : .await
195 86 : .ok_or_else(|| anyhow::anyhow!("Cancelled"))
196 86 : .and_then(|x| x)
197 86 : .context("remove_tenant_remote_delete_mark")?;
198 0 : }
199 86 : Ok(())
200 86 : }
201 :
202 : // Clean up fs traces: tenant config, timelines dir, local delete mark, tenant dir
203 82 : async fn cleanup_remaining_fs_traces(
204 82 : conf: &PageServerConf,
205 82 : tenant_shard_id: &TenantShardId,
206 82 : ) -> Result<(), DeleteTenantError> {
207 386 : let rm = |p: Utf8PathBuf, is_dir: bool| async move {
208 386 : if is_dir {
209 148 : tokio::fs::remove_dir(&p).await
210 82 : } else {
211 238 : tokio::fs::remove_file(&p).await
212 82 : }
213 386 : .or_else(fs_ext::ignore_not_found)
214 386 : .with_context(|| format!("failed to delete {p}"))
215 386 : };
216 82 :
217 82 : rm(conf.tenant_config_path(tenant_shard_id), false).await?;
218 82 : rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
219 :
220 82 : fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
221 4 : Err(anyhow::anyhow!(
222 4 : "failpoint: tenant-delete-before-remove-timelines-dir"
223 4 : ))?
224 82 : });
225 :
226 78 : rm(conf.timelines_path(tenant_shard_id), true).await?;
227 :
228 78 : fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
229 4 : Err(anyhow::anyhow!(
230 4 : "failpoint: tenant-delete-before-remove-deleted-mark"
231 4 : ))?
232 78 : });
233 :
234 : // Make sure previous deletions are ordered before mark removal.
235 : // Otherwise there is no guarantee that they reach the disk before mark deletion.
236 : // So it's possible for the mark to reach the disk first, and for other deletions
237 : // to be reordered later and thus missed if a crash occurs.
238 : // Note that we don't need to sync after the mark file is removed,
239 : // because we can tolerate the case where the mark file reappears on startup.
240 74 : let tenant_path = &conf.tenant_path(tenant_shard_id);
241 74 : if tenant_path.exists() {
242 74 : crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
243 147 : .await
244 74 : .context("fsync_pre_mark_remove")?;
245 0 : }
246 :
247 74 : rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
248 :
249 74 : fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
250 4 : Err(anyhow::anyhow!(
251 4 : "failpoint: tenant-delete-before-remove-tenant-dir"
252 4 : ))?
253 74 : });
254 :
255 70 : rm(conf.tenant_path(tenant_shard_id), true).await?;
256 :
257 70 : Ok(())
258 82 : }
259 :
260 : /// Orchestrates the shutdown of all tenant tasks, removes the tenant's in-memory structures,
261 : /// and deletes its data from both disk and S3.
262 : /// The sequence of steps:
263 : /// 1. Upload the remote deletion mark.
264 : /// 2. Create the local mark file.
265 : /// 3. Shut down tasks.
266 : /// 4. Run ordered timeline deletions.
267 : /// 5. Wait for timeline deletion operations that were already scheduled before tenant deletion was requested.
268 : /// 6. Remove the remote mark.
269 : /// 7. Clean up remaining fs traces: tenant dir, config, timelines dir, local delete mark.
270 : /// The flow is resumable from any step in case a crash or restart occurs.
271 : /// There are two entrypoints to the process:
272 : /// 1. [`DeleteTenantFlow::run`] is the main one, called by a management API handler.
273 : /// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed because the tenant is found to be marked deleted during the attach process.
274 : /// Note that the only other place that touches the timeline delete mark is the `Tenant::spawn_load` function.
275 942 : #[derive(Default)]
276 : pub enum DeleteTenantFlow {
277 : #[default]
278 : NotStarted,
279 : InProgress,
280 : Finished,
281 : }
282 :
283 : impl DeleteTenantFlow {
284 : // These steps are run in the context of the management API request handler.
285 : // Long-running steps continue to run in the background.
286 : // NB: If this fails half-way through, and is retried, the retry will go through
287 : // all the same steps again. Make sure the code here is idempotent, and don't
288 : // error out if some of the shutdown tasks have already been completed!
289 : // NOTE: static needed for background part.
290 : // We assume that calling code sets up the span with tenant_id.
291 104 : #[instrument(skip_all)]
292 : pub(crate) async fn run(
293 : conf: &'static PageServerConf,
294 : remote_storage: Option<GenericRemoteStorage>,
295 : tenants: &'static std::sync::RwLock<TenantsMap>,
296 : tenant: Arc<Tenant>,
297 : ) -> Result<(), DeleteTenantError> {
298 : span::debug_assert_current_span_has_tenant_id();
299 :
300 104 : pausable_failpoint!("tenant-delete-before-run");
301 :
302 : let mut guard = Self::prepare(&tenant).await?;
303 :
304 : if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
305 : tenant.set_broken(format!("{e:#}")).await;
306 : return Err(e);
307 : }
308 :
309 : Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
310 :
311 : Ok(())
312 : }
313 :
314 : // Helper function needed to be able to match once on returned error and transition tenant into broken state.
315 : // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
316 : // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
317 : // So the solution is to set tenant state to broken.
318 98 : async fn run_inner(
319 98 : guard: &mut OwnedMutexGuard<Self>,
320 98 : conf: &'static PageServerConf,
321 98 : remote_storage: Option<&GenericRemoteStorage>,
322 98 : tenant: &Tenant,
323 98 : ) -> Result<(), DeleteTenantError> {
324 98 : guard.mark_in_progress()?;
325 :
326 98 : fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
327 4 : Err(anyhow::anyhow!(
328 4 : "failpoint: tenant-delete-before-create-remote-mark"
329 4 : ))?
330 98 : });
331 :
332 : // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
333 : // Though sounds scary, different mark name?
334 : // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
335 94 : if let Some(remote_storage) = &remote_storage {
336 94 : create_remote_delete_mark(
337 94 : conf,
338 94 : remote_storage,
339 94 : &tenant.tenant_shard_id,
340 94 : // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
341 94 : &CancellationToken::new(),
342 94 : )
343 195 : .await
344 94 : .context("remote_mark")?
345 0 : }
346 :
347 94 : fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
348 4 : Err(anyhow::anyhow!(
349 4 : "failpoint: tenant-delete-before-create-local-mark"
350 4 : ))?
351 94 : });
352 :
353 90 : create_local_delete_mark(conf, &tenant.tenant_shard_id)
354 0 : .await
355 90 : .context("local delete mark")?;
356 :
357 90 : fail::fail_point!("tenant-delete-before-background", |_| {
358 4 : Err(anyhow::anyhow!(
359 4 : "failpoint: tenant-delete-before-background"
360 4 : ))?
361 90 : });
362 :
363 86 : Ok(())
364 98 : }
365 :
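 : // Moves the flow into `InProgress`. Being called again while already `InProgress` is
 : // expected on retries; reaching this from `Finished` indicates a bug.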
366 98 : fn mark_in_progress(&mut self) -> anyhow::Result<()> {
367 98 : match self {
368 0 : Self::Finished => anyhow::bail!("Bug. Is in finished state"),
369 24 : Self::InProgress { .. } => { /* We're in a retry */ }
370 74 : Self::NotStarted => { /* Fresh start */ }
371 : }
372 :
373 98 : *self = Self::InProgress;
374 98 :
375 98 : Ok(())
376 98 : }
377 :
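 : // Called during tenant init: returns the deletion guard if either the remote deletion mark
 : // exists or the local mark file is present on disk, signalling that an interrupted
 : // deletion should be resumed.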
378 851 : pub(crate) async fn should_resume_deletion(
379 851 : conf: &'static PageServerConf,
380 851 : remote_mark_exists: bool,
381 851 : tenant: &Tenant,
382 851 : ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
383 851 : let acquire = |t: &Tenant| {
384 21 : Some(
385 21 : Arc::clone(&t.delete_progress)
386 21 : .try_lock_owned()
387 21 : .expect("we're the only owner during init"),
388 21 : )
389 851 : };
390 851 :
391 851 : if remote_mark_exists {
392 15 : return Ok(acquire(tenant));
393 836 : }
394 836 :
395 836 : // Check the local mark first; if it's there, there is no need to go to S3 to check whether the remote one exists.
396 836 : if conf
397 836 : .tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
398 836 : .exists()
399 : {
400 6 : Ok(acquire(tenant))
401 : } else {
402 830 : Ok(None)
403 : }
404 851 : }
405 :
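 : // Resumes an interrupted deletion discovered during attach: puts the tenant into Stopping,
 : // finishes the attach, then continues with the background part of the deletion.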
406 21 : pub(crate) async fn resume_from_attach(
407 21 : guard: DeletionGuard,
408 21 : tenant: &Arc<Tenant>,
409 21 : preload: Option<TenantPreload>,
410 21 : tenants: &'static std::sync::RwLock<TenantsMap>,
411 21 : ctx: &RequestContext,
412 21 : ) -> Result<(), DeleteTenantError> {
413 21 : let (_, progress) = completion::channel();
414 21 :
415 21 : tenant
416 21 : .set_stopping(progress, false, true)
417 0 : .await
418 21 : .expect("cant be stopping or broken");
419 21 :
420 21 : tenant
421 21 : .attach(preload, super::SpawnMode::Normal, ctx)
422 40 : .await
423 21 : .context("attach")?;
424 :
425 21 : Self::background(
426 21 : guard,
427 21 : tenant.conf,
428 21 : tenant.remote_storage.clone(),
429 21 : tenants,
430 21 : tenant,
431 21 : )
432 749 : .await
433 21 : }
434 :
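 : // Checks that the tenant is Active or Broken, takes the `delete_progress` lock
 : // (failing with `AlreadyInProgress` if it is already held), and shuts the tenant down.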
435 104 : async fn prepare(
436 104 : tenant: &Arc<Tenant>,
437 104 : ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
438 104 : // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
439 104 : // so at least for now allow deletions only for active tenants. TODO recheck
440 104 : // Broken and Stopping are needed for retries.
441 104 : if !matches!(
442 104 : tenant.current_state(),
443 : TenantState::Active | TenantState::Broken { .. }
444 : ) {
445 2 : return Err(DeleteTenantError::InvalidState(tenant.current_state()));
446 102 : }
447 :
448 102 : let guard = Arc::clone(&tenant.delete_progress)
449 102 : .try_lock_owned()
450 102 : .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
451 :
452 102 : fail::fail_point!("tenant-delete-before-shutdown", |_| {
453 4 : Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
454 102 : });
455 :
456 : // Make pageserver shutdown not wait for our completion.
457 98 : let (_, progress) = completion::channel();
458 98 :
459 98 : // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
460 98 : // i.e. it is an error to do:
461 98 : // tenant.set_stopping
462 98 : // tenant.shutdown
463 98 : // It's also bad that we're holding tenants.read here.
464 98 : // TODO relax set_stopping to be idempotent?
465 229 : if tenant.shutdown(progress, false).await.is_err() {
466 0 : return Err(DeleteTenantError::Other(anyhow::anyhow!(
467 0 : "tenant shutdown is already in progress"
468 0 : )));
469 98 : }
470 98 :
471 98 : Ok(guard)
472 104 : }
473 :
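 : // Spawns the rest of the deletion (`Self::background`) on the background runtime;
 : // if it fails, the tenant is set to Broken so the deletion can be retried.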
474 86 : fn schedule_background(
475 86 : guard: OwnedMutexGuard<Self>,
476 86 : conf: &'static PageServerConf,
477 86 : remote_storage: Option<GenericRemoteStorage>,
478 86 : tenants: &'static std::sync::RwLock<TenantsMap>,
479 86 : tenant: Arc<Tenant>,
480 86 : ) {
481 86 : let tenant_shard_id = tenant.tenant_shard_id;
482 86 :
483 86 : task_mgr::spawn(
484 86 : task_mgr::BACKGROUND_RUNTIME.handle(),
485 86 : TaskKind::TimelineDeletionWorker,
486 86 : Some(tenant_shard_id),
487 86 : None,
488 86 : "tenant_delete",
489 : false,
490 86 : async move {
491 37 : if let Err(err) =
492 5232 : Self::background(guard, conf, remote_storage, tenants, &tenant).await
493 : {
494 37 : error!("Error: {err:#}");
495 37 : tenant.set_broken(format!("{err:#}")).await;
496 49 : };
497 86 : Ok(())
498 86 : }
499 86 : .instrument({
500 86 : let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
501 86 : span.follows_from(Span::current());
502 86 : span
503 86 : }),
504 86 : );
505 86 : }
506 :
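 : // The resumable tail of the flow: delete timelines, wait for pre-existing deletions,
 : // remove the remote mark, clean up local traces, drop the tenant from the TenantsMap,
 : // and mark the flow `Finished`.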
507 107 : async fn background(
508 107 : mut guard: OwnedMutexGuard<Self>,
509 107 : conf: &PageServerConf,
510 107 : remote_storage: Option<GenericRemoteStorage>,
511 107 : tenants: &'static std::sync::RwLock<TenantsMap>,
512 107 : tenant: &Arc<Tenant>,
513 107 : ) -> Result<(), DeleteTenantError> {
514 : // Tree-sort the timelines and schedule deletion for them. Retries are driven from the console side.
515 : // Note that if deletion fails we don't mark timelines as broken;
516 : // the whole tenant will become broken via the `Self::schedule_background` logic.
517 107 : let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
518 4908 : .await
519 107 : .context("schedule_ordered_timeline_deletions")?;
520 :
521 90 : fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
522 4 : Err(anyhow::anyhow!(
523 4 : "failpoint: tenant-delete-before-polling-ongoing-deletions"
524 4 : ))?
525 90 : });
526 :
527 : // Wait for deletions that were already running at the moment when tenant deletion was requested.
528 : // When we can lock the deletion guard it means that the corresponding timeline deletion has finished.
529 90 : for (guard, timeline_id) in already_running_timeline_deletions {
530 4 : let flow = guard.lock().await;
531 4 : if !flow.is_finished() {
532 0 : return Err(DeleteTenantError::Other(anyhow::anyhow!(
533 0 : "already running timeline deletion failed: {timeline_id}"
534 0 : )));
535 4 : }
536 : }
537 :
538 86 : let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
539 86 : // May not exist if we fail in cleanup_remaining_fs_traces after removing it
540 86 : if timelines_path.exists() {
541 : // sanity check to guard against layout changes
542 80 : ensure_timelines_dir_empty(&timelines_path)
543 79 : .await
544 80 : .context("timelines dir not empty")?;
545 6 : }
546 :
547 86 : remove_tenant_remote_delete_mark(
548 86 : conf,
549 86 : remote_storage.as_ref(),
550 86 : &tenant.tenant_shard_id,
551 86 : // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
552 86 : &CancellationToken::new(),
553 86 : )
554 308 : .await?;
555 :
556 86 : pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
557 86 : fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
558 4 : Err(anyhow::anyhow!(
559 4 : "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
560 4 : ))?
561 86 : });
562 :
563 82 : cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
564 530 : .await
565 82 : .context("cleanup_remaining_fs_traces")?;
566 :
567 : {
568 70 : pausable_failpoint!("tenant-delete-before-map-remove");
569 :
570 : // This block is simply removing the TenantSlot for this tenant. It requires a loop because
571 : // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
572 : //
573 : // This complexity will go away when we simplify how deletion works:
574 : // https://github.com/neondatabase/neon/issues/5080
575 : loop {
576 : // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
577 : // we encounter an InProgress marker, yield the barrier it contains and wait on it.
578 1 : let barrier = {
579 71 : let mut locked = tenants.write().unwrap();
580 71 : let removed = locked.remove(tenant.tenant_shard_id);
581 71 :
582 71 : // FIXME: we should not be modifying this from outside of mgr.rs.
583 71 : // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
584 71 : crate::metrics::TENANT_MANAGER
585 71 : .tenant_slots
586 71 : .set(locked.len() as u64);
587 :
588 70 : match removed {
589 70 : TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
590 70 : match tenant.current_state() {
591 70 : TenantState::Stopping { .. } | TenantState::Broken { .. } => {
592 70 : // Expected: we put the tenant into stopping state before we start deleting it
593 70 : }
594 0 : state => {
595 : // Unexpected state
596 0 : tracing::warn!(
597 0 : "Tenant in unexpected state {state} after deletion"
598 0 : );
599 : }
600 : }
601 70 : break;
602 : }
603 : TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
604 : // This is unexpected: this secondary tenant should not have been created, and we
605 : // are not in a position to shut it down from here.
606 0 : tracing::warn!("Tenant transitioned to secondary mode while deleting!");
607 0 : break;
608 : }
609 : TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
610 0 : unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
611 : }
612 : TenantsMapRemoveResult::Vacant => {
613 0 : tracing::warn!(
614 0 : "Tenant removed from TenantsMap before deletion completed"
615 0 : );
616 0 : break;
617 : }
618 1 : TenantsMapRemoveResult::InProgress(barrier) => {
619 1 : // An InProgress entry was found, we must wait on its barrier
620 1 : barrier
621 : }
622 : }
623 : };
624 :
625 1 : tracing::info!(
626 1 : "Waiting for competing operation to complete before deleting state for tenant"
627 1 : );
628 1 : barrier.wait().await;
629 : }
630 : }
631 :
632 70 : *guard = Self::Finished;
633 70 :
634 70 : Ok(())
635 107 : }
636 : }