Line data Source code
1 : use std::sync::Arc;
2 :
3 : use anyhow::Context;
4 : use camino::{Utf8Path, Utf8PathBuf};
5 : use pageserver_api::{models::TenantState, shard::TenantShardId};
6 : use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
7 : use tokio::sync::OwnedMutexGuard;
8 : use tokio_util::sync::CancellationToken;
9 : use tracing::{error, Instrument};
10 :
11 : use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
12 :
13 : use crate::{
14 : config::PageServerConf,
15 : context::RequestContext,
16 : task_mgr::{self},
17 : tenant::{
18 : mgr::{TenantSlot, TenantsMapRemoveResult},
19 : remote_timeline_client::remote_heatmap_path,
20 : },
21 : };
22 :
23 : use super::{
24 : mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
25 : remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
26 : timeline::delete::DeleteTimelineFlow,
27 : tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
28 : };
29 :
30 0 : #[derive(Debug, thiserror::Error)]
31 : pub(crate) enum DeleteTenantError {
32 : #[error("GetTenant {0}")]
33 : Get(#[from] GetTenantError),
34 :
35 : #[error("Tenant map slot error {0}")]
36 : SlotError(#[from] TenantSlotError),
37 :
38 : #[error("Tenant map slot upsert error {0}")]
39 : SlotUpsertError(#[from] TenantSlotUpsertError),
40 :
41 : #[error("Timeline {0}")]
42 : Timeline(#[from] DeleteTimelineError),
43 :
44 : #[error("Cancelled")]
45 : Cancelled,
46 :
47 : #[error(transparent)]
48 : Other(#[from] anyhow::Error),
49 : }
50 :
51 : type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
52 :
53 0 : fn remote_tenant_delete_mark_path(
54 0 : conf: &PageServerConf,
55 0 : tenant_shard_id: &TenantShardId,
56 0 : ) -> anyhow::Result<RemotePath> {
57 0 : let tenant_remote_path = conf
58 0 : .tenant_path(tenant_shard_id)
59 0 : .strip_prefix(&conf.workdir)
60 0 : .context("Failed to strip workdir prefix")
61 0 : .and_then(RemotePath::new)
62 0 : .context("tenant path")?;
63 0 : Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
64 0 : }
65 :
66 0 : async fn schedule_ordered_timeline_deletions(
67 0 : tenant: &Arc<Tenant>,
68 0 : ) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
69 0 : // Tenant is stopping at this point. We know it will be deleted.
70 0 : // No new timelines should be created.
71 0 : // Tree sort timelines to delete from leafs to the root.
72 0 : // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
73 0 : // can complete and remove timeline from the map in between our call to clone
74 0 : // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
75 0 : // timelines.lock is currently synchronous so we cant hold it across await point.
76 0 : // So just ignore NotFound error if we get it from `run`.
77 0 : // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
78 0 : let timelines = tenant.timelines.lock().unwrap().clone();
79 0 : let sorted =
80 0 : tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
81 :
82 0 : let mut already_running_deletions = vec![];
83 :
84 0 : for (timeline_id, _) in sorted.into_iter().rev() {
85 0 : let span = tracing::info_span!("timeline_delete", %timeline_id);
86 0 : let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
87 0 : .instrument(span)
88 0 : .await;
89 0 : if let Err(e) = res {
90 0 : match e {
91 : DeleteTimelineError::NotFound => {
92 : // Timeline deletion finished after call to clone above but before call
93 : // to `DeleteTimelineFlow::run` and removed timeline from the map.
94 0 : continue;
95 : }
96 0 : DeleteTimelineError::AlreadyInProgress(guard) => {
97 0 : already_running_deletions.push((guard, timeline_id));
98 0 : continue;
99 : }
100 0 : e => return Err(DeleteTenantError::Timeline(e)),
101 : }
102 0 : }
103 : }
104 :
105 0 : Ok(already_running_deletions)
106 0 : }
107 :
108 0 : async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
109 0 : // Assert timelines dir is empty.
110 0 : if !fs_ext::is_directory_empty(timelines_path).await? {
111 : // Display first 10 items in directory
112 0 : let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
113 0 : let list = &list.into_iter().take(10).collect::<Vec<_>>();
114 0 : return Err(DeleteTenantError::Other(anyhow::anyhow!(
115 0 : "Timelines directory is not empty after all timelines deletion: {list:?}"
116 0 : )));
117 0 : }
118 0 :
119 0 : Ok(())
120 0 : }
121 :
122 0 : async fn remove_tenant_remote_delete_mark(
123 0 : conf: &PageServerConf,
124 0 : remote_storage: &GenericRemoteStorage,
125 0 : tenant_shard_id: &TenantShardId,
126 0 : cancel: &CancellationToken,
127 0 : ) -> Result<(), DeleteTenantError> {
128 0 : let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
129 0 : backoff::retry(
130 0 : || async { remote_storage.delete(&path, cancel).await },
131 0 : TimeoutOrCancel::caused_by_cancel,
132 0 : FAILED_UPLOAD_WARN_THRESHOLD,
133 0 : FAILED_REMOTE_OP_RETRIES,
134 0 : "remove_tenant_remote_delete_mark",
135 0 : cancel,
136 0 : )
137 0 : .await
138 0 : .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
139 0 : .and_then(|x| x)
140 0 : .context("remove_tenant_remote_delete_mark")?;
141 0 : Ok(())
142 0 : }
143 :
144 : // Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
145 0 : async fn cleanup_remaining_fs_traces(
146 0 : conf: &PageServerConf,
147 0 : tenant_shard_id: &TenantShardId,
148 0 : ) -> Result<(), DeleteTenantError> {
149 0 : let rm = |p: Utf8PathBuf, is_dir: bool| async move {
150 0 : if is_dir {
151 0 : tokio::fs::remove_dir(&p).await
152 0 : } else {
153 0 : tokio::fs::remove_file(&p).await
154 0 : }
155 0 : .or_else(fs_ext::ignore_not_found)
156 0 : .with_context(|| format!("failed to delete {p}"))
157 0 : };
158 :
159 0 : rm(conf.tenant_config_path(tenant_shard_id), false).await?;
160 0 : rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
161 :
162 0 : fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
163 0 : Err(anyhow::anyhow!(
164 0 : "failpoint: tenant-delete-before-remove-timelines-dir"
165 0 : ))?
166 0 : });
167 :
168 0 : rm(conf.timelines_path(tenant_shard_id), true).await?;
169 :
170 0 : fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
171 0 : Err(anyhow::anyhow!(
172 0 : "failpoint: tenant-delete-before-remove-deleted-mark"
173 0 : ))?
174 0 : });
175 :
176 : // Make sure previous deletions are ordered before mark removal.
177 : // Otherwise there is no guarantee that they reach the disk before mark deletion.
178 : // So its possible for mark to reach disk first and for other deletions
179 : // to be reordered later and thus missed if a crash occurs.
180 : // Note that we dont need to sync after mark file is removed
181 : // because we can tolerate the case when mark file reappears on startup.
182 0 : let tenant_path = &conf.tenant_path(tenant_shard_id);
183 0 : if tenant_path.exists() {
184 0 : crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
185 0 : .await
186 0 : .context("fsync_pre_mark_remove")?;
187 0 : }
188 :
189 0 : rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
190 :
191 0 : rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
192 :
193 0 : fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
194 0 : Err(anyhow::anyhow!(
195 0 : "failpoint: tenant-delete-before-remove-tenant-dir"
196 0 : ))?
197 0 : });
198 :
199 0 : rm(conf.tenant_path(tenant_shard_id), true).await?;
200 :
201 0 : Ok(())
202 0 : }
203 :
/// Progress marker of a tenant deletion.
///
/// A freshly created value is [`DeleteTenantFlow::NotStarted`].
#[derive(Default)]
pub enum DeleteTenantFlow {
    /// No deletion has been started; this is the `Default` value.
    #[default]
    NotStarted,
    /// A deletion is underway.
    // NOTE(review): nothing in the code visible here assigns this variant - confirm where it is set.
    InProgress,
    /// The deletion ran to completion; assigned at the very end of the deletion routine.
    Finished,
}
211 :
212 : impl DeleteTenantFlow {
213 0 : pub(crate) async fn should_resume_deletion(
214 0 : conf: &'static PageServerConf,
215 0 : remote_mark_exists: bool,
216 0 : tenant: &Tenant,
217 0 : ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
218 0 : let acquire = |t: &Tenant| {
219 0 : Some(
220 0 : Arc::clone(&t.delete_progress)
221 0 : .try_lock_owned()
222 0 : .expect("we're the only owner during init"),
223 0 : )
224 0 : };
225 :
226 0 : if remote_mark_exists {
227 0 : return Ok(acquire(tenant));
228 0 : }
229 0 :
230 0 : // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
231 0 : if conf
232 0 : .tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
233 0 : .exists()
234 : {
235 0 : Ok(acquire(tenant))
236 : } else {
237 0 : Ok(None)
238 : }
239 0 : }
240 :
241 0 : pub(crate) async fn resume_from_attach(
242 0 : guard: DeletionGuard,
243 0 : tenant: &Arc<Tenant>,
244 0 : preload: Option<TenantPreload>,
245 0 : tenants: &'static std::sync::RwLock<TenantsMap>,
246 0 : ctx: &RequestContext,
247 0 : ) -> Result<(), DeleteTenantError> {
248 0 : let (_, progress) = completion::channel();
249 0 :
250 0 : tenant
251 0 : .set_stopping(progress, false, true)
252 0 : .await
253 0 : .expect("cant be stopping or broken");
254 0 :
255 0 : tenant
256 0 : .attach(preload, super::SpawnMode::Eager, ctx)
257 0 : .await
258 0 : .context("attach")?;
259 :
260 0 : Self::background(
261 0 : guard,
262 0 : tenant.conf,
263 0 : tenant.remote_storage.clone(),
264 0 : tenants,
265 0 : tenant,
266 0 : )
267 0 : .await
268 0 : }
269 :
270 0 : async fn background(
271 0 : mut guard: OwnedMutexGuard<Self>,
272 0 : conf: &PageServerConf,
273 0 : remote_storage: GenericRemoteStorage,
274 0 : tenants: &'static std::sync::RwLock<TenantsMap>,
275 0 : tenant: &Arc<Tenant>,
276 0 : ) -> Result<(), DeleteTenantError> {
277 : // Tree sort timelines, schedule delete for them. Mention retries from the console side.
278 : // Note that if deletion fails we dont mark timelines as broken,
279 : // the whole tenant will become broken as by `Self::schedule_background` logic
280 0 : let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
281 0 : .await
282 0 : .context("schedule_ordered_timeline_deletions")?;
283 :
284 0 : fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
285 0 : Err(anyhow::anyhow!(
286 0 : "failpoint: tenant-delete-before-polling-ongoing-deletions"
287 0 : ))?
288 0 : });
289 :
290 : // Wait for deletions that were already running at the moment when tenant deletion was requested.
291 : // When we can lock deletion guard it means that corresponding timeline deletion finished.
292 0 : for (guard, timeline_id) in already_running_timeline_deletions {
293 0 : let flow = guard.lock().await;
294 0 : if !flow.is_finished() {
295 0 : return Err(DeleteTenantError::Other(anyhow::anyhow!(
296 0 : "already running timeline deletion failed: {timeline_id}"
297 0 : )));
298 0 : }
299 : }
300 :
301 : // Remove top-level tenant objects that don't belong to a timeline, such as heatmap
302 0 : let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
303 0 : if let Some(Err(e)) = backoff::retry(
304 0 : || async {
305 0 : remote_storage
306 0 : .delete(&heatmap_path, &task_mgr::shutdown_token())
307 0 : .await
308 0 : },
309 0 : TimeoutOrCancel::caused_by_cancel,
310 0 : FAILED_UPLOAD_WARN_THRESHOLD,
311 0 : FAILED_REMOTE_OP_RETRIES,
312 0 : "remove_remote_tenant_heatmap",
313 0 : &task_mgr::shutdown_token(),
314 0 : )
315 0 : .await
316 : {
317 0 : tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
318 0 : }
319 :
320 0 : let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
321 0 : // May not exist if we fail in cleanup_remaining_fs_traces after removing it
322 0 : if timelines_path.exists() {
323 : // sanity check to guard against layout changes
324 0 : ensure_timelines_dir_empty(&timelines_path)
325 0 : .await
326 0 : .context("timelines dir not empty")?;
327 0 : }
328 :
329 0 : remove_tenant_remote_delete_mark(
330 0 : conf,
331 0 : &remote_storage,
332 0 : &tenant.tenant_shard_id,
333 0 : &task_mgr::shutdown_token(),
334 0 : )
335 0 : .await?;
336 :
337 : pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
338 0 : fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
339 0 : Err(anyhow::anyhow!(
340 0 : "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
341 0 : ))?
342 0 : });
343 :
344 0 : cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
345 0 : .await
346 0 : .context("cleanup_remaining_fs_traces")?;
347 :
348 : {
349 : // This block is simply removing the TenantSlot for this tenant. It requires a loop because
350 : // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
351 : //
352 : // This complexity will go away when we simplify how deletion works:
353 : // https://github.com/neondatabase/neon/issues/5080
354 : loop {
355 : // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
356 : // we encounter an InProgress marker, yield the barrier it contains and wait on it.
357 0 : let barrier = {
358 0 : let mut locked = tenants.write().unwrap();
359 0 : let removed = locked.remove(tenant.tenant_shard_id);
360 0 :
361 0 : // FIXME: we should not be modifying this from outside of mgr.rs.
362 0 : // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
363 0 :
364 0 : // Update stats
365 0 : match &removed {
366 0 : TenantsMapRemoveResult::Occupied(slot) => {
367 0 : crate::metrics::TENANT_MANAGER.slot_removed(slot);
368 0 : }
369 0 : TenantsMapRemoveResult::InProgress(barrier) => {
370 0 : crate::metrics::TENANT_MANAGER
371 0 : .slot_removed(&TenantSlot::InProgress(barrier.clone()));
372 0 : }
373 0 : TenantsMapRemoveResult::Vacant => {
374 0 : // Nothing changed in map, no metric update
375 0 : }
376 : }
377 :
378 0 : match removed {
379 0 : TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
380 0 : match tenant.current_state() {
381 0 : TenantState::Stopping { .. } | TenantState::Broken { .. } => {
382 0 : // Expected: we put the tenant into stopping state before we start deleting it
383 0 : }
384 0 : state => {
385 0 : // Unexpected state
386 0 : tracing::warn!(
387 0 : "Tenant in unexpected state {state} after deletion"
388 : );
389 : }
390 : }
391 0 : break;
392 : }
393 : TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
394 : // This is unexpected: this secondary tenants should not have been created, and we
395 : // are not in a position to shut it down from here.
396 0 : tracing::warn!("Tenant transitioned to secondary mode while deleting!");
397 0 : break;
398 : }
399 : TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
400 0 : unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
401 : }
402 : TenantsMapRemoveResult::Vacant => {
403 0 : tracing::warn!(
404 0 : "Tenant removed from TenantsMap before deletion completed"
405 : );
406 0 : break;
407 : }
408 0 : TenantsMapRemoveResult::InProgress(barrier) => {
409 0 : // An InProgress entry was found, we must wait on its barrier
410 0 : barrier
411 0 : }
412 0 : }
413 0 : };
414 0 :
415 0 : tracing::info!(
416 0 : "Waiting for competing operation to complete before deleting state for tenant"
417 : );
418 0 : barrier.wait().await;
419 : }
420 : }
421 :
422 0 : *guard = Self::Finished;
423 0 :
424 0 : Ok(())
425 0 : }
426 : }
|