Line data Source code
1 : use std::sync::Arc;
2 :
3 : use pageserver_api::models::{TenantState, TimelineState};
4 :
5 : use super::Timeline;
6 : use super::delete::{DeletionGuard, delete_local_timeline_directory};
7 : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
8 : use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
9 : use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard};
10 : use crate::tenant::{
11 : DeleteTimelineError, OffloadedTimeline, TenantManifestError, TenantShard, TimelineOrOffloaded,
12 : };
13 :
/// Reasons an `offload_timeline` call can fail.
14 : #[derive(thiserror::Error, Debug)]
15 : pub(crate) enum OffloadError {
/// Returned by `offload_timeline` when the tenant is observed in `Stopping` or `Broken`
/// state; also produced from `TenantManifestError::Cancelled`.
16 : #[error("Cancelled")]
17 : Cancelled,
/// Returned when the delete guard reports `HasChildren`, or when
/// `shutdown_if_archived` reports `NotArchived`.
18 : #[error("Timeline is not archived")]
19 : NotArchived,
/// Produced from `TenantManifestError::RemoteStorage`. Being `transparent`, its message is
/// that of the wrapped error.
20 : #[error(transparent)]
21 : RemoteStorage(anyhow::Error),
/// Returned when the delete guard reports `DeleteTimelineError::AlreadyInProgress`.
22 : #[error("Offload or deletion already in progress")]
23 : AlreadyInProgress,
/// Wraps any other `DeleteTimelineError` returned while acquiring the delete guard.
24 : #[error("Unexpected offload error: {0}")]
25 : Other(anyhow::Error),
26 : }
27 :
/// Converts a `TenantManifestError` into the `OffloadError` variant of the same name.
///
/// NOTE(review): presumably this is what allows `?` on the manifest upload at the end of
/// `offload_timeline`; the error type of `maybe_upload_tenant_manifest` is not visible here.
28 : impl From<TenantManifestError> for OffloadError {
29 0 : fn from(e: TenantManifestError) -> Self {
30 0 : match e {
31 0 : TenantManifestError::Cancelled => Self::Cancelled,
// The wrapped error is moved over unchanged.
32 0 : TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
33 : }
34 0 : }
35 : }
36 :
/// Offloads an archived timeline: shuts it down, deletes its local directory, moves it from
/// `tenant.timelines` to `tenant.timelines_offloaded`, and uploads the tenant manifest.
///
/// The steps run in this order:
/// 1. acquire a delete guard of kind `Offload` for the timeline,
/// 2. call `shutdown_if_archived` on the remote client,
/// 3. set the timeline to `Stopping` and shut it down in `Reload` mode,
/// 4. delete the local timeline directory,
/// 5. remove the timeline from `tenant.timelines`,
/// 6. insert an `OffloadedTimeline` into `tenant.timelines_offloaded`,
/// 7. call `maybe_upload_tenant_manifest`.
///
/// # Errors
///
/// - `OffloadError::NotArchived` if the guard reports `HasChildren`, or if
///   `shutdown_if_archived` reports `NotArchived`.
/// - `OffloadError::AlreadyInProgress` if the guard reports `AlreadyInProgress`.
/// - `OffloadError::Other` for any other error from acquiring the guard.
/// - `OffloadError::Cancelled` if the tenant is `Stopping` or `Broken` at step 6. By then the
///   local directory has already been deleted and the timeline removed from `tenant.timelines`.
/// - Whatever the manifest upload error converts into via `?`.
///
/// Returns `Ok(())` without doing any work if the guard hands back something other than
/// `TimelineOrOffloaded::Timeline`.
///
/// # Panics
///
/// Panics if locking `tenant.timelines_offloaded` fails, if `OffloadedTimeline::from_timeline`
/// does not yield a value, or if `remove_timeline_from_tenant` panics.
37 1 : pub(crate) async fn offload_timeline(
38 1 : tenant: &TenantShard,
39 1 : timeline: &Arc<Timeline>,
40 1 : ) -> Result<(), OffloadError> {
41 1 : debug_assert_current_span_has_tenant_and_timeline_id();
42 1 : tracing::info!("offloading archived timeline");
43 :
44 1 : let delete_guard_res = make_timeline_delete_guard(
45 1 : tenant,
46 1 : timeline.timeline_id,
47 1 : TimelineDeleteGuardKind::Offload,
48 : );
// The binding below shadows the `timeline` parameter, but only after the `match` finishes:
// inside the arms, `timeline` still refers to the `&Arc<Timeline>` argument.
49 1 : let (timeline, guard) = match delete_guard_res {
50 1 : Ok(timeline_and_guard) => timeline_and_guard,
51 0 : Err(DeleteTimelineError::HasChildren(children)) => {
// Both paths below return `NotArchived`; they differ only in log level and message.
52 0 : let is_archived = timeline.is_archived();
53 0 : if is_archived == Some(true) {
54 0 : tracing::error!("timeline is archived but has non-archived children: {children:?}");
55 0 : return Err(OffloadError::NotArchived);
56 0 : }
57 0 : tracing::info!(
58 : ?is_archived,
59 0 : "timeline is not archived and has unarchived children"
60 : );
61 0 : return Err(OffloadError::NotArchived);
62 : }
63 : Err(DeleteTimelineError::AlreadyInProgress(_)) => {
64 0 : tracing::info!("timeline offload or deletion already in progress");
65 0 : return Err(OffloadError::AlreadyInProgress);
66 : }
// Every remaining guard error is wrapped into `Other`.
67 0 : Err(e) => return Err(OffloadError::Other(anyhow::anyhow!(e))),
68 : };
69 :
// The guard yields a `TimelineOrOffloaded`; any variant other than `Timeline` is logged as an
// error but reported to the caller as success.
70 1 : let TimelineOrOffloaded::Timeline(timeline) = timeline else {
71 0 : tracing::error!("timeline already offloaded, but given timeline object");
72 0 : return Ok(());
73 : };
74 :
75 1 : match timeline.remote_client.shutdown_if_archived().await {
76 1 : Ok(()) => {}
77 0 : Err(ShutdownIfArchivedError::NotInitialized(_)) => {
78 0 : // Either the timeline is being deleted, the operation is being retried, or we are shutting down.
79 0 : // Don't return cancelled here to keep it idempotent.
80 0 : }
81 0 : Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
82 : }
83 1 : timeline.set_state(TimelineState::Stopping);
84 :
85 : // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
86 1 : timeline.shutdown(super::ShutdownMode::Reload).await;
87 :
88 : // TODO extend guard mechanism above with method
89 : // to make deletions possible while offloading is in progress
90 :
// From here on the local state is being torn down: the directory is deleted first, then the
// timeline is dropped from the tenant's `timelines` map.
91 1 : let conf = &tenant.conf;
92 1 : delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;
93 :
// `guard` is passed only as proof that the delete guard is held; the returned count is used
// solely in the final log message.
94 1 : let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);
95 :
// Scoped block so the `timelines_offloaded` lock guard is dropped before the `.await` below.
96 : {
97 1 : let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
98 1 : if matches!(
99 1 : tenant.current_state(),
100 : TenantState::Stopping { .. } | TenantState::Broken { .. }
101 : ) {
102 : // Cancel the operation if the tenant is shutting down. Do this while the
103 : // timelines_offloaded lock is held to prevent a race with Tenant::shutdown
104 : // for defusing the lock
105 0 : return Err(OffloadError::Cancelled);
106 1 : }
107 1 : offloaded_timelines.insert(
108 1 : timeline.timeline_id,
109 1 : Arc::new(
110 1 : OffloadedTimeline::from_timeline(&timeline)
111 1 : .expect("we checked above that timeline was ready"),
112 1 : ),
113 1 : );
114 : }
115 :
116 : // Last step: mark timeline as offloaded in S3
117 : // TODO: maybe move this step above, right above deletion of the local timeline directory,
118 : // then there is no potential race condition where we partially offload a timeline, and
119 : // at the next restart attach it again.
120 : // For that to happen, we'd need to make the manifest reflect our *intended* state,
121 : // not our actual state of offloaded timelines.
122 1 : tenant.maybe_upload_tenant_manifest().await?;
123 :
124 1 : tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
125 :
126 1 : Ok(())
127 1 : }
128 :
/// Removes `timeline` from `tenant.timelines` and drops its entry from
/// `tenant.scheduled_compaction_tasks`.
///
129 : /// It is important that this gets called when DeletionGuard is being held.
130 : /// For more context see comments in [`make_timeline_delete_guard`]
131 : ///
132 : /// Returns the strong count of the timeline `Arc`
/// as observed on the `Arc` just taken out of the map, so that handle is included in the count.
///
/// # Panics
///
/// Panics if any entry in `tenant.timelines` names this timeline as its ancestor, if the
/// timeline is not present in `tenant.timelines`, or if locking either map fails.
133 1 : fn remove_timeline_from_tenant(
134 1 : tenant: &TenantShard,
135 1 : timeline: &Timeline,
136 1 : _: &DeletionGuard, // using it as a witness
137 1 : ) -> usize {
138 : // Remove the timeline from the map.
// This lock guard stays alive until the function returns, so the child check and the
// removal below happen under the same lock.
139 1 : let mut timelines = tenant.timelines.lock().unwrap();
140 1 : let children_exist = timelines
141 1 : .iter()
142 2 : .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
143 : // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
144 : // We already deleted the layer files, so it's probably best to panic.
145 : // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
146 1 : if children_exist {
147 0 : panic!("Timeline grew children while we removed layer files");
148 1 : }
149 :
// Shadows the `&Timeline` parameter with the value removed from the map.
150 1 : let timeline = timelines
151 1 : .remove(&timeline.timeline_id)
152 1 : .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
153 :
154 : // Clear the compaction queue for this timeline
// Note: `scheduled_compaction_tasks` is locked here while the `timelines` lock is still held.
155 1 : tenant
156 1 : .scheduled_compaction_tasks
157 1 : .lock()
158 1 : .unwrap()
159 1 : .remove(&timeline.timeline_id);
160 :
161 1 : Arc::strong_count(&timeline)
162 1 : }
|