Line data Source code
1 : use std::sync::Arc;
2 :
3 : use pageserver_api::models::{TenantState, TimelineState};
4 :
5 : use super::delete::{delete_local_timeline_directory, DeletionGuard};
6 : use super::Timeline;
7 : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
8 : use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
9 : use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
10 : use crate::tenant::{
11 : DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded,
12 : };
13 :
14 : #[derive(thiserror::Error, Debug)]
15 : pub(crate) enum OffloadError {
16 : #[error("Cancelled")]
17 : Cancelled,
18 : #[error("Timeline is not archived")]
19 : NotArchived,
20 : #[error(transparent)]
21 : RemoteStorage(anyhow::Error),
22 : #[error("Unexpected offload error: {0}")]
23 : Other(anyhow::Error),
24 : }
25 :
26 : impl From<TenantManifestError> for OffloadError {
27 0 : fn from(e: TenantManifestError) -> Self {
28 0 : match e {
29 0 : TenantManifestError::Cancelled => Self::Cancelled,
30 0 : TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
31 : }
32 0 : }
33 : }
34 :
35 4 : pub(crate) async fn offload_timeline(
36 4 : tenant: &Tenant,
37 4 : timeline: &Arc<Timeline>,
38 4 : ) -> Result<(), OffloadError> {
39 4 : debug_assert_current_span_has_tenant_and_timeline_id();
40 4 : tracing::info!("offloading archived timeline");
41 :
42 4 : let delete_guard_res = make_timeline_delete_guard(
43 4 : tenant,
44 4 : timeline.timeline_id,
45 4 : TimelineDeleteGuardKind::Offload,
46 4 : );
47 0 : if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res {
48 0 : let is_archived = timeline.is_archived();
49 0 : if is_archived == Some(true) {
50 0 : tracing::error!("timeline is archived but has non-archived children: {children:?}");
51 0 : return Err(OffloadError::NotArchived);
52 0 : }
53 0 : tracing::info!(
54 : ?is_archived,
55 0 : "timeline is not archived and has unarchived children"
56 : );
57 0 : return Err(OffloadError::NotArchived);
58 4 : };
59 4 : let (timeline, guard) =
60 4 : delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
61 :
62 4 : let TimelineOrOffloaded::Timeline(timeline) = timeline else {
63 0 : tracing::error!("timeline already offloaded, but given timeline object");
64 0 : return Ok(());
65 : };
66 :
67 4 : match timeline.remote_client.shutdown_if_archived().await {
68 4 : Ok(()) => {}
69 0 : Err(ShutdownIfArchivedError::NotInitialized(_)) => {
70 0 : // Either the timeline is being deleted, the operation is being retried, or we are shutting down.
71 0 : // Don't return cancelled here to keep it idempotent.
72 0 : }
73 0 : Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
74 : }
75 4 : timeline.set_state(TimelineState::Stopping);
76 4 :
77 4 : // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
78 4 : timeline.shutdown(super::ShutdownMode::Reload).await;
79 :
80 : // TODO extend guard mechanism above with method
81 : // to make deletions possible while offloading is in progress
82 :
83 4 : let conf = &tenant.conf;
84 4 : delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;
85 :
86 4 : let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);
87 4 :
88 4 : {
89 4 : let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
90 4 : if matches!(
91 4 : tenant.current_state(),
92 : TenantState::Stopping { .. } | TenantState::Broken { .. }
93 : ) {
94 : // Cancel the operation if the tenant is shutting down. Do this while the
95 : // timelines_offloaded lock is held to prevent a race with Tenant::shutdown
96 : // for defusing the lock
97 0 : return Err(OffloadError::Cancelled);
98 4 : }
99 4 : offloaded_timelines.insert(
100 4 : timeline.timeline_id,
101 4 : Arc::new(
102 4 : OffloadedTimeline::from_timeline(&timeline)
103 4 : .expect("we checked above that timeline was ready"),
104 4 : ),
105 4 : );
106 4 : }
107 4 :
108 4 : // Last step: mark timeline as offloaded in S3
109 4 : // TODO: maybe move this step above, right above deletion of the local timeline directory,
110 4 : // then there is no potential race condition where we partially offload a timeline, and
111 4 : // at the next restart attach it again.
112 4 : // For that to happen, we'd need to make the manifest reflect our *intended* state,
113 4 : // not our actual state of offloaded timelines.
114 4 : tenant.store_tenant_manifest().await?;
115 :
116 4 : tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
117 :
118 4 : Ok(())
119 4 : }
120 :
121 : /// It is important that this gets called when DeletionGuard is being held.
122 : /// For more context see comments in [`make_timeline_delete_guard`]
123 : ///
124 : /// Returns the strong count of the timeline `Arc`
125 4 : fn remove_timeline_from_tenant(
126 4 : tenant: &Tenant,
127 4 : timeline: &Timeline,
128 4 : _: &DeletionGuard, // using it as a witness
129 4 : ) -> usize {
130 4 : // Remove the timeline from the map.
131 4 : let mut timelines = tenant.timelines.lock().unwrap();
132 4 : let children_exist = timelines
133 4 : .iter()
134 8 : .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
135 4 : // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
136 4 : // We already deleted the layer files, so it's probably best to panic.
137 4 : // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
138 4 : if children_exist {
139 0 : panic!("Timeline grew children while we removed layer files");
140 4 : }
141 4 :
142 4 : let timeline = timelines
143 4 : .remove(&timeline.timeline_id)
144 4 : .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
145 4 :
146 4 : Arc::strong_count(&timeline)
147 4 : }
|