Line data Source code
1 : use std::sync::Arc;
2 :
3 : use pageserver_api::models::{TenantState, TimelineState};
4 :
5 : use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
6 : use super::Timeline;
7 : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
8 : use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
9 : use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
10 :
11 : #[derive(thiserror::Error, Debug)]
12 : pub(crate) enum OffloadError {
13 : #[error("Cancelled")]
14 : Cancelled,
15 : #[error("Timeline is not archived")]
16 : NotArchived,
17 : #[error(transparent)]
18 : RemoteStorage(anyhow::Error),
19 : #[error("Unexpected offload error: {0}")]
20 : Other(anyhow::Error),
21 : }
22 :
23 : impl From<TenantManifestError> for OffloadError {
24 0 : fn from(e: TenantManifestError) -> Self {
25 0 : match e {
26 0 : TenantManifestError::Cancelled => Self::Cancelled,
27 0 : TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
28 : }
29 0 : }
30 : }
31 :
32 2 : pub(crate) async fn offload_timeline(
33 2 : tenant: &Tenant,
34 2 : timeline: &Arc<Timeline>,
35 2 : ) -> Result<(), OffloadError> {
36 2 : debug_assert_current_span_has_tenant_and_timeline_id();
37 2 : tracing::info!("offloading archived timeline");
38 :
39 2 : let allow_offloaded_children = true;
40 2 : let set_stopping = false;
41 2 : let (timeline, guard) = DeleteTimelineFlow::prepare(
42 2 : tenant,
43 2 : timeline.timeline_id,
44 2 : allow_offloaded_children,
45 2 : set_stopping,
46 2 : )
47 2 : .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
48 :
49 2 : let TimelineOrOffloaded::Timeline(timeline) = timeline else {
50 0 : tracing::error!("timeline already offloaded, but given timeline object");
51 0 : return Ok(());
52 : };
53 :
54 2 : match timeline.remote_client.shutdown_if_archived().await {
55 2 : Ok(()) => {}
56 0 : Err(ShutdownIfArchivedError::NotInitialized(_)) => {
57 0 : // Either the timeline is being deleted, the operation is being retried, or we are shutting down.
58 0 : // Don't return cancelled here to keep it idempotent.
59 0 : }
60 0 : Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
61 : }
62 2 : timeline.set_state(TimelineState::Stopping);
63 2 :
64 2 : // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
65 2 : timeline.shutdown(super::ShutdownMode::Reload).await;
66 :
67 : // TODO extend guard mechanism above with method
68 : // to make deletions possible while offloading is in progress
69 :
70 2 : let conf = &tenant.conf;
71 2 : delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;
72 :
73 2 : let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);
74 2 :
75 2 : {
76 2 : let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
77 2 : if matches!(
78 2 : tenant.current_state(),
79 : TenantState::Stopping { .. } | TenantState::Broken { .. }
80 : ) {
81 : // Cancel the operation if the tenant is shutting down. Do this while the
82 : // timelines_offloaded lock is held to prevent a race with Tenant::shutdown
83 : // for defusing the lock
84 0 : return Err(OffloadError::Cancelled);
85 2 : }
86 2 : offloaded_timelines.insert(
87 2 : timeline.timeline_id,
88 2 : Arc::new(
89 2 : OffloadedTimeline::from_timeline(&timeline)
90 2 : .expect("we checked above that timeline was ready"),
91 2 : ),
92 2 : );
93 2 : }
94 2 :
95 2 : // Last step: mark timeline as offloaded in S3
96 2 : // TODO: maybe move this step above, right above deletion of the local timeline directory,
97 2 : // then there is no potential race condition where we partially offload a timeline, and
98 2 : // at the next restart attach it again.
99 2 : // For that to happen, we'd need to make the manifest reflect our *intended* state,
100 2 : // not our actual state of offloaded timelines.
101 2 : tenant.store_tenant_manifest().await?;
102 :
103 2 : tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
104 :
105 2 : Ok(())
106 2 : }
107 :
108 : /// It is important that this gets called when DeletionGuard is being held.
109 : /// For more context see comments in [`DeleteTimelineFlow::prepare`]
110 : ///
111 : /// Returns the strong count of the timeline `Arc`
112 2 : fn remove_timeline_from_tenant(
113 2 : tenant: &Tenant,
114 2 : timeline: &Timeline,
115 2 : _: &DeletionGuard, // using it as a witness
116 2 : ) -> usize {
117 2 : // Remove the timeline from the map.
118 2 : let mut timelines = tenant.timelines.lock().unwrap();
119 2 : let children_exist = timelines
120 2 : .iter()
121 4 : .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
122 2 : // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
123 2 : // We already deleted the layer files, so it's probably best to panic.
124 2 : // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
125 2 : if children_exist {
126 0 : panic!("Timeline grew children while we removed layer files");
127 2 : }
128 2 :
129 2 : let timeline = timelines
130 2 : .remove(&timeline.timeline_id)
131 2 : .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
132 2 :
133 2 : Arc::strong_count(&timeline)
134 2 : }
|