Line data Source code
1 : use std::sync::Arc;
2 :
3 : use pageserver_api::models::{TenantState, TimelineState};
4 :
5 : use super::delete::{delete_local_timeline_directory, DeletionGuard};
6 : use super::Timeline;
7 : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
8 : use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
9 : use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
10 : use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
11 :
12 : #[derive(thiserror::Error, Debug)]
13 : pub(crate) enum OffloadError {
14 : #[error("Cancelled")]
15 : Cancelled,
16 : #[error("Timeline is not archived")]
17 : NotArchived,
18 : #[error(transparent)]
19 : RemoteStorage(anyhow::Error),
20 : #[error("Unexpected offload error: {0}")]
21 : Other(anyhow::Error),
22 : }
23 :
24 : impl From<TenantManifestError> for OffloadError {
25 0 : fn from(e: TenantManifestError) -> Self {
26 0 : match e {
27 0 : TenantManifestError::Cancelled => Self::Cancelled,
28 0 : TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
29 : }
30 0 : }
31 : }
32 :
33 4 : pub(crate) async fn offload_timeline(
34 4 : tenant: &Tenant,
35 4 : timeline: &Arc<Timeline>,
36 4 : ) -> Result<(), OffloadError> {
37 4 : debug_assert_current_span_has_tenant_and_timeline_id();
38 4 : tracing::info!("offloading archived timeline");
39 :
40 4 : let (timeline, guard) = make_timeline_delete_guard(
41 4 : tenant,
42 4 : timeline.timeline_id,
43 4 : TimelineDeleteGuardKind::Offload,
44 4 : )
45 4 : .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
46 :
47 4 : let TimelineOrOffloaded::Timeline(timeline) = timeline else {
48 0 : tracing::error!("timeline already offloaded, but given timeline object");
49 0 : return Ok(());
50 : };
51 :
52 4 : match timeline.remote_client.shutdown_if_archived().await {
53 4 : Ok(()) => {}
54 0 : Err(ShutdownIfArchivedError::NotInitialized(_)) => {
55 0 : // Either the timeline is being deleted, the operation is being retried, or we are shutting down.
56 0 : // Don't return cancelled here to keep it idempotent.
57 0 : }
58 0 : Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
59 : }
60 4 : timeline.set_state(TimelineState::Stopping);
61 4 :
62 4 : // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
63 4 : timeline.shutdown(super::ShutdownMode::Reload).await;
64 :
65 : // TODO extend guard mechanism above with method
66 : // to make deletions possible while offloading is in progress
67 :
68 4 : let conf = &tenant.conf;
69 4 : delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;
70 :
71 4 : let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);
72 4 :
73 4 : {
74 4 : let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
75 4 : if matches!(
76 4 : tenant.current_state(),
77 : TenantState::Stopping { .. } | TenantState::Broken { .. }
78 : ) {
79 : // Cancel the operation if the tenant is shutting down. Do this while the
80 : // timelines_offloaded lock is held to prevent a race with Tenant::shutdown
81 : // for defusing the lock
82 0 : return Err(OffloadError::Cancelled);
83 4 : }
84 4 : offloaded_timelines.insert(
85 4 : timeline.timeline_id,
86 4 : Arc::new(
87 4 : OffloadedTimeline::from_timeline(&timeline)
88 4 : .expect("we checked above that timeline was ready"),
89 4 : ),
90 4 : );
91 4 : }
92 4 :
93 4 : // Last step: mark timeline as offloaded in S3
94 4 : // TODO: maybe move this step above, right above deletion of the local timeline directory,
95 4 : // then there is no potential race condition where we partially offload a timeline, and
96 4 : // at the next restart attach it again.
97 4 : // For that to happen, we'd need to make the manifest reflect our *intended* state,
98 4 : // not our actual state of offloaded timelines.
99 4 : tenant.store_tenant_manifest().await?;
100 :
101 4 : tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
102 :
103 4 : Ok(())
104 4 : }
105 :
106 : /// It is important that this gets called when DeletionGuard is being held.
107 : /// For more context see comments in [`make_timeline_delete_guard`]
108 : ///
109 : /// Returns the strong count of the timeline `Arc`
110 4 : fn remove_timeline_from_tenant(
111 4 : tenant: &Tenant,
112 4 : timeline: &Timeline,
113 4 : _: &DeletionGuard, // using it as a witness
114 4 : ) -> usize {
115 4 : // Remove the timeline from the map.
116 4 : let mut timelines = tenant.timelines.lock().unwrap();
117 4 : let children_exist = timelines
118 4 : .iter()
119 8 : .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
120 4 : // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
121 4 : // We already deleted the layer files, so it's probably best to panic.
122 4 : // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
123 4 : if children_exist {
124 0 : panic!("Timeline grew children while we removed layer files");
125 4 : }
126 4 :
127 4 : let timeline = timelines
128 4 : .remove(&timeline.timeline_id)
129 4 : .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
130 4 :
131 4 : Arc::strong_count(&timeline)
132 4 : }
|