Line data Source code
1 : use std::sync::Arc;
2 :
3 : use pageserver_api::models::TenantState;
4 :
5 : use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
6 : use super::Timeline;
7 : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
8 : use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
9 :
10 : #[derive(thiserror::Error, Debug)]
11 : pub(crate) enum OffloadError {
12 : #[error("Cancelled")]
13 : Cancelled,
14 : #[error("Timeline is not archived")]
15 : NotArchived,
16 : #[error(transparent)]
17 : RemoteStorage(anyhow::Error),
18 : #[error("Unexpected offload error: {0}")]
19 : Other(anyhow::Error),
20 : }
21 :
22 : impl From<TenantManifestError> for OffloadError {
23 0 : fn from(e: TenantManifestError) -> Self {
24 0 : match e {
25 0 : TenantManifestError::Cancelled => Self::Cancelled,
26 0 : TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
27 : }
28 0 : }
29 : }
30 :
31 2 : pub(crate) async fn offload_timeline(
32 2 : tenant: &Tenant,
33 2 : timeline: &Arc<Timeline>,
34 2 : ) -> Result<(), OffloadError> {
35 2 : debug_assert_current_span_has_tenant_and_timeline_id();
36 2 : tracing::info!("offloading archived timeline");
37 :
38 2 : let allow_offloaded_children = true;
39 2 : let (timeline, guard) =
40 2 : DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)
41 2 : .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
42 :
43 2 : let TimelineOrOffloaded::Timeline(timeline) = timeline else {
44 0 : tracing::error!("timeline already offloaded, but given timeline object");
45 0 : return Ok(());
46 : };
47 :
48 2 : let is_archived = timeline.is_archived();
49 2 : match is_archived {
50 2 : Some(true) => (),
51 : Some(false) => {
52 0 : tracing::warn!("tried offloading a non-archived timeline");
53 0 : return Err(OffloadError::NotArchived);
54 : }
55 : None => {
56 : // This is legal: calls to this function can race with the timeline shutting down
57 0 : tracing::info!("tried offloading a timeline whose remote storage is not initialized");
58 0 : return Err(OffloadError::Cancelled);
59 : }
60 : }
61 :
62 : // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
63 2 : timeline.shutdown(super::ShutdownMode::Reload).await;
64 :
65 : // TODO extend guard mechanism above with method
66 : // to make deletions possible while offloading is in progress
67 :
68 2 : let conf = &tenant.conf;
69 2 : delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;
70 :
71 2 : let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);
72 2 :
73 2 : {
74 2 : let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
75 2 : if matches!(
76 2 : tenant.current_state(),
77 : TenantState::Stopping { .. } | TenantState::Broken { .. }
78 : ) {
79 : // Cancel the operation if the tenant is shutting down. Do this while the
80 : // timelines_offloaded lock is held to prevent a race with Tenant::shutdown
81 : // for defusing the lock
82 0 : return Err(OffloadError::Cancelled);
83 2 : }
84 2 : offloaded_timelines.insert(
85 2 : timeline.timeline_id,
86 2 : Arc::new(
87 2 : OffloadedTimeline::from_timeline(&timeline)
88 2 : .expect("we checked above that timeline was ready"),
89 2 : ),
90 2 : );
91 2 : }
92 2 :
93 2 : // Last step: mark timeline as offloaded in S3
94 2 : // TODO: maybe move this step above, right above deletion of the local timeline directory,
95 2 : // then there is no potential race condition where we partially offload a timeline, and
96 2 : // at the next restart attach it again.
97 2 : // For that to happen, we'd need to make the manifest reflect our *intended* state,
98 2 : // not our actual state of offloaded timelines.
99 2 : tenant.store_tenant_manifest().await?;
100 :
101 2 : tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
102 :
103 2 : Ok(())
104 2 : }
105 :
106 : /// It is important that this gets called when DeletionGuard is being held.
107 : /// For more context see comments in [`DeleteTimelineFlow::prepare`]
108 : ///
109 : /// Returns the strong count of the timeline `Arc`
110 2 : fn remove_timeline_from_tenant(
111 2 : tenant: &Tenant,
112 2 : timeline: &Timeline,
113 2 : _: &DeletionGuard, // using it as a witness
114 2 : ) -> usize {
115 2 : // Remove the timeline from the map.
116 2 : let mut timelines = tenant.timelines.lock().unwrap();
117 2 : let children_exist = timelines
118 2 : .iter()
119 4 : .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
120 2 : // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
121 2 : // We already deleted the layer files, so it's probably best to panic.
122 2 : // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
123 2 : if children_exist {
124 0 : panic!("Timeline grew children while we removed layer files");
125 2 : }
126 2 :
127 2 : let timeline = timelines
128 2 : .remove(&timeline.timeline_id)
129 2 : .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
130 2 :
131 2 : Arc::strong_count(&timeline)
132 2 : }
|