LCOV - code coverage report
Current view: top level - pageserver/src/tenant/timeline - offload.rs (source / functions) Coverage Total Hit
Test: 727bdccc1d7d53837da843959afb612f56da4e79.info Lines: 83.1 % 83 69
Test Date: 2025-01-30 15:18:43 Functions: 66.7 % 6 4

            Line data    Source code
       1              : use std::sync::Arc;
       2              : 
       3              : use pageserver_api::models::{TenantState, TimelineState};
       4              : 
       5              : use super::delete::{delete_local_timeline_directory, DeletionGuard};
       6              : use super::Timeline;
       7              : use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
       8              : use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
       9              : use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
      10              : use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
      11              : 
      12              : #[derive(thiserror::Error, Debug)]
      13              : pub(crate) enum OffloadError {
      14              :     #[error("Cancelled")]
      15              :     Cancelled,
      16              :     #[error("Timeline is not archived")]
      17              :     NotArchived,
      18              :     #[error(transparent)]
      19              :     RemoteStorage(anyhow::Error),
      20              :     #[error("Unexpected offload error: {0}")]
      21              :     Other(anyhow::Error),
      22              : }
      23              : 
      24              : impl From<TenantManifestError> for OffloadError {
      25            0 :     fn from(e: TenantManifestError) -> Self {
      26            0 :         match e {
      27            0 :             TenantManifestError::Cancelled => Self::Cancelled,
      28            0 :             TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
      29              :         }
      30            0 :     }
      31              : }
      32              : 
      33            4 : pub(crate) async fn offload_timeline(
      34            4 :     tenant: &Tenant,
      35            4 :     timeline: &Arc<Timeline>,
      36            4 : ) -> Result<(), OffloadError> {
      37            4 :     debug_assert_current_span_has_tenant_and_timeline_id();
      38            4 :     tracing::info!("offloading archived timeline");
      39              : 
      40            4 :     let (timeline, guard) = make_timeline_delete_guard(
      41            4 :         tenant,
      42            4 :         timeline.timeline_id,
      43            4 :         TimelineDeleteGuardKind::Offload,
      44            4 :     )
      45            4 :     .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
      46              : 
      47            4 :     let TimelineOrOffloaded::Timeline(timeline) = timeline else {
      48            0 :         tracing::error!("timeline already offloaded, but given timeline object");
      49            0 :         return Ok(());
      50              :     };
      51              : 
      52            4 :     match timeline.remote_client.shutdown_if_archived().await {
      53            4 :         Ok(()) => {}
      54            0 :         Err(ShutdownIfArchivedError::NotInitialized(_)) => {
      55            0 :             // Either the timeline is being deleted, the operation is being retried, or we are shutting down.
      56            0 :             // Don't return cancelled here to keep it idempotent.
      57            0 :         }
      58            0 :         Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
      59              :     }
      60            4 :     timeline.set_state(TimelineState::Stopping);
      61            4 : 
      62            4 :     // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
      63            4 :     timeline.shutdown(super::ShutdownMode::Reload).await;
      64              : 
      65              :     // TODO extend guard mechanism above with method
      66              :     // to make deletions possible while offloading is in progress
      67              : 
      68            4 :     let conf = &tenant.conf;
      69            4 :     delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;
      70              : 
      71            4 :     let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);
      72            4 : 
      73            4 :     {
      74            4 :         let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
      75            4 :         if matches!(
      76            4 :             tenant.current_state(),
      77              :             TenantState::Stopping { .. } | TenantState::Broken { .. }
      78              :         ) {
      79              :             // Cancel the operation if the tenant is shutting down. Do this while the
      80              :             // timelines_offloaded lock is held to prevent a race with Tenant::shutdown
      81              :             // for defusing the lock
      82            0 :             return Err(OffloadError::Cancelled);
      83            4 :         }
      84            4 :         offloaded_timelines.insert(
      85            4 :             timeline.timeline_id,
      86            4 :             Arc::new(
      87            4 :                 OffloadedTimeline::from_timeline(&timeline)
      88            4 :                     .expect("we checked above that timeline was ready"),
      89            4 :             ),
      90            4 :         );
      91            4 :     }
      92            4 : 
      93            4 :     // Last step: mark timeline as offloaded in S3
      94            4 :     // TODO: maybe move this step above, right above deletion of the local timeline directory,
      95            4 :     // then there is no potential race condition where we partially offload a timeline, and
      96            4 :     // at the next restart attach it again.
      97            4 :     // For that to happen, we'd need to make the manifest reflect our *intended* state,
      98            4 :     // not our actual state of offloaded timelines.
      99            4 :     tenant.store_tenant_manifest().await?;
     100              : 
     101            4 :     tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
     102              : 
     103            4 :     Ok(())
     104            4 : }
     105              : 
     106              : /// It is important that this gets called when DeletionGuard is being held.
     107              : /// For more context see comments in [`make_timeline_delete_guard`]
     108              : ///
     109              : /// Returns the strong count of the timeline `Arc`
     110            4 : fn remove_timeline_from_tenant(
     111            4 :     tenant: &Tenant,
     112            4 :     timeline: &Timeline,
     113            4 :     _: &DeletionGuard, // using it as a witness
     114            4 : ) -> usize {
     115            4 :     // Remove the timeline from the map.
     116            4 :     let mut timelines = tenant.timelines.lock().unwrap();
     117            4 :     let children_exist = timelines
     118            4 :         .iter()
     119            8 :         .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
     120            4 :     // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
     121            4 :     // We already deleted the layer files, so it's probably best to panic.
     122            4 :     // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
     123            4 :     if children_exist {
     124            0 :         panic!("Timeline grew children while we removed layer files");
     125            4 :     }
     126            4 : 
     127            4 :     let timeline = timelines
     128            4 :         .remove(&timeline.timeline_id)
     129            4 :         .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
     130            4 : 
     131            4 :     Arc::strong_count(&timeline)
     132            4 : }
        

Generated by: LCOV version 2.1-beta