Line data Source code
1 : use crate::pageserver_client::PageserverClient;
2 : use crate::persistence::Persistence;
3 : use crate::service;
4 : use hyper::StatusCode;
5 : use pageserver_api::models::{
6 : LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
7 : };
8 : use pageserver_api::shard::{ShardIdentity, TenantShardId};
9 : use pageserver_client::mgmt_api;
10 : use std::collections::HashMap;
11 : use std::sync::Arc;
12 : use std::time::{Duration, Instant};
13 : use tokio_util::sync::CancellationToken;
14 : use utils::generation::Generation;
15 : use utils::id::{NodeId, TimelineId};
16 : use utils::lsn::Lsn;
17 : use utils::sync::gate::GateGuard;
18 :
19 : use crate::compute_hook::{ComputeHook, NotifyError};
20 : use crate::node::Node;
21 : use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
22 :
23 : const DEFAULT_HEATMAP_PERIOD: &str = "60s";
24 :
25 : /// An object with the lifetime of the background reconcile task that is created
26 : /// for tenants whose intent and observed states differ.
27 : pub(super) struct Reconciler {
28 : /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
29 : /// of a tenant's state from when we spawned a reconcile task.
30 : pub(super) tenant_shard_id: TenantShardId,
31 : pub(crate) shard: ShardIdentity,
32 : pub(crate) generation: Option<Generation>,
33 : pub(crate) intent: TargetState,
34 :
35 : /// Nodes not referenced by [`Self::intent`], from which we should try
36 : /// to detach this tenant shard.
37 : pub(crate) detach: Vec<Node>,
38 :
39 : pub(crate) config: TenantConfig,
40 : pub(crate) observed: ObservedState,
41 :
42 : pub(crate) service_config: service::Config,
43 :
44 : /// A hook to notify the running postgres instances when we change the location
45 : /// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag
46 : /// and guarantee eventual retries.
47 : pub(crate) compute_hook: Arc<ComputeHook>,
48 :
49 : /// To avoid stalling if the cloud control plane is unavailable, we may proceed
50 : /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
51 : /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
52 : pub(crate) compute_notify_failure: bool,
53 :
54 : /// A means to abort background reconciliation: it is essential to
55 : /// call this when something changes in the original TenantShard that
56 : /// will make this reconciliation impossible or unnecessary, for
57 : /// example when a pageserver node goes offline, or the PlacementPolicy for
58 : /// the tenant is changed.
59 : pub(crate) cancel: CancellationToken,
60 :
61 : /// Reconcilers are registered with a Gate so that during a graceful shutdown we
62 : /// can wait for all the reconcilers to respond to their cancellation tokens.
63 : pub(crate) _gate_guard: GateGuard,
64 :
65 : /// Access to persistent storage for updating generation numbers
66 : pub(crate) persistence: Arc<Persistence>,
67 : }
68 :
69 : /// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
70 : /// reference counting for the scheduler. The IntentState is what the scheduler works with,
71 : /// and the TargetState is just the instruction for a particular Reconciler run.
72 : #[derive(Debug)]
73 : pub(crate) struct TargetState {
74 : pub(crate) attached: Option<Node>,
75 : pub(crate) secondary: Vec<Node>,
76 : }
77 :
78 : impl TargetState {
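/// Resolve the node ids in an [`IntentState`] into concrete [`Node`] objects. Panics if the
/// intent references a node id that is not present in `nodes`.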
79 0 : pub(crate) fn from_intent(nodes: &HashMap<NodeId, Node>, intent: &IntentState) -> Self {
80 0 : Self {
81 0 : attached: intent.get_attached().map(|n| {
82 0 : nodes
83 0 : .get(&n)
84 0 : .expect("Intent attached referenced non-existent node")
85 0 : .clone()
86 0 : }),
87 0 : secondary: intent
88 0 : .get_secondary()
89 0 : .iter()
90 0 : .map(|n| {
91 0 : nodes
92 0 : .get(n)
93 0 : .expect("Intent secondary referenced non-existent node")
94 0 : .clone()
95 0 : })
96 0 : .collect(),
97 0 : }
98 0 : }
99 : }
100 :
101 0 : #[derive(thiserror::Error, Debug)]
102 : pub(crate) enum ReconcileError {
103 : #[error(transparent)]
104 : Remote(#[from] mgmt_api::Error),
105 : #[error(transparent)]
106 : Notify(#[from] NotifyError),
107 : #[error("Cancelled")]
108 : Cancel,
109 : #[error(transparent)]
110 : Other(#[from] anyhow::Error),
111 : }
112 :
113 : impl Reconciler {
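/// Apply `config` to this tenant shard on `node` via the pageserver management API (with
/// retries), keeping [`Self::observed`] in sync: the node's entry is set to `None` before
/// the request (its state is uncertain while the call is in flight), then updated to the
/// written config on success, or removed entirely if the mode was `Detached`.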
114 0 : async fn location_config(
115 0 : &mut self,
116 0 : node: &Node,
117 0 : config: LocationConfig,
118 0 : flush_ms: Option<Duration>,
119 0 : lazy: bool,
120 0 : ) -> Result<(), ReconcileError> {
121 0 : if !node.is_available() && config.mode == LocationConfigMode::Detached {
122 : // Attempts to detach from offline nodes may be simulated without doing any I/O: a node which is offline
123 : // will be fully reconciled with the shard's intent state when it is reactivated, irrespective of
124 : // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`].
125 0 : tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
126 0 : self.observed.locations.remove(&node.get_id());
127 0 : return Ok(());
128 0 : }
129 0 :
130 0 : self.observed
131 0 : .locations
132 0 : .insert(node.get_id(), ObservedStateLocation { conf: None });
133 0 :
134 0 : // TODO: amend locations that use long-polling: they will hit this timeout.
135 0 : let timeout = Duration::from_secs(25);
136 0 :
137 0 : tracing::info!("location_config({node}) calling: {:?}", config);
138 0 : let tenant_shard_id = self.tenant_shard_id;
139 0 : let config_ref = &config;
140 0 : match node
141 0 : .with_client_retries(
142 0 : |client| async move {
143 0 : let config = config_ref.clone();
144 0 : client
145 0 : .location_config(tenant_shard_id, config.clone(), flush_ms, lazy)
146 0 : .await
147 0 : },
148 0 : &self.service_config.jwt_token,
149 0 : 1,
150 0 : 3,
151 0 : timeout,
152 0 : &self.cancel,
153 0 : )
154 0 : .await
155 : {
156 0 : Some(Ok(_)) => {}
157 0 : Some(Err(e)) => return Err(e.into()),
158 0 : None => return Err(ReconcileError::Cancel),
159 : };
160 0 : tracing::info!("location_config({node}) complete: {:?}", config);
161 :
162 0 : match config.mode {
163 0 : LocationConfigMode::Detached => {
164 0 : self.observed.locations.remove(&node.get_id());
165 0 : }
166 0 : _ => {
167 0 : self.observed
168 0 : .locations
169 0 : .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
170 0 : }
171 : }
172 :
173 0 : Ok(())
174 0 : }
175 :
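/// Find a node known to this reconciler by id, searching the intended attached location,
/// the intended secondaries, and the nodes scheduled for detach.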
176 0 : fn get_node(&self, node_id: &NodeId) -> Option<&Node> {
177 0 : if let Some(node) = self.intent.attached.as_ref() {
178 0 : if node.get_id() == *node_id {
179 0 : return Some(node);
180 0 : }
181 0 : }
182 :
183 0 : if let Some(node) = self
184 0 : .intent
185 0 : .secondary
186 0 : .iter()
187 0 : .find(|n| n.get_id() == *node_id)
188 : {
189 0 : return Some(node);
190 0 : }
191 :
192 0 : if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) {
193 0 : return Some(node);
194 0 : }
195 0 :
196 0 : None
197 0 : }
198 :
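/// Decide whether to perform a live migration: we need an intended attached location that is
/// not already attached there (it is observed as a secondary, or has no observed state), and
/// some other available node currently observed as `AttachedSingle` to act as the origin.
/// If either side is missing, fall through to general-case reconciliation.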
199 0 : async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
200 0 : let destination = if let Some(node) = &self.intent.attached {
201 0 : match self.observed.locations.get(&node.get_id()) {
202 0 : Some(conf) => {
203 : // We will do a live migration only if the intended destination is not
204 : // currently in an attached state.
205 0 : match &conf.conf {
206 0 : Some(conf) if conf.mode == LocationConfigMode::Secondary => {
207 0 : // Fall through to do a live migration
208 0 : node
209 : }
210 : None | Some(_) => {
211 : // Attached or uncertain: don't do a live migration, proceed
212 : // with a general-case reconciliation
213 0 : tracing::info!("maybe_live_migrate: destination is None or attached");
214 0 : return Ok(());
215 : }
216 : }
217 : }
218 : None => {
219 : // Our destination is not attached: maybe live migrate if some other
220 : // node is currently attached. Fall through.
221 0 : node
222 : }
223 : }
224 : } else {
225 : // No intent to be attached
226 0 : tracing::info!("maybe_live_migrate: no attached intent");
227 0 : return Ok(());
228 : };
229 :
230 0 : let mut origin = None;
231 0 : for (node_id, state) in &self.observed.locations {
232 0 : if let Some(observed_conf) = &state.conf {
233 0 : if observed_conf.mode == LocationConfigMode::AttachedSingle {
234 : // We will only attempt live migration if the origin is not offline: this
235 : // avoids trying to do it while reconciling after responding to an HA failover.
236 0 : if let Some(node) = self.get_node(node_id) {
237 0 : if node.is_available() {
238 0 : origin = Some(node.clone());
239 0 : break;
240 0 : }
241 0 : }
242 0 : }
243 0 : }
244 : }
245 :
246 0 : let Some(origin) = origin else {
247 0 : tracing::info!("maybe_live_migrate: no origin found");
248 0 : return Ok(());
249 : };
250 :
251 : // We have an origin and a destination: proceed to do the live migration
252 0 : tracing::info!("Live migrating {}->{}", origin, destination);
253 0 : self.live_migrate(origin, destination.clone()).await?;
254 :
255 0 : Ok(())
256 0 : }
257 :
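/// Fetch the last-record LSN of every timeline in this tenant shard on `node`. Used to
/// capture a baseline on the origin and to check the destination's progress during a
/// live migration.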
258 0 : async fn get_lsns(
259 0 : &self,
260 0 : tenant_shard_id: TenantShardId,
261 0 : node: &Node,
262 0 : ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
263 0 : let client = PageserverClient::new(
264 0 : node.get_id(),
265 0 : node.base_url(),
266 0 : self.service_config.jwt_token.as_deref(),
267 0 : );
268 :
269 0 : let timelines = client.timeline_list(&tenant_shard_id).await?;
270 0 : Ok(timelines
271 0 : .into_iter()
272 0 : .map(|t| (t.timeline_id, t.last_record_lsn))
273 0 : .collect())
274 0 : }
275 :
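/// Warm up a secondary location ahead of attaching to it: repeatedly long-poll the
/// pageserver's secondary download API until it reports completion, the total time budget
/// is exceeded, or a request fails. Timeouts and failures are tolerated: an incompletely
/// warmed destination only degrades performance, it does not block the migration.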
276 0 : async fn secondary_download(
277 0 : &self,
278 0 : tenant_shard_id: TenantShardId,
279 0 : node: &Node,
280 0 : ) -> Result<(), ReconcileError> {
281 0 : // This is not the timeout for a request, but the total amount of time we're willing to wait
282 0 : // for a secondary location to get up to date before
283 0 : const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
284 0 :
285 0 : // This is the long-polling interval for the secondary download requests we send to the destination pageserver
286 0 : // during a migration.
287 0 : const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
288 0 :
289 0 : let started_at = Instant::now();
290 :
291 : loop {
292 0 : let (status, progress) = match node
293 0 : .with_client_retries(
294 0 : |client| async move {
295 0 : client
296 0 : .tenant_secondary_download(
297 0 : tenant_shard_id,
298 0 : Some(REQUEST_DOWNLOAD_TIMEOUT),
299 0 : )
300 0 : .await
301 0 : },
302 0 : &self.service_config.jwt_token,
303 0 : 1,
304 0 : 3,
305 0 : REQUEST_DOWNLOAD_TIMEOUT * 2,
306 0 : &self.cancel,
307 0 : )
308 0 : .await
309 : {
310 0 : None => Err(ReconcileError::Cancel),
311 0 : Some(Ok(v)) => Ok(v),
312 0 : Some(Err(e)) => {
313 0 : // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
314 0 : // attaching, but we should not let an issue with a secondary location stop us from proceeding
315 0 : // with a live migration.
316 0 : tracing::warn!("Failed to prepare by downloading layers on node {node}: {e}");
317 0 : return Ok(());
318 : }
319 0 : }?;
320 :
321 0 : if status == StatusCode::OK {
322 0 : tracing::info!(
323 0 : "Downloads to {} complete: {}/{} layers, {}/{} bytes",
324 0 : node,
325 0 : progress.layers_downloaded,
326 0 : progress.layers_total,
327 0 : progress.bytes_downloaded,
328 0 : progress.bytes_total
329 0 : );
330 0 : return Ok(());
331 0 : } else if status == StatusCode::ACCEPTED {
332 0 : let total_runtime = started_at.elapsed();
333 0 : if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
334 0 : tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes",
335 0 : total_runtime.as_millis(),
336 0 : progress.layers_downloaded,
337 0 : progress.layers_total,
338 0 : progress.bytes_downloaded,
339 0 : progress.bytes_total
340 0 : );
341 : // Give up, but proceed: an incompletely warmed destination doesn't prevent the migration from working,
342 : // it just makes I/O performance worse for users.
343 0 : return Ok(());
344 0 : }
345 0 :
346 0 : // Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call
347 0 : // to the pageserver is a long-poll.
348 0 : tracing::info!(
349 0 : "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
350 0 : node,
351 0 : progress.layers_downloaded,
352 0 : progress.layers_total,
353 0 : progress.bytes_downloaded,
354 0 : progress.bytes_total
355 0 : );
356 0 : }
357 : }
358 0 : }
359 :
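/// Poll `node` until every timeline's last-record LSN is at least the corresponding
/// `baseline` LSN captured from the origin. Timelines missing on the destination count
/// as behind.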
360 0 : async fn await_lsn(
361 0 : &self,
362 0 : tenant_shard_id: TenantShardId,
363 0 : node: &Node,
364 0 : baseline: HashMap<TimelineId, Lsn>,
365 0 : ) -> anyhow::Result<()> {
366 : loop {
367 0 : let latest = match self.get_lsns(tenant_shard_id, node).await {
368 0 : Ok(l) => l,
369 0 : Err(e) => {
370 0 : tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",);
371 0 : tokio::time::sleep(Duration::from_millis(500)).await;
372 0 : continue;
373 : }
374 : };
375 :
376 0 : let mut any_behind: bool = false;
377 0 : for (timeline_id, baseline_lsn) in &baseline {
378 0 : match latest.get(timeline_id) {
379 0 : Some(latest_lsn) => {
380 0 : tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
381 0 : if latest_lsn < baseline_lsn {
382 0 : any_behind = true;
383 0 : }
384 : }
385 0 : None => {
386 0 : // Expected timeline isn't yet visible on migration destination.
387 0 : // (IRL we would have to account for timeline deletion, but this
388 0 : // is just test helper)
389 0 : any_behind = true;
390 0 : }
391 : }
392 : }
393 :
394 0 : if !any_behind {
395 0 : tracing::info!("✅ LSN caught up. Proceeding...");
396 0 : break;
397 0 : } else {
398 0 : tokio::time::sleep(Duration::from_millis(500)).await;
399 0 : }
400 : }
401 :
402 0 : Ok(())
403 0 : }
404 :
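/// Perform a live migration of this tenant shard from `origin_ps` to `dest_ps`:
/// 1. Put the origin into `AttachedStale` mode and capture its timelines' LSNs as a baseline.
/// 2. If the destination already holds a secondary location, warm it up by downloading layers.
/// 3. Increment the generation and attach the destination in `AttachedMulti` mode.
/// 4. Wait for the destination's LSNs to catch up with the origin's baseline.
/// 5. Notify compute of the new location, retrying until it succeeds.
/// 6. Downgrade the origin to `Secondary`, then switch the destination to `AttachedSingle`.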
405 0 : pub async fn live_migrate(
406 0 : &mut self,
407 0 : origin_ps: Node,
408 0 : dest_ps: Node,
409 0 : ) -> Result<(), ReconcileError> {
410 0 : // `maybe_live_migrate` is responsible for sanity-checking the inputs
411 0 : assert!(origin_ps.get_id() != dest_ps.get_id());
412 :
413 0 : fn build_location_config(
414 0 : shard: &ShardIdentity,
415 0 : config: &TenantConfig,
416 0 : mode: LocationConfigMode,
417 0 : generation: Option<Generation>,
418 0 : secondary_conf: Option<LocationConfigSecondary>,
419 0 : ) -> LocationConfig {
420 0 : LocationConfig {
421 0 : mode,
422 0 : generation: generation.map(|g| g.into().unwrap()),
423 0 : secondary_conf,
424 0 : tenant_conf: config.clone(),
425 0 : shard_number: shard.number.0,
426 0 : shard_count: shard.count.literal(),
427 0 : shard_stripe_size: shard.stripe_size.0,
428 0 : }
429 0 : }
430 :
431 0 : tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",);
432 :
433 : // FIXME: it is incorrect to use self.generation here, we should use the generation
434 : // from the ObservedState of the origin pageserver (it might be older than self.generation)
435 0 : let stale_conf = build_location_config(
436 0 : &self.shard,
437 0 : &self.config,
438 0 : LocationConfigMode::AttachedStale,
439 0 : self.generation,
440 0 : None,
441 0 : );
442 0 : self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false)
443 0 : .await?;
444 :
445 0 : let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?);
446 :
447 : // If we are migrating to a destination that has a secondary location, warm it up first
448 0 : if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) {
449 0 : if let Some(destination_conf) = &destination_conf.conf {
450 0 : if destination_conf.mode == LocationConfigMode::Secondary {
451 0 : tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",);
452 0 : self.secondary_download(self.tenant_shard_id, &dest_ps)
453 0 : .await?;
454 0 : }
455 0 : }
456 0 : }
457 :
458 : // Increment generation before attaching to new pageserver
459 : self.generation = Some(
460 0 : self.persistence
461 0 : .increment_generation(self.tenant_shard_id, dest_ps.get_id())
462 0 : .await?,
463 : );
464 :
465 0 : let dest_conf = build_location_config(
466 0 : &self.shard,
467 0 : &self.config,
468 0 : LocationConfigMode::AttachedMulti,
469 0 : self.generation,
470 0 : None,
471 0 : );
472 0 :
473 0 : tracing::info!("🔁 Attaching to pageserver {dest_ps}");
474 0 : self.location_config(&dest_ps, dest_conf, None, false)
475 0 : .await?;
476 :
477 0 : if let Some(baseline) = baseline_lsns {
478 0 : tracing::info!("🕑 Waiting for LSN to catch up...");
479 0 : self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
480 0 : .await?;
481 0 : }
482 :
483 0 : tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");
484 :
485 : // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
486 : // the origin without notifying compute, we will render the tenant unavailable.
487 0 : while let Err(e) = self.compute_notify().await {
488 0 : match e {
489 0 : NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
490 0 : NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
491 : _ => {
492 0 : tracing::warn!(
493 0 : "Live migration blocked by compute notification error, retrying: {e}"
494 0 : );
495 : }
496 : }
497 : }
498 :
499 : // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
500 : // this location will be deleted in the general case reconciliation that runs after this.
501 0 : let origin_secondary_conf = build_location_config(
502 0 : &self.shard,
503 0 : &self.config,
504 0 : LocationConfigMode::Secondary,
505 0 : None,
506 0 : Some(LocationConfigSecondary { warm: true }),
507 0 : );
508 0 : self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false)
509 0 : .await?;
510 : // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
511 : // partway through. In fact, all location conf API calls should be in a wrapper that sets
512 : // the observed state to None, then runs, then sets it to what we wrote.
513 0 : self.observed.locations.insert(
514 0 : origin_ps.get_id(),
515 0 : ObservedStateLocation {
516 0 : conf: Some(origin_secondary_conf),
517 0 : },
518 0 : );
519 0 :
520 0 : tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",);
521 0 : let dest_final_conf = build_location_config(
522 0 : &self.shard,
523 0 : &self.config,
524 0 : LocationConfigMode::AttachedSingle,
525 0 : self.generation,
526 0 : None,
527 0 : );
528 0 : self.location_config(&dest_ps, dest_final_conf.clone(), None, false)
529 0 : .await?;
530 0 : self.observed.locations.insert(
531 0 : dest_ps.get_id(),
532 0 : ObservedStateLocation {
533 0 : conf: Some(dest_final_conf),
534 0 : },
535 0 : );
536 0 :
537 0 : tracing::info!("✅ Migration complete");
538 :
539 0 : Ok(())
540 0 : }
541 :
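/// If the intended attached location has an uncertain (`None`) observed state, query the
/// pageserver for its current location config before reconciling: this avoids spurious
/// generation increments and needless destruction of the pageserver's `Timeline` objects.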
542 0 : async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> {
543 : // If the attached node has uncertain state, read it from the pageserver before proceeding: this
544 : // is important to avoid spurious generation increments.
545 : //
546 : // We don't need to do this for secondary/detach locations because it's harmless to just PUT their
547 : // location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate
548 : // the `Timeline` object in the pageserver.
549 :
550 0 : let Some(attached_node) = self.intent.attached.as_ref() else {
551 : // Nothing to do
552 0 : return Ok(());
553 : };
554 :
555 0 : if matches!(
556 0 : self.observed.locations.get(&attached_node.get_id()),
557 : Some(ObservedStateLocation { conf: None })
558 : ) {
559 0 : let tenant_shard_id = self.tenant_shard_id;
560 0 : let observed_conf = match attached_node
561 0 : .with_client_retries(
562 0 : |client| async move { client.get_location_config(tenant_shard_id).await },
563 0 : &self.service_config.jwt_token,
564 0 : 1,
565 0 : 1,
566 0 : Duration::from_secs(5),
567 0 : &self.cancel,
568 0 : )
569 0 : .await
570 : {
571 0 : Some(Ok(observed)) => Some(observed),
572 0 : Some(Err(mgmt_api::Error::ApiError(status, _msg)))
573 0 : if status == StatusCode::NOT_FOUND =>
574 0 : {
575 0 : None
576 : }
577 0 : Some(Err(e)) => return Err(e.into()),
578 0 : None => return Err(ReconcileError::Cancel),
579 : };
580 0 : tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
581 0 : match observed_conf {
582 0 : Some(conf) => {
583 0 : // Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state,
584 0 : // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
585 0 : self.observed
586 0 : .locations
587 0 : .insert(attached_node.get_id(), ObservedStateLocation { conf });
588 0 : }
589 0 : None => {
590 0 : // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
591 0 : self.observed.locations.remove(&attached_node.get_id());
592 0 : }
593 : }
594 0 : }
595 :
596 0 : Ok(())
597 0 : }
598 :
599 : /// Reconciling a tenant makes API calls to pageservers until the observed state
600 : /// matches the intended state.
601 : ///
602 : /// First we apply special case handling (e.g. for live migrations), and then a
603 : /// general case reconciliation where we walk through the intent by pageserver
604 : /// and call out to the pageserver to apply the desired state.
605 0 : pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
606 0 : // Prepare: if we have uncertain `observed` state for our would-be attachment location, then refresh it
607 0 : self.maybe_refresh_observed().await?;
608 :
609 : // Special case: live migration
610 0 : self.maybe_live_migrate().await?;
611 :
612 : // If the attached pageserver is not attached, do so now.
613 0 : if let Some(node) = self.intent.attached.as_ref() {
614 : // If we are in an attached policy, then generation must have been set (null generations
615 : // are only present when a tenant is initially loaded with a secondary policy)
616 0 : debug_assert!(self.generation.is_some());
617 0 : let Some(generation) = self.generation else {
618 0 : return Err(ReconcileError::Other(anyhow::anyhow!(
619 0 : "Attempted to attach with NULL generation"
620 0 : )));
621 : };
622 :
623 0 : let mut wanted_conf = attached_location_conf(
624 0 : generation,
625 0 : &self.shard,
626 0 : &self.config,
627 0 : !self.intent.secondary.is_empty(),
628 0 : );
629 0 : match self.observed.locations.get(&node.get_id()) {
630 0 : Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
631 0 : // Nothing to do
632 0 : tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
633 : }
634 0 : observed => {
635 : // In all cases other than a matching observed configuration, we will
636 : // reconcile this location. This includes locations with different configurations, as well
637 : // as locations with unknown (None) observed state.
638 :
639 : // The general case is to increment the generation. However, there are cases
640 : // where this is not necessary:
641 : // - if we are only updating the TenantConf part of the location
642 : // - if we are only changing the attachment mode (e.g. going to AttachedMulti or AttachedStale)
643 : // and the location was already in the correct generation
644 0 : let increment_generation = match observed {
645 0 : None => true,
646 0 : Some(ObservedStateLocation { conf: None }) => true,
647 : Some(ObservedStateLocation {
648 0 : conf: Some(observed),
649 0 : }) => {
650 0 : let generations_match = observed.generation == wanted_conf.generation;
651 :
652 : use LocationConfigMode::*;
653 0 : let mode_transition_requires_gen_inc =
654 0 : match (observed.mode, wanted_conf.mode) {
655 : // Usually the short-lived attachment modes (multi and stale) are only used
656 : // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
657 : // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
658 0 : (AttachedSingle, AttachedStale) => false,
659 0 : (AttachedMulti, AttachedSingle) => false,
660 0 : (lhs, rhs) => lhs != rhs,
661 : };
662 :
663 0 : !generations_match || mode_transition_requires_gen_inc
664 : }
665 : };
666 :
667 0 : if increment_generation {
668 0 : let generation = self
669 0 : .persistence
670 0 : .increment_generation(self.tenant_shard_id, node.get_id())
671 0 : .await?;
672 0 : self.generation = Some(generation);
673 0 : wanted_conf.generation = generation.into();
674 0 : }
675 0 : tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
676 :
677 : // Because `node` comes from a ref to &self, clone it before calling into a &mut self
678 : // function: this could be avoided by refactoring the state mutated by location_config into
679 : // a separate type to Self.
680 0 : let node = node.clone();
681 0 :
682 0 : // Use lazy=true, because we may run many of Self concurrently, and do not want to
683 0 : // overload the pageserver with logical size calculations.
684 0 : self.location_config(&node, wanted_conf, None, true).await?;
685 0 : self.compute_notify().await?;
686 : }
687 : }
688 0 : }
689 :
690 : // Configure secondary locations: if these were previously attached this
691 : // implicitly downgrades them from attached to secondary.
692 0 : let mut changes = Vec::new();
693 0 : for node in &self.intent.secondary {
694 0 : let wanted_conf = secondary_location_conf(&self.shard, &self.config);
695 0 : match self.observed.locations.get(&node.get_id()) {
696 0 : Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
697 0 : // Nothing to do
698 0 : tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
699 : }
700 : _ => {
701 : // In all cases other than a matching observed configuration, we will
702 : // reconcile this location.
703 0 : tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
704 0 : changes.push((node.clone(), wanted_conf))
705 : }
706 : }
707 : }
708 :
709 : // Detach any extraneous pageservers that are no longer referenced
710 : // by our intent.
711 0 : for node in &self.detach {
712 0 : changes.push((
713 0 : node.clone(),
714 0 : LocationConfig {
715 0 : mode: LocationConfigMode::Detached,
716 0 : generation: None,
717 0 : secondary_conf: None,
718 0 : shard_number: self.shard.number.0,
719 0 : shard_count: self.shard.count.literal(),
720 0 : shard_stripe_size: self.shard.stripe_size.0,
721 0 : tenant_conf: self.config.clone(),
722 0 : },
723 0 : ));
724 0 : }
725 :
726 0 : for (node, conf) in changes {
727 0 : if self.cancel.is_cancelled() {
728 0 : return Err(ReconcileError::Cancel);
729 0 : }
730 0 : self.location_config(&node, conf, None, false).await?;
731 : }
732 :
733 0 : Ok(())
734 0 : }
735 :
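/// Notify the compute hook of the intended attached pageserver, if any. On failure we set
/// [`Self::compute_notify_failure`] so the shard will retry the notification later, rather
/// than letting control plane unavailability block reconciliation.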
736 0 : pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> {
737 : // Whenever a particular Reconciler emits a notification, it is always notifying for the intended
738 : // destination.
739 0 : if let Some(node) = &self.intent.attached {
740 0 : let result = self
741 0 : .compute_hook
742 0 : .notify(
743 0 : self.tenant_shard_id,
744 0 : node.get_id(),
745 0 : self.shard.stripe_size,
746 0 : &self.cancel,
747 0 : )
748 0 : .await;
749 0 : if let Err(e) = &result {
750 : // It is up to the caller whether they want to drop out on this error, but they don't have to:
751 : // in general we should avoid letting unavailability of the cloud control plane stop us from
752 : // making progress.
753 0 : tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
754 : // Set this flag so that in our ReconcileResult we will set the flag on the shard that it
755 : // needs to retry at some point.
756 0 : self.compute_notify_failure = true;
757 0 : }
758 0 : result
759 : } else {
760 0 : Ok(())
761 : }
762 0 : }
763 : }
764 :
765 : /// We tweak the externally-set TenantConfig while configuring
766 : /// locations, using our awareness of whether secondary locations
767 : /// are in use to automatically enable/disable heatmap uploads.
768 0 : fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
769 0 : let mut config = config.clone();
770 0 : if has_secondaries {
771 0 : if config.heatmap_period.is_none() {
772 0 : config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
773 0 : }
774 0 : } else {
775 0 : config.heatmap_period = None;
776 0 : }
777 0 : config
778 0 : }
779 :
780 0 : pub(crate) fn attached_location_conf(
781 0 : generation: Generation,
782 0 : shard: &ShardIdentity,
783 0 : config: &TenantConfig,
784 0 : has_secondaries: bool,
785 0 : ) -> LocationConfig {
786 0 : LocationConfig {
787 0 : mode: LocationConfigMode::AttachedSingle,
788 0 : generation: generation.into(),
789 0 : secondary_conf: None,
790 0 : shard_number: shard.number.0,
791 0 : shard_count: shard.count.literal(),
792 0 : shard_stripe_size: shard.stripe_size.0,
793 0 : tenant_conf: ha_aware_config(config, has_secondaries),
794 0 : }
795 0 : }
796 :
797 0 : pub(crate) fn secondary_location_conf(
798 0 : shard: &ShardIdentity,
799 0 : config: &TenantConfig,
800 0 : ) -> LocationConfig {
801 0 : LocationConfig {
802 0 : mode: LocationConfigMode::Secondary,
803 0 : generation: None,
804 0 : secondary_conf: Some(LocationConfigSecondary { warm: true }),
805 0 : shard_number: shard.number.0,
806 0 : shard_count: shard.count.literal(),
807 0 : shard_stripe_size: shard.stripe_size.0,
808 0 : tenant_conf: ha_aware_config(config, true),
809 0 : }
810 0 : }
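
// A minimal illustrative sketch (not part of the original file): it exercises the config
// helpers above to document their expected behaviour. It assumes `TenantConfig::default()`,
// `ShardIdentity::unsharded()` and `Generation::new()` are available as used elsewhere in
// this crate; treat it as documentation rather than authoritative test coverage.
#[cfg(test)]
mod conf_helper_sketch {
    use super::*;

    #[test]
    fn heatmap_follows_secondaries() {
        // With secondaries present, a default config gains the default heatmap period...
        let with_secondaries = ha_aware_config(&TenantConfig::default(), true);
        assert_eq!(
            with_secondaries.heatmap_period.as_deref(),
            Some(DEFAULT_HEATMAP_PERIOD)
        );

        // ...and without secondaries, heatmap uploads are disabled.
        let without_secondaries = ha_aware_config(&TenantConfig::default(), false);
        assert!(without_secondaries.heatmap_period.is_none());
    }

    #[test]
    fn location_conf_modes() {
        let shard = ShardIdentity::unsharded();

        // Attached locations are produced in AttachedSingle mode with no secondary config.
        let attached =
            attached_location_conf(Generation::new(1), &shard, &TenantConfig::default(), true);
        assert!(matches!(attached.mode, LocationConfigMode::AttachedSingle));
        assert!(attached.secondary_conf.is_none());

        // Secondary locations are produced in Secondary mode with a warm secondary config.
        let secondary = secondary_location_conf(&shard, &TenantConfig::default());
        assert!(matches!(secondary.mode, LocationConfigMode::Secondary));
        assert!(secondary.secondary_conf.is_some());
    }
}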