LCOV - code coverage report
Current view: top level - safekeeper/src - recovery.rs (source / functions) Coverage Total Hit
Test: 691a4c28fe7169edd60b367c52d448a0a6605f1f.info Lines: 0.0 % 246 0
Test Date: 2024-05-10 13:18:37 Functions: 0.0 % 18 0

            Line data    Source code
       1              : //! This module implements pulling WAL from peer safekeepers if compute can't
       2              : //! provide it, i.e. safekeeper lags too much.
       3              : 
       4              : use std::time::SystemTime;
       5              : use std::{fmt, pin::pin, sync::Arc};
       6              : 
       7              : use anyhow::{bail, Context};
       8              : use futures::StreamExt;
       9              : use postgres_protocol::message::backend::ReplicationMessage;
      10              : use tokio::sync::mpsc::{channel, Receiver, Sender};
      11              : use tokio::time::timeout;
      12              : use tokio::{
      13              :     select,
      14              :     time::sleep,
      15              :     time::{self, Duration},
      16              : };
      17              : use tokio_postgres::replication::ReplicationStream;
      18              : use tokio_postgres::types::PgLsn;
      19              : use tracing::*;
      20              : use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
      21              : 
      22              : use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
      23              : use crate::safekeeper::{AppendRequest, AppendRequestHeader};
      24              : use crate::{
      25              :     http::routes::TimelineStatus,
      26              :     receive_wal::MSG_QUEUE_SIZE,
      27              :     safekeeper::{
      28              :         AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
      29              :         TermLsn, VoteRequest,
      30              :     },
      31              :     timeline::{PeerInfo, Timeline},
      32              :     SafeKeeperConf,
      33              : };
      34              : 
      35              : /// Entrypoint for per timeline task which always runs, checking whether
      36              : /// recovery for this safekeeper is needed and starting it if so.
      37            0 : #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
      38              : pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
      39              :     info!("started");
      40              :     let mut cancellation_rx = match tli.get_cancellation_rx() {
      41              :         Ok(rx) => rx,
      42              :         Err(_) => {
      43              :             info!("timeline canceled during task start");
      44              :             return;
      45              :         }
      46              :     };
      47              : 
      48              :     select! {
      49              :         _ = recovery_main_loop(tli, conf) => { unreachable!() }
      50              :         _ = cancellation_rx.changed() => {
      51              :             info!("stopped");
      52              :         }
      53              :     }
      54              : }
      55              : 
      56              : /// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
      57              : /// fields to explain the choice.
      58              : #[derive(Debug)]
      59              : pub struct RecoveryNeededInfo {
      60              :     /// my term
      61              :     pub term: Term,
      62              :     /// my last_log_term
      63              :     pub last_log_term: Term,
      64              :     /// my flush_lsn
      65              :     pub flush_lsn: Lsn,
      66              :     /// peers from which we can fetch WAL, for observability.
      67              :     pub peers: Vec<PeerInfo>,
      68              :     /// for observability
      69              :     pub num_streaming_computes: usize,
      70              :     pub donors: Vec<Donor>,
      71              : }
      72              : 
      73              : // Custom to omit not important fields from PeerInfo.
      74              : impl fmt::Display for RecoveryNeededInfo {
      75            0 :     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
      76            0 :         write!(f, "{{")?;
      77            0 :         write!(
      78            0 :             f,
      79            0 :             "term: {}, last_log_term: {}, flush_lsn: {}, peers: {{",
      80            0 :             self.term, self.last_log_term, self.flush_lsn
      81            0 :         )?;
      82            0 :         for p in self.peers.iter() {
      83            0 :             write!(
      84            0 :                 f,
      85            0 :                 "PeerInfo {{ sk_id: {}, term: {}, last_log_term: {}, flush_lsn: {} }}, ",
      86            0 :                 p.sk_id, p.term, p.last_log_term, p.flush_lsn
      87            0 :             )?;
      88              :         }
      89            0 :         write!(
      90            0 :             f,
      91            0 :             "}} num_streaming_computes: {}, donors: {:?}",
      92            0 :             self.num_streaming_computes, self.donors
      93            0 :         )
      94            0 :     }
      95              : }
      96              : 
      97              : #[derive(Clone, Debug)]
      98              : pub struct Donor {
      99              :     pub sk_id: NodeId,
     100              :     /// equals to last_log_term
     101              :     pub term: Term,
     102              :     pub flush_lsn: Lsn,
     103              :     pub pg_connstr: String,
     104              :     pub http_connstr: String,
     105              : }
     106              : 
     107              : impl From<&PeerInfo> for Donor {
     108            0 :     fn from(p: &PeerInfo) -> Self {
     109            0 :         Donor {
     110            0 :             sk_id: p.sk_id,
     111            0 :             term: p.term,
     112            0 :             flush_lsn: p.flush_lsn,
     113            0 :             pg_connstr: p.pg_connstr.clone(),
     114            0 :             http_connstr: p.http_connstr.clone(),
     115            0 :         }
     116            0 :     }
     117              : }
     118              : 
     119              : const CHECK_INTERVAL_MS: u64 = 2000;
     120              : 
     121              : /// Check regularly whether we need to start recovery.
     122            0 : async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
     123            0 :     let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
     124              :     loop {
     125            0 :         let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
     126            0 :         match recovery_needed_info.donors.first() {
     127            0 :             Some(donor) => {
     128            0 :                 info!(
     129            0 :                     "starting recovery from donor {}: {}",
     130              :                     donor.sk_id, recovery_needed_info
     131              :                 );
     132            0 :                 match recover(tli.clone(), donor, &conf).await {
     133              :                     // Note: 'write_wal rewrites WAL written before' error is
     134              :                     // expected here and might happen if compute and recovery
     135              :                     // concurrently write the same data. Eventually compute
     136              :                     // should win.
     137            0 :                     Err(e) => warn!("recovery failed: {:#}", e),
     138            0 :                     Ok(msg) => info!("recovery finished: {}", msg),
     139              :                 }
     140              :             }
     141              :             None => {
     142            0 :                 trace!(
     143            0 :                     "recovery not needed or not possible: {}",
     144              :                     recovery_needed_info
     145              :                 );
     146              :             }
     147              :         }
     148            0 :         sleep(check_duration).await;
     149              :     }
     150              : }
     151              : 
     152              : /// Recover from the specified donor. Returns message explaining normal finish
     153              : /// reason or error.
     154            0 : async fn recover(
     155            0 :     tli: Arc<Timeline>,
     156            0 :     donor: &Donor,
     157            0 :     conf: &SafeKeeperConf,
     158            0 : ) -> anyhow::Result<String> {
     159            0 :     // Learn donor term switch history to figure out starting point.
     160            0 :     let client = reqwest::Client::new();
     161            0 :     let timeline_info: TimelineStatus = client
     162            0 :         .get(format!(
     163            0 :             "http://{}/v1/tenant/{}/timeline/{}",
     164            0 :             donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
     165            0 :         ))
     166            0 :         .send()
     167            0 :         .await?
     168            0 :         .json()
     169            0 :         .await?;
     170            0 :     if timeline_info.acceptor_state.term != donor.term {
     171            0 :         bail!(
     172            0 :             "donor term changed from {} to {}",
     173            0 :             donor.term,
     174            0 :             timeline_info.acceptor_state.term
     175            0 :         );
     176            0 :     }
     177            0 :     // convert from API TermSwitchApiEntry into TermLsn.
     178            0 :     let donor_th = TermHistory(
     179            0 :         timeline_info
     180            0 :             .acceptor_state
     181            0 :             .term_history
     182            0 :             .iter()
     183            0 :             .map(|tl| Into::<TermLsn>::into(*tl))
     184            0 :             .collect(),
     185            0 :     );
     186            0 : 
     187            0 :     // Now understand our term history.
     188            0 :     let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
     189            0 :     let vote_response = match tli
     190            0 :         .process_msg(&vote_request)
     191            0 :         .await
     192            0 :         .context("VoteRequest handling")?
     193              :     {
     194            0 :         Some(AcceptorProposerMessage::VoteResponse(vr)) => vr,
     195              :         _ => {
     196            0 :             bail!("unexpected VoteRequest response"); // unreachable
     197              :         }
     198              :     };
     199            0 :     if vote_response.term != donor.term {
     200            0 :         bail!(
     201            0 :             "our term changed from {} to {}",
     202            0 :             donor.term,
     203            0 :             vote_response.term
     204            0 :         );
     205            0 :     }
     206              : 
     207            0 :     let last_common_point = match TermHistory::find_highest_common_point(
     208            0 :         &donor_th,
     209            0 :         &vote_response.term_history,
     210            0 :         vote_response.flush_lsn,
     211            0 :     ) {
     212            0 :         None => bail!(
     213            0 :             "couldn't find common point in histories, donor {:?}, sk {:?}",
     214            0 :             donor_th,
     215            0 :             vote_response.term_history,
     216            0 :         ),
     217            0 :         Some(lcp) => lcp,
     218            0 :     };
     219            0 :     info!("found last common point at {:?}", last_common_point);
     220              : 
     221              :     // truncate WAL locally
     222            0 :     let pe = ProposerAcceptorMessage::Elected(ProposerElected {
     223            0 :         term: donor.term,
     224            0 :         start_streaming_at: last_common_point.lsn,
     225            0 :         term_history: donor_th,
     226            0 :         timeline_start_lsn: Lsn::INVALID,
     227            0 :     });
     228            0 :     // Successful ProposerElected handling always returns None. If term changed,
     229            0 :     // we'll find out that during the streaming. Note: it is expected to get
     230            0 :     // 'refusing to overwrite correct WAL' here if walproposer reconnected
     231            0 :     // concurrently, restart helps here.
     232            0 :     tli.process_msg(&pe)
     233            0 :         .await
     234            0 :         .context("ProposerElected handling")?;
     235              : 
     236            0 :     recovery_stream(tli, donor, last_common_point.lsn, conf).await
     237            0 : }
     238              : 
     239              : // Pull WAL from donor, assuming handshake is already done.
     240            0 : async fn recovery_stream(
     241            0 :     tli: Arc<Timeline>,
     242            0 :     donor: &Donor,
     243            0 :     start_streaming_at: Lsn,
     244            0 :     conf: &SafeKeeperConf,
     245            0 : ) -> anyhow::Result<String> {
     246              :     // TODO: pass auth token
     247            0 :     let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
     248            0 :     let mut cfg = cfg.to_tokio_postgres_config();
     249            0 :     // It will make safekeeper give out not committed WAL (up to flush_lsn).
     250            0 :     cfg.application_name(&format!("safekeeper_{}", conf.my_id));
     251            0 :     cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
     252            0 : 
     253            0 :     let connect_timeout = Duration::from_millis(10000);
     254            0 :     let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls))
     255            0 :         .await
     256              :     {
     257            0 :         Ok(client_and_conn) => client_and_conn?,
     258            0 :         Err(_elapsed) => {
     259            0 :             bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open");
     260              :         }
     261              :     };
     262            0 :     trace!("connected to {:?}", donor);
     263              : 
     264              :     // The connection object performs the actual communication with the
     265              :     // server, spawn it off to run on its own.
     266            0 :     let ttid = tli.ttid;
     267            0 :     tokio::spawn(async move {
     268            0 :         if let Err(e) = connection
     269            0 :             .instrument(info_span!("recovery task connection poll", ttid = %ttid))
     270            0 :             .await
     271              :         {
     272              :             // This logging isn't very useful as error is anyway forwarded to client.
     273            0 :             trace!(
     274            0 :                 "tokio_postgres connection object finished with error: {}",
     275              :                 e
     276              :             );
     277            0 :         }
     278            0 :     });
     279            0 : 
     280            0 :     let query = format!(
     281            0 :         "START_REPLICATION PHYSICAL {} (term='{}')",
     282            0 :         start_streaming_at, donor.term
     283            0 :     );
     284              : 
     285            0 :     let copy_stream = client.copy_both_simple(&query).await?;
     286            0 :     let physical_stream = ReplicationStream::new(copy_stream);
     287            0 : 
     288            0 :     // As in normal walreceiver, do networking and writing to disk in parallel.
     289            0 :     let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
     290            0 :     let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
     291            0 :     let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
     292              : 
     293            0 :     let res = tokio::select! {
     294              :         r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
     295            0 :         r = read_replies(reply_rx, donor.term) => r.map(|()| None),
     296              :     };
     297              : 
     298              :     // Join the spawned WalAcceptor. At this point chans to/from it passed to
     299              :     // network routines are dropped, so it will exit as soon as it touches them.
     300            0 :     match wa.await {
     301              :         Ok(Ok(())) => {
     302              :             // WalAcceptor finished normally, termination reason is different
     303            0 :             match res {
     304            0 :                 Ok(Some(success_desc)) => Ok(success_desc),
     305            0 :                 Ok(None) => bail!("unexpected recovery end without error/success"), // can't happen
     306            0 :                 Err(e) => Err(e), // network error or term change
     307              :             }
     308              :         }
     309            0 :         Ok(Err(e)) => Err(e), // error while processing message
     310            0 :         Err(e) => bail!("WalAcceptor panicked: {}", e),
     311              :     }
     312            0 : }
     313              : 
     314              : // Perform network part of streaming: read data and push it to msg_tx, send KA
     315              : // to make sender hear from us. If there is nothing coming for a while, check
     316              : // for termination.
     317              : // Returns
     318              : // - Ok(None) if channel to WalAcceptor closed -- its task should return error.
     319              : // - Ok(Some(String)) if recovery successfully completed.
     320              : // - Err if error happened while reading/writing to socket.
     321            0 : async fn network_io(
     322            0 :     physical_stream: ReplicationStream,
     323            0 :     msg_tx: Sender<ProposerAcceptorMessage>,
     324            0 :     donor: Donor,
     325            0 :     tli: Arc<Timeline>,
     326            0 :     conf: SafeKeeperConf,
     327            0 : ) -> anyhow::Result<Option<String>> {
     328            0 :     let mut physical_stream = pin!(physical_stream);
     329            0 :     let mut last_received_lsn = Lsn::INVALID;
     330            0 :     // tear down connection if no data arrives withing this period
     331            0 :     let no_data_timeout = Duration::from_millis(30000);
     332              : 
     333              :     loop {
     334            0 :         let msg = match timeout(no_data_timeout, physical_stream.next()).await {
     335            0 :             Ok(next) => match next {
     336            0 :                 None => bail!("unexpected end of replication stream"),
     337            0 :                 Some(msg) => msg.context("get replication message")?,
     338              :             },
     339            0 :             Err(_) => bail!("no message received within {:?}", no_data_timeout),
     340              :         };
     341              : 
     342            0 :         match msg {
     343            0 :             ReplicationMessage::XLogData(xlog_data) => {
     344            0 :                 let ar_hdr = AppendRequestHeader {
     345            0 :                     term: donor.term,
     346            0 :                     epoch_start_lsn: Lsn::INVALID, // unused
     347            0 :                     begin_lsn: Lsn(xlog_data.wal_start()),
     348            0 :                     end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
     349            0 :                     commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
     350            0 :                     truncate_lsn: Lsn::INVALID, // do not attempt to advance
     351            0 :                     proposer_uuid: [0; 16],
     352            0 :                 };
     353            0 :                 let ar = AppendRequest {
     354            0 :                     h: ar_hdr,
     355            0 :                     wal_data: xlog_data.into_data(),
     356            0 :                 };
     357            0 :                 trace!(
     358            0 :                     "processing AppendRequest {}-{}, len {}",
     359            0 :                     ar.h.begin_lsn,
     360            0 :                     ar.h.end_lsn,
     361            0 :                     ar.wal_data.len()
     362              :                 );
     363            0 :                 last_received_lsn = ar.h.end_lsn;
     364            0 :                 if msg_tx
     365            0 :                     .send(ProposerAcceptorMessage::AppendRequest(ar))
     366            0 :                     .await
     367            0 :                     .is_err()
     368              :                 {
     369            0 :                     return Ok(None); // chan closed, WalAcceptor terminated
     370            0 :                 }
     371              :             }
     372              :             ReplicationMessage::PrimaryKeepAlive(_) => {
     373              :                 // keepalive means nothing is being streamed for a while. Check whether we need to stop.
     374            0 :                 let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
     375              :                 // do current donors still contain one we currently connected to?
     376            0 :                 if !recovery_needed_info
     377            0 :                     .donors
     378            0 :                     .iter()
     379            0 :                     .any(|d| d.sk_id == donor.sk_id)
     380              :                 {
     381              :                     // Most likely it means we are caughtup.
     382              :                     // note: just exiting makes tokio_postgres send CopyFail to the far end.
     383            0 :                     return Ok(Some(format!(
     384            0 :                         "terminating at {} as connected safekeeper {} with term {} is not a donor anymore: {}",
     385            0 :                         last_received_lsn, donor.sk_id, donor.term, recovery_needed_info
     386            0 :                     )));
     387            0 :                 }
     388              :             }
     389            0 :             _ => {}
     390              :         }
     391              :         // Send reply to each message to keep connection alive. Ideally we
     392              :         // should do that once in a while instead, but this again requires
     393              :         // stream split or similar workaround, and recovery is anyway not that
     394              :         // performance critical.
     395              :         //
     396              :         // We do not know here real write/flush LSNs (need to take mutex again
     397              :         // or check replies which are read in different future), but neither
     398              :         // sender much cares about them, so just send last received.
     399            0 :         physical_stream
     400            0 :             .as_mut()
     401            0 :             .standby_status_update(
     402            0 :                 PgLsn::from(last_received_lsn.0),
     403            0 :                 PgLsn::from(last_received_lsn.0),
     404            0 :                 PgLsn::from(last_received_lsn.0),
     405            0 :                 SystemTime::now(),
     406            0 :                 0,
     407            0 :             )
     408            0 :             .await?;
     409              :     }
     410            0 : }
     411              : 
     412              : // Read replies from WalAcceptor. We are not interested much in sending them to
     413              : // donor safekeeper, so don't route them anywhere. However, we should check if
     414              : // term changes and exit if it does.
     415              : // Returns Ok(()) if channel closed, Err in case of term change.
     416            0 : async fn read_replies(
     417            0 :     mut reply_rx: Receiver<AcceptorProposerMessage>,
     418            0 :     donor_term: Term,
     419            0 : ) -> anyhow::Result<()> {
     420              :     loop {
     421            0 :         match reply_rx.recv().await {
     422            0 :             Some(msg) => {
     423            0 :                 if let AcceptorProposerMessage::AppendResponse(ar) = msg {
     424            0 :                     if ar.term != donor_term {
     425            0 :                         bail!("donor term changed from {} to {}", donor_term, ar.term);
     426            0 :                     }
     427            0 :                 }
     428              :             }
     429            0 :             None => return Ok(()), // chan closed, WalAcceptor terminated
     430              :         }
     431              :     }
     432            0 : }
        

Generated by: LCOV version 2.1-beta