LCOV - 5187d4b6d9cfe1c429baf0147b0578521d04e1ed.info

LCOV - code coverage report

Current view:	top level - safekeeper/src - recovery.rs (source / functions)		Coverage	Total	Hit
Test:	5187d4b6d9cfe1c429baf0147b0578521d04e1ed.info	Lines:	0.0 %	304	0
Test Date:	2024-06-26 21:48:01	Functions:	0.0 %	22	0

            Line data    Source code

       1              : //! This module implements pulling WAL from peer safekeepers if compute can't
       2              : //! provide it, i.e. safekeeper lags too much.
       3              : 
       4              : use std::time::SystemTime;
       5              : use std::{fmt, pin::pin};
       6              : 
       7              : use anyhow::{bail, Context};
       8              : use futures::StreamExt;
       9              : use postgres_protocol::message::backend::ReplicationMessage;
      10              : use tokio::sync::mpsc::{channel, Receiver, Sender};
      11              : use tokio::time::timeout;
      12              : use tokio::{
      13              :     select,
      14              :     time::sleep,
      15              :     time::{self, Duration},
      16              : };
      17              : use tokio_postgres::replication::ReplicationStream;
      18              : use tokio_postgres::types::PgLsn;
      19              : use tracing::*;
      20              : use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
      21              : 
      22              : use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
      23              : use crate::safekeeper::{AppendRequest, AppendRequestHeader};
      24              : use crate::timeline::WalResidentTimeline;
      25              : use crate::{
      26              :     http::routes::TimelineStatus,
      27              :     receive_wal::MSG_QUEUE_SIZE,
      28              :     safekeeper::{
      29              :         AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
      30              :         TermLsn, VoteRequest,
      31              :     },
      32              :     timeline::PeerInfo,
      33              :     SafeKeeperConf,
      34              : };
      35              : 
      36              : /// Entrypoint for per timeline task which always runs, checking whether
      37              : /// recovery for this safekeeper is needed and starting it if so.
      38            0 : #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
      39              : pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) {
      40              :     info!("started");
      41              : 
      42              :     let cancel = tli.cancel.clone();
      43              :     select! {
      44              :         _ = recovery_main_loop(tli, conf) => { unreachable!() }
      45              :         _ = cancel.cancelled() => {
      46              :             info!("stopped");
      47              :         }
      48              :     }
      49              : }
      50              : 
      51              : /// Should we start fetching WAL from a peer safekeeper, and if yes, from
      52              : /// which? Answer is yes, i.e. .donors is not empty if 1) there is something
      53              : /// to fetch, and we can do that without running elections; 2) there is no
      54              : /// actively streaming compute, as we don't want to compete with it.
      55              : ///
      56              : /// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal
      57              : /// to its last_log_term so we are sure such a leader ever had been elected.
      58              : ///
      59              : /// All possible donors are returned so that we could keep connection to the
      60              : /// current one if it is good even if it slightly lags behind.
      61              : ///
      62              : /// Note that term conditions above might be not met, but safekeepers are
      63              : /// still not aligned on last flush_lsn. Generally in this case until
      64              : /// elections are run it is not possible to say which safekeeper should
      65              : /// recover from which one -- history which would be committed is different
      66              : /// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
      67              : /// Thus we don't try to predict it here.
      68            0 : async fn recovery_needed(
      69            0 :     tli: &WalResidentTimeline,
      70            0 :     heartbeat_timeout: Duration,
      71            0 : ) -> RecoveryNeededInfo {
      72            0 :     let ss = tli.read_shared_state().await;
      73            0 :     let term = ss.sk.state().acceptor_state.term;
      74            0 :     let last_log_term = ss.sk.last_log_term();
      75            0 :     let flush_lsn = ss.sk.flush_lsn();
      76            0 :     // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
      77            0 :     let mut peers = ss.get_peers(heartbeat_timeout);
      78            0 :     // Sort by <last log term, lsn> pairs.
      79            0 :     peers.sort_by(|p1, p2| {
      80            0 :         let tl1 = TermLsn {
      81            0 :             term: p1.last_log_term,
      82            0 :             lsn: p1.flush_lsn,
      83            0 :         };
      84            0 :         let tl2 = TermLsn {
      85            0 :             term: p2.last_log_term,
      86            0 :             lsn: p2.flush_lsn,
      87            0 :         };
      88            0 :         tl2.cmp(&tl1) // desc
      89            0 :     });
      90            0 :     let num_streaming_computes = tli.get_walreceivers().get_num_streaming();
      91            0 :     let donors = if num_streaming_computes > 0 {
      92            0 :         vec![] // If there is a streaming compute, don't try to recover to not intervene.
      93              :     } else {
      94            0 :         peers
      95            0 :             .iter()
      96            0 :             .filter_map(|candidate| {
      97            0 :                 // Are we interested in this candidate?
      98            0 :                 let candidate_tl = TermLsn {
      99            0 :                     term: candidate.last_log_term,
     100            0 :                     lsn: candidate.flush_lsn,
     101            0 :                 };
     102            0 :                 let my_tl = TermLsn {
     103            0 :                     term: last_log_term,
     104            0 :                     lsn: flush_lsn,
     105            0 :                 };
     106            0 :                 if my_tl < candidate_tl {
     107              :                     // Yes, we are interested. Can we pull from it without
     108              :                     // (re)running elections? It is possible if 1) his term
     109              :                     // is equal to his last_log_term so we could act on
     110              :                     // behalf of leader of this term (we must be sure he was
     111              :                     // ever elected) and 2) our term is not higher, or we'll refuse data.
     112            0 :                     if candidate.term == candidate.last_log_term && candidate.term >= term {
     113            0 :                         Some(Donor::from(candidate))
     114              :                     } else {
     115            0 :                         None
     116              :                     }
     117              :                 } else {
     118            0 :                     None
     119              :                 }
     120            0 :             })
     121            0 :             .collect()
     122              :     };
     123            0 :     RecoveryNeededInfo {
     124            0 :         term,
     125            0 :         last_log_term,
     126            0 :         flush_lsn,
     127            0 :         peers,
     128            0 :         num_streaming_computes,
     129            0 :         donors,
     130            0 :     }
     131            0 : }
     132              : /// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
     133              : /// fields to explain the choice.
     134              : #[derive(Debug)]
     135              : pub struct RecoveryNeededInfo {
     136              :     /// my term
     137              :     pub term: Term,
     138              :     /// my last_log_term
     139              :     pub last_log_term: Term,
     140              :     /// my flush_lsn
     141              :     pub flush_lsn: Lsn,
     142              :     /// peers from which we can fetch WAL, for observability.
     143              :     pub peers: Vec<PeerInfo>,
     144              :     /// for observability
     145              :     pub num_streaming_computes: usize,
     146              :     pub donors: Vec<Donor>,
     147              : }
     148              : 
     149              : // Custom to omit not important fields from PeerInfo.
     150              : impl fmt::Display for RecoveryNeededInfo {
     151            0 :     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
     152            0 :         write!(f, "{{")?;
     153            0 :         write!(
     154            0 :             f,
     155            0 :             "term: {}, last_log_term: {}, flush_lsn: {}, peers: {{",
     156            0 :             self.term, self.last_log_term, self.flush_lsn
     157            0 :         )?;
     158            0 :         for p in self.peers.iter() {
     159            0 :             write!(
     160            0 :                 f,
     161            0 :                 "PeerInfo {{ sk_id: {}, term: {}, last_log_term: {}, flush_lsn: {} }}, ",
     162            0 :                 p.sk_id, p.term, p.last_log_term, p.flush_lsn
     163            0 :             )?;
     164              :         }
     165            0 :         write!(
     166            0 :             f,
     167            0 :             "}} num_streaming_computes: {}, donors: {:?}",
     168            0 :             self.num_streaming_computes, self.donors
     169            0 :         )
     170            0 :     }
     171              : }
     172              : 
     173              : #[derive(Clone, Debug)]
     174              : pub struct Donor {
     175              :     pub sk_id: NodeId,
     176              :     /// equals to last_log_term
     177              :     pub term: Term,
     178              :     pub flush_lsn: Lsn,
     179              :     pub pg_connstr: String,
     180              :     pub http_connstr: String,
     181              : }
     182              : 
     183              : impl From<&PeerInfo> for Donor {
     184            0 :     fn from(p: &PeerInfo) -> Self {
     185            0 :         Donor {
     186            0 :             sk_id: p.sk_id,
     187            0 :             term: p.term,
     188            0 :             flush_lsn: p.flush_lsn,
     189            0 :             pg_connstr: p.pg_connstr.clone(),
     190            0 :             http_connstr: p.http_connstr.clone(),
     191            0 :         }
     192            0 :     }
     193              : }
     194              : 
     195              : const CHECK_INTERVAL_MS: u64 = 2000;
     196              : 
     197              : /// Check regularly whether we need to start recovery.
     198            0 : async fn recovery_main_loop(tli: WalResidentTimeline, conf: SafeKeeperConf) {
     199            0 :     let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
     200              :     loop {
     201            0 :         let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
     202            0 :         match recovery_needed_info.donors.first() {
     203            0 :             Some(donor) => {
     204            0 :                 info!(
     205            0 :                     "starting recovery from donor {}: {}",
     206              :                     donor.sk_id, recovery_needed_info
     207              :                 );
     208            0 :                 let res = tli.wal_residence_guard().await;
     209            0 :                 if let Err(e) = res {
     210            0 :                     warn!("failed to obtain guard: {}", e);
     211            0 :                     continue;
     212            0 :                 }
     213            0 :                 match recover(res.unwrap(), donor, &conf).await {
     214              :                     // Note: 'write_wal rewrites WAL written before' error is
     215              :                     // expected here and might happen if compute and recovery
     216              :                     // concurrently write the same data. Eventually compute
     217              :                     // should win.
     218            0 :                     Err(e) => warn!("recovery failed: {:#}", e),
     219            0 :                     Ok(msg) => info!("recovery finished: {}", msg),
     220              :                 }
     221              :             }
     222              :             None => {
     223            0 :                 trace!(
     224            0 :                     "recovery not needed or not possible: {}",
     225              :                     recovery_needed_info
     226              :                 );
     227              :             }
     228              :         }
     229            0 :         sleep(check_duration).await;
     230              :     }
     231              : }
     232              : 
     233              : /// Recover from the specified donor. Returns message explaining normal finish
     234              : /// reason or error.
     235            0 : async fn recover(
     236            0 :     tli: WalResidentTimeline,
     237            0 :     donor: &Donor,
     238            0 :     conf: &SafeKeeperConf,
     239            0 : ) -> anyhow::Result<String> {
     240            0 :     // Learn donor term switch history to figure out starting point.
     241            0 :     let client = reqwest::Client::new();
     242            0 :     let timeline_info: TimelineStatus = client
     243            0 :         .get(format!(
     244            0 :             "http://{}/v1/tenant/{}/timeline/{}",
     245            0 :             donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
     246            0 :         ))
     247            0 :         .send()
     248            0 :         .await?
     249            0 :         .json()
     250            0 :         .await?;
     251            0 :     if timeline_info.acceptor_state.term != donor.term {
     252            0 :         bail!(
     253            0 :             "donor term changed from {} to {}",
     254            0 :             donor.term,
     255            0 :             timeline_info.acceptor_state.term
     256            0 :         );
     257            0 :     }
     258            0 :     // convert from API TermSwitchApiEntry into TermLsn.
     259            0 :     let donor_th = TermHistory(
     260            0 :         timeline_info
     261            0 :             .acceptor_state
     262            0 :             .term_history
     263            0 :             .iter()
     264            0 :             .map(|tl| Into::<TermLsn>::into(*tl))
     265            0 :             .collect(),
     266            0 :     );
     267            0 : 
     268            0 :     // Now understand our term history.
     269            0 :     let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
     270            0 :     let vote_response = match tli
     271            0 :         .process_msg(&vote_request)
     272            0 :         .await
     273            0 :         .context("VoteRequest handling")?
     274              :     {
     275            0 :         Some(AcceptorProposerMessage::VoteResponse(vr)) => vr,
     276              :         _ => {
     277            0 :             bail!("unexpected VoteRequest response"); // unreachable
     278              :         }
     279              :     };
     280            0 :     if vote_response.term != donor.term {
     281            0 :         bail!(
     282            0 :             "our term changed from {} to {}",
     283            0 :             donor.term,
     284            0 :             vote_response.term
     285            0 :         );
     286            0 :     }
     287              : 
     288            0 :     let last_common_point = match TermHistory::find_highest_common_point(
     289            0 :         &donor_th,
     290            0 :         &vote_response.term_history,
     291            0 :         vote_response.flush_lsn,
     292            0 :     ) {
     293            0 :         None => bail!(
     294            0 :             "couldn't find common point in histories, donor {:?}, sk {:?}",
     295            0 :             donor_th,
     296            0 :             vote_response.term_history,
     297            0 :         ),
     298            0 :         Some(lcp) => lcp,
     299            0 :     };
     300            0 :     info!("found last common point at {:?}", last_common_point);
     301              : 
     302              :     // truncate WAL locally
     303            0 :     let pe = ProposerAcceptorMessage::Elected(ProposerElected {
     304            0 :         term: donor.term,
     305            0 :         start_streaming_at: last_common_point.lsn,
     306            0 :         term_history: donor_th,
     307            0 :         timeline_start_lsn: Lsn::INVALID,
     308            0 :     });
     309            0 :     // Successful ProposerElected handling always returns None. If term changed,
     310            0 :     // we'll find out that during the streaming. Note: it is expected to get
     311            0 :     // 'refusing to overwrite correct WAL' here if walproposer reconnected
     312            0 :     // concurrently, restart helps here.
     313            0 :     tli.process_msg(&pe)
     314            0 :         .await
     315            0 :         .context("ProposerElected handling")?;
     316              : 
     317            0 :     recovery_stream(tli, donor, last_common_point.lsn, conf).await
     318            0 : }
     319              : 
     320              : // Pull WAL from donor, assuming handshake is already done.
     321            0 : async fn recovery_stream(
     322            0 :     tli: WalResidentTimeline,
     323            0 :     donor: &Donor,
     324            0 :     start_streaming_at: Lsn,
     325            0 :     conf: &SafeKeeperConf,
     326            0 : ) -> anyhow::Result<String> {
     327              :     // TODO: pass auth token
     328            0 :     let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
     329            0 :     let mut cfg = cfg.to_tokio_postgres_config();
     330            0 :     // It will make safekeeper give out not committed WAL (up to flush_lsn).
     331            0 :     cfg.application_name(&format!("safekeeper_{}", conf.my_id));
     332            0 :     cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
     333            0 : 
     334            0 :     let connect_timeout = Duration::from_millis(10000);
     335            0 :     let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls))
     336            0 :         .await
     337              :     {
     338            0 :         Ok(client_and_conn) => client_and_conn?,
     339            0 :         Err(_elapsed) => {
     340            0 :             bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open");
     341              :         }
     342              :     };
     343            0 :     trace!("connected to {:?}", donor);
     344              : 
     345              :     // The connection object performs the actual communication with the
     346              :     // server, spawn it off to run on its own.
     347            0 :     let ttid = tli.ttid;
     348            0 :     tokio::spawn(async move {
     349            0 :         if let Err(e) = connection
     350            0 :             .instrument(info_span!("recovery task connection poll", ttid = %ttid))
     351            0 :             .await
     352              :         {
     353              :             // This logging isn't very useful as error is anyway forwarded to client.
     354            0 :             trace!(
     355            0 :                 "tokio_postgres connection object finished with error: {}",
     356              :                 e
     357              :             );
     358            0 :         }
     359            0 :     });
     360            0 : 
     361            0 :     let query = format!(
     362            0 :         "START_REPLICATION PHYSICAL {} (term='{}')",
     363            0 :         start_streaming_at, donor.term
     364            0 :     );
     365              : 
     366            0 :     let copy_stream = client.copy_both_simple(&query).await?;
     367            0 :     let physical_stream = ReplicationStream::new(copy_stream);
     368            0 : 
     369            0 :     // As in normal walreceiver, do networking and writing to disk in parallel.
     370            0 :     let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
     371            0 :     let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
     372            0 :     let wa = WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, None);
     373              : 
     374            0 :     let res = tokio::select! {
     375              :         r = network_io(physical_stream, msg_tx, donor.clone(), tli, conf.clone()) => r,
     376            0 :         r = read_replies(reply_rx, donor.term) => r.map(|()| None),
     377              :     };
     378              : 
     379              :     // Join the spawned WalAcceptor. At this point chans to/from it passed to
     380              :     // network routines are dropped, so it will exit as soon as it touches them.
     381            0 :     match wa.await {
     382              :         Ok(Ok(())) => {
     383              :             // WalAcceptor finished normally, termination reason is different
     384            0 :             match res {
     385            0 :                 Ok(Some(success_desc)) => Ok(success_desc),
     386            0 :                 Ok(None) => bail!("unexpected recovery end without error/success"), // can't happen
     387            0 :                 Err(e) => Err(e), // network error or term change
     388              :             }
     389              :         }
     390            0 :         Ok(Err(e)) => Err(e), // error while processing message
     391            0 :         Err(e) => bail!("WalAcceptor panicked: {}", e),
     392              :     }
     393            0 : }
     394              : 
     395              : // Perform network part of streaming: read data and push it to msg_tx, send KA
     396              : // to make sender hear from us. If there is nothing coming for a while, check
     397              : // for termination.
     398              : // Returns
     399              : // - Ok(None) if channel to WalAcceptor closed -- its task should return error.
     400              : // - Ok(Some(String)) if recovery successfully completed.
     401              : // - Err if error happened while reading/writing to socket.
     402            0 : async fn network_io(
     403            0 :     physical_stream: ReplicationStream,
     404            0 :     msg_tx: Sender<ProposerAcceptorMessage>,
     405            0 :     donor: Donor,
     406            0 :     tli: WalResidentTimeline,
     407            0 :     conf: SafeKeeperConf,
     408            0 : ) -> anyhow::Result<Option<String>> {
     409            0 :     let mut physical_stream = pin!(physical_stream);
     410            0 :     let mut last_received_lsn = Lsn::INVALID;
     411            0 :     // tear down connection if no data arrives withing this period
     412            0 :     let no_data_timeout = Duration::from_millis(30000);
     413              : 
     414              :     loop {
     415            0 :         let msg = match timeout(no_data_timeout, physical_stream.next()).await {
     416            0 :             Ok(next) => match next {
     417            0 :                 None => bail!("unexpected end of replication stream"),
     418            0 :                 Some(msg) => msg.context("get replication message")?,
     419              :             },
     420            0 :             Err(_) => bail!("no message received within {:?}", no_data_timeout),
     421              :         };
     422              : 
     423            0 :         match msg {
     424            0 :             ReplicationMessage::XLogData(xlog_data) => {
     425            0 :                 let ar_hdr = AppendRequestHeader {
     426            0 :                     term: donor.term,
     427            0 :                     term_start_lsn: Lsn::INVALID, // unused
     428            0 :                     begin_lsn: Lsn(xlog_data.wal_start()),
     429            0 :                     end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
     430            0 :                     commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
     431            0 :                     truncate_lsn: Lsn::INVALID, // do not attempt to advance
     432            0 :                     proposer_uuid: [0; 16],
     433            0 :                 };
     434            0 :                 let ar = AppendRequest {
     435            0 :                     h: ar_hdr,
     436            0 :                     wal_data: xlog_data.into_data(),
     437            0 :                 };
     438            0 :                 trace!(
     439            0 :                     "processing AppendRequest {}-{}, len {}",
     440            0 :                     ar.h.begin_lsn,
     441            0 :                     ar.h.end_lsn,
     442            0 :                     ar.wal_data.len()
     443              :                 );
     444            0 :                 last_received_lsn = ar.h.end_lsn;
     445            0 :                 if msg_tx
     446            0 :                     .send(ProposerAcceptorMessage::AppendRequest(ar))
     447            0 :                     .await
     448            0 :                     .is_err()
     449              :                 {
     450            0 :                     return Ok(None); // chan closed, WalAcceptor terminated
     451            0 :                 }
     452              :             }
     453              :             ReplicationMessage::PrimaryKeepAlive(_) => {
     454              :                 // keepalive means nothing is being streamed for a while. Check whether we need to stop.
     455            0 :                 let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
     456              :                 // do current donors still contain one we currently connected to?
     457            0 :                 if !recovery_needed_info
     458            0 :                     .donors
     459            0 :                     .iter()
     460            0 :                     .any(|d| d.sk_id == donor.sk_id)
     461              :                 {
     462              :                     // Most likely it means we are caughtup.
     463              :                     // note: just exiting makes tokio_postgres send CopyFail to the far end.
     464            0 :                     return Ok(Some(format!(
     465            0 :                         "terminating at {} as connected safekeeper {} with term {} is not a donor anymore: {}",
     466            0 :                         last_received_lsn, donor.sk_id, donor.term, recovery_needed_info
     467            0 :                     )));
     468            0 :                 }
     469              :             }
     470            0 :             _ => {}
     471              :         }
     472              :         // Send reply to each message to keep connection alive. Ideally we
     473              :         // should do that once in a while instead, but this again requires
     474              :         // stream split or similar workaround, and recovery is anyway not that
     475              :         // performance critical.
     476              :         //
     477              :         // We do not know here real write/flush LSNs (need to take mutex again
     478              :         // or check replies which are read in different future), but neither
     479              :         // sender much cares about them, so just send last received.
     480            0 :         physical_stream
     481            0 :             .as_mut()
     482            0 :             .standby_status_update(
     483            0 :                 PgLsn::from(last_received_lsn.0),
     484            0 :                 PgLsn::from(last_received_lsn.0),
     485            0 :                 PgLsn::from(last_received_lsn.0),
     486            0 :                 SystemTime::now(),
     487            0 :                 0,
     488            0 :             )
     489            0 :             .await?;
     490              :     }
     491            0 : }
     492              : 
     493              : // Read replies from WalAcceptor. We are not interested much in sending them to
     494              : // donor safekeeper, so don't route them anywhere. However, we should check if
     495              : // term changes and exit if it does.
     496              : // Returns Ok(()) if channel closed, Err in case of term change.
     497            0 : async fn read_replies(
     498            0 :     mut reply_rx: Receiver<AcceptorProposerMessage>,
     499            0 :     donor_term: Term,
     500            0 : ) -> anyhow::Result<()> {
     501              :     loop {
     502            0 :         match reply_rx.recv().await {
     503            0 :             Some(msg) => {
     504            0 :                 if let AcceptorProposerMessage::AppendResponse(ar) = msg {
     505            0 :                     if ar.term != donor_term {
     506            0 :                         bail!("donor term changed from {} to {}", donor_term, ar.term);
     507            0 :                     }
     508            0 :                 }
     509              :             }
     510            0 :             None => return Ok(()), // chan closed, WalAcceptor terminated
     511              :         }
     512              :     }
     513            0 : }

Generated by: LCOV version 2.1-beta