|             Line data    Source code 
       1              : //! This module implements pulling WAL from peer safekeepers if compute can't
       2              : //! provide it, i.e. safekeeper lags too much.
       3              : 
       4              : use std::time::SystemTime;
       5              : use std::{fmt, pin::pin};
       6              : 
       7              : use anyhow::{bail, Context};
       8              : use futures::StreamExt;
       9              : use postgres_protocol::message::backend::ReplicationMessage;
      10              : use tokio::sync::mpsc::{channel, Receiver, Sender};
      11              : use tokio::time::timeout;
      12              : use tokio::{
      13              :     select,
      14              :     time::sleep,
      15              :     time::{self, Duration},
      16              : };
      17              : use tokio_postgres::replication::ReplicationStream;
      18              : use tokio_postgres::types::PgLsn;
      19              : use tracing::*;
      20              : use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
      21              : 
      22              : use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
      23              : use crate::safekeeper::{AppendRequest, AppendRequestHeader};
      24              : use crate::timeline::FullAccessTimeline;
      25              : use crate::{
      26              :     http::routes::TimelineStatus,
      27              :     receive_wal::MSG_QUEUE_SIZE,
      28              :     safekeeper::{
      29              :         AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
      30              :         TermLsn, VoteRequest,
      31              :     },
      32              :     timeline::PeerInfo,
      33              :     SafeKeeperConf,
      34              : };
      35              : 
      36              : /// Entrypoint for per timeline task which always runs, checking whether
      37              : /// recovery for this safekeeper is needed and starting it if so.
      38            0 : #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
      39              : pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
      40              :     info!("started");
      41              : 
      42              :     let cancel = tli.cancel.clone();
      43              :     select! {
      44              :         _ = recovery_main_loop(tli, conf) => { unreachable!() }
      45              :         _ = cancel.cancelled() => {
      46              :             info!("stopped");
      47              :         }
      48              :     }
      49              : }
      50              : 
      51              : /// Should we start fetching WAL from a peer safekeeper, and if yes, from
      52              : /// which? Answer is yes, i.e. .donors is not empty if 1) there is something
      53              : /// to fetch, and we can do that without running elections; 2) there is no
      54              : /// actively streaming compute, as we don't want to compete with it.
      55              : ///
      56              : /// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal
      57              : /// to its last_log_term so we are sure such a leader ever had been elected.
      58              : ///
      59              : /// All possible donors are returned so that we could keep connection to the
      60              : /// current one if it is good even if it slightly lags behind.
      61              : ///
      62              : /// Note that term conditions above might be not met, but safekeepers are
      63              : /// still not aligned on last flush_lsn. Generally in this case until
      64              : /// elections are run it is not possible to say which safekeeper should
      65              : /// recover from which one -- history which would be committed is different
      66              : /// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
      67              : /// Thus we don't try to predict it here.
      68            0 : async fn recovery_needed(
      69            0 :     tli: &FullAccessTimeline,
      70            0 :     heartbeat_timeout: Duration,
      71            0 : ) -> RecoveryNeededInfo {
      72            0 :     let ss = tli.read_shared_state().await;
      73            0 :     let term = ss.sk.state.acceptor_state.term;
      74            0 :     let last_log_term = ss.sk.get_last_log_term();
      75            0 :     let flush_lsn = ss.sk.flush_lsn();
      76            0 :     // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
      77            0 :     let mut peers = ss.get_peers(heartbeat_timeout);
      78            0 :     // Sort by <last log term, lsn> pairs.
      79            0 :     peers.sort_by(|p1, p2| {
      80            0 :         let tl1 = TermLsn {
      81            0 :             term: p1.last_log_term,
      82            0 :             lsn: p1.flush_lsn,
      83            0 :         };
      84            0 :         let tl2 = TermLsn {
      85            0 :             term: p2.last_log_term,
      86            0 :             lsn: p2.flush_lsn,
      87            0 :         };
      88            0 :         tl2.cmp(&tl1) // desc
      89            0 :     });
      90            0 :     let num_streaming_computes = tli.get_walreceivers().get_num_streaming();
      91            0 :     let donors = if num_streaming_computes > 0 {
      92            0 :         vec![] // If there is a streaming compute, don't try to recover to not intervene.
      93              :     } else {
      94            0 :         peers
      95            0 :             .iter()
      96            0 :             .filter_map(|candidate| {
      97            0 :                 // Are we interested in this candidate?
      98            0 :                 let candidate_tl = TermLsn {
      99            0 :                     term: candidate.last_log_term,
     100            0 :                     lsn: candidate.flush_lsn,
     101            0 :                 };
     102            0 :                 let my_tl = TermLsn {
     103            0 :                     term: last_log_term,
     104            0 :                     lsn: flush_lsn,
     105            0 :                 };
     106            0 :                 if my_tl < candidate_tl {
     107              :                     // Yes, we are interested. Can we pull from it without
     108              :                     // (re)running elections? It is possible if 1) his term
     109              :                     // is equal to his last_log_term so we could act on
     110              :                     // behalf of leader of this term (we must be sure he was
     111              :                     // ever elected) and 2) our term is not higher, or we'll refuse data.
     112            0 :                     if candidate.term == candidate.last_log_term && candidate.term >= term {
     113            0 :                         Some(Donor::from(candidate))
     114              :                     } else {
     115            0 :                         None
     116              :                     }
     117              :                 } else {
     118            0 :                     None
     119              :                 }
     120            0 :             })
     121            0 :             .collect()
     122              :     };
     123            0 :     RecoveryNeededInfo {
     124            0 :         term,
     125            0 :         last_log_term,
     126            0 :         flush_lsn,
     127            0 :         peers,
     128            0 :         num_streaming_computes,
     129            0 :         donors,
     130            0 :     }
     131            0 : }
     132              : /// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
     133              : /// fields to explain the choice.
     134              : #[derive(Debug)]
     135              : pub struct RecoveryNeededInfo {
     136              :     /// my term
     137              :     pub term: Term,
     138              :     /// my last_log_term
     139              :     pub last_log_term: Term,
     140              :     /// my flush_lsn
     141              :     pub flush_lsn: Lsn,
     142              :     /// peers from which we can fetch WAL, for observability.
     143              :     pub peers: Vec<PeerInfo>,
     144              :     /// for observability
     145              :     pub num_streaming_computes: usize,
     146              :     pub donors: Vec<Donor>,
     147              : }
     148              : 
     149              : // Custom to omit not important fields from PeerInfo.
     150              : impl fmt::Display for RecoveryNeededInfo {
     151            0 :     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
     152            0 :         write!(f, "{{")?;
     153            0 :         write!(
     154            0 :             f,
     155            0 :             "term: {}, last_log_term: {}, flush_lsn: {}, peers: {{",
     156            0 :             self.term, self.last_log_term, self.flush_lsn
     157            0 :         )?;
     158            0 :         for p in self.peers.iter() {
     159            0 :             write!(
     160            0 :                 f,
     161            0 :                 "PeerInfo {{ sk_id: {}, term: {}, last_log_term: {}, flush_lsn: {} }}, ",
     162            0 :                 p.sk_id, p.term, p.last_log_term, p.flush_lsn
     163            0 :             )?;
     164              :         }
     165            0 :         write!(
     166            0 :             f,
     167            0 :             "}} num_streaming_computes: {}, donors: {:?}",
     168            0 :             self.num_streaming_computes, self.donors
     169            0 :         )
     170            0 :     }
     171              : }
     172              : 
     173              : #[derive(Clone, Debug)]
     174              : pub struct Donor {
     175              :     pub sk_id: NodeId,
     176              :     /// equals to last_log_term
     177              :     pub term: Term,
     178              :     pub flush_lsn: Lsn,
     179              :     pub pg_connstr: String,
     180              :     pub http_connstr: String,
     181              : }
     182              : 
     183              : impl From<&PeerInfo> for Donor {
     184            0 :     fn from(p: &PeerInfo) -> Self {
     185            0 :         Donor {
     186            0 :             sk_id: p.sk_id,
     187            0 :             term: p.term,
     188            0 :             flush_lsn: p.flush_lsn,
     189            0 :             pg_connstr: p.pg_connstr.clone(),
     190            0 :             http_connstr: p.http_connstr.clone(),
     191            0 :         }
     192            0 :     }
     193              : }
     194              : 
     195              : const CHECK_INTERVAL_MS: u64 = 2000;
     196              : 
     197              : /// Check regularly whether we need to start recovery.
     198            0 : async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
     199            0 :     let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
     200              :     loop {
     201            0 :         let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
     202            0 :         match recovery_needed_info.donors.first() {
     203            0 :             Some(donor) => {
     204            0 :                 info!(
     205            0 :                     "starting recovery from donor {}: {}",
     206              :                     donor.sk_id, recovery_needed_info
     207              :                 );
     208            0 :                 match recover(tli.clone(), donor, &conf).await {
     209              :                     // Note: 'write_wal rewrites WAL written before' error is
     210              :                     // expected here and might happen if compute and recovery
     211              :                     // concurrently write the same data. Eventually compute
     212              :                     // should win.
     213            0 :                     Err(e) => warn!("recovery failed: {:#}", e),
     214            0 :                     Ok(msg) => info!("recovery finished: {}", msg),
     215              :                 }
     216              :             }
     217              :             None => {
     218            0 :                 trace!(
     219            0 :                     "recovery not needed or not possible: {}",
     220              :                     recovery_needed_info
     221              :                 );
     222              :             }
     223              :         }
     224            0 :         sleep(check_duration).await;
     225              :     }
     226              : }
     227              : 
     228              : /// Recover from the specified donor. Returns message explaining normal finish
     229              : /// reason or error.
     230            0 : async fn recover(
     231            0 :     tli: FullAccessTimeline,
     232            0 :     donor: &Donor,
     233            0 :     conf: &SafeKeeperConf,
     234            0 : ) -> anyhow::Result<String> {
     235            0 :     // Learn donor term switch history to figure out starting point.
     236            0 :     let client = reqwest::Client::new();
     237            0 :     let timeline_info: TimelineStatus = client
     238            0 :         .get(format!(
     239            0 :             "http://{}/v1/tenant/{}/timeline/{}",
     240            0 :             donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
     241            0 :         ))
     242            0 :         .send()
     243            0 :         .await?
     244            0 :         .json()
     245            0 :         .await?;
     246            0 :     if timeline_info.acceptor_state.term != donor.term {
     247            0 :         bail!(
     248            0 :             "donor term changed from {} to {}",
     249            0 :             donor.term,
     250            0 :             timeline_info.acceptor_state.term
     251            0 :         );
     252            0 :     }
     253            0 :     // convert from API TermSwitchApiEntry into TermLsn.
     254            0 :     let donor_th = TermHistory(
     255            0 :         timeline_info
     256            0 :             .acceptor_state
     257            0 :             .term_history
     258            0 :             .iter()
     259            0 :             .map(|tl| Into::<TermLsn>::into(*tl))
     260            0 :             .collect(),
     261            0 :     );
     262            0 : 
     263            0 :     // Now understand our term history.
     264            0 :     let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
     265            0 :     let vote_response = match tli
     266            0 :         .process_msg(&vote_request)
     267            0 :         .await
     268            0 :         .context("VoteRequest handling")?
     269              :     {
     270            0 :         Some(AcceptorProposerMessage::VoteResponse(vr)) => vr,
     271              :         _ => {
     272            0 :             bail!("unexpected VoteRequest response"); // unreachable
     273              :         }
     274              :     };
     275            0 :     if vote_response.term != donor.term {
     276            0 :         bail!(
     277            0 :             "our term changed from {} to {}",
     278            0 :             donor.term,
     279            0 :             vote_response.term
     280            0 :         );
     281            0 :     }
     282              : 
     283            0 :     let last_common_point = match TermHistory::find_highest_common_point(
     284            0 :         &donor_th,
     285            0 :         &vote_response.term_history,
     286            0 :         vote_response.flush_lsn,
     287            0 :     ) {
     288            0 :         None => bail!(
     289            0 :             "couldn't find common point in histories, donor {:?}, sk {:?}",
     290            0 :             donor_th,
     291            0 :             vote_response.term_history,
     292            0 :         ),
     293            0 :         Some(lcp) => lcp,
     294            0 :     };
     295            0 :     info!("found last common point at {:?}", last_common_point);
     296              : 
     297              :     // truncate WAL locally
     298            0 :     let pe = ProposerAcceptorMessage::Elected(ProposerElected {
     299            0 :         term: donor.term,
     300            0 :         start_streaming_at: last_common_point.lsn,
     301            0 :         term_history: donor_th,
     302            0 :         timeline_start_lsn: Lsn::INVALID,
     303            0 :     });
     304            0 :     // Successful ProposerElected handling always returns None. If term changed,
     305            0 :     // we'll find out that during the streaming. Note: it is expected to get
     306            0 :     // 'refusing to overwrite correct WAL' here if walproposer reconnected
     307            0 :     // concurrently, restart helps here.
     308            0 :     tli.process_msg(&pe)
     309            0 :         .await
     310            0 :         .context("ProposerElected handling")?;
     311              : 
     312            0 :     recovery_stream(tli, donor, last_common_point.lsn, conf).await
     313            0 : }
     314              : 
     315              : // Pull WAL from donor, assuming handshake is already done.
     316            0 : async fn recovery_stream(
     317            0 :     tli: FullAccessTimeline,
     318            0 :     donor: &Donor,
     319            0 :     start_streaming_at: Lsn,
     320            0 :     conf: &SafeKeeperConf,
     321            0 : ) -> anyhow::Result<String> {
     322              :     // TODO: pass auth token
     323            0 :     let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
     324            0 :     let mut cfg = cfg.to_tokio_postgres_config();
     325            0 :     // It will make safekeeper give out not committed WAL (up to flush_lsn).
     326            0 :     cfg.application_name(&format!("safekeeper_{}", conf.my_id));
     327            0 :     cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
     328            0 : 
     329            0 :     let connect_timeout = Duration::from_millis(10000);
     330            0 :     let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls))
     331            0 :         .await
     332              :     {
     333            0 :         Ok(client_and_conn) => client_and_conn?,
     334            0 :         Err(_elapsed) => {
     335            0 :             bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open");
     336              :         }
     337              :     };
     338            0 :     trace!("connected to {:?}", donor);
     339              : 
     340              :     // The connection object performs the actual communication with the
     341              :     // server, spawn it off to run on its own.
     342            0 :     let ttid = tli.ttid;
     343            0 :     tokio::spawn(async move {
     344            0 :         if let Err(e) = connection
     345            0 :             .instrument(info_span!("recovery task connection poll", ttid = %ttid))
     346            0 :             .await
     347              :         {
     348              :             // This logging isn't very useful as error is anyway forwarded to client.
     349            0 :             trace!(
     350            0 :                 "tokio_postgres connection object finished with error: {}",
     351              :                 e
     352              :             );
     353            0 :         }
     354            0 :     });
     355            0 : 
     356            0 :     let query = format!(
     357            0 :         "START_REPLICATION PHYSICAL {} (term='{}')",
     358            0 :         start_streaming_at, donor.term
     359            0 :     );
     360              : 
     361            0 :     let copy_stream = client.copy_both_simple(&query).await?;
     362            0 :     let physical_stream = ReplicationStream::new(copy_stream);
     363            0 : 
     364            0 :     // As in normal walreceiver, do networking and writing to disk in parallel.
     365            0 :     let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
     366            0 :     let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
     367            0 :     let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
     368              : 
     369            0 :     let res = tokio::select! {
     370              :         r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
     371            0 :         r = read_replies(reply_rx, donor.term) => r.map(|()| None),
     372              :     };
     373              : 
     374              :     // Join the spawned WalAcceptor. At this point chans to/from it passed to
     375              :     // network routines are dropped, so it will exit as soon as it touches them.
     376            0 :     match wa.await {
     377              :         Ok(Ok(())) => {
     378              :             // WalAcceptor finished normally, termination reason is different
     379            0 :             match res {
     380            0 :                 Ok(Some(success_desc)) => Ok(success_desc),
     381            0 :                 Ok(None) => bail!("unexpected recovery end without error/success"), // can't happen
     382            0 :                 Err(e) => Err(e), // network error or term change
     383              :             }
     384              :         }
     385            0 :         Ok(Err(e)) => Err(e), // error while processing message
     386            0 :         Err(e) => bail!("WalAcceptor panicked: {}", e),
     387              :     }
     388            0 : }
     389              : 
     390              : // Perform network part of streaming: read data and push it to msg_tx, send KA
     391              : // to make sender hear from us. If there is nothing coming for a while, check
     392              : // for termination.
     393              : // Returns
     394              : // - Ok(None) if channel to WalAcceptor closed -- its task should return error.
     395              : // - Ok(Some(String)) if recovery successfully completed.
     396              : // - Err if error happened while reading/writing to socket.
     397            0 : async fn network_io(
     398            0 :     physical_stream: ReplicationStream,
     399            0 :     msg_tx: Sender<ProposerAcceptorMessage>,
     400            0 :     donor: Donor,
     401            0 :     tli: FullAccessTimeline,
     402            0 :     conf: SafeKeeperConf,
     403            0 : ) -> anyhow::Result<Option<String>> {
     404            0 :     let mut physical_stream = pin!(physical_stream);
     405            0 :     let mut last_received_lsn = Lsn::INVALID;
     406            0 :     // tear down connection if no data arrives withing this period
     407            0 :     let no_data_timeout = Duration::from_millis(30000);
     408              : 
     409              :     loop {
     410            0 :         let msg = match timeout(no_data_timeout, physical_stream.next()).await {
     411            0 :             Ok(next) => match next {
     412            0 :                 None => bail!("unexpected end of replication stream"),
     413            0 :                 Some(msg) => msg.context("get replication message")?,
     414              :             },
     415            0 :             Err(_) => bail!("no message received within {:?}", no_data_timeout),
     416              :         };
     417              : 
     418            0 :         match msg {
     419            0 :             ReplicationMessage::XLogData(xlog_data) => {
     420            0 :                 let ar_hdr = AppendRequestHeader {
     421            0 :                     term: donor.term,
     422            0 :                     term_start_lsn: Lsn::INVALID, // unused
     423            0 :                     begin_lsn: Lsn(xlog_data.wal_start()),
     424            0 :                     end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
     425            0 :                     commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
     426            0 :                     truncate_lsn: Lsn::INVALID, // do not attempt to advance
     427            0 :                     proposer_uuid: [0; 16],
     428            0 :                 };
     429            0 :                 let ar = AppendRequest {
     430            0 :                     h: ar_hdr,
     431            0 :                     wal_data: xlog_data.into_data(),
     432            0 :                 };
     433            0 :                 trace!(
     434            0 :                     "processing AppendRequest {}-{}, len {}",
     435            0 :                     ar.h.begin_lsn,
     436            0 :                     ar.h.end_lsn,
     437            0 :                     ar.wal_data.len()
     438              :                 );
     439            0 :                 last_received_lsn = ar.h.end_lsn;
     440            0 :                 if msg_tx
     441            0 :                     .send(ProposerAcceptorMessage::AppendRequest(ar))
     442            0 :                     .await
     443            0 :                     .is_err()
     444              :                 {
     445            0 :                     return Ok(None); // chan closed, WalAcceptor terminated
     446            0 :                 }
     447              :             }
     448              :             ReplicationMessage::PrimaryKeepAlive(_) => {
     449              :                 // keepalive means nothing is being streamed for a while. Check whether we need to stop.
     450            0 :                 let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
     451              :                 // do current donors still contain one we currently connected to?
     452            0 :                 if !recovery_needed_info
     453            0 :                     .donors
     454            0 :                     .iter()
     455            0 :                     .any(|d| d.sk_id == donor.sk_id)
     456              :                 {
     457              :                     // Most likely it means we are caughtup.
     458              :                     // note: just exiting makes tokio_postgres send CopyFail to the far end.
     459            0 :                     return Ok(Some(format!(
     460            0 :                         "terminating at {} as connected safekeeper {} with term {} is not a donor anymore: {}",
     461            0 :                         last_received_lsn, donor.sk_id, donor.term, recovery_needed_info
     462            0 :                     )));
     463            0 :                 }
     464              :             }
     465            0 :             _ => {}
     466              :         }
     467              :         // Send reply to each message to keep connection alive. Ideally we
     468              :         // should do that once in a while instead, but this again requires
     469              :         // stream split or similar workaround, and recovery is anyway not that
     470              :         // performance critical.
     471              :         //
     472              :         // We do not know here real write/flush LSNs (need to take mutex again
     473              :         // or check replies which are read in different future), but neither
     474              :         // sender much cares about them, so just send last received.
     475            0 :         physical_stream
     476            0 :             .as_mut()
     477            0 :             .standby_status_update(
     478            0 :                 PgLsn::from(last_received_lsn.0),
     479            0 :                 PgLsn::from(last_received_lsn.0),
     480            0 :                 PgLsn::from(last_received_lsn.0),
     481            0 :                 SystemTime::now(),
     482            0 :                 0,
     483            0 :             )
     484            0 :             .await?;
     485              :     }
     486            0 : }
     487              : 
     488              : // Read replies from WalAcceptor. We are not interested much in sending them to
     489              : // donor safekeeper, so don't route them anywhere. However, we should check if
     490              : // term changes and exit if it does.
     491              : // Returns Ok(()) if channel closed, Err in case of term change.
     492            0 : async fn read_replies(
     493            0 :     mut reply_rx: Receiver<AcceptorProposerMessage>,
     494            0 :     donor_term: Term,
     495            0 : ) -> anyhow::Result<()> {
     496              :     loop {
     497            0 :         match reply_rx.recv().await {
     498            0 :             Some(msg) => {
     499            0 :                 if let AcceptorProposerMessage::AppendResponse(ar) = msg {
     500            0 :                     if ar.term != donor_term {
     501            0 :                         bail!("donor term changed from {} to {}", donor_term, ar.term);
     502            0 :                     }
     503            0 :                 }
     504              :             }
     505            0 :             None => return Ok(()), // chan closed, WalAcceptor terminated
     506              :         }
     507              :     }
     508            0 : }
         |