LCOV - 8b13a09a5c233d98abd4a0d3e59157e7db16d6fd.info

LCOV - code coverage report

Current view:	top level - safekeeper/src - send_wal.rs (source / functions)		Coverage	Total	Hit
Test:	8b13a09a5c233d98abd4a0d3e59157e7db16d6fd.info	Lines:	19.4 %	562	109
Test Date:	2024-11-21 10:53:51	Functions:	9.3 %	118	11

            Line data    Source code

       1              : //! This module implements the streaming side of replication protocol, starting
       2              : //! with the "START_REPLICATION" message, and registry of walsenders.
       3              : 
       4              : use crate::handler::SafekeeperPostgresHandler;
       5              : use crate::metrics::RECEIVED_PS_FEEDBACKS;
       6              : use crate::receive_wal::WalReceivers;
       7              : use crate::safekeeper::{Term, TermLsn};
       8              : use crate::send_interpreted_wal::InterpretedWalSender;
       9              : use crate::timeline::WalResidentTimeline;
      10              : use crate::wal_reader_stream::WalReaderStreamBuilder;
      11              : use crate::wal_service::ConnectionId;
      12              : use crate::wal_storage::WalReader;
      13              : use crate::GlobalTimelines;
      14              : use anyhow::{bail, Context as AnyhowContext};
      15              : use bytes::Bytes;
      16              : use futures::future::Either;
      17              : use parking_lot::Mutex;
      18              : use postgres_backend::PostgresBackend;
      19              : use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
      20              : use postgres_ffi::get_current_timestamp;
      21              : use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
      22              : use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
      23              : use serde::{Deserialize, Serialize};
      24              : use tokio::io::{AsyncRead, AsyncWrite};
      25              : use utils::failpoint_support;
      26              : use utils::id::TenantTimelineId;
      27              : use utils::pageserver_feedback::PageserverFeedback;
      28              : use utils::postgres_client::PostgresClientProtocol;
      29              : 
      30              : use std::cmp::{max, min};
      31              : use std::net::SocketAddr;
      32              : use std::str;
      33              : use std::sync::Arc;
      34              : use std::time::Duration;
      35              : use tokio::sync::watch::Receiver;
      36              : use tokio::time::timeout;
      37              : use tracing::*;
      38              : use utils::{bin_ser::BeSer, lsn::Lsn};
      39              : 
      40              : // See: https://www.postgresql.org/docs/13/protocol-replication.html
                        // Presumably the leading tag byte of each kind of feedback message a
                        // replication client can send; the code matching on them is not in view here.
      41              : const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h';
      42              : const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
      43              : // neon extension of replication protocol
      44              : const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z';
      45              : 
                        // Transaction id as stored in hot standby feedback. The value 0 is reserved
                        // as "invalid" (INVALID_FULL_TRANSACTION_ID).
      46              : type FullTransactionId = u64;
      47              : 
      48              : /// Hot standby feedback received from replica
      49            0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
      50              : pub struct HotStandbyFeedback {
                            /// Timestamp attached to the feedback; 0 in the empty value.
      51              :     pub ts: TimestampTz,
                            /// Reported xmin; `INVALID_FULL_TRANSACTION_ID` (0) means "not set".
      52              :     pub xmin: FullTransactionId,
                            /// Reported catalog xmin; `INVALID_FULL_TRANSACTION_ID` (0) means "not set".
      53              :     pub catalog_xmin: FullTransactionId,
      54              : }
      55              : 
                        /// Sentinel for "no transaction id". Feedback fields holding this value are
                        /// skipped when the per-timeline aggregate is computed.
      56              : const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0;
      57              : 
      58              : impl HotStandbyFeedback {
                            /// Returns feedback with every field zeroed, i.e. both xmins invalid.
      59         3266 :     pub fn empty() -> HotStandbyFeedback {
      60         3266 :         HotStandbyFeedback {
      61         3266 :             ts: 0,
      62         3266 :             xmin: 0,
      63         3266 :             catalog_xmin: 0,
      64         3266 :         }
      65         3266 :     }
      66              : }
      67              : 
      68              : /// Standby status update
      69            0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
      70              : pub struct StandbyReply {
      71              :     pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby.
      72              :     pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby.
      73              :     pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby.
      74              :     pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01.
      75              :     pub reply_requested: bool,
      76              : }
      77              : 
      78              : impl StandbyReply {
                            /// Returns a reply in which nothing has been reported: every LSN is
                            /// `Lsn::INVALID`, `reply_ts` is 0 and `reply_requested` is false. These
                            /// are the values the aggregation code treats as "absent".
      79            8 :     fn empty() -> Self {
      80            8 :         StandbyReply {
      81            8 :             write_lsn: Lsn::INVALID,
      82            8 :             flush_lsn: Lsn::INVALID,
      83            8 :             apply_lsn: Lsn::INVALID,
      84            8 :             reply_ts: 0,
      85            8 :             reply_requested: false,
      86            8 :         }
      87            8 :     }
      88              : }
      89              : 
      90            0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
                        // Everything a standby has reported on one connection: its latest status
                        // update together with its latest hot standby feedback.
      91              : pub struct StandbyFeedback {
      92              :     pub reply: StandbyReply,
      93              :     pub hs_feedback: HotStandbyFeedback,
      94              : }
      95              : 
      96              : impl StandbyFeedback {
                            /// Returns feedback whose `reply` and `hs_feedback` are both empty.
      97            2 :     pub fn empty() -> Self {
      98            2 :         StandbyFeedback {
      99            2 :             reply: StandbyReply::empty(),
     100            2 :             hs_feedback: HotStandbyFeedback::empty(),
     101            2 :         }
     102            2 :     }
     103              : }
     104              : 
     105              : /// WalSenders registry. Timeline holds it (wrapped in Arc).
     106              : pub struct WalSenders {
                            // Slot table and aggregated feedback; read and written only under this lock.
     107              :     mutex: Mutex<WalSendersShared>,
                            // Every recorded pageserver feedback is forwarded here via
                            // `broadcast_pageserver_feedback`.
     108              :     walreceivers: Arc<WalReceivers>,
     109              : }
     110              : 
     111              : impl WalSenders {
     112            0 :     pub fn new(walreceivers: Arc<WalReceivers>) -> Arc<WalSenders> {
                                // The registry starts with no slots and empty aggregates (see
                                // WalSendersShared::new); it is handed out already wrapped in an Arc.
     113            0 :         Arc::new(WalSenders {
     114            0 :             mutex: Mutex::new(WalSendersShared::new()),
     115            0 :             walreceivers,
     116            0 :         })
     117            0 :     }
     118              : 
     119              :     /// Register new walsender. Returned guard provides access to the slot and
     120              :     /// automatically deregisters in Drop.
                            ///
                            /// The slot starts out with empty pageserver feedback; it switches to the
                            /// standby variant once a standby reply or hot standby feedback is recorded.
                            /// The returned id is the slot's index, and freed indices are reused.
     121            0 :     fn register(
     122            0 :         self: &Arc<WalSenders>,
     123            0 :         ttid: TenantTimelineId,
     124            0 :         addr: SocketAddr,
     125            0 :         conn_id: ConnectionId,
     126            0 :         appname: Option<String>,
     127            0 :     ) -> WalSenderGuard {
                                // `slots` borrows from the lock guard temporary, which therefore stays
                                // alive (and the registry locked) for the rest of this function.
     128            0 :         let slots = &mut self.mutex.lock().slots;
     129            0 :         let walsender_state = WalSenderState {
     130            0 :             ttid,
     131            0 :             addr,
     132            0 :             conn_id,
     133            0 :             appname,
     134            0 :             feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
     135            0 :         };
     136              :         // find empty slot or create new one
     137            0 :         let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
     138            0 :             slots[pos] = Some(walsender_state);
     139            0 :             pos
     140              :         } else {
     141            0 :             let pos = slots.len();
     142            0 :             slots.push(Some(walsender_state));
     143            0 :             pos
     144              :         };
     145            0 :         WalSenderGuard {
     146            0 :             id: pos,
     147            0 :             walsenders: self.clone(),
     148            0 :         }
     149            0 :     }
     150              : 
     151              :     /// Get state of all walsenders: a cloned snapshot of every occupied slot
                            /// (free slots are skipped), taken while holding the registry lock.
     152            0 :     pub fn get_all(self: &Arc<WalSenders>) -> Vec<WalSenderState> {
     153            0 :         self.mutex.lock().slots.iter().flatten().cloned().collect()
     154            0 :     }
     155              : 
     156              :     /// Get LSN of the most lagging pageserver receiver, i.e. the minimum
     157              :     /// `last_received_lsn` over all slots holding pageserver feedback.
                            ///
                            /// Slots holding standby feedback are ignored. Returns None when no occupied
                            /// slot holds pageserver feedback. A freshly registered slot does count,
                            /// since it is created with empty pageserver feedback.
     158            0 :     pub fn laggard_lsn(self: &Arc<WalSenders>) -> Option<Lsn> {
     159            0 :         self.mutex
     160            0 :             .lock()
     161            0 :             .slots
     162            0 :             .iter()
     163            0 :             .flatten()
     164            0 :             .filter_map(|s| match s.feedback {
     165            0 :                 ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn),
     166            0 :                 ReplicationFeedback::Standby(_) => None,
     167            0 :             })
     168            0 :             .min()
     169            0 :     }
     170              : 
     171              :     /// Returns total counter of pageserver feedbacks received and last feedback.
                            /// Both values are read under a single lock acquisition, so they belong together.
     172            0 :     pub fn get_ps_feedback_stats(self: &Arc<WalSenders>) -> (u64, PageserverFeedback) {
     173            0 :         let shared = self.mutex.lock();
     174            0 :         (shared.ps_feedback_counter, shared.last_ps_feedback)
     175            0 :     }
     176              : 
     177              :     /// Get aggregated hot standby feedback (we send it to compute).
                            /// This is a copy of the value last stored by `update_reply_feedback`; it
                            /// contains the aggregated standby reply as well as the hot standby part.
     178            0 :     pub fn get_hotstandby(self: &Arc<WalSenders>) -> StandbyFeedback {
     179            0 :         self.mutex.lock().agg_standby_feedback
     180            0 :     }
     181              : 
     182              :     /// Record new pageserver feedback, update aggregated values.
                            ///
                            /// Stores the feedback in slot `id` (which must be occupied), remembers it
                            /// as the last pageserver feedback, bumps the counter and the metric, and
                            /// forwards the feedback to the walreceivers.
     183            0 :     fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
     184            0 :         let mut shared = self.mutex.lock();
     185            0 :         shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
     186            0 :         shared.last_ps_feedback = *feedback;
     187            0 :         shared.ps_feedback_counter += 1;
                                // Release the registry lock before touching the metric and broadcasting.
     188            0 :         drop(shared);
     189            0 : 
     190            0 :         RECEIVED_PS_FEEDBACKS.inc();
     191            0 : 
     192            0 :         // send feedback to connected walproposers
     193            0 :         self.walreceivers.broadcast_pageserver_feedback(*feedback);
     194            0 :     }
     195              : 
     196              :     /// Record standby reply.
                            ///
                            /// Slot `id` must be occupied. If the slot already holds standby feedback
                            /// only its `reply` is replaced; if it still holds pageserver feedback it is
                            /// converted to standby feedback with empty hot standby feedback.
                            ///
                            /// NOTE(review): unlike `record_hs_feedback`, this does not call
                            /// `update_reply_feedback`, so the aggregated reply only changes on the next
                            /// hot standby feedback or unregister -- confirm that this is intended.
     197            0 :     fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
     198            0 :         let mut shared = self.mutex.lock();
     199            0 :         let slot = shared.get_slot_mut(id);
     200            0 :         debug!(
     201            0 :             "Record standby reply: ts={} apply_lsn={}",
     202              :             reply.reply_ts, reply.apply_lsn
     203              :         );
     204            0 :         match &mut slot.feedback {
     205            0 :             ReplicationFeedback::Standby(sf) => sf.reply = *reply,
     206              :             ReplicationFeedback::Pageserver(_) => {
     207            0 :                 slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
     208            0 :                     reply: *reply,
     209            0 :                     hs_feedback: HotStandbyFeedback::empty(),
     210            0 :                 })
     211              :             }
     212              :         }
     213            0 :     }
     214              : 
     215              :     /// Record hot standby feedback, update aggregated value.
                            ///
                            /// Slot `id` must be occupied. If the slot already holds standby feedback
                            /// only its `hs_feedback` is replaced; if it still holds pageserver feedback
                            /// it is converted to standby feedback with an empty reply.
     216            0 :     fn record_hs_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &HotStandbyFeedback) {
     217            0 :         let mut shared = self.mutex.lock();
     218            0 :         let slot = shared.get_slot_mut(id);
     219            0 :         match &mut slot.feedback {
     220            0 :             ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback,
     221              :             ReplicationFeedback::Pageserver(_) => {
     222            0 :                 slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
     223            0 :                     reply: StandbyReply::empty(),
     224            0 :                     hs_feedback: *feedback,
     225            0 :                 })
     226              :             }
     227              :         }
                                // Recompute the aggregate over all slots while still holding the lock.
     228            0 :         shared.update_reply_feedback();
     229            0 :     }
     230              : 
     231              :     /// Get remote_consistent_lsn reported by the pageserver. Returns None if
     232              :     /// client is not pageserver, i.e. the slot holds standby feedback.
                            ///
                            /// Slot `id` must be occupied, otherwise the slot lookup panics.
     233            0 :     pub fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
     234            0 :         let shared = self.mutex.lock();
     235            0 :         let slot = shared.get_slot(id);
     236            0 :         match slot.feedback {
     237            0 :             ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
     238            0 :             _ => None,
     239              :         }
     240            0 :     }
     241              : 
     242              :     /// Unregister walsender: free slot `id` for reuse and recompute the
                            /// aggregated standby feedback without it. Called from WalSenderGuard's Drop.
     243            0 :     fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
     244            0 :         let mut shared = self.mutex.lock();
     245            0 :         shared.slots[id] = None;
     246            0 :         shared.update_reply_feedback();
     247            0 :     }
     248              : }
     249              : 
     250              : struct WalSendersShared {
     251              :     // aggregated over all walsenders value
     252              :     agg_standby_feedback: StandbyFeedback,
     253              :     // last feedback ever received from any pageserver, empty if none
     254              :     last_ps_feedback: PageserverFeedback,
     255              :     // total counter of pageserver feedbacks received
     256              :     ps_feedback_counter: u64,
                            // one entry per walsender id; None marks a free slot that register() may reuse
     257              :     slots: Vec<Option<WalSenderState>>,
     258              : }
     259              : 
     260              : impl WalSendersShared {
     261            2 :     fn new() -> Self {
                                // No slots yet; aggregates and last pageserver feedback start empty.
     262            2 :         WalSendersShared {
     263            2 :             agg_standby_feedback: StandbyFeedback::empty(),
     264            2 :             last_ps_feedback: PageserverFeedback::empty(),
     265            2 :             ps_feedback_counter: 0,
     266            2 :             slots: Vec::new(),
     267            2 :         }
     268            2 :     }
     269              : 
     270              :     /// Get content of provided id slot, it must exist.
                            ///
                            /// # Panics
                            /// Panics if `id` is out of range or the slot is free.
     271            0 :     fn get_slot(&self, id: WalSenderId) -> &WalSenderState {
     272            0 :         self.slots[id].as_ref().expect("walsender doesn't exist")
     273            0 :     }
     274              : 
     275              :     /// Get mut content of provided id slot, it must exist.
                            ///
                            /// # Panics
                            /// Panics if `id` is out of range or the slot is free.
     276            0 :     fn get_slot_mut(&mut self, id: WalSenderId) -> &mut WalSenderState {
     277            0 :         self.slots[id].as_mut().expect("walsender doesn't exist")
     278            0 :     }
     279              : 
     280              :     /// Update aggregated hot standby and normal reply feedbacks from scratch,
     281              :     /// looking only at slots that hold standby feedback.
                            ///
                            /// Hot standby part: `xmin` and `catalog_xmin` are each the minimum over
                            /// the valid (non-zero) values; `ts` is the maximum `ts` among feedbacks that
                            /// carried a valid xmin or catalog_xmin.
                            /// Reply part: `write_lsn`, `flush_lsn`, `apply_lsn` and `reply_ts` are each
                            /// the minimum over the values that are set; `reply_requested` stays false.
                            /// A field nobody reported keeps its empty value (0 / `Lsn::INVALID`).
     282            2 :     fn update_reply_feedback(&mut self) {
     283            2 :         let mut agg = HotStandbyFeedback::empty();
     284            2 :         let mut reply_agg = StandbyReply::empty();
     285            4 :         for ws_state in self.slots.iter().flatten() {
     286            4 :             if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
     287            4 :                 let hs_feedback = standby_feedback.hs_feedback;
     288            4 :                 // doing Option math like op1.iter().chain(op2.iter()).min()
     289            4 :                 // would be nicer, but we serialize/deserialize this struct
     290            4 :                 // directly, so leave as is for now
     291            4 :                 if hs_feedback.xmin != INVALID_FULL_TRANSACTION_ID {
     292            2 :                     if agg.xmin != INVALID_FULL_TRANSACTION_ID {
     293            1 :                         agg.xmin = min(agg.xmin, hs_feedback.xmin);
     294            1 :                     } else {
     295            1 :                         agg.xmin = hs_feedback.xmin;
     296            1 :                     }
     297            2 :                     agg.ts = max(agg.ts, hs_feedback.ts);
     298            2 :                 }
     299            4 :                 if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
     300            0 :                     if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
     301            0 :                         agg.catalog_xmin = min(agg.catalog_xmin, hs_feedback.catalog_xmin);
     302            0 :                     } else {
     303            0 :                         agg.catalog_xmin = hs_feedback.catalog_xmin;
     304            0 :                     }
     305            0 :                     agg.ts = max(agg.ts, hs_feedback.ts);
     306            4 :                 }
                                        // Same "min over the values that are set" rule for each reply field.
     307            4 :                 let reply = standby_feedback.reply;
     308            4 :                 if reply.write_lsn != Lsn::INVALID {
     309            0 :                     if reply_agg.write_lsn != Lsn::INVALID {
     310            0 :                         reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn);
     311            0 :                     } else {
     312            0 :                         reply_agg.write_lsn = reply.write_lsn;
     313            0 :                     }
     314            4 :                 }
     315            4 :                 if reply.flush_lsn != Lsn::INVALID {
     316            0 :                     if reply_agg.flush_lsn != Lsn::INVALID {
     317            0 :                         reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn);
     318            0 :                     } else {
     319            0 :                         reply_agg.flush_lsn = reply.flush_lsn;
     320            0 :                     }
     321            4 :                 }
     322            4 :                 if reply.apply_lsn != Lsn::INVALID {
     323            0 :                     if reply_agg.apply_lsn != Lsn::INVALID {
     324            0 :                         reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn);
     325            0 :                     } else {
     326            0 :                         reply_agg.apply_lsn = reply.apply_lsn;
     327            0 :                     }
     328            4 :                 }
     329            4 :                 if reply.reply_ts != 0 {
     330            0 :                     if reply_agg.reply_ts != 0 {
     331            0 :                         reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts);
     332            0 :                     } else {
     333            0 :                         reply_agg.reply_ts = reply.reply_ts;
     334            0 :                     }
     335            4 :                 }
     336            0 :             }
     337              :         }
     338            2 :         self.agg_standby_feedback = StandbyFeedback {
     339            2 :             reply: reply_agg,
     340            2 :             hs_feedback: agg,
     341            2 :         };
     342            2 :     }
     343              : }
     344              : 
     345              : // State of one registered walsender (one occupied registry slot).
                        // Serialized is used only for pretty printing in json.
     346            0 : #[derive(Debug, Clone, Serialize, Deserialize)]
     347              : pub struct WalSenderState {
     348              :     ttid: TenantTimelineId,
                            // peer address of the connection, as passed to register()
     349              :     addr: SocketAddr,
     350              :     conn_id: ConnectionId,
     351              :     // postgres application_name
     352              :     appname: Option<String>,
                            // latest feedback recorded for this walsender; starts as empty pageserver feedback
     353              :     feedback: ReplicationFeedback,
     354              : }
     355              : 
     356              : // Receiver is either pageserver or regular standby, which have different
     357              : // feedbacks. A slot is registered as Pageserver and is switched to Standby
                        // the first time a standby reply or hot standby feedback is recorded for it.
     358            0 : #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
     359              : enum ReplicationFeedback {
     360              :     Pageserver(PageserverFeedback),
     361              :     Standby(StandbyFeedback),
     362              : }
     363              : 
     364              : // id of the occupied slot in WalSenders to access it (and save in the
     365              : // WalSenderGuard). We could give Arc directly to the slot, but there is not
     366              : // much sense in that as values aggregation which is performed on each feedback
     367              : // receival iterates over all walsenders.
                        // Concretely it is the index into the slots vector, so an id can be reused
                        // after its walsender unregisters.
     368              : pub type WalSenderId = usize;
     369              : 
     370              : /// Scope guard to access slot in WalSenders registry and unregister from it in
     371              : /// Drop.
     372              : pub struct WalSenderGuard {
                            // index of the slot this guard owns in the registry
     373              :     id: WalSenderId,
                            // registry the slot lives in; kept alive by this Arc until the guard is dropped
     374              :     walsenders: Arc<WalSenders>,
     375              : }
     376              : 
     377              : impl WalSenderGuard {
                            /// Id (slot index) under which this walsender is registered.
     378            0 :     pub fn id(&self) -> WalSenderId {
     379            0 :         self.id
     380            0 :     }
     381              : 
                            /// Registry this walsender is registered in.
     382            0 :     pub fn walsenders(&self) -> &Arc<WalSenders> {
     383            0 :         &self.walsenders
     384            0 :     }
     385              : }
     386              : 
     387              : impl Drop for WalSenderGuard {
                            // Frees the slot and recomputes the aggregated standby feedback. This takes
                            // the registry lock, so the guard must not be dropped while that lock is held.
     388            0 :     fn drop(&mut self) {
     389            0 :         self.walsenders.unregister(self.id);
     390            0 :     }
     391              : }
     392              : 
     393              : impl SafekeeperPostgresHandler {
     394              :     /// Wrapper around handle_start_replication_guts handling result. Error is
     395              :     /// handled here while we're still in walsender ttid span; with API
     396              :     /// extension, this can probably be moved into postgres_backend.
                            ///
                            /// Returns an error only if the timeline lookup or taking the WAL residence
                            /// guard fails. A stream end reported by the guts is handed to
                            /// `handle_copy_stream_end` inside a span carrying the timeline's term,
                            /// last_log_term, flush_lsn and commit_lsn, after which `Ok(())` is returned.
     397            0 :     pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin>(
     398            0 :         &mut self,
     399            0 :         pgb: &mut PostgresBackend<IO>,
     400            0 :         start_pos: Lsn,
     401            0 :         term: Option<Term>,
     402            0 :     ) -> Result<(), QueryError> {
     403            0 :         let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
     404            0 :         let residence_guard = tli.wal_residence_guard().await?;
     405              : 
     406            0 :         if let Err(end) = self
     407            0 :             .handle_start_replication_guts(pgb, start_pos, term, residence_guard)
     408            0 :             .await
     409              :         {
     410            0 :             let info = tli.get_safekeeper_info(&self.conf).await;
     411              :             // Log the result and probably send it to the client, closing the stream.
                                    // commit_lsn must be taken from info.commit_lsn; it used to repeat
                                    // info.flush_lsn, which made the two span fields always identical.
     412            0 :             pgb.handle_copy_stream_end(end)
     413            0 :             .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn)))
     414            0 :             .await;
     415            0 :         }
     416            0 :         Ok(())
     417            0 :     }
     418              : 
     419            0 :     pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin>(
     420            0 :         &mut self,
     421            0 :         pgb: &mut PostgresBackend<IO>,
     422            0 :         start_pos: Lsn,
     423            0 :         term: Option<Term>,
     424            0 :         tli: WalResidentTimeline,
     425            0 :     ) -> Result<(), CopyStreamHandlerEnd> {
                                // Overview: register this connection as a walsender, pick the LSN watch
                                // that bounds what may be sent, switch the connection to copy-both mode,
                                // then run the protocol-specific sender concurrently with the reader of
                                // client feedback until either of them finishes or the timeline is
                                // cancelled.
     426            0 :         let appname = self.appname.clone();
     427            0 : 
     428            0 :         // Use a guard object to remove our entry from the timeline when we are done.
                                // It is shared (Arc) between the sender and the reply reader created below.
     429            0 :         let ws_guard = Arc::new(tli.get_walsenders().register(
     430            0 :             self.ttid,
     431            0 :             *pgb.get_peer_addr(),
     432            0 :             self.conn_id,
     433            0 :             self.appname.clone(),
     434            0 :         ));
     435              : 
     436              :         // Walsender can operate in one of two modes which we select by
     437              :         // application_name: give only committed WAL (used by pageserver) or all
     438              :         // existing WAL (up to flush_lsn, used by walproposer or peer recovery).
     439              :         // The second case is always driven by a consensus leader which term
     440              :         // must be supplied.
                                // NOTE(review): in this function the mode is chosen by whether `term`
                                // is Some, not by inspecting application_name directly.
     441            0 :         let end_watch = if term.is_some() {
     442            0 :             EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
     443              :         } else {
     444            0 :             EndWatch::Commit(tli.get_commit_lsn_watch_rx())
     445              :         };
     446              :         // we don't check term here; it will be checked on first waiting/WAL reading anyway.
     447            0 :         let end_pos = end_watch.get();
     448            0 : 
                                // Starting past the currently available WAL is only logged, not rejected.
     449            0 :         if end_pos < start_pos {
     450            0 :             warn!(
     451            0 :                 "requested start_pos {} is ahead of available WAL end_pos {}",
     452              :                 start_pos, end_pos
     453              :             );
     454            0 :         }
     455              : 
     456            0 :         info!(
     457            0 :             "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}, protocol={:?}",
     458              :             start_pos,
     459              :             end_pos,
     460            0 :             matches!(end_watch, EndWatch::Flush(_)),
     461              :             appname,
     462            0 :             self.protocol(),
     463              :         );
     464              : 
     465              :         // switch to copy
     466            0 :         pgb.write_message(&BeMessage::CopyBothResponse).await?;
     467              : 
     468            0 :         let wal_reader = tli.get_walreader(start_pos).await?;
     469              : 
     470              :         // Split to concurrently receive and send data; replies are generally
     471              :         // not synchronized with sends, so this avoids deadlocks.
     472            0 :         let reader = pgb.split().context("START_REPLICATION split")?;
     473              : 
                                // Build the sending future for the negotiated protocol. Either lets the
                                // two differently typed futures be awaited through a single variable.
     474            0 :         let send_fut = match self.protocol() {
     475              :             PostgresClientProtocol::Vanilla => {
     476            0 :                 let sender = WalSender {
     477            0 :                     pgb,
     478            0 :                     // should succeed since we're already holding another guard
     479            0 :                     tli: tli.wal_residence_guard().await?,
     480            0 :                     appname,
     481            0 :                     start_pos,
     482            0 :                     end_pos,
     483            0 :                     term,
     484            0 :                     end_watch,
     485            0 :                     ws_guard: ws_guard.clone(),
     486            0 :                     wal_reader,
     487            0 :                     send_buf: vec![0u8; MAX_SEND_SIZE],
     488            0 :                 };
     489            0 : 
     490            0 :                 Either::Left(sender.run())
     491              :             }
     492              :             PostgresClientProtocol::Interpreted {
     493            0 :                 format,
     494            0 :                 compression,
     495              :             } => {
                                        // presumably turns a numeric version such as 160000 into the major
                                        // version number -- TODO confirm the encoding of pg_version
     496            0 :                 let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000;
     497            0 :                 let end_watch_view = end_watch.view();
     498            0 :                 let wal_stream_builder = WalReaderStreamBuilder {
     499            0 :                     tli: tli.wal_residence_guard().await?,
     500            0 :                     start_pos,
     501            0 :                     end_pos,
     502            0 :                     term,
     503            0 :                     end_watch,
     504            0 :                     wal_sender_guard: ws_guard.clone(),
     505            0 :                 };
     506            0 : 
     507            0 :                 let sender = InterpretedWalSender {
     508            0 :                     format,
     509            0 :                     compression,
     510            0 :                     pgb,
     511            0 :                     wal_stream_builder,
     512            0 :                     end_watch_view,
                                            // NOTE(review): panics if no shard was supplied; assumes the
                                            // interpreted protocol always comes with a shard -- confirm.
     513            0 :                     shard: self.shard.unwrap(),
     514            0 :                     pg_version,
     515            0 :                     appname,
     516            0 :                 };
     517            0 : 
     518            0 :                 Either::Right(sender.run())
     519              :             }
     520              :         };
     521              : 
     522            0 :         let tli_cancel = tli.cancel.clone();
     523            0 : 
     524            0 :         let mut reply_reader = ReplyReader {
     525            0 :             reader,
     526            0 :             ws_guard: ws_guard.clone(),
     527            0 :             tli,
     528            0 :         };
     529              : 
                                // Whichever of sender / reply reader completes first decides the result.
                                // NOTE(review): the cancellation branch returns immediately, skipping the
                                // final log line and `pgb.unsplit` below -- confirm that this is intended.
     530            0 :         let res = tokio::select! {
     531              :             // todo: add read|write .context to these errors
     532            0 :             r = send_fut => r,
     533            0 :             r = reply_reader.run() => r,
     534            0 :             _ = tli_cancel.cancelled() => {
     535            0 :                 return Err(CopyStreamHandlerEnd::Cancelled);
     536              :             }
     537              :         };
     538              : 
                                // Clone the slot state so the registry lock is released before logging.
     539            0 :         let ws_state = ws_guard
     540            0 :             .walsenders
     541            0 :             .mutex
     542            0 :             .lock()
     543            0 :             .get_slot(ws_guard.id)
     544            0 :             .clone();
     545            0 :         info!(
     546            0 :             "finished streaming to {}, feedback={:?}",
     547              :             ws_state.addr, ws_state.feedback,
     548              :         );
     549              : 
     550              :         // Join pg backend back.
     551            0 :         pgb.unsplit(reply_reader.reader)?;
     552              : 
     553            0 :         res
     554            0 :     }
     555              : }
     556              : 
     557              : /// TODO(vlad): maybe lift this instead
     558              : /// Walsender streams either up to commit_lsn (normally) or flush_lsn in the
     559              : /// given term (recovery by walproposer or peer safekeeper).
     560              : #[derive(Clone)]
     561              : pub(crate) enum EndWatch {
                            /// Stream up to commit_lsn; the watched value is a bare `Lsn`.
     562              :     Commit(Receiver<Lsn>),
                            /// Stream up to flush_lsn; the watched value is a `TermLsn`, so the
                            /// term can be read alongside the LSN.
     563              :     Flush(Receiver<TermLsn>),
     564              : }
     565              : 
     566              : impl EndWatch {
                            /// Build an `EndWatchView` wrapping a clone of this watch.
     567            0 :     pub(crate) fn view(&self) -> EndWatchView {
     568            0 :         EndWatchView(self.clone())
     569            0 :     }
     570              : 
     571              :     /// Get current end of WAL.
                            ///
                            /// For `Flush` only the `lsn` part of the watched `TermLsn` is returned.
     572            0 :     pub(crate) fn get(&self) -> Lsn {
     573            0 :         match self {
     574            0 :             EndWatch::Commit(r) => *r.borrow(),
     575            0 :             EndWatch::Flush(r) => r.borrow().lsn,
     576              :         }
     577            0 :     }
     578              : 
     579              :     /// Wait for the update.
                            ///
                            /// An error returned by the underlying receiver's `changed()` is
                            /// propagated to the caller.
     580            0 :     pub(crate) async fn changed(&mut self) -> anyhow::Result<()> {
     581            0 :         match self {
     582            0 :             EndWatch::Commit(r) => r.changed().await?,
     583            0 :             EndWatch::Flush(r) => r.changed().await?,
     584              :         }
     585            0 :         Ok(())
     586            0 :     }
     587              : 
                            /// Wait until the end of WAL is strictly greater than `lsn` and return it.
                            ///
                            /// While waiting on a `Flush` watch with `client_term` set, bails out with
                            /// a "term changed" error if the watched term differs from `client_term`.
                            /// Errors from `changed()` are propagated as well. There is no timeout
                            /// here; the loop only ends with a result or an error.
     588            0 :     pub(crate) async fn wait_for_lsn(
     589            0 :         &mut self,
     590            0 :         lsn: Lsn,
     591            0 :         client_term: Option<Term>,
     592            0 :     ) -> anyhow::Result<Lsn> {
     593              :         loop {
     594            0 :             let end_pos = self.get();
     595            0 :             if end_pos > lsn {
     596            0 :                 return Ok(end_pos);
     597            0 :             }
                                    // The term is checked only when there is nothing new to return yet.
     598            0 :             if let EndWatch::Flush(rx) = &self {
     599            0 :                 let curr_term = rx.borrow().term;
     600            0 :                 if let Some(client_term) = client_term {
     601            0 :                     if curr_term != client_term {
     602            0 :                         bail!("term changed: requested {}, now {}", client_term, curr_term);
     603            0 :                     }
     604            0 :                 }
     605            0 :             }
     606            0 :             self.changed().await?;
     607              :         }
     608            0 :     }
     609              : }
     610              : 
                        /// Wrapper around a cloned `EndWatch` (built by `EndWatch::view`). Its impl
                        /// only offers `get`, i.e. reading the current end of WAL.
     611              : pub(crate) struct EndWatchView(EndWatch);
     612              : 
     613              : impl EndWatchView {
                            /// Current end of WAL, as reported by the wrapped `EndWatch::get`.
     614            0 :     pub(crate) fn get(&self) -> Lsn {
     615            0 :         self.0.get()
     616            0 :     }
     617              : }
     618              : /// A half driving sending WAL.
     619              : struct WalSender<'a, IO> {
                            // Backend connection that XLogData and keepalive messages are written to.
     620              :     pgb: &'a mut PostgresBackend<IO>,
                            // Timeline handle; used to acquire the term while reading WAL and to ask
                            // whether this walsender should stop.
     621              :     tli: WalResidentTimeline,
                            // Client application name, if any. The failpoints compare it against
                            // "replica" and "pageserver"; it is also printed in the end-of-streaming
                            // message.
     622              :     appname: Option<String>,
     623              :     // Position since which we are sending next chunk.
     624              :     start_pos: Lsn,
     625              :     // WAL up to this position is known to be locally available.
     626              :     // Usually this is the same as the latest commit_lsn, but in case of
     627              :     // walproposer recovery, this is flush_lsn.
     628              :     //
     629              :     // We send this LSN to the receiver as wal_end, so that it knows how much
     630              :     // WAL this safekeeper has. This LSN should be as fresh as possible.
     631              :     end_pos: Lsn,
     632              :     /// When streaming uncommitted part, the term the client acts as the leader
     633              :     /// in. Streaming is stopped if local term changes to a different (higher)
     634              :     /// value.
     635              :     term: Option<Term>,
     636              :     /// Watch channel receiver to learn end of available WAL (and wait for its advancement).
     637              :     end_watch: EndWatch,
                            // Identifies this walsender in the `walsenders` registry; used to look up
                            // the remote_consistent_lsn reported for it.
     638              :     ws_guard: Arc<WalSenderGuard>,
                            // Reader the WAL bytes are pulled from into `send_buf`.
     639              :     wal_reader: WalReader,
     640              :     // buffer for reading WAL into before sending it
     641              :     send_buf: Vec<u8>,
     642              : }
     643              : 
                        /// Upper bound on a single wait for new WAL in `WalSender::wait_for_lsn`.
                        /// When it expires, `wait_wal` checks for termination and sends a keepalive.
     644              : const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
     645              : 
     646              : impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
     647              :     /// Send WAL until
     648              :     /// - an error occurs
     649              :     /// - receiver is caught up and there are no computes (if streaming up to commit_lsn)
     650              :     /// - timeline's cancellation token fires
     651              :     ///
     652              :     /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
     653              :     /// convenience.
     654            0 :     async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
     655              :         loop {
     656              :             // Wait for the next portion if it is not there yet, or just
     657              :             // update our end of WAL available for sending value, we
     658              :             // communicate it to the receiver.
     659            0 :             self.wait_wal().await?;
     660            0 :             assert!(
     661            0 :                 self.end_pos > self.start_pos,
     662            0 :                 "nothing to send after waiting for WAL"
     663              :             );
     664              : 
     665              :             // try to send as much as available, capped by MAX_SEND_SIZE
     666            0 :             let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64;
     667            0 :             // if we went past the available WAL, back off
     668            0 :             if chunk_end_pos >= self.end_pos {
     669            0 :                 chunk_end_pos = self.end_pos;
     670            0 :             } else {
     671            0 :                 // If sending not up to end pos, round down to page boundary to
     672            0 :                 // avoid breaking WAL record not at page boundary, as protocol
     673            0 :                 // demands. See walsender.c (XLogSendPhysical).
     674            0 :                 chunk_end_pos = chunk_end_pos
     675            0 :                     .checked_sub(chunk_end_pos.block_offset())
     676            0 :                     .unwrap();
     677            0 :             }
                                    // Requested size of this chunk; used only to slice the buffer.
     678            0 :             let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize;
     679            0 :             let send_buf = &mut self.send_buf[..send_size];
                                    // Shadowed on purpose: from here on `send_size` is the number of
                                    // bytes actually read, assigned inside the block below.
     680              :             let send_size: usize;
     681              :             {
     682              :                 // If uncommitted part is being pulled, check that the term is
     683              :                 // still the expected one.
                                        // The guard is held only for the duration of the read below.
     684            0 :                 let _term_guard = if let Some(t) = self.term {
     685            0 :                     Some(self.tli.acquire_term(t).await?)
     686              :                 } else {
     687            0 :                     None
     688              :                 };
     689              :                 // Read WAL into buffer. send_size can be additionally capped to
     690              :                 // segment boundary here.
     691            0 :                 send_size = self.wal_reader.read(send_buf).await?
     692              :             };
                                    // Keep only the part of the buffer that was actually filled.
     693            0 :             let send_buf = &send_buf[..send_size];
     694            0 : 
     695            0 :             // and send it, while respecting Timeline::cancel
     696            0 :             let msg = BeMessage::XLogData(XLogDataBody {
     697            0 :                 wal_start: self.start_pos.0,
     698            0 :                 wal_end: self.end_pos.0,
     699            0 :                 timestamp: get_current_timestamp(),
     700            0 :                 data: send_buf,
     701            0 :             });
     702            0 :             self.pgb.write_message(&msg).await?;
     703              : 
                                    // Failpoint hook that only applies to clients whose appname is
                                    // exactly "replica".
     704            0 :             if let Some(appname) = &self.appname {
     705            0 :                 if appname == "replica" {
     706            0 :                     failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep");
     707            0 :                 }
     708            0 :             }
     709            0 :             trace!(
     710            0 :                 "sent {} bytes of WAL {}-{}",
     711            0 :                 send_size,
     712            0 :                 self.start_pos,
     713            0 :                 self.start_pos + send_size as u64
     714              :             );
                                    // Advance by what was actually read, not by what was requested.
     715            0 :             self.start_pos += send_size as u64;
     716              :         }
     717            0 :     }
     718              : 
     719              :     /// wait until we have WAL to stream, sending keepalives and checking for
     720              :     /// exit in the meanwhile
                            ///
                            /// Returns `Ok(())` once `end_pos > start_pos`. Returns
                            /// `Err(CopyStreamHandlerEnd::ServerInitiated(..))` when streaming up to
                            /// commit_lsn and the timeline says this walsender should stop. Other
                            /// errors come from waiting for WAL or from writing the keepalive.
     721            0 :     async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> {
     722              :         loop {
     723            0 :             self.end_pos = self.end_watch.get();
                                    // The failpoint sits inside a closure so that, when it fires, its
                                    // early return yields `false` from the closure instead of
                                    // returning from `wait_wal` itself. The failpoint is conditioned
                                    // on appname not being "pageserver".
     724            0 :             let have_something_to_send = (|| {
     725            0 :                 fail::fail_point!(
     726            0 :                     "sk-pause-send",
     727            0 :                     self.appname.as_deref() != Some("pageserver"),
     728            0 :                     |_| { false }
     729            0 :                 );
     730            0 :                 self.end_pos > self.start_pos
     731            0 :             })();
     732            0 : 
     733            0 :             if have_something_to_send {
     734            0 :                 trace!("got end_pos {:?}, streaming", self.end_pos);
     735            0 :                 return Ok(());
     736            0 :             }
     737              : 
     738              :             // Wait for WAL to appear, now self.end_pos == self.start_pos.
     739            0 :             if let Some(lsn) = self.wait_for_lsn().await? {
     740            0 :                 self.end_pos = lsn;
     741            0 :                 trace!("got end_pos {:?}, streaming", self.end_pos);
     742            0 :                 return Ok(());
     743            0 :             }
     744            0 : 
     745            0 :             // Timed out waiting for WAL, check for termination and send KA.
     746            0 :             // Check for termination only if we are streaming up to commit_lsn
     747            0 :             // (to pageserver).
     748            0 :             if let EndWatch::Commit(_) = self.end_watch {
     749            0 :                 if let Some(remote_consistent_lsn) = self
     750            0 :                     .ws_guard
     751            0 :                     .walsenders
     752            0 :                     .get_ws_remote_consistent_lsn(self.ws_guard.id)
     753              :                 {
     754            0 :                     if self.tli.should_walsender_stop(remote_consistent_lsn).await {
     755              :                         // Terminate if there is nothing more to send.
     756              :                         // Note that "ending streaming" part of the string is used by
     757              :                         // pageserver to identify WalReceiverError::SuccessfulCompletion,
     758              :                         // do not change this string without updating pageserver.
     759            0 :                         return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
     760            0 :                         "ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
     761            0 :                         self.appname, self.start_pos,
     762            0 :                     )));
     763            0 :                     }
     764            0 :                 }
     765            0 :             }
     766              : 
                                    // Keepalive carries our current end of WAL and asks the receiver
                                    // to reply.
     767            0 :             let msg = BeMessage::KeepAlive(WalSndKeepAlive {
     768            0 :                 wal_end: self.end_pos.0,
     769            0 :                 timestamp: get_current_timestamp(),
     770            0 :                 request_reply: true,
     771            0 :             });
     772            0 : 
     773            0 :             self.pgb.write_message(&msg).await?;
     774              :         }
     775            0 :     }
     776              : 
     777              :     /// Wait until we have available WAL > start_pos or timeout expires. Returns
     778              :     /// - Ok(Some(end_pos)) if needed lsn is successfully observed;
     779              :     /// - Ok(None) if timeout expired;
     780              :     /// - Err in case of error -- only if 1) term changed while fetching in recovery
     781              :     ///   mode 2) watch channel closed, which must never happen.
     782            0 :     async fn wait_for_lsn(&mut self) -> anyhow::Result<Option<Lsn>> {
                                // Same "sk-pause-send" failpoint as in `wait_wal`: when it fires the
                                // closure yields `true`, and we sleep for one poll interval and report
                                // a timeout without looking at the watch channel.
     783            0 :         let fp = (|| {
     784            0 :             fail::fail_point!(
     785            0 :                 "sk-pause-send",
     786            0 :                 self.appname.as_deref() != Some("pageserver"),
     787            0 :                 |_| { true }
     788            0 :             );
     789            0 :             false
     790            0 :         })();
     791            0 :         if fp {
     792            0 :             tokio::time::sleep(POLL_STATE_TIMEOUT).await;
     793            0 :             return Ok(None);
     794            0 :         }
     795              : 
                                // NOTE(review): the loop below performs the same steps as
                                // `EndWatch::wait_for_lsn(self.start_pos, self.term)`; consider
                                // calling it here to avoid keeping two copies in sync.
     796            0 :         let res = timeout(POLL_STATE_TIMEOUT, async move {
     797              :             loop {
     798            0 :                 let end_pos = self.end_watch.get();
     799            0 :                 if end_pos > self.start_pos {
     800            0 :                     return Ok(end_pos);
     801            0 :                 }
     802            0 :                 if let EndWatch::Flush(rx) = &self.end_watch {
     803            0 :                     let curr_term = rx.borrow().term;
     804            0 :                     if let Some(client_term) = self.term {
     805            0 :                         if curr_term != client_term {
     806            0 :                             bail!("term changed: requested {}, now {}", client_term, curr_term);
     807            0 :                         }
     808            0 :                     }
     809            0 :                 }
     810            0 :                 self.end_watch.changed().await?;
     811              :             }
     812            0 :         })
     813            0 :         .await;
     814              : 
                                // Outer Result: did the timeout elapse; inner Result: outcome of the
                                // wait loop.
     815            0 :         match res {
     816              :             // success
     817            0 :             Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)),
     818              :             // error inside closure
     819            0 :             Ok(Err(err)) => Err(err),
     820              :             // timeout
     821            0 :             Err(_) => Ok(None),
     822              :         }
     823            0 :     }
     824              : }
     825              : 
     826              : /// A half driving receiving replies.
     827              : struct ReplyReader<IO> {
                            // Reader the copy messages carrying feedback are read from.
     828              :     reader: PostgresBackendReader<IO>,
                            // Identifies this walsender in the `walsenders` registry when feedback
                            // is recorded.
     829              :     ws_guard: Arc<WalSenderGuard>,
                            // Timeline whose remote_consistent_lsn is updated from pageserver feedback.
     830              :     tli: WalResidentTimeline,
     831              : }
     832              : 
     833              : impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
                            /// Read copy messages in a loop and handle each one as feedback.
                            ///
                            /// The loop has no exit of its own: the function returns only when
                            /// reading or handling a message fails.
     834            0 :     async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
     835              :         loop {
     836            0 :             let msg = self.reader.read_copy_message().await?;
     837            0 :             self.handle_feedback(&msg).await?
     838              :         }
     839            0 :     }
     840              : 
                            /// Dispatch one feedback message on its first (tag) byte and record it
                            /// for this walsender.
                            ///
                            /// - hot standby feedback and standby status updates are deserialized
                            ///   from the bytes after the tag;
                            /// - pageserver feedback is parsed from the bytes after the tag and len
                            ///   bytes, and also updates the timeline's remote_consistent_lsn;
                            /// - anything else (including an empty message) is logged and ignored.
                            ///
                            /// Returns an error if deserialization fails or if a pageserver feedback
                            /// message is too short to contain its 9-byte prefix.
     841            0 :     async fn handle_feedback(&mut self, msg: &Bytes) -> anyhow::Result<()> {
     842            0 :         match msg.first().cloned() {
     843              :             Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
     844              :                 // Note: deserializing is on msg[1..] because we skip the tag byte.
     845            0 :                 let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
     846            0 :                     .context("failed to deserialize HotStandbyFeedback")?;
     847              :                 // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way:
     848              :                 // pq_sendint32(&reply_message, xmin);
     849              :                 // pq_sendint32(&reply_message, xmin_epoch);
     850              :                 // So it is two big endian 32-bit words in little-endian order!
     851            0 :                 hs_feedback.xmin = hs_feedback.xmin.rotate_left(32);
     852            0 :                 hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32);
     853            0 :                 self.ws_guard
     854            0 :                     .walsenders
     855            0 :                     .record_hs_feedback(self.ws_guard.id, &hs_feedback);
     856              :             }
     857              :             Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
     858            0 :                 let reply =
     859            0 :                     StandbyReply::des(&msg[1..]).context("failed to deserialize StandbyReply")?;
     860            0 :                 self.ws_guard
     861            0 :                     .walsenders
     862            0 :                     .record_standby_reply(self.ws_guard.id, &reply);
     863              :             }
     864              :             Some(NEON_STATUS_UPDATE_TAG_BYTE) => {
     865              :                 // pageserver sends this.
     866              :                 // Note: parsing starts at msg[9..] because we skip the tag byte and len bytes.
                                        // The message comes from the peer, so a short one must produce
                                        // an error rather than an out-of-range slice panic.
                                        let payload = msg
                                            .get(9..)
                                            .context("pageserver feedback message is too short")?;
     867            0 :                 let buf = Bytes::copy_from_slice(payload);
     868            0 :                 let ps_feedback = PageserverFeedback::parse(buf);
     869            0 : 
     870            0 :                 trace!("PageserverFeedback is {:?}", ps_feedback);
     871            0 :                 self.ws_guard
     872            0 :                     .walsenders
     873            0 :                     .record_ps_feedback(self.ws_guard.id, &ps_feedback);
     874            0 :                 self.tli
     875            0 :                     .update_remote_consistent_lsn(ps_feedback.remote_consistent_lsn)
     876            0 :                     .await;
     877              :                 // in principle new remote_consistent_lsn could allow to
     878              :                 // deactivate the timeline, but we check that regularly through
     879              :                 // broker updates, no need to do it here
     880              :             }
     881            0 :             _ => warn!("unexpected message {:?}", msg),
     882              :         }
     883            0 :         Ok(())
     884            0 :     }
     885              : }
     886              : 
     887              : #[cfg(test)]
     888              : mod tests {
     889              :     use utils::id::{TenantId, TimelineId};
     890              : 
     891              :     use super::*;
     892              : 
                            // tenant/timeline id pair built from all-zero bytes
     893            4 :     fn mock_ttid() -> TenantTimelineId {
     894            4 :         TenantTimelineId {
     895            4 :             tenant_id: TenantId::from_slice(&[0x00; 16]).unwrap(),
     896            4 :             timeline_id: TimelineId::from_slice(&[0x00; 16]).unwrap(),
     897            4 :         }
     898            4 :     }
     899              : 
                            // fixed loopback address used for every mocked walsender
     900            4 :     fn mock_addr() -> SocketAddr {
     901            4 :         "127.0.0.1:8080".parse().unwrap()
     902            4 :     }
     903              : 
     904              :     // add to wss specified feedback setting other fields to dummy values
     905            4 :     fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) {
     906            4 :         let walsender_state = WalSenderState {
     907            4 :             ttid: mock_ttid(),
     908            4 :             addr: mock_addr(),
     909            4 :             conn_id: 1,
     910            4 :             appname: None,
     911            4 :             feedback,
     912            4 :         };
     913            4 :         wss.slots.push(Some(walsender_state))
     914            4 :     }
     915              : 
     916              :     // form standby feedback with given hot standby feedback ts/xmin and the
     917              :     // rest set to dummy values.
     918            4 :     fn hs_feedback(ts: TimestampTz, xmin: FullTransactionId) -> ReplicationFeedback {
     919            4 :         ReplicationFeedback::Standby(StandbyFeedback {
     920            4 :             reply: StandbyReply::empty(),
     921            4 :             hs_feedback: HotStandbyFeedback {
     922            4 :                 ts,
     923            4 :                 xmin,
     924            4 :                 catalog_xmin: 0,
     925            4 :             },
     926            4 :         })
     927            4 :     }
     928              : 
     929              :     // test that hs aggregation works as expected
                            // a single walsender reporting an invalid xmin leaves the aggregated
                            // xmin invalid
     930              :     #[test]
     931            1 :     fn test_hs_feedback_no_valid() {
     932            1 :         let mut wss = WalSendersShared::new();
     933            1 :         push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
     934            1 :         wss.update_reply_feedback();
     935            1 :         assert_eq!(
     936            1 :             wss.agg_standby_feedback.hs_feedback.xmin,
     937            1 :             INVALID_FULL_TRANSACTION_ID
     938            1 :         );
     939            1 :     }
     940              : 
                            // with feedbacks carrying an invalid xmin, 42 and 64, the aggregated
                            // xmin is 42: the invalid value is ignored and the smaller valid one wins
     941              :     #[test]
     942            1 :     fn test_hs_feedback() {
     943            1 :         let mut wss = WalSendersShared::new();
     944            1 :         push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
     945            1 :         push_feedback(&mut wss, hs_feedback(1, 42));
     946            1 :         push_feedback(&mut wss, hs_feedback(1, 64));
     947            1 :         wss.update_reply_feedback();
     948            1 :         assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42);
     949            1 :     }
     950              : }

Generated by: LCOV version 2.1-beta