LCOV - code coverage report
Current view: top level - pageserver/src/tenant/timeline/walreceiver - walreceiver_connection.rs (source / functions) Coverage Total Hit
Test: 8ac049b474321fdc72ddcb56d7165153a1a900e8.info Lines: 87.6 % 274 240
Test Date: 2023-09-06 10:18:01 Functions: 54.8 % 42 23

            Line data    Source code
       1              : //! Actual Postgres connection handler to stream WAL to the server.
       2              : 
       3              : use std::{
       4              :     error::Error,
       5              :     pin::pin,
       6              :     str::FromStr,
       7              :     sync::Arc,
       8              :     time::{Duration, SystemTime},
       9              : };
      10              : 
      11              : use anyhow::{anyhow, Context};
      12              : use bytes::BytesMut;
      13              : use chrono::{NaiveDateTime, Utc};
      14              : use fail::fail_point;
      15              : use futures::StreamExt;
      16              : use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
      17              : use postgres_ffi::WAL_SEGMENT_SIZE;
      18              : use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError};
      19              : use postgres_protocol::message::backend::ReplicationMessage;
      20              : use postgres_types::PgLsn;
      21              : use tokio::{select, sync::watch, time};
      22              : use tokio_postgres::{replication::ReplicationStream, Client};
      23              : use tokio_util::sync::CancellationToken;
      24              : use tracing::{debug, error, info, trace, warn, Instrument};
      25              : 
      26              : use super::TaskStateUpdate;
      27              : use crate::{
      28              :     context::RequestContext,
      29              :     metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
      30              :     task_mgr,
      31              :     task_mgr::TaskKind,
      32              :     task_mgr::WALRECEIVER_RUNTIME,
      33              :     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
      34              :     walingest::WalIngest,
      35              :     walrecord::DecodedWALRecord,
      36              : };
      37              : use postgres_backend::is_expected_io_error;
      38              : use postgres_connection::PgConnectionConfig;
      39              : use postgres_ffi::waldecoder::WalStreamDecoder;
      40              : use utils::pageserver_feedback::PageserverFeedback;
      41              : use utils::{id::NodeId, lsn::Lsn};
      42              : 
      43              : /// Status of the connection.
      44       707142 : #[derive(Debug, Clone, Copy)]
      45              : pub(super) struct WalConnectionStatus {
      46              :     /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running.
      47              :     pub is_connected: bool,
      48              :     /// Defines a healthy connection as one on which pageserver received WAL from safekeeper
      49              :     /// and is able to process it in walingest without errors.
      50              :     pub has_processed_wal: bool,
      51              :     /// Connection establishment time or the timestamp of a latest connection message received.
      52              :     pub latest_connection_update: NaiveDateTime,
      53              :     /// Time of the latest WAL message received.
      54              :     pub latest_wal_update: NaiveDateTime,
      55              :     /// Latest WAL update contained WAL up to this LSN. Next WAL message with start from that LSN.
      56              :     pub streaming_lsn: Option<Lsn>,
      57              :     /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet.
      58              :     pub commit_lsn: Option<Lsn>,
      59              :     /// The node it is connected to
      60              :     pub node: NodeId,
      61              : }
      62              : 
      63              : pub(super) enum WalReceiverError {
      64              :     /// An error of a type that does not indicate an issue, e.g. a connection closing
      65              :     ExpectedSafekeeperError(postgres::Error),
      66              :     /// An "error" message that carries a SUCCESSFUL_COMPLETION status code.  Carries
      67              :     /// the message part of the original postgres error
      68              :     SuccessfulCompletion(String),
      69              :     /// Generic error
      70              :     Other(anyhow::Error),
      71              : }
      72              : 
      73              : impl From<tokio_postgres::Error> for WalReceiverError {
      74              :     fn from(err: tokio_postgres::Error) -> Self {
      75         1247 :         if let Some(dberror) = err.as_db_error().filter(|db_error| {
      76          129 :             db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
      77          129 :                 && db_error.message().contains("ending streaming")
      78         1247 :         }) {
      79              :             // Strip the outer DbError, which carries a misleading "error" severity
      80          129 :             Self::SuccessfulCompletion(dberror.message().to_string())
      81         1118 :         } else if err.is_closed()
      82          486 :             || err
      83          486 :                 .source()
      84          486 :                 .and_then(|source| source.downcast_ref::<std::io::Error>())
      85          486 :                 .map(is_expected_io_error)
      86          486 :                 .unwrap_or(false)
      87              :         {
      88         1114 :             Self::ExpectedSafekeeperError(err)
      89              :         } else {
      90            4 :             Self::Other(anyhow::Error::new(err))
      91              :         }
      92         1247 :     }
      93              : }
      94              : 
      95              : impl From<anyhow::Error> for WalReceiverError {
      96           10 :     fn from(err: anyhow::Error) -> Self {
      97           10 :         Self::Other(err)
      98           10 :     }
      99              : }
     100              : 
     101              : impl From<WalDecodeError> for WalReceiverError {
     102            0 :     fn from(err: WalDecodeError) -> Self {
     103            0 :         Self::Other(anyhow::Error::new(err))
     104            0 :     }
     105              : }
     106              : 
     107              : /// Open a connection to the given safekeeper and receive WAL, sending back progress
     108              : /// messages as we go.
     109         1222 : pub(super) async fn handle_walreceiver_connection(
     110         1222 :     timeline: Arc<Timeline>,
     111         1222 :     wal_source_connconf: PgConnectionConfig,
     112         1222 :     events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
     113         1222 :     cancellation: CancellationToken,
     114         1222 :     connect_timeout: Duration,
     115         1222 :     ctx: RequestContext,
     116         1222 :     node: NodeId,
     117         1222 : ) -> Result<(), WalReceiverError> {
     118         1222 :     debug_assert_current_span_has_tenant_and_timeline_id();
     119         1222 : 
     120         1222 :     WALRECEIVER_STARTED_CONNECTIONS.inc();
     121              : 
     122              :     // Connect to the database in replication mode.
     123         1222 :     info!("connecting to {wal_source_connconf:?}");
     124              : 
     125          752 :     let (mut replication_client, connection) = {
     126         1222 :         let mut config = wal_source_connconf.to_tokio_postgres_config();
     127         1222 :         config.application_name("pageserver");
     128         1222 :         config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
     129         1996 :         match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
     130         1222 :             Ok(client_and_conn) => client_and_conn?,
     131            0 :             Err(_elapsed) => {
     132              :                 // Timing out to connect to a safekeeper node could happen long time, due to
     133              :                 // many reasons that pageserver cannot control.
     134              :                 // Do not produce an error, but make it visible, that timeouts happen by logging the `event.
     135            0 :                 info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open");
     136            0 :                 return Ok(());
     137              :             }
     138              :         }
     139              :     };
     140              : 
     141            0 :     debug!("connected!");
     142          752 :     let mut connection_status = WalConnectionStatus {
     143          752 :         is_connected: true,
     144          752 :         has_processed_wal: false,
     145          752 :         latest_connection_update: Utc::now().naive_utc(),
     146          752 :         latest_wal_update: Utc::now().naive_utc(),
     147          752 :         streaming_lsn: None,
     148          752 :         commit_lsn: None,
     149          752 :         node,
     150          752 :     };
     151          752 :     if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
     152            0 :         warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
     153            0 :         return Ok(());
     154          752 :     }
     155          752 : 
     156          752 :     // The connection object performs the actual communication with the database,
     157          752 :     // so spawn it off to run on its own.
     158          752 :     let _connection_ctx = ctx.detached_child(
     159          752 :         TaskKind::WalReceiverConnectionPoller,
     160          752 :         ctx.download_behavior(),
     161          752 :     );
     162          752 :     let connection_cancellation = cancellation.clone();
     163          752 :     task_mgr::spawn(
     164          752 :         WALRECEIVER_RUNTIME.handle(),
     165          752 :         TaskKind::WalReceiverConnectionPoller,
     166          752 :         Some(timeline.tenant_id),
     167          752 :         Some(timeline.timeline_id),
     168          752 :         "walreceiver connection",
     169              :         false,
     170          752 :         async move {
     171          752 :             debug_assert_current_span_has_tenant_and_timeline_id();
     172          752 : 
     173      1329400 :             select! {
     174          436 :                 connection_result = connection => match connection_result {
     175            0 :                     Ok(()) => debug!("Walreceiver db connection closed"),
     176              :                     Err(connection_error) => {
     177              :                         match WalReceiverError::from(connection_error) {
     178              :                             WalReceiverError::ExpectedSafekeeperError(_) => {
     179              :                                 // silence, because most likely we've already exited the outer call
     180              :                                 // with a similar error.
     181              :                             },
     182              :                             WalReceiverError::SuccessfulCompletion(_) => {}
     183              :                             WalReceiverError::Other(err) => {
     184            4 :                                 warn!("Connection aborted: {err:#}")
     185              :                             }
     186              :                         }
     187              :                     }
     188              :                 },
     189            0 :                 _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
     190              :             }
     191          692 :             Ok(())
     192          692 :         }
     193              :         // Enrich the log lines emitted by this closure with meaningful context.
     194              :         // TODO: technically, this task outlives the surrounding function, so, the
     195              :         // spans won't be properly nested.
     196          752 :         .instrument(tracing::info_span!("poller")),
     197              :     );
     198              : 
     199              :     // Immediately increment the gauge, then create a job to decrement it on task exit.
     200              :     // One of the pros of `defer!` is that this will *most probably*
     201              :     // get called, even in presence of panics.
     202          752 :     let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]);
     203          752 :     gauge.inc();
     204          752 :     scopeguard::defer! {
     205          692 :         gauge.dec();
     206          692 :     }
     207              : 
     208          752 :     let identify = identify_system(&mut replication_client).await?;
     209          752 :     info!("{identify:?}");
     210              : 
     211          752 :     let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
     212          752 :     let mut caught_up = false;
     213          752 : 
     214          752 :     connection_status.latest_connection_update = Utc::now().naive_utc();
     215          752 :     connection_status.latest_wal_update = Utc::now().naive_utc();
     216          752 :     connection_status.commit_lsn = Some(end_of_wal);
     217          752 :     if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
     218            0 :         warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
     219            0 :         return Ok(());
     220          752 :     }
     221          752 : 
     222          752 :     //
     223          752 :     // Start streaming the WAL, from where we left off previously.
     224          752 :     //
     225          752 :     // If we had previously received WAL up to some point in the middle of a WAL record, we
     226          752 :     // better start from the end of last full WAL record, not in the middle of one.
     227          752 :     let mut last_rec_lsn = timeline.get_last_record_lsn();
     228          752 :     let mut startpoint = last_rec_lsn;
     229          752 : 
     230          752 :     if startpoint == Lsn(0) {
     231            0 :         return Err(WalReceiverError::Other(anyhow!("No previous WAL position")));
     232          752 :     }
     233          752 : 
     234          752 :     // There might be some padding after the last full record, skip it.
     235          752 :     startpoint += startpoint.calc_padding(8u32);
     236          752 : 
     237          752 :     // If the starting point is at a WAL page boundary, skip past the page header. We don't need the page headers
     238          752 :     // for anything, and in some corner cases, the compute node might have never generated the WAL for page headers
     239          752 :     //. That happens if you create a branch at page boundary: the start point of the branch is at the page boundary,
     240          752 :     // but when the compute node first starts on the branch, we normalize the first REDO position to just after the page
     241          752 :     // header (see generate_pg_control()), so the WAL for the page header is never streamed from the compute node
     242          752 :     //  to the safekeepers.
     243          752 :     startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE);
     244              : 
     245          752 :     info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}...");
     246              : 
     247          752 :     let query = format!("START_REPLICATION PHYSICAL {startpoint}");
     248              : 
     249          752 :     let copy_stream = replication_client.copy_both_simple(&query).await?;
     250          750 :     let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
     251          750 : 
     252          750 :     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
     253              : 
     254          750 :     let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
     255              : 
     256       734618 :     while let Some(replication_message) = {
     257       734871 :         select! {
     258              :             _ = cancellation.cancelled() => {
     259            0 :                 debug!("walreceiver interrupted");
     260              :                 None
     261              :             }
     262       734618 :             replication_message = physical_stream.next() => replication_message,
     263              :         }
     264              :     } {
     265       734618 :         let replication_message = replication_message?;
     266              : 
     267       734174 :         let now = Utc::now().naive_utc();
     268       734174 :         let last_rec_lsn_before_msg = last_rec_lsn;
     269       734174 : 
     270       734174 :         // Update the connection status before processing the message. If the message processing
     271       734174 :         // fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper.
     272       734174 :         match &replication_message {
     273       732367 :             ReplicationMessage::XLogData(xlog_data) => {
     274       732367 :                 connection_status.latest_connection_update = now;
     275       732367 :                 connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
     276       732367 :                 connection_status.streaming_lsn = Some(Lsn::from(
     277       732367 :                     xlog_data.wal_start() + xlog_data.data().len() as u64,
     278       732367 :                 ));
     279       732367 :                 if !xlog_data.data().is_empty() {
     280       732367 :                     connection_status.latest_wal_update = now;
     281       732367 :                 }
     282              :             }
     283         1807 :             ReplicationMessage::PrimaryKeepAlive(keepalive) => {
     284         1807 :                 connection_status.latest_connection_update = now;
     285         1807 :                 connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
     286         1807 :             }
     287            0 :             &_ => {}
     288              :         };
     289       734174 :         if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
     290            0 :             warn!("Wal connection event listener dropped, aborting the connection: {e}");
     291            0 :             return Ok(());
     292       734174 :         }
     293              : 
     294       734174 :         let status_update = match replication_message {
     295       732367 :             ReplicationMessage::XLogData(xlog_data) => {
     296       732367 :                 // Pass the WAL data to the decoder, and see if we can decode
     297       732367 :                 // more records as a result.
     298       732367 :                 let data = xlog_data.data();
     299       732367 :                 let startlsn = Lsn::from(xlog_data.wal_start());
     300       732367 :                 let endlsn = startlsn + data.len() as u64;
     301              : 
     302            0 :                 trace!("received XLogData between {startlsn} and {endlsn}");
     303              : 
     304       732367 :                 waldecoder.feed_bytes(data);
     305       732367 : 
     306       732367 :                 {
     307       732367 :                     let mut decoded = DecodedWALRecord::default();
     308       732367 :                     let mut modification = timeline.begin_modification(endlsn);
     309     74258353 :                     while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
     310              :                         // It is important to deal with the aligned records as lsn in getPage@LSN is
     311              :                         // aligned and can be several bytes bigger. Without this alignment we are
     312              :                         // at risk of hitting a deadlock.
     313     73525998 :                         if !lsn.is_aligned() {
     314            0 :                             return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
     315     73525998 :                         }
     316     73525998 : 
     317     73525998 :                         walingest
     318     73525998 :                             .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
     319      5400318 :                             .await
     320     73525986 :                             .with_context(|| format!("could not ingest record at {lsn}"))?;
     321              : 
     322            0 :                         fail_point!("walreceiver-after-ingest");
     323              : 
     324     73525986 :                         last_rec_lsn = lsn;
     325              :                     }
     326              :                 }
     327              : 
     328       732355 :                 if !caught_up && endlsn >= end_of_wal {
     329          537 :                     info!("caught up at LSN {endlsn}");
     330          537 :                     caught_up = true;
     331       731818 :                 }
     332              : 
     333       732355 :                 Some(endlsn)
     334              :             }
     335              : 
     336         1807 :             ReplicationMessage::PrimaryKeepAlive(keepalive) => {
     337         1807 :                 let wal_end = keepalive.wal_end();
     338         1807 :                 let timestamp = keepalive.timestamp();
     339         1807 :                 let reply_requested = keepalive.reply() != 0;
     340              : 
     341            0 :                 trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})");
     342              : 
     343         1807 :                 if reply_requested {
     344         1807 :                     Some(last_rec_lsn)
     345              :                 } else {
     346            0 :                     None
     347              :                 }
     348              :             }
     349              : 
     350            0 :             _ => None,
     351              :         };
     352              : 
     353       734162 :         if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg {
     354              :             // We have successfully processed at least one WAL record.
     355          543 :             connection_status.has_processed_wal = true;
     356          543 :             if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
     357            0 :                 warn!("Wal connection event listener dropped, aborting the connection: {e}");
     358            0 :                 return Ok(());
     359          543 :             }
     360       733619 :         }
     361              : 
     362       734162 :         timeline
     363       734162 :             .check_checkpoint_distance()
     364         2520 :             .await
     365       734162 :             .with_context(|| {
     366            0 :                 format!(
     367            0 :                     "Failed to check checkpoint distance for timeline {}",
     368            0 :                     timeline.timeline_id
     369            0 :                 )
     370       734162 :             })?;
     371              : 
     372       734162 :         if let Some(last_lsn) = status_update {
     373       734162 :             let timeline_remote_consistent_lsn =
     374       734162 :                 timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
     375       734162 : 
     376       734162 :             // The last LSN we processed. It is not guaranteed to survive pageserver crash.
     377       734162 :             let last_received_lsn = last_lsn;
     378       734162 :             // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
     379       734162 :             let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
     380       734162 :             // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
     381       734162 :             // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
     382       734162 :             let remote_consistent_lsn = timeline_remote_consistent_lsn;
     383       734162 :             let ts = SystemTime::now();
     384       734162 : 
     385       734162 :             // Update the status about what we just received. This is shown in the mgmt API.
     386       734162 :             let last_received_wal = WalReceiverInfo {
     387       734162 :                 wal_source_connconf: wal_source_connconf.clone(),
     388       734162 :                 last_received_msg_lsn: last_lsn,
     389       734162 :                 last_received_msg_ts: ts
     390       734162 :                     .duration_since(SystemTime::UNIX_EPOCH)
     391       734162 :                     .expect("Received message time should be before UNIX EPOCH!")
     392       734162 :                     .as_micros(),
     393       734162 :             };
     394       734162 :             *timeline.last_received_wal.lock().unwrap() = Some(last_received_wal);
     395              : 
     396              :             // Send the replication feedback message.
     397              :             // Regular standby_status_update fields are put into this message.
     398       734162 :             let (timeline_logical_size, _) = timeline
     399       734162 :                 .get_current_logical_size(&ctx)
     400       734162 :                 .context("Status update creation failed to get current logical size")?;
     401       734162 :             let status_update = PageserverFeedback {
     402       734162 :                 current_timeline_size: timeline_logical_size,
     403       734162 :                 last_received_lsn,
     404       734162 :                 disk_consistent_lsn,
     405       734162 :                 remote_consistent_lsn,
     406       734162 :                 replytime: ts,
     407       734162 :             };
     408              : 
     409            0 :             debug!("neon_status_update {status_update:?}");
     410              : 
     411       734162 :             let mut data = BytesMut::new();
     412       734162 :             status_update.serialize(&mut data);
     413       734162 :             physical_stream
     414       734162 :                 .as_mut()
     415       734162 :                 .zenith_status_update(data.len() as u64, &data)
     416        23657 :                 .await?;
     417            0 :         }
     418              :     }
     419              : 
     420          206 :     Ok(())
     421         1162 : }
     422              : 
     423              : /// Data returned from the postgres `IDENTIFY_SYSTEM` command
     424              : ///
     425              : /// See the [postgres docs] for more details.
     426              : ///
     427              : /// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html
     428          752 : #[derive(Debug)]
     429              : // As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as
     430              : // unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900
     431              : #[allow(dead_code)]
     432              : struct IdentifySystem {
     433              :     systemid: u64,
     434              :     timeline: u32,
     435              :     xlogpos: PgLsn,
     436              :     dbname: Option<String>,
     437              : }
     438              : 
     439              : /// There was a problem parsing the response to
     440              : /// a postgres IDENTIFY_SYSTEM command.
     441            0 : #[derive(Debug, thiserror::Error)]
     442              : #[error("IDENTIFY_SYSTEM parse error")]
     443              : struct IdentifyError;
     444              : 
     445              : /// Run the postgres `IDENTIFY_SYSTEM` command
     446          752 : async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem> {
     447          752 :     let query_str = "IDENTIFY_SYSTEM";
     448          752 :     let response = client.simple_query(query_str).await?;
     449              : 
     450              :     // get(N) from row, then parse it as some destination type.
     451         3008 :     fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
     452         3008 :     where
     453         3008 :         T: FromStr,
     454         3008 :     {
     455         3008 :         let val = row.get(idx).ok_or(IdentifyError)?;
     456         2256 :         val.parse::<T>().or(Err(IdentifyError))
     457         3008 :     }
     458              : 
     459              :     // extract the row contents into an IdentifySystem struct.
     460              :     // written as a closure so I can use ? for Option here.
     461          752 :     if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
     462              :         Ok(IdentifySystem {
     463          752 :             systemid: get_parse(first_row, 0)?,
     464          752 :             timeline: get_parse(first_row, 1)?,
     465          752 :             xlogpos: get_parse(first_row, 2)?,
     466          752 :             dbname: get_parse(first_row, 3).ok(),
     467              :         })
     468              :     } else {
     469            0 :         Err(IdentifyError.into())
     470              :     }
     471          752 : }
        

Generated by: LCOV version 2.1-beta