LCOV - code coverage report
Current view: top level - pageserver/src/tenant - vectored_blob_io.rs (source / functions) Coverage Total Hit
Test: 322b88762cba8ea666f63cda880cccab6936bf37.info Lines: 94.7 % 247 234
Test Date: 2024-02-29 11:57:12 Functions: 75.9 % 29 22

            Line data    Source code
       1              : //!
       2              : //! Utilities for vectored reading of variable-sized "blobs".
       3              : //!
       4              : //! The "blob" api is an abstraction on top of the "block" api,
       5              : //! with the main difference being that blobs do not have a fixed
       6              : //! size (each blob is prefixed with 1 or 4 byte length field)
       7              : //!
       8              : //! The vectored apis provided in this module allow for planning
       9              : //! and executing disk IO which covers multiple blobs.
      10              : //!
      11              : //! Reads are planned with [`VectoredReadPlanner`] which will coalesce
      12              : //! adjacent blocks into a single disk IO request and executed by
      13              : //! [`VectoredBlobReader`] which does all the required offset juggling
      14              : //! and returns a buffer housing all the blobs and a list of offsets.
      15              : //!
      16              : //! Note that the vectored blob api does *not* go through the page cache.
      17              : 
      18              : use std::collections::BTreeMap;
      19              : use std::num::NonZeroUsize;
      20              : 
      21              : use bytes::BytesMut;
      22              : use pageserver_api::key::Key;
      23              : use utils::lsn::Lsn;
      24              : use utils::vec_map::VecMap;
      25              : 
      26              : use crate::virtual_file::VirtualFile;
      27              : 
      28            4 : #[derive(Copy, Clone, Debug, PartialEq, Eq)]
      29              : pub struct MaxVectoredReadBytes(pub NonZeroUsize);
      30              : 
      31              : /// Metadata bundled with the start and end offset of a blob.
      32            0 : #[derive(Copy, Clone, Debug)]
      33              : pub struct BlobMeta {
      34              :     pub key: Key,
      35              :     pub lsn: Lsn,
      36              : }
      37              : 
      38              : /// Blob offsets into [`VectoredBlobsBuf::buf`]
      39              : pub struct VectoredBlob {
      40              :     pub start: usize,
      41              :     pub end: usize,
      42              :     pub meta: BlobMeta,
      43              : }
      44              : 
      45              : /// Return type of [`VectoredBlobReader::read_blobs`]
      46              : pub struct VectoredBlobsBuf {
      47              :     /// Buffer for all blobs in this read
      48              :     pub buf: BytesMut,
      49              :     /// Offsets into the buffer and metadata for all blobs in this read
      50              :     pub blobs: Vec<VectoredBlob>,
      51              : }
      52              : 
      53              : /// Description of one disk read for multiple blobs.
      54              : /// Used as the argument for [`VectoredBlobReader::read_blobs`]
      55            0 : #[derive(Debug)]
      56              : pub struct VectoredRead {
      57              :     pub start: u64,
      58              :     pub end: u64,
      59              :     /// Starting offsets and metadata for each blob in this read
      60              :     pub blobs_at: VecMap<u64, BlobMeta>,
      61              : }
      62              : 
      63              : impl VectoredRead {
      64           30 :     fn size(&self) -> usize {
      65           30 :         (self.end - self.start) as usize
      66           30 :     }
      67              : }
      68              : 
      69          344 : #[derive(Eq, PartialEq)]
      70              : enum VectoredReadExtended {
      71              :     Yes,
      72              :     No,
      73              : }
      74              : 
      75              : struct VectoredReadBuilder {
      76              :     start: u64,
      77              :     end: u64,
      78              :     blobs_at: VecMap<u64, BlobMeta>,
      79              :     max_read_size: usize,
      80              : }
      81              : 
      82              : impl VectoredReadBuilder {
      83           26 :     fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
      84           26 :         let mut blobs_at = VecMap::default();
      85           26 :         blobs_at
      86           26 :             .append(start_offset, meta)
      87           26 :             .expect("First insertion always succeeds");
      88           26 : 
      89           26 :         Self {
      90           26 :             start: start_offset,
      91           26 :             end: end_offset,
      92           26 :             blobs_at,
      93           26 :             max_read_size,
      94           26 :         }
      95           26 :     }
      96              : 
      97              :     /// Attempt to extend the current read with a new blob if the start
      98              :     /// offset matches with the current end of the vectored read
      99              :     /// and the resulting size does not exceed the max read size
     100          330 :     fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
     101          330 :         let size = (end - start) as usize;
     102          330 :         if self.end == start && self.size() + size <= self.max_read_size {
     103          318 :             self.end = end;
     104          318 :             self.blobs_at
     105          318 :                 .append(start, meta)
     106          318 :                 .expect("LSNs are ordered within vectored reads");
     107          318 : 
     108          318 :             return VectoredReadExtended::Yes;
     109           12 :         }
     110           12 : 
     111           12 :         VectoredReadExtended::No
     112          330 :     }
     113              : 
     114          328 :     fn size(&self) -> usize {
     115          328 :         (self.end - self.start) as usize
     116          328 :     }
     117              : 
     118           26 :     fn build(self) -> VectoredRead {
     119           26 :         VectoredRead {
     120           26 :             start: self.start,
     121           26 :             end: self.end,
     122           26 :             blobs_at: self.blobs_at,
     123           26 :         }
     124           26 :     }
     125              : }
     126              : 
     127            0 : #[derive(Copy, Clone, Debug)]
     128              : pub enum BlobFlag {
     129              :     None,
     130              :     Ignore,
     131              :     Replaces,
     132              : }
     133              : 
     134              : /// Planner for vectored blob reads.
     135              : ///
     136              : /// Blob offsets are received via [`VectoredReadPlanner::handle`]
     137              : /// and coalesced into disk reads.
     138              : ///
     139              : /// The implementation is very simple:
     140              : /// * Collect all blob offsets in an ordered structure
     141              : /// * Iterate over the collected blobs and coalesce them into reads at the end
     142              : pub struct VectoredReadPlanner {
     143              :     // Track all the blob offsets. Start offsets must be ordered.
     144              :     blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
     145              :     // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
     146              :     prev: Option<(Key, Lsn, u64, BlobFlag)>,
     147              : 
     148              :     max_read_size: usize,
     149              : }
     150              : 
     151              : impl VectoredReadPlanner {
     152           14 :     pub fn new(max_read_size: usize) -> Self {
     153           14 :         Self {
     154           14 :             blobs: BTreeMap::new(),
     155           14 :             prev: None,
     156           14 :             max_read_size,
     157           14 :         }
     158           14 :     }
     159              : 
     160              :     /// Include a new blob in the read plan.
     161              :     ///
     162              :     /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads`
     163              :     /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all
     164              :     /// keys in a given keyspace. This function must be called for each key in the desired
     165              :     /// keyspace (monotonically continuous). [`Self::handle_range_end`] must
     166              :     /// be called after every range in the offset.
     167              :     ///
     168              :     /// In the event that keys are skipped, the behaviour is undefined and can lead to an
     169              :     /// incorrect read plan. We can end up asserting, erroring in wal redo or returning
     170              :     /// incorrect data to the user.
     171              :     ///
     172              :     /// The `flag` argument has two interesting values:
     173              :     /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs.
     174              :     /// This is used for WAL records that `will_init`.
     175              :     /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
     176              :     /// if the blob is cached.
     177          348 :     pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
     178          334 :         // Implementation note: internally lag behind by one blob such that
     179          334 :         // we have a start and end offset when initialising [`VectoredRead`]
     180          348 :         let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev {
     181              :             None => {
     182           14 :                 self.prev = Some((key, lsn, offset, flag));
     183           14 :                 return;
     184              :             }
     185          334 :             Some(prev) => prev,
     186          334 :         };
     187          334 : 
     188          334 :         self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
     189          334 : 
     190          334 :         self.prev = Some((key, lsn, offset, flag));
     191          348 :     }
     192              : 
     193           14 :     pub fn handle_range_end(&mut self, offset: u64) {
     194           14 :         if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev {
     195           14 :             self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
     196           14 :         }
     197              : 
     198           14 :         self.prev = None;
     199           14 :     }
     200              : 
     201          348 :     fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) {
     202          348 :         match flag {
     203           24 :             BlobFlag::None => {
     204           24 :                 let blobs_for_key = self.blobs.entry(key).or_default();
     205           24 :                 blobs_for_key.push((lsn, start_offset, end_offset));
     206           24 :             }
     207          324 :             BlobFlag::Replaces => {
     208          324 :                 let blobs_for_key = self.blobs.entry(key).or_default();
     209          324 :                 blobs_for_key.clear();
     210          324 :                 blobs_for_key.push((lsn, start_offset, end_offset));
     211          324 :             }
     212            0 :             BlobFlag::Ignore => {}
     213              :         }
     214          348 :     }
     215              : 
     216           14 :     pub fn finish(self) -> Vec<VectoredRead> {
     217           14 :         let mut current_read_builder: Option<VectoredReadBuilder> = None;
     218           14 :         let mut reads = Vec::new();
     219              : 
     220          340 :         for (key, blobs_for_key) in self.blobs {
     221          670 :             for (lsn, start_offset, end_offset) in blobs_for_key {
     222          344 :                 let extended = match &mut current_read_builder {
     223          330 :                     Some(read_builder) => {
     224          330 :                         read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
     225              :                     }
     226           14 :                     None => VectoredReadExtended::No,
     227              :                 };
     228              : 
     229          344 :                 if extended == VectoredReadExtended::No {
     230           26 :                     let next_read_builder = VectoredReadBuilder::new(
     231           26 :                         start_offset,
     232           26 :                         end_offset,
     233           26 :                         BlobMeta { key, lsn },
     234           26 :                         self.max_read_size,
     235           26 :                     );
     236           26 : 
     237           26 :                     let prev_read_builder = current_read_builder.replace(next_read_builder);
     238              : 
     239              :                     // `prev_read_builder` is None only for the very first blob, when no read was in progress yet
     240           26 :                     if let Some(read_builder) = prev_read_builder {
     241           12 :                         reads.push(read_builder.build());
     242           14 :                     }
     243          318 :                 }
     244              :             }
     245              :         }
     246              : 
     247           14 :         if let Some(read_builder) = current_read_builder {
     248           14 :             reads.push(read_builder.build());
     249           14 :         }
     250              : 
     251           14 :         reads
     252           14 :     }
     253              : }
     254              : 
     255              : /// Disk reader for vectored blob spans (does not go through the page cache)
     256              : pub struct VectoredBlobReader<'a> {
     257              :     file: &'a VirtualFile,
     258              : }
     259              : 
     260              : impl<'a> VectoredBlobReader<'a> {
     261           10 :     pub fn new(file: &'a VirtualFile) -> Self {
     262           10 :         Self { file }
     263           10 :     }
     264              : 
     265              :     /// Read the requested blobs into the buffer.
     266              :     ///
     267              :     /// We have to deal with the fact that blobs are not fixed size.
     268              :     /// Each blob is prefixed by a size header.
     269              :     ///
     270              :     /// The success return value is a struct which contains the buffer
     271              :     /// filled from disk and a list of offsets at which each blob lies
     272              :     /// in the buffer.
     273           10 :     pub async fn read_blobs(
     274           10 :         &self,
     275           10 :         read: &VectoredRead,
     276           10 :         buf: BytesMut,
     277           10 :     ) -> Result<VectoredBlobsBuf, std::io::Error> {
     278           10 :         assert!(read.size() > 0);
     279           10 :         assert!(
     280           10 :             read.size() <= buf.capacity(),
     281            0 :             "{} > {}",
     282            0 :             read.size(),
     283            0 :             buf.capacity()
     284              :         );
     285           10 :         let buf = self
     286           10 :             .file
     287           10 :             .read_exact_at_n(buf, read.start, read.size())
     288            5 :             .await?;
     289              : 
     290           10 :         let blobs_at = read.blobs_at.as_slice();
     291           10 :         let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
     292           10 : 
     293           10 :         let mut metas = Vec::with_capacity(blobs_at.len());
     294           10 : 
     295           10 :         // Blobs in `read` only provide their starting offset. The end offset
     296           10 :         // of a blob is implicit: the start of the next blob if one exists
     297           10 :         // or the end of the read.
     298           10 :         let pairs = blobs_at.iter().zip(
     299           10 :             blobs_at
     300           10 :                 .iter()
     301           10 :                 .map(Some)
     302           10 :                 .skip(1)
     303           10 :                 .chain(std::iter::once(None)),
     304           10 :         );
     305              : 
     306          330 :         for ((offset, meta), next) in pairs {
     307          320 :             let offset_in_buf = offset - start_offset;
     308          320 :             let first_len_byte = buf[offset_in_buf as usize];
     309              : 
     310              :             // Each blob is prefixed by a header containing its size.
     311              :             // Extract the size and skip that header to find the start of the data.
     312              :             // The size can be 1 or 4 bytes. The most significant bit is 0 in the
     313              :             // 1 byte case and 1 in the 4 byte case.
     314          320 :             let (size_length, blob_size) = if first_len_byte < 0x80 {
     315          320 :                 (1, first_len_byte as u64)
     316              :             } else {
     317            0 :                 let mut blob_size_buf = [0u8; 4];
     318            0 :                 let offset_in_buf = offset_in_buf as usize;
     319            0 : 
     320            0 :                 blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
     321            0 :                 blob_size_buf[0] &= 0x7f;
     322            0 :                 (4, u32::from_be_bytes(blob_size_buf) as u64)
     323              :             };
     324              : 
     325          320 :             let start = offset_in_buf + size_length;
     326          320 :             let end = match next {
     327          310 :                 Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
     328           10 :                 None => start + blob_size,
     329              :             };
     330              : 
     331          320 :             assert_eq!(end - start, blob_size);
     332              : 
     333          320 :             metas.push(VectoredBlob {
     334          320 :                 start: start as usize,
     335          320 :                 end: end as usize,
     336          320 :                 meta: *meta,
     337          320 :             })
     338              :         }
     339              : 
     340           10 :         Ok(VectoredBlobsBuf { buf, blobs: metas })
     341           10 :     }
     342              : }
     343              : 
     344              : #[cfg(test)]
     345              : mod tests {
     346              :     use super::*;
     347              : 
     348           16 :     fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
     349           16 :         assert_eq!(read.start, offset_range.first().unwrap().2);
     350              : 
     351           24 :         let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
     352           16 : 
     353           16 :         let offsets_in_read: Vec<_> = read
     354           16 :             .blobs_at
     355           16 :             .as_slice()
     356           16 :             .iter()
     357           24 :             .map(|(offset, _)| *offset)
     358           16 :             .collect();
     359           16 : 
     360           16 :         assert_eq!(expected_offsets_in_read, offsets_in_read);
     361           16 :     }
     362              : 
     363            2 :     #[test]
     364            2 :     fn planner_max_read_size_test() {
     365            2 :         let max_read_size = 128 * 1024;
     366            2 :         let key = Key::MIN;
     367            2 :         let lsn = Lsn(0);
     368            2 : 
     369            2 :         let blob_descriptions = vec![
     370            2 :             (key, lsn, 0, BlobFlag::None),
     371            2 :             (key, lsn, 32 * 1024, BlobFlag::None),
     372            2 :             (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1
     373            2 :             (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2
     374            2 :             (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3
     375            2 :             (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4
     376            2 :             (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5
     377            2 :             (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6
     378            2 :         ];
     379            2 : 
     380            2 :         let ranges = [
     381            2 :             &blob_descriptions[0..3],
     382            2 :             &blob_descriptions[3..4],
     383            2 :             &blob_descriptions[4..5],
     384            2 :             &blob_descriptions[5..6],
     385            2 :             &blob_descriptions[6..7],
     386            2 :             &blob_descriptions[7..],
     387            2 :         ];
     388            2 : 
     389            2 :         let mut planner = VectoredReadPlanner::new(max_read_size);
     390           16 :         for (key, lsn, offset, flag) in blob_descriptions.clone() {
     391           16 :             planner.handle(key, lsn, offset, flag);
     392           16 :         }
     393              : 
     394            2 :         planner.handle_range_end(652 * 1024);
     395            2 : 
     396            2 :         let reads = planner.finish();
     397            2 :         assert_eq!(reads.len(), 6);
     398              : 
     399           12 :         for (idx, read) in reads.iter().enumerate() {
     400           12 :             validate_read(read, ranges[idx]);
     401           12 :         }
     402            2 :     }
     403              : 
     404            2 :     #[test]
     405            2 :     fn planner_replacement_test() {
     406            2 :         let max_read_size = 128 * 1024;
     407            2 :         let first_key = Key::MIN;
     408            2 :         let second_key = first_key.next();
     409            2 :         let lsn = Lsn(0);
     410            2 : 
     411            2 :         let blob_descriptions = vec![
     412            2 :             (first_key, lsn, 0, BlobFlag::None),    // First in read 1
     413            2 :             (first_key, lsn, 1024, BlobFlag::None), // Last in read 1
     414            2 :             (second_key, lsn, 2 * 1024, BlobFlag::Replaces),
     415            2 :             (second_key, lsn, 3 * 1024, BlobFlag::None),
     416            2 :             (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2
     417            2 :             (second_key, lsn, 5 * 1024, BlobFlag::None),     // Last in read 2
     418            2 :         ];
     419            2 : 
     420            2 :         let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
     421            2 : 
     422            2 :         let mut planner = VectoredReadPlanner::new(max_read_size);
     423           12 :         for (key, lsn, offset, flag) in blob_descriptions.clone() {
     424           12 :             planner.handle(key, lsn, offset, flag);
     425           12 :         }
     426              : 
     427            2 :         planner.handle_range_end(6 * 1024);
     428            2 : 
     429            2 :         let reads = planner.finish();
     430            2 :         assert_eq!(reads.len(), 2);
     431              : 
     432            4 :         for (idx, read) in reads.iter().enumerate() {
     433            4 :             validate_read(read, ranges[idx]);
     434            4 :         }
     435            2 :     }
     436              : }
        

Generated by: LCOV version 2.1-beta