LCOV - code coverage report
Current view: top level - pageserver/src/tenant - vectored_blob_io.rs (source / functions) Coverage Total Hit
Test: 36bb8dd7c7efcb53483d1a7d9f7cb33e8406dcf0.info Lines: 98.8 % 240 237
Test Date: 2024-04-08 10:22:05 Functions: 100.0 % 18 18

            Line data    Source code
       1              : //!
       2              : //! Utilities for vectored reading of variable-sized "blobs".
       3              : //!
       4              : //! The "blob" api is an abstraction on top of the "block" api,
       5              : //! with the main difference being that blobs do not have a fixed
       6              : //! size (each blob is prefixed with a 1 or 4 byte length field)
       7              : //!
       8              : //! The vectored apis provided in this module allow for planning
       9              : //! and executing disk IO which covers multiple blobs.
      10              : //!
      11              : //! Reads are planned with [`VectoredReadPlanner`] which will coalesce
      12              : //! adjacent blocks into a single disk IO request and executed by
      13              : //! [`VectoredBlobReader`] which does all the required offset juggling
      14              : //! and returns a buffer housing all the blobs and a list of offsets.
      15              : //!
      16              : //! Note that the vectored blob api does *not* go through the page cache.
      17              : 
      18              : use std::collections::BTreeMap;
      19              : use std::num::NonZeroUsize;
      20              : 
      21              : use bytes::BytesMut;
      22              : use pageserver_api::key::Key;
      23              : use utils::lsn::Lsn;
      24              : use utils::vec_map::VecMap;
      25              : 
      26              : use crate::virtual_file::VirtualFile;
      27              : 
      28              : #[derive(Copy, Clone, Debug, PartialEq, Eq)]
      29              : pub struct MaxVectoredReadBytes(pub NonZeroUsize);
      30              : 
      31              : /// Metadata bundled with the start and end offset of a blob.
      32              : #[derive(Copy, Clone, Debug)]
      33              : pub struct BlobMeta {
      34              :     pub key: Key,
      35              :     pub lsn: Lsn,
      36              : }
      37              : 
      38              : /// Blob offsets into [`VectoredBlobsBuf::buf`]
      39              : pub struct VectoredBlob {
      40              :     pub start: usize,
      41              :     pub end: usize,
      42              :     pub meta: BlobMeta,
      43              : }
      44              : 
      45              : /// Return type of [`VectoredBlobReader::read_blobs`]
      46              : pub struct VectoredBlobsBuf {
      47              :     /// Buffer for all blobs in this read
      48              :     pub buf: BytesMut,
      49              :     /// Offsets into the buffer and metadata for all blobs in this read
      50              :     pub blobs: Vec<VectoredBlob>,
      51              : }
      52              : 
      53              : /// Description of one disk read for multiple blobs.
      54              : /// Used as the argument for [`VectoredBlobReader::read_blobs`]
      55              : #[derive(Debug)]
      56              : pub struct VectoredRead {
      57              :     pub start: u64,
      58              :     pub end: u64,
      59              :     /// Starting offsets and metadata for each blob in this read
      60              :     pub blobs_at: VecMap<u64, BlobMeta>,
      61              : }
      62              : 
      63              : impl VectoredRead {
      64        79186 :     pub fn size(&self) -> usize {
      65        79186 :         (self.end - self.start) as usize
      66        79186 :     }
      67              : }
      68              : 
      69              : #[derive(Eq, PartialEq)]
      70              : enum VectoredReadExtended {
      71              :     Yes,
      72              :     No,
      73              : }
      74              : 
      75              : struct VectoredReadBuilder {
      76              :     start: u64,
      77              :     end: u64,
      78              :     blobs_at: VecMap<u64, BlobMeta>,
      79              :     max_read_size: usize,
      80              : }
      81              : 
      82              : impl VectoredReadBuilder {
      83        19764 :     fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
      84        19764 :         let mut blobs_at = VecMap::default();
      85        19764 :         blobs_at
      86        19764 :             .append(start_offset, meta)
      87        19764 :             .expect("First insertion always succeeds");
      88        19764 : 
      89        19764 :         Self {
      90        19764 :             start: start_offset,
      91        19764 :             end: end_offset,
      92        19764 :             blobs_at,
      93        19764 :             max_read_size,
      94        19764 :         }
      95        19764 :     }
      96              : 
      97              :     /// Attempt to extend the current read with a new blob if the start
      98              :     /// offset matches with the current end of the vectored read
      99              :     /// and the resulting size is below the max read size
     100        57494 :     fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
     101        57494 :         let size = (end - start) as usize;
     102        57494 :         if self.end == start && self.size() + size <= self.max_read_size {
     103        37954 :             self.end = end;
     104        37954 :             self.blobs_at
     105        37954 :                 .append(start, meta)
     106        37954 :                 .expect("LSNs are ordered within vectored reads");
     107        37954 : 
     108        37954 :             return VectoredReadExtended::Yes;
     109        19540 :         }
     110        19540 : 
     111        19540 :         VectoredReadExtended::No
     112        57494 :     }
     113              : 
     114        57306 :     fn size(&self) -> usize {
     115        57306 :         (self.end - self.start) as usize
     116        57306 :     }
     117              : 
     118        19764 :     fn build(self) -> VectoredRead {
     119        19764 :         VectoredRead {
     120        19764 :             start: self.start,
     121        19764 :             end: self.end,
     122        19764 :             blobs_at: self.blobs_at,
     123        19764 :         }
     124        19764 :     }
     125              : }
     126              : 
     127              : #[derive(Copy, Clone, Debug)]
     128              : pub enum BlobFlag {
     129              :     None,
     130              :     Ignore,
     131              :     ReplaceAll,
     132              : }
     133              : 
     134              : /// Planner for vectored blob reads.
     135              : ///
     136              : /// Blob offsets are received via [`VectoredReadPlanner::handle`]
     137              : /// and coalesced into disk reads.
     138              : ///
     139              : /// The implementation is very simple:
     140              : /// * Collect all blob offsets in an ordered structure
     141              : /// * Iterate over the collected blobs and coalesce them into reads at the end
     142              : pub struct VectoredReadPlanner {
     143              :     // Track all the blob offsets. Start offsets must be ordered.
     144              :     blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
     145              :     // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
     146              :     prev: Option<(Key, Lsn, u64, BlobFlag)>,
     147              : 
     148              :     max_read_size: usize,
     149              : }
     150              : 
     151              : impl VectoredReadPlanner {
     152          224 :     pub fn new(max_read_size: usize) -> Self {
     153          224 :         Self {
     154          224 :             blobs: BTreeMap::new(),
     155          224 :             prev: None,
     156          224 :             max_read_size,
     157          224 :         }
     158          224 :     }
     159              : 
     160              :     /// Include a new blob in the read plan.
     161              :     ///
     162              :     /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads`
     163              :     /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all
     164              :     /// keys in a given keyspace. This function must be called for each key in the desired
     165              :     /// keyspace (monotonically continuous). [`Self::handle_range_end`] must
     166              :     /// be called after every range in the keyspace.
     167              :     ///
     168              :     /// In the event that keys are skipped, the behaviour is undefined and can lead to an
     169              :     /// incorrect read plan. We can end up asserting, erroring in wal redo or returning
     170              :     /// incorrect data to the user.
     171              :     ///
     172              :     /// The `flag` argument has two interesting values:
     173              :     /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs.
     174              :     /// This is used for WAL records that `will_init`.
     175              :     /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
     176              :     /// if the blob is cached.
     177        57724 :     pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
     178        57298 :         // Implementation note: internally lag behind by one blob such that
     179        57298 :         // we have a start and end offset when initialising [`VectoredRead`]
     180        57724 :         let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev {
     181              :             None => {
     182          426 :                 self.prev = Some((key, lsn, offset, flag));
     183          426 :                 return;
     184              :             }
     185        57298 :             Some(prev) => prev,
     186        57298 :         };
     187        57298 : 
     188        57298 :         self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
     189        57298 : 
     190        57298 :         self.prev = Some((key, lsn, offset, flag));
     191        57724 :     }
     192              : 
     193          426 :     pub fn handle_range_end(&mut self, offset: u64) {
     194          426 :         if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev {
     195          426 :             self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
     196          426 :         }
     197              : 
     198          426 :         self.prev = None;
     199          426 :     }
     200              : 
     201        57724 :     fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) {
     202        57724 :         match flag {
     203        57356 :             BlobFlag::None => {
     204        57356 :                 let blobs_for_key = self.blobs.entry(key).or_default();
     205        57356 :                 blobs_for_key.push((lsn, start_offset, end_offset));
     206        57356 :             }
     207          366 :             BlobFlag::ReplaceAll => {
     208          366 :                 let blobs_for_key = self.blobs.entry(key).or_default();
     209          366 :                 blobs_for_key.clear();
     210          366 :                 blobs_for_key.push((lsn, start_offset, end_offset));
     211          366 :             }
     212            2 :             BlobFlag::Ignore => {}
     213              :         }
     214        57724 :     }
     215              : 
     216          224 :     pub fn finish(self) -> Vec<VectoredRead> {
     217          224 :         let mut current_read_builder: Option<VectoredReadBuilder> = None;
     218          224 :         let mut reads = Vec::new();
     219              : 
     220         6902 :         for (key, blobs_for_key) in self.blobs {
     221        64396 :             for (lsn, start_offset, end_offset) in blobs_for_key {
     222        57718 :                 let extended = match &mut current_read_builder {
     223        57494 :                     Some(read_builder) => {
     224        57494 :                         read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
     225              :                     }
     226          224 :                     None => VectoredReadExtended::No,
     227              :                 };
     228              : 
     229        57718 :                 if extended == VectoredReadExtended::No {
     230        19764 :                     let next_read_builder = VectoredReadBuilder::new(
     231        19764 :                         start_offset,
     232        19764 :                         end_offset,
     233        19764 :                         BlobMeta { key, lsn },
     234        19764 :                         self.max_read_size,
     235        19764 :                     );
     236        19764 : 
     237        19764 :                     let prev_read_builder = current_read_builder.replace(next_read_builder);
     238              : 
     239              :                     // `current_read_builder` is None in the first iteration of the outer loop
     240        19764 :                     if let Some(read_builder) = prev_read_builder {
     241        19540 :                         reads.push(read_builder.build());
     242        19540 :                     }
     243        37954 :                 }
     244              :             }
     245              :         }
     246              : 
     247          224 :         if let Some(read_builder) = current_read_builder {
     248          224 :             reads.push(read_builder.build());
     249          224 :         }
     250              : 
     251          224 :         reads
     252          224 :     }
     253              : }
     254              : 
     255              : /// Disk reader for vectored blob spans (does not go through the page cache)
     256              : pub struct VectoredBlobReader<'a> {
     257              :     file: &'a VirtualFile,
     258              : }
     259              : 
     260              : impl<'a> VectoredBlobReader<'a> {
     261          218 :     pub fn new(file: &'a VirtualFile) -> Self {
     262          218 :         Self { file }
     263          218 :     }
     264              : 
     265              :     /// Read the requested blobs into the buffer.
     266              :     ///
     267              :     /// We have to deal with the fact that blobs are not fixed size.
     268              :     /// Each blob is prefixed by a size header.
     269              :     ///
     270              :     /// The success return value is a struct which contains the buffer
     271              :     /// filled from disk and a list of offsets at which each blob lies
     272              :     /// in the buffer.
     273        19742 :     pub async fn read_blobs(
     274        19742 :         &self,
     275        19742 :         read: &VectoredRead,
     276        19742 :         buf: BytesMut,
     277        19742 :     ) -> Result<VectoredBlobsBuf, std::io::Error> {
     278        19742 :         assert!(read.size() > 0);
     279        19742 :         assert!(
     280        19742 :             read.size() <= buf.capacity(),
     281            0 :             "{} > {}",
     282            0 :             read.size(),
     283            0 :             buf.capacity()
     284              :         );
     285        19742 :         let buf = self
     286        19742 :             .file
     287        19742 :             .read_exact_at_n(buf, read.start, read.size())
     288        10026 :             .await?;
     289              : 
     290        19742 :         let blobs_at = read.blobs_at.as_slice();
     291        19742 :         let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
     292        19742 : 
     293        19742 :         let mut metas = Vec::with_capacity(blobs_at.len());
     294        19742 : 
     295        19742 :         // Blobs in `read` only provide their starting offset. The end offset
     296        19742 :         // of a blob is implicit: the start of the next blob if one exists
     297        19742 :         // or the end of the read.
     298        19742 :         let pairs = blobs_at.iter().zip(
     299        19742 :             blobs_at
     300        19742 :                 .iter()
     301        19742 :                 .map(Some)
     302        19742 :                 .skip(1)
     303        19742 :                 .chain(std::iter::once(None)),
     304        19742 :         );
     305              : 
     306        77408 :         for ((offset, meta), next) in pairs {
     307        57666 :             let offset_in_buf = offset - start_offset;
     308        57666 :             let first_len_byte = buf[offset_in_buf as usize];
     309              : 
     310              :             // Each blob is prefixed by a header containing its size.
     311              :             // Extract the size and skip that header to find the start of the data.
     312              :             // The size can be 1 or 4 bytes. The most significant bit is 0 in the
     313              :             // 1 byte case and 1 in the 4 byte case.
     314        57666 :             let (size_length, blob_size) = if first_len_byte < 0x80 {
     315        24052 :                 (1, first_len_byte as u64)
     316              :             } else {
     317        33614 :                 let mut blob_size_buf = [0u8; 4];
     318        33614 :                 let offset_in_buf = offset_in_buf as usize;
     319        33614 : 
     320        33614 :                 blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
     321        33614 :                 blob_size_buf[0] &= 0x7f;
     322        33614 :                 (4, u32::from_be_bytes(blob_size_buf) as u64)
     323              :             };
     324              : 
     325        57666 :             let start = offset_in_buf + size_length;
     326        57666 :             let end = match next {
     327        37924 :                 Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
     328        19742 :                 None => start + blob_size,
     329              :             };
     330              : 
     331        57666 :             assert_eq!(end - start, blob_size);
     332              : 
     333        57666 :             metas.push(VectoredBlob {
     334        57666 :                 start: start as usize,
     335        57666 :                 end: end as usize,
     336        57666 :                 meta: *meta,
     337        57666 :             })
     338              :         }
     339              : 
     340        19742 :         Ok(VectoredBlobsBuf { buf, blobs: metas })
     341        19742 :     }
     342              : }
     343              : 
     344              : #[cfg(test)]
     345              : mod tests {
     346              :     use super::*;
     347              : 
     348           16 :     fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
     349           16 :         assert_eq!(read.start, offset_range.first().unwrap().2);
     350              : 
     351           24 :         let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
     352           16 : 
     353           16 :         let offsets_in_read: Vec<_> = read
     354           16 :             .blobs_at
     355           16 :             .as_slice()
     356           16 :             .iter()
     357           24 :             .map(|(offset, _)| *offset)
     358           16 :             .collect();
     359           16 : 
     360           16 :         assert_eq!(expected_offsets_in_read, offsets_in_read);
     361           16 :     }
     362              : 
     363              :     #[test]
     364            2 :     fn planner_max_read_size_test() {
     365            2 :         let max_read_size = 128 * 1024;
     366            2 :         let key = Key::MIN;
     367            2 :         let lsn = Lsn(0);
     368            2 : 
     369            2 :         let blob_descriptions = vec![
     370            2 :             (key, lsn, 0, BlobFlag::None),
     371            2 :             (key, lsn, 32 * 1024, BlobFlag::None),
     372            2 :             (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1
     373            2 :             (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2
     374            2 :             (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3
     375            2 :             (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4
     376            2 :             (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5
     377            2 :             (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6
     378            2 :         ];
     379            2 : 
     380            2 :         let ranges = [
     381            2 :             &blob_descriptions[0..3],
     382            2 :             &blob_descriptions[3..4],
     383            2 :             &blob_descriptions[4..5],
     384            2 :             &blob_descriptions[5..6],
     385            2 :             &blob_descriptions[6..7],
     386            2 :             &blob_descriptions[7..],
     387            2 :         ];
     388            2 : 
     389            2 :         let mut planner = VectoredReadPlanner::new(max_read_size);
     390           16 :         for (key, lsn, offset, flag) in blob_descriptions.clone() {
     391           16 :             planner.handle(key, lsn, offset, flag);
     392           16 :         }
     393              : 
     394            2 :         planner.handle_range_end(652 * 1024);
     395            2 : 
     396            2 :         let reads = planner.finish();
     397            2 :         assert_eq!(reads.len(), 6);
     398              : 
     399           12 :         for (idx, read) in reads.iter().enumerate() {
     400           12 :             validate_read(read, ranges[idx]);
     401           12 :         }
     402            2 :     }
     403              : 
     404              :     #[test]
     405            2 :     fn planner_replacement_test() {
     406            2 :         let max_read_size = 128 * 1024;
     407            2 :         let first_key = Key::MIN;
     408            2 :         let second_key = first_key.next();
     409            2 :         let lsn = Lsn(0);
     410            2 : 
     411            2 :         let blob_descriptions = vec![
     412            2 :             (first_key, lsn, 0, BlobFlag::None),    // First in read 1
     413            2 :             (first_key, lsn, 1024, BlobFlag::None), // Last in read 1
     414            2 :             (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll),
     415            2 :             (second_key, lsn, 3 * 1024, BlobFlag::None),
     416            2 :             (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2
     417            2 :             (second_key, lsn, 5 * 1024, BlobFlag::None),       // Last in read 2
     418            2 :         ];
     419            2 : 
     420            2 :         let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
     421            2 : 
     422            2 :         let mut planner = VectoredReadPlanner::new(max_read_size);
     423           12 :         for (key, lsn, offset, flag) in blob_descriptions.clone() {
     424           12 :             planner.handle(key, lsn, offset, flag);
     425           12 :         }
     426              : 
     427            2 :         planner.handle_range_end(6 * 1024);
     428            2 : 
     429            2 :         let reads = planner.finish();
     430            2 :         assert_eq!(reads.len(), 2);
     431              : 
     432            4 :         for (idx, read) in reads.iter().enumerate() {
     433            4 :             validate_read(read, ranges[idx]);
     434            4 :         }
     435            2 :     }
     436              : }
        

Generated by: LCOV version 2.1-beta