LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - image_layer.rs (source / functions) Coverage Total Hit
Test: f081ec316c96fa98335efd15ef501745aa4f015d.info Lines: 76.2 % 755 575
Test Date: 2024-06-25 15:11:17 Functions: 45.8 % 72 33

            Line data    Source code
       1              : //! An ImageLayer represents an image or a snapshot of a key-range at
       2              : //! one particular LSN. It contains an image of all key-value pairs
       3              : //! in its key-range. Any key that falls into the image layer's range
       4              : //! but does not exist in the layer, does not exist.
       5              : //!
       6              : //! An image layer is stored in a file on disk. The file is stored in
       7              : //! timelines/<timeline_id> directory.  Currently, there are no
       8              : //! subdirectories, and each image layer file is named like this:
       9              : //!
      10              : //! ```text
      11              : //!    <key start>-<key end>__<LSN>
      12              : //! ```
      13              : //!
      14              : //! For example:
      15              : //!
      16              : //! ```text
      17              : //!    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
      18              : //! ```
      19              : //!
      20              : //! Every image layer file consists of three parts: "summary",
      21              : //! "index", and "values".  The summary is a fixed size header at the
      22              : //! beginning of the file, and it contains basic information about the
      23              : //! layer, and offsets to the other parts. The "index" is a B-tree,
      24              : //! mapping from Key to an offset in the "values" part.  The
      25              : //! actual page images are stored in the "values" part.
      26              : use crate::config::PageServerConf;
      27              : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
      28              : use crate::page_cache::{self, FileId, PAGE_SZ};
      29              : use crate::repository::{Key, Value, KEY_SIZE};
      30              : use crate::tenant::blob_io::BlobWriter;
      31              : use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
      32              : use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
      33              : use crate::tenant::storage_layer::{
      34              :     LayerAccessStats, ValueReconstructResult, ValueReconstructState,
      35              : };
      36              : use crate::tenant::timeline::GetVectoredError;
      37              : use crate::tenant::vectored_blob_io::{
      38              :     BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
      39              : };
      40              : use crate::tenant::{PageReconstructError, Timeline};
      41              : use crate::virtual_file::{self, VirtualFile};
      42              : use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
      43              : use anyhow::{anyhow, bail, ensure, Context, Result};
      44              : use bytes::{Bytes, BytesMut};
      45              : use camino::{Utf8Path, Utf8PathBuf};
      46              : use hex;
      47              : use itertools::Itertools;
      48              : use pageserver_api::keyspace::KeySpace;
      49              : use pageserver_api::models::LayerAccessKind;
      50              : use pageserver_api::shard::{ShardIdentity, TenantShardId};
      51              : use rand::{distributions::Alphanumeric, Rng};
      52              : use serde::{Deserialize, Serialize};
      53              : use std::fs::File;
      54              : use std::io::SeekFrom;
      55              : use std::ops::Range;
      56              : use std::os::unix::prelude::FileExt;
      57              : use std::str::FromStr;
      58              : use std::sync::Arc;
      59              : use tokio::sync::OnceCell;
      60              : use tokio_stream::StreamExt;
      61              : use tracing::*;
      62              : 
      63              : use utils::{
      64              :     bin_ser::BeSer,
      65              :     id::{TenantId, TimelineId},
      66              :     lsn::Lsn,
      67              : };
      68              : 
      69              : use super::layer_name::ImageLayerName;
      70              : use super::{
      71              :     AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
      72              : };
      73              : 
      74              : ///
      75              : /// Header stored in the beginning of the file
      76              : ///
      77              : /// After this comes the 'values' part, starting on block 1. After that,
      78              : /// the 'index' starts at the block indicated by 'index_start_blk'
      79              : ///
      80           88 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
      81              : pub struct Summary {
      82              :     /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
      83              :     pub magic: u16,
      84              :     pub format_version: u16,
      85              : 
      86              :     pub tenant_id: TenantId,
      87              :     pub timeline_id: TimelineId,
      88              :     pub key_range: Range<Key>,
      89              :     pub lsn: Lsn,
      90              : 
      91              :     /// Block number where the 'index' part of the file begins.
      92              :     pub index_start_blk: u32,
      93              :     /// Block within the 'index', where the B-tree root page is stored
      94              :     pub index_root_blk: u32,
      95              :     // the 'values' part starts after the summary header, on block 1.
      96              : }
      97              : 
      98              : impl From<&ImageLayer> for Summary {
      99            0 :     fn from(layer: &ImageLayer) -> Self {
     100            0 :         Self::expected(
     101            0 :             layer.desc.tenant_shard_id.tenant_id,
     102            0 :             layer.desc.timeline_id,
     103            0 :             layer.desc.key_range.clone(),
     104            0 :             layer.lsn,
     105            0 :         )
     106            0 :     }
     107              : }
     108              : 
     109              : impl Summary {
     110           88 :     pub(super) fn expected(
     111           88 :         tenant_id: TenantId,
     112           88 :         timeline_id: TimelineId,
     113           88 :         key_range: Range<Key>,
     114           88 :         lsn: Lsn,
     115           88 :     ) -> Self {
     116           88 :         Self {
     117           88 :             magic: IMAGE_FILE_MAGIC,
     118           88 :             format_version: STORAGE_FORMAT_VERSION,
     119           88 :             tenant_id,
     120           88 :             timeline_id,
     121           88 :             key_range,
     122           88 :             lsn,
     123           88 : 
     124           88 :             index_start_blk: 0,
     125           88 :             index_root_blk: 0,
     126           88 :         }
     127           88 :     }
     128              : }
     129              : 
     130              : /// This is used only from `pagectl`. Within pageserver, all layers are
     131              : /// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
     132              : pub struct ImageLayer {
     133              :     path: Utf8PathBuf,
     134              :     pub desc: PersistentLayerDesc,
     135              :     // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
     136              :     pub lsn: Lsn,
     137              :     access_stats: LayerAccessStats,
     138              :     inner: OnceCell<ImageLayerInner>,
     139              : }
     140              : 
     141              : impl std::fmt::Debug for ImageLayer {
     142            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     143            0 :         use super::RangeDisplayDebug;
     144            0 : 
     145            0 :         f.debug_struct("ImageLayer")
     146            0 :             .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
     147            0 :             .field("file_size", &self.desc.file_size)
     148            0 :             .field("lsn", &self.lsn)
     149            0 :             .field("inner", &self.inner)
     150            0 :             .finish()
     151            0 :     }
     152              : }
     153              : 
     154              : /// ImageLayer is the in-memory data structure associated with an on-disk image
     155              : /// file.
     156              : pub struct ImageLayerInner {
     157              :     // values copied from summary
     158              :     index_start_blk: u32,
     159              :     index_root_blk: u32,
     160              : 
     161              :     key_range: Range<Key>,
     162              :     lsn: Lsn,
     163              : 
     164              :     file: VirtualFile,
     165              :     file_id: FileId,
     166              : 
     167              :     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
     168              : }
     169              : 
     170              : impl std::fmt::Debug for ImageLayerInner {
     171            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     172            0 :         f.debug_struct("ImageLayerInner")
     173            0 :             .field("index_start_blk", &self.index_start_blk)
     174            0 :             .field("index_root_blk", &self.index_root_blk)
     175            0 :             .finish()
     176            0 :     }
     177              : }
     178              : 
     179              : impl ImageLayerInner {
     180            0 :     pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
     181            0 :         let block_reader = FileBlockReader::new(&self.file, self.file_id);
     182            0 :         let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
     183            0 :             self.index_start_blk,
     184            0 :             self.index_root_blk,
     185            0 :             block_reader,
     186            0 :         );
     187            0 : 
     188            0 :         tree_reader.dump().await?;
     189              : 
     190            0 :         tree_reader
     191            0 :             .visit(
     192            0 :                 &[0u8; KEY_SIZE],
     193            0 :                 VisitDirection::Forwards,
     194            0 :                 |key, value| {
     195            0 :                     println!("key: {} offset {}", hex::encode(key), value);
     196            0 :                     true
     197            0 :                 },
     198            0 :                 ctx,
     199            0 :             )
     200            0 :             .await?;
     201              : 
     202            0 :         Ok(())
     203            0 :     }
     204              : }
     205              : 
     206              : /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
     207              : impl std::fmt::Display for ImageLayer {
     208            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     209            0 :         write!(f, "{}", self.layer_desc().short_id())
     210            0 :     }
     211              : }
     212              : 
     213              : impl AsLayerDesc for ImageLayer {
     214            0 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     215            0 :         &self.desc
     216            0 :     }
     217              : }
     218              : 
     219              : impl ImageLayer {
     220            0 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
     221            0 :         self.desc.dump();
     222            0 : 
     223            0 :         if !verbose {
     224            0 :             return Ok(());
     225            0 :         }
     226              : 
     227            0 :         let inner = self.load(LayerAccessKind::Dump, ctx).await?;
     228              : 
     229            0 :         inner.dump(ctx).await?;
     230              : 
     231            0 :         Ok(())
     232            0 :     }
     233              : 
     234          238 :     fn temp_path_for(
     235          238 :         conf: &PageServerConf,
     236          238 :         timeline_id: TimelineId,
     237          238 :         tenant_shard_id: TenantShardId,
     238          238 :         fname: &ImageLayerName,
     239          238 :     ) -> Utf8PathBuf {
     240          238 :         let rand_string: String = rand::thread_rng()
     241          238 :             .sample_iter(&Alphanumeric)
     242          238 :             .take(8)
     243          238 :             .map(char::from)
     244          238 :             .collect();
     245          238 : 
     246          238 :         conf.timeline_path(&tenant_shard_id, &timeline_id)
     247          238 :             .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
     248          238 :     }
     249              : 
     250              :     ///
     251              :     /// Open the underlying file and read the metadata into memory, if it's
     252              :     /// not loaded already.
     253              :     ///
     254            0 :     async fn load(
     255            0 :         &self,
     256            0 :         access_kind: LayerAccessKind,
     257            0 :         ctx: &RequestContext,
     258            0 :     ) -> Result<&ImageLayerInner> {
     259            0 :         self.access_stats.record_access(access_kind, ctx);
     260            0 :         self.inner
     261            0 :             .get_or_try_init(|| self.load_inner(ctx))
     262            0 :             .await
     263            0 :             .with_context(|| format!("Failed to load image layer {}", self.path()))
     264            0 :     }
     265              : 
     266            0 :     async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
     267            0 :         let path = self.path();
     268              : 
     269            0 :         let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
     270            0 :             .await
     271            0 :             .and_then(|res| res)?;
     272              : 
     273              :         // not production code
     274            0 :         let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
     275            0 :         let expected_layer_name = self.layer_desc().layer_name();
     276            0 : 
     277            0 :         if actual_layer_name != expected_layer_name {
     278            0 :             println!("warning: filename does not match what is expected from in-file summary");
     279            0 :             println!("actual: {:?}", actual_layer_name.to_string());
     280            0 :             println!("expected: {:?}", expected_layer_name.to_string());
     281            0 :         }
     282              : 
     283            0 :         Ok(loaded)
     284            0 :     }
     285              : 
     286              :     /// Create an ImageLayer struct representing an existing file on disk.
     287              :     ///
     288              :     /// This variant is only used for debugging purposes, by the 'pagectl' binary.
     289            0 :     pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
     290            0 :         let mut summary_buf = vec![0; PAGE_SZ];
     291            0 :         file.read_exact_at(&mut summary_buf, 0)?;
     292            0 :         let summary = Summary::des_prefix(&summary_buf)?;
     293            0 :         let metadata = file
     294            0 :             .metadata()
     295            0 :             .context("get file metadata to determine size")?;
     296              : 
     297              :         // This function is never used for constructing layers in a running pageserver,
     298              :         // so it does not need an accurate TenantShardId.
     299            0 :         let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
     300            0 : 
     301            0 :         Ok(ImageLayer {
     302            0 :             path: path.to_path_buf(),
     303            0 :             desc: PersistentLayerDesc::new_img(
     304            0 :                 tenant_shard_id,
     305            0 :                 summary.timeline_id,
     306            0 :                 summary.key_range,
     307            0 :                 summary.lsn,
     308            0 :                 metadata.len(),
     309            0 :             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
     310            0 :             lsn: summary.lsn,
     311            0 :             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
     312            0 :             inner: OnceCell::new(),
     313            0 :         })
     314            0 :     }
     315              : 
     316            0 :     fn path(&self) -> Utf8PathBuf {
     317            0 :         self.path.clone()
     318            0 :     }
     319              : }
     320              : 
     321            0 : #[derive(thiserror::Error, Debug)]
     322              : pub enum RewriteSummaryError {
     323              :     #[error("magic mismatch")]
     324              :     MagicMismatch,
     325              :     #[error(transparent)]
     326              :     Other(#[from] anyhow::Error),
     327              : }
     328              : 
     329              : impl From<std::io::Error> for RewriteSummaryError {
     330            0 :     fn from(e: std::io::Error) -> Self {
     331            0 :         Self::Other(anyhow::anyhow!(e))
     332            0 :     }
     333              : }
     334              : 
     335              : impl ImageLayer {
     336            0 :     pub async fn rewrite_summary<F>(
     337            0 :         path: &Utf8Path,
     338            0 :         rewrite: F,
     339            0 :         ctx: &RequestContext,
     340            0 :     ) -> Result<(), RewriteSummaryError>
     341            0 :     where
     342            0 :         F: Fn(Summary) -> Summary,
     343            0 :     {
     344            0 :         let mut file = VirtualFile::open_with_options(
     345            0 :             path,
     346            0 :             virtual_file::OpenOptions::new().read(true).write(true),
     347            0 :             ctx,
     348            0 :         )
     349            0 :         .await
     350            0 :         .with_context(|| format!("Failed to open file '{}'", path))?;
     351            0 :         let file_id = page_cache::next_file_id();
     352            0 :         let block_reader = FileBlockReader::new(&file, file_id);
     353            0 :         let summary_blk = block_reader.read_blk(0, ctx).await?;
     354            0 :         let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
     355            0 :         if actual_summary.magic != IMAGE_FILE_MAGIC {
     356            0 :             return Err(RewriteSummaryError::MagicMismatch);
     357            0 :         }
     358            0 : 
     359            0 :         let new_summary = rewrite(actual_summary);
     360            0 : 
     361            0 :         let mut buf = Vec::with_capacity(PAGE_SZ);
     362            0 :         // TODO: could use smallvec here but it's a pain with Slice<T>
     363            0 :         Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
     364            0 :         file.seek(SeekFrom::Start(0)).await?;
     365            0 :         let (_buf, res) = file.write_all(buf, ctx).await;
     366            0 :         res?;
     367            0 :         Ok(())
     368            0 :     }
     369              : }
     370              : 
     371              : impl ImageLayerInner {
     372              :     /// Returns nested result following Result<Result<_, OpErr>, Critical>:
     373              :     /// - inner has the success or transient failure
     374              :     /// - outer has the permanent failure
     375           88 :     pub(super) async fn load(
     376           88 :         path: &Utf8Path,
     377           88 :         lsn: Lsn,
     378           88 :         summary: Option<Summary>,
     379           88 :         max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
     380           88 :         ctx: &RequestContext,
     381           88 :     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
     382           88 :         let file = match VirtualFile::open(path, ctx).await {
     383           88 :             Ok(file) => file,
     384            0 :             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
     385              :         };
     386           88 :         let file_id = page_cache::next_file_id();
     387           88 :         let block_reader = FileBlockReader::new(&file, file_id);
     388           88 :         let summary_blk = match block_reader.read_blk(0, ctx).await {
     389           88 :             Ok(blk) => blk,
     390            0 :             Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
     391              :         };
     392              : 
     393              :         // length is the only way how this could fail, so it's not actually likely at all unless
     394              :         // read_blk returns wrong sized block.
     395              :         //
     396              :         // TODO: confirm and make this into assertion
     397           88 :         let actual_summary =
     398           88 :             Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
     399              : 
     400           88 :         if let Some(mut expected_summary) = summary {
     401              :             // production code path
     402           88 :             expected_summary.index_start_blk = actual_summary.index_start_blk;
     403           88 :             expected_summary.index_root_blk = actual_summary.index_root_blk;
     404           88 :             // mask out the timeline_id, but still require the layers to be from the same tenant
     405           88 :             expected_summary.timeline_id = actual_summary.timeline_id;
     406           88 : 
     407           88 :             if actual_summary != expected_summary {
     408            0 :                 bail!(
     409            0 :                     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
     410            0 :                     actual_summary,
     411            0 :                     expected_summary
     412            0 :                 );
     413           88 :             }
     414            0 :         }
     415              : 
     416           88 :         Ok(Ok(ImageLayerInner {
     417           88 :             index_start_blk: actual_summary.index_start_blk,
     418           88 :             index_root_blk: actual_summary.index_root_blk,
     419           88 :             lsn,
     420           88 :             file,
     421           88 :             file_id,
     422           88 :             max_vectored_read_bytes,
     423           88 :             key_range: actual_summary.key_range,
     424           88 :         }))
     425           88 :     }
     426              : 
     427         7187 :     pub(super) async fn get_value_reconstruct_data(
     428         7187 :         &self,
     429         7187 :         key: Key,
     430         7187 :         reconstruct_state: &mut ValueReconstructState,
     431         7187 :         ctx: &RequestContext,
     432         7187 :     ) -> anyhow::Result<ValueReconstructResult> {
     433         7187 :         let block_reader = FileBlockReader::new(&self.file, self.file_id);
     434         7187 :         let tree_reader =
     435         7187 :             DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
     436         7187 : 
     437         7187 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     438         7187 :         key.write_to_byte_slice(&mut keybuf);
     439         7187 :         if let Some(offset) = tree_reader
     440         7187 :             .get(
     441         7187 :                 &keybuf,
     442         7187 :                 &RequestContextBuilder::extend(ctx)
     443         7187 :                     .page_content_kind(PageContentKind::ImageLayerBtreeNode)
     444         7187 :                     .build(),
     445         7187 :             )
     446          430 :             .await?
     447              :         {
     448         7183 :             let blob = block_reader
     449         7183 :                 .block_cursor()
     450         7183 :                 .read_blob(
     451         7183 :                     offset,
     452         7183 :                     &RequestContextBuilder::extend(ctx)
     453         7183 :                         .page_content_kind(PageContentKind::ImageLayerValue)
     454         7183 :                         .build(),
     455         7183 :                 )
     456          317 :                 .await
     457         7183 :                 .with_context(|| format!("failed to read value from offset {}", offset))?;
     458         7183 :             let value = Bytes::from(blob);
     459         7183 : 
     460         7183 :             reconstruct_state.img = Some((self.lsn, value));
     461         7183 :             Ok(ValueReconstructResult::Complete)
     462              :         } else {
     463            4 :             Ok(ValueReconstructResult::Missing)
     464              :         }
     465         7187 :     }
     466              : 
     467              :     // Look up the keys in the provided keyspace and update
     468              :     // the reconstruct state with whatever is found.
     469           74 :     pub(super) async fn get_values_reconstruct_data(
     470           74 :         &self,
     471           74 :         keyspace: KeySpace,
     472           74 :         reconstruct_state: &mut ValuesReconstructState,
     473           74 :         ctx: &RequestContext,
     474           74 :     ) -> Result<(), GetVectoredError> {
     475           74 :         let reads = self
     476           74 :             .plan_reads(keyspace, None, ctx)
     477          343 :             .await
     478           74 :             .map_err(GetVectoredError::Other)?;
     479              : 
     480           74 :         self.do_reads_and_update_state(reads, reconstruct_state, ctx)
     481          894 :             .await;
     482              : 
     483           74 :         reconstruct_state.on_image_layer_visited(&self.key_range);
     484           74 : 
     485           74 :         Ok(())
     486           74 :     }
     487              : 
     488              :     /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
     489              :     #[cfg(test)]
     490            8 :     pub(super) async fn load_key_values(
     491            8 :         &self,
     492            8 :         ctx: &RequestContext,
     493            8 :     ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
     494            8 :         let block_reader = FileBlockReader::new(&self.file, self.file_id);
     495            8 :         let tree_reader =
     496            8 :             DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
     497            8 :         let mut result = Vec::new();
     498            8 :         let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
     499            8 :         let block_reader = FileBlockReader::new(&self.file, self.file_id);
     500            8 :         let cursor = block_reader.block_cursor();
     501           80 :         while let Some(item) = stream.next().await {
     502              :             // TODO: dedup code with get_reconstruct_value
     503           72 :             let (raw_key, offset) = item?;
     504           72 :             let key = Key::from_slice(&raw_key[..KEY_SIZE]);
     505              :             // TODO: ctx handling and sharding
     506           72 :             let blob = cursor
     507           72 :                 .read_blob(offset, ctx)
     508            2 :                 .await
     509           72 :                 .with_context(|| format!("failed to read value from offset {}", offset))?;
     510           72 :             let value = Bytes::from(blob);
     511           72 :             result.push((key, self.lsn, Value::Image(value)));
     512              :         }
     513            8 :         Ok(result)
     514            8 :     }
     515              : 
     516              :     /// Traverse the layer's index to build read operations on the overlap of the input keyspace
     517              :     /// and the keys in this layer.
     518              :     ///
     519              :     /// If shard_identity is provided, it will be used to filter keys down to those stored on
     520              :     /// this shard.
     521           82 :     async fn plan_reads(
     522           82 :         &self,
     523           82 :         keyspace: KeySpace,
     524           82 :         shard_identity: Option<&ShardIdentity>,
     525           82 :         ctx: &RequestContext,
     526           82 :     ) -> anyhow::Result<Vec<VectoredRead>> {
     527           82 :         let mut planner = VectoredReadPlanner::new(
     528           82 :             self.max_vectored_read_bytes
     529           82 :                 .expect("Layer is loaded with max vectored bytes config")
     530           82 :                 .0
     531           82 :                 .into(),
     532           82 :         );
     533           82 : 
     534           82 :         let block_reader = FileBlockReader::new(&self.file, self.file_id);
     535           82 :         let tree_reader =
     536           82 :             DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
     537           82 : 
     538           82 :         let ctx = RequestContextBuilder::extend(ctx)
     539           82 :             .page_content_kind(PageContentKind::ImageLayerBtreeNode)
     540           82 :             .build();
     541              : 
     542        21668 :         for range in keyspace.ranges.iter() {
     543        21668 :             let mut range_end_handled = false;
     544        21668 :             let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     545        21668 :             range.start.write_to_byte_slice(&mut search_key);
     546        21668 : 
     547        21668 :             let index_stream = tree_reader.get_stream_from(&search_key, &ctx);
     548        21668 :             let mut index_stream = std::pin::pin!(index_stream);
     549              : 
     550      1080721 :             while let Some(index_entry) = index_stream.next().await {
     551      1080661 :                 let (raw_key, offset) = index_entry?;
     552              : 
     553      1080661 :                 let key = Key::from_slice(&raw_key[..KEY_SIZE]);
     554      1080661 :                 assert!(key >= range.start);
     555              : 
     556      1080661 :                 let flag = if let Some(shard_identity) = shard_identity {
     557      1048576 :                     if shard_identity.is_key_disposable(&key) {
     558       786432 :                         BlobFlag::Ignore
     559              :                     } else {
     560       262144 :                         BlobFlag::None
     561              :                     }
     562              :                 } else {
     563        32085 :                     BlobFlag::None
     564              :                 };
     565              : 
     566      1080661 :                 if key >= range.end {
     567        21608 :                     planner.handle_range_end(offset);
     568        21608 :                     range_end_handled = true;
     569        21608 :                     break;
     570      1059053 :                 } else {
     571      1059053 :                     planner.handle(key, self.lsn, offset, flag);
     572      1059053 :                 }
     573              :             }
     574              : 
     575        21668 :             if !range_end_handled {
     576           60 :                 let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
     577           60 :                 planner.handle_range_end(payload_end);
     578        21608 :             }
     579              :         }
     580              : 
     581           82 :         Ok(planner.finish())
     582           82 :     }
     583              : 
     584              :     /// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
     585              :     /// then execute vectored GET operations, passing the results of all read keys into the writer.
     586            8 :     pub(super) async fn filter(
     587            8 :         &self,
     588            8 :         shard_identity: &ShardIdentity,
     589            8 :         writer: &mut ImageLayerWriter,
     590            8 :         ctx: &RequestContext,
     591            8 :     ) -> anyhow::Result<usize> {
     592              :         // Fragment the range into the regions owned by this ShardIdentity
     593            8 :         let plan = self
     594            8 :             .plan_reads(
     595            8 :                 KeySpace {
     596            8 :                     // If asked for the total key space, plan_reads will give us all the keys in the layer
     597            8 :                     ranges: vec![Key::MIN..Key::MAX],
     598            8 :                 },
     599            8 :                 Some(shard_identity),
     600            8 :                 ctx,
     601            8 :             )
     602          469 :             .await?;
     603              : 
     604            8 :         let vectored_blob_reader = VectoredBlobReader::new(&self.file);
     605            8 :         let mut key_count = 0;
     606           16 :         for read in plan.into_iter() {
     607           16 :             let buf_size = read.size();
     608           16 : 
     609           16 :             let buf = BytesMut::with_capacity(buf_size);
     610           16 :             let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
     611              : 
     612           16 :             let frozen_buf = blobs_buf.buf.freeze();
     613              : 
     614       262144 :             for meta in blobs_buf.blobs.iter() {
     615       262144 :                 let img_buf = frozen_buf.slice(meta.start..meta.end);
     616       262144 : 
     617       262144 :                 key_count += 1;
     618       262144 :                 writer
     619       262144 :                     .put_image(meta.meta.key, img_buf, ctx)
     620       266240 :                     .await
     621       262144 :                     .context(format!("Storing key {}", meta.meta.key))?;
     622              :             }
     623              :         }
     624              : 
     625            8 :         Ok(key_count)
     626            8 :     }
     627              : 
     628           74 :     async fn do_reads_and_update_state(
     629           74 :         &self,
     630           74 :         reads: Vec<VectoredRead>,
     631           74 :         reconstruct_state: &mut ValuesReconstructState,
     632           74 :         ctx: &RequestContext,
     633           74 :     ) {
     634           74 :         let max_vectored_read_bytes = self
     635           74 :             .max_vectored_read_bytes
     636           74 :             .expect("Layer is loaded with max vectored bytes config")
     637           74 :             .0
     638           74 :             .into();
     639           74 : 
     640           74 :         let vectored_blob_reader = VectoredBlobReader::new(&self.file);
     641         1772 :         for read in reads.into_iter() {
     642         1772 :             let buf_size = read.size();
     643         1772 : 
     644         1772 :             if buf_size > max_vectored_read_bytes {
     645              :                 // If the read is oversized, it should only contain one key.
     646            0 :                 let offenders = read
     647            0 :                     .blobs_at
     648            0 :                     .as_slice()
     649            0 :                     .iter()
     650            0 :                     .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
     651            0 :                     .join(", ");
     652            0 :                 tracing::warn!(
     653            0 :                     "Oversized vectored read ({} > {}) for keys {}",
     654              :                     buf_size,
     655              :                     max_vectored_read_bytes,
     656              :                     offenders
     657              :                 );
     658         1772 :             }
     659              : 
     660         1772 :             let buf = BytesMut::with_capacity(buf_size);
     661         1772 :             let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
     662              : 
     663         1772 :             match res {
     664         1772 :                 Ok(blobs_buf) => {
     665         1772 :                     let frozen_buf = blobs_buf.buf.freeze();
     666              : 
     667        10477 :                     for meta in blobs_buf.blobs.iter() {
     668        10477 :                         let img_buf = frozen_buf.slice(meta.start..meta.end);
     669        10477 :                         reconstruct_state.update_key(
     670        10477 :                             &meta.meta.key,
     671        10477 :                             self.lsn,
     672        10477 :                             Value::Image(img_buf),
     673        10477 :                         );
     674        10477 :                     }
     675              :                 }
     676            0 :                 Err(err) => {
     677            0 :                     let kind = err.kind();
     678            0 :                     for (_, blob_meta) in read.blobs_at.as_slice() {
     679            0 :                         reconstruct_state.on_key_error(
     680            0 :                             blob_meta.key,
     681            0 :                             PageReconstructError::from(anyhow!(
     682            0 :                                 "Failed to read blobs from virtual file {}: {}",
     683            0 :                                 self.file.path,
     684            0 :                                 kind
     685            0 :                             )),
     686            0 :                         );
     687            0 :                     }
     688              :                 }
     689              :             };
     690              :         }
     691           74 :     }
     692              : }
     693              : 
     694              : /// A builder object for constructing a new image layer.
     695              : ///
     696              : /// Usage:
     697              : ///
     698              : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
     699              : ///
     700              : /// 2. Write the contents by calling `put_image` for every key-value
     701              : ///    pair in the key range.
     702              : ///
     703              : /// 3. Call `finish`.
     704              : ///
     705              : struct ImageLayerWriterInner {
     706              :     conf: &'static PageServerConf,
     707              :     path: Utf8PathBuf,
     708              :     timeline_id: TimelineId,
     709              :     tenant_shard_id: TenantShardId,
     710              :     key_range: Range<Key>,
     711              :     lsn: Lsn,
     712              : 
     713              :     blob_writer: BlobWriter<false>,
     714              :     tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
     715              : }
     716              : 
     717              : impl ImageLayerWriterInner {
     718              :     ///
     719              :     /// Start building a new image layer.
     720              :     ///
     721          238 :     async fn new(
     722          238 :         conf: &'static PageServerConf,
     723          238 :         timeline_id: TimelineId,
     724          238 :         tenant_shard_id: TenantShardId,
     725          238 :         key_range: &Range<Key>,
     726          238 :         lsn: Lsn,
     727          238 :         ctx: &RequestContext,
     728          238 :     ) -> anyhow::Result<Self> {
     729          238 :         // Create the file initially with a temporary filename.
     730          238 :         // We'll atomically rename it to the final name when we're done.
     731          238 :         let path = ImageLayer::temp_path_for(
     732          238 :             conf,
     733          238 :             timeline_id,
     734          238 :             tenant_shard_id,
     735          238 :             &ImageLayerName {
     736          238 :                 key_range: key_range.clone(),
     737          238 :                 lsn,
     738          238 :             },
     739          238 :         );
     740          238 :         trace!("creating image layer {}", path);
     741          238 :         let mut file = {
     742          238 :             VirtualFile::open_with_options(
     743          238 :                 &path,
     744          238 :                 virtual_file::OpenOptions::new()
     745          238 :                     .write(true)
     746          238 :                     .create_new(true),
     747          238 :                 ctx,
     748          238 :             )
     749          184 :             .await?
     750              :         };
     751              :         // make room for the header block
     752          238 :         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
     753          238 :         let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
     754          238 : 
     755          238 :         // Initialize the b-tree index builder
     756          238 :         let block_buf = BlockBuf::new();
     757          238 :         let tree_builder = DiskBtreeBuilder::new(block_buf);
     758          238 : 
     759          238 :         let writer = Self {
     760          238 :             conf,
     761          238 :             path,
     762          238 :             timeline_id,
     763          238 :             tenant_shard_id,
     764          238 :             key_range: key_range.clone(),
     765          238 :             lsn,
     766          238 :             tree: tree_builder,
     767          238 :             blob_writer,
     768          238 :         };
     769          238 : 
     770          238 :         Ok(writer)
     771          238 :     }
     772              : 
     773              :     ///
     774              :     /// Write next value to the file.
     775              :     ///
     776              :     /// The page versions must be appended in blknum order.
     777              :     ///
     778       535686 :     async fn put_image(
     779       535686 :         &mut self,
     780       535686 :         key: Key,
     781       535686 :         img: Bytes,
     782       535686 :         ctx: &RequestContext,
     783       535686 :     ) -> anyhow::Result<()> {
     784       535686 :         ensure!(self.key_range.contains(&key));
     785       544170 :         let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
     786              :         // TODO: re-use the buffer for `img` further upstack
     787       535686 :         let off = res?;
     788              : 
     789       535686 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     790       535686 :         key.write_to_byte_slice(&mut keybuf);
     791       535686 :         self.tree.append(&keybuf, off)?;
     792              : 
     793       535686 :         Ok(())
     794       535686 :     }
     795              : 
     796              :     ///
     797              :     /// Finish writing the image layer.
     798              :     ///
     799          232 :     async fn finish(
     800          232 :         self,
     801          232 :         timeline: &Arc<Timeline>,
     802          232 :         ctx: &RequestContext,
     803          232 :     ) -> anyhow::Result<ResidentLayer> {
     804          232 :         let index_start_blk =
     805          232 :             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
     806          232 : 
     807          232 :         let mut file = self.blob_writer.into_inner();
     808          232 : 
     809          232 :         // Write out the index
     810          232 :         file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
     811            0 :             .await?;
     812          232 :         let (index_root_blk, block_buf) = self.tree.finish()?;
     813          920 :         for buf in block_buf.blocks {
     814          688 :             let (_buf, res) = file.write_all(buf, ctx).await;
     815          688 :             res?;
     816              :         }
     817              : 
     818              :         // Fill in the summary on blk 0
     819          232 :         let summary = Summary {
     820          232 :             magic: IMAGE_FILE_MAGIC,
     821          232 :             format_version: STORAGE_FORMAT_VERSION,
     822          232 :             tenant_id: self.tenant_shard_id.tenant_id,
     823          232 :             timeline_id: self.timeline_id,
     824          232 :             key_range: self.key_range.clone(),
     825          232 :             lsn: self.lsn,
     826          232 :             index_start_blk,
     827          232 :             index_root_blk,
     828          232 :         };
     829          232 : 
     830          232 :         let mut buf = Vec::with_capacity(PAGE_SZ);
     831          232 :         // TODO: could use smallvec here but it's a pain with Slice<T>
     832          232 :         Summary::ser_into(&summary, &mut buf)?;
     833          232 :         file.seek(SeekFrom::Start(0)).await?;
     834          232 :         let (_buf, res) = file.write_all(buf, ctx).await;
     835          232 :         res?;
     836              : 
     837          232 :         let metadata = file
     838          232 :             .metadata()
     839          119 :             .await
     840          232 :             .context("get metadata to determine file size")?;
     841              : 
     842          232 :         let desc = PersistentLayerDesc::new_img(
     843          232 :             self.tenant_shard_id,
     844          232 :             self.timeline_id,
     845          232 :             self.key_range.clone(),
     846          232 :             self.lsn,
     847          232 :             metadata.len(),
     848          232 :         );
     849          232 : 
     850          232 :         // Note: Because we open the file in write-only mode, we cannot
     851          232 :         // reuse the same VirtualFile for reading later. That's why we don't
     852          232 :         // set inner.file here. The first read will have to re-open it.
     853          232 : 
     854          232 :         // fsync the file
     855          232 :         file.sync_all().await?;
     856              : 
     857              :         // FIXME: why not carry the virtualfile here, it supports renaming?
     858          232 :         let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
     859              : 
     860          232 :         info!("created image layer {}", layer.local_path());
     861              : 
     862          232 :         Ok(layer)
     863          232 :     }
     864              : }
     865              : 
     866              : /// A builder object for constructing a new image layer.
     867              : ///
     868              : /// Usage:
     869              : ///
     870              : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
     871              : ///
     872              : /// 2. Write the contents by calling `put_image` for every key-value
     873              : ///    pair in the key range.
     874              : ///
     875              : /// 3. Call `finish`.
     876              : ///
     877              : /// # Note
     878              : ///
     879              : /// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
     880              : /// possible for the writer to drop before `finish` is actually called. So this
     881              : /// could lead to odd temporary files in the directory, exhausting the file system.
     882              : /// This structure wraps `ImageLayerWriterInner` and also contains a `Drop`
     883              : /// implementation that cleans up the temporary file on failure. It's not
     884              : /// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
     885              : /// out some fields, making it impossible to implement `Drop`.
     886              : ///
     887              : #[must_use]
     888              : pub struct ImageLayerWriter {
     889              :     inner: Option<ImageLayerWriterInner>,
     890              : }
     891              : 
     892              : impl ImageLayerWriter {
     893              :     ///
     894              :     /// Start building a new image layer.
     895              :     ///
     896          238 :     pub async fn new(
     897          238 :         conf: &'static PageServerConf,
     898          238 :         timeline_id: TimelineId,
     899          238 :         tenant_shard_id: TenantShardId,
     900          238 :         key_range: &Range<Key>,
     901          238 :         lsn: Lsn,
     902          238 :         ctx: &RequestContext,
     903          238 :     ) -> anyhow::Result<ImageLayerWriter> {
     904          238 :         Ok(Self {
     905          238 :             inner: Some(
     906          238 :                 ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
     907          184 :                     .await?,
     908              :             ),
     909              :         })
     910          238 :     }
     911              : 
     912              :     ///
     913              :     /// Write next value to the file.
     914              :     ///
     915              :     /// The page versions must be appended in blknum order.
     916              :     ///
     917       535686 :     pub async fn put_image(
     918       535686 :         &mut self,
     919       535686 :         key: Key,
     920       535686 :         img: Bytes,
     921       535686 :         ctx: &RequestContext,
     922       535686 :     ) -> anyhow::Result<()> {
     923       544170 :         self.inner.as_mut().unwrap().put_image(key, img, ctx).await
     924       535686 :     }
     925              : 
     926              :     ///
     927              :     /// Finish writing the image layer.
     928              :     ///
     929          232 :     pub(crate) async fn finish(
     930          232 :         mut self,
     931          232 :         timeline: &Arc<Timeline>,
     932          232 :         ctx: &RequestContext,
     933          232 :     ) -> anyhow::Result<super::ResidentLayer> {
     934          701 :         self.inner.take().unwrap().finish(timeline, ctx).await
     935          232 :     }
     936              : }
     937              : 
     938              : impl Drop for ImageLayerWriter {
     939          238 :     fn drop(&mut self) {
     940          238 :         if let Some(inner) = self.inner.take() {
     941            6 :             inner.blob_writer.into_inner().remove();
     942          232 :         }
     943          238 :     }
     944              : }
     945              : 
     946              : #[cfg(test)]
     947              : mod test {
     948              :     use std::time::Duration;
     949              : 
     950              :     use bytes::Bytes;
     951              :     use pageserver_api::{
     952              :         key::Key,
     953              :         shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
     954              :     };
     955              :     use utils::{
     956              :         generation::Generation,
     957              :         id::{TenantId, TimelineId},
     958              :         lsn::Lsn,
     959              :     };
     960              : 
     961              :     use crate::{
     962              :         tenant::{config::TenantConf, harness::TenantHarness},
     963              :         DEFAULT_PG_VERSION,
     964              :     };
     965              : 
     966              :     use super::ImageLayerWriter;
     967              : 
     968              :     #[tokio::test]
     969            2 :     async fn image_layer_rewrite() {
     970            2 :         let tenant_conf = TenantConf {
     971            2 :             gc_period: Duration::ZERO,
     972            2 :             compaction_period: Duration::ZERO,
     973            2 :             ..TenantConf::default()
     974            2 :         };
     975            2 :         let tenant_id = TenantId::generate();
     976            2 :         let mut gen = Generation::new(0xdead0001);
     977           10 :         let mut get_next_gen = || {
     978           10 :             let ret = gen;
     979           10 :             gen = gen.next();
     980           10 :             ret
     981           10 :         };
     982            2 :         // The LSN at which we will create an image layer to filter
     983            2 :         let lsn = Lsn(0xdeadbeef0000);
     984            2 :         let timeline_id = TimelineId::generate();
     985            2 : 
     986            2 :         //
     987            2 :         // Create an unsharded parent with a layer.
     988            2 :         //
     989            2 : 
     990            2 :         let harness = TenantHarness::create_custom(
     991            2 :             "test_image_layer_rewrite--parent",
     992            2 :             tenant_conf.clone(),
     993            2 :             tenant_id,
     994            2 :             ShardIdentity::unsharded(),
     995            2 :             get_next_gen(),
     996            2 :         )
     997            2 :         .unwrap();
     998            8 :         let (tenant, ctx) = harness.load().await;
     999            2 :         let timeline = tenant
    1000            2 :             .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
    1001            6 :             .await
    1002            2 :             .unwrap();
    1003            2 : 
    1004            2 :         // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
    1005            2 :         let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
    1006            2 :         let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
    1007            2 :         let range = input_start..input_end;
    1008            2 : 
    1009            2 :         // Build an image layer to filter
    1010            2 :         let resident = {
    1011            2 :             let mut writer = ImageLayerWriter::new(
    1012            2 :                 harness.conf,
    1013            2 :                 timeline_id,
    1014            2 :                 harness.tenant_shard_id,
    1015            2 :                 &range,
    1016            2 :                 lsn,
    1017            2 :                 &ctx,
    1018            2 :             )
    1019            2 :             .await
    1020            2 :             .unwrap();
    1021            2 : 
    1022            2 :             let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
    1023            2 :             let mut key = range.start;
    1024       262146 :             while key < range.end {
    1025       266239 :                 writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
    1026       262144 : 
    1027       262144 :                 key = key.next();
    1028            2 :             }
    1029          119 :             writer.finish(&timeline, &ctx).await.unwrap()
    1030            2 :         };
    1031            2 :         let original_size = resident.metadata().file_size;
    1032            2 : 
    1033            2 :         //
    1034            2 :         // Create child shards and do the rewrite, exercising filter().
    1035            2 :         // TODO: abstraction in TenantHarness for splits.
    1036            2 :         //
    1037            2 : 
    1038            2 :         // Filter for various shards: this exercises cases like values at start of key range, end of key
    1039            2 :         // range, middle of key range.
    1040            2 :         let shard_count = ShardCount::new(4);
    1041            8 :         for shard_number in 0..shard_count.count() {
    1042            2 :             //
    1043            2 :             // mimic the shard split
    1044            2 :             //
    1045            8 :             let shard_identity = ShardIdentity::new(
    1046            8 :                 ShardNumber(shard_number),
    1047            8 :                 shard_count,
    1048            8 :                 ShardStripeSize(0x8000),
    1049            8 :             )
    1050            8 :             .unwrap();
    1051            8 :             let harness = TenantHarness::create_custom(
    1052            8 :                 Box::leak(Box::new(format!(
    1053            8 :                     "test_image_layer_rewrite--child{}",
    1054            8 :                     shard_identity.shard_slug()
    1055            8 :                 ))),
    1056            8 :                 tenant_conf.clone(),
    1057            8 :                 tenant_id,
    1058            8 :                 shard_identity,
    1059            8 :                 // NB: in reality, the shards would each fork off their own gen number sequence from the parent.
    1060            8 :                 // But here, all we care about is that the gen number is unique.
    1061            8 :                 get_next_gen(),
    1062            8 :             )
    1063            8 :             .unwrap();
    1064           29 :             let (tenant, ctx) = harness.load().await;
    1065            8 :             let timeline = tenant
    1066            8 :                 .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
    1067           20 :                 .await
    1068            8 :                 .unwrap();
    1069            2 : 
    1070            2 :             //
    1071            2 :             // use filter() and make assertions
    1072            2 :             //
    1073            2 : 
    1074            8 :             let mut filtered_writer = ImageLayerWriter::new(
    1075            8 :                 harness.conf,
    1076            8 :                 timeline_id,
    1077            8 :                 harness.tenant_shard_id,
    1078            8 :                 &range,
    1079            8 :                 lsn,
    1080            8 :                 &ctx,
    1081            8 :             )
    1082            4 :             .await
    1083            8 :             .unwrap();
    1084            2 : 
    1085            8 :             let wrote_keys = resident
    1086            8 :                 .filter(&shard_identity, &mut filtered_writer, &ctx)
    1087       266719 :                 .await
    1088            8 :                 .unwrap();
    1089            8 :             let replacement = if wrote_keys > 0 {
    1090          129 :                 Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
    1091            2 :             } else {
    1092            2 :                 None
    1093            2 :             };
    1094            2 : 
    1095            2 :             // This exact size and those below will need updating as/when the layer encoding changes, but
    1096            2 :             // should be deterministic for a given version of the format, as we used no randomness generating the input.
    1097            8 :             assert_eq!(original_size, 1597440);
    1098            2 : 
    1099            8 :             match shard_number {
    1100            2 :                 0 => {
    1101            2 :                     // We should have written out just one stripe for our shard identity
    1102            2 :                     assert_eq!(wrote_keys, 0x8000);
    1103            2 :                     let replacement = replacement.unwrap();
    1104            2 : 
    1105            2 :                     // We should have dropped some of the data
    1106            2 :                     assert!(replacement.metadata().file_size < original_size);
    1107            2 :                     assert!(replacement.metadata().file_size > 0);
    1108            2 : 
    1109            2 :                     // Assert that we dropped ~3/4 of the data.
    1110            2 :                     assert_eq!(replacement.metadata().file_size, 417792);
    1111            2 :                 }
    1112            2 :                 1 => {
    1113            2 :                     // Shard 1 has no keys in our input range
    1114            2 :                     assert_eq!(wrote_keys, 0x0);
    1115            2 :                     assert!(replacement.is_none());
    1116            2 :                 }
    1117            2 :                 2 => {
    1118            2 :                     // Shard 2 has one stripe in the input range
    1119            2 :                     assert_eq!(wrote_keys, 0x8000);
    1120            2 :                     let replacement = replacement.unwrap();
    1121            2 :                     assert!(replacement.metadata().file_size < original_size);
    1122            2 :                     assert!(replacement.metadata().file_size > 0);
    1123            2 :                     assert_eq!(replacement.metadata().file_size, 417792);
    1124            2 :                 }
    1125            2 :                 3 => {
    1126            2 :                     // Shard 3 has two stripes in the input range
    1127            2 :                     assert_eq!(wrote_keys, 0x10000);
    1128            2 :                     let replacement = replacement.unwrap();
    1129            2 :                     assert!(replacement.metadata().file_size < original_size);
    1130            2 :                     assert!(replacement.metadata().file_size > 0);
    1131            2 :                     assert_eq!(replacement.metadata().file_size, 811008);
    1132            2 :                 }
    1133            2 :                 _ => unreachable!(),
    1134            2 :             }
    1135            2 :         }
    1136            2 :     }
    1137              : }
        

Generated by: LCOV version 2.1-beta