LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - image_layer.rs (source / functions) Coverage Total Hit
Test: c639aa5f7ab62b43d647b10f40d15a15686ce8a9.info Lines: 57.5 % 391 225
Test Date: 2024-02-12 20:26:03 Functions: 36.4 % 66 24

            Line data    Source code
       1              : //! An ImageLayer represents an image or a snapshot of a key-range at
       2              : //! one particular LSN. It contains an image of all key-value pairs
       3              : //! in its key-range. Any key that falls into the image layer's range
       4              : //! but does not exist in the layer, does not exist.
       5              : //!
       6              : //! An image layer is stored in a file on disk. The file is stored in
       7              : //! timelines/<timeline_id> directory.  Currently, there are no
       8              : //! subdirectories, and each image layer file is named like this:
       9              : //!
      10              : //! ```text
      11              : //!    <key start>-<key end>__<LSN>
      12              : //! ```
      13              : //!
      14              : //! For example:
      15              : //!
      16              : //! ```text
      17              : //!    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
      18              : //! ```
      19              : //!
      20              : //! Every image layer file consists of three parts: "summary",
      21              : //! "index", and "values".  The summary is a fixed size header at the
      22              : //! beginning of the file, and it contains basic information about the
      23              : //! layer, and offsets to the other parts. The "index" is a B-tree,
      24              : //! mapping from Key to an offset in the "values" part.  The
      25              : //! actual page images are stored in the "values" part.
      26              : use crate::config::PageServerConf;
      27              : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
      28              : use crate::page_cache::PAGE_SZ;
      29              : use crate::repository::{Key, KEY_SIZE};
      30              : use crate::tenant::blob_io::BlobWriter;
      31              : use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
      32              : use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
      33              : use crate::tenant::storage_layer::{
      34              :     LayerAccessStats, ValueReconstructResult, ValueReconstructState,
      35              : };
      36              : use crate::tenant::Timeline;
      37              : use crate::virtual_file::{self, VirtualFile};
      38              : use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
      39              : use anyhow::{bail, ensure, Context, Result};
      40              : use bytes::Bytes;
      41              : use camino::{Utf8Path, Utf8PathBuf};
      42              : use hex;
      43              : use pageserver_api::models::LayerAccessKind;
      44              : use pageserver_api::shard::TenantShardId;
      45              : use rand::{distributions::Alphanumeric, Rng};
      46              : use serde::{Deserialize, Serialize};
      47              : use std::fs::File;
      48              : use std::io::SeekFrom;
      49              : use std::ops::Range;
      50              : use std::os::unix::prelude::FileExt;
      51              : use std::sync::Arc;
      52              : use tokio::sync::OnceCell;
      53              : use tracing::*;
      54              : 
      55              : use utils::{
      56              :     bin_ser::BeSer,
      57              :     id::{TenantId, TimelineId},
      58              :     lsn::Lsn,
      59              : };
      60              : 
      61              : use super::filename::ImageFileName;
      62              : use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
      63              : 
      64              : ///
      65              : /// Header stored in the beginning of the file
      66              : ///
      67              : /// After this comes the 'values' part, starting on block 1. After that,
      68              : /// the 'index' starts at the block indicated by 'index_start_blk'
      69              : ///
      70        21244 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
      71              : pub struct Summary {
      72              :     /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
      73              :     pub magic: u16,
      74              :     pub format_version: u16,
      75              : 
      76              :     pub tenant_id: TenantId,
      77              :     pub timeline_id: TimelineId,
      78              :     pub key_range: Range<Key>,
      79              :     pub lsn: Lsn,
      80              : 
      81              :     /// Block number where the 'index' part of the file begins.
      82              :     pub index_start_blk: u32,
      83              :     /// Block within the 'index', where the B-tree root page is stored
      84              :     pub index_root_blk: u32,
      85              :     // the 'values' part starts after the summary header, on block 1.
      86              : }
      87              : 
      88              : impl From<&ImageLayer> for Summary {
      89            0 :     fn from(layer: &ImageLayer) -> Self {
      90            0 :         Self::expected(
      91            0 :             layer.desc.tenant_shard_id.tenant_id,
      92            0 :             layer.desc.timeline_id,
      93            0 :             layer.desc.key_range.clone(),
      94            0 :             layer.lsn,
      95            0 :         )
      96            0 :     }
      97              : }
      98              : 
      99              : impl Summary {
     100        21244 :     pub(super) fn expected(
     101        21244 :         tenant_id: TenantId,
     102        21244 :         timeline_id: TimelineId,
     103        21244 :         key_range: Range<Key>,
     104        21244 :         lsn: Lsn,
     105        21244 :     ) -> Self {
     106        21244 :         Self {
     107        21244 :             magic: IMAGE_FILE_MAGIC,
     108        21244 :             format_version: STORAGE_FORMAT_VERSION,
     109        21244 :             tenant_id,
     110        21244 :             timeline_id,
     111        21244 :             key_range,
     112        21244 :             lsn,
     113        21244 : 
     114        21244 :             index_start_blk: 0,
     115        21244 :             index_root_blk: 0,
     116        21244 :         }
     117        21244 :     }
     118              : }
     119              : 
     120              : /// This is used only from `pagectl`. Within pageserver, all layers are
     121              : /// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
     122              : pub struct ImageLayer {
     123              :     path: Utf8PathBuf,
     124              :     pub desc: PersistentLayerDesc,
     125              :     // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
     126              :     pub lsn: Lsn,
     127              :     access_stats: LayerAccessStats,
     128              :     inner: OnceCell<ImageLayerInner>,
     129              : }
     130              : 
     131              : impl std::fmt::Debug for ImageLayer {
     132            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     133            0 :         use super::RangeDisplayDebug;
     134            0 : 
     135            0 :         f.debug_struct("ImageLayer")
     136            0 :             .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
     137            0 :             .field("file_size", &self.desc.file_size)
     138            0 :             .field("lsn", &self.lsn)
     139            0 :             .field("inner", &self.inner)
     140            0 :             .finish()
     141            0 :     }
     142              : }
     143              : 
     144              : /// ImageLayer is the in-memory data structure associated with an on-disk image
     145              : /// file.
     146              : pub struct ImageLayerInner {
     147              :     // values copied from summary
     148              :     index_start_blk: u32,
     149              :     index_root_blk: u32,
     150              : 
     151              :     lsn: Lsn,
     152              : 
     153              :     /// Reader object for reading blocks from the file.
     154              :     file: FileBlockReader,
     155              : }
     156              : 
     157              : impl std::fmt::Debug for ImageLayerInner {
     158            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     159            0 :         f.debug_struct("ImageLayerInner")
     160            0 :             .field("index_start_blk", &self.index_start_blk)
     161            0 :             .field("index_root_blk", &self.index_root_blk)
     162            0 :             .finish()
     163            0 :     }
     164              : }
     165              : 
     166              : impl ImageLayerInner {
     167            0 :     pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
     168            0 :         let file = &self.file;
     169            0 :         let tree_reader =
     170            0 :             DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
     171            0 : 
     172            0 :         tree_reader.dump().await?;
     173              : 
     174            0 :         tree_reader
     175            0 :             .visit(
     176            0 :                 &[0u8; KEY_SIZE],
     177            0 :                 VisitDirection::Forwards,
     178            0 :                 |key, value| {
     179            0 :                     println!("key: {} offset {}", hex::encode(key), value);
     180            0 :                     true
     181            0 :                 },
     182            0 :                 ctx,
     183            0 :             )
     184            0 :             .await?;
     185              : 
     186            0 :         Ok(())
     187            0 :     }
     188              : }
     189              : 
     190              : /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
     191              : impl std::fmt::Display for ImageLayer {
     192            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     193            0 :         write!(f, "{}", self.layer_desc().short_id())
     194            0 :     }
     195              : }
     196              : 
     197              : impl AsLayerDesc for ImageLayer {
     198            0 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     199            0 :         &self.desc
     200            0 :     }
     201              : }
     202              : 
     203              : impl ImageLayer {
     204            0 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
     205            0 :         self.desc.dump();
     206            0 : 
     207            0 :         if !verbose {
     208            0 :             return Ok(());
     209            0 :         }
     210              : 
     211            0 :         let inner = self.load(LayerAccessKind::Dump, ctx).await?;
     212              : 
     213            0 :         inner.dump(ctx).await?;
     214              : 
     215            0 :         Ok(())
     216            0 :     }
     217              : 
     218         6472 :     fn temp_path_for(
     219         6472 :         conf: &PageServerConf,
     220         6472 :         timeline_id: TimelineId,
     221         6472 :         tenant_shard_id: TenantShardId,
     222         6472 :         fname: &ImageFileName,
     223         6472 :     ) -> Utf8PathBuf {
     224         6472 :         let rand_string: String = rand::thread_rng()
     225         6472 :             .sample_iter(&Alphanumeric)
     226         6472 :             .take(8)
     227         6472 :             .map(char::from)
     228         6472 :             .collect();
     229         6472 : 
     230         6472 :         conf.timeline_path(&tenant_shard_id, &timeline_id)
     231         6472 :             .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
     232         6472 :     }
     233              : 
     234              :     ///
     235              :     /// Open the underlying file and read the metadata into memory, if it's
     236              :     /// not loaded already.
     237              :     ///
     238            0 :     async fn load(
     239            0 :         &self,
     240            0 :         access_kind: LayerAccessKind,
     241            0 :         ctx: &RequestContext,
     242            0 :     ) -> Result<&ImageLayerInner> {
     243            0 :         self.access_stats.record_access(access_kind, ctx);
     244            0 :         self.inner
     245            0 :             .get_or_try_init(|| self.load_inner(ctx))
     246            0 :             .await
     247            0 :             .with_context(|| format!("Failed to load image layer {}", self.path()))
     248            0 :     }
     249              : 
     250            0 :     async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
     251            0 :         let path = self.path();
     252              : 
     253            0 :         let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
     254            0 :             .await
     255            0 :             .and_then(|res| res)?;
     256              : 
     257              :         // not production code
     258            0 :         let actual_filename = path.file_name().unwrap().to_owned();
     259            0 :         let expected_filename = self.layer_desc().filename().file_name();
     260            0 : 
     261            0 :         if actual_filename != expected_filename {
     262            0 :             println!("warning: filename does not match what is expected from in-file summary");
     263            0 :             println!("actual: {:?}", actual_filename);
     264            0 :             println!("expected: {:?}", expected_filename);
     265            0 :         }
     266              : 
     267            0 :         Ok(loaded)
     268            0 :     }
     269              : 
     270              :     /// Create an ImageLayer struct representing an existing file on disk.
     271              :     ///
     272              :     /// This variant is only used for debugging purposes, by the 'pagectl' binary.
     273            0 :     pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
     274            0 :         let mut summary_buf = vec![0; PAGE_SZ];
     275            0 :         file.read_exact_at(&mut summary_buf, 0)?;
     276            0 :         let summary = Summary::des_prefix(&summary_buf)?;
     277            0 :         let metadata = file
     278            0 :             .metadata()
     279            0 :             .context("get file metadata to determine size")?;
     280              : 
     281              :         // This function is never used for constructing layers in a running pageserver,
     282              :         // so it does not need an accurate TenantShardId.
     283            0 :         let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
     284            0 : 
     285            0 :         Ok(ImageLayer {
     286            0 :             path: path.to_path_buf(),
     287            0 :             desc: PersistentLayerDesc::new_img(
     288            0 :                 tenant_shard_id,
     289            0 :                 summary.timeline_id,
     290            0 :                 summary.key_range,
     291            0 :                 summary.lsn,
     292            0 :                 metadata.len(),
     293            0 :             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
     294            0 :             lsn: summary.lsn,
     295            0 :             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
     296            0 :             inner: OnceCell::new(),
     297            0 :         })
     298            0 :     }
     299              : 
     300            0 :     fn path(&self) -> Utf8PathBuf {
     301            0 :         self.path.clone()
     302            0 :     }
     303              : }
     304              : 
     305            0 : #[derive(thiserror::Error, Debug)]
     306              : pub enum RewriteSummaryError {
     307              :     #[error("magic mismatch")]
     308              :     MagicMismatch,
     309              :     #[error(transparent)]
     310              :     Other(#[from] anyhow::Error),
     311              : }
     312              : 
     313              : impl From<std::io::Error> for RewriteSummaryError {
     314            0 :     fn from(e: std::io::Error) -> Self {
     315            0 :         Self::Other(anyhow::anyhow!(e))
     316            0 :     }
     317              : }
     318              : 
     319              : impl ImageLayer {
     320            0 :     pub async fn rewrite_summary<F>(
     321            0 :         path: &Utf8Path,
     322            0 :         rewrite: F,
     323            0 :         ctx: &RequestContext,
     324            0 :     ) -> Result<(), RewriteSummaryError>
     325            0 :     where
     326            0 :         F: Fn(Summary) -> Summary,
     327            0 :     {
     328            0 :         let file = VirtualFile::open_with_options(
     329            0 :             path,
     330            0 :             virtual_file::OpenOptions::new().read(true).write(true),
     331            0 :         )
     332            0 :         .await
     333            0 :         .with_context(|| format!("Failed to open file '{}'", path))?;
     334            0 :         let file = FileBlockReader::new(file);
     335            0 :         let summary_blk = file.read_blk(0, ctx).await?;
     336            0 :         let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
     337            0 :         let mut file = file.file;
     338            0 :         if actual_summary.magic != IMAGE_FILE_MAGIC {
     339            0 :             return Err(RewriteSummaryError::MagicMismatch);
     340            0 :         }
     341            0 : 
     342            0 :         let new_summary = rewrite(actual_summary);
     343            0 : 
     344            0 :         let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
     345            0 :         Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
     346            0 :         if buf.spilled() {
     347              :             // The code in ImageLayerWriterInner just warn!()s for this.
     348              :             // It should probably error out as well.
     349            0 :             return Err(RewriteSummaryError::Other(anyhow::anyhow!(
     350            0 :                 "Used more than one page size for summary buffer: {}",
     351            0 :                 buf.len()
     352            0 :             )));
     353            0 :         }
     354            0 :         file.seek(SeekFrom::Start(0)).await?;
     355            0 :         file.write_all(&buf).await?;
     356            0 :         Ok(())
     357            0 :     }
     358              : }
     359              : 
     360              : impl ImageLayerInner {
     361              :     /// Returns nested result following Result<Result<_, OpErr>, Critical>:
     362              :     /// - inner has the success or transient failure
     363              :     /// - outer has the permanent failure
     364        21244 :     pub(super) async fn load(
     365        21244 :         path: &Utf8Path,
     366        21244 :         lsn: Lsn,
     367        21244 :         summary: Option<Summary>,
     368        21244 :         ctx: &RequestContext,
     369        21244 :     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
     370        21244 :         let file = match VirtualFile::open(path).await {
     371        21244 :             Ok(file) => file,
     372            0 :             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
     373              :         };
     374        21244 :         let file = FileBlockReader::new(file);
     375        21244 :         let summary_blk = match file.read_blk(0, ctx).await {
     376        21244 :             Ok(blk) => blk,
     377            0 :             Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
     378              :         };
     379              : 
     380              :         // length is the only way how this could fail, so it's not actually likely at all unless
     381              :         // read_blk returns wrong sized block.
     382              :         //
     383              :         // TODO: confirm and make this into assertion
     384        21244 :         let actual_summary =
     385        21244 :             Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
     386              : 
     387        21244 :         if let Some(mut expected_summary) = summary {
     388              :             // production code path
     389        21244 :             expected_summary.index_start_blk = actual_summary.index_start_blk;
     390        21244 :             expected_summary.index_root_blk = actual_summary.index_root_blk;
     391        21244 : 
     392        21244 :             if actual_summary != expected_summary {
     393            0 :                 bail!(
     394            0 :                     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
     395            0 :                     actual_summary,
     396            0 :                     expected_summary
     397            0 :                 );
     398        21244 :             }
     399            0 :         }
     400              : 
     401        21244 :         Ok(Ok(ImageLayerInner {
     402        21244 :             index_start_blk: actual_summary.index_start_blk,
     403        21244 :             index_root_blk: actual_summary.index_root_blk,
     404        21244 :             lsn,
     405        21244 :             file,
     406        21244 :         }))
     407        21244 :     }
     408              : 
     409       423734 :     pub(super) async fn get_value_reconstruct_data(
     410       423734 :         &self,
     411       423734 :         key: Key,
     412       423734 :         reconstruct_state: &mut ValueReconstructState,
     413       423734 :         ctx: &RequestContext,
     414       423734 :     ) -> anyhow::Result<ValueReconstructResult> {
     415       423734 :         let file = &self.file;
     416       423734 :         let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
     417       423734 : 
     418       423734 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     419       423734 :         key.write_to_byte_slice(&mut keybuf);
     420       423734 :         if let Some(offset) = tree_reader
     421       423734 :             .get(
     422       423734 :                 &keybuf,
     423       423734 :                 &RequestContextBuilder::extend(ctx)
     424       423734 :                     .page_content_kind(PageContentKind::ImageLayerBtreeNode)
     425       423734 :                     .build(),
     426       423734 :             )
     427         6219 :             .await?
     428              :         {
     429       423730 :             let blob = file
     430       423730 :                 .block_cursor()
     431       423730 :                 .read_blob(
     432       423730 :                     offset,
     433       423730 :                     &RequestContextBuilder::extend(ctx)
     434       423730 :                         .page_content_kind(PageContentKind::ImageLayerValue)
     435       423730 :                         .build(),
     436       423730 :                 )
     437         6815 :                 .await
     438       423730 :                 .with_context(|| format!("failed to read value from offset {}", offset))?;
     439       423730 :             let value = Bytes::from(blob);
     440       423730 : 
     441       423730 :             reconstruct_state.img = Some((self.lsn, value));
     442       423730 :             Ok(ValueReconstructResult::Complete)
     443              :         } else {
     444            4 :             Ok(ValueReconstructResult::Missing)
     445              :         }
     446       423734 :     }
     447              : }
     448              : 
     449              : /// A builder object for constructing a new image layer.
     450              : ///
     451              : /// Usage:
     452              : ///
     453              : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
     454              : ///
     455              : /// 2. Write the contents by calling `put_page_image` for every key-value
     456              : ///    pair in the key range.
     457              : ///
     458              : /// 3. Call `finish`.
     459              : ///
     460              : struct ImageLayerWriterInner {
     461              :     conf: &'static PageServerConf,
     462              :     path: Utf8PathBuf,
     463              :     timeline_id: TimelineId,
     464              :     tenant_shard_id: TenantShardId,
     465              :     key_range: Range<Key>,
     466              :     lsn: Lsn,
     467              : 
     468              :     blob_writer: BlobWriter<false>,
     469              :     tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
     470              : }
     471              : 
     472              : impl ImageLayerWriterInner {
     473              :     ///
     474              :     /// Start building a new image layer.
     475              :     ///
     476         6472 :     async fn new(
     477         6472 :         conf: &'static PageServerConf,
     478         6472 :         timeline_id: TimelineId,
     479         6472 :         tenant_shard_id: TenantShardId,
     480         6472 :         key_range: &Range<Key>,
     481         6472 :         lsn: Lsn,
     482         6472 :     ) -> anyhow::Result<Self> {
     483         6472 :         // Create the file initially with a temporary filename.
     484         6472 :         // We'll atomically rename it to the final name when we're done.
     485         6472 :         let path = ImageLayer::temp_path_for(
     486         6472 :             conf,
     487         6472 :             timeline_id,
     488         6472 :             tenant_shard_id,
     489         6472 :             &ImageFileName {
     490         6472 :                 key_range: key_range.clone(),
     491         6472 :                 lsn,
     492         6472 :             },
     493         6472 :         );
     494         6472 :         info!("new image layer {path}");
     495         6472 :         let mut file = {
     496         6472 :             VirtualFile::open_with_options(
     497         6472 :                 &path,
     498         6472 :                 virtual_file::OpenOptions::new()
     499         6472 :                     .write(true)
     500         6472 :                     .create_new(true),
     501         6472 :             )
     502           70 :             .await?
     503              :         };
     504              :         // make room for the header block
     505         6472 :         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
     506         6472 :         let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
     507         6472 : 
     508         6472 :         // Initialize the b-tree index builder
     509         6472 :         let block_buf = BlockBuf::new();
     510         6472 :         let tree_builder = DiskBtreeBuilder::new(block_buf);
     511         6472 : 
     512         6472 :         let writer = Self {
     513         6472 :             conf,
     514         6472 :             path,
     515         6472 :             timeline_id,
     516         6472 :             tenant_shard_id,
     517         6472 :             key_range: key_range.clone(),
     518         6472 :             lsn,
     519         6472 :             tree: tree_builder,
     520         6472 :             blob_writer,
     521         6472 :         };
     522         6472 : 
     523         6472 :         Ok(writer)
     524         6472 :     }
     525              : 
     526              :     ///
     527              :     /// Write next value to the file.
     528              :     ///
     529              :     /// The page versions must be appended in blknum order.
     530              :     ///
     531       229254 :     async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
     532       229254 :         ensure!(self.key_range.contains(&key));
     533       229254 :         let (_img, res) = self.blob_writer.write_blob(img).await;
     534              :         // TODO: re-use the buffer for `img` further upstack
     535       229254 :         let off = res?;
     536              : 
     537       229254 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     538       229254 :         key.write_to_byte_slice(&mut keybuf);
     539       229254 :         self.tree.append(&keybuf, off)?;
     540              : 
     541       229254 :         Ok(())
     542       229254 :     }
     543              : 
     544              :     ///
     545              :     /// Finish writing the image layer.
     546              :     ///
     547         6472 :     async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
     548         6472 :         let index_start_blk =
     549         6472 :             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
     550         6472 : 
     551         6472 :         let mut file = self.blob_writer.into_inner();
     552         6472 : 
     553         6472 :         // Write out the index
     554         6472 :         file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
     555            0 :             .await?;
     556         6472 :         let (index_root_blk, block_buf) = self.tree.finish()?;
     557        13158 :         for buf in block_buf.blocks {
     558         6686 :             file.write_all(buf.as_ref()).await?;
     559              :         }
     560              : 
     561              :         // Fill in the summary on blk 0
     562         6472 :         let summary = Summary {
     563         6472 :             magic: IMAGE_FILE_MAGIC,
     564         6472 :             format_version: STORAGE_FORMAT_VERSION,
     565         6472 :             tenant_id: self.tenant_shard_id.tenant_id,
     566         6472 :             timeline_id: self.timeline_id,
     567         6472 :             key_range: self.key_range.clone(),
     568         6472 :             lsn: self.lsn,
     569         6472 :             index_start_blk,
     570         6472 :             index_root_blk,
     571         6472 :         };
     572         6472 : 
     573         6472 :         let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
     574         6472 :         Summary::ser_into(&summary, &mut buf)?;
     575         6472 :         if buf.spilled() {
     576              :             // This is bad as we only have one free block for the summary
     577            0 :             warn!(
     578            0 :                 "Used more than one page size for summary buffer: {}",
     579            0 :                 buf.len()
     580            0 :             );
     581         6472 :         }
     582         6472 :         file.seek(SeekFrom::Start(0)).await?;
     583         6472 :         file.write_all(&buf).await?;
     584              : 
     585         6472 :         let metadata = file
     586         6472 :             .metadata()
     587           50 :             .await
     588         6472 :             .context("get metadata to determine file size")?;
     589              : 
     590         6472 :         let desc = PersistentLayerDesc::new_img(
     591         6472 :             self.tenant_shard_id,
     592         6472 :             self.timeline_id,
     593         6472 :             self.key_range.clone(),
     594         6472 :             self.lsn,
     595         6472 :             metadata.len(),
     596         6472 :         );
     597         6472 : 
     598         6472 :         // Note: Because we open the file in write-only mode, we cannot
     599         6472 :         // reuse the same VirtualFile for reading later. That's why we don't
     600         6472 :         // set inner.file here. The first read will have to re-open it.
     601         6472 : 
     602         6472 :         // fsync the file
     603         6472 :         file.sync_all().await?;
     604              : 
     605              :         // FIXME: why not carry the virtualfile here, it supports renaming?
     606         6472 :         let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
     607              : 
     608            0 :         trace!("created image layer {}", layer.local_path());
     609              : 
     610         6472 :         Ok(layer)
     611         6472 :     }
     612              : }
     613              : 
/// A builder object for constructing a new image layer.
///
/// Usage:
///
/// 1. Create the `ImageLayerWriter` by calling `ImageLayerWriter::new(...)`.
///
/// 2. Write the contents by calling `put_image` for every key-value
///    pair in the key range.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
/// possible for the writer to be dropped before `finish` is actually called.
/// That would leave stray temporary files in the directory, which could
/// eventually exhaust the file system.
/// This structure wraps `ImageLayerWriterInner` and adds a `Drop`
/// implementation that removes the temporary file in that case. It's not
/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
/// out some fields, making it impossible to implement `Drop`.
///
#[must_use]
pub struct ImageLayerWriter {
    // `Some` until `finish` takes the inner writer out; if it is still `Some`
    // when the writer is dropped, `Drop` removes the underlying file.
    inner: Option<ImageLayerWriterInner>,
}
     639              : 
     640              : impl ImageLayerWriter {
     641              :     ///
     642              :     /// Start building a new image layer.
     643              :     ///
     644         6472 :     pub async fn new(
     645         6472 :         conf: &'static PageServerConf,
     646         6472 :         timeline_id: TimelineId,
     647         6472 :         tenant_shard_id: TenantShardId,
     648         6472 :         key_range: &Range<Key>,
     649         6472 :         lsn: Lsn,
     650         6472 :     ) -> anyhow::Result<ImageLayerWriter> {
     651         6472 :         Ok(Self {
     652         6472 :             inner: Some(
     653         6472 :                 ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
     654           70 :                     .await?,
     655              :             ),
     656              :         })
     657         6472 :     }
     658              : 
     659              :     ///
     660              :     /// Write next value to the file.
     661              :     ///
     662              :     /// The page versions must be appended in blknum order.
     663              :     ///
     664       229254 :     pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
     665       229254 :         self.inner.as_mut().unwrap().put_image(key, img).await
     666       229254 :     }
     667              : 
     668              :     ///
     669              :     /// Finish writing the image layer.
     670              :     ///
     671         6472 :     pub(crate) async fn finish(
     672         6472 :         mut self,
     673         6472 :         timeline: &Arc<Timeline>,
     674         6472 :     ) -> anyhow::Result<super::ResidentLayer> {
     675         6472 :         self.inner.take().unwrap().finish(timeline).await
     676         6472 :     }
     677              : }
     678              : 
     679              : impl Drop for ImageLayerWriter {
     680         6472 :     fn drop(&mut self) {
     681         6472 :         if let Some(inner) = self.inner.take() {
     682            0 :             inner.blob_writer.into_inner().remove();
     683         6472 :         }
     684         6472 :     }
     685              : }
        

Generated by: LCOV version 2.1-beta