LCOV - differential code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - image_layer.rs (source / functions) Coverage Total Hit UBC CBC
Current: f6946e90941b557c917ac98cd5a7e9506d180f3e.info Lines: 79.1 % 464 367 97 367
Current Date: 2023-10-19 02:04:12 Functions: 68.1 % 69 47 22 47
Baseline: c8637f37369098875162f194f92736355783b050.info
Baseline Date: 2023-10-18 20:25:20

           TLA  Line data    Source code
       1                 : //! An ImageLayer represents an image or a snapshot of a key-range at
       2                 : //! one particular LSN. It contains an image of all key-value pairs
       3                 : //! in its key-range. Any key that falls into the image layer's range
       4                 : //! but does not exist in the layer, does not exist.
       5                 : //!
       6                 : //! An image layer is stored in a file on disk. The file is stored in
       7                 : //! timelines/<timeline_id> directory.  Currently, there are no
       8                 : //! subdirectories, and each image layer file is named like this:
       9                 : //!
      10                 : //! ```text
      11                 : //!    <key start>-<key end>__<LSN>
      12                 : //! ```
      13                 : //!
      14                 : //! For example:
      15                 : //!
      16                 : //! ```text
      17                 : //!    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
      18                 : //! ```
      19                 : //!
      20                 : //! Every image layer file consists of three parts: "summary",
      21                 : //! "index", and "values".  The summary is a fixed size header at the
      22                 : //! beginning of the file, and it contains basic information about the
      23                 : //! layer, and offsets to the other parts. The "index" is a B-tree,
      24                 : //! mapping from Key to an offset in the "values" part.  The
      25                 : //! actual page images are stored in the "values" part.
      26                 : use crate::config::PageServerConf;
      27                 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
      28                 : use crate::page_cache::PAGE_SZ;
      29                 : use crate::repository::{Key, KEY_SIZE};
      30                 : use crate::tenant::blob_io::BlobWriter;
      31                 : use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
      32                 : use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
      33                 : use crate::tenant::storage_layer::{
      34                 :     LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
      35                 : };
      36                 : use crate::virtual_file::VirtualFile;
      37                 : use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
      38                 : use anyhow::{bail, ensure, Context, Result};
      39                 : use bytes::Bytes;
      40                 : use camino::{Utf8Path, Utf8PathBuf};
      41                 : use hex;
      42                 : use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
      43                 : use rand::{distributions::Alphanumeric, Rng};
      44                 : use serde::{Deserialize, Serialize};
      45                 : use std::fs::{self, File};
      46                 : use std::io::SeekFrom;
      47                 : use std::ops::Range;
      48                 : use std::os::unix::prelude::FileExt;
      49                 : use tokio::sync::OnceCell;
      50                 : use tracing::*;
      51                 : 
      52                 : use utils::{
      53                 :     bin_ser::BeSer,
      54                 :     id::{TenantId, TimelineId},
      55                 :     lsn::Lsn,
      56                 : };
      57                 : 
      58                 : use super::filename::ImageFileName;
      59                 : use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
      60                 : 
      61                 : ///
      62                 : /// Header stored in the beginning of the file
      63                 : ///
      64                 : /// After this comes the 'values' part, starting on block 1. After that,
      65                 : /// the 'index' starts at the block indicated by 'index_start_blk'
      66                 : ///
      67 CBC        4002 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
      68                 : pub(super) struct Summary {
      69                 :     /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
      70                 :     magic: u16,
      71                 :     format_version: u16,
      72                 : 
      73                 :     tenant_id: TenantId,
      74                 :     timeline_id: TimelineId,
      75                 :     key_range: Range<Key>,
      76                 :     lsn: Lsn,
      77                 : 
      78                 :     /// Block number where the 'index' part of the file begins.
      79                 :     index_start_blk: u32,
      80                 :     /// Block within the 'index', where the B-tree root page is stored
      81                 :     index_root_blk: u32,
      82                 :     // the 'values' part starts after the summary header, on block 1.
      83                 : }
      84                 : 
      85                 : impl From<&ImageLayer> for Summary {
      86            4002 :     fn from(layer: &ImageLayer) -> Self {
      87            4002 :         Self::expected(
      88            4002 :             layer.desc.tenant_id,
      89            4002 :             layer.desc.timeline_id,
      90            4002 :             layer.desc.key_range.clone(),
      91            4002 :             layer.lsn,
      92            4002 :         )
      93            4002 :     }
      94                 : }
      95                 : 
      96                 : impl Summary {
      97            4002 :     pub(super) fn expected(
      98            4002 :         tenant_id: TenantId,
      99            4002 :         timeline_id: TimelineId,
     100            4002 :         key_range: Range<Key>,
     101            4002 :         lsn: Lsn,
     102            4002 :     ) -> Self {
     103            4002 :         Self {
     104            4002 :             magic: IMAGE_FILE_MAGIC,
     105            4002 :             format_version: STORAGE_FORMAT_VERSION,
     106            4002 :             tenant_id,
     107            4002 :             timeline_id,
     108            4002 :             key_range,
     109            4002 :             lsn,
     110            4002 : 
     111            4002 :             index_start_blk: 0,
     112            4002 :             index_root_blk: 0,
     113            4002 :         }
     114            4002 :     }
     115                 : }
     116                 : 
     117                 : /// ImageLayer is the in-memory data structure associated with an on-disk image
     118                 : /// file.
     119                 : ///
     120                 : /// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
     121                 : /// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
     122                 : /// Otherwise the struct is just a placeholder for a file that exists on disk,
     123                 : /// and it needs to be loaded before using it in queries.
     124                 : pub struct ImageLayer {
     125                 :     path_or_conf: PathOrConf,
     126                 : 
     127                 :     pub desc: PersistentLayerDesc,
     128                 :     // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
     129                 :     pub lsn: Lsn,
     130                 : 
     131                 :     access_stats: LayerAccessStats,
     132                 : 
     133                 :     inner: OnceCell<ImageLayerInner>,
     134                 : }
     135                 : 
     136                 : impl std::fmt::Debug for ImageLayer {
     137 UBC           0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     138               0 :         use super::RangeDisplayDebug;
     139               0 : 
     140               0 :         f.debug_struct("ImageLayer")
     141               0 :             .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
     142               0 :             .field("file_size", &self.desc.file_size)
     143               0 :             .field("lsn", &self.lsn)
     144               0 :             .field("inner", &self.inner)
     145               0 :             .finish()
     146               0 :     }
     147                 : }
     148                 : 
     149                 : pub struct ImageLayerInner {
     150                 :     // values copied from summary
     151                 :     index_start_blk: u32,
     152                 :     index_root_blk: u32,
     153                 : 
     154                 :     lsn: Lsn,
     155                 : 
     156                 :     /// Reader object for reading blocks from the file.
     157                 :     file: FileBlockReader,
     158                 : }
     159                 : 
     160                 : impl std::fmt::Debug for ImageLayerInner {
     161               0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     162               0 :         f.debug_struct("ImageLayerInner")
     163               0 :             .field("index_start_blk", &self.index_start_blk)
     164               0 :             .field("index_root_blk", &self.index_root_blk)
     165               0 :             .finish()
     166               0 :     }
     167                 : }
     168                 : 
     169                 : #[async_trait::async_trait]
     170                 : impl Layer for ImageLayer {
     171                 :     /// Look up given page in the file
     172 CBC      430880 :     async fn get_value_reconstruct_data(
     173          430880 :         &self,
     174          430880 :         key: Key,
     175          430880 :         lsn_range: Range<Lsn>,
     176          430880 :         reconstruct_state: &mut ValueReconstructState,
     177          430880 :         ctx: &RequestContext,
     178          430880 :     ) -> anyhow::Result<ValueReconstructResult> {
     179          430880 :         self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
     180            7814 :             .await
     181          861760 :     }
     182                 : }
     183                 : 
     184                 : /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
     185                 : impl std::fmt::Display for ImageLayer {
     186             380 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     187             380 :         write!(f, "{}", self.layer_desc().short_id())
     188             380 :     }
     189                 : }
     190                 : 
     191                 : impl AsLayerDesc for ImageLayer {
     192           19163 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     193           19163 :         &self.desc
     194           19163 :     }
     195                 : }
     196                 : 
     197                 : impl PersistentLayer for ImageLayer {
     198             384 :     fn local_path(&self) -> Option<Utf8PathBuf> {
     199             384 :         self.local_path()
     200             384 :     }
     201                 : 
     202             490 :     fn delete_resident_layer_file(&self) -> Result<()> {
     203             490 :         self.delete_resident_layer_file()
     204             490 :     }
     205                 : 
     206               8 :     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     207               8 :         self.info(reset)
     208               8 :     }
     209                 : 
     210             381 :     fn access_stats(&self) -> &LayerAccessStats {
     211             381 :         self.access_stats()
     212             381 :     }
     213                 : }
     214                 : 
     215                 : impl ImageLayer {
     216 UBC           0 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
     217               0 :         println!(
     218               0 :             "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
     219               0 :             self.desc.tenant_id,
     220               0 :             self.desc.timeline_id,
     221               0 :             self.desc.key_range.start,
     222               0 :             self.desc.key_range.end,
     223               0 :             self.lsn,
     224               0 :             self.desc.is_incremental(),
     225               0 :             self.desc.file_size
     226               0 :         );
     227               0 : 
     228               0 :         if !verbose {
     229               0 :             return Ok(());
     230               0 :         }
     231                 : 
     232               0 :         let inner = self.load(LayerAccessKind::Dump, ctx).await?;
     233               0 :         let file = &inner.file;
     234               0 :         let tree_reader =
     235               0 :             DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
     236               0 : 
     237               0 :         tree_reader.dump().await?;
     238                 : 
     239               0 :         tree_reader
     240               0 :             .visit(
     241               0 :                 &[0u8; KEY_SIZE],
     242               0 :                 VisitDirection::Forwards,
     243               0 :                 |key, value| {
     244               0 :                     println!("key: {} offset {}", hex::encode(key), value);
     245               0 :                     true
     246               0 :                 },
     247               0 :                 ctx,
     248               0 :             )
     249               0 :             .await?;
     250                 : 
     251               0 :         Ok(())
     252               0 :     }
     253                 : 
     254 CBC      430880 :     pub(crate) async fn get_value_reconstruct_data(
     255          430880 :         &self,
     256          430880 :         key: Key,
     257          430880 :         lsn_range: Range<Lsn>,
     258          430880 :         reconstruct_state: &mut ValueReconstructState,
     259          430880 :         ctx: &RequestContext,
     260          430880 :     ) -> anyhow::Result<ValueReconstructResult> {
     261          430880 :         assert!(self.desc.key_range.contains(&key));
     262          430880 :         assert!(lsn_range.start >= self.lsn);
     263          430880 :         assert!(lsn_range.end >= self.lsn);
     264                 : 
     265          430880 :         let inner = self
     266          430880 :             .load(LayerAccessKind::GetValueReconstructData, ctx)
     267              46 :             .await?;
     268          430880 :         inner
     269          430880 :             .get_value_reconstruct_data(key, reconstruct_state, ctx)
     270            7768 :             .await
     271                 :             // FIXME: makes no sense to dump paths
     272          430880 :             .with_context(|| format!("read {}", self.path()))
     273          430880 :     }
     274                 : 
     275             384 :     pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
     276             384 :         Some(self.path())
     277             384 :     }
     278                 : 
     279                 :     pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
     280                 :         // delete underlying file
     281             490 :         fs::remove_file(self.path())?;
     282             490 :         Ok(())
     283             490 :     }
     284                 : 
     285               8 :     pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     286               8 :         let layer_file_name = self.layer_desc().filename().file_name();
     287               8 :         let lsn_start = self.layer_desc().image_layer_lsn();
     288               8 : 
     289               8 :         HistoricLayerInfo::Image {
     290               8 :             layer_file_name,
     291               8 :             layer_file_size: self.desc.file_size,
     292               8 :             lsn_start,
     293               8 :             remote: false,
     294               8 :             access_stats: self.access_stats.as_api_model(reset),
     295               8 :         }
     296               8 :     }
     297                 : 
     298            3783 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
     299            3783 :         &self.access_stats
     300            3783 :     }
     301                 : 
     302           11680 :     fn path_for(
     303           11680 :         path_or_conf: &PathOrConf,
     304           11680 :         timeline_id: TimelineId,
     305           11680 :         tenant_id: TenantId,
     306           11680 :         fname: &ImageFileName,
     307           11680 :     ) -> Utf8PathBuf {
     308           11680 :         match path_or_conf {
     309 UBC           0 :             PathOrConf::Path(path) => path.to_path_buf(),
     310 CBC       11680 :             PathOrConf::Conf(conf) => conf
     311           11680 :                 .timeline_path(&tenant_id, &timeline_id)
     312           11680 :                 .join(fname.to_string()),
     313                 :         }
     314           11680 :     }
     315                 : 
     316            3406 :     fn temp_path_for(
     317            3406 :         conf: &PageServerConf,
     318            3406 :         timeline_id: TimelineId,
     319            3406 :         tenant_id: TenantId,
     320            3406 :         fname: &ImageFileName,
     321            3406 :     ) -> Utf8PathBuf {
     322            3406 :         let rand_string: String = rand::thread_rng()
     323            3406 :             .sample_iter(&Alphanumeric)
     324            3406 :             .take(8)
     325            3406 :             .map(char::from)
     326            3406 :             .collect();
     327            3406 : 
     328            3406 :         conf.timeline_path(&tenant_id, &timeline_id)
     329            3406 :             .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
     330            3406 :     }
     331                 : 
     332                 :     ///
     333                 :     /// Open the underlying file and read the metadata into memory, if it's
     334                 :     /// not loaded already.
     335                 :     ///
     336          430880 :     async fn load(
     337          430880 :         &self,
     338          430880 :         access_kind: LayerAccessKind,
     339          430880 :         ctx: &RequestContext,
     340          430880 :     ) -> Result<&ImageLayerInner> {
     341          430880 :         self.access_stats.record_access(access_kind, ctx);
     342          430880 :         self.inner
     343          430880 :             .get_or_try_init(|| self.load_inner(ctx))
     344              46 :             .await
     345          430880 :             .with_context(|| format!("Failed to load image layer {}", self.path()))
     346          430880 :     }
     347                 : 
     348            4002 :     async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
     349            4002 :         let path = self.path();
     350                 : 
     351            4002 :         let expected_summary = match &self.path_or_conf {
     352            4002 :             PathOrConf::Conf(_) => Some(Summary::from(self)),
     353 UBC           0 :             PathOrConf::Path(_) => None,
     354                 :         };
     355                 : 
     356 CBC        4002 :         let loaded =
     357            4002 :             ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
     358              14 :                 .await?;
     359                 : 
     360            4002 :         if let PathOrConf::Path(ref path) = self.path_or_conf {
     361                 :             // not production code
     362 UBC           0 :             let actual_filename = path.file_name().unwrap().to_owned();
     363               0 :             let expected_filename = self.filename().file_name();
     364               0 : 
     365               0 :             if actual_filename != expected_filename {
     366               0 :                 println!("warning: filename does not match what is expected from in-file summary");
     367               0 :                 println!("actual: {:?}", actual_filename);
     368               0 :                 println!("expected: {:?}", expected_filename);
     369               0 :             }
     370 CBC        4002 :         }
     371                 : 
     372            4002 :         Ok(loaded)
     373            4002 :     }
     374                 : 
     375                 :     /// Create an ImageLayer struct representing an existing file on disk
     376            3026 :     pub fn new(
     377            3026 :         conf: &'static PageServerConf,
     378            3026 :         timeline_id: TimelineId,
     379            3026 :         tenant_id: TenantId,
     380            3026 :         filename: &ImageFileName,
     381            3026 :         file_size: u64,
     382            3026 :         access_stats: LayerAccessStats,
     383            3026 :     ) -> ImageLayer {
     384            3026 :         ImageLayer {
     385            3026 :             path_or_conf: PathOrConf::Conf(conf),
     386            3026 :             desc: PersistentLayerDesc::new_img(
     387            3026 :                 tenant_id,
     388            3026 :                 timeline_id,
     389            3026 :                 filename.key_range.clone(),
     390            3026 :                 filename.lsn,
     391            3026 :                 file_size,
     392            3026 :             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
     393            3026 :             lsn: filename.lsn,
     394            3026 :             access_stats,
     395            3026 :             inner: OnceCell::new(),
     396            3026 :         }
     397            3026 :     }
     398                 : 
     399                 :     /// Create an ImageLayer struct representing an existing file on disk.
     400                 :     ///
     401                 :     /// This variant is only used for debugging purposes, by the 'pagectl' binary.
     402 UBC           0 :     pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
     403               0 :         let mut summary_buf = vec![0; PAGE_SZ];
     404               0 :         file.read_exact_at(&mut summary_buf, 0)?;
     405               0 :         let summary = Summary::des_prefix(&summary_buf)?;
     406               0 :         let metadata = file
     407               0 :             .metadata()
     408               0 :             .context("get file metadata to determine size")?;
     409               0 :         Ok(ImageLayer {
     410               0 :             path_or_conf: PathOrConf::Path(path.to_path_buf()),
     411               0 :             desc: PersistentLayerDesc::new_img(
     412               0 :                 summary.tenant_id,
     413               0 :                 summary.timeline_id,
     414               0 :                 summary.key_range,
     415               0 :                 summary.lsn,
     416               0 :                 metadata.len(),
     417               0 :             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
     418               0 :             lsn: summary.lsn,
     419               0 :             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
     420               0 :             inner: OnceCell::new(),
     421               0 :         })
     422               0 :     }
     423                 : 
     424 CBC        8278 :     fn layer_name(&self) -> ImageFileName {
     425            8278 :         self.desc.image_file_name()
     426            8278 :     }
     427                 : 
     428                 :     /// Path to the layer file in pageserver workdir.
     429            8278 :     pub fn path(&self) -> Utf8PathBuf {
     430            8278 :         Self::path_for(
     431            8278 :             &self.path_or_conf,
     432            8278 :             self.desc.timeline_id,
     433            8278 :             self.desc.tenant_id,
     434            8278 :             &self.layer_name(),
     435            8278 :         )
     436            8278 :     }
     437                 : }
     438                 : 
     439                 : impl ImageLayerInner {
     440            4002 :     pub(super) async fn load(
     441            4002 :         path: &Utf8Path,
     442            4002 :         lsn: Lsn,
     443            4002 :         summary: Option<Summary>,
     444            4002 :         ctx: &RequestContext,
     445            4002 :     ) -> anyhow::Result<Self> {
     446            4002 :         let file = VirtualFile::open(path)
     447 UBC           0 :             .await
     448 CBC        4002 :             .with_context(|| format!("Failed to open file '{}'", path))?;
     449            4002 :         let file = FileBlockReader::new(file);
     450            4002 :         let summary_blk = file.read_blk(0, ctx).await?;
     451            4002 :         let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
     452                 : 
     453            4002 :         if let Some(mut expected_summary) = summary {
     454                 :             // production code path
     455            4002 :             expected_summary.index_start_blk = actual_summary.index_start_blk;
     456            4002 :             expected_summary.index_root_blk = actual_summary.index_root_blk;
     457            4002 : 
     458            4002 :             if actual_summary != expected_summary {
     459 UBC           0 :                 bail!(
     460               0 :                     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
     461               0 :                     actual_summary,
     462               0 :                     expected_summary
     463               0 :                 );
     464 CBC        4002 :             }
     465 UBC           0 :         }
     466                 : 
     467 CBC        4002 :         Ok(ImageLayerInner {
     468            4002 :             index_start_blk: actual_summary.index_start_blk,
     469            4002 :             index_root_blk: actual_summary.index_root_blk,
     470            4002 :             lsn,
     471            4002 :             file,
     472            4002 :         })
     473            4002 :     }
     474                 : 
     475          430880 :     pub(super) async fn get_value_reconstruct_data(
     476          430880 :         &self,
     477          430880 :         key: Key,
     478          430880 :         reconstruct_state: &mut ValueReconstructState,
     479          430880 :         ctx: &RequestContext,
     480          430880 :     ) -> anyhow::Result<ValueReconstructResult> {
     481          430880 :         let file = &self.file;
     482          430880 :         let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
     483          430880 : 
     484          430880 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     485          430880 :         key.write_to_byte_slice(&mut keybuf);
     486          430880 :         if let Some(offset) = tree_reader
     487          430880 :             .get(
     488          430880 :                 &keybuf,
     489          430880 :                 &RequestContextBuilder::extend(ctx)
     490          430880 :                     .page_content_kind(PageContentKind::ImageLayerBtreeNode)
     491          430880 :                     .build(),
     492          430880 :             )
     493            3710 :             .await?
     494                 :         {
     495          430878 :             let blob = file
     496          430878 :                 .block_cursor()
     497          430878 :                 .read_blob(
     498          430878 :                     offset,
     499          430878 :                     &RequestContextBuilder::extend(ctx)
     500          430878 :                         .page_content_kind(PageContentKind::ImageLayerValue)
     501          430878 :                         .build(),
     502          430878 :                 )
     503            4058 :                 .await
     504          430878 :                 .with_context(|| format!("failed to read value from offset {}", offset))?;
     505          430878 :             let value = Bytes::from(blob);
     506          430878 : 
     507          430878 :             reconstruct_state.img = Some((self.lsn, value));
     508          430878 :             Ok(ValueReconstructResult::Complete)
     509                 :         } else {
     510               2 :             Ok(ValueReconstructResult::Missing)
     511                 :         }
     512          430880 :     }
     513                 : }
     514                 : 
     515                 : /// Internal state for building a new image layer file.
     516                 : ///
     517                 : /// Usage (normally driven through the `ImageLayerWriter` wrapper):
     518                 : ///
     519                 : /// 1. Create the writer by calling `new`, which creates the temporary file.
     520                 : ///
     521                 : /// 2. Write the contents by calling `put_image` for every key-value
     522                 : ///    pair in the key range.
     523                 : ///
     524                 : /// 3. Call `finish`, which writes the index and the summary, syncs the
     525                 : ///    file and renames it to its final name.
     526                 : struct ImageLayerWriterInner {
     527                 :     conf: &'static PageServerConf,
     528                 :     path: Utf8PathBuf, // temporary path the file is created under; renamed in `finish`
     529                 :     timeline_id: TimelineId,
     530                 :     tenant_id: TenantId,
     531                 :     key_range: Range<Key>,
     532                 :     lsn: Lsn,
     533                 : 
     534                 :     blob_writer: BlobWriter<false>, // appends the image blobs, starting after the header page
     535                 :     tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>, // index from serialized key to blob offset
     536                 : }
     537                 : 
     538                 : impl ImageLayerWriterInner {
     539                 :     /// Start building a new image layer.
     540                 :     ///
     541                 :     /// Creates the temporary file (failing if it already exists) and sets up the blob writer and index builder.
     542            3406 :     async fn new(
     543            3406 :         conf: &'static PageServerConf,
     544            3406 :         timeline_id: TimelineId,
     545            3406 :         tenant_id: TenantId,
     546            3406 :         key_range: &Range<Key>,
     547            3406 :         lsn: Lsn,
     548            3406 :     ) -> anyhow::Result<Self> {
     549            3406 :         // Create the file initially with a temporary filename.
     550            3406 :         // We'll atomically rename it to the final name when we're done.
     551            3406 :         let path = ImageLayer::temp_path_for(
     552            3406 :             conf,
     553            3406 :             timeline_id,
     554            3406 :             tenant_id,
     555            3406 :             &ImageFileName {
     556            3406 :                 key_range: key_range.clone(),
     557            3406 :                 lsn,
     558            3406 :             },
     559            3406 :         );
     560            3406 :         info!("new image layer {path}");
     561            3406 :         let mut file = VirtualFile::open_with_options(
     562            3406 :             &path,
     563            3406 :             std::fs::OpenOptions::new().write(true).create_new(true),
     564            3406 :         )
     565 UBC           0 :         .await?;
     566                 :         // Make room for the header block: skip the first page, where `finish` later writes the summary.
     567 CBC        3406 :         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
     568            3406 :         let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
     569            3406 : 
     570            3406 :         // Initialize the b-tree index builder
     571            3406 :         let block_buf = BlockBuf::new();
     572            3406 :         let tree_builder = DiskBtreeBuilder::new(block_buf);
     573            3406 : 
     574            3406 :         let writer = Self {
     575            3406 :             conf,
     576            3406 :             path,
     577            3406 :             timeline_id,
     578            3406 :             tenant_id,
     579            3406 :             key_range: key_range.clone(),
     580            3406 :             lsn,
     581            3406 :             tree: tree_builder,
     582            3406 :             blob_writer,
     583            3406 :         };
     584            3406 : 
     585            3406 :         Ok(writer)
     586            3406 :     }
     587                 : 
     588                 :     /// Write the next image to the file and record its offset in the index.
     589                 :     ///
     590                 :     /// Fails if `key` is outside the layer's key range.
     591                 :     /// Images must be appended in order (originally described as "blknum order";
     592                 :     /// presumably ascending key order for the index builder - confirm).
     593          224122 :     async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
     594          224122 :         ensure!(self.key_range.contains(&key));
     595          224122 :         let off = self.blob_writer.write_blob(img).await?;
     596                 : 
     597          224122 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     598          224122 :         key.write_to_byte_slice(&mut keybuf);
     599          224122 :         self.tree.append(&keybuf, off)?;
     600                 : 
     601          224122 :         Ok(())
     602          224122 :     }
     603                 : 
     604                 :     /// Finish writing the image layer: write the index and the summary,
     605                 :     /// fsync the file and rename it from its temporary to its final name.
     606                 :     /// NOTE(review): on an early `?` return, nothing in this function removes the temporary file.
     607            3402 :     async fn finish(self) -> anyhow::Result<ImageLayer> {
     608            3402 :         let index_start_blk =
     609            3402 :             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
     610            3402 : 
     611            3402 :         let mut file = self.blob_writer.into_inner();
     612            3402 : 
     613            3402 :         // Write out the index
     614            3402 :         file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
     615 UBC           0 :             .await?;
     616 CBC        3402 :         let (index_root_blk, block_buf) = self.tree.finish()?;
     617            6953 :         for buf in block_buf.blocks {
     618            3551 :             file.write_all(buf.as_ref()).await?;
     619                 :         }
     620                 : 
     621                 :         // Fill in the summary on blk 0
     622            3402 :         let summary = Summary {
     623            3402 :             magic: IMAGE_FILE_MAGIC,
     624            3402 :             format_version: STORAGE_FORMAT_VERSION,
     625            3402 :             tenant_id: self.tenant_id,
     626            3402 :             timeline_id: self.timeline_id,
     627            3402 :             key_range: self.key_range.clone(),
     628            3402 :             lsn: self.lsn,
     629            3402 :             index_start_blk,
     630            3402 :             index_root_blk,
     631            3402 :         };
     632            3402 : 
     633            3402 :         let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
     634            3402 :         Summary::ser_into(&summary, &mut buf)?;
     635            3402 :         if buf.spilled() {
     636                 :             // This is bad as we only have one free block for the summary. NOTE(review): we only warn; the oversized buffer is still written below.
     637 UBC           0 :             warn!(
     638               0 :                 "Used more than one page size for summary buffer: {}",
     639               0 :                 buf.len()
     640               0 :             );
     641 CBC        3402 :         }
     642            3402 :         file.seek(SeekFrom::Start(0)).await?;
     643            3402 :         file.write_all(&buf).await?;
     644                 : 
     645            3402 :         let metadata = file
     646            3402 :             .metadata()
     647 UBC           0 :             .await
     648 CBC        3402 :             .context("get metadata to determine file size")?;
     649                 : 
     650            3402 :         let desc = PersistentLayerDesc::new_img(
     651            3402 :             self.tenant_id,
     652            3402 :             self.timeline_id,
     653            3402 :             self.key_range.clone(),
     654            3402 :             self.lsn,
     655            3402 :             metadata.len(),
     656            3402 :         );
     657            3402 : 
     658            3402 :         // Note: Because we open the file in write-only mode, we cannot
     659            3402 :         // reuse the same VirtualFile for reading later. That's why we don't
     660            3402 :         // set inner.file here. The first read will have to re-open it.
     661            3402 :         let layer = ImageLayer {
     662            3402 :             path_or_conf: PathOrConf::Conf(self.conf),
     663            3402 :             desc,
     664            3402 :             lsn: self.lsn,
     665            3402 :             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
     666            3402 :             inner: OnceCell::new(),
     667            3402 :         };
     668            3402 : 
     669            3402 :         // fsync the file
     670            3402 :         file.sync_all().await?;
     671                 : 
     672                 :         // Rename the file to its final name
     673                 :         // NOTE(review): `std::fs::rename` is a blocking call made from an async fn.
     674                 :         // Note: This overwrites any existing file. There shouldn't be any.
     675                 :         // FIXME: throw an error instead?
     676            3402 :         let final_path = ImageLayer::path_for(
     677            3402 :             &PathOrConf::Conf(self.conf),
     678            3402 :             self.timeline_id,
     679            3402 :             self.tenant_id,
     680            3402 :             &ImageFileName {
     681            3402 :                 key_range: self.key_range.clone(),
     682            3402 :                 lsn: self.lsn,
     683            3402 :             },
     684            3402 :         );
     685            3402 :         std::fs::rename(self.path, final_path)?;
     686                 : 
     687 UBC           0 :         trace!("created image layer {}", layer.path());
     688                 : 
     689 CBC        3402 :         Ok(layer)
     690            3402 :     }
     691                 : }
     692                 : 
     693                 : /// A builder object for constructing a new image layer.
     694                 : ///
     695                 : /// Usage:
     696                 : ///
     697                 : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
     698                 : ///
     699                 : /// 2. Write the contents by calling `put_image` for every key-value
     700                 : ///    pair in the key range.
     701                 : ///
     702                 : /// 3. Call `finish`.
     703                 : ///
     704                 : /// # Note
     705                 : ///
     706                 : /// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
     707                 : /// possible for the writer to be dropped before `finish` is called. That
     708                 : /// would leave stray temporary files in the directory and could fill up the
     709                 : /// file system. This structure wraps `ImageLayerWriterInner` and has a `Drop`
     710                 : /// implementation that cleans up the temporary file in that case. It's not
     711                 : /// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
     712                 : /// out some fields, making it impossible to implement `Drop`.
     713                 : ///
     714                 : #[must_use]
     715                 : pub struct ImageLayerWriter {
     716                 :     inner: Option<ImageLayerWriterInner>, // `None` only after `finish` or `drop` has taken it
     717                 : }
     718                 : 
     719                 : impl ImageLayerWriter {
     720                 :     /// Start building a new image layer.
     721                 :     ///
     722                 :     /// Delegates to `ImageLayerWriterInner::new` and wraps the result; any error from it is propagated.
     723            3406 :     pub async fn new(
     724            3406 :         conf: &'static PageServerConf,
     725            3406 :         timeline_id: TimelineId,
     726            3406 :         tenant_id: TenantId,
     727            3406 :         key_range: &Range<Key>,
     728            3406 :         lsn: Lsn,
     729            3406 :     ) -> anyhow::Result<ImageLayerWriter> {
     730                 :         Ok(Self {
     731                 :             inner: Some(
     732            3406 :                 ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
     733                 :             ),
     734                 :         })
     735            3406 :     }
     736                 : 
     737                 :     /// Write the next image to the file by delegating to the inner writer.
     738                 :     ///
     739                 :     /// Fails if `key` is outside the layer's key range.
     740                 :     /// Images must be appended in order (originally described as "blknum order";
     741                 :     /// presumably ascending key order - confirm). The `unwrap` cannot fail: `inner` is only taken by `finish`/`drop`.
     742          224122 :     pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
     743          224122 :         self.inner.as_mut().unwrap().put_image(key, img).await
     744          224122 :     }
     745                 : 
     746                 :     /// Finish writing the image layer, consuming the writer.
     747                 :     ///
     748                 :     /// NOTE(review): `inner` is taken before the inner `finish` runs, so `Drop` does no cleanup if finishing fails.
     749            3402 :     pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
     750            3402 :         self.inner.take().unwrap().finish().await
     751            3402 :     }
     752                 : }
     753                 : 
     754                 : impl Drop for ImageLayerWriter { // cleanup for writers dropped without a call to `finish`
     755                 :     fn drop(&mut self) {
     756            3402 :         if let Some(inner) = self.inner.take() { // `None` once `finish` has taken the inner writer
     757 UBC           0 :             inner.blob_writer.into_inner().remove(); // presumably deletes the temporary file, per the type-level docs - confirm
     758 CBC        3402 :         }
     759            3402 :     }
     760                 : }
        

Generated by: LCOV version 2.1-beta