LCOV - code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - image_layer.rs (source / functions) Coverage Total Hit
Test: 8ac049b474321fdc72ddcb56d7165153a1a900e8.info Lines: 79.5 % 443 352
Test Date: 2023-09-06 10:18:01 Functions: 68.1 % 69 47

            Line data    Source code
       1              : //! An ImageLayer represents an image or a snapshot of a key-range at
       2              : //! one particular LSN. It contains an image of all key-value pairs
       3              : //! in its key-range. Any key that falls into the image layer's range
       4              : //! but does not exist in the layer, does not exist.
       5              : //!
       6              : //! An image layer is stored in a file on disk. The file is stored in
       7              : //! timelines/<timeline_id> directory.  Currently, there are no
       8              : //! subdirectories, and each image layer file is named like this:
       9              : //!
      10              : //! ```text
      11              : //!    <key start>-<key end>__<LSN>
      12              : //! ```
      13              : //!
      14              : //! For example:
      15              : //!
      16              : //! ```text
      17              : //!    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
      18              : //! ```
      19              : //!
      20              : //! Every image layer file consists of three parts: "summary",
      21              : //! "index", and "values".  The summary is a fixed size header at the
      22              : //! beginning of the file, and it contains basic information about the
      23              : //! layer, and offsets to the other parts. The "index" is a B-tree,
      24              : //! mapping from Key to an offset in the "values" part.  The
      25              : //! actual page images are stored in the "values" part.
      26              : use crate::config::PageServerConf;
      27              : use crate::context::RequestContext;
      28              : use crate::page_cache::PAGE_SZ;
      29              : use crate::repository::{Key, KEY_SIZE};
      30              : use crate::tenant::blob_io::WriteBlobWriter;
      31              : use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
      32              : use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
      33              : use crate::tenant::storage_layer::{
      34              :     LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
      35              : };
      36              : use crate::virtual_file::VirtualFile;
      37              : use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
      38              : use anyhow::{bail, ensure, Context, Result};
      39              : use bytes::Bytes;
      40              : use hex;
      41              : use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
      42              : use rand::{distributions::Alphanumeric, Rng};
      43              : use serde::{Deserialize, Serialize};
      44              : use std::fs::{self, File};
      45              : use std::io::SeekFrom;
      46              : use std::io::Write;
      47              : use std::ops::Range;
      48              : use std::os::unix::prelude::FileExt;
      49              : use std::path::{Path, PathBuf};
      50              : use tokio::sync::OnceCell;
      51              : use tracing::*;
      52              : 
      53              : use utils::{
      54              :     bin_ser::BeSer,
      55              :     id::{TenantId, TimelineId},
      56              :     lsn::Lsn,
      57              : };
      58              : 
      59              : use super::filename::ImageFileName;
      60              : use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
      61              : 
      62              : ///
      63              : /// Header stored in the beginning of the file
      64              : ///
      65              : /// After this comes the 'values' part, starting on block 1. After that,
      66              : /// the 'index' starts at the block indicated by 'index_start_blk'
      67              : ///
                        /// The summary is read back from block 0 when a layer is loaded and, on the
                        /// production code path, compared against the summary expected for that layer.
      68         1562 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
      69              : pub(super) struct Summary {
      70              :     /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
      71              :     magic: u16,
                            /// Storage format version; `expected` sets it to STORAGE_FORMAT_VERSION.
      72              :     format_version: u16,
      73              : 
                            // Identity of the layer: owning tenant and timeline, the key range it
                            // covers and the LSN of the image.
      74              :     tenant_id: TenantId,
      75              :     timeline_id: TimelineId,
      76              :     key_range: Range<Key>,
      77              :     lsn: Lsn,
      78              : 
      79              :     /// Block number where the 'index' part of the file begins.
      80              :     index_start_blk: u32,
      81              :     /// Block within the 'index', where the B-tree root page is stored
      82              :     index_root_blk: u32,
      83              :     // the 'values' part starts after the summary header, on block 1.
      84              : }
      85              : 
      86              : impl From<&ImageLayer> for Summary {
                            /// Builds the summary that the file backing `layer` is expected to carry,
                            /// using the tenant, timeline and key range from the layer descriptor and
                            /// the layer's LSN. The index block fields are left at the placeholder
                            /// value 0 by `Summary::expected`.
      87         1562 :     fn from(layer: &ImageLayer) -> Self {
      88         1562 :         Self::expected(
      89         1562 :             layer.desc.tenant_id,
      90         1562 :             layer.desc.timeline_id,
      91         1562 :             layer.desc.key_range.clone(),
      92         1562 :             layer.lsn,
      93         1562 :         )
      94         1562 :     }
      95              : }
      96              : 
      97              : impl Summary {
                            /// Constructs the summary expected for a layer with the given identity.
                            ///
                            /// `magic` and `format_version` are filled in from the current constants.
                            /// `index_start_blk` and `index_root_blk` are set to 0 here; they are not
                            /// known up front, and `ImageLayerInner::load` overwrites them with the
                            /// values found in the file before comparing.
      98         1562 :     pub(super) fn expected(
      99         1562 :         tenant_id: TenantId,
     100         1562 :         timeline_id: TimelineId,
     101         1562 :         key_range: Range<Key>,
     102         1562 :         lsn: Lsn,
     103         1562 :     ) -> Self {
     104         1562 :         Self {
     105         1562 :             magic: IMAGE_FILE_MAGIC,
     106         1562 :             format_version: STORAGE_FORMAT_VERSION,
     107         1562 :             tenant_id,
     108         1562 :             timeline_id,
     109         1562 :             key_range,
     110         1562 :             lsn,
     111         1562 : 
     112         1562 :             index_start_blk: 0,
     113         1562 :             index_root_blk: 0,
     114         1562 :         }
     115         1562 :     }
     116              : }
     117              : 
     118              : /// ImageLayer is the in-memory data structure associated with an on-disk image
     119              : /// file.
     120              : ///
     121              : /// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
     122              : /// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
     123              : /// Otherwise the struct is just a placeholder for a file that exists on disk,
     124              : /// and it needs to be loaded before using it in queries.
     125              : pub struct ImageLayer {
                            // Either an explicit file path (set by `new_for_path`) or the pageserver
                            // config from which `path_for` derives the path (set by `new`).
     126              :     path_or_conf: PathOrConf,
     127              : 
     128              :     pub desc: PersistentLayerDesc,
     129              :     // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
     130              :     pub lsn: Lsn,
     131              : 
                            // An access is recorded here on every call to `load`.
     132              :     access_stats: LayerAccessStats,
     133              : 
                            // Empty until the first successful `load`; see `load_inner`.
     134              :     inner: OnceCell<ImageLayerInner>,
     135              : }
     136              : 
     137              : impl std::fmt::Debug for ImageLayer {
                            /// Formats the key range, file size, LSN and the (possibly not yet
                            /// initialized) inner state. `path_or_conf`, `access_stats` and the
                            /// remaining descriptor fields are left out.
     138            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     139            0 :         use super::RangeDisplayDebug;
     140            0 : 
     141            0 :         f.debug_struct("ImageLayer")
     142            0 :             .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
     143            0 :             .field("file_size", &self.desc.file_size)
     144            0 :             .field("lsn", &self.lsn)
     145            0 :             .field("inner", &self.inner)
     146            0 :             .finish()
     147            0 :     }
     148              : }
     149              : 
                        /// State that becomes available once the layer file has been opened and its
                        /// summary block has been read; constructed by `ImageLayerInner::load`.
     150              : pub struct ImageLayerInner {
     151              :     // values copied from summary
     152              :     index_start_blk: u32,
     153              :     index_root_blk: u32,
     154              : 
                            // LSN handed to `load`; attached to every image this layer returns.
     155              :     lsn: Lsn,
     156              : 
     157              :     /// Reader object for reading blocks from the file.
     158              :     file: FileBlockReader,
     159              : }
     160              : 
     161              : impl std::fmt::Debug for ImageLayerInner {
                            /// Shows only the two index block numbers; `lsn` and `file` are omitted.
     162            0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     163            0 :         f.debug_struct("ImageLayerInner")
     164            0 :             .field("index_start_blk", &self.index_start_blk)
     165            0 :             .field("index_root_blk", &self.index_root_blk)
     166            0 :             .finish()
     167            0 :     }
     168              : }
     169              : 
     170              : #[async_trait::async_trait]
     171              : impl Layer for ImageLayer {
     172              :     /// Look up given page in the file
                            ///
                            /// Trait adapter that forwards all arguments unchanged to the inherent
                            /// `ImageLayer::get_value_reconstruct_data`.
     173       658916 :     async fn get_value_reconstruct_data(
     174       658916 :         &self,
     175       658916 :         key: Key,
     176       658916 :         lsn_range: Range<Lsn>,
     177       658916 :         reconstruct_state: &mut ValueReconstructState,
     178       658916 :         ctx: &RequestContext,
     179       658917 :     ) -> anyhow::Result<ValueReconstructResult> {
                                // Not a recursive call: method resolution prefers the inherent method
                                // of the same name on `ImageLayer` over this trait method.
     180       658917 :         self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
     181         5545 :             .await
     182      1317834 :     }
     183              : }
     184              : 
     185              : /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
     186              : impl std::fmt::Display for ImageLayer {
                            /// Writes the short id of the layer descriptor.
     187            5 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     188            5 :         write!(f, "{}", self.layer_desc().short_id())
     189            5 :     }
     190              : }
     191              : 
     192              : impl AsLayerDesc for ImageLayer {
                            /// Returns a reference to the descriptor stored in `desc`.
     193         5022 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     194         5022 :         &self.desc
     195         5022 :     }
     196              : }
     197              : 
     198              : impl PersistentLayer for ImageLayer {
                            // Every method below forwards to the inherent `ImageLayer` method of the
                            // same name. The calls are not recursive: method resolution prefers
                            // inherent methods over trait methods.
     199            9 :     fn local_path(&self) -> Option<PathBuf> {
     200            9 :         self.local_path()
     201            9 :     }
     202              : 
     203           78 :     fn delete_resident_layer_file(&self) -> Result<()> {
     204           78 :         self.delete_resident_layer_file()
     205           78 :     }
     206              : 
     207            8 :     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     208            8 :         self.info(reset)
     209            8 :     }
     210              : 
     211            6 :     fn access_stats(&self) -> &LayerAccessStats {
     212            6 :         self.access_stats()
     213            6 :     }
     214              : }
     215              : 
     216              : impl ImageLayer {
                            /// Prints a one-line description of the layer to stdout.
                            ///
                            /// With `verbose` set, the layer is additionally loaded, the B-tree index
                            /// is dumped, and every key is printed (hex-encoded) with its offset.
     217            0 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
     218            0 :         println!(
     219            0 :             "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
     220            0 :             self.desc.tenant_id,
     221            0 :             self.desc.timeline_id,
     222            0 :             self.desc.key_range.start,
     223            0 :             self.desc.key_range.end,
     224            0 :             self.lsn,
     225            0 :             self.desc.is_incremental(),
     226            0 :             self.desc.file_size
     227            0 :         );
     228            0 : 
     229            0 :         if !verbose {
     230            0 :             return Ok(());
     231            0 :         }
     232              : 
     233            0 :         let inner = self.load(LayerAccessKind::Dump, ctx).await?;
     234            0 :         let file = &inner.file;
     235            0 :         let tree_reader =
     236            0 :             DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
     237            0 : 
     238            0 :         tree_reader.dump().await?;
     239              : 
                                // Start from the all-zero key and always return `true` from the
                                // callback, so that the visit is never cut short by the callback.
     240            0 :         tree_reader
     241            0 :             .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
     242            0 :                 println!("key: {} offset {}", hex::encode(key), value);
     243            0 :                 true
     244            0 :             })
     245            0 :             .await?;
     246              : 
     247            0 :         Ok(())
     248            0 :     }
     249              : 
                            /// Looks up the image of `key` in this layer.
                            ///
                            /// Loads the layer if necessary and delegates the lookup to
                            /// `ImageLayerInner::get_value_reconstruct_data`. An error from the lookup
                            /// is annotated with the path of the layer file.
                            ///
                            /// # Panics
                            ///
                            /// Panics if `key` is outside the layer's key range, or if either end of
                            /// `lsn_range` is below the layer's LSN.
     250       658916 :     pub(crate) async fn get_value_reconstruct_data(
     251       658916 :         &self,
     252       658916 :         key: Key,
     253       658916 :         lsn_range: Range<Lsn>,
     254       658916 :         reconstruct_state: &mut ValueReconstructState,
     255       658916 :         ctx: &RequestContext,
     256       658917 :     ) -> anyhow::Result<ValueReconstructResult> {
     257       658917 :         assert!(self.desc.key_range.contains(&key));
     258       658917 :         assert!(lsn_range.start >= self.lsn);
     259       658917 :         assert!(lsn_range.end >= self.lsn);
     260              : 
     261       658917 :         let inner = self
     262       658917 :             .load(LayerAccessKind::GetValueReconstructData, ctx)
     263            9 :             .await?;
     264       658917 :         inner
     265       658917 :             .get_value_reconstruct_data(key, reconstruct_state)
     266         5536 :             .await
     267              :             // FIXME: makes no sense to dump paths
     268       658917 :             .with_context(|| format!("read {}", self.path().display()))
     269       658917 :     }
     270              : 
                            /// Returns the path of the layer file; always `Some` for an image layer.
     271            9 :     pub(crate) fn local_path(&self) -> Option<PathBuf> {
     272            9 :         Some(self.path())
     273            9 :     }
     274              : 
                            /// Removes the layer file at `self.path()` from disk, propagating any
                            /// error from `fs::remove_file`. The in-memory struct is left untouched.
     275              :     pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
     276              :         // delete underlying file
     277           78 :         fs::remove_file(self.path())?;
     278           78 :         Ok(())
     279           78 :     }
     280              : 
                            /// Builds the `HistoricLayerInfo::Image` API model for this layer from the
                            /// descriptor and the access statistics. `remote` is always `false` here.
                            /// `reset` is passed on to `LayerAccessStats::as_api_model`.
     281            8 :     pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
     282            8 :         let layer_file_name = self.layer_desc().filename().file_name();
     283            8 :         let lsn_start = self.layer_desc().image_layer_lsn();
     284            8 : 
     285            8 :         HistoricLayerInfo::Image {
     286            8 :             layer_file_name,
     287            8 :             layer_file_size: self.desc.file_size,
     288            8 :             lsn_start,
     289            8 :             remote: false,
     290            8 :             access_stats: self.access_stats.as_api_model(reset),
     291            8 :         }
     292            8 :     }
     293              : 
                            /// Returns the access statistics of this layer.
     294         1118 :     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
     295         1118 :         &self.access_stats
     296         1118 :     }
     297              : 
                            /// Computes the path of a layer file.
                            ///
                            /// With `PathOrConf::Path` the stored path is returned as is and the other
                            /// arguments are ignored. With `PathOrConf::Conf` the path is the timeline
                            /// directory of the given tenant/timeline joined with the name `fname`.
     298         3873 :     fn path_for(
     299         3873 :         path_or_conf: &PathOrConf,
     300         3873 :         timeline_id: TimelineId,
     301         3873 :         tenant_id: TenantId,
     302         3873 :         fname: &ImageFileName,
     303         3873 :     ) -> PathBuf {
     304         3873 :         match path_or_conf {
     305            0 :             PathOrConf::Path(path) => path.to_path_buf(),
     306         3873 :             PathOrConf::Conf(conf) => conf
     307         3873 :                 .timeline_path(&tenant_id, &timeline_id)
     308         3873 :                 .join(fname.to_string()),
     309              :         }
     310         3873 :     }
     311              : 
                            /// Computes a temporary path for a layer file that is being written.
                            ///
                            /// The file name has the form `<fname>.<random>.<TEMP_FILE_SUFFIX>`, where
                            /// `<random>` is 8 random alphanumeric characters, and lives in the
                            /// timeline directory of the given tenant/timeline.
     312         1113 :     fn temp_path_for(
     313         1113 :         conf: &PageServerConf,
     314         1113 :         timeline_id: TimelineId,
     315         1113 :         tenant_id: TenantId,
     316         1113 :         fname: &ImageFileName,
     317         1113 :     ) -> PathBuf {
     318         1113 :         let rand_string: String = rand::thread_rng()
     319         1113 :             .sample_iter(&Alphanumeric)
     320         1113 :             .take(8)
     321         1113 :             .map(char::from)
     322         1113 :             .collect();
     323         1113 : 
     324         1113 :         conf.timeline_path(&tenant_id, &timeline_id)
     325         1113 :             .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
     326         1113 :     }
     327              : 
     328              :     ///
     329              :     /// Open the underlying file and read the metadata into memory, if it's
     330              :     /// not loaded already.
     331              :     ///
                            /// The access is recorded in `access_stats` on every call, whether or not
                            /// the layer was already loaded. A failure is annotated with the path of
                            /// the layer file.
     332       658916 :     async fn load(
     333       658916 :         &self,
     334       658916 :         access_kind: LayerAccessKind,
     335       658916 :         ctx: &RequestContext,
     336       658917 :     ) -> Result<&ImageLayerInner> {
     337       658917 :         self.access_stats.record_access(access_kind, ctx);
                                // `get_or_try_init` runs `load_inner` only while the cell is still
                                // empty; a failed initialization leaves it empty, so a later call
                                // tries again.
     338       658917 :         self.inner
     339       658917 :             .get_or_try_init(|| self.load_inner())
     340            9 :             .await
     341       658917 :             .with_context(|| format!("Failed to load image layer {}", self.path().display()))
     342       658917 :     }
     343              : 
                            /// Opens the layer file and reads its summary, unconditionally.
                            ///
                            /// For a layer created with a config (`PathOrConf::Conf`) the in-file
                            /// summary is verified against `Summary::from(self)`. For a layer created
                            /// from an explicit path no verification is requested; instead a warning
                            /// is printed to stdout if the actual file name differs from the expected
                            /// one.
     344         1562 :     async fn load_inner(&self) -> Result<ImageLayerInner> {
     345         1562 :         let path = self.path();
     346              : 
     347         1562 :         let expected_summary = match &self.path_or_conf {
     348         1562 :             PathOrConf::Conf(_) => Some(Summary::from(self)),
     349            0 :             PathOrConf::Path(_) => None,
     350              :         };
     351              : 
     352         1562 :         let loaded =
     353         1562 :             ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
     354              : 
     355         1562 :         if let PathOrConf::Path(ref path) = self.path_or_conf {
     356              :             // not production code
     357            0 :             let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
     358            0 :             let expected_filename = self.filename().file_name();
     359            0 : 
     360            0 :             if actual_filename != expected_filename {
     361            0 :                 println!("warning: filename does not match what is expected from in-file summary");
     362            0 :                 println!("actual: {:?}", actual_filename);
     363            0 :                 println!("expected: {:?}", expected_filename);
     364            0 :             }
     365         1562 :         }
     366              : 
     367         1562 :         Ok(loaded)
     368         1562 :     }
     369              : 
     370              :     /// Create an ImageLayer struct representing an existing file on disk
                            ///
                            /// The key range and LSN are taken from `filename`. The file itself is not
                            /// opened here: `inner` starts out empty and is filled in by `load`.
     371          743 :     pub fn new(
     372          743 :         conf: &'static PageServerConf,
     373          743 :         timeline_id: TimelineId,
     374          743 :         tenant_id: TenantId,
     375          743 :         filename: &ImageFileName,
     376          743 :         file_size: u64,
     377          743 :         access_stats: LayerAccessStats,
     378          743 :     ) -> ImageLayer {
     379          743 :         ImageLayer {
     380          743 :             path_or_conf: PathOrConf::Conf(conf),
     381          743 :             desc: PersistentLayerDesc::new_img(
     382          743 :                 tenant_id,
     383          743 :                 timeline_id,
     384          743 :                 filename.key_range.clone(),
     385          743 :                 filename.lsn,
     386          743 :                 file_size,
     387          743 :             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
     388          743 :             lsn: filename.lsn,
     389          743 :             access_stats,
     390          743 :             inner: OnceCell::new(),
     391          743 :         }
     392          743 :     }
     393              : 
     394              :     /// Create an ImageLayer struct representing an existing file on disk.
     395              :     ///
     396              :     /// This variant is only used for debugging purposes, by the 'pagectl' binary.
                            ///
                            /// Reads the first PAGE_SZ bytes of `file`, deserializes the summary from
                            /// them, and fills in the descriptor from that summary. The file size is
                            /// taken from the file's metadata. Fails if the read, the deserialization
                            /// or the metadata lookup fails.
     397            0 :     pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
     398            0 :         let mut summary_buf = Vec::new();
     399            0 :         summary_buf.resize(PAGE_SZ, 0);
     400            0 :         file.read_exact_at(&mut summary_buf, 0)?;
     401            0 :         let summary = Summary::des_prefix(&summary_buf)?;
     402            0 :         let metadata = file
     403            0 :             .metadata()
     404            0 :             .context("get file metadata to determine size")?;
     405            0 :         Ok(ImageLayer {
     406            0 :             path_or_conf: PathOrConf::Path(path.to_path_buf()),
     407            0 :             desc: PersistentLayerDesc::new_img(
     408            0 :                 summary.tenant_id,
     409            0 :                 summary.timeline_id,
     410            0 :                 summary.key_range,
     411            0 :                 summary.lsn,
     412            0 :                 metadata.len(),
     413            0 :             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
     414            0 :             lsn: summary.lsn,
     415            0 :             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
     416            0 :             inner: OnceCell::new(),
     417            0 :         })
     418            0 :     }
     419              : 
                            /// Returns the image file name derived from the layer descriptor.
     420         2761 :     fn layer_name(&self) -> ImageFileName {
     421         2761 :         self.desc.image_file_name()
     422         2761 :     }
     423              : 
     424              :     /// Path to the layer file in pageserver workdir.
                            ///
                            /// For a layer created by `new_for_path` this is the explicit path given
                            /// there (see `path_for`).
     425         2761 :     pub fn path(&self) -> PathBuf {
     426         2761 :         Self::path_for(
     427         2761 :             &self.path_or_conf,
     428         2761 :             self.desc.timeline_id,
     429         2761 :             self.desc.tenant_id,
     430         2761 :             &self.layer_name(),
     431         2761 :         )
     432         2761 :     }
     433              : }
     434              : 
     435              : impl ImageLayerInner {
                            /// Opens the file at `path`, reads block 0 and deserializes the summary
                            /// stored there.
                            ///
                            /// If `summary` is `Some`, it is compared against the in-file summary and
                            /// a mismatch is reported as an error. `lsn` is stored as is in the
                            /// returned value; it is not checked against the file when `summary` is
                            /// `None`.
     436         1562 :     pub(super) async fn load(
     437         1562 :         path: &std::path::Path,
     438         1562 :         lsn: Lsn,
     439         1562 :         summary: Option<Summary>,
     440         1562 :     ) -> anyhow::Result<Self> {
     441         1562 :         let file = VirtualFile::open(path)
     442         1562 :             .with_context(|| format!("Failed to open file '{}'", path.display()))?;
     443         1562 :         let file = FileBlockReader::new(file);
     444         1562 :         let summary_blk = file.read_blk(0).await?;
     445         1562 :         let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
     446              : 
     447         1562 :         if let Some(mut expected_summary) = summary {
     448              :             // production code path
                                    // The index block numbers are not known to the caller, so copy
                                    // them from the file; the comparison below then effectively
                                    // covers only the remaining fields.
     449         1562 :             expected_summary.index_start_blk = actual_summary.index_start_blk;
     450         1562 :             expected_summary.index_root_blk = actual_summary.index_root_blk;
     451         1562 : 
     452         1562 :             if actual_summary != expected_summary {
     453            0 :                 bail!(
     454            0 :                     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
     455            0 :                     actual_summary,
     456            0 :                     expected_summary
     457            0 :                 );
     458         1562 :             }
     459            0 :         }
     460              : 
     461         1562 :         Ok(ImageLayerInner {
     462         1562 :             index_start_blk: actual_summary.index_start_blk,
     463         1562 :             index_root_blk: actual_summary.index_root_blk,
     464         1562 :             lsn,
     465         1562 :             file,
     466         1562 :         })
     467         1562 :     }
     468              : 
                            /// Looks up `key` in the B-tree index of the layer file.
                            ///
                            /// If the key is found, the blob at the indexed offset is read, stored in
                            /// `reconstruct_state.img` together with this layer's LSN, and
                            /// `ValueReconstructResult::Complete` is returned. If the key is not in
                            /// the index, `reconstruct_state` is left untouched and
                            /// `ValueReconstructResult::Missing` is returned.
     469       658916 :     pub(super) async fn get_value_reconstruct_data(
     470       658916 :         &self,
     471       658916 :         key: Key,
     472       658916 :         reconstruct_state: &mut ValueReconstructState,
     473       658917 :     ) -> anyhow::Result<ValueReconstructResult> {
     474       658917 :         let file = &self.file;
     475       658917 :         let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
     476       658917 : 
                                // The index is keyed by the fixed-size byte encoding of the key.
     477       658917 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     478       658917 :         key.write_to_byte_slice(&mut keybuf);
     479       658917 :         if let Some(offset) = tree_reader.get(&keybuf).await? {
     480       658915 :             let blob = file
     481       658915 :                 .block_cursor()
     482       658915 :                 .read_blob(offset)
     483         3058 :                 .await
     484       658915 :                 .with_context(|| format!("failed to read value from offset {}", offset))?;
     485       658915 :             let value = Bytes::from(blob);
     486       658915 : 
     487       658915 :             reconstruct_state.img = Some((self.lsn, value));
     488       658915 :             Ok(ValueReconstructResult::Complete)
     489              :         } else {
     490            2 :             Ok(ValueReconstructResult::Missing)
     491              :         }
     492       658917 :     }
     493              : }
     494              : 
     495              : /// A builder object for constructing a new image layer.
     496              : ///
     497              : /// Usage:
     498              : ///
     499              : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
     500              : ///
     501              : /// 2. Write the contents by calling `put_image` for every key-value
     502              : ///    pair in the key range.
     503              : ///
     504              : /// 3. Call `finish`.
     505              : ///
     506              : struct ImageLayerWriterInner {
     507              :     conf: &'static PageServerConf,
                            // Temporary path that the file is created at by `new`.
     508              :     path: PathBuf,
     509              :     timeline_id: TimelineId,
     510              :     tenant_id: TenantId,
                            // `put_image` rejects keys outside this range.
     511              :     key_range: Range<Key>,
     512              :     lsn: Lsn,
     513              : 
                            // Writer for the 'values' part; starts right after the header block.
     514              :     blob_writer: WriteBlobWriter<VirtualFile>,
                            // Builder for the 'index' part, mapping each key to its blob offset.
     515              :     tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
     516              : }
     517              : 
     518              : impl ImageLayerWriterInner {
     519              :     ///
     520              :     /// Start building a new image layer.
     521              :     ///
                            /// Creates a new file under a temporary name in the timeline directory
                            /// and positions the writer after the first block, which is reserved for
                            /// the summary header. Fails if the file cannot be created (it is opened
                            /// with `create_new`, so an already existing file is an error) or if the
                            /// seek fails.
     522         1113 :     async fn new(
     523         1113 :         conf: &'static PageServerConf,
     524         1113 :         timeline_id: TimelineId,
     525         1113 :         tenant_id: TenantId,
     526         1113 :         key_range: &Range<Key>,
     527         1113 :         lsn: Lsn,
     528         1113 :     ) -> anyhow::Result<Self> {
     529         1113 :         // Create the file initially with a temporary filename.
     530         1113 :         // We'll atomically rename it to the final name when we're done.
     531         1113 :         let path = ImageLayer::temp_path_for(
     532         1113 :             conf,
     533         1113 :             timeline_id,
     534         1113 :             tenant_id,
     535         1113 :             &ImageFileName {
     536         1113 :                 key_range: key_range.clone(),
     537         1113 :                 lsn,
     538         1113 :             },
     539         1113 :         );
     540         1113 :         info!("new image layer {}", path.display());
     541         1113 :         let mut file = VirtualFile::open_with_options(
     542         1113 :             &path,
     543         1113 :             std::fs::OpenOptions::new().write(true).create_new(true),
     544         1113 :         )?;
     545              :         // make room for the header block
     546         1113 :         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
                                // The blob writer is given the same start offset that the file was
                                // just positioned at.
     547         1113 :         let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
     548         1113 : 
     549         1113 :         // Initialize the b-tree index builder
     550         1113 :         let block_buf = BlockBuf::new();
     551         1113 :         let tree_builder = DiskBtreeBuilder::new(block_buf);
     552         1113 : 
     553         1113 :         let writer = Self {
     554         1113 :             conf,
     555         1113 :             path,
     556         1113 :             timeline_id,
     557         1113 :             tenant_id,
     558         1113 :             key_range: key_range.clone(),
     559         1113 :             lsn,
     560         1113 :             tree: tree_builder,
     561         1113 :             blob_writer,
     562         1113 :         };
     563         1113 : 
     564         1113 :         Ok(writer)
     565         1113 :     }
     566              : 
     567              :     ///
     568              :     /// Write next value to the file.
     569              :     ///
     570              :     /// The images must be appended in increasing key order.
     571              :     ///
     572       205153 :     async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
     573       205153 :         ensure!(self.key_range.contains(&key));
     574       205153 :         let off = self.blob_writer.write_blob(img).await?;
     575              : 
     576       205153 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     577       205153 :         key.write_to_byte_slice(&mut keybuf);
     578       205153 :         self.tree.append(&keybuf, off)?;
     579              : 
     580       205153 :         Ok(())
     581       205153 :     }
     582              : 
     583              :     ///
     584              :     /// Finish writing the image layer.
     585              :     ///
     586         1112 :     async fn finish(self) -> anyhow::Result<ImageLayer> {
     587         1112 :         let index_start_blk =
     588         1112 :             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
     589         1112 : 
     590         1112 :         let mut file = self.blob_writer.into_inner();
     591         1112 : 
     592         1112 :         // Write out the index
     593         1112 :         file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
     594            0 :             .await?;
     595         1112 :         let (index_root_blk, block_buf) = self.tree.finish()?;
     596         2424 :         for buf in block_buf.blocks {
     597         1312 :             file.write_all(buf.as_ref())?;
     598              :         }
     599              : 
     600              :         // Fill in the summary on blk 0
     601         1112 :         let summary = Summary {
     602         1112 :             magic: IMAGE_FILE_MAGIC,
     603         1112 :             format_version: STORAGE_FORMAT_VERSION,
     604         1112 :             tenant_id: self.tenant_id,
     605         1112 :             timeline_id: self.timeline_id,
     606         1112 :             key_range: self.key_range.clone(),
     607         1112 :             lsn: self.lsn,
     608         1112 :             index_start_blk,
     609         1112 :             index_root_blk,
     610         1112 :         };
     611         1112 : 
     612         1112 :         let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
     613         1112 :         Summary::ser_into(&summary, &mut buf)?;
     614         1112 :         if buf.spilled() {
     615              :             // This is bad as we only have one free block for the summary
     616            0 :             warn!(
     617            0 :                 "Used more than one page size for summary buffer: {}",
     618            0 :                 buf.len()
     619            0 :             );
     620         1112 :         }
     621         1112 :         file.seek(SeekFrom::Start(0)).await?;
     622         1112 :         file.write_all(&buf)?;
     623              : 
     624         1112 :         let metadata = file
     625         1112 :             .metadata()
     626            0 :             .await
     627         1112 :             .context("get metadata to determine file size")?;
     628              : 
     629         1112 :         let desc = PersistentLayerDesc::new_img(
     630         1112 :             self.tenant_id,
     631         1112 :             self.timeline_id,
     632         1112 :             self.key_range.clone(),
     633         1112 :             self.lsn,
     634         1112 :             metadata.len(),
     635         1112 :         );
     636         1112 : 
     637         1112 :         // Note: Because we open the file in write-only mode, we cannot
     638         1112 :         // reuse the same VirtualFile for reading later. That's why we don't
     639         1112 :         // set inner.file here. The first read will have to re-open it.
     640         1112 :         let layer = ImageLayer {
     641         1112 :             path_or_conf: PathOrConf::Conf(self.conf),
     642         1112 :             desc,
     643         1112 :             lsn: self.lsn,
     644         1112 :             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
     645         1112 :             inner: OnceCell::new(),
     646         1112 :         };
     647         1112 : 
     648         1112 :         // fsync the file
     649         1112 :         file.sync_all()?;
     650              : 
     651              :         // Rename the file to its final name
     652              :         //
     653              :         // Note: This overwrites any existing file. There shouldn't be any.
     654              :         // FIXME: throw an error instead?
     655         1112 :         let final_path = ImageLayer::path_for(
     656         1112 :             &PathOrConf::Conf(self.conf),
     657         1112 :             self.timeline_id,
     658         1112 :             self.tenant_id,
     659         1112 :             &ImageFileName {
     660         1112 :                 key_range: self.key_range.clone(),
     661         1112 :                 lsn: self.lsn,
     662         1112 :             },
     663         1112 :         );
     664         1112 :         std::fs::rename(self.path, final_path)?;
     665              : 
     666            0 :         trace!("created image layer {}", layer.path().display());
     667              : 
     668         1112 :         Ok(layer)
     669         1112 :     }
     670              : }
     671              : 
     672              : /// A builder object for constructing a new image layer.
     673              : ///
     674              : /// Usage:
     675              : ///
     676              : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
     677              : ///
     678              : /// 2. Write the contents by calling `put_image` for every key-value
     679              : ///    pair in the key range.
     680              : ///
     681              : /// 3. Call `finish`.
     682              : ///
     683              : /// # Note
     684              : ///
     685              : /// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
     686              : /// possible for the writer to be dropped before `finish` is actually called. This
     687              : /// can leave stray temporary files in the directory, eventually exhausting the file system.
     688              : /// This structure wraps `ImageLayerWriterInner` and also contains a `Drop`
     689              : /// implementation that cleans up the temporary file on failure. It's not
     690              : /// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
     691              : /// out some fields, making it impossible to implement `Drop`.
     692              : ///
     693              : #[must_use]
     694              : pub struct ImageLayerWriter {
     695              :     inner: Option<ImageLayerWriterInner>,
     696              : }
     697              : 
     698              : impl ImageLayerWriter {
     699              :     ///
     700              :     /// Start building a new image layer.
     701              :     ///
     702         1113 :     pub async fn new(
     703         1113 :         conf: &'static PageServerConf,
     704         1113 :         timeline_id: TimelineId,
     705         1113 :         tenant_id: TenantId,
     706         1113 :         key_range: &Range<Key>,
     707         1113 :         lsn: Lsn,
     708         1113 :     ) -> anyhow::Result<ImageLayerWriter> {
     709              :         Ok(Self {
     710              :             inner: Some(
     711         1113 :                 ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
     712              :             ),
     713              :         })
     714         1113 :     }
     715              : 
     716              :     ///
     717              :     /// Write next value to the file.
     718              :     ///
     719              :     /// The images must be appended in increasing key order.
     720              :     ///
     721       205152 :     pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
     722       205153 :         self.inner.as_mut().unwrap().put_image(key, img).await
     723       205153 :     }
     724              : 
     725              :     ///
     726              :     /// Finish writing the image layer.
     727              :     ///
     728         1112 :     pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
     729         1112 :         self.inner.take().unwrap().finish().await
     730         1112 :     }
     731              : }
     732              : 
     733              : impl Drop for ImageLayerWriter {
     734              :     fn drop(&mut self) {
     735         1112 :         if let Some(inner) = self.inner.take() {
     736            0 :             inner.blob_writer.into_inner().remove();
     737         1112 :         }
     738         1112 :     }
     739              : }
        

Generated by: LCOV version 2.1-beta