LCOV - differential code coverage report
Current view: top level - pageserver/src/tenant/storage_layer - image_layer.rs (source / functions) Coverage Total Hit UBC CBC
Current: cd44433dd675caa99df17a61b18949c8387e2242.info Lines: 56.3 % 387 218 169 218
Current Date: 2024-01-09 02:06:09 Functions: 36.4 % 66 24 42 24
Baseline: 66c52a629a0f4a503e193045e0df4c77139e344b.info
Baseline Date: 2024-01-08 15:34:46

           TLA  Line data    Source code
       1                 : //! An ImageLayer represents an image or a snapshot of a key-range at
       2                 : //! one particular LSN. It contains an image of all key-value pairs
       3                 : //! in its key-range. Any key that falls into the image layer's range
       4                 : //! but does not exist in the layer, does not exist.
       5                 : //!
       6                 : //! An image layer is stored in a file on disk. The file is stored in
       7                 : //! timelines/<timeline_id> directory.  Currently, there are no
       8                 : //! subdirectories, and each image layer file is named like this:
       9                 : //!
      10                 : //! ```text
      11                 : //!    <key start>-<key end>__<LSN>
      12                 : //! ```
      13                 : //!
      14                 : //! For example:
      15                 : //!
      16                 : //! ```text
      17                 : //!    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
      18                 : //! ```
      19                 : //!
      20                 : //! Every image layer file consists of three parts: "summary",
      21                 : //! "index", and "values".  The summary is a fixed size header at the
      22                 : //! beginning of the file, and it contains basic information about the
      23                 : //! layer, and offsets to the other parts. The "index" is a B-tree,
      24                 : //! mapping from Key to an offset in the "values" part.  The
      25                 : //! actual page images are stored in the "values" part.
      26                 : use crate::config::PageServerConf;
      27                 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
      28                 : use crate::page_cache::PAGE_SZ;
      29                 : use crate::repository::{Key, KEY_SIZE};
      30                 : use crate::tenant::blob_io::BlobWriter;
      31                 : use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
      32                 : use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
      33                 : use crate::tenant::storage_layer::{
      34                 :     LayerAccessStats, ValueReconstructResult, ValueReconstructState,
      35                 : };
      36                 : use crate::tenant::Timeline;
      37                 : use crate::virtual_file::VirtualFile;
      38                 : use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
      39                 : use anyhow::{bail, ensure, Context, Result};
      40                 : use bytes::Bytes;
      41                 : use camino::{Utf8Path, Utf8PathBuf};
      42                 : use hex;
      43                 : use pageserver_api::models::LayerAccessKind;
      44                 : use pageserver_api::shard::TenantShardId;
      45                 : use rand::{distributions::Alphanumeric, Rng};
      46                 : use serde::{Deserialize, Serialize};
      47                 : use std::fs::File;
      48                 : use std::io::SeekFrom;
      49                 : use std::ops::Range;
      50                 : use std::os::unix::prelude::FileExt;
      51                 : use std::sync::Arc;
      52                 : use tokio::sync::OnceCell;
      53                 : use tracing::*;
      54                 : 
      55                 : use utils::{
      56                 :     bin_ser::BeSer,
      57                 :     id::{TenantId, TimelineId},
      58                 :     lsn::Lsn,
      59                 : };
      60                 : 
      61                 : use super::filename::ImageFileName;
      62                 : use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
      63                 : 
      64                 : ///
      65                 : /// Header stored in the beginning of the file
      66                 : ///
      67                 : /// After this comes the 'values' part, starting on block 1. After that,
      68                 : /// the 'index' starts at the block indicated by 'index_start_blk'
      69                 : ///
      70 CBC       22231 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
      71                 : pub struct Summary {
      72                 :     /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
      73                 :     pub magic: u16,
      74                 :     pub format_version: u16,
      75                 : 
      76                 :     pub tenant_id: TenantId,
      77                 :     pub timeline_id: TimelineId,
      78                 :     pub key_range: Range<Key>,
      79                 :     pub lsn: Lsn,
      80                 : 
      81                 :     /// Block number where the 'index' part of the file begins.
      82                 :     pub index_start_blk: u32,
      83                 :     /// Block within the 'index', where the B-tree root page is stored
      84                 :     pub index_root_blk: u32,
      85                 :     // the 'values' part starts after the summary header, on block 1.
      86                 : }
      87                 : 
      88                 : impl From<&ImageLayer> for Summary {
      89 UBC           0 :     fn from(layer: &ImageLayer) -> Self {
      90               0 :         Self::expected(
      91               0 :             layer.desc.tenant_shard_id.tenant_id,
      92               0 :             layer.desc.timeline_id,
      93               0 :             layer.desc.key_range.clone(),
      94               0 :             layer.lsn,
      95               0 :         )
      96               0 :     }
      97                 : }
      98                 : 
      99                 : impl Summary {
     100 CBC       22231 :     pub(super) fn expected(
     101           22231 :         tenant_id: TenantId,
     102           22231 :         timeline_id: TimelineId,
     103           22231 :         key_range: Range<Key>,
     104           22231 :         lsn: Lsn,
     105           22231 :     ) -> Self {
     106           22231 :         Self {
     107           22231 :             magic: IMAGE_FILE_MAGIC,
     108           22231 :             format_version: STORAGE_FORMAT_VERSION,
     109           22231 :             tenant_id,
     110           22231 :             timeline_id,
     111           22231 :             key_range,
     112           22231 :             lsn,
     113           22231 : 
     114           22231 :             index_start_blk: 0,
     115           22231 :             index_root_blk: 0,
     116           22231 :         }
     117           22231 :     }
     118                 : }
     119                 : 
     120                 : /// This is used only from `pagectl`. Within pageserver, all layers are
     121                 : /// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
     122                 : pub struct ImageLayer {
     123                 :     path: Utf8PathBuf,
     124                 :     pub desc: PersistentLayerDesc,
     125                 :     // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
     126                 :     pub lsn: Lsn,
     127                 :     access_stats: LayerAccessStats,
     128                 :     inner: OnceCell<ImageLayerInner>,
     129                 : }
     130                 : 
     131                 : impl std::fmt::Debug for ImageLayer {
     132 UBC           0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     133               0 :         use super::RangeDisplayDebug;
     134               0 : 
     135               0 :         f.debug_struct("ImageLayer")
     136               0 :             .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
     137               0 :             .field("file_size", &self.desc.file_size)
     138               0 :             .field("lsn", &self.lsn)
     139               0 :             .field("inner", &self.inner)
     140               0 :             .finish()
     141               0 :     }
     142                 : }
     143                 : 
     144                 : /// ImageLayer is the in-memory data structure associated with an on-disk image
     145                 : /// file.
     146                 : pub struct ImageLayerInner {
     147                 :     // values copied from summary
     148                 :     index_start_blk: u32,
     149                 :     index_root_blk: u32,
     150                 : 
     151                 :     lsn: Lsn,
     152                 : 
     153                 :     /// Reader object for reading blocks from the file.
     154                 :     file: FileBlockReader,
     155                 : }
     156                 : 
     157                 : impl std::fmt::Debug for ImageLayerInner {
     158               0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     159               0 :         f.debug_struct("ImageLayerInner")
     160               0 :             .field("index_start_blk", &self.index_start_blk)
     161               0 :             .field("index_root_blk", &self.index_root_blk)
     162               0 :             .finish()
     163               0 :     }
     164                 : }
     165                 : 
     166                 : impl ImageLayerInner {
     167               0 :     pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
     168               0 :         let file = &self.file;
     169               0 :         let tree_reader =
     170               0 :             DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
     171               0 : 
     172               0 :         tree_reader.dump().await?;
     173                 : 
     174               0 :         tree_reader
     175               0 :             .visit(
     176               0 :                 &[0u8; KEY_SIZE],
     177               0 :                 VisitDirection::Forwards,
     178               0 :                 |key, value| {
     179               0 :                     println!("key: {} offset {}", hex::encode(key), value);
     180               0 :                     true
     181               0 :                 },
     182               0 :                 ctx,
     183               0 :             )
     184               0 :             .await?;
     185                 : 
     186               0 :         Ok(())
     187               0 :     }
     188                 : }
     189                 : 
     190                 : /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
     191                 : impl std::fmt::Display for ImageLayer {
     192               0 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     193               0 :         write!(f, "{}", self.layer_desc().short_id())
     194               0 :     }
     195                 : }
     196                 : 
     197                 : impl AsLayerDesc for ImageLayer {
     198               0 :     fn layer_desc(&self) -> &PersistentLayerDesc {
     199               0 :         &self.desc
     200               0 :     }
     201                 : }
     202                 : 
     203                 : impl ImageLayer {
     204               0 :     pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
     205               0 :         self.desc.dump();
     206               0 : 
     207               0 :         if !verbose {
     208               0 :             return Ok(());
     209               0 :         }
     210                 : 
     211               0 :         let inner = self.load(LayerAccessKind::Dump, ctx).await?;
     212                 : 
     213               0 :         inner.dump(ctx).await?;
     214                 : 
     215               0 :         Ok(())
     216               0 :     }
     217                 : 
     218 CBC        5643 :     fn temp_path_for(
     219            5643 :         conf: &PageServerConf,
     220            5643 :         timeline_id: TimelineId,
     221            5643 :         tenant_shard_id: TenantShardId,
     222            5643 :         fname: &ImageFileName,
     223            5643 :     ) -> Utf8PathBuf {
     224            5643 :         let rand_string: String = rand::thread_rng()
     225            5643 :             .sample_iter(&Alphanumeric)
     226            5643 :             .take(8)
     227            5643 :             .map(char::from)
     228            5643 :             .collect();
     229            5643 : 
     230            5643 :         conf.timeline_path(&tenant_shard_id, &timeline_id)
     231            5643 :             .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
     232            5643 :     }
     233                 : 
     234                 :     ///
     235                 :     /// Open the underlying file and read the metadata into memory, if it's
     236                 :     /// not loaded already.
     237                 :     ///
     238 UBC           0 :     async fn load(
     239               0 :         &self,
     240               0 :         access_kind: LayerAccessKind,
     241               0 :         ctx: &RequestContext,
     242               0 :     ) -> Result<&ImageLayerInner> {
     243               0 :         self.access_stats.record_access(access_kind, ctx);
     244               0 :         self.inner
     245               0 :             .get_or_try_init(|| self.load_inner(ctx))
     246               0 :             .await
     247               0 :             .with_context(|| format!("Failed to load image layer {}", self.path()))
     248               0 :     }
     249                 : 
     250               0 :     async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
     251               0 :         let path = self.path();
     252                 : 
     253               0 :         let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
     254               0 :             .await
     255               0 :             .and_then(|res| res)?;
     256                 : 
     257                 :         // not production code
     258               0 :         let actual_filename = path.file_name().unwrap().to_owned();
     259               0 :         let expected_filename = self.layer_desc().filename().file_name();
     260               0 : 
     261               0 :         if actual_filename != expected_filename {
     262               0 :             println!("warning: filename does not match what is expected from in-file summary");
     263               0 :             println!("actual: {:?}", actual_filename);
     264               0 :             println!("expected: {:?}", expected_filename);
     265               0 :         }
     266                 : 
     267               0 :         Ok(loaded)
     268               0 :     }
     269                 : 
     270                 :     /// Create an ImageLayer struct representing an existing file on disk.
     271                 :     ///
     272                 :     /// This variant is only used for debugging purposes, by the 'pagectl' binary.
     273               0 :     pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
     274               0 :         let mut summary_buf = vec![0; PAGE_SZ];
     275               0 :         file.read_exact_at(&mut summary_buf, 0)?;
     276               0 :         let summary = Summary::des_prefix(&summary_buf)?;
     277               0 :         let metadata = file
     278               0 :             .metadata()
     279               0 :             .context("get file metadata to determine size")?;
     280                 : 
     281                 :         // This function is never used for constructing layers in a running pageserver,
     282                 :         // so it does not need an accurate TenantShardId.
     283               0 :         let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
     284               0 : 
     285               0 :         Ok(ImageLayer {
     286               0 :             path: path.to_path_buf(),
     287               0 :             desc: PersistentLayerDesc::new_img(
     288               0 :                 tenant_shard_id,
     289               0 :                 summary.timeline_id,
     290               0 :                 summary.key_range,
     291               0 :                 summary.lsn,
     292               0 :                 metadata.len(),
     293               0 :             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
     294               0 :             lsn: summary.lsn,
     295               0 :             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
     296               0 :             inner: OnceCell::new(),
     297               0 :         })
     298               0 :     }
     299                 : 
     300               0 :     fn path(&self) -> Utf8PathBuf {
     301               0 :         self.path.clone()
     302               0 :     }
     303                 : }
     304                 : 
     305               0 : #[derive(thiserror::Error, Debug)]
     306                 : pub enum RewriteSummaryError {
     307                 :     #[error("magic mismatch")]
     308                 :     MagicMismatch,
     309                 :     #[error(transparent)]
     310                 :     Other(#[from] anyhow::Error),
     311                 : }
     312                 : 
     313                 : impl From<std::io::Error> for RewriteSummaryError {
     314               0 :     fn from(e: std::io::Error) -> Self {
     315               0 :         Self::Other(anyhow::anyhow!(e))
     316               0 :     }
     317                 : }
     318                 : 
     319                 : impl ImageLayer {
     320               0 :     pub async fn rewrite_summary<F>(
     321               0 :         path: &Utf8Path,
     322               0 :         rewrite: F,
     323               0 :         ctx: &RequestContext,
     324               0 :     ) -> Result<(), RewriteSummaryError>
     325               0 :     where
     326               0 :         F: Fn(Summary) -> Summary,
     327               0 :     {
     328               0 :         let file = VirtualFile::open_with_options(
     329               0 :             path,
     330               0 :             &*std::fs::OpenOptions::new().read(true).write(true),
     331               0 :         )
     332               0 :         .await
     333               0 :         .with_context(|| format!("Failed to open file '{}'", path))?;
     334               0 :         let file = FileBlockReader::new(file);
     335               0 :         let summary_blk = file.read_blk(0, ctx).await?;
     336               0 :         let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
     337               0 :         let mut file = file.file;
     338               0 :         if actual_summary.magic != IMAGE_FILE_MAGIC {
     339               0 :             return Err(RewriteSummaryError::MagicMismatch);
     340               0 :         }
     341               0 : 
     342               0 :         let new_summary = rewrite(actual_summary);
     343               0 : 
     344               0 :         let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
     345               0 :         Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
     346               0 :         if buf.spilled() {
     347                 :             // The code in ImageLayerWriterInner just warn!()s for this.
     348                 :             // It should probably error out as well.
     349               0 :             return Err(RewriteSummaryError::Other(anyhow::anyhow!(
     350               0 :                 "Used more than one page size for summary buffer: {}",
     351               0 :                 buf.len()
     352               0 :             )));
     353               0 :         }
     354               0 :         file.seek(SeekFrom::Start(0)).await?;
     355               0 :         file.write_all(&buf).await?;
     356               0 :         Ok(())
     357               0 :     }
     358                 : }
     359                 : 
     360                 : impl ImageLayerInner {
     361                 :     /// Returns nested result following Result<Result<_, OpErr>, Critical>:
     362                 :     /// - inner has the success or transient failure
     363                 :     /// - outer has the permanent failure
     364 CBC       22231 :     pub(super) async fn load(
     365           22231 :         path: &Utf8Path,
     366           22231 :         lsn: Lsn,
     367           22231 :         summary: Option<Summary>,
     368           22231 :         ctx: &RequestContext,
     369           22231 :     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
     370           22231 :         let file = match VirtualFile::open(path).await {
     371           22231 :             Ok(file) => file,
     372 UBC           0 :             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
     373                 :         };
     374 CBC       22231 :         let file = FileBlockReader::new(file);
     375           22231 :         let summary_blk = match file.read_blk(0, ctx).await {
     376           22231 :             Ok(blk) => blk,
     377 UBC           0 :             Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
     378                 :         };
     379                 : 
     380                 :         // length is the only way how this could fail, so it's not actually likely at all unless
     381                 :         // read_blk returns wrong sized block.
     382                 :         //
     383                 :         // TODO: confirm and make this into assertion
     384 CBC       22231 :         let actual_summary =
     385           22231 :             Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
     386                 : 
     387           22231 :         if let Some(mut expected_summary) = summary {
     388                 :             // production code path
     389           22231 :             expected_summary.index_start_blk = actual_summary.index_start_blk;
     390           22231 :             expected_summary.index_root_blk = actual_summary.index_root_blk;
     391           22231 : 
     392           22231 :             if actual_summary != expected_summary {
     393 UBC           0 :                 bail!(
     394               0 :                     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
     395               0 :                     actual_summary,
     396               0 :                     expected_summary
     397               0 :                 );
     398 CBC       22231 :             }
     399 UBC           0 :         }
     400                 : 
     401 CBC       22231 :         Ok(Ok(ImageLayerInner {
     402           22231 :             index_start_blk: actual_summary.index_start_blk,
     403           22231 :             index_root_blk: actual_summary.index_root_blk,
     404           22231 :             lsn,
     405           22231 :             file,
     406           22231 :         }))
     407           22231 :     }
     408                 : 
     409          452100 :     pub(super) async fn get_value_reconstruct_data(
     410          452100 :         &self,
     411          452100 :         key: Key,
     412          452100 :         reconstruct_state: &mut ValueReconstructState,
     413          452100 :         ctx: &RequestContext,
     414          452100 :     ) -> anyhow::Result<ValueReconstructResult> {
     415          452100 :         let file = &self.file;
     416          452100 :         let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
     417          452100 : 
     418          452100 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     419          452100 :         key.write_to_byte_slice(&mut keybuf);
     420          452100 :         if let Some(offset) = tree_reader
     421          452100 :             .get(
     422          452100 :                 &keybuf,
     423          452100 :                 &RequestContextBuilder::extend(ctx)
     424          452100 :                     .page_content_kind(PageContentKind::ImageLayerBtreeNode)
     425          452100 :                     .build(),
     426          452100 :             )
     427            6110 :             .await?
     428                 :         {
     429          452098 :             let blob = file
     430          452098 :                 .block_cursor()
     431          452098 :                 .read_blob(
     432          452098 :                     offset,
     433          452098 :                     &RequestContextBuilder::extend(ctx)
     434          452098 :                         .page_content_kind(PageContentKind::ImageLayerValue)
     435          452098 :                         .build(),
     436          452098 :                 )
     437            4643 :                 .await
     438          452098 :                 .with_context(|| format!("failed to read value from offset {}", offset))?;
     439          452098 :             let value = Bytes::from(blob);
     440          452098 : 
     441          452098 :             reconstruct_state.img = Some((self.lsn, value));
     442          452098 :             Ok(ValueReconstructResult::Complete)
     443                 :         } else {
     444               2 :             Ok(ValueReconstructResult::Missing)
     445                 :         }
     446          452100 :     }
     447                 : }
     448                 : 
     449                 : /// A builder object for constructing a new image layer.
     450                 : ///
     451                 : /// Usage:
     452                 : ///
     453                 : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
     454                 : ///
     455                 : /// 2. Write the contents by calling `put_page_image` for every key-value
     456                 : ///    pair in the key range.
     457                 : ///
     458                 : /// 3. Call `finish`.
     459                 : ///
     460                 : struct ImageLayerWriterInner {
     461                 :     conf: &'static PageServerConf,
     462                 :     path: Utf8PathBuf,
     463                 :     timeline_id: TimelineId,
     464                 :     tenant_shard_id: TenantShardId,
     465                 :     key_range: Range<Key>,
     466                 :     lsn: Lsn,
     467                 : 
     468                 :     blob_writer: BlobWriter<false>,
     469                 :     tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
     470                 : }
     471                 : 
     472                 : impl ImageLayerWriterInner {
     473                 :     ///
     474                 :     /// Start building a new image layer.
     475                 :     ///
     476            5643 :     async fn new(
     477            5643 :         conf: &'static PageServerConf,
     478            5643 :         timeline_id: TimelineId,
     479            5643 :         tenant_shard_id: TenantShardId,
     480            5643 :         key_range: &Range<Key>,
     481            5643 :         lsn: Lsn,
     482            5643 :     ) -> anyhow::Result<Self> {
     483            5643 :         // Create the file initially with a temporary filename.
     484            5643 :         // We'll atomically rename it to the final name when we're done.
     485            5643 :         let path = ImageLayer::temp_path_for(
     486            5643 :             conf,
     487            5643 :             timeline_id,
     488            5643 :             tenant_shard_id,
     489            5643 :             &ImageFileName {
     490            5643 :                 key_range: key_range.clone(),
     491            5643 :                 lsn,
     492            5643 :             },
     493            5643 :         );
     494            5643 :         info!("new image layer {path}");
     495            5643 :         let mut file = VirtualFile::open_with_options(
     496            5643 :             &path,
     497            5643 :             std::fs::OpenOptions::new().write(true).create_new(true),
     498            5643 :         )
     499 UBC           0 :         .await?;
     500                 :         // make room for the header block
     501 CBC        5643 :         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
     502            5643 :         let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
     503            5643 : 
     504            5643 :         // Initialize the b-tree index builder
     505            5643 :         let block_buf = BlockBuf::new();
     506            5643 :         let tree_builder = DiskBtreeBuilder::new(block_buf);
     507            5643 : 
     508            5643 :         let writer = Self {
     509            5643 :             conf,
     510            5643 :             path,
     511            5643 :             timeline_id,
     512            5643 :             tenant_shard_id,
     513            5643 :             key_range: key_range.clone(),
     514            5643 :             lsn,
     515            5643 :             tree: tree_builder,
     516            5643 :             blob_writer,
     517            5643 :         };
     518            5643 : 
     519            5643 :         Ok(writer)
     520            5643 :     }
     521                 : 
     522                 :     /// Write the next image to the file and record it in the b-tree index.
     523                 :     ///
     524                 :     /// Fails if `key` lies outside `self.key_range`, or if the blob write or the
     525                 :     /// index append fails. Images must be appended in order (the original note
     526                 :     /// says "blknum order"; presumably ascending key order -- TODO confirm).
     527          226127 :     async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
     528          226127 :         ensure!(self.key_range.contains(&key)); // reject keys outside this layer's range
     529          226127 :         let off = self.blob_writer.write_blob(img).await?; // value returned for this blob (presumably its file offset)
     530                 :         // Encode the key into a fixed KEY_SIZE-byte buffer and index it with `off` as the value.
     531          226127 :         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
     532          226127 :         key.write_to_byte_slice(&mut keybuf);
     533          226127 :         self.tree.append(&keybuf, off)?;
     534                 : 
     535          226127 :         Ok(())
     536          226127 :     }
     537                 : 
     538                 :     /// Finish writing the image layer: append the b-tree index after the values,
     539                 :     /// write the summary into block 0, fsync the file, and pass `self.path` to
     540                 :     /// `Layer::finish_creating`, returning the layer it produces.
     541            5639 :     async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
     542            5639 :         let index_start_blk =
     543            5639 :             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; // ceil(size / PAGE_SZ): the index starts on a page boundary
     544            5639 :         // Take the file back from the blob writer so the index and summary can be written directly.
     545            5639 :         let mut file = self.blob_writer.into_inner();
     546            5639 : 
     547            5639 :         // Write out the index, starting at the page boundary computed above
     548            5639 :         file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
     549 UBC           0 :             .await?;
     550 CBC        5639 :         let (index_root_blk, block_buf) = self.tree.finish()?; // index root block number plus the buffered index blocks
     551           11469 :         for buf in block_buf.blocks {
     552            5830 :             file.write_all(buf.as_ref()).await?;
     553                 :         }
     554                 : 
     555                 :         // Fill in the summary on blk 0
     556            5639 :         let summary = Summary {
     557            5639 :             magic: IMAGE_FILE_MAGIC,
     558            5639 :             format_version: STORAGE_FORMAT_VERSION,
     559            5639 :             tenant_id: self.tenant_shard_id.tenant_id,
     560            5639 :             timeline_id: self.timeline_id,
     561            5639 :             key_range: self.key_range.clone(),
     562            5639 :             lsn: self.lsn,
     563            5639 :             index_start_blk,
     564            5639 :             index_root_blk,
     565            5639 :         };
     566            5639 :         // Serialize into a buffer whose inline capacity is exactly one page.
     567            5639 :         let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
     568            5639 :         Summary::ser_into(&summary, &mut buf)?;
     569            5639 :         if buf.spilled() { // spilled => the summary needed more than PAGE_SZ bytes
     570                 :             // This is bad as we only have one free block for the summary
     571 UBC           0 :             warn!(
     572               0 :                 "Used more than one page size for summary buffer: {}",
     573               0 :                 buf.len()
     574               0 :             );
     575 CBC        5639 :         }
     576            5639 :         file.seek(SeekFrom::Start(0)).await?;
     577            5639 :         file.write_all(&buf).await?; // NOTE(review): writes all of `buf`, even if it spilled past one page
     578                 :         // The descriptor's file size is read back from the file's metadata.
     579            5639 :         let metadata = file
     580            5639 :             .metadata()
     581 UBC           0 :             .await
     582 CBC        5639 :             .context("get metadata to determine file size")?;
     583                 : 
     584            5639 :         let desc = PersistentLayerDesc::new_img(
     585            5639 :             self.tenant_shard_id,
     586            5639 :             self.timeline_id,
     587            5639 :             self.key_range.clone(),
     588            5639 :             self.lsn,
     589            5639 :             metadata.len(),
     590            5639 :         );
     591            5639 : 
     592            5639 :         // Note: Because we open the file in write-only mode, we cannot
     593            5639 :         // reuse the same VirtualFile for reading later. That's why we don't
     594            5639 :         // set inner.file here. The first read will have to re-open it.
     595            5639 : 
     596            5639 :         // fsync the file
     597            5639 :         file.sync_all().await?;
     598                 : 
     599                 :         // FIXME: why not carry the virtualfile here, it supports renaming?
     600            5639 :         let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
     601                 : 
     602 UBC           0 :         trace!("created image layer {}", layer.local_path());
     603                 : 
     604 CBC        5639 :         Ok(layer)
     605            5639 :     }
     606                 : }
     607                 : 
     608                 : /// A builder object for constructing a new image layer.
     609                 : ///
     610                 : /// Usage:
     611                 : ///
     612                 : /// 1. Create the ImageLayerWriter by calling `ImageLayerWriter::new(...)`.
     613                 : ///
     614                 : /// 2. Write the contents by calling `put_image` for every key-value
     615                 : ///    pair in the key range.
     616                 : ///
     617                 : /// 3. Call `finish`.
     618                 : ///
     619                 : /// # Note
     620                 : ///
     621                 : /// As described in <https://github.com/neondatabase/neon/issues/2650>, the
     622                 : /// writer can be dropped before `finish` is called, which would leave stray
     623                 : /// temporary files in the directory and could eventually exhaust the file system.
     624                 : /// This structure wraps `ImageLayerWriterInner` and adds a `Drop`
     625                 : /// implementation that removes the temporary file on failure. That cannot be
     626                 : /// done directly in `ImageLayerWriterInner`, since `finish` moves some fields
     627                 : /// out of it, which makes it impossible to implement `Drop` there.
     628                 : ///
     629                 : #[must_use]
     630                 : pub struct ImageLayerWriter {
     631                 :     inner: Option<ImageLayerWriterInner>, // `None` once `finish` has taken the inner writer; `Drop` cleans up only while it is `Some`
     632                 : }
     633                 : 
     634                 : impl ImageLayerWriter {
     635                 :     /// Start building a new image layer for `key_range` at `lsn`.
     636                 :     ///
     637                 :     /// Delegates to `ImageLayerWriterInner::new` and propagates its error, if any.
     638            5643 :     pub async fn new(
     639            5643 :         conf: &'static PageServerConf,
     640            5643 :         timeline_id: TimelineId,
     641            5643 :         tenant_shard_id: TenantShardId,
     642            5643 :         key_range: &Range<Key>,
     643            5643 :         lsn: Lsn,
     644            5643 :     ) -> anyhow::Result<ImageLayerWriter> {
     645            5643 :         Ok(Self {
     646            5643 :             inner: Some(
     647            5643 :                 ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
     648 UBC           0 :                     .await?,
     649                 :             ),
     650                 :         })
     651 CBC        5643 :     }
     652                 : 
     653                 :     /// Write the next image to the file; forwards to `ImageLayerWriterInner::put_image`.
     654                 :     ///
     655                 :     /// Images must be appended in order (the original note says "blknum order";
     656                 :     /// presumably ascending key order -- TODO confirm). Panics if the inner writer
     657                 :     /// has already been taken, which only `finish` and `drop` do.
     658          226127 :     pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
     659          226127 :         self.inner.as_mut().unwrap().put_image(key, img).await
     660          226127 :     }
     661                 : 
     662                 :     /// Finish writing the image layer and return the resulting layer.
     663                 :     /// Takes the inner writer out of `self`, so `Drop` finds `None` and removes nothing.
     664                 :     /// NOTE(review): that includes the case where the inner `finish` fails -- confirm cleanup happens elsewhere.
     665            5639 :     pub(crate) async fn finish(
     666            5639 :         mut self,
     667            5639 :         timeline: &Arc<Timeline>,
     668            5639 :     ) -> anyhow::Result<super::ResidentLayer> {
     669            5639 :         self.inner.take().unwrap().finish(timeline).await
     670            5639 :     }
     671                 : }
     672                 : 
     673                 : impl Drop for ImageLayerWriter { // cleanup for writers that are dropped without `finish` having run
     674            5639 :     fn drop(&mut self) {
     675            5639 :         if let Some(inner) = self.inner.take() { // still `Some` only if `finish` never took the inner writer
     676 UBC           0 :             inner.blob_writer.into_inner().remove(); // remove the file being written (presumably deletes it from disk -- TODO confirm)
     677 CBC        5639 :         }
     678            5639 :     }
     679                 : }
        

Generated by: LCOV version 2.1-beta