TLA Line data Source code
1 : //! An ImageLayer represents an image or a snapshot of a key-range at
2 : //! one particular LSN. It contains an image of all key-value pairs
3 : //! in its key-range. Any key that falls into the image layer's range
4 : //! but does not exist in the layer, does not exist.
5 : //!
6 : //! An image layer is stored in a file on disk. The file is stored in
7 : //! timelines/<timeline_id> directory. Currently, there are no
8 : //! subdirectories, and each image layer file is named like this:
9 : //!
10 : //! ```text
11 : //! <key start>-<key end>__<LSN>
12 : //! ```
13 : //!
14 : //! For example:
15 : //!
16 : //! ```text
17 : //! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
18 : //! ```
19 : //!
20 : //! Every image layer file consists of three parts: "summary",
21 : //! "index", and "values". The summary is a fixed size header at the
22 : //! beginning of the file, and it contains basic information about the
23 : //! layer, and offsets to the other parts. The "index" is a B-tree,
24 : //! mapping from Key to an offset in the "values" part. The
25 : //! actual page images are stored in the "values" part.
26 : use crate::config::PageServerConf;
27 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
28 : use crate::page_cache::PAGE_SZ;
29 : use crate::repository::{Key, KEY_SIZE};
30 : use crate::tenant::blob_io::BlobWriter;
31 : use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
32 : use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
33 : use crate::tenant::storage_layer::{
34 : LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
35 : };
36 : use crate::virtual_file::VirtualFile;
37 : use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
38 : use anyhow::{bail, ensure, Context, Result};
39 : use bytes::Bytes;
40 : use camino::{Utf8Path, Utf8PathBuf};
41 : use hex;
42 : use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
43 : use rand::{distributions::Alphanumeric, Rng};
44 : use serde::{Deserialize, Serialize};
45 : use std::fs::{self, File};
46 : use std::io::SeekFrom;
47 : use std::ops::Range;
48 : use std::os::unix::prelude::FileExt;
49 : use tokio::sync::OnceCell;
50 : use tracing::*;
51 :
52 : use utils::{
53 : bin_ser::BeSer,
54 : id::{TenantId, TimelineId},
55 : lsn::Lsn,
56 : };
57 :
58 : use super::filename::ImageFileName;
59 : use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
60 :
61 : ///
62 : /// Header stored in the beginning of the file
63 : ///
64 : /// After this comes the 'values' part, starting on block 1. After that,
65 : /// the 'index' starts at the block indicated by 'index_start_blk'
66 : ///
67 CBC 4002 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
68 : pub(super) struct Summary {
69 : /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
70 : magic: u16,
71 : format_version: u16,
72 :
73 : tenant_id: TenantId,
74 : timeline_id: TimelineId,
75 : key_range: Range<Key>,
76 : lsn: Lsn,
77 :
78 : /// Block number where the 'index' part of the file begins.
79 : index_start_blk: u32,
80 : /// Block within the 'index', where the B-tree root page is stored
81 : index_root_blk: u32,
82 : // the 'values' part starts after the summary header, on block 1.
83 : }
84 :
85 : impl From<&ImageLayer> for Summary {
86 4002 : fn from(layer: &ImageLayer) -> Self {
87 4002 : Self::expected(
88 4002 : layer.desc.tenant_id,
89 4002 : layer.desc.timeline_id,
90 4002 : layer.desc.key_range.clone(),
91 4002 : layer.lsn,
92 4002 : )
93 4002 : }
94 : }
95 :
96 : impl Summary {
97 4002 : pub(super) fn expected(
98 4002 : tenant_id: TenantId,
99 4002 : timeline_id: TimelineId,
100 4002 : key_range: Range<Key>,
101 4002 : lsn: Lsn,
102 4002 : ) -> Self {
103 4002 : Self {
104 4002 : magic: IMAGE_FILE_MAGIC,
105 4002 : format_version: STORAGE_FORMAT_VERSION,
106 4002 : tenant_id,
107 4002 : timeline_id,
108 4002 : key_range,
109 4002 : lsn,
110 4002 :
111 4002 : index_start_blk: 0,
112 4002 : index_root_blk: 0,
113 4002 : }
114 4002 : }
115 : }
116 :
117 : /// ImageLayer is the in-memory data structure associated with an on-disk image
118 : /// file.
119 : ///
120 : /// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
121 : /// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
122 : /// Otherwise the struct is just a placeholder for a file that exists on disk,
123 : /// and it needs to be loaded before using it in queries.
124 : pub struct ImageLayer {
125 : path_or_conf: PathOrConf,
126 :
127 : pub desc: PersistentLayerDesc,
128 : // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
129 : pub lsn: Lsn,
130 :
131 : access_stats: LayerAccessStats,
132 :
133 : inner: OnceCell<ImageLayerInner>,
134 : }
135 :
136 : impl std::fmt::Debug for ImageLayer {
137 UBC 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
138 0 : use super::RangeDisplayDebug;
139 0 :
140 0 : f.debug_struct("ImageLayer")
141 0 : .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
142 0 : .field("file_size", &self.desc.file_size)
143 0 : .field("lsn", &self.lsn)
144 0 : .field("inner", &self.inner)
145 0 : .finish()
146 0 : }
147 : }
148 :
149 : pub struct ImageLayerInner {
150 : // values copied from summary
151 : index_start_blk: u32,
152 : index_root_blk: u32,
153 :
154 : lsn: Lsn,
155 :
156 : /// Reader object for reading blocks from the file.
157 : file: FileBlockReader,
158 : }
159 :
160 : impl std::fmt::Debug for ImageLayerInner {
161 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
162 0 : f.debug_struct("ImageLayerInner")
163 0 : .field("index_start_blk", &self.index_start_blk)
164 0 : .field("index_root_blk", &self.index_root_blk)
165 0 : .finish()
166 0 : }
167 : }
168 :
169 : #[async_trait::async_trait]
170 : impl Layer for ImageLayer {
171 : /// Look up given page in the file
172 CBC 430880 : async fn get_value_reconstruct_data(
173 430880 : &self,
174 430880 : key: Key,
175 430880 : lsn_range: Range<Lsn>,
176 430880 : reconstruct_state: &mut ValueReconstructState,
177 430880 : ctx: &RequestContext,
178 430880 : ) -> anyhow::Result<ValueReconstructResult> {
179 430880 : self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
180 7814 : .await
181 861760 : }
182 : }
183 :
184 : /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
185 : impl std::fmt::Display for ImageLayer {
186 380 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
187 380 : write!(f, "{}", self.layer_desc().short_id())
188 380 : }
189 : }
190 :
191 : impl AsLayerDesc for ImageLayer {
192 19163 : fn layer_desc(&self) -> &PersistentLayerDesc {
193 19163 : &self.desc
194 19163 : }
195 : }
196 :
197 : impl PersistentLayer for ImageLayer {
198 384 : fn local_path(&self) -> Option<Utf8PathBuf> {
199 384 : self.local_path()
200 384 : }
201 :
202 490 : fn delete_resident_layer_file(&self) -> Result<()> {
203 490 : self.delete_resident_layer_file()
204 490 : }
205 :
206 8 : fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
207 8 : self.info(reset)
208 8 : }
209 :
210 381 : fn access_stats(&self) -> &LayerAccessStats {
211 381 : self.access_stats()
212 381 : }
213 : }
214 :
215 : impl ImageLayer {
216 UBC 0 : pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
217 0 : println!(
218 0 : "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
219 0 : self.desc.tenant_id,
220 0 : self.desc.timeline_id,
221 0 : self.desc.key_range.start,
222 0 : self.desc.key_range.end,
223 0 : self.lsn,
224 0 : self.desc.is_incremental(),
225 0 : self.desc.file_size
226 0 : );
227 0 :
228 0 : if !verbose {
229 0 : return Ok(());
230 0 : }
231 :
232 0 : let inner = self.load(LayerAccessKind::Dump, ctx).await?;
233 0 : let file = &inner.file;
234 0 : let tree_reader =
235 0 : DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
236 0 :
237 0 : tree_reader.dump().await?;
238 :
239 0 : tree_reader
240 0 : .visit(
241 0 : &[0u8; KEY_SIZE],
242 0 : VisitDirection::Forwards,
243 0 : |key, value| {
244 0 : println!("key: {} offset {}", hex::encode(key), value);
245 0 : true
246 0 : },
247 0 : ctx,
248 0 : )
249 0 : .await?;
250 :
251 0 : Ok(())
252 0 : }
253 :
254 CBC 430880 : pub(crate) async fn get_value_reconstruct_data(
255 430880 : &self,
256 430880 : key: Key,
257 430880 : lsn_range: Range<Lsn>,
258 430880 : reconstruct_state: &mut ValueReconstructState,
259 430880 : ctx: &RequestContext,
260 430880 : ) -> anyhow::Result<ValueReconstructResult> {
261 430880 : assert!(self.desc.key_range.contains(&key));
262 430880 : assert!(lsn_range.start >= self.lsn);
263 430880 : assert!(lsn_range.end >= self.lsn);
264 :
265 430880 : let inner = self
266 430880 : .load(LayerAccessKind::GetValueReconstructData, ctx)
267 46 : .await?;
268 430880 : inner
269 430880 : .get_value_reconstruct_data(key, reconstruct_state, ctx)
270 7768 : .await
271 : // FIXME: makes no sense to dump paths
272 430880 : .with_context(|| format!("read {}", self.path()))
273 430880 : }
274 :
275 384 : pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
276 384 : Some(self.path())
277 384 : }
278 :
279 : pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
280 : // delete underlying file
281 490 : fs::remove_file(self.path())?;
282 490 : Ok(())
283 490 : }
284 :
285 8 : pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
286 8 : let layer_file_name = self.layer_desc().filename().file_name();
287 8 : let lsn_start = self.layer_desc().image_layer_lsn();
288 8 :
289 8 : HistoricLayerInfo::Image {
290 8 : layer_file_name,
291 8 : layer_file_size: self.desc.file_size,
292 8 : lsn_start,
293 8 : remote: false,
294 8 : access_stats: self.access_stats.as_api_model(reset),
295 8 : }
296 8 : }
297 :
298 3783 : pub(crate) fn access_stats(&self) -> &LayerAccessStats {
299 3783 : &self.access_stats
300 3783 : }
301 :
302 11680 : fn path_for(
303 11680 : path_or_conf: &PathOrConf,
304 11680 : timeline_id: TimelineId,
305 11680 : tenant_id: TenantId,
306 11680 : fname: &ImageFileName,
307 11680 : ) -> Utf8PathBuf {
308 11680 : match path_or_conf {
309 UBC 0 : PathOrConf::Path(path) => path.to_path_buf(),
310 CBC 11680 : PathOrConf::Conf(conf) => conf
311 11680 : .timeline_path(&tenant_id, &timeline_id)
312 11680 : .join(fname.to_string()),
313 : }
314 11680 : }
315 :
316 3406 : fn temp_path_for(
317 3406 : conf: &PageServerConf,
318 3406 : timeline_id: TimelineId,
319 3406 : tenant_id: TenantId,
320 3406 : fname: &ImageFileName,
321 3406 : ) -> Utf8PathBuf {
322 3406 : let rand_string: String = rand::thread_rng()
323 3406 : .sample_iter(&Alphanumeric)
324 3406 : .take(8)
325 3406 : .map(char::from)
326 3406 : .collect();
327 3406 :
328 3406 : conf.timeline_path(&tenant_id, &timeline_id)
329 3406 : .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
330 3406 : }
331 :
332 : ///
333 : /// Open the underlying file and read the metadata into memory, if it's
334 : /// not loaded already.
335 : ///
336 430880 : async fn load(
337 430880 : &self,
338 430880 : access_kind: LayerAccessKind,
339 430880 : ctx: &RequestContext,
340 430880 : ) -> Result<&ImageLayerInner> {
341 430880 : self.access_stats.record_access(access_kind, ctx);
342 430880 : self.inner
343 430880 : .get_or_try_init(|| self.load_inner(ctx))
344 46 : .await
345 430880 : .with_context(|| format!("Failed to load image layer {}", self.path()))
346 430880 : }
347 :
348 4002 : async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
349 4002 : let path = self.path();
350 :
351 4002 : let expected_summary = match &self.path_or_conf {
352 4002 : PathOrConf::Conf(_) => Some(Summary::from(self)),
353 UBC 0 : PathOrConf::Path(_) => None,
354 : };
355 :
356 CBC 4002 : let loaded =
357 4002 : ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
358 14 : .await?;
359 :
360 4002 : if let PathOrConf::Path(ref path) = self.path_or_conf {
361 : // not production code
362 UBC 0 : let actual_filename = path.file_name().unwrap().to_owned();
363 0 : let expected_filename = self.filename().file_name();
364 0 :
365 0 : if actual_filename != expected_filename {
366 0 : println!("warning: filename does not match what is expected from in-file summary");
367 0 : println!("actual: {:?}", actual_filename);
368 0 : println!("expected: {:?}", expected_filename);
369 0 : }
370 CBC 4002 : }
371 :
372 4002 : Ok(loaded)
373 4002 : }
374 :
375 : /// Create an ImageLayer struct representing an existing file on disk
376 3026 : pub fn new(
377 3026 : conf: &'static PageServerConf,
378 3026 : timeline_id: TimelineId,
379 3026 : tenant_id: TenantId,
380 3026 : filename: &ImageFileName,
381 3026 : file_size: u64,
382 3026 : access_stats: LayerAccessStats,
383 3026 : ) -> ImageLayer {
384 3026 : ImageLayer {
385 3026 : path_or_conf: PathOrConf::Conf(conf),
386 3026 : desc: PersistentLayerDesc::new_img(
387 3026 : tenant_id,
388 3026 : timeline_id,
389 3026 : filename.key_range.clone(),
390 3026 : filename.lsn,
391 3026 : file_size,
392 3026 : ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
393 3026 : lsn: filename.lsn,
394 3026 : access_stats,
395 3026 : inner: OnceCell::new(),
396 3026 : }
397 3026 : }
398 :
399 : /// Create an ImageLayer struct representing an existing file on disk.
400 : ///
401 : /// This variant is only used for debugging purposes, by the 'pagectl' binary.
402 UBC 0 : pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
403 0 : let mut summary_buf = vec![0; PAGE_SZ];
404 0 : file.read_exact_at(&mut summary_buf, 0)?;
405 0 : let summary = Summary::des_prefix(&summary_buf)?;
406 0 : let metadata = file
407 0 : .metadata()
408 0 : .context("get file metadata to determine size")?;
409 0 : Ok(ImageLayer {
410 0 : path_or_conf: PathOrConf::Path(path.to_path_buf()),
411 0 : desc: PersistentLayerDesc::new_img(
412 0 : summary.tenant_id,
413 0 : summary.timeline_id,
414 0 : summary.key_range,
415 0 : summary.lsn,
416 0 : metadata.len(),
417 0 : ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
418 0 : lsn: summary.lsn,
419 0 : access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
420 0 : inner: OnceCell::new(),
421 0 : })
422 0 : }
423 :
424 CBC 8278 : fn layer_name(&self) -> ImageFileName {
425 8278 : self.desc.image_file_name()
426 8278 : }
427 :
428 : /// Path to the layer file in pageserver workdir.
429 8278 : pub fn path(&self) -> Utf8PathBuf {
430 8278 : Self::path_for(
431 8278 : &self.path_or_conf,
432 8278 : self.desc.timeline_id,
433 8278 : self.desc.tenant_id,
434 8278 : &self.layer_name(),
435 8278 : )
436 8278 : }
437 : }
438 :
439 : impl ImageLayerInner {
440 4002 : pub(super) async fn load(
441 4002 : path: &Utf8Path,
442 4002 : lsn: Lsn,
443 4002 : summary: Option<Summary>,
444 4002 : ctx: &RequestContext,
445 4002 : ) -> anyhow::Result<Self> {
446 4002 : let file = VirtualFile::open(path)
447 UBC 0 : .await
448 CBC 4002 : .with_context(|| format!("Failed to open file '{}'", path))?;
449 4002 : let file = FileBlockReader::new(file);
450 4002 : let summary_blk = file.read_blk(0, ctx).await?;
451 4002 : let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
452 :
453 4002 : if let Some(mut expected_summary) = summary {
454 : // production code path
455 4002 : expected_summary.index_start_blk = actual_summary.index_start_blk;
456 4002 : expected_summary.index_root_blk = actual_summary.index_root_blk;
457 4002 :
458 4002 : if actual_summary != expected_summary {
459 UBC 0 : bail!(
460 0 : "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
461 0 : actual_summary,
462 0 : expected_summary
463 0 : );
464 CBC 4002 : }
465 UBC 0 : }
466 :
467 CBC 4002 : Ok(ImageLayerInner {
468 4002 : index_start_blk: actual_summary.index_start_blk,
469 4002 : index_root_blk: actual_summary.index_root_blk,
470 4002 : lsn,
471 4002 : file,
472 4002 : })
473 4002 : }
474 :
475 430880 : pub(super) async fn get_value_reconstruct_data(
476 430880 : &self,
477 430880 : key: Key,
478 430880 : reconstruct_state: &mut ValueReconstructState,
479 430880 : ctx: &RequestContext,
480 430880 : ) -> anyhow::Result<ValueReconstructResult> {
481 430880 : let file = &self.file;
482 430880 : let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
483 430880 :
484 430880 : let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
485 430880 : key.write_to_byte_slice(&mut keybuf);
486 430880 : if let Some(offset) = tree_reader
487 430880 : .get(
488 430880 : &keybuf,
489 430880 : &RequestContextBuilder::extend(ctx)
490 430880 : .page_content_kind(PageContentKind::ImageLayerBtreeNode)
491 430880 : .build(),
492 430880 : )
493 3710 : .await?
494 : {
495 430878 : let blob = file
496 430878 : .block_cursor()
497 430878 : .read_blob(
498 430878 : offset,
499 430878 : &RequestContextBuilder::extend(ctx)
500 430878 : .page_content_kind(PageContentKind::ImageLayerValue)
501 430878 : .build(),
502 430878 : )
503 4058 : .await
504 430878 : .with_context(|| format!("failed to read value from offset {}", offset))?;
505 430878 : let value = Bytes::from(blob);
506 430878 :
507 430878 : reconstruct_state.img = Some((self.lsn, value));
508 430878 : Ok(ValueReconstructResult::Complete)
509 : } else {
510 2 : Ok(ValueReconstructResult::Missing)
511 : }
512 430880 : }
513 : }
514 :
515 : /// A builder object for constructing a new image layer.
516 : ///
517 : /// Usage:
518 : ///
519 : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
520 : ///
521 : /// 2. Write the contents by calling `put_page_image` for every key-value
522 : /// pair in the key range.
523 : ///
524 : /// 3. Call `finish`.
525 : ///
526 : struct ImageLayerWriterInner {
527 : conf: &'static PageServerConf,
528 : path: Utf8PathBuf,
529 : timeline_id: TimelineId,
530 : tenant_id: TenantId,
531 : key_range: Range<Key>,
532 : lsn: Lsn,
533 :
534 : blob_writer: BlobWriter<false>,
535 : tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
536 : }
537 :
538 : impl ImageLayerWriterInner {
539 : ///
540 : /// Start building a new image layer.
541 : ///
542 3406 : async fn new(
543 3406 : conf: &'static PageServerConf,
544 3406 : timeline_id: TimelineId,
545 3406 : tenant_id: TenantId,
546 3406 : key_range: &Range<Key>,
547 3406 : lsn: Lsn,
548 3406 : ) -> anyhow::Result<Self> {
549 3406 : // Create the file initially with a temporary filename.
550 3406 : // We'll atomically rename it to the final name when we're done.
551 3406 : let path = ImageLayer::temp_path_for(
552 3406 : conf,
553 3406 : timeline_id,
554 3406 : tenant_id,
555 3406 : &ImageFileName {
556 3406 : key_range: key_range.clone(),
557 3406 : lsn,
558 3406 : },
559 3406 : );
560 3406 : info!("new image layer {path}");
561 3406 : let mut file = VirtualFile::open_with_options(
562 3406 : &path,
563 3406 : std::fs::OpenOptions::new().write(true).create_new(true),
564 3406 : )
565 UBC 0 : .await?;
566 : // make room for the header block
567 CBC 3406 : file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
568 3406 : let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
569 3406 :
570 3406 : // Initialize the b-tree index builder
571 3406 : let block_buf = BlockBuf::new();
572 3406 : let tree_builder = DiskBtreeBuilder::new(block_buf);
573 3406 :
574 3406 : let writer = Self {
575 3406 : conf,
576 3406 : path,
577 3406 : timeline_id,
578 3406 : tenant_id,
579 3406 : key_range: key_range.clone(),
580 3406 : lsn,
581 3406 : tree: tree_builder,
582 3406 : blob_writer,
583 3406 : };
584 3406 :
585 3406 : Ok(writer)
586 3406 : }
587 :
588 : ///
589 : /// Write next value to the file.
590 : ///
591 : /// The page versions must be appended in blknum order.
592 : ///
593 224122 : async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
594 224122 : ensure!(self.key_range.contains(&key));
595 224122 : let off = self.blob_writer.write_blob(img).await?;
596 :
597 224122 : let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
598 224122 : key.write_to_byte_slice(&mut keybuf);
599 224122 : self.tree.append(&keybuf, off)?;
600 :
601 224122 : Ok(())
602 224122 : }
603 :
604 : ///
605 : /// Finish writing the image layer.
606 : ///
607 3402 : async fn finish(self) -> anyhow::Result<ImageLayer> {
608 3402 : let index_start_blk =
609 3402 : ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
610 3402 :
611 3402 : let mut file = self.blob_writer.into_inner();
612 3402 :
613 3402 : // Write out the index
614 3402 : file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
615 UBC 0 : .await?;
616 CBC 3402 : let (index_root_blk, block_buf) = self.tree.finish()?;
617 6953 : for buf in block_buf.blocks {
618 3551 : file.write_all(buf.as_ref()).await?;
619 : }
620 :
621 : // Fill in the summary on blk 0
622 3402 : let summary = Summary {
623 3402 : magic: IMAGE_FILE_MAGIC,
624 3402 : format_version: STORAGE_FORMAT_VERSION,
625 3402 : tenant_id: self.tenant_id,
626 3402 : timeline_id: self.timeline_id,
627 3402 : key_range: self.key_range.clone(),
628 3402 : lsn: self.lsn,
629 3402 : index_start_blk,
630 3402 : index_root_blk,
631 3402 : };
632 3402 :
633 3402 : let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
634 3402 : Summary::ser_into(&summary, &mut buf)?;
635 3402 : if buf.spilled() {
636 : // This is bad as we only have one free block for the summary
637 UBC 0 : warn!(
638 0 : "Used more than one page size for summary buffer: {}",
639 0 : buf.len()
640 0 : );
641 CBC 3402 : }
642 3402 : file.seek(SeekFrom::Start(0)).await?;
643 3402 : file.write_all(&buf).await?;
644 :
645 3402 : let metadata = file
646 3402 : .metadata()
647 UBC 0 : .await
648 CBC 3402 : .context("get metadata to determine file size")?;
649 :
650 3402 : let desc = PersistentLayerDesc::new_img(
651 3402 : self.tenant_id,
652 3402 : self.timeline_id,
653 3402 : self.key_range.clone(),
654 3402 : self.lsn,
655 3402 : metadata.len(),
656 3402 : );
657 3402 :
658 3402 : // Note: Because we open the file in write-only mode, we cannot
659 3402 : // reuse the same VirtualFile for reading later. That's why we don't
660 3402 : // set inner.file here. The first read will have to re-open it.
661 3402 : let layer = ImageLayer {
662 3402 : path_or_conf: PathOrConf::Conf(self.conf),
663 3402 : desc,
664 3402 : lsn: self.lsn,
665 3402 : access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
666 3402 : inner: OnceCell::new(),
667 3402 : };
668 3402 :
669 3402 : // fsync the file
670 3402 : file.sync_all().await?;
671 :
672 : // Rename the file to its final name
673 : //
674 : // Note: This overwrites any existing file. There shouldn't be any.
675 : // FIXME: throw an error instead?
676 3402 : let final_path = ImageLayer::path_for(
677 3402 : &PathOrConf::Conf(self.conf),
678 3402 : self.timeline_id,
679 3402 : self.tenant_id,
680 3402 : &ImageFileName {
681 3402 : key_range: self.key_range.clone(),
682 3402 : lsn: self.lsn,
683 3402 : },
684 3402 : );
685 3402 : std::fs::rename(self.path, final_path)?;
686 :
687 UBC 0 : trace!("created image layer {}", layer.path());
688 :
689 CBC 3402 : Ok(layer)
690 3402 : }
691 : }
692 :
693 : /// A builder object for constructing a new image layer.
694 : ///
695 : /// Usage:
696 : ///
697 : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
698 : ///
699 : /// 2. Write the contents by calling `put_page_image` for every key-value
700 : /// pair in the key range.
701 : ///
702 : /// 3. Call `finish`.
703 : ///
704 : /// # Note
705 : ///
706 : /// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
707 : /// possible for the writer to drop before `finish` is actually called. So this
708 : /// could lead to odd temporary files in the directory, exhausting file system.
709 : /// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
710 : /// implementation that cleans up the temporary file in failure. It's not
711 : /// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
712 : /// out some fields, making it impossible to implement `Drop`.
713 : ///
714 : #[must_use]
715 : pub struct ImageLayerWriter {
716 : inner: Option<ImageLayerWriterInner>,
717 : }
718 :
719 : impl ImageLayerWriter {
720 : ///
721 : /// Start building a new image layer.
722 : ///
723 3406 : pub async fn new(
724 3406 : conf: &'static PageServerConf,
725 3406 : timeline_id: TimelineId,
726 3406 : tenant_id: TenantId,
727 3406 : key_range: &Range<Key>,
728 3406 : lsn: Lsn,
729 3406 : ) -> anyhow::Result<ImageLayerWriter> {
730 : Ok(Self {
731 : inner: Some(
732 3406 : ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
733 : ),
734 : })
735 3406 : }
736 :
737 : ///
738 : /// Write next value to the file.
739 : ///
740 : /// The page versions must be appended in blknum order.
741 : ///
742 224122 : pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
743 224122 : self.inner.as_mut().unwrap().put_image(key, img).await
744 224122 : }
745 :
746 : ///
747 : /// Finish writing the image layer.
748 : ///
749 3402 : pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
750 3402 : self.inner.take().unwrap().finish().await
751 3402 : }
752 : }
753 :
754 : impl Drop for ImageLayerWriter {
755 : fn drop(&mut self) {
756 3402 : if let Some(inner) = self.inner.take() {
757 UBC 0 : inner.blob_writer.into_inner().remove();
758 CBC 3402 : }
759 3402 : }
760 : }
|