Line data Source code
1 : //! An ImageLayer represents an image or a snapshot of a key-range at
2 : //! one particular LSN. It contains an image of all key-value pairs
3 : //! in its key-range. Any key that falls into the image layer's range
4 : //! but does not exist in the layer, does not exist.
5 : //!
6 : //! An image layer is stored in a file on disk. The file is stored in
7 : //! timelines/<timeline_id> directory. Currently, there are no
8 : //! subdirectories, and each image layer file is named like this:
9 : //!
10 : //! ```text
11 : //! <key start>-<key end>__<LSN>
12 : //! ```
13 : //!
14 : //! For example:
15 : //!
16 : //! ```text
17 : //! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
18 : //! ```
19 : //!
20 : //! Every image layer file consists of three parts: "summary",
21 : //! "index", and "values". The summary is a fixed size header at the
22 : //! beginning of the file, and it contains basic information about the
23 : //! layer, and offsets to the other parts. The "index" is a B-tree,
24 : //! mapping from Key to an offset in the "values" part. The
25 : //! actual page images are stored in the "values" part.
26 : use crate::config::PageServerConf;
27 : use crate::context::RequestContext;
28 : use crate::page_cache::PAGE_SZ;
29 : use crate::repository::{Key, KEY_SIZE};
30 : use crate::tenant::blob_io::WriteBlobWriter;
31 : use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
32 : use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
33 : use crate::tenant::storage_layer::{
34 : LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
35 : };
36 : use crate::virtual_file::VirtualFile;
37 : use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
38 : use anyhow::{bail, ensure, Context, Result};
39 : use bytes::Bytes;
40 : use hex;
41 : use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
42 : use rand::{distributions::Alphanumeric, Rng};
43 : use serde::{Deserialize, Serialize};
44 : use std::fs::{self, File};
45 : use std::io::SeekFrom;
46 : use std::io::Write;
47 : use std::ops::Range;
48 : use std::os::unix::prelude::FileExt;
49 : use std::path::{Path, PathBuf};
50 : use tokio::sync::OnceCell;
51 : use tracing::*;
52 :
53 : use utils::{
54 : bin_ser::BeSer,
55 : id::{TenantId, TimelineId},
56 : lsn::Lsn,
57 : };
58 :
59 : use super::filename::ImageFileName;
60 : use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
61 :
62 : ///
63 : /// Header stored in the beginning of the file
64 : ///
65 : /// After this comes the 'values' part, starting on block 1. After that,
66 : /// the 'index' starts at the block indicated by 'index_start_blk'
67 : ///
68 1562 : #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
69 : pub(super) struct Summary {
70 : /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
71 : magic: u16,
72 : format_version: u16,
73 :
74 : tenant_id: TenantId,
75 : timeline_id: TimelineId,
76 : key_range: Range<Key>,
77 : lsn: Lsn,
78 :
79 : /// Block number where the 'index' part of the file begins.
80 : index_start_blk: u32,
81 : /// Block within the 'index', where the B-tree root page is stored
82 : index_root_blk: u32,
83 : // the 'values' part starts after the summary header, on block 1.
84 : }
85 :
86 : impl From<&ImageLayer> for Summary {
87 1562 : fn from(layer: &ImageLayer) -> Self {
88 1562 : Self::expected(
89 1562 : layer.desc.tenant_id,
90 1562 : layer.desc.timeline_id,
91 1562 : layer.desc.key_range.clone(),
92 1562 : layer.lsn,
93 1562 : )
94 1562 : }
95 : }
96 :
97 : impl Summary {
98 1562 : pub(super) fn expected(
99 1562 : tenant_id: TenantId,
100 1562 : timeline_id: TimelineId,
101 1562 : key_range: Range<Key>,
102 1562 : lsn: Lsn,
103 1562 : ) -> Self {
104 1562 : Self {
105 1562 : magic: IMAGE_FILE_MAGIC,
106 1562 : format_version: STORAGE_FORMAT_VERSION,
107 1562 : tenant_id,
108 1562 : timeline_id,
109 1562 : key_range,
110 1562 : lsn,
111 1562 :
112 1562 : index_start_blk: 0,
113 1562 : index_root_blk: 0,
114 1562 : }
115 1562 : }
116 : }
117 :
118 : /// ImageLayer is the in-memory data structure associated with an on-disk image
119 : /// file.
120 : ///
121 : /// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
122 : /// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
123 : /// Otherwise the struct is just a placeholder for a file that exists on disk,
124 : /// and it needs to be loaded before using it in queries.
125 : pub struct ImageLayer {
126 : path_or_conf: PathOrConf,
127 :
128 : pub desc: PersistentLayerDesc,
129 : // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
130 : pub lsn: Lsn,
131 :
132 : access_stats: LayerAccessStats,
133 :
134 : inner: OnceCell<ImageLayerInner>,
135 : }
136 :
137 : impl std::fmt::Debug for ImageLayer {
138 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
139 0 : use super::RangeDisplayDebug;
140 0 :
141 0 : f.debug_struct("ImageLayer")
142 0 : .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
143 0 : .field("file_size", &self.desc.file_size)
144 0 : .field("lsn", &self.lsn)
145 0 : .field("inner", &self.inner)
146 0 : .finish()
147 0 : }
148 : }
149 :
150 : pub struct ImageLayerInner {
151 : // values copied from summary
152 : index_start_blk: u32,
153 : index_root_blk: u32,
154 :
155 : lsn: Lsn,
156 :
157 : /// Reader object for reading blocks from the file.
158 : file: FileBlockReader,
159 : }
160 :
161 : impl std::fmt::Debug for ImageLayerInner {
162 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
163 0 : f.debug_struct("ImageLayerInner")
164 0 : .field("index_start_blk", &self.index_start_blk)
165 0 : .field("index_root_blk", &self.index_root_blk)
166 0 : .finish()
167 0 : }
168 : }
169 :
170 : #[async_trait::async_trait]
171 : impl Layer for ImageLayer {
172 : /// Look up given page in the file
173 658916 : async fn get_value_reconstruct_data(
174 658916 : &self,
175 658916 : key: Key,
176 658916 : lsn_range: Range<Lsn>,
177 658916 : reconstruct_state: &mut ValueReconstructState,
178 658916 : ctx: &RequestContext,
179 658917 : ) -> anyhow::Result<ValueReconstructResult> {
180 658917 : self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
181 5545 : .await
182 1317834 : }
183 : }
184 :
185 : /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
186 : impl std::fmt::Display for ImageLayer {
187 5 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
188 5 : write!(f, "{}", self.layer_desc().short_id())
189 5 : }
190 : }
191 :
192 : impl AsLayerDesc for ImageLayer {
193 5022 : fn layer_desc(&self) -> &PersistentLayerDesc {
194 5022 : &self.desc
195 5022 : }
196 : }
197 :
198 : impl PersistentLayer for ImageLayer {
199 9 : fn local_path(&self) -> Option<PathBuf> {
200 9 : self.local_path()
201 9 : }
202 :
203 78 : fn delete_resident_layer_file(&self) -> Result<()> {
204 78 : self.delete_resident_layer_file()
205 78 : }
206 :
207 8 : fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
208 8 : self.info(reset)
209 8 : }
210 :
211 6 : fn access_stats(&self) -> &LayerAccessStats {
212 6 : self.access_stats()
213 6 : }
214 : }
215 :
216 : impl ImageLayer {
217 0 : pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
218 0 : println!(
219 0 : "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
220 0 : self.desc.tenant_id,
221 0 : self.desc.timeline_id,
222 0 : self.desc.key_range.start,
223 0 : self.desc.key_range.end,
224 0 : self.lsn,
225 0 : self.desc.is_incremental(),
226 0 : self.desc.file_size
227 0 : );
228 0 :
229 0 : if !verbose {
230 0 : return Ok(());
231 0 : }
232 :
233 0 : let inner = self.load(LayerAccessKind::Dump, ctx).await?;
234 0 : let file = &inner.file;
235 0 : let tree_reader =
236 0 : DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
237 0 :
238 0 : tree_reader.dump().await?;
239 :
240 0 : tree_reader
241 0 : .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
242 0 : println!("key: {} offset {}", hex::encode(key), value);
243 0 : true
244 0 : })
245 0 : .await?;
246 :
247 0 : Ok(())
248 0 : }
249 :
250 658916 : pub(crate) async fn get_value_reconstruct_data(
251 658916 : &self,
252 658916 : key: Key,
253 658916 : lsn_range: Range<Lsn>,
254 658916 : reconstruct_state: &mut ValueReconstructState,
255 658916 : ctx: &RequestContext,
256 658917 : ) -> anyhow::Result<ValueReconstructResult> {
257 658917 : assert!(self.desc.key_range.contains(&key));
258 658917 : assert!(lsn_range.start >= self.lsn);
259 658917 : assert!(lsn_range.end >= self.lsn);
260 :
261 658917 : let inner = self
262 658917 : .load(LayerAccessKind::GetValueReconstructData, ctx)
263 9 : .await?;
264 658917 : inner
265 658917 : .get_value_reconstruct_data(key, reconstruct_state)
266 5536 : .await
267 : // FIXME: makes no sense to dump paths
268 658917 : .with_context(|| format!("read {}", self.path().display()))
269 658917 : }
270 :
271 9 : pub(crate) fn local_path(&self) -> Option<PathBuf> {
272 9 : Some(self.path())
273 9 : }
274 :
275 : pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
276 : // delete underlying file
277 78 : fs::remove_file(self.path())?;
278 78 : Ok(())
279 78 : }
280 :
281 8 : pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
282 8 : let layer_file_name = self.layer_desc().filename().file_name();
283 8 : let lsn_start = self.layer_desc().image_layer_lsn();
284 8 :
285 8 : HistoricLayerInfo::Image {
286 8 : layer_file_name,
287 8 : layer_file_size: self.desc.file_size,
288 8 : lsn_start,
289 8 : remote: false,
290 8 : access_stats: self.access_stats.as_api_model(reset),
291 8 : }
292 8 : }
293 :
294 1118 : pub(crate) fn access_stats(&self) -> &LayerAccessStats {
295 1118 : &self.access_stats
296 1118 : }
297 :
298 3873 : fn path_for(
299 3873 : path_or_conf: &PathOrConf,
300 3873 : timeline_id: TimelineId,
301 3873 : tenant_id: TenantId,
302 3873 : fname: &ImageFileName,
303 3873 : ) -> PathBuf {
304 3873 : match path_or_conf {
305 0 : PathOrConf::Path(path) => path.to_path_buf(),
306 3873 : PathOrConf::Conf(conf) => conf
307 3873 : .timeline_path(&tenant_id, &timeline_id)
308 3873 : .join(fname.to_string()),
309 : }
310 3873 : }
311 :
312 1113 : fn temp_path_for(
313 1113 : conf: &PageServerConf,
314 1113 : timeline_id: TimelineId,
315 1113 : tenant_id: TenantId,
316 1113 : fname: &ImageFileName,
317 1113 : ) -> PathBuf {
318 1113 : let rand_string: String = rand::thread_rng()
319 1113 : .sample_iter(&Alphanumeric)
320 1113 : .take(8)
321 1113 : .map(char::from)
322 1113 : .collect();
323 1113 :
324 1113 : conf.timeline_path(&tenant_id, &timeline_id)
325 1113 : .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
326 1113 : }
327 :
328 : ///
329 : /// Open the underlying file and read the metadata into memory, if it's
330 : /// not loaded already.
331 : ///
332 658916 : async fn load(
333 658916 : &self,
334 658916 : access_kind: LayerAccessKind,
335 658916 : ctx: &RequestContext,
336 658917 : ) -> Result<&ImageLayerInner> {
337 658917 : self.access_stats.record_access(access_kind, ctx);
338 658917 : self.inner
339 658917 : .get_or_try_init(|| self.load_inner())
340 9 : .await
341 658917 : .with_context(|| format!("Failed to load image layer {}", self.path().display()))
342 658917 : }
343 :
344 1562 : async fn load_inner(&self) -> Result<ImageLayerInner> {
345 1562 : let path = self.path();
346 :
347 1562 : let expected_summary = match &self.path_or_conf {
348 1562 : PathOrConf::Conf(_) => Some(Summary::from(self)),
349 0 : PathOrConf::Path(_) => None,
350 : };
351 :
352 1562 : let loaded =
353 1562 : ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
354 :
355 1562 : if let PathOrConf::Path(ref path) = self.path_or_conf {
356 : // not production code
357 0 : let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
358 0 : let expected_filename = self.filename().file_name();
359 0 :
360 0 : if actual_filename != expected_filename {
361 0 : println!("warning: filename does not match what is expected from in-file summary");
362 0 : println!("actual: {:?}", actual_filename);
363 0 : println!("expected: {:?}", expected_filename);
364 0 : }
365 1562 : }
366 :
367 1562 : Ok(loaded)
368 1562 : }
369 :
370 : /// Create an ImageLayer struct representing an existing file on disk
371 743 : pub fn new(
372 743 : conf: &'static PageServerConf,
373 743 : timeline_id: TimelineId,
374 743 : tenant_id: TenantId,
375 743 : filename: &ImageFileName,
376 743 : file_size: u64,
377 743 : access_stats: LayerAccessStats,
378 743 : ) -> ImageLayer {
379 743 : ImageLayer {
380 743 : path_or_conf: PathOrConf::Conf(conf),
381 743 : desc: PersistentLayerDesc::new_img(
382 743 : tenant_id,
383 743 : timeline_id,
384 743 : filename.key_range.clone(),
385 743 : filename.lsn,
386 743 : file_size,
387 743 : ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
388 743 : lsn: filename.lsn,
389 743 : access_stats,
390 743 : inner: OnceCell::new(),
391 743 : }
392 743 : }
393 :
394 : /// Create an ImageLayer struct representing an existing file on disk.
395 : ///
396 : /// This variant is only used for debugging purposes, by the 'pagectl' binary.
397 0 : pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
398 0 : let mut summary_buf = Vec::new();
399 0 : summary_buf.resize(PAGE_SZ, 0);
400 0 : file.read_exact_at(&mut summary_buf, 0)?;
401 0 : let summary = Summary::des_prefix(&summary_buf)?;
402 0 : let metadata = file
403 0 : .metadata()
404 0 : .context("get file metadata to determine size")?;
405 0 : Ok(ImageLayer {
406 0 : path_or_conf: PathOrConf::Path(path.to_path_buf()),
407 0 : desc: PersistentLayerDesc::new_img(
408 0 : summary.tenant_id,
409 0 : summary.timeline_id,
410 0 : summary.key_range,
411 0 : summary.lsn,
412 0 : metadata.len(),
413 0 : ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
414 0 : lsn: summary.lsn,
415 0 : access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
416 0 : inner: OnceCell::new(),
417 0 : })
418 0 : }
419 :
420 2761 : fn layer_name(&self) -> ImageFileName {
421 2761 : self.desc.image_file_name()
422 2761 : }
423 :
424 : /// Path to the layer file in pageserver workdir.
425 2761 : pub fn path(&self) -> PathBuf {
426 2761 : Self::path_for(
427 2761 : &self.path_or_conf,
428 2761 : self.desc.timeline_id,
429 2761 : self.desc.tenant_id,
430 2761 : &self.layer_name(),
431 2761 : )
432 2761 : }
433 : }
434 :
435 : impl ImageLayerInner {
436 1562 : pub(super) async fn load(
437 1562 : path: &std::path::Path,
438 1562 : lsn: Lsn,
439 1562 : summary: Option<Summary>,
440 1562 : ) -> anyhow::Result<Self> {
441 1562 : let file = VirtualFile::open(path)
442 1562 : .with_context(|| format!("Failed to open file '{}'", path.display()))?;
443 1562 : let file = FileBlockReader::new(file);
444 1562 : let summary_blk = file.read_blk(0).await?;
445 1562 : let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
446 :
447 1562 : if let Some(mut expected_summary) = summary {
448 : // production code path
449 1562 : expected_summary.index_start_blk = actual_summary.index_start_blk;
450 1562 : expected_summary.index_root_blk = actual_summary.index_root_blk;
451 1562 :
452 1562 : if actual_summary != expected_summary {
453 0 : bail!(
454 0 : "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
455 0 : actual_summary,
456 0 : expected_summary
457 0 : );
458 1562 : }
459 0 : }
460 :
461 1562 : Ok(ImageLayerInner {
462 1562 : index_start_blk: actual_summary.index_start_blk,
463 1562 : index_root_blk: actual_summary.index_root_blk,
464 1562 : lsn,
465 1562 : file,
466 1562 : })
467 1562 : }
468 :
469 658916 : pub(super) async fn get_value_reconstruct_data(
470 658916 : &self,
471 658916 : key: Key,
472 658916 : reconstruct_state: &mut ValueReconstructState,
473 658917 : ) -> anyhow::Result<ValueReconstructResult> {
474 658917 : let file = &self.file;
475 658917 : let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
476 658917 :
477 658917 : let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
478 658917 : key.write_to_byte_slice(&mut keybuf);
479 658917 : if let Some(offset) = tree_reader.get(&keybuf).await? {
480 658915 : let blob = file
481 658915 : .block_cursor()
482 658915 : .read_blob(offset)
483 3058 : .await
484 658915 : .with_context(|| format!("failed to read value from offset {}", offset))?;
485 658915 : let value = Bytes::from(blob);
486 658915 :
487 658915 : reconstruct_state.img = Some((self.lsn, value));
488 658915 : Ok(ValueReconstructResult::Complete)
489 : } else {
490 2 : Ok(ValueReconstructResult::Missing)
491 : }
492 658917 : }
493 : }
494 :
495 : /// A builder object for constructing a new image layer.
496 : ///
497 : /// Usage:
498 : ///
499 : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
500 : ///
501 : /// 2. Write the contents by calling `put_page_image` for every key-value
502 : /// pair in the key range.
503 : ///
504 : /// 3. Call `finish`.
505 : ///
506 : struct ImageLayerWriterInner {
507 : conf: &'static PageServerConf,
508 : path: PathBuf,
509 : timeline_id: TimelineId,
510 : tenant_id: TenantId,
511 : key_range: Range<Key>,
512 : lsn: Lsn,
513 :
514 : blob_writer: WriteBlobWriter<VirtualFile>,
515 : tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
516 : }
517 :
518 : impl ImageLayerWriterInner {
519 : ///
520 : /// Start building a new image layer.
521 : ///
522 1113 : async fn new(
523 1113 : conf: &'static PageServerConf,
524 1113 : timeline_id: TimelineId,
525 1113 : tenant_id: TenantId,
526 1113 : key_range: &Range<Key>,
527 1113 : lsn: Lsn,
528 1113 : ) -> anyhow::Result<Self> {
529 1113 : // Create the file initially with a temporary filename.
530 1113 : // We'll atomically rename it to the final name when we're done.
531 1113 : let path = ImageLayer::temp_path_for(
532 1113 : conf,
533 1113 : timeline_id,
534 1113 : tenant_id,
535 1113 : &ImageFileName {
536 1113 : key_range: key_range.clone(),
537 1113 : lsn,
538 1113 : },
539 1113 : );
540 1113 : info!("new image layer {}", path.display());
541 1113 : let mut file = VirtualFile::open_with_options(
542 1113 : &path,
543 1113 : std::fs::OpenOptions::new().write(true).create_new(true),
544 1113 : )?;
545 : // make room for the header block
546 1113 : file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
547 1113 : let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
548 1113 :
549 1113 : // Initialize the b-tree index builder
550 1113 : let block_buf = BlockBuf::new();
551 1113 : let tree_builder = DiskBtreeBuilder::new(block_buf);
552 1113 :
553 1113 : let writer = Self {
554 1113 : conf,
555 1113 : path,
556 1113 : timeline_id,
557 1113 : tenant_id,
558 1113 : key_range: key_range.clone(),
559 1113 : lsn,
560 1113 : tree: tree_builder,
561 1113 : blob_writer,
562 1113 : };
563 1113 :
564 1113 : Ok(writer)
565 1113 : }
566 :
567 : ///
568 : /// Write next value to the file.
569 : ///
570 : /// The page versions must be appended in blknum order.
571 : ///
572 205153 : async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
573 205153 : ensure!(self.key_range.contains(&key));
574 205153 : let off = self.blob_writer.write_blob(img).await?;
575 :
576 205153 : let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
577 205153 : key.write_to_byte_slice(&mut keybuf);
578 205153 : self.tree.append(&keybuf, off)?;
579 :
580 205153 : Ok(())
581 205153 : }
582 :
583 : ///
584 : /// Finish writing the image layer.
585 : ///
586 1112 : async fn finish(self) -> anyhow::Result<ImageLayer> {
587 1112 : let index_start_blk =
588 1112 : ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
589 1112 :
590 1112 : let mut file = self.blob_writer.into_inner();
591 1112 :
592 1112 : // Write out the index
593 1112 : file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
594 0 : .await?;
595 1112 : let (index_root_blk, block_buf) = self.tree.finish()?;
596 2424 : for buf in block_buf.blocks {
597 1312 : file.write_all(buf.as_ref())?;
598 : }
599 :
600 : // Fill in the summary on blk 0
601 1112 : let summary = Summary {
602 1112 : magic: IMAGE_FILE_MAGIC,
603 1112 : format_version: STORAGE_FORMAT_VERSION,
604 1112 : tenant_id: self.tenant_id,
605 1112 : timeline_id: self.timeline_id,
606 1112 : key_range: self.key_range.clone(),
607 1112 : lsn: self.lsn,
608 1112 : index_start_blk,
609 1112 : index_root_blk,
610 1112 : };
611 1112 :
612 1112 : let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
613 1112 : Summary::ser_into(&summary, &mut buf)?;
614 1112 : if buf.spilled() {
615 : // This is bad as we only have one free block for the summary
616 0 : warn!(
617 0 : "Used more than one page size for summary buffer: {}",
618 0 : buf.len()
619 0 : );
620 1112 : }
621 1112 : file.seek(SeekFrom::Start(0)).await?;
622 1112 : file.write_all(&buf)?;
623 :
624 1112 : let metadata = file
625 1112 : .metadata()
626 0 : .await
627 1112 : .context("get metadata to determine file size")?;
628 :
629 1112 : let desc = PersistentLayerDesc::new_img(
630 1112 : self.tenant_id,
631 1112 : self.timeline_id,
632 1112 : self.key_range.clone(),
633 1112 : self.lsn,
634 1112 : metadata.len(),
635 1112 : );
636 1112 :
637 1112 : // Note: Because we open the file in write-only mode, we cannot
638 1112 : // reuse the same VirtualFile for reading later. That's why we don't
639 1112 : // set inner.file here. The first read will have to re-open it.
640 1112 : let layer = ImageLayer {
641 1112 : path_or_conf: PathOrConf::Conf(self.conf),
642 1112 : desc,
643 1112 : lsn: self.lsn,
644 1112 : access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
645 1112 : inner: OnceCell::new(),
646 1112 : };
647 1112 :
648 1112 : // fsync the file
649 1112 : file.sync_all()?;
650 :
651 : // Rename the file to its final name
652 : //
653 : // Note: This overwrites any existing file. There shouldn't be any.
654 : // FIXME: throw an error instead?
655 1112 : let final_path = ImageLayer::path_for(
656 1112 : &PathOrConf::Conf(self.conf),
657 1112 : self.timeline_id,
658 1112 : self.tenant_id,
659 1112 : &ImageFileName {
660 1112 : key_range: self.key_range.clone(),
661 1112 : lsn: self.lsn,
662 1112 : },
663 1112 : );
664 1112 : std::fs::rename(self.path, final_path)?;
665 :
666 0 : trace!("created image layer {}", layer.path().display());
667 :
668 1112 : Ok(layer)
669 1112 : }
670 : }
671 :
672 : /// A builder object for constructing a new image layer.
673 : ///
674 : /// Usage:
675 : ///
676 : /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
677 : ///
678 : /// 2. Write the contents by calling `put_page_image` for every key-value
679 : /// pair in the key range.
680 : ///
681 : /// 3. Call `finish`.
682 : ///
683 : /// # Note
684 : ///
685 : /// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
686 : /// possible for the writer to drop before `finish` is actually called. So this
687 : /// could lead to odd temporary files in the directory, exhausting file system.
688 : /// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
689 : /// implementation that cleans up the temporary file in failure. It's not
690 : /// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
691 : /// out some fields, making it impossible to implement `Drop`.
692 : ///
693 : #[must_use]
694 : pub struct ImageLayerWriter {
695 : inner: Option<ImageLayerWriterInner>,
696 : }
697 :
698 : impl ImageLayerWriter {
699 : ///
700 : /// Start building a new image layer.
701 : ///
702 1113 : pub async fn new(
703 1113 : conf: &'static PageServerConf,
704 1113 : timeline_id: TimelineId,
705 1113 : tenant_id: TenantId,
706 1113 : key_range: &Range<Key>,
707 1113 : lsn: Lsn,
708 1113 : ) -> anyhow::Result<ImageLayerWriter> {
709 : Ok(Self {
710 : inner: Some(
711 1113 : ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
712 : ),
713 : })
714 1113 : }
715 :
716 : ///
717 : /// Write next value to the file.
718 : ///
719 : /// The page versions must be appended in blknum order.
720 : ///
721 205152 : pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
722 205153 : self.inner.as_mut().unwrap().put_image(key, img).await
723 205153 : }
724 :
725 : ///
726 : /// Finish writing the image layer.
727 : ///
728 1112 : pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
729 1112 : self.inner.take().unwrap().finish().await
730 1112 : }
731 : }
732 :
733 : impl Drop for ImageLayerWriter {
734 : fn drop(&mut self) {
735 1112 : if let Some(inner) = self.inner.take() {
736 0 : inner.blob_writer.into_inner().remove();
737 1112 : }
738 1112 : }
739 : }
|