Line data Source code
1 : //! An in-memory layer stores recently received key-value pairs.
2 : //!
3 : //! The "in-memory" part of the name is a bit misleading: the actual page versions are
4 : //! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
5 : //! its position in the file, is kept in memory, though.
6 : //!
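//! As a quick orientation (editorial sketch, not part of the original source;
//! the field names mirror [`InMemoryLayerInner`] below):
//!
//! ```text
//! InMemoryLayerInner {
//!     index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>, // in memory: (key, lsn) -> (pos, len, will_init)
//!     file:  EphemeralFile,                                 // ephemeral file holding the serialized Values
//! }
//! ```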
7 : use std::cmp::Ordering;
8 : use std::collections::{BTreeMap, HashMap};
9 : use std::fmt::Write;
10 : use std::ops::Range;
11 : use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering};
12 : use std::sync::{Arc, OnceLock};
13 : use std::time::Instant;
14 :
15 : use anyhow::Result;
16 : use camino::Utf8PathBuf;
17 : use pageserver_api::key::{CompactKey, Key};
18 : use pageserver_api::keyspace::KeySpace;
19 : use pageserver_api::models::InMemoryLayerInfo;
20 : use pageserver_api::shard::TenantShardId;
21 : use tokio::sync::RwLock;
22 : use tracing::*;
23 : use utils::id::TimelineId;
24 : use utils::lsn::Lsn;
25 : use utils::vec_map::VecMap;
26 : use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
27 :
28 : use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
29 : use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize};
30 : use crate::config::PageServerConf;
31 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
32 : // avoid binding to Write (conflicts with std::io::Write)
33 : // while being able to use std::fmt::Write's methods
34 : use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
35 : use crate::tenant::ephemeral_file::EphemeralFile;
36 : use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
37 : use crate::tenant::timeline::GetVectoredError;
38 : use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
39 : use crate::{l0_flush, page_cache};
40 :
41 : pub(crate) mod vectored_dio_read;
42 :
43 : #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
44 : pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
45 :
46 : pub struct InMemoryLayer {
47 : conf: &'static PageServerConf,
48 : tenant_shard_id: TenantShardId,
49 : timeline_id: TimelineId,
50 : file_id: InMemoryLayerFileId,
51 :
52 : /// This layer contains all the changes from `start_lsn`. The
53 : /// start is inclusive.
54 : start_lsn: Lsn,
55 :
56 : /// Frozen layers have an exclusive end LSN.
57 : /// Writes are only allowed when this is `None`.
58 : pub(crate) end_lsn: OnceLock<Lsn>,
59 :
60 : /// Used for the traversal path. Cached string representation of this in-memory layer, set once it is frozen.
61 : frozen_local_path_str: OnceLock<Arc<str>>,
62 :
63 : opened_at: Instant,
64 :
65 : /// The above fields never change, except for `end_lsn`, which is only set once.
66 : /// All other changing parts are in `inner`, protected by an RwLock.
67 : inner: RwLock<InMemoryLayerInner>,
68 :
69 : estimated_in_mem_size: AtomicU64,
70 : }
71 :
72 : impl std::fmt::Debug for InMemoryLayer {
73 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74 0 : f.debug_struct("InMemoryLayer")
75 0 : .field("start_lsn", &self.start_lsn)
76 0 : .field("end_lsn", &self.end_lsn)
77 0 : .field("inner", &self.inner)
78 0 : .finish()
79 0 : }
80 : }
81 :
82 : pub struct InMemoryLayerInner {
83 : /// All versions of all pages in the layer are kept here. Indexed
84 : /// by key and LSN. The [`IndexEntry`] records where in the
85 : /// ephemeral file the page version is stored.
86 : index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
87 :
88 : /// The values are stored in a serialized format in this file.
89 : /// Each serialized Value is preceded by a 'u32' length field.
90 : /// PerSeg::page_versions map stores offsets into this file.
91 : file: EphemeralFile,
92 :
93 : resource_units: GlobalResourceUnits,
94 : }
95 :
96 : /// Support the same max blob length as blob_io, because ultimately
97 : /// all the InMemoryLayer contents end up being written into a delta layer,
98 : /// using the [`crate::tenant::blob_io`] module.
99 : const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
100 : const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
101 : let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
102 : let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
103 : assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
104 : trailing_ones
105 : };
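// Editorial worked example (the real value of MAX_SUPPORTED_BLOB_LEN lives in
// blob_io and may differ): if the constant were 0x0fff_ffff (= 2^28 - 1), then on
// a 64-bit usize trailing_ones() = 28 and leading_zeros() = 36, the assert's
// 28 + 36 == 64 holds, and MAX_SUPPORTED_BLOB_LEN_BITS would be 28. The assert is
// what forces the constant to be of the form 2^k - 1.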
106 :
107 : /// See [`InMemoryLayerInner::index`].
108 : ///
109 : /// For memory efficiency, the data is packed into a u64.
110 : ///
111 : /// Layout:
112 : /// - 1 bit: `will_init`
113 : /// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
114 : /// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
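/// For illustration (editorial sketch, not part of the original source): with
/// `n = MAX_SUPPORTED_BLOB_LEN_BITS`, the packed representation is
///
/// ```text
/// data = (pos << (1 + n)) | (len << 1) | (will_init as u64)
/// ```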
115 : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
116 : pub struct IndexEntry(u64);
117 :
118 : impl IndexEntry {
119 : /// See [`Self::MAX_SUPPORTED_POS`].
120 : const MAX_SUPPORTED_POS_BITS: usize = {
121 : let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
122 : if remainder < 32 {
123 : panic!("pos can be u32 as per type system, support that");
124 : }
125 : remainder
126 : };
127 : /// The maximum supported blob offset that can be represented by [`Self`].
128 : /// See also [`Self::validate_checkpoint_distance`].
129 : const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
130 :
131 : // Layout
132 : const WILL_INIT_RANGE: Range<usize> = 0..1;
133 : const LEN_RANGE: Range<usize> =
134 : Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
135 : const POS_RANGE: Range<usize> =
136 : Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
137 : const _ASSERT: () = {
138 : if Self::POS_RANGE.end != 64 {
139 : panic!("we don't want undefined bits for our own sanity")
140 : }
141 : };
142 :
143 : /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
144 : ///
145 : /// The only way that can happen in the system is if the [`InMemoryLayer`] grows too large.
146 : /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
147 : ///
148 : /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
149 : /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
150 : ///
151 : /// TODO: ideally this check should happen at config parsing time (and in the request handler when a change to the checkpoint distance is requested).
152 : /// When cleaning this up, also look into the S3 max file size check that is performed in the delta layer writer.
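/// For illustration (editorial sketch, not part of the original source), the call
/// pattern this asks for, with a hypothetical `new_checkpoint_distance` value:
///
/// ```ignore
/// if let Err(msg) = IndexEntry::validate_checkpoint_distance(new_checkpoint_distance) {
///     anyhow::bail!("unsupported checkpoint_distance: {msg}");
/// }
/// ```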
153 : #[inline(always)]
154 10181460 : fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
155 10181460 : let IndexEntryNewArgs {
156 10181460 : base_offset,
157 10181460 : batch_offset,
158 10181460 : len,
159 10181460 : will_init,
160 10181460 : } = arg;
161 :
162 10181460 : let pos = base_offset
163 10181460 : .checked_add(batch_offset)
164 10181460 : .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
165 :
166 10181460 : if pos.into_usize() > Self::MAX_SUPPORTED_POS {
167 16 : anyhow::bail!(
168 16 : "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
169 16 : max = Self::MAX_SUPPORTED_POS
170 16 : );
171 10181444 : }
172 10181444 :
173 10181444 : if len > MAX_SUPPORTED_BLOB_LEN {
174 4 : anyhow::bail!(
175 4 : "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
176 4 : );
177 10181440 : }
178 10181440 :
179 10181440 : let mut data: u64 = 0;
180 : use bit_field::BitField;
181 10181440 : data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
182 10181440 : data.set_bits(Self::LEN_RANGE, len.into_u64());
183 10181440 : data.set_bits(Self::POS_RANGE, pos);
184 10181440 :
185 10181440 : Ok(Self(data))
186 10181460 : }
187 :
188 : #[inline(always)]
189 9769626 : fn unpack(&self) -> IndexEntryUnpacked {
190 : use bit_field::BitField;
191 9769626 : IndexEntryUnpacked {
192 9769626 : will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
193 9769626 : len: self.0.get_bits(Self::LEN_RANGE),
194 9769626 : pos: self.0.get_bits(Self::POS_RANGE),
195 9769626 : }
196 9769626 : }
197 :
198 : /// See [`Self::new`].
199 480 : pub(crate) const fn validate_checkpoint_distance(
200 480 : checkpoint_distance: u64,
201 480 : ) -> Result<(), &'static str> {
202 480 : if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
203 0 : return Err("exceeds the maximum supported value");
204 480 : }
205 480 : let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
206 480 : if res.is_none() {
207 0 : return Err(
208 0 : "checkpoint distance + max supported blob len overflows in-memory addition",
209 0 : );
210 480 : }
211 480 :
212 480 : // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
213 480 :
214 480 : Ok(())
215 480 : }
216 :
217 : const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
218 : let res = Self::validate_checkpoint_distance(
219 : pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
220 : );
221 : if res.is_err() {
222 : panic!("default checkpoint distance is valid")
223 : }
224 : };
225 : }
226 :
227 : /// Args to [`IndexEntry::new`].
228 : #[derive(Clone, Copy)]
229 : struct IndexEntryNewArgs {
230 : base_offset: u64,
231 : batch_offset: u64,
232 : len: usize,
233 : will_init: bool,
234 : }
235 :
236 : /// Unpacked representation of the bitfielded [`IndexEntry`].
237 : #[derive(Clone, Copy, PartialEq, Eq, Debug)]
238 : struct IndexEntryUnpacked {
239 : will_init: bool,
240 : len: u64,
241 : pos: u64,
242 : }
243 :
244 : impl std::fmt::Debug for InMemoryLayerInner {
245 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
246 0 : f.debug_struct("InMemoryLayerInner").finish()
247 0 : }
248 : }
249 :
250 : /// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
251 : /// to minimize contention.
252 : ///
253 : /// This global state is used to implement behaviors that require a global view of the system, e.g.
254 : /// rolling layers proactively to limit the total amount of dirty data.
255 : pub(crate) struct GlobalResources {
256 : // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
257 : // Zero means unlimited.
258 : pub(crate) max_dirty_bytes: AtomicU64,
259 : // How many bytes are in all EphemeralFile objects
260 : dirty_bytes: AtomicU64,
261 : // How many layers are contributing to dirty_bytes
262 : dirty_layers: AtomicUsize,
263 : }
264 :
265 : // Per-timeline RAII struct for its contribution to [`GlobalResources`]
266 : struct GlobalResourceUnits {
267 : // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
268 : // for decrementing the global counter by this many bytes when dropped.
269 : dirty_bytes: u64,
270 : }
271 :
272 : impl GlobalResourceUnits {
273 : // Hint for the layer append path to update us when the layer size differs from the last
274 : // call to `publish_size` by this much. If we don't reach this threshold, we'll still get
275 : // updated when the Timeline "ticks" in the background.
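// For a sense of scale (editorial note, not part of the original source): with the
// 10 MiB drift limit below, a layer that grows by 8 KiB per put_batch call
// publishes its size roughly once every 10 MiB / 8 KiB = 1280 calls.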
276 : const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
277 :
278 2596 : fn new() -> Self {
279 2596 : GLOBAL_RESOURCES
280 2596 : .dirty_layers
281 2596 : .fetch_add(1, AtomicOrdering::Relaxed);
282 2596 : Self { dirty_bytes: 0 }
283 2596 : }
284 :
285 : /// Do not call this frequently: all timelines will write to these same global atomics,
286 : /// so this is a relatively expensive operation. Wait at least a few seconds between calls.
287 : ///
288 : /// Returns the effective layer size limit that should be applied, if any, to keep
289 : /// the total number of dirty bytes below the configured maximum.
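/// For illustration (editorial example with made-up numbers): with 10 dirty
/// layers and a global total of 1 GiB of dirty bytes above the configured
/// maximum, the returned limit is 1 GiB / 10 ≈ 102 MiB, so layers larger than
/// that become eligible for freezing the next time their size is published.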
290 2368 : fn publish_size(&mut self, size: u64) -> Option<u64> {
291 2368 : let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
292 2348 : Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
293 : Ordering::Greater => {
294 16 : let delta = size - self.dirty_bytes;
295 16 : let old = GLOBAL_RESOURCES
296 16 : .dirty_bytes
297 16 : .fetch_add(delta, AtomicOrdering::Relaxed);
298 16 : old + delta
299 : }
300 : Ordering::Less => {
301 4 : let delta = self.dirty_bytes - size;
302 4 : let old = GLOBAL_RESOURCES
303 4 : .dirty_bytes
304 4 : .fetch_sub(delta, AtomicOrdering::Relaxed);
305 4 : old - delta
306 : }
307 : };
308 :
309 : // This is a sloppy update: concurrent updates to the counter will race, and the
310 : // value of the metric might not be the exact latest value of `GLOBAL_RESOURCES.dirty_bytes`.
311 : // That's okay: as long as the metric contains some recent value, it doesn't have to always
312 : // be literally the last update.
313 2368 : TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
314 2368 :
315 2368 : self.dirty_bytes = size;
316 2368 :
317 2368 : let max_dirty_bytes = GLOBAL_RESOURCES
318 2368 : .max_dirty_bytes
319 2368 : .load(AtomicOrdering::Relaxed);
320 2368 : if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
321 : // Set the layer file limit to the average layer size: this implies that all above-average
322 : // sized layers will be eligible for freezing. They will be frozen in the order they
323 : // next enter publish_size.
324 0 : Some(
325 0 : new_global_dirty_bytes
326 0 : / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
327 0 : )
328 : } else {
329 2368 : None
330 : }
331 2368 : }
332 :
333 : // Call publish_size if the input size differs from the last published size by more than
334 : // the drift limit.
335 9608464 : fn maybe_publish_size(&mut self, size: u64) {
336 9608464 : let publish = match size.cmp(&self.dirty_bytes) {
337 0 : Ordering::Equal => false,
338 9608464 : Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
339 0 : Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
340 : };
341 :
342 9608464 : if publish {
343 16 : self.publish_size(size);
344 9608448 : }
345 9608464 : }
346 : }
347 :
348 : impl Drop for GlobalResourceUnits {
349 2352 : fn drop(&mut self) {
350 2352 : GLOBAL_RESOURCES
351 2352 : .dirty_layers
352 2352 : .fetch_sub(1, AtomicOrdering::Relaxed);
353 2352 :
354 2352 : // Subtract our contribution to the global total dirty bytes
355 2352 : self.publish_size(0);
356 2352 : }
357 : }
358 :
359 : pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
360 : max_dirty_bytes: AtomicU64::new(0),
361 : dirty_bytes: AtomicU64::new(0),
362 : dirty_layers: AtomicUsize::new(0),
363 : };
364 :
365 : impl InMemoryLayer {
366 1213369 : pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
367 1213369 : self.file_id
368 1213369 : }
369 :
370 2348 : pub(crate) fn get_timeline_id(&self) -> TimelineId {
371 2348 : self.timeline_id
372 2348 : }
373 :
374 4696 : pub(crate) fn info(&self) -> InMemoryLayerInfo {
375 4696 : let lsn_start = self.start_lsn;
376 :
377 4696 : if let Some(&lsn_end) = self.end_lsn.get() {
378 4696 : InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
379 : } else {
380 0 : InMemoryLayerInfo::Open { lsn_start }
381 : }
382 4696 : }
383 :
384 4696 : pub(crate) fn try_len(&self) -> Option<u64> {
385 4696 : self.inner.try_read().map(|i| i.file.len()).ok()
386 4696 : }
387 :
388 9608464 : pub(crate) fn assert_writable(&self) {
389 9608464 : assert!(self.end_lsn.get().is_none());
390 9608464 : }
391 :
392 4259930 : pub(crate) fn end_lsn_or_max(&self) -> Lsn {
393 4259930 : self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
394 4259930 : }
395 :
396 4257582 : pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
397 4257582 : self.start_lsn..self.end_lsn_or_max()
398 4257582 : }
399 :
400 : /// Debugging function to print out the contents of the layer.
401 : ///
402 : /// This is likely completely unused.
403 0 : pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
404 0 : let end_str = self.end_lsn_or_max();
405 0 :
406 0 : println!(
407 0 : "----- in-memory layer for tli {} LSNs {}-{} ----",
408 0 : self.timeline_id, self.start_lsn, end_str,
409 0 : );
410 0 :
411 0 : Ok(())
412 0 : }
413 :
414 : // Look up the keys in the provided keyspace and update
415 : // the reconstruct state with whatever is found.
416 1213369 : pub(crate) async fn get_values_reconstruct_data(
417 1213369 : self: &Arc<InMemoryLayer>,
418 1213369 : keyspace: KeySpace,
419 1213369 : end_lsn: Lsn,
420 1213369 : reconstruct_state: &mut ValuesReconstructState,
421 1213369 : ctx: &RequestContext,
422 1213369 : ) -> Result<(), GetVectoredError> {
423 1213369 : let ctx = RequestContextBuilder::extend(ctx)
424 1213369 : .page_content_kind(PageContentKind::InMemoryLayer)
425 1213369 : .build();
426 :
427 1213369 : let inner = self.inner.read().await;
428 :
429 : struct ValueRead {
430 : entry_lsn: Lsn,
431 : read: vectored_dio_read::LogicalRead<Vec<u8>>,
432 : }
433 1213369 : let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
434 1213369 : let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
435 1213369 :
436 1213369 : let lsn_range = self.start_lsn..end_lsn;
437 :
438 1213781 : for range in keyspace.ranges.iter() {
439 1213781 : for (key, vec_map) in inner
440 1213781 : .index
441 1213781 : .range(range.start.to_compact()..range.end.to_compact())
442 : {
443 998522 : let key = Key::from_compact(*key);
444 998522 : let slice = vec_map.slice_range(lsn_range.clone());
445 :
446 998522 : for (entry_lsn, index_entry) in slice.iter().rev() {
447 : let IndexEntryUnpacked {
448 998514 : pos,
449 998514 : len,
450 998514 : will_init,
451 998514 : } = index_entry.unpack();
452 998514 :
453 998514 : reads.entry(key).or_default().push(ValueRead {
454 998514 : entry_lsn: *entry_lsn,
455 998514 : read: vectored_dio_read::LogicalRead::new(
456 998514 : pos,
457 998514 : Vec::with_capacity(len as usize),
458 998514 : ),
459 998514 : });
460 998514 :
461 998514 : let io = reconstruct_state.update_key(&key, *entry_lsn, will_init);
462 998514 : ios.insert((key, *entry_lsn), io);
463 998514 :
464 998514 : if will_init {
465 998514 : break;
466 0 : }
467 : }
468 : }
469 : }
470 1213369 : drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
471 1213369 : let read_from = Arc::clone(self);
472 1213369 : let read_ctx = ctx.attached_child();
473 1213369 : reconstruct_state
474 1213369 : .spawn_io(async move {
475 1213369 : let inner = read_from.inner.read().await;
476 1213369 : let f = vectored_dio_read::execute(
477 1213369 : &inner.file,
478 1213369 : reads
479 1213369 : .iter()
480 1213369 : .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
481 1213369 : &read_ctx,
482 1213369 : );
483 1213369 : send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
484 1213369 : .await;
485 :
486 2211883 : for (key, value_reads) in reads {
487 1997028 : for ValueRead { entry_lsn, read } in value_reads {
488 998514 : let io = ios.remove(&(key, entry_lsn)).expect("sender must exist");
489 998514 : match read.into_result().expect("we run execute() above") {
490 0 : Err(e) => {
491 0 : io.complete(Err(std::io::Error::new(
492 0 : e.kind(),
493 0 : "dio vec read failed",
494 0 : )));
495 0 : }
496 998514 : Ok(value_buf) => {
497 998514 : io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into())));
498 998514 : }
499 : }
500 : }
501 : }
502 :
503 1213369 : assert!(ios.is_empty());
504 :
505 : // Keep the layer alive until this IO is done;
506 : // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
507 : // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
508 : // drop for consistency among all three layer types.
509 1213369 : drop(inner);
510 1213369 : drop(read_from);
511 1213369 : })
512 1213369 : .await;
513 :
514 1213369 : Ok(())
515 1213369 : }
516 : }
517 :
518 4696 : fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
519 4696 : write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
520 4696 : }
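// For example (editorial note, not part of the original source): start_lsn = 0x1690008
// and end_lsn = 0x1698000 render as "inmem-0000000001690008-0000000001698000".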
521 :
522 2348 : fn inmem_layer_log_display(
523 2348 : mut f: impl Write,
524 2348 : timeline: TimelineId,
525 2348 : start_lsn: Lsn,
526 2348 : end_lsn: Lsn,
527 2348 : ) -> std::fmt::Result {
528 2348 : write!(f, "timeline {} in-memory ", timeline)?;
529 2348 : inmem_layer_display(f, start_lsn, end_lsn)
530 2348 : }
531 :
532 : impl std::fmt::Display for InMemoryLayer {
533 2348 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
534 2348 : let end_lsn = self.end_lsn_or_max();
535 2348 : inmem_layer_display(f, self.start_lsn, end_lsn)
536 2348 : }
537 : }
538 :
539 : impl InMemoryLayer {
540 : /// Get layer size.
541 2596 : pub async fn size(&self) -> Result<u64> {
542 2596 : let inner = self.inner.read().await;
543 2596 : Ok(inner.file.len())
544 2596 : }
545 :
546 2413 : pub fn estimated_in_mem_size(&self) -> u64 {
547 2413 : self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
548 2413 : }
549 :
550 : /// Create a new, empty, in-memory layer
551 2596 : pub async fn create(
552 2596 : conf: &'static PageServerConf,
553 2596 : timeline_id: TimelineId,
554 2596 : tenant_shard_id: TenantShardId,
555 2596 : start_lsn: Lsn,
556 2596 : gate: &utils::sync::gate::Gate,
557 2596 : ctx: &RequestContext,
558 2596 : ) -> Result<InMemoryLayer> {
559 2596 : trace!(
560 0 : "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"
561 : );
562 :
563 2596 : let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?;
564 2596 : let key = InMemoryLayerFileId(file.page_cache_file_id());
565 2596 :
566 2596 : Ok(InMemoryLayer {
567 2596 : file_id: key,
568 2596 : frozen_local_path_str: OnceLock::new(),
569 2596 : conf,
570 2596 : timeline_id,
571 2596 : tenant_shard_id,
572 2596 : start_lsn,
573 2596 : end_lsn: OnceLock::new(),
574 2596 : opened_at: Instant::now(),
575 2596 : inner: RwLock::new(InMemoryLayerInner {
576 2596 : index: BTreeMap::new(),
577 2596 : file,
578 2596 : resource_units: GlobalResourceUnits::new(),
579 2596 : }),
580 2596 : estimated_in_mem_size: AtomicU64::new(0),
581 2596 : })
582 2596 : }
583 :
584 : /// Write path.
585 : ///
586 : /// Errors are not retryable: the [`InMemoryLayer`] must be discarded and must not be read from.
587 : /// The reason they are not retryable is that the [`EphemeralFile`] writes are not retryable.
588 : /// TODO: this could be made retryable if we aborted the process on EphemeralFile write errors.
589 9608464 : pub async fn put_batch(
590 9608464 : &self,
591 9608464 : serialized_batch: SerializedValueBatch,
592 9608464 : ctx: &RequestContext,
593 9608464 : ) -> anyhow::Result<()> {
594 9608464 : let mut inner = self.inner.write().await;
595 9608464 : self.assert_writable();
596 9608464 :
597 9608464 : let base_offset = inner.file.len();
598 9608464 :
599 9608464 : let SerializedValueBatch {
600 9608464 : raw,
601 9608464 : metadata,
602 9608464 : max_lsn: _,
603 9608464 : len: _,
604 9608464 : } = serialized_batch;
605 9608464 :
606 9608464 : // Write the batch to the file
607 9608464 : inner.file.write_raw(&raw, ctx).await?;
608 9608464 : let new_size = inner.file.len();
609 9608464 :
610 9608464 : let expected_new_len = base_offset
611 9608464 : .checked_add(raw.len().into_u64())
612 9608464 : // write_raw would error if we were to overflow u64.
613 9608464 : // also IndexEntry and higher levels in
614 9608464 : // the code don't allow the file to grow that large
615 9608464 : .unwrap();
616 9608464 : assert_eq!(new_size, expected_new_len);
617 :
618 : // Update the index with the new entries
619 19789824 : for meta in metadata {
620 : let SerializedValueMeta {
621 10181360 : key,
622 10181360 : lsn,
623 10181360 : batch_offset,
624 10181360 : len,
625 10181360 : will_init,
626 10181360 : } = match meta {
627 10181360 : ValueMeta::Serialized(ser) => ser,
628 : ValueMeta::Observed(_) => {
629 0 : continue;
630 : }
631 : };
632 :
633 : // Add the base_offset to the batch's index entries, which are relative to the batch start.
634 10181360 : let index_entry = IndexEntry::new(IndexEntryNewArgs {
635 10181360 : base_offset,
636 10181360 : batch_offset,
637 10181360 : len,
638 10181360 : will_init,
639 10181360 : })?;
640 :
641 10181360 : let vec_map = inner.index.entry(key).or_default();
642 10181360 : let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
643 10181360 : if old.is_some() {
644 : // This should not break anything, but is unexpected: ingestion code aims to filter out
645 : // multiple writes to the same key at the same LSN. This happens in cases where our
646 : // ingestion code generates some write like an empty page, and we see a write from postgres
647 : // to the same key in the same wal record. If one such write makes it through, we
648 : // index the most recent write, implicitly ignoring the earlier write. We log a warning
649 : // because this case is unexpected, and we would like tests to fail if this happens.
650 0 : warn!("Key {} at {} written twice at same LSN", key, lsn);
651 10181360 : }
652 10181360 : self.estimated_in_mem_size.fetch_add(
653 10181360 : (std::mem::size_of::<CompactKey>()
654 10181360 : + std::mem::size_of::<Lsn>()
655 10181360 : + std::mem::size_of::<IndexEntry>()) as u64,
656 10181360 : AtomicOrdering::Relaxed,
657 10181360 : );
658 : }
659 :
660 9608464 : inner.resource_units.maybe_publish_size(new_size);
661 9608464 :
662 9608464 : Ok(())
663 9608464 : }
664 :
665 9606020 : pub(crate) fn get_opened_at(&self) -> Instant {
666 9606020 : self.opened_at
667 9606020 : }
668 :
669 0 : pub(crate) async fn tick(&self) -> Option<u64> {
670 0 : let mut inner = self.inner.write().await;
671 0 : let size = inner.file.len();
672 0 : inner.resource_units.publish_size(size)
673 0 : }
674 :
675 4 : pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
676 4 : // TODO: Currently, we just leak the storage for any deleted keys
677 4 : Ok(())
678 4 : }
679 :
680 : /// Records the end_lsn for non-dropped layers.
681 : /// `end_lsn` is exclusive
682 2348 : pub async fn freeze(&self, end_lsn: Lsn) {
683 2348 : assert!(
684 2348 : self.start_lsn < end_lsn,
685 0 : "{} >= {}",
686 : self.start_lsn,
687 : end_lsn
688 : );
689 2348 : self.end_lsn.set(end_lsn).expect("end_lsn set only once");
690 2348 :
691 2348 : self.frozen_local_path_str
692 2348 : .set({
693 2348 : let mut buf = String::new();
694 2348 : inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
695 2348 : .unwrap();
696 2348 : buf.into()
697 2348 : })
698 2348 : .expect("frozen_local_path_str set only once");
699 :
700 : #[cfg(debug_assertions)]
701 : {
702 2348 : let inner = self.inner.write().await;
703 8512055 : for vec_map in inner.index.values() {
704 8773880 : for (lsn, _) in vec_map.as_slice() {
705 8773880 : assert!(*lsn < end_lsn);
706 : }
707 : }
708 : }
709 2348 : }
710 :
711 : /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
712 : /// layer will only contain the specified key range; `None` may be returned
713 : /// if there are no matching keys.
714 : ///
715 : /// Returns a new delta layer with all the same data as this in-memory layer.
716 1936 : pub async fn write_to_disk(
717 1936 : &self,
718 1936 : ctx: &RequestContext,
719 1936 : key_range: Option<Range<Key>>,
720 1936 : l0_flush_global_state: &l0_flush::Inner,
721 1936 : ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
722 : // Grab the lock in read-mode. We hold it over the I/O, but because this
723 : // layer is not writeable anymore, no one should be trying to acquire the
724 : // write lock on it, so we shouldn't block anyone. There's one exception
725 : // though: another thread might have grabbed a reference to this layer
726 : // in `get_layer_for_write` just before the checkpointer called
727 : // `freeze`, and then `write_to_disk` on it. When the thread gets the
728 : // lock, it will see that it's not writeable anymore and retry, but it
729 : // would have to wait until we release it. That race condition is very
730 : // rare though, so we just accept the potential latency hit for now.
731 1936 : let inner = self.inner.read().await;
732 :
733 : use l0_flush::Inner;
734 1936 : let _concurrency_permit = match l0_flush_global_state {
735 1936 : Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
736 : };
737 :
738 1936 : let end_lsn = *self.end_lsn.get().unwrap();
739 :
740 1936 : let key_count = if let Some(key_range) = key_range {
741 0 : let key_range = key_range.start.to_compact()..key_range.end.to_compact();
742 0 :
743 0 : inner
744 0 : .index
745 0 : .iter()
746 0 : .filter(|(k, _)| key_range.contains(k))
747 0 : .count()
748 : } else {
749 1936 : inner.index.len()
750 : };
751 1936 : if key_count == 0 {
752 0 : return Ok(None);
753 1936 : }
754 :
755 1936 : let mut delta_layer_writer = DeltaLayerWriter::new(
756 1936 : self.conf,
757 1936 : self.timeline_id,
758 1936 : self.tenant_shard_id,
759 1936 : Key::MIN,
760 1936 : self.start_lsn..end_lsn,
761 1936 : ctx,
762 1936 : )
763 1936 : .await?;
764 :
765 1936 : match l0_flush_global_state {
766 : l0_flush::Inner::Direct { .. } => {
767 1936 : let file_contents = inner.file.load_to_io_buf(ctx).await?;
768 1936 : let file_contents = file_contents.freeze();
769 :
770 8509207 : for (key, vec_map) in inner.index.iter() {
771 : // Write all page versions
772 8771032 : for (lsn, entry) in vec_map
773 8509207 : .as_slice()
774 8509207 : .iter()
775 8771032 : .map(|(lsn, entry)| (lsn, entry.unpack()))
776 : {
777 : let IndexEntryUnpacked {
778 8771032 : pos,
779 8771032 : len,
780 8771032 : will_init,
781 8771032 : } = entry;
782 8771032 : let buf = file_contents.slice(pos as usize..(pos + len) as usize);
783 8771032 : let (_buf, res) = delta_layer_writer
784 8771032 : .put_value_bytes(
785 8771032 : Key::from_compact(*key),
786 8771032 : *lsn,
787 8771032 : buf.slice_len(),
788 8771032 : will_init,
789 8771032 : ctx,
790 8771032 : )
791 8771032 : .await;
792 8771032 : res?;
793 : }
794 : }
795 : }
796 : }
797 :
798 : // MAX is used here because we identify L0 layers by full key range
799 1936 : let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
800 :
801 : // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
802 : //
803 : // If we didn't, and our caller dropped this future, tokio-epoll-uring would extend the lifetime of
804 : // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
805 : // Thus, we'd have more concurrent `Vec<u8>` buffers in existence than the semaphore allows.
806 : //
807 : // We hold it across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
808 : // we dirtied when writing to the filesystem have been flushed and marked !dirty.
809 1936 : drop(_concurrency_permit);
810 1936 :
811 1936 : Ok(Some((desc, path)))
812 1936 : }
813 : }
814 :
815 : #[cfg(test)]
816 : mod tests {
817 : use super::*;
818 :
819 : #[test]
820 4 : fn test_index_entry() {
821 : const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
822 : use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked};
823 :
824 80 : let roundtrip = |args, expect: Unpacked| {
825 80 : let res = IndexEntry::new(args).expect("this tests expects no errors");
826 80 : let IndexEntryUnpacked {
827 80 : will_init,
828 80 : len,
829 80 : pos,
830 80 : } = res.unpack();
831 80 : assert_eq!(will_init, expect.will_init);
832 80 : assert_eq!(len, expect.len);
833 80 : assert_eq!(pos, expect.pos);
834 80 : };
835 :
836 : // basic roundtrip
837 12 : for pos in [0, MAX_SUPPORTED_POS] {
838 24 : for len in [0, MAX_SUPPORTED_BLOB_LEN] {
839 48 : for will_init in [true, false] {
840 32 : let expect = Unpacked {
841 32 : will_init,
842 32 : len: len.into_u64(),
843 32 : pos: pos.into_u64(),
844 32 : };
845 32 : roundtrip(
846 32 : Args {
847 32 : will_init,
848 32 : base_offset: pos.into_u64(),
849 32 : batch_offset: 0,
850 32 : len,
851 32 : },
852 32 : expect,
853 32 : );
854 32 : roundtrip(
855 32 : Args {
856 32 : will_init,
857 32 : base_offset: 0,
858 32 : batch_offset: pos.into_u64(),
859 32 : len,
860 32 : },
861 32 : expect,
862 32 : );
863 32 : }
864 : }
865 : }
866 :
867 : // too-large len
868 4 : let too_large = Args {
869 4 : will_init: false,
870 4 : len: MAX_SUPPORTED_BLOB_LEN + 1,
871 4 : base_offset: 0,
872 4 : batch_offset: 0,
873 4 : };
874 4 : assert!(IndexEntry::new(too_large).is_err());
875 :
876 : // too-large pos
877 : {
878 4 : let too_large = Args {
879 4 : will_init: false,
880 4 : len: 0,
881 4 : base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
882 4 : batch_offset: 0,
883 4 : };
884 4 : assert!(IndexEntry::new(too_large).is_err());
885 4 : let too_large = Args {
886 4 : will_init: false,
887 4 : len: 0,
888 4 : base_offset: 0,
889 4 : batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
890 4 : };
891 4 : assert!(IndexEntry::new(too_large).is_err());
892 : }
893 :
894 : // too large (base_offset + batch_offset)
895 : {
896 4 : let too_large = Args {
897 4 : will_init: false,
898 4 : len: 0,
899 4 : base_offset: MAX_SUPPORTED_POS.into_u64(),
900 4 : batch_offset: 1,
901 4 : };
902 4 : assert!(IndexEntry::new(too_large).is_err());
903 4 : let too_large = Args {
904 4 : will_init: false,
905 4 : len: 0,
906 4 : base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
907 4 : batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
908 4 : };
909 4 : assert!(IndexEntry::new(too_large).is_err());
910 : }
911 :
912 : // valid special cases
913 : // - area past the max supported pos that is accessible by len
914 12 : for len in [1, MAX_SUPPORTED_BLOB_LEN] {
915 8 : roundtrip(
916 8 : Args {
917 8 : will_init: false,
918 8 : len,
919 8 : base_offset: MAX_SUPPORTED_POS.into_u64(),
920 8 : batch_offset: 0,
921 8 : },
922 8 : Unpacked {
923 8 : will_init: false,
924 8 : len: len as u64,
925 8 : pos: MAX_SUPPORTED_POS.into_u64(),
926 8 : },
927 8 : );
928 8 : roundtrip(
929 8 : Args {
930 8 : will_init: false,
931 8 : len,
932 8 : base_offset: 0,
933 8 : batch_offset: MAX_SUPPORTED_POS.into_u64(),
934 8 : },
935 8 : Unpacked {
936 8 : will_init: false,
937 8 : len: len as u64,
938 8 : pos: MAX_SUPPORTED_POS.into_u64(),
939 8 : },
940 8 : );
941 8 : }
942 4 : }
943 : }