Line data Source code
1 : //! An in-memory layer stores recently received key-value pairs.
2 : //!
3 : //! The "in-memory" part of the name is a bit misleading: the actual page versions are
4 : //! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
5 : //! its position in the file, is kept in memory, though.
6 : //!
7 : use std::cmp::Ordering;
8 : use std::collections::{BTreeMap, HashMap};
9 : use std::fmt::Write;
10 : use std::ops::Range;
11 : use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering};
12 : use std::sync::{Arc, OnceLock};
13 : use std::time::Instant;
14 :
15 : use anyhow::Result;
16 : use camino::Utf8PathBuf;
17 : use pageserver_api::key::{CompactKey, Key};
18 : use pageserver_api::keyspace::KeySpace;
19 : use pageserver_api::models::InMemoryLayerInfo;
20 : use pageserver_api::shard::TenantShardId;
21 : use tokio::sync::RwLock;
22 : use tokio_util::sync::CancellationToken;
23 : use tracing::*;
24 : use utils::id::TimelineId;
25 : use utils::lsn::Lsn;
26 : use utils::vec_map::VecMap;
27 : use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
28 :
29 : use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
30 : use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize};
31 : use crate::config::PageServerConf;
32 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
33 : // avoid binding to Write (conflicts with std::io::Write)
34 : // while being able to use std::fmt::Write's methods
35 : use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
36 : use crate::tenant::ephemeral_file::EphemeralFile;
37 : use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
38 : use crate::tenant::timeline::GetVectoredError;
39 : use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
40 : use crate::{l0_flush, page_cache};
41 :
42 : pub(crate) mod vectored_dio_read;
43 :
44 : #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
45 : pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
46 :
47 : pub struct InMemoryLayer {
48 : conf: &'static PageServerConf,
49 : tenant_shard_id: TenantShardId,
50 : timeline_id: TimelineId,
51 : file_id: InMemoryLayerFileId,
52 :
53 : /// This layer contains all the changes from 'start_lsn'. The
54 : /// start is inclusive.
55 : start_lsn: Lsn,
56 :
57 : /// Frozen layers have an exclusive end LSN.
58 : /// Writes are only allowed when this is `None`.
59 : pub(crate) end_lsn: OnceLock<Lsn>,
60 :
61 :     /// Used for the traversal path. Cached string representation of the in-memory layer, set when it is frozen.
62 : frozen_local_path_str: OnceLock<Arc<str>>,
63 :
64 : opened_at: Instant,
65 :
66 : /// All versions of all pages in the layer are kept here. Indexed
67 : /// by block number and LSN. The [`IndexEntry`] is an offset into the
68 : /// ephemeral file where the page version is stored.
69 : ///
70 : /// We use a separate lock for the index to reduce the critical section
71 : /// during which reads cannot be planned.
72 : ///
73 : /// If you need access to both the index and the underlying file at the same time,
74 : /// respect the following locking order to avoid deadlocks:
75 : /// 1. [`InMemoryLayer::inner`]
76 : /// 2. [`InMemoryLayer::index`]
77 : ///
78 : /// Note that the file backing [`InMemoryLayer::inner`] is append-only,
79 :     /// so it is not necessary to hold the index and file locks at the same time.
80 :     /// This avoids holding the index lock across IO, which is crucial for avoiding read tail latency.
81 : /// In particular:
82 : /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
83 : /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
84 : index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
85 :
86 : /// The above fields never change, except for `end_lsn`, which is only set once,
87 : /// and `index` (see rationale there).
88 :     /// All other changing parts are in `inner`, protected by a lock.
89 : inner: RwLock<InMemoryLayerInner>,
90 :
91 : estimated_in_mem_size: AtomicU64,
92 : }
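// --- Added example (not part of the original source) ---
// A minimal sketch of the lock-ordering rule documented on [`InMemoryLayer::index`]
// above: snapshot what you need from the index, release that lock, and only then take
// the `inner` lock for file IO. The file is append-only, so offsets read from the
// index stay valid. The function name and body are illustrative only.
#[allow(dead_code)]
async fn locking_order_sketch(layer: &InMemoryLayer) {
    // 1. Plan the read under the index lock only.
    let first_entry: Option<IndexEntry> = {
        let index = layer.index.read().await;
        index
            .values()
            .next()
            .and_then(|vec_map| vec_map.as_slice().first().map(|(_lsn, entry)| *entry))
    }; // index lock released here, before any IO
    // 2. Perform IO under the inner lock; the index lock is never held across IO.
    let inner = layer.inner.read().await;
    let _ = (first_entry, inner.file.len());
}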
93 :
94 : impl std::fmt::Debug for InMemoryLayer {
95 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96 0 : f.debug_struct("InMemoryLayer")
97 0 : .field("start_lsn", &self.start_lsn)
98 0 : .field("end_lsn", &self.end_lsn)
99 0 : .field("inner", &self.inner)
100 0 : .finish()
101 0 : }
102 : }
103 :
104 : pub struct InMemoryLayerInner {
105 : /// The values are stored in a serialized format in this file.
106 : /// Each serialized Value is preceded by a 'u32' length field.
107 :     /// The [`InMemoryLayer::index`] map stores offsets into this file.
108 : file: EphemeralFile,
109 :
110 : resource_units: GlobalResourceUnits,
111 : }
112 :
113 : /// Support the same max blob length as blob_io, because ultimately
114 : /// all the InMemoryLayer contents end up being written into a delta layer,
115 : /// using [`crate::tenant::blob_io`].
116 : const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
117 : const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
118 : let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
119 : let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
120 : assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
121 : trailing_ones
122 : };
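// --- Added example (not part of the original source) ---
// Worked example for the const block above: it only evaluates cleanly when
// MAX_SUPPORTED_BLOB_LEN is of the form 2^n - 1 (all ones in the low bits), in which
// case `trailing_ones` is exactly the number of bits needed for `len`. The limit of
// 0x0FFF_FFFF used below is hypothetical, chosen only to make the arithmetic visible;
// the real value comes from `crate::tenant::blob_io`.
#[cfg(test)]
#[test]
fn blob_len_bits_arithmetic_sketch() {
    let hypothetical_limit: usize = 0x0FFF_FFFF;
    let trailing_ones = hypothetical_limit.trailing_ones() as usize;
    let leading_zeroes = hypothetical_limit.leading_zeros() as usize;
    // The two counts cover the whole word, i.e. there is no stray zero bit in the middle.
    assert_eq!(trailing_ones + leading_zeroes, std::mem::size_of::<usize>() * 8);
    // A limit of 2^28 - 1 needs 28 bits for `len`.
    assert_eq!(trailing_ones, 28);
}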
123 :
124 : /// See [`InMemoryLayer::index`].
125 : ///
126 : /// For memory efficiency, the data is packed into a u64.
127 : ///
128 : /// Layout:
129 : /// - 1 bit: `will_init`
130 : /// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
131 : /// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
132 : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
133 : pub struct IndexEntry(u64);
134 :
135 : impl IndexEntry {
136 : /// See [`Self::MAX_SUPPORTED_POS`].
137 : const MAX_SUPPORTED_POS_BITS: usize = {
138 : let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
139 : if remainder < 32 {
140 : panic!("pos can be u32 as per type system, support that");
141 : }
142 : remainder
143 : };
144 : /// The maximum supported blob offset that can be represented by [`Self`].
145 : /// See also [`Self::validate_checkpoint_distance`].
146 : const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
147 :
148 : // Layout
149 : const WILL_INIT_RANGE: Range<usize> = 0..1;
150 : const LEN_RANGE: Range<usize> =
151 : Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
152 : const POS_RANGE: Range<usize> =
153 : Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
154 : const _ASSERT: () = {
155 : if Self::POS_RANGE.end != 64 {
156 : panic!("we don't want undefined bits for our own sanity")
157 : }
158 : };
159 :
160 : /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
161 : ///
162 :     /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too large.
163 : /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
164 : ///
165 : /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
166 : /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
167 : ///
168 : /// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested)
169 : /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer.
170 : #[inline(always)]
171 2565230 : fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
172 2565230 : let IndexEntryNewArgs {
173 2565230 : base_offset,
174 2565230 : batch_offset,
175 2565230 : len,
176 2565230 : will_init,
177 2565230 : } = arg;
178 :
179 2565230 : let pos = base_offset
180 2565230 : .checked_add(batch_offset)
181 2565230 : .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
182 :
183 2565230 : if pos.into_usize() > Self::MAX_SUPPORTED_POS {
184 4 : anyhow::bail!(
185 4 : "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
186 4 : max = Self::MAX_SUPPORTED_POS
187 4 : );
188 2565226 : }
189 2565226 :
190 2565226 : if len > MAX_SUPPORTED_BLOB_LEN {
191 1 : anyhow::bail!(
192 1 : "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
193 1 : );
194 2565225 : }
195 2565225 :
196 2565225 : let mut data: u64 = 0;
197 : use bit_field::BitField;
198 2565225 : data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
199 2565225 : data.set_bits(Self::LEN_RANGE, len.into_u64());
200 2565225 : data.set_bits(Self::POS_RANGE, pos);
201 2565225 :
202 2565225 : Ok(Self(data))
203 2565230 : }
204 :
205 : #[inline(always)]
206 3149462 : fn unpack(&self) -> IndexEntryUnpacked {
207 : use bit_field::BitField;
208 3149462 : IndexEntryUnpacked {
209 3149462 : will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
210 3149462 : len: self.0.get_bits(Self::LEN_RANGE),
211 3149462 : pos: self.0.get_bits(Self::POS_RANGE),
212 3149462 : }
213 3149462 : }
214 :
215 : /// See [`Self::new`].
216 126 : pub(crate) const fn validate_checkpoint_distance(
217 126 : checkpoint_distance: u64,
218 126 : ) -> Result<(), &'static str> {
219 126 : if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
220 0 : return Err("exceeds the maximum supported value");
221 126 : }
222 126 : let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
223 126 : if res.is_none() {
224 0 : return Err(
225 0 : "checkpoint distance + max supported blob len overflows in-memory addition",
226 0 : );
227 126 : }
228 126 :
229 126 : // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
230 126 :
231 126 : Ok(())
232 126 : }
233 :
234 : const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
235 : let res = Self::validate_checkpoint_distance(
236 : pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
237 : );
238 : if res.is_err() {
239 : panic!("default checkpoint distance is valid")
240 : }
241 : };
242 : }
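// --- Added example (not part of the original source) ---
// Illustrative usage of [`IndexEntry::validate_checkpoint_distance`], following the
// doc comment on [`IndexEntry::new`]: validate a (hypothetical) new checkpoint
// distance before applying it, so that `IndexEntry::new` can never fail later because
// the ephemeral file grew past the maximum representable offset.
#[cfg(test)]
#[test]
fn validate_checkpoint_distance_usage_sketch() {
    // Hypothetical value a config change might propose.
    let proposed_checkpoint_distance: u64 = 256 * 1024 * 1024;
    IndexEntry::validate_checkpoint_distance(proposed_checkpoint_distance)
        .expect("256 MiB is well below the maximum supported offset");
}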
243 :
244 : /// Args to [`IndexEntry::new`].
245 : #[derive(Clone, Copy)]
246 : struct IndexEntryNewArgs {
247 : base_offset: u64,
248 : batch_offset: u64,
249 : len: usize,
250 : will_init: bool,
251 : }
252 :
253 : /// Unpacked representation of the bitfielded [`IndexEntry`].
254 : #[derive(Clone, Copy, PartialEq, Eq, Debug)]
255 : struct IndexEntryUnpacked {
256 : will_init: bool,
257 : len: u64,
258 : pos: u64,
259 : }
260 :
261 : impl std::fmt::Debug for InMemoryLayerInner {
262 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
263 0 : f.debug_struct("InMemoryLayerInner").finish()
264 0 : }
265 : }
266 :
267 : /// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
268 : /// to minimize contention.
269 : ///
270 : /// This global state is used to implement behaviors that require a global view of the system, e.g.
271 : /// rolling layers proactively to limit the total amount of dirty data.
272 : pub(crate) struct GlobalResources {
273 : // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
274 : // Zero means unlimited.
275 : pub(crate) max_dirty_bytes: AtomicU64,
276 : // How many bytes are in all EphemeralFile objects
277 : dirty_bytes: AtomicU64,
278 : // How many layers are contributing to dirty_bytes
279 : dirty_layers: AtomicUsize,
280 : }
281 :
282 : // Per-timeline RAII struct for its contribution to [`GlobalResources`]
283 : struct GlobalResourceUnits {
284 : // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
285 : // for decrementing the global counter by this many bytes when dropped.
286 : dirty_bytes: u64,
287 : }
288 :
289 : impl GlobalResourceUnits {
290 : // Hint for the layer append path to update us when the layer size differs from the last
291 : // call to update_size by this much. If we don't reach this threshold, we'll still get
292 :     // call to publish_size by this much. If we don't reach this threshold, we'll still get
293 : const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
294 :
295 661 : fn new() -> Self {
296 661 : GLOBAL_RESOURCES
297 661 : .dirty_layers
298 661 : .fetch_add(1, AtomicOrdering::Relaxed);
299 661 : Self { dirty_bytes: 0 }
300 661 : }
301 :
302 : /// Do not call this frequently: all timelines will write to these same global atomics,
303 : /// so this is a relatively expensive operation. Wait at least a few seconds between calls.
304 : ///
305 : /// Returns the effective layer size limit that should be applied, if any, to keep
306 : /// the total number of dirty bytes below the configured maximum.
307 599 : fn publish_size(&mut self, size: u64) -> Option<u64> {
308 599 : let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
309 594 : Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
310 : Ordering::Greater => {
311 4 : let delta = size - self.dirty_bytes;
312 4 : let old = GLOBAL_RESOURCES
313 4 : .dirty_bytes
314 4 : .fetch_add(delta, AtomicOrdering::Relaxed);
315 4 : old + delta
316 : }
317 : Ordering::Less => {
318 1 : let delta = self.dirty_bytes - size;
319 1 : let old = GLOBAL_RESOURCES
320 1 : .dirty_bytes
321 1 : .fetch_sub(delta, AtomicOrdering::Relaxed);
322 1 : old - delta
323 : }
324 : };
325 :
326 : // This is a sloppy update: concurrent updates to the counter will race, and the exact
327 : // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
328 : // That's okay: as long as the metric contains some recent value, it doesn't have to always
329 : // be literally the last update.
330 599 : TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
331 599 :
332 599 : self.dirty_bytes = size;
333 599 :
334 599 : let max_dirty_bytes = GLOBAL_RESOURCES
335 599 : .max_dirty_bytes
336 599 : .load(AtomicOrdering::Relaxed);
337 599 : if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
338 : // Set the layer file limit to the average layer size: this implies that all above-average
339 :         // sized layers will be eligible for freezing. They will be frozen in the order they
340 : // next enter publish_size.
341 0 : Some(
342 0 : new_global_dirty_bytes
343 0 : / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
344 0 : )
345 : } else {
346 599 : None
347 : }
348 599 : }
349 :
350 : // Call publish_size if the input size differs from last published size by more than
351 : // the drift limit
352 2402128 : fn maybe_publish_size(&mut self, size: u64) {
353 2402128 : let publish = match size.cmp(&self.dirty_bytes) {
354 0 : Ordering::Equal => false,
355 2402128 : Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
356 0 : Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
357 : };
358 :
359 2402128 : if publish {
360 4 : self.publish_size(size);
361 2402124 : }
362 2402128 : }
363 : }
364 :
365 : impl Drop for GlobalResourceUnits {
366 595 : fn drop(&mut self) {
367 595 : GLOBAL_RESOURCES
368 595 : .dirty_layers
369 595 : .fetch_sub(1, AtomicOrdering::Relaxed);
370 595 :
371 595 : // Subtract our contribution to the global total dirty bytes
372 595 : self.publish_size(0);
373 595 : }
374 : }
375 :
376 : pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
377 : max_dirty_bytes: AtomicU64::new(0),
378 : dirty_bytes: AtomicU64::new(0),
379 : dirty_layers: AtomicUsize::new(0),
380 : };
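// --- Added example (not part of the original source) ---
// A worked example of the proactive-rolling arithmetic in `publish_size` above: once
// the global dirty bytes exceed `max_dirty_bytes`, the returned limit is the average
// layer size, so every above-average layer becomes eligible for freezing the next
// time it publishes its size. The numbers below are hypothetical.
#[cfg(test)]
#[test]
fn publish_size_limit_arithmetic_sketch() {
    let new_global_dirty_bytes: u64 = 800 * 1024 * 1024; // total dirty data
    let max_dirty_bytes: u64 = 512 * 1024 * 1024; // configured ceiling
    let dirty_layers: u64 = 4; // open ephemeral layers
    let limit = if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
        Some(new_global_dirty_bytes / dirty_layers)
    } else {
        None
    };
    // 800 MiB over 4 layers: layers larger than 200 MiB should be frozen.
    assert_eq!(limit, Some(200 * 1024 * 1024));
}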
381 :
382 : impl InMemoryLayer {
383 311044 : pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
384 311044 : self.file_id
385 311044 : }
386 :
387 597 : pub(crate) fn get_timeline_id(&self) -> TimelineId {
388 597 : self.timeline_id
389 597 : }
390 :
391 1190 : pub(crate) fn info(&self) -> InMemoryLayerInfo {
392 1190 : let lsn_start = self.start_lsn;
393 :
394 1190 : if let Some(&lsn_end) = self.end_lsn.get() {
395 1189 : InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
396 : } else {
397 1 : InMemoryLayerInfo::Open { lsn_start }
398 : }
399 1190 : }
400 :
401 1186 : pub(crate) fn try_len(&self) -> Option<u64> {
402 1186 : self.inner.try_read().map(|i| i.file.len()).ok()
403 1186 : }
404 :
405 2402128 : pub(crate) fn assert_writable(&self) {
406 2402128 : assert!(self.end_lsn.get().is_none());
407 2402128 : }
408 :
409 1212296 : pub(crate) fn end_lsn_or_max(&self) -> Lsn {
410 1212296 : self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
411 1212296 : }
412 :
413 1211703 : pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
414 1211703 : self.start_lsn..self.end_lsn_or_max()
415 1211703 : }
416 :
417 : /// debugging function to print out the contents of the layer
418 : ///
419 :     /// this is likely completely unused
420 0 : pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
421 0 : let end_str = self.end_lsn_or_max();
422 0 :
423 0 : println!(
424 0 : "----- in-memory layer for tli {} LSNs {}-{} ----",
425 0 : self.timeline_id, self.start_lsn, end_str,
426 0 : );
427 0 :
428 0 : Ok(())
429 0 : }
430 :
431 : // Look up the keys in the provided keyspace and update
432 : // the reconstruct state with whatever is found.
433 307204 : pub(crate) async fn get_values_reconstruct_data(
434 307204 : self: &Arc<InMemoryLayer>,
435 307204 : keyspace: KeySpace,
436 307204 : lsn_range: Range<Lsn>,
437 307204 : reconstruct_state: &mut ValuesReconstructState,
438 307204 : ctx: &RequestContext,
439 307204 : ) -> Result<(), GetVectoredError> {
440 307204 : let ctx = RequestContextBuilder::from(ctx)
441 307204 : .page_content_kind(PageContentKind::InMemoryLayer)
442 307204 : .attached_child();
443 :
444 307204 : let index = self.index.read().await;
445 :
446 : struct ValueRead {
447 : entry_lsn: Lsn,
448 : read: vectored_dio_read::LogicalRead<Vec<u8>>,
449 : }
450 307204 : let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
451 307204 : let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
452 :
453 310851 : for range in keyspace.ranges.iter() {
454 310851 : for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
455 263874 : let key = Key::from_compact(*key);
456 263874 : let slice = vec_map.slice_range(lsn_range.clone());
457 :
458 956684 : for (entry_lsn, index_entry) in slice.iter().rev() {
459 : let IndexEntryUnpacked {
460 956684 : pos,
461 956684 : len,
462 956684 : will_init,
463 956684 : } = index_entry.unpack();
464 956684 :
465 956684 : reads.entry(key).or_default().push(ValueRead {
466 956684 : entry_lsn: *entry_lsn,
467 956684 : read: vectored_dio_read::LogicalRead::new(
468 956684 : pos,
469 956684 : Vec::with_capacity(len as usize),
470 956684 : ),
471 956684 : });
472 956684 :
473 956684 : let io = reconstruct_state.update_key(&key, *entry_lsn, will_init);
474 956684 : ios.insert((key, *entry_lsn), io);
475 956684 :
476 956684 : if will_init {
477 253522 : break;
478 703162 : }
479 : }
480 : }
481 : }
482 307204 : drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
483 307204 : let read_from = Arc::clone(self);
484 307204 : let read_ctx = ctx.attached_child();
485 307204 : reconstruct_state
486 307204 : .spawn_io(async move {
487 307204 : let inner = read_from.inner.read().await;
488 307204 : let f = vectored_dio_read::execute(
489 307204 : &inner.file,
490 307204 : reads
491 307204 : .iter()
492 956684 : .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
493 307204 : &read_ctx,
494 307204 : );
495 307204 : send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
496 307204 : .await;
497 :
498 571076 : for (key, value_reads) in reads {
499 1220556 : for ValueRead { entry_lsn, read } in value_reads {
500 956684 : let io = ios.remove(&(key, entry_lsn)).expect("sender must exist");
501 956684 : match read.into_result().expect("we run execute() above") {
502 0 : Err(e) => {
503 0 : io.complete(Err(std::io::Error::new(
504 0 : e.kind(),
505 0 : "dio vec read failed",
506 0 : )));
507 0 : }
508 956684 : Ok(value_buf) => {
509 956684 : io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into())));
510 956684 : }
511 : }
512 : }
513 : }
514 :
515 307204 : assert!(ios.is_empty());
516 :
517 : // Keep layer existent until this IO is done;
518 : // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
519 : // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
520 : // drop for consistency among all three layer types.
521 307204 : drop(inner);
522 307204 : drop(read_from);
523 307204 : })
524 307204 : .await;
525 :
526 307204 : Ok(())
527 307204 : }
528 : }
529 :
530 1190 : fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
531 1190 : write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
532 1190 : }
533 :
534 597 : fn inmem_layer_log_display(
535 597 : mut f: impl Write,
536 597 : timeline: TimelineId,
537 597 : start_lsn: Lsn,
538 597 : end_lsn: Lsn,
539 597 : ) -> std::fmt::Result {
540 597 : write!(f, "timeline {} in-memory ", timeline)?;
541 597 : inmem_layer_display(f, start_lsn, end_lsn)
542 597 : }
543 :
544 : impl std::fmt::Display for InMemoryLayer {
545 593 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
546 593 : let end_lsn = self.end_lsn_or_max();
547 593 : inmem_layer_display(f, self.start_lsn, end_lsn)
548 593 : }
549 : }
550 :
551 : impl InMemoryLayer {
552 : /// Get layer size.
553 656 : pub async fn size(&self) -> Result<u64> {
554 656 : let inner = self.inner.read().await;
555 656 : Ok(inner.file.len())
556 656 : }
557 :
558 602 : pub fn estimated_in_mem_size(&self) -> u64 {
559 602 : self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
560 602 : }
561 :
562 : /// Create a new, empty, in-memory layer
563 661 : pub async fn create(
564 661 : conf: &'static PageServerConf,
565 661 : timeline_id: TimelineId,
566 661 : tenant_shard_id: TenantShardId,
567 661 : start_lsn: Lsn,
568 661 : gate: &utils::sync::gate::Gate,
569 661 : cancel: &CancellationToken,
570 661 : ctx: &RequestContext,
571 661 : ) -> Result<InMemoryLayer> {
572 661 : trace!(
573 0 : "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"
574 : );
575 :
576 661 : let file =
577 661 : EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, cancel, ctx).await?;
578 661 : let key = InMemoryLayerFileId(file.page_cache_file_id());
579 661 :
580 661 : Ok(InMemoryLayer {
581 661 : file_id: key,
582 661 : frozen_local_path_str: OnceLock::new(),
583 661 : conf,
584 661 : timeline_id,
585 661 : tenant_shard_id,
586 661 : start_lsn,
587 661 : end_lsn: OnceLock::new(),
588 661 : opened_at: Instant::now(),
589 661 : index: RwLock::new(BTreeMap::new()),
590 661 : inner: RwLock::new(InMemoryLayerInner {
591 661 : file,
592 661 : resource_units: GlobalResourceUnits::new(),
593 661 : }),
594 661 : estimated_in_mem_size: AtomicU64::new(0),
595 661 : })
596 661 : }
597 :
598 : /// Write path.
599 : ///
600 : /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
601 :     /// Errors are not retryable: the [`InMemoryLayer`] must be discarded and must not be read from.
602 : /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
603 2402128 : pub async fn put_batch(
604 2402128 : &self,
605 2402128 : serialized_batch: SerializedValueBatch,
606 2402128 : ctx: &RequestContext,
607 2402128 : ) -> anyhow::Result<()> {
608 2402128 : let (base_offset, metadata) = {
609 2402128 : let mut inner = self.inner.write().await;
610 2402128 : self.assert_writable();
611 2402128 :
612 2402128 : let base_offset = inner.file.len();
613 2402128 :
614 2402128 : let SerializedValueBatch {
615 2402128 : raw,
616 2402128 : metadata,
617 2402128 : max_lsn: _,
618 2402128 : len: _,
619 2402128 : } = serialized_batch;
620 2402128 :
621 2402128 : // Write the batch to the file
622 2402128 : inner.file.write_raw(&raw, ctx).await?;
623 2402128 : let new_size = inner.file.len();
624 2402128 :
625 2402128 : let expected_new_len = base_offset
626 2402128 : .checked_add(raw.len().into_u64())
627 2402128 : // write_raw would error if we were to overflow u64.
628 2402128 : // also IndexEntry and higher levels in
629 2402128 : //the code don't allow the file to grow that large
630 2402128 : .unwrap();
631 2402128 : assert_eq!(new_size, expected_new_len);
632 :
633 2402128 : inner.resource_units.maybe_publish_size(new_size);
634 2402128 :
635 2402128 : (base_offset, metadata)
636 : };
637 :
638 : // Update the index with the new entries
639 2402128 : let mut index = self.index.write().await;
640 :
641 4967333 : for meta in metadata {
642 : let SerializedValueMeta {
643 2565205 : key,
644 2565205 : lsn,
645 2565205 : batch_offset,
646 2565205 : len,
647 2565205 : will_init,
648 2565205 : } = match meta {
649 2565205 : ValueMeta::Serialized(ser) => ser,
650 : ValueMeta::Observed(_) => {
651 0 : continue;
652 : }
653 : };
654 :
655 : // Add the base_offset to the batch's index entries which are relative to the batch start.
656 2565205 : let index_entry = IndexEntry::new(IndexEntryNewArgs {
657 2565205 : base_offset,
658 2565205 : batch_offset,
659 2565205 : len,
660 2565205 : will_init,
661 2565205 : })?;
662 :
663 2565205 : let vec_map = index.entry(key).or_default();
664 2565205 : let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
665 2565205 : if old.is_some() {
666 : // This should not break anything, but is unexpected: ingestion code aims to filter out
667 : // multiple writes to the same key at the same LSN. This happens in cases where our
668 :             // ingestion code generates some write like an empty page, and we see a write from postgres
669 : // to the same key in the same wal record. If one such write makes it through, we
670 : // index the most recent write, implicitly ignoring the earlier write. We log a warning
671 : // because this case is unexpected, and we would like tests to fail if this happens.
672 0 : warn!("Key {} at {} written twice at same LSN", key, lsn);
673 2565205 : }
674 2565205 : self.estimated_in_mem_size.fetch_add(
675 2565205 : (std::mem::size_of::<CompactKey>()
676 2565205 : + std::mem::size_of::<Lsn>()
677 2565205 : + std::mem::size_of::<IndexEntry>()) as u64,
678 2565205 : AtomicOrdering::Relaxed,
679 2565205 : );
680 : }
681 :
682 2402128 : Ok(())
683 2402128 : }
684 :
685 2401506 : pub(crate) fn get_opened_at(&self) -> Instant {
686 2401506 : self.opened_at
687 2401506 : }
688 :
689 0 : pub(crate) async fn tick(&self) -> Option<u64> {
690 0 : let mut inner = self.inner.write().await;
691 0 : let size = inner.file.len();
692 0 : inner.resource_units.publish_size(size)
693 0 : }
694 :
695 1 : pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
696 1 : // TODO: Currently, we just leak the storage for any deleted keys
697 1 : Ok(())
698 1 : }
699 :
700 : /// Records the end_lsn for non-dropped layers.
701 : /// `end_lsn` is exclusive
702 : ///
703 : /// A note on locking:
704 : /// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing
705 : /// writes while freezing the layer. This is enforced at a higher level via
706 : /// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
707 : /// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
708 : /// Timeline::write_lock for its lifetime. The rolling is handled in
709 : /// [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function
710 : /// so can't be called from different threads.
711 : /// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
712 : /// This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
713 : /// hence there can be no concurrent writes
714 597 : pub async fn freeze(&self, end_lsn: Lsn) {
715 597 : assert!(
716 597 : self.start_lsn < end_lsn,
717 0 : "{} >= {}",
718 : self.start_lsn,
719 : end_lsn
720 : );
721 597 : self.end_lsn.set(end_lsn).expect("end_lsn set only once");
722 597 :
723 597 : self.frozen_local_path_str
724 597 : .set({
725 597 : let mut buf = String::new();
726 597 : inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
727 597 : .unwrap();
728 597 : buf.into()
729 597 : })
730 597 : .expect("frozen_local_path_str set only once");
731 :
732 : #[cfg(debug_assertions)]
733 : {
734 597 : let index = self.index.read().await;
735 2128078 : for vec_map in index.values() {
736 2213322 : for (lsn, _) in vec_map.as_slice() {
737 2213322 : assert!(*lsn < end_lsn);
738 : }
739 : }
740 : }
741 597 : }
742 :
743 : /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
744 : /// layer will only contain the key range the user specifies, and may return `None`
745 :     /// layer will only contain the key range the user specifies, and this function may return `None`
746 : ///
747 : /// Returns a new delta layer with all the same data as this in-memory layer
748 484 : pub async fn write_to_disk(
749 484 : &self,
750 484 : ctx: &RequestContext,
751 484 : key_range: Option<Range<Key>>,
752 484 : l0_flush_global_state: &l0_flush::Inner,
753 484 : gate: &utils::sync::gate::Gate,
754 484 : cancel: CancellationToken,
755 484 : ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
756 : // Grab the lock in read-mode. We hold it over the I/O, but because this
757 : // layer is not writeable anymore, no one should be trying to acquire the
758 : // write lock on it, so we shouldn't block anyone. See the comment on
759 : // [`InMemoryLayer::freeze`] to understand how locking between the append path
760 : // and layer flushing works.
761 484 : let inner = self.inner.read().await;
762 484 : let index = self.index.read().await;
763 :
764 : use l0_flush::Inner;
765 484 : let _concurrency_permit = match l0_flush_global_state {
766 484 : Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
767 : };
768 :
769 484 : let end_lsn = *self.end_lsn.get().unwrap();
770 :
771 484 : let key_count = if let Some(key_range) = key_range {
772 0 : let key_range = key_range.start.to_compact()..key_range.end.to_compact();
773 0 :
774 0 : index.iter().filter(|(k, _)| key_range.contains(k)).count()
775 : } else {
776 484 : index.len()
777 : };
778 484 : if key_count == 0 {
779 0 : return Ok(None);
780 484 : }
781 :
782 484 : let mut delta_layer_writer = DeltaLayerWriter::new(
783 484 : self.conf,
784 484 : self.timeline_id,
785 484 : self.tenant_shard_id,
786 484 : Key::MIN,
787 484 : self.start_lsn..end_lsn,
788 484 : gate,
789 484 : cancel,
790 484 : ctx,
791 484 : )
792 484 : .await?;
793 :
794 484 : match l0_flush_global_state {
795 : l0_flush::Inner::Direct { .. } => {
796 484 : let file_contents = inner.file.load_to_io_buf(ctx).await?;
797 484 : let file_contents = file_contents.freeze();
798 :
799 2127116 : for (key, vec_map) in index.iter() {
800 : // Write all page versions
801 2192758 : for (lsn, entry) in vec_map
802 2127116 : .as_slice()
803 2127116 : .iter()
804 2192758 : .map(|(lsn, entry)| (lsn, entry.unpack()))
805 : {
806 : let IndexEntryUnpacked {
807 2192758 : pos,
808 2192758 : len,
809 2192758 : will_init,
810 2192758 : } = entry;
811 2192758 : let buf = file_contents.slice(pos as usize..(pos + len) as usize);
812 2192758 : let (_buf, res) = delta_layer_writer
813 2192758 : .put_value_bytes(
814 2192758 : Key::from_compact(*key),
815 2192758 : *lsn,
816 2192758 : buf.slice_len(),
817 2192758 : will_init,
818 2192758 : ctx,
819 2192758 : )
820 2192758 : .await;
821 2192758 : res?;
822 : }
823 : }
824 : }
825 : }
826 :
827 : // MAX is used here because we identify L0 layers by full key range
828 484 : let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
829 :
830 :         // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
831 : //
832 : // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
833 : // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
834 :         // Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
835 : //
836 : // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
837 : // we dirtied when writing to the filesystem have been flushed and marked !dirty.
838 484 : drop(_concurrency_permit);
839 484 :
840 484 : Ok(Some((desc, path)))
841 484 : }
842 : }
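// --- Added example (not part of the original source) ---
// A sketch of how the pieces above fit together over an in-memory layer's lifecycle:
// create an open layer, append serialized batches, freeze it at an exclusive end LSN,
// and flush it to an L0 delta layer. All parameters are assumed to be supplied by the
// caller (in practice, Timeline code); the LSNs are hypothetical and the function is
// illustrative only, not an API of this module.
#[allow(dead_code, clippy::too_many_arguments)]
async fn lifecycle_sketch(
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
    batch: SerializedValueBatch,
    gate: &utils::sync::gate::Gate,
    cancel: &CancellationToken,
    l0_flush_global_state: &l0_flush::Inner,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    // 1. Open an empty layer; all writes from this LSN onwards land here.
    let layer = InMemoryLayer::create(
        conf,
        timeline_id,
        tenant_shard_id,
        Lsn(0x10),
        gate,
        cancel,
        ctx,
    )
    .await?;

    // 2. Append one or more serialized batches while the layer is still open.
    layer.put_batch(batch, ctx).await?;

    // 3. Freeze at an exclusive end LSN; the layer is read-only from now on.
    layer.freeze(Lsn(0x20)).await;

    // 4. Write the frozen layer out as an L0 delta layer.
    let _desc_and_path = layer
        .write_to_disk(ctx, None, l0_flush_global_state, gate, cancel.clone())
        .await?;
    Ok(())
}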
843 :
844 : #[cfg(test)]
845 : mod tests {
846 : use super::*;
847 :
848 : #[test]
849 1 : fn test_index_entry() {
850 : const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
851 : use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked};
852 :
853 20 : let roundtrip = |args, expect: Unpacked| {
854 20 : let res = IndexEntry::new(args).expect("this tests expects no errors");
855 20 : let IndexEntryUnpacked {
856 20 : will_init,
857 20 : len,
858 20 : pos,
859 20 : } = res.unpack();
860 20 : assert_eq!(will_init, expect.will_init);
861 20 : assert_eq!(len, expect.len);
862 20 : assert_eq!(pos, expect.pos);
863 20 : };
864 :
865 : // basic roundtrip
866 3 : for pos in [0, MAX_SUPPORTED_POS] {
867 6 : for len in [0, MAX_SUPPORTED_BLOB_LEN] {
868 12 : for will_init in [true, false] {
869 8 : let expect = Unpacked {
870 8 : will_init,
871 8 : len: len.into_u64(),
872 8 : pos: pos.into_u64(),
873 8 : };
874 8 : roundtrip(
875 8 : Args {
876 8 : will_init,
877 8 : base_offset: pos.into_u64(),
878 8 : batch_offset: 0,
879 8 : len,
880 8 : },
881 8 : expect,
882 8 : );
883 8 : roundtrip(
884 8 : Args {
885 8 : will_init,
886 8 : base_offset: 0,
887 8 : batch_offset: pos.into_u64(),
888 8 : len,
889 8 : },
890 8 : expect,
891 8 : );
892 8 : }
893 : }
894 : }
895 :
896 : // too-large len
897 1 : let too_large = Args {
898 1 : will_init: false,
899 1 : len: MAX_SUPPORTED_BLOB_LEN + 1,
900 1 : base_offset: 0,
901 1 : batch_offset: 0,
902 1 : };
903 1 : assert!(IndexEntry::new(too_large).is_err());
904 :
905 : // too-large pos
906 : {
907 1 : let too_large = Args {
908 1 : will_init: false,
909 1 : len: 0,
910 1 : base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
911 1 : batch_offset: 0,
912 1 : };
913 1 : assert!(IndexEntry::new(too_large).is_err());
914 1 : let too_large = Args {
915 1 : will_init: false,
916 1 : len: 0,
917 1 : base_offset: 0,
918 1 : batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
919 1 : };
920 1 : assert!(IndexEntry::new(too_large).is_err());
921 : }
922 :
923 : // too large (base_offset + batch_offset)
924 : {
925 1 : let too_large = Args {
926 1 : will_init: false,
927 1 : len: 0,
928 1 : base_offset: MAX_SUPPORTED_POS.into_u64(),
929 1 : batch_offset: 1,
930 1 : };
931 1 : assert!(IndexEntry::new(too_large).is_err());
932 1 : let too_large = Args {
933 1 : will_init: false,
934 1 : len: 0,
935 1 : base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
936 1 : batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
937 1 : };
938 1 : assert!(IndexEntry::new(too_large).is_err());
939 : }
940 :
941 : // valid special cases
942 : // - area past the max supported pos that is accessible by len
943 3 : for len in [1, MAX_SUPPORTED_BLOB_LEN] {
944 2 : roundtrip(
945 2 : Args {
946 2 : will_init: false,
947 2 : len,
948 2 : base_offset: MAX_SUPPORTED_POS.into_u64(),
949 2 : batch_offset: 0,
950 2 : },
951 2 : Unpacked {
952 2 : will_init: false,
953 2 : len: len as u64,
954 2 : pos: MAX_SUPPORTED_POS.into_u64(),
955 2 : },
956 2 : );
957 2 : roundtrip(
958 2 : Args {
959 2 : will_init: false,
960 2 : len,
961 2 : base_offset: 0,
962 2 : batch_offset: MAX_SUPPORTED_POS.into_u64(),
963 2 : },
964 2 : Unpacked {
965 2 : will_init: false,
966 2 : len: len as u64,
967 2 : pos: MAX_SUPPORTED_POS.into_u64(),
968 2 : },
969 2 : );
970 2 : }
971 1 : }
972 : }