Line data Source code
1 : //! An in-memory layer stores recently received key-value pairs.
2 : //!
3 : //! The "in-memory" part of the name is a bit misleading: the actual page versions are
4 : //! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
5 : //! its position in the file, is kept in memory, though.
6 : //!
7 : use std::cmp::Ordering;
8 : use std::collections::{BTreeMap, HashMap};
9 : use std::fmt::Write;
10 : use std::ops::Range;
11 : use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering};
12 : use std::sync::{Arc, OnceLock};
13 : use std::time::Instant;
14 :
15 : use anyhow::Result;
16 : use camino::Utf8PathBuf;
17 : use pageserver_api::key::{CompactKey, Key};
18 : use pageserver_api::keyspace::KeySpace;
19 : use pageserver_api::models::InMemoryLayerInfo;
20 : use pageserver_api::shard::TenantShardId;
21 : use tokio::sync::RwLock;
22 : use tokio_util::sync::CancellationToken;
23 : use tracing::*;
24 : use utils::id::TimelineId;
25 : use utils::lsn::Lsn;
26 : use utils::vec_map::VecMap;
27 : use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
28 :
29 : use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
30 : use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize};
31 : use crate::config::PageServerConf;
32 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
33 : // avoid binding to Write (conflicts with std::io::Write)
34 : // while being able to use std::fmt::Write's methods
35 : use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
36 : use crate::tenant::ephemeral_file::EphemeralFile;
37 : use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
38 : use crate::tenant::timeline::GetVectoredError;
39 : use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
40 : use crate::{l0_flush, page_cache};
41 :
42 : pub(crate) mod vectored_dio_read;
43 :
44 : #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
45 : pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
46 :
47 : pub struct InMemoryLayer {
48 : conf: &'static PageServerConf,
49 : tenant_shard_id: TenantShardId,
50 : timeline_id: TimelineId,
51 : file_id: InMemoryLayerFileId,
52 :
53 : /// This layer contains all the changes from 'start_lsn'. The
54 : /// start is inclusive.
55 : start_lsn: Lsn,
56 :
57 : /// Frozen layers have an exclusive end LSN.
58 : /// Writes are only allowed when this is `None`.
59 : pub(crate) end_lsn: OnceLock<Lsn>,
60 :
61 : /// Used for the traversal path. Cached string representation of the in-memory layer once it is frozen.
62 : frozen_local_path_str: OnceLock<Arc<str>>,
63 :
64 : opened_at: Instant,
65 :
66 : /// The above fields never change, except for `end_lsn`, which is only set once.
67 : /// All other changing parts are in `inner`, and protected by a read-write lock.
68 : inner: RwLock<InMemoryLayerInner>,
69 :
70 : estimated_in_mem_size: AtomicU64,
71 : }
72 :
73 : impl std::fmt::Debug for InMemoryLayer {
74 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
75 0 : f.debug_struct("InMemoryLayer")
76 0 : .field("start_lsn", &self.start_lsn)
77 0 : .field("end_lsn", &self.end_lsn)
78 0 : .field("inner", &self.inner)
79 0 : .finish()
80 0 : }
81 : }
82 :
83 : pub struct InMemoryLayerInner {
84 : /// All versions of all pages in the layer are kept here. Indexed
85 : /// by key and LSN. The [`IndexEntry`] is an offset into the
86 : /// ephemeral file where the page version is stored.
87 : index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
88 :
89 : /// The values are stored in a serialized format in this file.
90 : /// Each serialized Value is preceded by a 'u32' length field.
91 : /// The [`InMemoryLayerInner::index`] map stores offsets into this file.
92 : file: EphemeralFile,
93 :
94 : resource_units: GlobalResourceUnits,
95 : }
96 :
97 : /// Support the same max blob length as blob_io, because ultimately
98 : /// all the InMemoryLayer contents end up being written into a delta layer,
99 : /// using [`crate::tenant::blob_io`].
100 : const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
101 : const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
102 : let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
103 : let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
104 : assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
105 : trailing_ones
106 : };
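
// Worked example (with a hypothetical value, since the actual constant lives in
// blob_io): if MAX_SUPPORTED_BLOB_LEN were 0x0FFF_FFFF (2^28 - 1), then
// trailing_ones() = 28 and leading_zeros() = 36 on a 64-bit usize, the assert
// above holds, and `len` would occupy 28 bits of the packed [`IndexEntry`].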
107 :
108 : /// See [`InMemoryLayerInner::index`].
109 : ///
110 : /// For memory efficiency, the data is packed into a u64.
111 : ///
112 : /// Layout:
113 : /// - 1 bit: `will_init`
114 : /// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
115 : /// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
116 : #[derive(Debug, Clone, Copy, PartialEq, Eq)]
117 : pub struct IndexEntry(u64);
118 :
119 : impl IndexEntry {
120 : /// See [`Self::MAX_SUPPORTED_POS`].
121 : const MAX_SUPPORTED_POS_BITS: usize = {
122 : let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
123 : if remainder < 32 {
124 : panic!("pos can be u32 as per type system, support that");
125 : }
126 : remainder
127 : };
128 : /// The maximum supported blob offset that can be represented by [`Self`].
129 : /// See also [`Self::validate_checkpoint_distance`].
130 : const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
131 :
132 : // Layout
133 : const WILL_INIT_RANGE: Range<usize> = 0..1;
134 : const LEN_RANGE: Range<usize> =
135 : Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
136 : const POS_RANGE: Range<usize> =
137 : Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
138 : const _ASSERT: () = {
139 : if Self::POS_RANGE.end != 64 {
140 : panic!("we don't want undefined bits for our own sanity")
141 : }
142 : };
143 :
144 : /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
145 : ///
146 : /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too large.
147 : /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
148 : ///
149 : /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
150 : /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
151 : ///
152 : /// TODO: ideally, this check should happen at config parsing time (and in the request handler when a change to checkpoint distance is requested).
153 : /// When cleaning this up, also look into the S3 max file size check that is performed in the delta layer writer.
154 : #[inline(always)]
155 30782640 : fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
156 30782640 : let IndexEntryNewArgs {
157 30782640 : base_offset,
158 30782640 : batch_offset,
159 30782640 : len,
160 30782640 : will_init,
161 30782640 : } = arg;
162 :
163 30782640 : let pos = base_offset
164 30782640 : .checked_add(batch_offset)
165 30782640 : .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
166 :
167 30782640 : if pos.into_usize() > Self::MAX_SUPPORTED_POS {
168 48 : anyhow::bail!(
169 48 : "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
170 48 : max = Self::MAX_SUPPORTED_POS
171 48 : );
172 30782592 : }
173 30782592 :
174 30782592 : if len > MAX_SUPPORTED_BLOB_LEN {
175 12 : anyhow::bail!(
176 12 : "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
177 12 : );
178 30782580 : }
179 30782580 :
180 30782580 : let mut data: u64 = 0;
181 : use bit_field::BitField;
182 30782580 : data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
183 30782580 : data.set_bits(Self::LEN_RANGE, len.into_u64());
184 30782580 : data.set_bits(Self::POS_RANGE, pos);
185 30782580 :
186 30782580 : Ok(Self(data))
187 30782640 : }
188 :
189 : #[inline(always)]
190 37825581 : fn unpack(&self) -> IndexEntryUnpacked {
191 : use bit_field::BitField;
192 37825581 : IndexEntryUnpacked {
193 37825581 : will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
194 37825581 : len: self.0.get_bits(Self::LEN_RANGE),
195 37825581 : pos: self.0.get_bits(Self::POS_RANGE),
196 37825581 : }
197 37825581 : }
198 :
199 : /// See [`Self::new`].
200 1500 : pub(crate) const fn validate_checkpoint_distance(
201 1500 : checkpoint_distance: u64,
202 1500 : ) -> Result<(), &'static str> {
203 1500 : if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
204 0 : return Err("exceeds the maximum supported value");
205 1500 : }
206 1500 : let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
207 1500 : if res.is_none() {
208 0 : return Err(
209 0 : "checkpoint distance + max supported blob len overflows in-memory addition",
210 0 : );
211 1500 : }
212 1500 :
213 1500 : // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
214 1500 :
215 1500 : Ok(())
216 1500 : }
217 :
218 : const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
219 : let res = Self::validate_checkpoint_distance(
220 : pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
221 : );
222 : if res.is_err() {
223 : panic!("default checkpoint distance is valid")
224 : }
225 : };
226 : }
227 :
228 : /// Args to [`IndexEntry::new`].
229 : #[derive(Clone, Copy)]
230 : struct IndexEntryNewArgs {
231 : base_offset: u64,
232 : batch_offset: u64,
233 : len: usize,
234 : will_init: bool,
235 : }
236 :
237 : /// Unpacked representation of the bitfielded [`IndexEntry`].
238 : #[derive(Clone, Copy, PartialEq, Eq, Debug)]
239 : struct IndexEntryUnpacked {
240 : will_init: bool,
241 : len: u64,
242 : pos: u64,
243 : }
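
// Illustrative sketch (not part of the original source): how a value's position
// round-trips through the bit-packed representation. `base_offset` is the length
// of the ephemeral file before the batch was appended, `batch_offset` is the
// value's offset within that batch, and the stored `pos` is their sum.
#[cfg(test)]
#[allow(dead_code)]
fn index_entry_packing_example() {
    let entry = IndexEntry::new(IndexEntryNewArgs {
        base_offset: 4096, // bytes already in the ephemeral file
        batch_offset: 128, // offset of this value inside the serialized batch
        len: 512,          // length of the serialized value
        will_init: true,   // the value is an image / init record
    })
    .expect("offsets far below IndexEntry::MAX_SUPPORTED_POS");

    let IndexEntryUnpacked { pos, len, will_init } = entry.unpack();
    assert_eq!(pos, 4096 + 128);
    assert_eq!(len, 512);
    assert!(will_init);
}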
244 :
245 : impl std::fmt::Debug for InMemoryLayerInner {
246 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
247 0 : f.debug_struct("InMemoryLayerInner").finish()
248 0 : }
249 : }
250 :
251 : /// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
252 : /// to minimize contention.
253 : ///
254 : /// This global state is used to implement behaviors that require a global view of the system, e.g.
255 : /// rolling layers proactively to limit the total amount of dirty data.
256 : pub(crate) struct GlobalResources {
257 : // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
258 : // Zero means unlimited.
259 : pub(crate) max_dirty_bytes: AtomicU64,
260 : // How many bytes are in all EphemeralFile objects
261 : dirty_bytes: AtomicU64,
262 : // How many layers are contributing to dirty_bytes
263 : dirty_layers: AtomicUsize,
264 : }
265 :
266 : // Per-timeline RAII struct for its contribution to [`GlobalResources`]
267 : struct GlobalResourceUnits {
268 : // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
269 : // for decrementing the global counter by this many bytes when dropped.
270 : dirty_bytes: u64,
271 : }
272 :
273 : impl GlobalResourceUnits {
274 : // Hint for the layer append path to update us when the layer size differs from the last
275 : // call to update_size by this much. If we don't reach this threshold, we'll still get
276 : // updated when the Timeline "ticks" in the background.
277 : const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
278 :
279 7908 : fn new() -> Self {
280 7908 : GLOBAL_RESOURCES
281 7908 : .dirty_layers
282 7908 : .fetch_add(1, AtomicOrdering::Relaxed);
283 7908 : Self { dirty_bytes: 0 }
284 7908 : }
285 :
286 : /// Do not call this frequently: all timelines will write to these same global atomics,
287 : /// so this is a relatively expensive operation. Wait at least a few seconds between calls.
288 : ///
289 : /// Returns the effective layer size limit that should be applied, if any, to keep
290 : /// the total number of dirty bytes below the configured maximum.
291 7176 : fn publish_size(&mut self, size: u64) -> Option<u64> {
292 7176 : let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
293 7116 : Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
294 : Ordering::Greater => {
295 48 : let delta = size - self.dirty_bytes;
296 48 : let old = GLOBAL_RESOURCES
297 48 : .dirty_bytes
298 48 : .fetch_add(delta, AtomicOrdering::Relaxed);
299 48 : old + delta
300 : }
301 : Ordering::Less => {
302 12 : let delta = self.dirty_bytes - size;
303 12 : let old = GLOBAL_RESOURCES
304 12 : .dirty_bytes
305 12 : .fetch_sub(delta, AtomicOrdering::Relaxed);
306 12 : old - delta
307 : }
308 : };
309 :
310 : // This is a sloppy update: concurrent updates to the counter will race, and the exact
311 : // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
312 : // That's okay: as long as the metric contains some recent value, it doesn't have to always
313 : // be literally the last update.
314 7176 : TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
315 7176 :
316 7176 : self.dirty_bytes = size;
317 7176 :
318 7176 : let max_dirty_bytes = GLOBAL_RESOURCES
319 7176 : .max_dirty_bytes
320 7176 : .load(AtomicOrdering::Relaxed);
321 7176 : if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
322 : // Set the layer file limit to the average layer size: this implies that all above-average
323 : // sized layers will be eligible for freezing. They will be frozen in the order they
324 : // next enter publish_size.
325 0 : Some(
326 0 : new_global_dirty_bytes
327 0 : / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
328 0 : )
329 : } else {
330 7176 : None
331 : }
332 7176 : }
333 :
334 : // Call publish_size if the input size differs from last published size by more than
335 : // the drift limit
336 28825500 : fn maybe_publish_size(&mut self, size: u64) {
337 28825500 : let publish = match size.cmp(&self.dirty_bytes) {
338 0 : Ordering::Equal => false,
339 28825500 : Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
340 0 : Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
341 : };
342 :
343 28825500 : if publish {
344 48 : self.publish_size(size);
345 28825452 : }
346 28825500 : }
347 : }
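
// Hedged sketch of how the value returned by `publish_size` is meant to be used
// (the real decision lives in the Timeline tick/roll path; this helper is
// hypothetical and only illustrates the contract): when the global dirty-byte
// budget is exceeded, `publish_size` returns the average layer size, and layers
// above that size become candidates for freezing.
#[cfg(test)]
#[allow(dead_code)]
fn should_roll_for_global_pressure(units: &mut GlobalResourceUnits, layer_size: u64) -> bool {
    match units.publish_size(layer_size) {
        // Global dirty bytes exceed the configured maximum: roll this layer if
        // it is larger than the returned limit.
        Some(effective_limit) => layer_size > effective_limit,
        // No global pressure; leave the decision to the checkpoint-distance logic.
        None => false,
    }
}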
348 :
349 : impl Drop for GlobalResourceUnits {
350 7128 : fn drop(&mut self) {
351 7128 : GLOBAL_RESOURCES
352 7128 : .dirty_layers
353 7128 : .fetch_sub(1, AtomicOrdering::Relaxed);
354 7128 :
355 7128 : // Subtract our contribution to the global total dirty bytes
356 7128 : self.publish_size(0);
357 7128 : }
358 : }
359 :
360 : pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
361 : max_dirty_bytes: AtomicU64::new(0),
362 : dirty_bytes: AtomicU64::new(0),
363 : dirty_layers: AtomicUsize::new(0),
364 : };
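
// Hypothetical initialization sketch: the dirty-byte budget is an atomic so it
// can be set from configuration at runtime (0 keeps it unlimited). Only
// `GLOBAL_RESOURCES.max_dirty_bytes` comes from this file; the function itself
// is an assumption for illustration.
#[cfg(test)]
#[allow(dead_code)]
fn set_global_dirty_bytes_limit(limit: Option<u64>) {
    GLOBAL_RESOURCES
        .max_dirty_bytes
        .store(limit.unwrap_or(0), AtomicOrdering::Relaxed);
}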
365 :
366 : impl InMemoryLayer {
367 3763267 : pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
368 3763267 : self.file_id
369 3763267 : }
370 :
371 7152 : pub(crate) fn get_timeline_id(&self) -> TimelineId {
372 7152 : self.timeline_id
373 7152 : }
374 :
375 14256 : pub(crate) fn info(&self) -> InMemoryLayerInfo {
376 14256 : let lsn_start = self.start_lsn;
377 :
378 14256 : if let Some(&lsn_end) = self.end_lsn.get() {
379 14244 : InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
380 : } else {
381 12 : InMemoryLayerInfo::Open { lsn_start }
382 : }
383 14256 : }
384 :
385 14208 : pub(crate) fn try_len(&self) -> Option<u64> {
386 14208 : self.inner.try_read().map(|i| i.file.len()).ok()
387 14208 : }
388 :
389 28825500 : pub(crate) fn assert_writable(&self) {
390 28825500 : assert!(self.end_lsn.get().is_none());
391 28825500 : }
392 :
393 14641790 : pub(crate) fn end_lsn_or_max(&self) -> Lsn {
394 14641790 : self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
395 14641790 : }
396 :
397 14634686 : pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
398 14634686 : self.start_lsn..self.end_lsn_or_max()
399 14634686 : }
400 :
401 : /// debugging function to print out the contents of the layer
402 : ///
403 : /// this is likely completely unused
404 0 : pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
405 0 : let end_str = self.end_lsn_or_max();
406 0 :
407 0 : println!(
408 0 : "----- in-memory layer for tli {} LSNs {}-{} ----",
409 0 : self.timeline_id, self.start_lsn, end_str,
410 0 : );
411 0 :
412 0 : Ok(())
413 0 : }
414 :
415 : // Look up the keys in the provided keyspace and update
416 : // the reconstruct state with whatever is found.
417 3717211 : pub(crate) async fn get_values_reconstruct_data(
418 3717211 : self: &Arc<InMemoryLayer>,
419 3717211 : keyspace: KeySpace,
420 3717211 : lsn_range: Range<Lsn>,
421 3717211 : reconstruct_state: &mut ValuesReconstructState,
422 3717211 : ctx: &RequestContext,
423 3717211 : ) -> Result<(), GetVectoredError> {
424 3717211 : let ctx = RequestContextBuilder::from(ctx)
425 3717211 : .page_content_kind(PageContentKind::InMemoryLayer)
426 3717211 : .attached_child();
427 :
428 3717211 : let inner = self.inner.read().await;
429 :
430 : struct ValueRead {
431 : entry_lsn: Lsn,
432 : read: vectored_dio_read::LogicalRead<Vec<u8>>,
433 : }
434 3717211 : let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
435 3717211 : let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
436 :
437 3760963 : for range in keyspace.ranges.iter() {
438 3760963 : for (key, vec_map) in inner
439 3760963 : .index
440 3760963 : .range(range.start.to_compact()..range.end.to_compact())
441 : {
442 3198525 : let key = Key::from_compact(*key);
443 3198525 : let slice = vec_map.slice_range(lsn_range.clone());
444 :
445 11512245 : for (entry_lsn, index_entry) in slice.iter().rev() {
446 : let IndexEntryUnpacked {
447 11512245 : pos,
448 11512245 : len,
449 11512245 : will_init,
450 11512245 : } = index_entry.unpack();
451 11512245 :
452 11512245 : reads.entry(key).or_default().push(ValueRead {
453 11512245 : entry_lsn: *entry_lsn,
454 11512245 : read: vectored_dio_read::LogicalRead::new(
455 11512245 : pos,
456 11512245 : Vec::with_capacity(len as usize),
457 11512245 : ),
458 11512245 : });
459 11512245 :
460 11512245 : let io = reconstruct_state.update_key(&key, *entry_lsn, will_init);
461 11512245 : ios.insert((key, *entry_lsn), io);
462 11512245 :
463 11512245 : if will_init {
464 3074301 : break;
465 8437944 : }
466 : }
467 : }
468 : }
469 3717211 : drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
470 3717211 : let read_from = Arc::clone(self);
471 3717211 : let read_ctx = ctx.attached_child();
472 3717211 : reconstruct_state
473 3717211 : .spawn_io(async move {
474 3717211 : let inner = read_from.inner.read().await;
475 3717211 : let f = vectored_dio_read::execute(
476 3717211 : &inner.file,
477 3717211 : reads
478 3717211 : .iter()
479 11512245 : .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
480 3717211 : &read_ctx,
481 3717211 : );
482 3717211 : send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
483 3717211 : .await;
484 :
485 6915712 : for (key, value_reads) in reads {
486 14710746 : for ValueRead { entry_lsn, read } in value_reads {
487 11512245 : let io = ios.remove(&(key, entry_lsn)).expect("sender must exist");
488 11512245 : match read.into_result().expect("we run execute() above") {
489 0 : Err(e) => {
490 0 : io.complete(Err(std::io::Error::new(
491 0 : e.kind(),
492 0 : "dio vec read failed",
493 0 : )));
494 0 : }
495 11512245 : Ok(value_buf) => {
496 11512245 : io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into())));
497 11512245 : }
498 : }
499 : }
500 : }
501 :
502 3717211 : assert!(ios.is_empty());
503 :
504 : // Keep layer existent until this IO is done;
505 : // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
506 : // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
507 : // drop for consistency among all three layer types.
508 3717211 : drop(inner);
509 3717211 : drop(read_from);
510 3717211 : })
511 3717211 : .await;
512 :
513 3717211 : Ok(())
514 3717211 : }
515 : }
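
// Minimal sketch (hypothetical helper, not in the original source) of the
// per-key traversal rule used by `get_values_reconstruct_data` above: walk the
// versions from newest to oldest and stop at the first `will_init` record,
// because an init record (page image) does not need any older history.
#[cfg(test)]
#[allow(dead_code)]
fn versions_needed_for_reconstruction(
    versions: &[(Lsn, IndexEntry)], // ascending by LSN, as stored in the VecMap
) -> Vec<(Lsn, IndexEntryUnpacked)> {
    let mut needed = Vec::new();
    for (lsn, entry) in versions.iter().rev() {
        let unpacked = entry.unpack();
        needed.push((*lsn, unpacked));
        if unpacked.will_init {
            break; // older versions are irrelevant for reconstruction
        }
    }
    needed
}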
516 :
517 14256 : fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
518 14256 : write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
519 14256 : }
520 :
521 7152 : fn inmem_layer_log_display(
522 7152 : mut f: impl Write,
523 7152 : timeline: TimelineId,
524 7152 : start_lsn: Lsn,
525 7152 : end_lsn: Lsn,
526 7152 : ) -> std::fmt::Result {
527 7152 : write!(f, "timeline {} in-memory ", timeline)?;
528 7152 : inmem_layer_display(f, start_lsn, end_lsn)
529 7152 : }
530 :
531 : impl std::fmt::Display for InMemoryLayer {
532 7104 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
533 7104 : let end_lsn = self.end_lsn_or_max();
534 7104 : inmem_layer_display(f, self.start_lsn, end_lsn)
535 7104 : }
536 : }
537 :
538 : impl InMemoryLayer {
539 : /// Get layer size.
540 7848 : pub async fn size(&self) -> Result<u64> {
541 7848 : let inner = self.inner.read().await;
542 7848 : Ok(inner.file.len())
543 7848 : }
544 :
545 7264 : pub fn estimated_in_mem_size(&self) -> u64 {
546 7264 : self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
547 7264 : }
548 :
549 : /// Create a new, empty, in-memory layer
550 7908 : pub async fn create(
551 7908 : conf: &'static PageServerConf,
552 7908 : timeline_id: TimelineId,
553 7908 : tenant_shard_id: TenantShardId,
554 7908 : start_lsn: Lsn,
555 7908 : gate: &utils::sync::gate::Gate,
556 7908 : cancel: &CancellationToken,
557 7908 : ctx: &RequestContext,
558 7908 : ) -> Result<InMemoryLayer> {
559 7908 : trace!(
560 0 : "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"
561 : );
562 :
563 7908 : let file =
564 7908 : EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, cancel, ctx).await?;
565 7908 : let key = InMemoryLayerFileId(file.page_cache_file_id());
566 7908 :
567 7908 : Ok(InMemoryLayer {
568 7908 : file_id: key,
569 7908 : frozen_local_path_str: OnceLock::new(),
570 7908 : conf,
571 7908 : timeline_id,
572 7908 : tenant_shard_id,
573 7908 : start_lsn,
574 7908 : end_lsn: OnceLock::new(),
575 7908 : opened_at: Instant::now(),
576 7908 : inner: RwLock::new(InMemoryLayerInner {
577 7908 : index: BTreeMap::new(),
578 7908 : file,
579 7908 : resource_units: GlobalResourceUnits::new(),
580 7908 : }),
581 7908 : estimated_in_mem_size: AtomicU64::new(0),
582 7908 : })
583 7908 : }
584 :
585 : /// Write path.
586 : ///
587 : /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
588 : /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
589 : /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
590 28825500 : pub async fn put_batch(
591 28825500 : &self,
592 28825500 : serialized_batch: SerializedValueBatch,
593 28825500 : ctx: &RequestContext,
594 28825500 : ) -> anyhow::Result<()> {
595 28825500 : let mut inner = self.inner.write().await;
596 28825500 : self.assert_writable();
597 28825500 :
598 28825500 : let base_offset = inner.file.len();
599 28825500 :
600 28825500 : let SerializedValueBatch {
601 28825500 : raw,
602 28825500 : metadata,
603 28825500 : max_lsn: _,
604 28825500 : len: _,
605 28825500 : } = serialized_batch;
606 28825500 :
607 28825500 : // Write the batch to the file
608 28825500 : inner.file.write_raw(&raw, ctx).await?;
609 28825500 : let new_size = inner.file.len();
610 28825500 :
611 28825500 : let expected_new_len = base_offset
612 28825500 : .checked_add(raw.len().into_u64())
613 28825500 : // write_raw would error if we were to overflow u64.
614 28825500 : // also IndexEntry and higher levels in
615 28825500 : // the code don't allow the file to grow that large
616 28825500 : .unwrap();
617 28825500 : assert_eq!(new_size, expected_new_len);
618 :
619 : // Update the index with the new entries
620 59607840 : for meta in metadata {
621 : let SerializedValueMeta {
622 30782340 : key,
623 30782340 : lsn,
624 30782340 : batch_offset,
625 30782340 : len,
626 30782340 : will_init,
627 30782340 : } = match meta {
628 30782340 : ValueMeta::Serialized(ser) => ser,
629 : ValueMeta::Observed(_) => {
630 0 : continue;
631 : }
632 : };
633 :
634 : // Add the base_offset to the batch's index entries which are relative to the batch start.
635 30782340 : let index_entry = IndexEntry::new(IndexEntryNewArgs {
636 30782340 : base_offset,
637 30782340 : batch_offset,
638 30782340 : len,
639 30782340 : will_init,
640 30782340 : })?;
641 :
642 30782340 : let vec_map = inner.index.entry(key).or_default();
643 30782340 : let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
644 30782340 : if old.is_some() {
645 : // This should not break anything, but is unexpected: ingestion code aims to filter out
646 : // multiple writes to the same key at the same LSN. This happens in cases where our
647 : // ingestion code generates some write like an empty page, and we see a write from Postgres
648 : // to the same key in the same wal record. If one such write makes it through, we
649 : // index the most recent write, implicitly ignoring the earlier write. We log a warning
650 : // because this case is unexpected, and we would like tests to fail if this happens.
651 0 : warn!("Key {} at {} written twice at same LSN", key, lsn);
652 30782340 : }
653 30782340 : self.estimated_in_mem_size.fetch_add(
654 30782340 : (std::mem::size_of::<CompactKey>()
655 30782340 : + std::mem::size_of::<Lsn>()
656 30782340 : + std::mem::size_of::<IndexEntry>()) as u64,
657 30782340 : AtomicOrdering::Relaxed,
658 30782340 : );
659 : }
660 :
661 28825500 : inner.resource_units.maybe_publish_size(new_size);
662 28825500 :
663 28825500 : Ok(())
664 28825500 : }
665 :
666 28818060 : pub(crate) fn get_opened_at(&self) -> Instant {
667 28818060 : self.opened_at
668 28818060 : }
669 :
670 0 : pub(crate) async fn tick(&self) -> Option<u64> {
671 0 : let mut inner = self.inner.write().await;
672 0 : let size = inner.file.len();
673 0 : inner.resource_units.publish_size(size)
674 0 : }
675 :
676 12 : pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
677 12 : // TODO: Currently, we just leak the storage for any deleted keys
678 12 : Ok(())
679 12 : }
680 :
681 : /// Records the end_lsn for non-dropped layers.
682 : /// `end_lsn` is exclusive
683 7152 : pub async fn freeze(&self, end_lsn: Lsn) {
684 7152 : assert!(
685 7152 : self.start_lsn < end_lsn,
686 0 : "{} >= {}",
687 : self.start_lsn,
688 : end_lsn
689 : );
690 7152 : self.end_lsn.set(end_lsn).expect("end_lsn set only once");
691 7152 :
692 7152 : self.frozen_local_path_str
693 7152 : .set({
694 7152 : let mut buf = String::new();
695 7152 : inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
696 7152 : .unwrap();
697 7152 : buf.into()
698 7152 : })
699 7152 : .expect("frozen_local_path_str set only once");
700 :
701 : #[cfg(debug_assertions)]
702 : {
703 7152 : let inner = self.inner.write().await;
704 25539443 : for vec_map in inner.index.values() {
705 26559780 : for (lsn, _) in vec_map.as_slice() {
706 26559780 : assert!(*lsn < end_lsn);
707 : }
708 : }
709 : }
710 7152 : }
711 :
712 : /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
713 : /// layer will only contain the key range the user specifies; the function returns `None`
714 : /// if there are no matching keys.
715 : ///
716 : /// Returns a new delta layer with all the same data as this in-memory layer
717 5808 : pub async fn write_to_disk(
718 5808 : &self,
719 5808 : ctx: &RequestContext,
720 5808 : key_range: Option<Range<Key>>,
721 5808 : l0_flush_global_state: &l0_flush::Inner,
722 5808 : gate: &utils::sync::gate::Gate,
723 5808 : cancel: CancellationToken,
724 5808 : ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
725 : // Grab the lock in read-mode. We hold it over the I/O, but because this
726 : // layer is not writeable anymore, no one should be trying to acquire the
727 : // write lock on it, so we shouldn't block anyone. There's one exception
728 : // though: another thread might have grabbed a reference to this layer
729 : // in `get_layer_for_write` just before the checkpointer called
730 : // `freeze`, and then `write_to_disk` on it. When the thread gets the
731 : // lock, it will see that it's not writeable anymore and retry, but it
732 : // would have to wait until we release it. That race condition is very
733 : // rare though, so we just accept the potential latency hit for now.
734 5808 : let inner = self.inner.read().await;
735 :
736 : use l0_flush::Inner;
737 5808 : let _concurrency_permit = match l0_flush_global_state {
738 5808 : Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
739 : };
740 :
741 5808 : let end_lsn = *self.end_lsn.get().unwrap();
742 :
743 5808 : let key_count = if let Some(key_range) = key_range {
744 0 : let key_range = key_range.start.to_compact()..key_range.end.to_compact();
745 0 :
746 0 : inner
747 0 : .index
748 0 : .iter()
749 0 : .filter(|(k, _)| key_range.contains(k))
750 0 : .count()
751 : } else {
752 5808 : inner.index.len()
753 : };
754 5808 : if key_count == 0 {
755 0 : return Ok(None);
756 5808 : }
757 :
758 5808 : let mut delta_layer_writer = DeltaLayerWriter::new(
759 5808 : self.conf,
760 5808 : self.timeline_id,
761 5808 : self.tenant_shard_id,
762 5808 : Key::MIN,
763 5808 : self.start_lsn..end_lsn,
764 5808 : gate,
765 5808 : cancel,
766 5808 : ctx,
767 5808 : )
768 5808 : .await?;
769 :
770 5808 : match l0_flush_global_state {
771 : l0_flush::Inner::Direct { .. } => {
772 5808 : let file_contents = inner.file.load_to_io_buf(ctx).await?;
773 5808 : let file_contents = file_contents.freeze();
774 :
775 25527983 : for (key, vec_map) in inner.index.iter() {
776 : // Write all page versions
777 26313096 : for (lsn, entry) in vec_map
778 25527983 : .as_slice()
779 25527983 : .iter()
780 26313096 : .map(|(lsn, entry)| (lsn, entry.unpack()))
781 : {
782 : let IndexEntryUnpacked {
783 26313096 : pos,
784 26313096 : len,
785 26313096 : will_init,
786 26313096 : } = entry;
787 26313096 : let buf = file_contents.slice(pos as usize..(pos + len) as usize);
788 26313096 : let (_buf, res) = delta_layer_writer
789 26313096 : .put_value_bytes(
790 26313096 : Key::from_compact(*key),
791 26313096 : *lsn,
792 26313096 : buf.slice_len(),
793 26313096 : will_init,
794 26313096 : ctx,
795 26313096 : )
796 26313096 : .await;
797 26313096 : res?;
798 : }
799 : }
800 : }
801 : }
802 :
803 : // MAX is used here because we identify L0 layers by full key range
804 5808 : let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
805 :
806 : // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
807 : //
808 : // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
809 : // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
810 : // Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
811 : //
812 : // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
813 : // we dirtied when writing to the filesystem have been flushed and marked !dirty.
814 5808 : drop(_concurrency_permit);
815 5808 :
816 5808 : Ok(Some((desc, path)))
817 5808 : }
818 : }
819 :
820 : #[cfg(test)]
821 : mod tests {
822 : use super::*;
823 :
824 : #[test]
825 12 : fn test_index_entry() {
826 : const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
827 : use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked};
828 :
829 240 : let roundtrip = |args, expect: Unpacked| {
830 240 : let res = IndexEntry::new(args).expect("this tests expects no errors");
831 240 : let IndexEntryUnpacked {
832 240 : will_init,
833 240 : len,
834 240 : pos,
835 240 : } = res.unpack();
836 240 : assert_eq!(will_init, expect.will_init);
837 240 : assert_eq!(len, expect.len);
838 240 : assert_eq!(pos, expect.pos);
839 240 : };
840 :
841 : // basic roundtrip
842 36 : for pos in [0, MAX_SUPPORTED_POS] {
843 72 : for len in [0, MAX_SUPPORTED_BLOB_LEN] {
844 144 : for will_init in [true, false] {
845 96 : let expect = Unpacked {
846 96 : will_init,
847 96 : len: len.into_u64(),
848 96 : pos: pos.into_u64(),
849 96 : };
850 96 : roundtrip(
851 96 : Args {
852 96 : will_init,
853 96 : base_offset: pos.into_u64(),
854 96 : batch_offset: 0,
855 96 : len,
856 96 : },
857 96 : expect,
858 96 : );
859 96 : roundtrip(
860 96 : Args {
861 96 : will_init,
862 96 : base_offset: 0,
863 96 : batch_offset: pos.into_u64(),
864 96 : len,
865 96 : },
866 96 : expect,
867 96 : );
868 96 : }
869 : }
870 : }
871 :
872 : // too-large len
873 12 : let too_large = Args {
874 12 : will_init: false,
875 12 : len: MAX_SUPPORTED_BLOB_LEN + 1,
876 12 : base_offset: 0,
877 12 : batch_offset: 0,
878 12 : };
879 12 : assert!(IndexEntry::new(too_large).is_err());
880 :
881 : // too-large pos
882 : {
883 12 : let too_large = Args {
884 12 : will_init: false,
885 12 : len: 0,
886 12 : base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
887 12 : batch_offset: 0,
888 12 : };
889 12 : assert!(IndexEntry::new(too_large).is_err());
890 12 : let too_large = Args {
891 12 : will_init: false,
892 12 : len: 0,
893 12 : base_offset: 0,
894 12 : batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
895 12 : };
896 12 : assert!(IndexEntry::new(too_large).is_err());
897 : }
898 :
899 : // too large (base_offset + batch_offset)
900 : {
901 12 : let too_large = Args {
902 12 : will_init: false,
903 12 : len: 0,
904 12 : base_offset: MAX_SUPPORTED_POS.into_u64(),
905 12 : batch_offset: 1,
906 12 : };
907 12 : assert!(IndexEntry::new(too_large).is_err());
908 12 : let too_large = Args {
909 12 : will_init: false,
910 12 : len: 0,
911 12 : base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
912 12 : batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
913 12 : };
914 12 : assert!(IndexEntry::new(too_large).is_err());
915 : }
916 :
917 : // valid special cases
918 : // - area past the max supported pos that is accessible by len
919 36 : for len in [1, MAX_SUPPORTED_BLOB_LEN] {
920 24 : roundtrip(
921 24 : Args {
922 24 : will_init: false,
923 24 : len,
924 24 : base_offset: MAX_SUPPORTED_POS.into_u64(),
925 24 : batch_offset: 0,
926 24 : },
927 24 : Unpacked {
928 24 : will_init: false,
929 24 : len: len as u64,
930 24 : pos: MAX_SUPPORTED_POS.into_u64(),
931 24 : },
932 24 : );
933 24 : roundtrip(
934 24 : Args {
935 24 : will_init: false,
936 24 : len,
937 24 : base_offset: 0,
938 24 : batch_offset: MAX_SUPPORTED_POS.into_u64(),
939 24 : },
940 24 : Unpacked {
941 24 : will_init: false,
942 24 : len: len as u64,
943 24 : pos: MAX_SUPPORTED_POS.into_u64(),
944 24 : },
945 24 : );
946 24 : }
947 12 : }
948 : }