Line data Source code
1 : //! An in-memory layer stores recently received key-value pairs.
2 : //!
3 : //! The "in-memory" part of the name is a bit misleading: the actual page versions are
4 : //! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
5 : //! its position in the file, is kept in memory, though.
6 : //!
7 : use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
8 : use crate::config::PageServerConf;
9 : use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
10 : use crate::repository::{Key, Value};
11 : use crate::tenant::ephemeral_file::EphemeralFile;
12 : use crate::tenant::timeline::GetVectoredError;
13 : use crate::tenant::PageReconstructError;
14 : use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
15 : use crate::{l0_flush, page_cache};
16 : use anyhow::{anyhow, Context, Result};
17 : use bytes::Bytes;
18 : use camino::Utf8PathBuf;
19 : use pageserver_api::key::CompactKey;
20 : use pageserver_api::keyspace::KeySpace;
21 : use pageserver_api::models::InMemoryLayerInfo;
22 : use pageserver_api::shard::TenantShardId;
23 : use std::collections::{BTreeMap, HashMap};
24 : use std::sync::{Arc, OnceLock};
25 : use std::time::Instant;
26 : use tracing::*;
27 : use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
28 : // avoid binding to Write (conflicts with std::io::Write)
29 : // while being able to use std::fmt::Write's methods
30 : use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
31 : use std::cmp::Ordering;
32 : use std::fmt::Write;
33 : use std::ops::Range;
34 : use std::sync::atomic::Ordering as AtomicOrdering;
35 : use std::sync::atomic::{AtomicU64, AtomicUsize};
36 : use tokio::sync::RwLock;
37 :
38 : use super::{
39 : DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
40 : };
41 :
42 : pub(crate) mod vectored_dio_read;
43 :
44 : #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
45 : pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
46 :
47 : pub struct InMemoryLayer {
48 : conf: &'static PageServerConf,
49 : tenant_shard_id: TenantShardId,
50 : timeline_id: TimelineId,
51 : file_id: InMemoryLayerFileId,
52 :
53 : /// This layer contains all the changes from 'start_lsn'. The
54 : /// start is inclusive.
55 : start_lsn: Lsn,
56 :
57 : /// Frozen layers have an exclusive end LSN.
58 : /// Writes are only allowed when this is `None`.
59 : pub(crate) end_lsn: OnceLock<Lsn>,
60 :
61 : /// Used for traversal path. Cached representation of the in-memory layer after frozen.
62 : frozen_local_path_str: OnceLock<Arc<str>>,
63 :
64 : opened_at: Instant,
65 :
66 : /// The above fields never change, except for `end_lsn`, which is only set once.
67 : /// All other changing parts are in `inner`, and protected by a mutex.
68 : inner: RwLock<InMemoryLayerInner>,
69 : }
70 :
71 : impl std::fmt::Debug for InMemoryLayer {
72 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
73 0 : f.debug_struct("InMemoryLayer")
74 0 : .field("start_lsn", &self.start_lsn)
75 0 : .field("end_lsn", &self.end_lsn)
76 0 : .field("inner", &self.inner)
77 0 : .finish()
78 0 : }
79 : }
80 :
81 : pub struct InMemoryLayerInner {
82 : /// All versions of all pages in the layer are kept here. Indexed
83 : /// by block number and LSN. The [`IndexEntry`] is an offset into the
84 : /// ephemeral file where the page version is stored.
85 : index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
86 :
87 : /// The values are stored in a serialized format in this file.
88 : /// Each serialized Value is preceded by a 'u32' length field.
89 : /// PerSeg::page_versions map stores offsets into this file.
90 : file: EphemeralFile,
91 :
92 : resource_units: GlobalResourceUnits,
93 : }
94 :
95 : /// Support the same max blob length as blob_io, because ultimately
96 : /// all the InMemoryLayer contents end up being written into a delta layer,
97 : /// using the [`crate::tenant::blob_io`].
98 : const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
99 : const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
100 : let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
101 : let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
102 : assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
103 : trailing_ones
104 : };
105 :
/// Value type of [`InMemoryLayerInner::index`].
///
/// For memory efficiency, the data is packed into a single `u64`.
///
/// Layout, starting at the least significant bit:
/// - 1 bit: `will_init`
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`] bits: `len`
/// - [`IndexEntry::MAX_SUPPORTED_POS_BITS`] bits: `pos`
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct IndexEntry(u64);
116 :
117 : impl IndexEntry {
118 : /// See [`Self::MAX_SUPPORTED_POS`].
119 : const MAX_SUPPORTED_POS_BITS: usize = {
120 : let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
121 : if remainder < 32 {
122 : panic!("pos can be u32 as per type system, support that");
123 : }
124 : remainder
125 : };
126 : /// The maximum supported blob offset that can be represented by [`Self`].
127 : /// See also [`Self::validate_checkpoint_distance`].
128 : const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
129 :
130 : // Layout
131 : const WILL_INIT_RANGE: Range<usize> = 0..1;
132 : const LEN_RANGE: Range<usize> =
133 : Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
134 : const POS_RANGE: Range<usize> =
135 : Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
136 : const _ASSERT: () = {
137 : if Self::POS_RANGE.end != 64 {
138 : panic!("we don't want undefined bits for our own sanity")
139 : }
140 : };
141 :
142 : /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
143 : ///
144 : /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long.
145 : /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
146 : ///
147 : /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
148 : /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
149 : ///
150 : /// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested)
151 : /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer.
152 : #[inline(always)]
153 30543042 : fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
154 30543042 : let IndexEntryNewArgs {
155 30543042 : base_offset,
156 30543042 : batch_offset,
157 30543042 : len,
158 30543042 : will_init,
159 30543042 : } = arg;
160 :
161 30543042 : let pos = base_offset
162 30543042 : .checked_add(batch_offset)
163 30543042 : .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
164 :
165 30543042 : if pos.into_usize() > Self::MAX_SUPPORTED_POS {
166 24 : anyhow::bail!(
167 24 : "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
168 24 : max = Self::MAX_SUPPORTED_POS
169 24 : );
170 30543018 : }
171 30543018 :
172 30543018 : if len > MAX_SUPPORTED_BLOB_LEN {
173 6 : anyhow::bail!(
174 6 : "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
175 6 : );
176 30543012 : }
177 30543012 :
178 30543012 : let mut data: u64 = 0;
179 : use bit_field::BitField;
180 30543012 : data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
181 30543012 : data.set_bits(Self::LEN_RANGE, len.into_u64());
182 30543012 : data.set_bits(Self::POS_RANGE, pos);
183 30543012 :
184 30543012 : Ok(Self(data))
185 30543042 : }
186 :
187 : #[inline(always)]
188 29924237 : fn unpack(&self) -> IndexEntryUnpacked {
189 : use bit_field::BitField;
190 29924237 : IndexEntryUnpacked {
191 29924237 : will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
192 29924237 : len: self.0.get_bits(Self::LEN_RANGE),
193 29924237 : pos: self.0.get_bits(Self::POS_RANGE),
194 29924237 : }
195 29924237 : }
196 :
197 : /// See [`Self::new`].
198 612 : pub(crate) const fn validate_checkpoint_distance(
199 612 : checkpoint_distance: u64,
200 612 : ) -> Result<(), &'static str> {
201 612 : if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
202 0 : return Err("exceeds the maximum supported value");
203 612 : }
204 612 : let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
205 612 : if res.is_none() {
206 0 : return Err(
207 0 : "checkpoint distance + max supported blob len overflows in-memory addition",
208 0 : );
209 612 : }
210 612 :
211 612 : // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
212 612 :
213 612 : Ok(())
214 612 : }
215 :
216 : const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
217 : let res = Self::validate_checkpoint_distance(
218 : pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
219 : );
220 : if res.is_err() {
221 : panic!("default checkpoint distance is valid")
222 : }
223 : };
224 : }
225 :
/// Args to [`IndexEntry::new`].
#[derive(Copy, Clone)]
struct IndexEntryNewArgs {
    /// Added to `batch_offset` to obtain the position stored in the entry.
    base_offset: u64,
    /// Added to `base_offset` to obtain the position stored in the entry.
    batch_offset: u64,
    /// Length of the value, in bytes.
    len: usize,
    /// Stored verbatim in the entry's `will_init` bit.
    will_init: bool,
}
234 :
/// Unpacked representation of the bitfielded [`IndexEntry`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct IndexEntryUnpacked {
    /// The entry's `will_init` bit.
    will_init: bool,
    /// Length of the value, in bytes.
    len: u64,
    /// Position of the value.
    pos: u64,
}
242 :
243 : impl std::fmt::Debug for InMemoryLayerInner {
244 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
245 0 : f.debug_struct("InMemoryLayerInner").finish()
246 0 : }
247 : }
248 :
/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
/// to minimize contention.
///
/// This global state is used to implement behaviors that require a global view of the system, e.g.
/// rolling layers proactively to limit the total amount of dirty data.
pub(crate) struct GlobalResources {
    /// Limit on how high `dirty_bytes` may grow before we start freezing layers to reduce it.
    /// Zero means unlimited.
    pub(crate) max_dirty_bytes: AtomicU64,
    /// How many bytes are in all EphemeralFile objects.
    dirty_bytes: AtomicU64,
    /// How many layers are contributing to `dirty_bytes`.
    dirty_layers: AtomicUsize,
}
263 :
/// Per-layer RAII guard for that layer's contribution to [`GlobalResources`].
struct GlobalResourceUnits {
    /// How many dirty bytes this guard has added to the global `dirty_bytes`; the guard is
    /// responsible for decrementing the global counter by this many bytes when dropped.
    dirty_bytes: u64,
}
270 :
271 : impl GlobalResourceUnits {
272 : // Hint for the layer append path to update us when the layer size differs from the last
273 : // call to update_size by this much. If we don't reach this threshold, we'll still get
274 : // updated when the Timeline "ticks" in the background.
275 : const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
276 :
277 3810 : fn new() -> Self {
278 3810 : GLOBAL_RESOURCES
279 3810 : .dirty_layers
280 3810 : .fetch_add(1, AtomicOrdering::Relaxed);
281 3810 : Self { dirty_bytes: 0 }
282 3810 : }
283 :
284 : /// Do not call this frequently: all timelines will write to these same global atomics,
285 : /// so this is a relatively expensive operation. Wait at least a few seconds between calls.
286 : ///
287 : /// Returns the effective layer size limit that should be applied, if any, to keep
288 : /// the total number of dirty bytes below the configured maximum.
289 3450 : fn publish_size(&mut self, size: u64) -> Option<u64> {
290 3450 : let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
291 3420 : Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
292 : Ordering::Greater => {
293 24 : let delta = size - self.dirty_bytes;
294 24 : let old = GLOBAL_RESOURCES
295 24 : .dirty_bytes
296 24 : .fetch_add(delta, AtomicOrdering::Relaxed);
297 24 : old + delta
298 : }
299 : Ordering::Less => {
300 6 : let delta = self.dirty_bytes - size;
301 6 : let old = GLOBAL_RESOURCES
302 6 : .dirty_bytes
303 6 : .fetch_sub(delta, AtomicOrdering::Relaxed);
304 6 : old - delta
305 : }
306 : };
307 :
308 : // This is a sloppy update: concurrent updates to the counter will race, and the exact
309 : // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
310 : // That's okay: as long as the metric contains some recent value, it doesn't have to always
311 : // be literally the last update.
312 3450 : TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
313 3450 :
314 3450 : self.dirty_bytes = size;
315 3450 :
316 3450 : let max_dirty_bytes = GLOBAL_RESOURCES
317 3450 : .max_dirty_bytes
318 3450 : .load(AtomicOrdering::Relaxed);
319 3450 : if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
320 : // Set the layer file limit to the average layer size: this implies that all above-average
321 : // sized layers will be elegible for freezing. They will be frozen in the order they
322 : // next enter publish_size.
323 0 : Some(
324 0 : new_global_dirty_bytes
325 0 : / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
326 0 : )
327 : } else {
328 3450 : None
329 : }
330 3450 : }
331 :
332 : // Call publish_size if the input size differs from last published size by more than
333 : // the drift limit
334 14412642 : fn maybe_publish_size(&mut self, size: u64) {
335 14412642 : let publish = match size.cmp(&self.dirty_bytes) {
336 0 : Ordering::Equal => false,
337 14412642 : Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
338 0 : Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
339 : };
340 :
341 14412642 : if publish {
342 24 : self.publish_size(size);
343 14412618 : }
344 14412642 : }
345 : }
346 :
347 : impl Drop for GlobalResourceUnits {
348 3426 : fn drop(&mut self) {
349 3426 : GLOBAL_RESOURCES
350 3426 : .dirty_layers
351 3426 : .fetch_sub(1, AtomicOrdering::Relaxed);
352 3426 :
353 3426 : // Subtract our contribution to the global total dirty bytes
354 3426 : self.publish_size(0);
355 3426 : }
356 : }
357 :
358 : pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
359 : max_dirty_bytes: AtomicU64::new(0),
360 : dirty_bytes: AtomicU64::new(0),
361 : dirty_layers: AtomicUsize::new(0),
362 : };
363 :
364 : impl InMemoryLayer {
365 1819412 : pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
366 1819412 : self.file_id
367 1819412 : }
368 :
369 3420 : pub(crate) fn get_timeline_id(&self) -> TimelineId {
370 3420 : self.timeline_id
371 3420 : }
372 :
373 0 : pub(crate) fn info(&self) -> InMemoryLayerInfo {
374 0 : let lsn_start = self.start_lsn;
375 :
376 0 : if let Some(&lsn_end) = self.end_lsn.get() {
377 0 : InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
378 : } else {
379 0 : InMemoryLayerInfo::Open { lsn_start }
380 : }
381 0 : }
382 :
383 0 : pub(crate) fn try_len(&self) -> Option<u64> {
384 0 : self.inner.try_read().map(|i| i.file.len()).ok()
385 0 : }
386 :
387 14412642 : pub(crate) fn assert_writable(&self) {
388 14412642 : assert!(self.end_lsn.get().is_none());
389 14412642 : }
390 :
391 4564605 : pub(crate) fn end_lsn_or_max(&self) -> Lsn {
392 4564605 : self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
393 4564605 : }
394 :
395 4561185 : pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
396 4561185 : self.start_lsn..self.end_lsn_or_max()
397 4561185 : }
398 :
399 : /// debugging function to print out the contents of the layer
400 : ///
401 : /// this is likely completly unused
402 0 : pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
403 0 : let end_str = self.end_lsn_or_max();
404 0 :
405 0 : println!(
406 0 : "----- in-memory layer for tli {} LSNs {}-{} ----",
407 0 : self.timeline_id, self.start_lsn, end_str,
408 0 : );
409 0 :
410 0 : Ok(())
411 0 : }
412 :
413 : // Look up the keys in the provided keyspace and update
414 : // the reconstruct state with whatever is found.
415 : //
416 : // If the key is cached, go no further than the cached Lsn.
417 1819412 : pub(crate) async fn get_values_reconstruct_data(
418 1819412 : &self,
419 1819412 : keyspace: KeySpace,
420 1819412 : end_lsn: Lsn,
421 1819412 : reconstruct_state: &mut ValuesReconstructState,
422 1819412 : ctx: &RequestContext,
423 1819412 : ) -> Result<(), GetVectoredError> {
424 1819412 : let ctx = RequestContextBuilder::extend(ctx)
425 1819412 : .page_content_kind(PageContentKind::InMemoryLayer)
426 1819412 : .build();
427 :
428 1819412 : let inner = self.inner.read().await;
429 :
430 : struct ValueRead {
431 : entry_lsn: Lsn,
432 : read: vectored_dio_read::LogicalRead<Vec<u8>>,
433 : }
434 1819412 : let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
435 :
436 1819412 : for range in keyspace.ranges.iter() {
437 1819412 : for (key, vec_map) in inner
438 1819412 : .index
439 1819412 : .range(range.start.to_compact()..range.end.to_compact())
440 : {
441 1496129 : let key = Key::from_compact(*key);
442 1496129 : let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
443 0 : Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
444 1496129 : None => self.start_lsn..end_lsn,
445 : };
446 :
447 1496129 : let slice = vec_map.slice_range(lsn_range);
448 :
449 1496129 : for (entry_lsn, index_entry) in slice.iter().rev() {
450 : let IndexEntryUnpacked {
451 1496123 : pos,
452 1496123 : len,
453 1496123 : will_init,
454 1496123 : } = index_entry.unpack();
455 1496123 : reads.entry(key).or_default().push(ValueRead {
456 1496123 : entry_lsn: *entry_lsn,
457 1496123 : read: vectored_dio_read::LogicalRead::new(
458 1496123 : pos,
459 1496123 : Vec::with_capacity(len as usize),
460 1496123 : ),
461 1496123 : });
462 1496123 : if will_init {
463 1496111 : break;
464 12 : }
465 : }
466 : }
467 : }
468 :
469 : // Execute the reads.
470 :
471 1819412 : let f = vectored_dio_read::execute(
472 1819412 : &inner.file,
473 1819412 : reads
474 1819412 : .iter()
475 1819412 : .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
476 1819412 : &ctx,
477 1819412 : );
478 1819412 : send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
479 256243 : .await;
480 :
481 : // Process results into the reconstruct state
482 3315529 : 'next_key: for (key, value_reads) in reads {
483 1496129 : for ValueRead { entry_lsn, read } in value_reads {
484 1496123 : match read.into_result().expect("we run execute() above") {
485 0 : Err(e) => {
486 0 : reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
487 0 : continue 'next_key;
488 : }
489 1496123 : Ok(value_buf) => {
490 1496123 : let value = Value::des(&value_buf);
491 1496123 : if let Err(e) = value {
492 0 : reconstruct_state
493 0 : .on_key_error(key, PageReconstructError::from(anyhow!(e)));
494 0 : continue 'next_key;
495 1496123 : }
496 1496123 :
497 1496123 : let key_situation =
498 1496123 : reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
499 1496123 : if key_situation == ValueReconstructSituation::Complete {
500 : // TODO: metric to see if we fetched more values than necessary
501 1496111 : continue 'next_key;
502 12 : }
503 :
504 : // process the next value in the next iteration of the loop
505 : }
506 : }
507 : }
508 : }
509 :
510 1819412 : reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
511 1819412 :
512 1819412 : Ok(())
513 1819412 : }
514 : }
515 :
516 : /// Offset of a particular Value within a serialized batch.
517 : struct SerializedBatchOffset {
518 : key: CompactKey,
519 : lsn: Lsn,
520 : // TODO: separate type when we start serde-serializing this value, to avoid coupling
521 : // in-memory representation to serialization format.
522 : index_entry: IndexEntry,
523 : }
524 :
525 : pub struct SerializedBatch {
526 : /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
527 : pub(crate) raw: Vec<u8>,
528 :
529 : /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
530 : offsets: Vec<SerializedBatchOffset>,
531 :
532 : /// The highest LSN of any value in the batch
533 : pub(crate) max_lsn: Lsn,
534 : }
535 :
536 : impl SerializedBatch {
537 14412642 : pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
538 14412642 : // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
539 14412642 : // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
540 15271446 : let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
541 14412642 : let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
542 14412642 :
543 14412642 : let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
544 14412642 : let mut max_lsn: Lsn = Lsn(0);
545 29684088 : for (key, lsn, val_ser_size, val) in batch {
546 15271446 : let relative_off = cursor.position();
547 15271446 :
548 15271446 : val.ser_into(&mut cursor)
549 15271446 : .expect("Writing into in-memory buffer is infallible");
550 15271446 :
551 15271446 : offsets.push(SerializedBatchOffset {
552 15271446 : key,
553 15271446 : lsn,
554 15271446 : index_entry: IndexEntry::new(IndexEntryNewArgs {
555 15271446 : base_offset: 0,
556 15271446 : batch_offset: relative_off,
557 15271446 : len: val_ser_size,
558 15271446 : will_init: val.will_init(),
559 15271446 : })
560 15271446 : .context("higher-level code ensures that values are within supported ranges")?,
561 : });
562 15271446 : max_lsn = std::cmp::max(max_lsn, lsn);
563 : }
564 :
565 14412642 : let buffer = cursor.into_inner();
566 14412642 :
567 14412642 : // Assert that we didn't do any extra allocations while building buffer.
568 14412642 : debug_assert!(buffer.len() <= buffer_size);
569 :
570 14412642 : Ok(Self {
571 14412642 : raw: buffer,
572 14412642 : offsets,
573 14412642 : max_lsn,
574 14412642 : })
575 14412642 : }
576 : }
577 :
578 6840 : fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
579 6840 : write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
580 6840 : }
581 :
582 3420 : fn inmem_layer_log_display(
583 3420 : mut f: impl Write,
584 3420 : timeline: TimelineId,
585 3420 : start_lsn: Lsn,
586 3420 : end_lsn: Lsn,
587 3420 : ) -> std::fmt::Result {
588 3420 : write!(f, "timeline {} in-memory ", timeline)?;
589 3420 : inmem_layer_display(f, start_lsn, end_lsn)
590 3420 : }
591 :
592 : impl std::fmt::Display for InMemoryLayer {
593 3420 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
594 3420 : let end_lsn = self.end_lsn_or_max();
595 3420 : inmem_layer_display(f, self.start_lsn, end_lsn)
596 3420 : }
597 : }
598 :
599 : impl InMemoryLayer {
600 : /// Get layer size.
601 3810 : pub async fn size(&self) -> Result<u64> {
602 3810 : let inner = self.inner.read().await;
603 3810 : Ok(inner.file.len())
604 3810 : }
605 :
606 : /// Create a new, empty, in-memory layer
607 3810 : pub async fn create(
608 3810 : conf: &'static PageServerConf,
609 3810 : timeline_id: TimelineId,
610 3810 : tenant_shard_id: TenantShardId,
611 3810 : start_lsn: Lsn,
612 3810 : gate_guard: utils::sync::gate::GateGuard,
613 3810 : ctx: &RequestContext,
614 3810 : ) -> Result<InMemoryLayer> {
615 3810 : trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
616 :
617 3810 : let file =
618 3810 : EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
619 3810 : let key = InMemoryLayerFileId(file.page_cache_file_id());
620 3810 :
621 3810 : Ok(InMemoryLayer {
622 3810 : file_id: key,
623 3810 : frozen_local_path_str: OnceLock::new(),
624 3810 : conf,
625 3810 : timeline_id,
626 3810 : tenant_shard_id,
627 3810 : start_lsn,
628 3810 : end_lsn: OnceLock::new(),
629 3810 : opened_at: Instant::now(),
630 3810 : inner: RwLock::new(InMemoryLayerInner {
631 3810 : index: BTreeMap::new(),
632 3810 : file,
633 3810 : resource_units: GlobalResourceUnits::new(),
634 3810 : }),
635 3810 : })
636 3810 : }
637 :
638 : /// Write path.
639 : ///
640 : /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
641 : /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
642 : /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
643 14412642 : pub async fn put_batch(
644 14412642 : &self,
645 14412642 : serialized_batch: SerializedBatch,
646 14412642 : ctx: &RequestContext,
647 14412642 : ) -> anyhow::Result<()> {
648 14412642 : let mut inner = self.inner.write().await;
649 14412642 : self.assert_writable();
650 14412642 :
651 14412642 : let base_offset = inner.file.len();
652 14412642 :
653 14412642 : let SerializedBatch {
654 14412642 : raw,
655 14412642 : mut offsets,
656 14412642 : max_lsn: _,
657 14412642 : } = serialized_batch;
658 :
659 : // Add the base_offset to the batch's index entries which are relative to the batch start.
660 29684088 : for offset in &mut offsets {
661 : let IndexEntryUnpacked {
662 15271446 : will_init,
663 15271446 : len,
664 15271446 : pos,
665 15271446 : } = offset.index_entry.unpack();
666 15271446 : offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
667 15271446 : base_offset,
668 15271446 : batch_offset: pos,
669 15271446 : len: len.into_usize(),
670 15271446 : will_init,
671 15271446 : })?;
672 : }
673 :
674 : // Write the batch to the file
675 14412642 : inner.file.write_raw(&raw, ctx).await?;
676 14412642 : let new_size = inner.file.len();
677 14412642 : let expected_new_len = base_offset
678 14412642 : .checked_add(raw.len().into_u64())
679 14412642 : // write_raw would error if we were to overflow u64.
680 14412642 : // also IndexEntry and higher levels in
681 14412642 : //the code don't allow the file to grow that large
682 14412642 : .unwrap();
683 14412642 : assert_eq!(new_size, expected_new_len);
684 :
685 : // Update the index with the new entries
686 : for SerializedBatchOffset {
687 15271446 : key,
688 15271446 : lsn,
689 15271446 : index_entry,
690 29684088 : } in offsets
691 : {
692 15271446 : let vec_map = inner.index.entry(key).or_default();
693 15271446 : let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
694 15271446 : if old.is_some() {
695 : // This should not break anything, but is unexpected: ingestion code aims to filter out
696 : // multiple writes to the same key at the same LSN. This happens in cases where our
697 : // ingenstion code generates some write like an empty page, and we see a write from postgres
698 : // to the same key in the same wal record. If one such write makes it through, we
699 : // index the most recent write, implicitly ignoring the earlier write. We log a warning
700 : // because this case is unexpected, and we would like tests to fail if this happens.
701 0 : warn!("Key {} at {} written twice at same LSN", key, lsn);
702 15271446 : }
703 : }
704 :
705 14412642 : inner.resource_units.maybe_publish_size(new_size);
706 14412642 :
707 14412642 : Ok(())
708 14412642 : }
709 :
710 14409060 : pub(crate) fn get_opened_at(&self) -> Instant {
711 14409060 : self.opened_at
712 14409060 : }
713 :
714 0 : pub(crate) async fn tick(&self) -> Option<u64> {
715 0 : let mut inner = self.inner.write().await;
716 0 : let size = inner.file.len();
717 0 : inner.resource_units.publish_size(size)
718 0 : }
719 :
720 6 : pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
721 6 : // TODO: Currently, we just leak the storage for any deleted keys
722 6 : Ok(())
723 6 : }
724 :
725 : /// Records the end_lsn for non-dropped layers.
726 : /// `end_lsn` is exclusive
727 3420 : pub async fn freeze(&self, end_lsn: Lsn) {
728 3420 : assert!(
729 3420 : self.start_lsn < end_lsn,
730 0 : "{} >= {}",
731 : self.start_lsn,
732 : end_lsn
733 : );
734 3420 : self.end_lsn.set(end_lsn).expect("end_lsn set only once");
735 3420 :
736 3420 : self.frozen_local_path_str
737 3420 : .set({
738 3420 : let mut buf = String::new();
739 3420 : inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
740 3420 : .unwrap();
741 3420 : buf.into()
742 3420 : })
743 3420 : .expect("frozen_local_path_str set only once");
744 :
745 : #[cfg(debug_assertions)]
746 : {
747 3420 : let inner = self.inner.write().await;
748 12766881 : for vec_map in inner.index.values() {
749 13160172 : for (lsn, _) in vec_map.as_slice() {
750 13160172 : assert!(*lsn < end_lsn);
751 : }
752 : }
753 : }
754 3420 : }
755 :
756 : /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
757 : /// layer will only contain the key range the user specifies, and may return `None`
758 : /// if there are no matching keys.
759 : ///
760 : /// Returns a new delta layer with all the same data as this in-memory layer
761 2904 : pub async fn write_to_disk(
762 2904 : &self,
763 2904 : ctx: &RequestContext,
764 2904 : key_range: Option<Range<Key>>,
765 2904 : l0_flush_global_state: &l0_flush::Inner,
766 2904 : ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
767 : // Grab the lock in read-mode. We hold it over the I/O, but because this
768 : // layer is not writeable anymore, no one should be trying to acquire the
769 : // write lock on it, so we shouldn't block anyone. There's one exception
770 : // though: another thread might have grabbed a reference to this layer
771 : // in `get_layer_for_write' just before the checkpointer called
772 : // `freeze`, and then `write_to_disk` on it. When the thread gets the
773 : // lock, it will see that it's not writeable anymore and retry, but it
774 : // would have to wait until we release it. That race condition is very
775 : // rare though, so we just accept the potential latency hit for now.
776 2904 : let inner = self.inner.read().await;
777 :
778 : use l0_flush::Inner;
779 2904 : let _concurrency_permit = match l0_flush_global_state {
780 2904 : Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
781 : };
782 :
783 2904 : let end_lsn = *self.end_lsn.get().unwrap();
784 :
785 2904 : let key_count = if let Some(key_range) = key_range {
786 0 : let key_range = key_range.start.to_compact()..key_range.end.to_compact();
787 0 :
788 0 : inner
789 0 : .index
790 0 : .iter()
791 0 : .filter(|(k, _)| key_range.contains(k))
792 0 : .count()
793 : } else {
794 2904 : inner.index.len()
795 : };
796 2904 : if key_count == 0 {
797 0 : return Ok(None);
798 2904 : }
799 :
800 2904 : let mut delta_layer_writer = DeltaLayerWriter::new(
801 2904 : self.conf,
802 2904 : self.timeline_id,
803 2904 : self.tenant_shard_id,
804 2904 : Key::MIN,
805 2904 : self.start_lsn..end_lsn,
806 2904 : ctx,
807 2904 : )
808 1492 : .await?;
809 :
810 2904 : match l0_flush_global_state {
811 : l0_flush::Inner::Direct { .. } => {
812 2904 : let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
813 :
814 2904 : let file_contents = Bytes::from(file_contents);
815 :
816 12763257 : for (key, vec_map) in inner.index.iter() {
817 : // Write all page versions
818 13156548 : for (lsn, entry) in vec_map
819 12763257 : .as_slice()
820 12763257 : .iter()
821 13156548 : .map(|(lsn, entry)| (lsn, entry.unpack()))
822 : {
823 : let IndexEntryUnpacked {
824 13156548 : pos,
825 13156548 : len,
826 13156548 : will_init,
827 13156548 : } = entry;
828 13156548 : let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
829 13156548 : let (_buf, res) = delta_layer_writer
830 13156548 : .put_value_bytes(
831 13156548 : Key::from_compact(*key),
832 13156548 : *lsn,
833 13156548 : buf.slice_len(),
834 13156548 : will_init,
835 13156548 : ctx,
836 13156548 : )
837 8193 : .await;
838 13156548 : res?;
839 : }
840 : }
841 : }
842 : }
843 :
844 : // MAX is used here because we identify L0 layers by full key range
845 19916 : let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
846 :
847 : // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
848 : //
849 : // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
850 : // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
851 : // Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
852 : //
853 : // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
854 : // we dirtied when writing to the filesystem have been flushed and marked !dirty.
855 2904 : drop(_concurrency_permit);
856 2904 :
857 2904 : Ok(Some((desc, path)))
858 2904 : }
859 : }
860 :
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `IndexEntry::new` followed by `unpack` returns the values that were
    /// packed, and that `IndexEntry::new` rejects out-of-range lengths and positions.
    #[test]
    fn test_index_entry() {
        const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
        use IndexEntryNewArgs as Args;
        use IndexEntryUnpacked as Unpacked;

        // Build an entry from `args`, unpack it again, and compare field by field
        // against `expect`.
        let roundtrip = |args: Args, expect: Unpacked| {
            let entry = IndexEntry::new(args).expect("this tests expects no errors");
            let unpacked = entry.unpack();
            assert_eq!(unpacked.will_init, expect.will_init);
            assert_eq!(unpacked.len, expect.len);
            assert_eq!(unpacked.pos, expect.pos);
        };

        let max_pos = MAX_SUPPORTED_POS.into_u64();

        // basic roundtrip
        for pos in [0, MAX_SUPPORTED_POS] {
            let pos = pos.into_u64();
            for len in [0, MAX_SUPPORTED_BLOB_LEN] {
                for will_init in [true, false] {
                    // The same position is supplied once through `base_offset` and
                    // once through `batch_offset`.
                    for (base_offset, batch_offset) in [(pos, 0), (0, pos)] {
                        roundtrip(
                            Args {
                                will_init,
                                base_offset,
                                batch_offset,
                                len,
                            },
                            Unpacked {
                                will_init,
                                len: len.into_u64(),
                                pos,
                            },
                        );
                    }
                }
            }
        }

        // Every entry in this table must be rejected by `IndexEntry::new`.
        let rejected = [
            // too-large len
            Args {
                will_init: false,
                len: MAX_SUPPORTED_BLOB_LEN + 1,
                base_offset: 0,
                batch_offset: 0,
            },
            // too-large pos, passed as base_offset
            Args {
                will_init: false,
                len: 0,
                base_offset: max_pos + 1,
                batch_offset: 0,
            },
            // too-large pos, passed as batch_offset
            Args {
                will_init: false,
                len: 0,
                base_offset: 0,
                batch_offset: max_pos + 1,
            },
            // too large (base_offset + batch_offset): just over the limit
            Args {
                will_init: false,
                len: 0,
                base_offset: max_pos,
                batch_offset: 1,
            },
            // too large (base_offset + batch_offset): each offset is valid on its own
            Args {
                will_init: false,
                len: 0,
                base_offset: max_pos - 1,
                batch_offset: max_pos - 1,
            },
        ];
        for args in rejected {
            assert!(IndexEntry::new(args).is_err());
        }

        // valid special cases
        // - area past the max supported pos that is accessible by len
        for len in [1, MAX_SUPPORTED_BLOB_LEN] {
            for (base_offset, batch_offset) in [(max_pos, 0), (0, max_pos)] {
                roundtrip(
                    Args {
                        will_init: false,
                        len,
                        base_offset,
                        batch_offset,
                    },
                    Unpacked {
                        will_init: false,
                        len: len.into_u64(),
                        pos: max_pos,
                    },
                );
            }
        }
    }
}
|