Line data Source code
1 : //!
2 : //! The layer map tracks what layers exist in a timeline.
3 : //!
4 : //! When the timeline is first accessed, the server lists of all layer files
5 : //! in the timelines/<timeline_id> directory, and populates this map with
6 : //! ImageLayer and DeltaLayer structs corresponding to each file. When the first
7 : //! new WAL record is received, we create an InMemoryLayer to hold the incoming
8 : //! records. Now and then, in the checkpoint() function, the in-memory layer is
9 : //! are frozen, and it is split up into new image and delta layers and the
10 : //! corresponding files are written to disk.
11 : //!
12 : //! Design overview:
13 : //!
14 : //! The `search` method of the layer map is on the read critical path, so we've
15 : //! built an efficient data structure for fast reads, stored in `LayerMap::historic`.
16 : //! Other read methods are less critical but still impact performance of background tasks.
17 : //!
18 : //! This data structure relies on a persistent/immutable binary search tree. See the
19 : //! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
20 : //! Summary: A persistent/immutable BST (and persistent data structures in general) allows
21 : //! you to modify the tree in such a way that each modification creates a new "version"
22 : //! of the tree. When you modify it, you get a new version, but all previous versions are
23 : //! still accessible too. So if someone is still holding a reference to an older version,
24 : //! they continue to see the tree as it was then. The persistent BST stores all the
25 : //! different versions in an efficient way.
26 : //!
27 : //! Our persistent BST maintains a map of which layer file "covers" each key. It has only
28 : //! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property
29 : //! to handle the LSN dimension.
30 : //!
31 : //! To build the layer map, we insert each layer to the persistent BST in LSN.start order,
32 : //! starting from the oldest one. After each insertion, we grab a reference to that "version"
33 : //! of the tree, and store it in another tree, a BtreeMap keyed by the LSN. See
34 : //! `historic_layer_coverage.rs`.
35 : //!
36 : //! To search for a particular key-LSN pair, you first look up the right "version" in the
37 : //! BTreeMap. Then you search that version of the BST with the key.
38 : //!
39 : //! The persistent BST keeps all the versions, but there is no way to change the old versions
40 : //! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
41 : //! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
42 : //! to throw away most of the persistent BST and build a new one, starting from the oldest
43 : //! LSN. See [`LayerMap::flush_updates()`].
44 : //!
45 :
46 : mod historic_layer_coverage;
47 : mod layer_coverage;
48 :
49 : use crate::context::RequestContext;
50 : use crate::keyspace::KeyPartitioning;
51 : use crate::repository::Key;
52 : use crate::tenant::storage_layer::InMemoryLayer;
53 : use anyhow::Result;
54 : use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
55 : use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
56 : use std::collections::{HashMap, VecDeque};
57 : use std::iter::Peekable;
58 : use std::ops::Range;
59 : use std::sync::Arc;
60 : use utils::lsn::Lsn;
61 :
62 : use historic_layer_coverage::BufferedHistoricLayerCoverage;
63 : pub use historic_layer_coverage::LayerKey;
64 :
65 : use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
66 :
67 : ///
68 : /// LayerMap tracks what layers exist on a timeline.
69 : ///
70 : #[derive(Default)]
71 : pub struct LayerMap {
72 : //
73 : // 'open_layer' holds the current InMemoryLayer that is accepting new
74 : // records. If it is None, 'next_open_layer_at' will be set instead, indicating
75 : // where the start LSN of the next InMemoryLayer that is to be created.
76 : //
77 : pub open_layer: Option<Arc<InMemoryLayer>>,
78 : pub next_open_layer_at: Option<Lsn>,
79 :
80 : ///
81 : /// Frozen layers, if any. Frozen layers are in-memory layers that
82 : /// are no longer added to, but haven't been written out to disk
83 : /// yet. They contain WAL older than the current 'open_layer' or
84 : /// 'next_open_layer_at', but newer than any historic layer.
85 : /// The frozen layers are in order from oldest to newest, so that
86 : /// the newest one is in the 'back' of the VecDeque, and the oldest
87 : /// in the 'front'.
88 : ///
89 : pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
90 :
91 : /// Index of the historic layers optimized for search
92 : historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,
93 :
94 : /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
95 : /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
96 : l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
97 : }
98 :
99 : /// The primary update API for the layer map.
100 : ///
101 : /// Batching historic layer insertions and removals is good for
102 : /// performance and this struct helps us do that correctly.
103 : #[must_use]
104 : pub struct BatchedUpdates<'a> {
105 : // While we hold this exclusive reference to the layer map the type checker
106 : // will prevent us from accidentally reading any unflushed updates.
107 : layer_map: &'a mut LayerMap,
108 : }
109 :
110 : /// Provide ability to batch more updates while hiding the read
111 : /// API so we don't accidentally read without flushing.
112 : impl BatchedUpdates<'_> {
113 : ///
114 : /// Insert an on-disk layer.
115 : ///
116 : // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
117 14376 : pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
118 14376 : self.layer_map.insert_historic_noflush(layer_desc)
119 14376 : }
120 :
121 : ///
122 : /// Remove an on-disk layer from the map.
123 : ///
124 : /// This should be called when the corresponding file on disk has been deleted.
125 : ///
126 1398 : pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
127 1398 : self.layer_map.remove_historic_noflush(layer_desc)
128 1398 : }
129 :
130 : // We will flush on drop anyway, but this method makes it
131 : // more explicit that there is some work being done.
132 : /// Apply all updates
133 5598 : pub fn flush(self) {
134 5598 : // Flush happens on drop
135 5598 : }
136 : }
137 :
138 : // Ideally the flush() method should be called explicitly for more
139 : // controlled execution. But if we forget we'd rather flush on drop
140 : // than panic later or read without flushing.
141 : //
142 : // TODO maybe warn if flush hasn't explicitly been called
143 : impl Drop for BatchedUpdates<'_> {
144 5598 : fn drop(&mut self) {
145 5598 : self.layer_map.flush_updates();
146 5598 : }
147 : }
148 :
149 : /// Return value of LayerMap::search
150 : #[derive(Eq, PartialEq, Debug, Hash)]
151 : pub struct SearchResult {
152 : pub layer: Arc<PersistentLayerDesc>,
153 : pub lsn_floor: Lsn,
154 : }
155 :
156 : /// Return value of [`LayerMap::range_search`]
157 : ///
158 : /// Contains a mapping from a layer description to a keyspace
159 : /// accumulator that contains all the keys which intersect the layer
160 : /// from the original search space. Keys that were not found are accumulated
161 : /// in a separate key space accumulator.
162 : #[derive(Debug)]
163 : pub struct RangeSearchResult {
164 : pub found: HashMap<SearchResult, KeySpaceAccum>,
165 : pub not_found: KeySpaceAccum,
166 : }
167 :
168 : impl RangeSearchResult {
169 1547749 : fn new() -> Self {
170 1547749 : Self {
171 1547749 : found: HashMap::new(),
172 1547749 : not_found: KeySpaceAccum::new(),
173 1547749 : }
174 1547749 : }
175 : }
176 :
177 : /// Collector for results of range search queries on the LayerMap.
178 : /// It should be provided with two iterators for the delta and image coverage
179 : /// that contain all the changes for layers which intersect the range.
180 : struct RangeSearchCollector<Iter>
181 : where
182 : Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
183 : {
184 : delta_coverage: Peekable<Iter>,
185 : image_coverage: Peekable<Iter>,
186 : key_range: Range<Key>,
187 : end_lsn: Lsn,
188 :
189 : current_delta: Option<Arc<PersistentLayerDesc>>,
190 : current_image: Option<Arc<PersistentLayerDesc>>,
191 :
192 : result: RangeSearchResult,
193 : }
194 :
195 : #[derive(Debug)]
196 : enum NextLayerType {
197 : Delta(i128),
198 : Image(i128),
199 : Both(i128),
200 : }
201 :
202 : impl NextLayerType {
203 1695996 : fn next_change_at_key(&self) -> Key {
204 1695996 : match self {
205 516350 : NextLayerType::Delta(at) => Key::from_i128(*at),
206 14142 : NextLayerType::Image(at) => Key::from_i128(*at),
207 1165504 : NextLayerType::Both(at) => Key::from_i128(*at),
208 : }
209 1695996 : }
210 : }
211 :
212 : impl<Iter> RangeSearchCollector<Iter>
213 : where
214 : Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
215 : {
216 862084 : fn new(
217 862084 : key_range: Range<Key>,
218 862084 : end_lsn: Lsn,
219 862084 : delta_coverage: Iter,
220 862084 : image_coverage: Iter,
221 862084 : ) -> Self {
222 862084 : Self {
223 862084 : delta_coverage: delta_coverage.peekable(),
224 862084 : image_coverage: image_coverage.peekable(),
225 862084 : key_range,
226 862084 : end_lsn,
227 862084 : current_delta: None,
228 862084 : current_image: None,
229 862084 : result: RangeSearchResult::new(),
230 862084 : }
231 862084 : }
232 :
233 : /// Run the collector. Collection is implemented via a two pointer algorithm.
234 : /// One pointer tracks the start of the current range and the other tracks
235 : /// the beginning of the next range which will overlap with the next change
236 : /// in coverage across both image and delta.
237 862084 : fn collect(mut self) -> RangeSearchResult {
238 862084 : let next_layer_type = self.choose_next_layer_type();
239 837280 : let mut current_range_start = match next_layer_type {
240 : None => {
241 : // No changes for the range
242 24804 : self.pad_range(self.key_range.clone());
243 24804 : return self.result;
244 : }
245 837280 : Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
246 0 : // Changes only after the end of the range
247 0 : self.pad_range(self.key_range.clone());
248 0 : return self.result;
249 : }
250 837280 : Some(layer_type) => {
251 837280 : // Changes for the range exist. Record anything before the first
252 837280 : // coverage change as not found.
253 837280 : let coverage_start = layer_type.next_change_at_key();
254 837280 : let range_before = self.key_range.start..coverage_start;
255 837280 : self.pad_range(range_before);
256 837280 :
257 837280 : self.advance(&layer_type);
258 837280 : coverage_start
259 : }
260 : };
261 :
262 1695996 : while current_range_start < self.key_range.end {
263 858716 : let next_layer_type = self.choose_next_layer_type();
264 858716 : match next_layer_type {
265 21436 : Some(t) => {
266 21436 : let current_range_end = t.next_change_at_key();
267 21436 : self.add_range(current_range_start..current_range_end);
268 21436 : current_range_start = current_range_end;
269 21436 :
270 21436 : self.advance(&t);
271 21436 : }
272 837280 : None => {
273 837280 : self.add_range(current_range_start..self.key_range.end);
274 837280 : current_range_start = self.key_range.end;
275 837280 : }
276 : }
277 : }
278 :
279 837280 : self.result
280 862084 : }
281 :
282 : /// Mark a range as not found (i.e. no layers intersect it)
283 864922 : fn pad_range(&mut self, key_range: Range<Key>) {
284 864922 : if !key_range.is_empty() {
285 30594 : self.result.not_found.add_range(key_range);
286 834328 : }
287 864922 : }
288 :
289 : /// Select the appropiate layer for the given range and update
290 : /// the collector.
291 858716 : fn add_range(&mut self, covered_range: Range<Key>) {
292 858716 : let selected = LayerMap::select_layer(
293 858716 : self.current_delta.clone(),
294 858716 : self.current_image.clone(),
295 858716 : self.end_lsn,
296 858716 : );
297 858716 :
298 858716 : match selected {
299 855878 : Some(search_result) => self
300 855878 : .result
301 855878 : .found
302 855878 : .entry(search_result)
303 855878 : .or_default()
304 855878 : .add_range(covered_range),
305 2838 : None => self.pad_range(covered_range),
306 : }
307 858716 : }
308 :
309 : /// Move to the next coverage change.
310 858716 : fn advance(&mut self, layer_type: &NextLayerType) {
311 858716 : match layer_type {
312 260748 : NextLayerType::Delta(_) => {
313 260748 : let (_, layer) = self.delta_coverage.next().unwrap();
314 260748 : self.current_delta = layer;
315 260748 : }
316 8430 : NextLayerType::Image(_) => {
317 8430 : let (_, layer) = self.image_coverage.next().unwrap();
318 8430 : self.current_image = layer;
319 8430 : }
320 589538 : NextLayerType::Both(_) => {
321 589538 : let (_, image_layer) = self.image_coverage.next().unwrap();
322 589538 : let (_, delta_layer) = self.delta_coverage.next().unwrap();
323 589538 :
324 589538 : self.current_image = image_layer;
325 589538 : self.current_delta = delta_layer;
326 589538 : }
327 : }
328 858716 : }
329 :
330 : /// Pick the next coverage change: the one at the lesser key or both if they're alligned.
331 1720800 : fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
332 1720800 : let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
333 1720800 : let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
334 1720800 :
335 1720800 : match (next_delta_at, next_image_at) {
336 862084 : (None, None) => None,
337 253182 : (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
338 7115 : (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
339 598419 : (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
340 1315 : Some(NextLayerType::Image(*next_image_at))
341 : }
342 597104 : (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
343 7566 : Some(NextLayerType::Delta(*next_delta_at))
344 : }
345 589538 : (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
346 : }
347 1720800 : }
348 : }
349 :
350 : impl LayerMap {
351 : ///
352 : /// Find the latest layer (by lsn.end) that covers the given
353 : /// 'key', with lsn.start < 'end_lsn'.
354 : ///
355 : /// The caller of this function is the page reconstruction
356 : /// algorithm looking for the next relevant delta layer, or
357 : /// the terminal image layer. The caller will pass the lsn_floor
358 : /// value as end_lsn in the next call to search.
359 : ///
360 : /// If there's an image layer exactly below the given end_lsn,
361 : /// search should return that layer regardless if there are
362 : /// overlapping deltas.
363 : ///
364 : /// If the latest layer is a delta and there is an overlapping
365 : /// image with it below, the lsn_floor returned should be right
366 : /// above that image so we don't skip it in the search. Otherwise
367 : /// the lsn_floor returned should be the bottom of the delta layer
368 : /// because we should make as much progress down the lsn axis
369 : /// as possible. It's fine if this way we skip some overlapping
370 : /// deltas, because the delta we returned would contain the same
371 : /// wal content.
372 : ///
373 : /// TODO: This API is convoluted and inefficient. If the caller
374 : /// makes N search calls, we'll end up finding the same latest
375 : /// image layer N times. We should either cache the latest image
376 : /// layer result, or simplify the api to `get_latest_image` and
377 : /// `get_latest_delta`, and only call `get_latest_image` once.
378 : ///
379 : /// NOTE: This only searches the 'historic' layers, *not* the
380 : /// 'open' and 'frozen' layers!
381 : ///
382 215946 : pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
383 215946 : let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
384 215946 : let latest_delta = version.delta_coverage.query(key.to_i128());
385 215946 : let latest_image = version.image_coverage.query(key.to_i128());
386 215946 :
387 215946 : Self::select_layer(latest_delta, latest_image, end_lsn)
388 215946 : }
389 :
390 1074662 : fn select_layer(
391 1074662 : delta_layer: Option<Arc<PersistentLayerDesc>>,
392 1074662 : image_layer: Option<Arc<PersistentLayerDesc>>,
393 1074662 : end_lsn: Lsn,
394 1074662 : ) -> Option<SearchResult> {
395 1074662 : assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
396 1074662 : assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
397 :
398 1074662 : match (delta_layer, image_layer) {
399 34818 : (None, None) => None,
400 108492 : (None, Some(image)) => {
401 108492 : let lsn_floor = image.get_lsn_range().start;
402 108492 : Some(SearchResult {
403 108492 : layer: image,
404 108492 : lsn_floor,
405 108492 : })
406 : }
407 273920 : (Some(delta), None) => {
408 273920 : let lsn_floor = delta.get_lsn_range().start;
409 273920 : Some(SearchResult {
410 273920 : layer: delta,
411 273920 : lsn_floor,
412 273920 : })
413 : }
414 657432 : (Some(delta), Some(image)) => {
415 657432 : let img_lsn = image.get_lsn_range().start;
416 657432 : let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
417 657432 : let image_exact_match = img_lsn + 1 == end_lsn;
418 657432 : if image_is_newer || image_exact_match {
419 116209 : Some(SearchResult {
420 116209 : layer: image,
421 116209 : lsn_floor: img_lsn,
422 116209 : })
423 : } else {
424 541223 : let lsn_floor =
425 541223 : std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
426 541223 : Some(SearchResult {
427 541223 : layer: delta,
428 541223 : lsn_floor,
429 541223 : })
430 : }
431 : }
432 : }
433 1074662 : }
434 :
435 1537129 : pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> RangeSearchResult {
436 1537129 : let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) {
437 862084 : Some(version) => version,
438 : None => {
439 675045 : let mut result = RangeSearchResult::new();
440 675045 : result.not_found.add_range(key_range);
441 675045 : return result;
442 : }
443 : };
444 :
445 862084 : let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
446 862084 : let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
447 862084 : let image_changes = version.image_coverage.range_overlaps(&raw_range);
448 862084 :
449 862084 : let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
450 862084 : collector.collect()
451 1537129 : }
452 :
453 : /// Start a batch of updates, applied on drop
454 5598 : pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
455 5598 : BatchedUpdates { layer_map: self }
456 5598 : }
457 :
458 : ///
459 : /// Insert an on-disk layer
460 : ///
461 : /// Helper function for BatchedUpdates::insert_historic
462 : ///
463 : /// TODO(chi): remove L generic so that we do not need to pass layer object.
464 14406 : pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
465 14406 : // TODO: See #3869, resulting #4088, attempted fix and repro #4094
466 14406 :
467 14406 : if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
468 2988 : self.l0_delta_layers.push(layer_desc.clone().into());
469 11418 : }
470 :
471 14406 : self.historic.insert(
472 14406 : historic_layer_coverage::LayerKey::from(&layer_desc),
473 14406 : layer_desc.into(),
474 14406 : );
475 14406 : }
476 :
477 : ///
478 : /// Remove an on-disk layer from the map.
479 : ///
480 : /// Helper function for BatchedUpdates::remove_historic
481 : ///
482 1398 : pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
483 1398 : self.historic
484 1398 : .remove(historic_layer_coverage::LayerKey::from(layer_desc));
485 1398 : let layer_key = layer_desc.key();
486 1398 : if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
487 1212 : let len_before = self.l0_delta_layers.len();
488 1212 : let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
489 16572 : l0_delta_layers.retain(|other| other.key() != layer_key);
490 1212 : self.l0_delta_layers = l0_delta_layers;
491 1212 : // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
492 1212 : // there's a chance that the comparison fails at runtime due to it comparing (pointer,
493 1212 : // vtable) pairs.
494 1212 : assert_eq!(
495 1212 : self.l0_delta_layers.len(),
496 1212 : len_before - 1,
497 0 : "failed to locate removed historic layer from l0_delta_layers"
498 : );
499 186 : }
500 1398 : }
501 :
502 : /// Helper function for BatchedUpdates::drop.
503 5604 : pub(self) fn flush_updates(&mut self) {
504 5604 : self.historic.rebuild();
505 5604 : }
506 :
507 : /// Is there a newer image layer for given key- and LSN-range? Or a set
508 : /// of image layers within the specified lsn range that cover the entire
509 : /// specified key range?
510 : ///
511 : /// This is used for garbage collection, to determine if an old layer can
512 : /// be deleted.
513 34950 : pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
514 34950 : if key.is_empty() {
515 : // Vacuously true. There's a newer image for all 0 of the kerys in the range.
516 0 : return true;
517 34950 : }
518 :
519 34950 : let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
520 34950 : Some(v) => v,
521 0 : None => return false,
522 : };
523 :
524 34950 : let start = key.start.to_i128();
525 34950 : let end = key.end.to_i128();
526 34950 :
527 35244 : let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
528 35154 : Some(layer) => layer.get_lsn_range().start >= lsn.start,
529 90 : None => false,
530 35244 : };
531 :
532 : // Check the start is covered
533 34950 : if !layer_covers(version.image_coverage.query(start)) {
534 34836 : return false;
535 114 : }
536 :
537 : // Check after all changes of coverage
538 294 : for (_, change_val) in version.image_coverage.range(start..end) {
539 294 : if !layer_covers(change_val) {
540 90 : return false;
541 204 : }
542 : }
543 :
544 24 : true
545 34950 : }
546 :
547 12495 : pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
548 12495 : self.historic.iter()
549 12495 : }
550 :
551 : /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
552 3130777 : pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
553 3130777 : where
554 3130777 : Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
555 3130777 : {
556 3130777 : if let Some(open) = &self.open_layer {
557 2723909 : if pred(open) {
558 1812701 : return Some(open.clone());
559 911208 : }
560 406868 : }
561 :
562 1318076 : self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
563 3130777 : }
564 :
565 : ///
566 : /// Divide the whole given range of keys into sub-ranges based on the latest
567 : /// image layer that covers each range at the specified lsn (inclusive).
568 : /// This is used when creating new image layers.
569 42 : pub fn image_coverage(
570 42 : &self,
571 42 : key_range: &Range<Key>,
572 42 : lsn: Lsn,
573 42 : ) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
574 42 : let version = match self.historic.get().unwrap().get_version(lsn.0) {
575 42 : Some(v) => v,
576 0 : None => return vec![],
577 : };
578 :
579 42 : let start = key_range.start.to_i128();
580 42 : let end = key_range.end.to_i128();
581 42 :
582 42 : // Initialize loop variables
583 42 : let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
584 42 : let mut current_key = start;
585 42 : let mut current_val = version.image_coverage.query(start);
586 :
587 : // Loop through the change events and push intervals
588 42 : for (change_key, change_val) in version.image_coverage.range(start..end) {
589 0 : let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
590 0 : coverage.push((kr, current_val.take()));
591 0 : current_key = change_key;
592 0 : current_val.clone_from(&change_val);
593 0 : }
594 :
595 : // Add the final interval
596 42 : let kr = Key::from_i128(current_key)..Key::from_i128(end);
597 42 : coverage.push((kr, current_val.take()));
598 42 :
599 42 : coverage
600 42 : }
601 :
602 : /// Check if the key range resembles that of an L0 layer.
603 16822 : pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
604 16822 : is_delta_layer && key_range == &(Key::MIN..Key::MAX)
605 16822 : }
606 :
607 : /// This function determines which layers are counted in `count_deltas`:
608 : /// layers that should count towards deciding whether or not to reimage
609 : /// a certain partition range.
610 : ///
611 : /// There are two kinds of layers we currently consider reimage-worthy:
612 : ///
613 : /// Case 1: Non-L0 layers are currently reimage-worthy by default.
614 : /// TODO Some of these layers are very sparse and cover the entire key
615 : /// range. Replacing 256MB of data (or less!) with terabytes of
616 : /// images doesn't seem wise. We need a better heuristic, possibly
617 : /// based on some of these factors:
618 : /// a) whether this layer has any wal in this partition range
619 : /// b) the size of the layer
620 : /// c) the number of images needed to cover it
621 : /// d) the estimated time until we'll have to reimage over it for GC
622 : ///
623 : /// Case 2: Since L0 layers by definition cover the entire key space, we consider
624 : /// them reimage-worthy only when the entire key space can be covered by very few
625 : /// images (currently 1).
626 : /// TODO The optimal number should probably be slightly higher than 1, but to
627 : /// implement that we need to plumb a lot more context into this function
628 : /// than just the current partition_range.
629 0 : pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
630 0 : // Case 1
631 0 : if !Self::is_l0(&layer.key_range, layer.is_delta) {
632 0 : return true;
633 0 : }
634 0 :
635 0 : // Case 2
636 0 : if partition_range == &(Key::MIN..Key::MAX) {
637 0 : return true;
638 0 : }
639 0 :
640 0 : false
641 0 : }
642 :
643 : /// Count the height of the tallest stack of reimage-worthy deltas
644 : /// in this 2d region.
645 : ///
646 : /// If `limit` is provided we don't try to count above that number.
647 : ///
648 : /// This number is used to compute the largest number of deltas that
649 : /// we'll need to visit for any page reconstruction in this region.
650 : /// We use this heuristic to decide whether to create an image layer.
651 42 : pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
652 42 : // We get the delta coverage of the region, and for each part of the coverage
653 42 : // we recurse right underneath the delta. The recursion depth is limited by
654 42 : // the largest result this function could return, which is in practice between
655 42 : // 3 and 10 (since we usually try to create an image when the number gets larger).
656 42 :
657 42 : if lsn.is_empty() || key.is_empty() || limit == Some(0) {
658 0 : return 0;
659 42 : }
660 :
661 42 : let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
662 42 : Some(v) => v,
663 0 : None => return 0,
664 : };
665 :
666 42 : let start = key.start.to_i128();
667 42 : let end = key.end.to_i128();
668 42 :
669 42 : // Initialize loop variables
670 42 : let mut max_stacked_deltas = 0;
671 42 : let mut current_key = start;
672 42 : let mut current_val = version.delta_coverage.query(start);
673 :
674 : // Loop through the delta coverage and recurse on each part
675 42 : for (change_key, change_val) in version.delta_coverage.range(start..end) {
676 : // If there's a relevant delta in this part, add 1 and recurse down
677 0 : if let Some(val) = ¤t_val {
678 0 : if val.get_lsn_range().end > lsn.start {
679 0 : let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
680 0 : let lr = lsn.start..val.get_lsn_range().start;
681 0 : if !kr.is_empty() {
682 0 : let base_count = Self::is_reimage_worthy(val, key) as usize;
683 0 : let new_limit = limit.map(|l| l - base_count);
684 0 : let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
685 0 : max_stacked_deltas = std::cmp::max(
686 0 : max_stacked_deltas,
687 0 : base_count + max_stacked_deltas_underneath,
688 0 : );
689 0 : }
690 0 : }
691 0 : }
692 :
693 0 : current_key = change_key;
694 0 : current_val.clone_from(&change_val);
695 : }
696 :
697 : // Consider the last part
698 42 : if let Some(val) = ¤t_val {
699 0 : if val.get_lsn_range().end > lsn.start {
700 0 : let kr = Key::from_i128(current_key)..Key::from_i128(end);
701 0 : let lr = lsn.start..val.get_lsn_range().start;
702 0 :
703 0 : if !kr.is_empty() {
704 0 : let base_count = Self::is_reimage_worthy(val, key) as usize;
705 0 : let new_limit = limit.map(|l| l - base_count);
706 0 : let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
707 0 : max_stacked_deltas = std::cmp::max(
708 0 : max_stacked_deltas,
709 0 : base_count + max_stacked_deltas_underneath,
710 0 : );
711 0 : }
712 0 : }
713 42 : }
714 :
715 42 : max_stacked_deltas
716 42 : }
717 :
718 : /// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
719 : ///
720 : /// The `partition_range` argument is used as context for the reimage-worthiness decision.
721 : ///
722 : /// Used as a helper for correctness checks only. Performance not critical.
723 0 : pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range<Key>) -> usize {
724 0 : match self.search(key, lsn) {
725 0 : Some(search_result) => {
726 0 : if search_result.layer.is_incremental() {
727 0 : (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize)
728 0 : + self.get_difficulty(search_result.lsn_floor, key, partition_range)
729 : } else {
730 0 : 0
731 : }
732 : }
733 0 : None => 0,
734 : }
735 0 : }
736 :
737 : /// Used for correctness checking. Results are expected to be identical to
738 : /// self.get_difficulty_map. Assumes self.search is correct.
739 0 : pub fn get_difficulty_map_bruteforce(
740 0 : &self,
741 0 : lsn: Lsn,
742 0 : partitioning: &KeyPartitioning,
743 0 : ) -> Vec<usize> {
744 0 : // Looking at the difficulty as a function of key, it could only increase
745 0 : // when a delta layer starts or an image layer ends. Therefore it's sufficient
746 0 : // to check the difficulties at:
747 0 : // - the key.start for each non-empty part range
748 0 : // - the key.start for each delta
749 0 : // - the key.end for each image
750 0 : let keys_iter: Box<dyn Iterator<Item = Key>> = {
751 0 : let mut keys: Vec<Key> = self
752 0 : .iter_historic_layers()
753 0 : .map(|layer| {
754 0 : if layer.is_incremental() {
755 0 : layer.get_key_range().start
756 : } else {
757 0 : layer.get_key_range().end
758 : }
759 0 : })
760 0 : .collect();
761 0 : keys.sort();
762 0 : Box::new(keys.into_iter())
763 0 : };
764 0 : let mut keys_iter = keys_iter.peekable();
765 0 :
766 0 : // Iter the partition and keys together and query all the necessary
767 0 : // keys, computing the max difficulty for each part.
768 0 : partitioning
769 0 : .parts
770 0 : .iter()
771 0 : .map(|part| {
772 0 : let mut difficulty = 0;
773 : // Partition ranges are assumed to be sorted and disjoint
774 : // TODO assert it
775 0 : for range in &part.ranges {
776 0 : if !range.is_empty() {
777 0 : difficulty =
778 0 : std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range));
779 0 : }
780 0 : while let Some(key) = keys_iter.peek() {
781 0 : if key >= &range.end {
782 0 : break;
783 0 : }
784 0 : let key = keys_iter.next().unwrap();
785 0 : if key < range.start {
786 0 : continue;
787 0 : }
788 0 : difficulty =
789 0 : std::cmp::max(difficulty, self.get_difficulty(lsn, key, range));
790 : }
791 : }
792 0 : difficulty
793 0 : })
794 0 : .collect()
795 0 : }
796 :
797 : /// For each part of a keyspace partitioning, return the maximum number of layers
798 : /// that would be needed for page reconstruction in that part at the given LSN.
799 : ///
800 : /// If `limit` is provided we don't try to count above that number.
801 : ///
802 : /// This method is used to decide where to create new image layers. Computing the
803 : /// result for the entire partitioning at once allows this function to be more
804 : /// efficient, and further optimization is possible by using iterators instead,
805 : /// to allow early return.
806 : ///
807 : /// TODO actually use this method instead of count_deltas. Currently we only use
808 : /// it for benchmarks.
809 0 : pub fn get_difficulty_map(
810 0 : &self,
811 0 : lsn: Lsn,
812 0 : partitioning: &KeyPartitioning,
813 0 : limit: Option<usize>,
814 0 : ) -> Vec<usize> {
815 0 : // TODO This is a naive implementation. Perf improvements to do:
816 0 : // 1. Instead of calling self.image_coverage and self.count_deltas,
817 0 : // iterate the image and delta coverage only once.
818 0 : partitioning
819 0 : .parts
820 0 : .iter()
821 0 : .map(|part| {
822 0 : let mut difficulty = 0;
823 0 : for range in &part.ranges {
824 0 : if limit == Some(difficulty) {
825 0 : break;
826 0 : }
827 0 : for (img_range, last_img) in self.image_coverage(range, lsn) {
828 0 : if limit == Some(difficulty) {
829 0 : break;
830 0 : }
831 0 : let img_lsn = if let Some(last_img) = last_img {
832 0 : last_img.get_lsn_range().end
833 : } else {
834 0 : Lsn(0)
835 : };
836 :
837 0 : if img_lsn < lsn {
838 0 : let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
839 0 : difficulty = std::cmp::max(difficulty, num_deltas);
840 0 : }
841 : }
842 : }
843 0 : difficulty
844 0 : })
845 0 : .collect()
846 0 : }
847 :
848 : /// Return all L0 delta layers
849 1110 : pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
850 1110 : &self.l0_delta_layers
851 1110 : }
852 :
853 : /// debugging function to print out the contents of the layer map
854 : #[allow(unused)]
855 6 : pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
856 6 : println!("Begin dump LayerMap");
857 6 :
858 6 : println!("open_layer:");
859 6 : if let Some(open_layer) = &self.open_layer {
860 0 : open_layer.dump(verbose, ctx).await?;
861 6 : }
862 :
863 6 : println!("frozen_layers:");
864 6 : for frozen_layer in self.frozen_layers.iter() {
865 0 : frozen_layer.dump(verbose, ctx).await?;
866 : }
867 :
868 6 : println!("historic_layers:");
869 36 : for desc in self.iter_historic_layers() {
870 36 : desc.dump();
871 36 : }
872 6 : println!("End dump LayerMap");
873 6 : Ok(())
874 6 : }
875 :
876 : /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
877 : /// where we expect to serve reads.
878 : ///
879 : /// This function is O(N) and should be called infrequently. The caller is responsible for
880 : /// looking up and updating the Layer objects for these layer descriptors.
881 606 : pub fn get_visibility(
882 606 : &self,
883 606 : mut read_points: Vec<Lsn>,
884 606 : ) -> (
885 606 : Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
886 606 : KeySpace,
887 606 : ) {
888 : // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
889 : // KeySpace is intended to be composed statically and iterated over.
890 : struct KeyShadow {
891 : // Map of range start to range end
892 : inner: RangeSetBlaze<i128>,
893 : }
894 :
895 : impl KeyShadow {
896 606 : fn new() -> Self {
897 606 : Self {
898 606 : inner: Default::default(),
899 606 : }
900 606 : }
901 :
902 4800 : fn contains(&self, range: Range<Key>) -> bool {
903 4800 : let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
904 4800 : self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
905 4800 : CheckSortedDisjoint::from([range_incl]),
906 4800 : ))
907 4800 : }
908 :
909 : /// Add the input range to the keys covered by self.
910 : ///
911 : /// Return true if inserting this range covered some keys that were previously not covered
912 5592 : fn cover(&mut self, insert: Range<Key>) -> bool {
913 5592 : let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
914 5592 : self.inner.ranges_insert(range_incl)
915 5592 : }
916 :
917 618 : fn reset(&mut self) {
918 618 : self.inner = Default::default();
919 618 : }
920 :
921 606 : fn to_keyspace(&self) -> KeySpace {
922 606 : let mut accum = KeySpaceAccum::new();
923 612 : for range_incl in self.inner.ranges() {
924 612 : let range = Range {
925 612 : start: Key::from_i128(*range_incl.start()),
926 612 : end: Key::from_i128(range_incl.end() + 1),
927 612 : };
928 612 : accum.add_range(range)
929 : }
930 :
931 606 : accum.to_keyspace()
932 606 : }
933 : }
934 :
935 : // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
936 : // and a ReadPoint
937 606 : read_points.sort_by_key(|rp| rp.0);
938 606 : let mut shadow = KeyShadow::new();
939 :
940 : // We will interleave all our read points and layers into a sorted collection
941 : enum Item {
942 : ReadPoint { lsn: Lsn },
943 : Layer(Arc<PersistentLayerDesc>),
944 : }
945 :
946 606 : let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
947 606 : items.extend(self.iter_historic_layers().map(Item::Layer));
948 606 : items.extend(
949 606 : read_points
950 606 : .into_iter()
951 618 : .map(|rp| Item::ReadPoint { lsn: rp }),
952 606 : );
953 606 :
954 606 : // Ordering: we want to iterate like this:
955 606 : // 1. Highest LSNs first
956 606 : // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
957 606 : // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
958 190080 : items.sort_by_key(|item| {
959 190080 : std::cmp::Reverse(match item {
960 188880 : Item::Layer(layer) => {
961 188880 : if layer.is_delta() {
962 88800 : (Lsn(layer.get_lsn_range().end.0 - 1), 0)
963 : } else {
964 100080 : (layer.image_layer_lsn(), 1)
965 : }
966 : }
967 1200 : Item::ReadPoint { lsn } => (*lsn, 2),
968 : })
969 190080 : });
970 606 :
971 606 : let mut results = Vec::with_capacity(self.historic.len());
972 606 :
973 606 : let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
974 :
975 11616 : for item in items {
976 11010 : let (reached_lsn, is_readpoint) = match &item {
977 618 : Item::ReadPoint { lsn } => (lsn, true),
978 10392 : Item::Layer(layer) => (&layer.lsn_range.start, false),
979 : };
980 11010 : maybe_covered_deltas.retain(|d| {
981 300 : if *reached_lsn >= d.lsn_range.start && is_readpoint {
982 : // We encountered a readpoint within the delta layer: it is visible
983 :
984 6 : results.push((d.clone(), LayerVisibilityHint::Visible));
985 6 : false
986 294 : } else if *reached_lsn < d.lsn_range.start {
987 : // We passed the layer's range without encountering a read point: it is not visible
988 90 : results.push((d.clone(), LayerVisibilityHint::Covered));
989 90 : false
990 : } else {
991 : // We're still in the delta layer: continue iterating
992 204 : true
993 : }
994 11010 : });
995 11010 :
996 11010 : match item {
997 618 : Item::ReadPoint { lsn: _lsn } => {
998 618 : // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
999 618 : // to assume that the whole key range is visible at the branch point.
1000 618 : shadow.reset();
1001 618 : }
1002 10392 : Item::Layer(layer) => {
1003 10392 : let visibility = if layer.is_delta() {
1004 4800 : if shadow.contains(layer.get_key_range()) {
1005 : // If a layer isn't visible based on current state, we must defer deciding whether
1006 : // it is truly not visible until we have advanced past the delta's range: we might
1007 : // encounter another branch point within this delta layer's LSN range.
1008 114 : maybe_covered_deltas.push(layer);
1009 114 : continue;
1010 : } else {
1011 4686 : LayerVisibilityHint::Visible
1012 : }
1013 : } else {
1014 5592 : let modified = shadow.cover(layer.get_key_range());
1015 5592 : if modified {
1016 : // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
1017 5508 : LayerVisibilityHint::Visible
1018 : } else {
1019 : // An image layer in a region that was already covered
1020 84 : LayerVisibilityHint::Covered
1021 : }
1022 : };
1023 :
1024 10278 : results.push((layer, visibility));
1025 : }
1026 : }
1027 : }
1028 :
1029 : // Drain any remaining maybe_covered deltas
1030 606 : results.extend(
1031 606 : maybe_covered_deltas
1032 606 : .into_iter()
1033 606 : .map(|d| (d, LayerVisibilityHint::Covered)),
1034 606 : );
1035 606 :
1036 606 : (results, shadow.to_keyspace())
1037 606 : }
1038 : }
1039 :
1040 : #[cfg(test)]
1041 : mod tests {
1042 : use crate::tenant::{storage_layer::LayerName, IndexPart};
1043 : use pageserver_api::{
1044 : key::DBDIR_KEY,
1045 : keyspace::{KeySpace, KeySpaceRandomAccum},
1046 : };
1047 : use std::{collections::HashMap, path::PathBuf};
1048 : use utils::{
1049 : id::{TenantId, TimelineId},
1050 : shard::TenantShardId,
1051 : };
1052 :
1053 : use super::*;
1054 :
1055 : #[derive(Clone)]
1056 : struct LayerDesc {
1057 : key_range: Range<Key>,
1058 : lsn_range: Range<Lsn>,
1059 : is_delta: bool,
1060 : }
1061 :
1062 6 : fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
1063 6 : let mut layer_map = LayerMap::default();
1064 :
1065 36 : for layer in layers {
1066 30 : layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
1067 30 : layer.key_range,
1068 30 : layer.lsn_range,
1069 30 : layer.is_delta,
1070 30 : ));
1071 30 : }
1072 :
1073 6 : layer_map.flush_updates();
1074 6 : layer_map
1075 6 : }
1076 :
1077 10620 : fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
1078 10620 : assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
1079 10620 : let lhs: HashMap<SearchResult, KeySpace> = lhs
1080 10620 : .found
1081 10620 : .into_iter()
1082 24690 : .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
1083 10620 : .collect();
1084 10620 : let rhs: HashMap<SearchResult, KeySpace> = rhs
1085 10620 : .found
1086 10620 : .into_iter()
1087 24690 : .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
1088 10620 : .collect();
1089 10620 :
1090 10620 : assert_eq!(lhs, rhs);
1091 10620 : }
1092 :
1093 : #[cfg(test)]
1094 10620 : fn brute_force_range_search(
1095 10620 : layer_map: &LayerMap,
1096 10620 : key_range: Range<Key>,
1097 10620 : end_lsn: Lsn,
1098 10620 : ) -> RangeSearchResult {
1099 10620 : let mut range_search_result = RangeSearchResult::new();
1100 10620 :
1101 10620 : let mut key = key_range.start;
1102 226560 : while key != key_range.end {
1103 215940 : let res = layer_map.search(key, end_lsn);
1104 215940 : match res {
1105 183960 : Some(res) => {
1106 183960 : range_search_result
1107 183960 : .found
1108 183960 : .entry(res)
1109 183960 : .or_default()
1110 183960 : .add_key(key);
1111 183960 : }
1112 31980 : None => {
1113 31980 : range_search_result.not_found.add_key(key);
1114 31980 : }
1115 : }
1116 :
1117 215940 : key = key.next();
1118 : }
1119 :
1120 10620 : range_search_result
1121 10620 : }
1122 :
1123 : #[test]
1124 6 : fn ranged_search_on_empty_layer_map() {
1125 6 : let layer_map = LayerMap::default();
1126 6 : let range = Key::from_i128(100)..Key::from_i128(200);
1127 6 :
1128 6 : let res = layer_map.range_search(range.clone(), Lsn(100));
1129 6 : assert_eq!(
1130 6 : res.not_found.to_keyspace(),
1131 6 : KeySpace {
1132 6 : ranges: vec![range]
1133 6 : }
1134 6 : );
1135 6 : }
1136 :
1137 : #[test]
1138 6 : fn ranged_search() {
1139 6 : let layers = vec![
1140 6 : LayerDesc {
1141 6 : key_range: Key::from_i128(15)..Key::from_i128(50),
1142 6 : lsn_range: Lsn(0)..Lsn(5),
1143 6 : is_delta: false,
1144 6 : },
1145 6 : LayerDesc {
1146 6 : key_range: Key::from_i128(10)..Key::from_i128(20),
1147 6 : lsn_range: Lsn(5)..Lsn(20),
1148 6 : is_delta: true,
1149 6 : },
1150 6 : LayerDesc {
1151 6 : key_range: Key::from_i128(15)..Key::from_i128(25),
1152 6 : lsn_range: Lsn(20)..Lsn(30),
1153 6 : is_delta: true,
1154 6 : },
1155 6 : LayerDesc {
1156 6 : key_range: Key::from_i128(35)..Key::from_i128(40),
1157 6 : lsn_range: Lsn(25)..Lsn(35),
1158 6 : is_delta: true,
1159 6 : },
1160 6 : LayerDesc {
1161 6 : key_range: Key::from_i128(35)..Key::from_i128(40),
1162 6 : lsn_range: Lsn(35)..Lsn(40),
1163 6 : is_delta: false,
1164 6 : },
1165 6 : ];
1166 6 :
1167 6 : let layer_map = create_layer_map(layers.clone());
1168 366 : for start in 0..60 {
1169 10620 : for end in (start + 1)..60 {
1170 10620 : let range = Key::from_i128(start)..Key::from_i128(end);
1171 10620 : let result = layer_map.range_search(range.clone(), Lsn(100));
1172 10620 : let expected = brute_force_range_search(&layer_map, range, Lsn(100));
1173 10620 :
1174 10620 : assert_range_search_result_eq(result, expected);
1175 10620 : }
1176 : }
1177 6 : }
1178 :
1179 : #[test]
1180 6 : fn layer_visibility_basic() {
1181 6 : // A simple synthetic input, as a smoke test.
1182 6 : let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
1183 6 : let timeline_id = TimelineId::generate();
1184 6 : let mut layer_map = LayerMap::default();
1185 6 : let mut updates = layer_map.batch_update();
1186 :
1187 : const FAKE_LAYER_SIZE: u64 = 1024;
1188 :
1189 6 : let inject_delta = |updates: &mut BatchedUpdates,
1190 : key_start: i128,
1191 : key_end: i128,
1192 : lsn_start: u64,
1193 42 : lsn_end: u64| {
1194 42 : let desc = PersistentLayerDesc::new_delta(
1195 42 : tenant_shard_id,
1196 42 : timeline_id,
1197 42 : Range {
1198 42 : start: Key::from_i128(key_start),
1199 42 : end: Key::from_i128(key_end),
1200 42 : },
1201 42 : Range {
1202 42 : start: Lsn(lsn_start),
1203 42 : end: Lsn(lsn_end),
1204 42 : },
1205 42 : 1024,
1206 42 : );
1207 42 : updates.insert_historic(desc.clone());
1208 42 : desc
1209 42 : };
1210 :
1211 6 : let inject_image =
1212 36 : |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
1213 36 : let desc = PersistentLayerDesc::new_img(
1214 36 : tenant_shard_id,
1215 36 : timeline_id,
1216 36 : Range {
1217 36 : start: Key::from_i128(key_start),
1218 36 : end: Key::from_i128(key_end),
1219 36 : },
1220 36 : Lsn(lsn),
1221 36 : FAKE_LAYER_SIZE,
1222 36 : );
1223 36 : updates.insert_historic(desc.clone());
1224 36 : desc
1225 36 : };
1226 :
1227 : //
1228 : // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
1229 : // we expect to handle. You can follow these examples through in the same order as they would be processed
1230 : // by the function under test.
1231 : //
1232 :
1233 6 : let mut read_points = vec![Lsn(1000)];
1234 6 :
1235 6 : // A delta ahead of any image layer
1236 6 : let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
1237 6 :
1238 6 : // An image layer is visible and covers some layers beneath itself
1239 6 : let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
1240 6 :
1241 6 : // A delta layer covered by the image layer: should be covered
1242 6 : let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
1243 6 :
1244 6 : // A delta layer partially covered by an image layer: should be visible
1245 6 : let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
1246 6 :
1247 6 : // A delta layer not covered by an image layer: should be visible
1248 6 : let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
1249 6 :
1250 6 : // An image layer covered by the image layer above: should be covered
1251 6 : let covered_image = inject_image(&mut updates, 10, 20, 89);
1252 6 :
1253 6 : // An image layer partially covered by an image layer: should be visible
1254 6 : let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
1255 6 :
1256 6 : // An image layer not covered by an image layer: should be visible
1257 6 : let not_covered_image = inject_image(&mut updates, 1, 4, 89);
1258 6 :
1259 6 : // A read point: this will make subsequent layers below here visible, even if there are
1260 6 : // more recent layers covering them.
1261 6 : read_points.push(Lsn(80));
1262 6 :
1263 6 : // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
1264 6 : let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
1265 6 :
1266 6 : // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
1267 6 : // the read point should make it visible, even though its end LSN is covered
1268 6 : let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
1269 6 : let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
1270 6 : read_points.push(Lsn(65));
1271 6 : let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
1272 6 :
1273 6 : let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
1274 6 :
1275 6 : updates.flush();
1276 6 :
1277 6 : let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
1278 6 : let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
1279 6 :
1280 6 : assert_eq!(
1281 6 : layer_visibilities.get(&ahead_layer),
1282 6 : Some(&LayerVisibilityHint::Visible)
1283 6 : );
1284 6 : assert_eq!(
1285 6 : layer_visibilities.get(&visible_covering_img),
1286 6 : Some(&LayerVisibilityHint::Visible)
1287 6 : );
1288 6 : assert_eq!(
1289 6 : layer_visibilities.get(&covered_delta),
1290 6 : Some(&LayerVisibilityHint::Covered)
1291 6 : );
1292 6 : assert_eq!(
1293 6 : layer_visibilities.get(&partially_covered_delta),
1294 6 : Some(&LayerVisibilityHint::Visible)
1295 6 : );
1296 6 : assert_eq!(
1297 6 : layer_visibilities.get(¬_covered_delta),
1298 6 : Some(&LayerVisibilityHint::Visible)
1299 6 : );
1300 6 : assert_eq!(
1301 6 : layer_visibilities.get(&covered_image),
1302 6 : Some(&LayerVisibilityHint::Covered)
1303 6 : );
1304 6 : assert_eq!(
1305 6 : layer_visibilities.get(&partially_covered_image),
1306 6 : Some(&LayerVisibilityHint::Visible)
1307 6 : );
1308 6 : assert_eq!(
1309 6 : layer_visibilities.get(¬_covered_image),
1310 6 : Some(&LayerVisibilityHint::Visible)
1311 6 : );
1312 6 : assert_eq!(
1313 6 : layer_visibilities.get(&covered_delta_below_read_point),
1314 6 : Some(&LayerVisibilityHint::Visible)
1315 6 : );
1316 6 : assert_eq!(
1317 6 : layer_visibilities.get(&covering_img_between_read_points),
1318 6 : Some(&LayerVisibilityHint::Visible)
1319 6 : );
1320 6 : assert_eq!(
1321 6 : layer_visibilities.get(&covered_delta_between_read_points),
1322 6 : Some(&LayerVisibilityHint::Covered)
1323 6 : );
1324 6 : assert_eq!(
1325 6 : layer_visibilities.get(&covered_delta_intersects_read_point),
1326 6 : Some(&LayerVisibilityHint::Visible)
1327 6 : );
1328 6 : assert_eq!(
1329 6 : layer_visibilities.get(&visible_img_after_last_read_point),
1330 6 : Some(&LayerVisibilityHint::Visible)
1331 6 : );
1332 :
1333 : // Shadow should include all the images below the last read point
1334 6 : let expected_shadow = KeySpace {
1335 6 : ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
1336 6 : };
1337 6 : assert_eq!(shadow, expected_shadow);
1338 6 : }
1339 :
1340 6 : fn fixture_path(relative: &str) -> PathBuf {
1341 6 : PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
1342 6 : }
1343 :
1344 : #[test]
1345 6 : fn layer_visibility_realistic() {
1346 6 : // Load a large example layermap
1347 6 : let index_raw = std::fs::read_to_string(fixture_path(
1348 6 : "test_data/indices/mixed_workload/index_part.json",
1349 6 : ))
1350 6 : .unwrap();
1351 6 : let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
1352 6 :
1353 6 : let tenant_id = TenantId::generate();
1354 6 : let tenant_shard_id = TenantShardId::unsharded(tenant_id);
1355 6 : let timeline_id = TimelineId::generate();
1356 6 :
1357 6 : let mut layer_map = LayerMap::default();
1358 6 : let mut updates = layer_map.batch_update();
1359 9390 : for (layer_name, layer_metadata) in index.layer_metadata {
1360 9384 : let layer_desc = match layer_name {
1361 4812 : LayerName::Image(layer_name) => PersistentLayerDesc {
1362 4812 : key_range: layer_name.key_range.clone(),
1363 4812 : lsn_range: layer_name.lsn_as_range(),
1364 4812 : tenant_shard_id,
1365 4812 : timeline_id,
1366 4812 : is_delta: false,
1367 4812 : file_size: layer_metadata.file_size,
1368 4812 : },
1369 4572 : LayerName::Delta(layer_name) => PersistentLayerDesc {
1370 4572 : key_range: layer_name.key_range,
1371 4572 : lsn_range: layer_name.lsn_range,
1372 4572 : tenant_shard_id,
1373 4572 : timeline_id,
1374 4572 : is_delta: true,
1375 4572 : file_size: layer_metadata.file_size,
1376 4572 : },
1377 : };
1378 9384 : updates.insert_historic(layer_desc);
1379 : }
1380 6 : updates.flush();
1381 6 :
1382 6 : let read_points = vec![index.metadata.disk_consistent_lsn()];
1383 6 : let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
1384 9390 : for (layer_desc, visibility) in &layer_visibilities {
1385 9384 : tracing::info!("{layer_desc:?}: {visibility:?}");
1386 9384 : eprintln!("{layer_desc:?}: {visibility:?}");
1387 : }
1388 :
1389 : // The shadow should be non-empty, since there were some image layers
1390 6 : assert!(!shadow.ranges.is_empty());
1391 :
1392 : // At least some layers should be marked covered
1393 6 : assert!(layer_visibilities
1394 6 : .iter()
1395 114 : .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
1396 :
1397 6 : let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
1398 :
1399 : // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
1400 9390 : for (layer_desc, visible) in &layer_visibilities {
1401 9384 : let mut coverage = KeySpaceRandomAccum::new();
1402 9384 : let mut covered_by = Vec::new();
1403 :
1404 14676576 : for other_layer in layer_map.iter_historic_layers() {
1405 14676576 : if &other_layer == layer_desc {
1406 9384 : continue;
1407 14667192 : }
1408 14667192 : if !other_layer.is_delta()
1409 7521156 : && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
1410 3790536 : && other_layer.key_range.start <= layer_desc.key_range.end
1411 1395912 : && layer_desc.key_range.start <= other_layer.key_range.end
1412 256938 : {
1413 256938 : coverage.add_range(other_layer.get_key_range());
1414 256938 : covered_by.push((*other_layer).clone());
1415 14410254 : }
1416 : }
1417 9384 : let coverage = coverage.to_keyspace();
1418 :
1419 9384 : let expect_visible = if coverage.ranges.len() == 1
1420 2268 : && coverage.contains(&layer_desc.key_range.start)
1421 102 : && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
1422 : {
1423 60 : LayerVisibilityHint::Covered
1424 : } else {
1425 9324 : LayerVisibilityHint::Visible
1426 : };
1427 :
1428 9384 : if expect_visible != *visible {
1429 0 : eprintln!(
1430 0 : "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
1431 0 : layer_desc.key_range.start,
1432 0 : layer_desc.key_range.end,
1433 0 : layer_desc.lsn_range.start,
1434 0 : layer_desc.lsn_range.end,
1435 0 : layer_desc.is_delta()
1436 0 : );
1437 0 : if expect_visible == LayerVisibilityHint::Covered {
1438 0 : eprintln!("Covered by:");
1439 0 : for other in covered_by {
1440 0 : eprintln!(
1441 0 : " {}..{} @ {}",
1442 0 : other.get_key_range().start,
1443 0 : other.get_key_range().end,
1444 0 : other.image_layer_lsn()
1445 0 : );
1446 0 : }
1447 0 : if let Some(range) = coverage.ranges.first() {
1448 0 : eprintln!(
1449 0 : "Total coverage from contributing layers: {}..{}",
1450 0 : range.start, range.end
1451 0 : );
1452 0 : } else {
1453 0 : eprintln!(
1454 0 : "Total coverage from contributing layers: {:?}",
1455 0 : coverage.ranges
1456 0 : );
1457 0 : }
1458 0 : }
1459 9384 : }
1460 9384 : assert_eq!(expect_visible, *visible);
1461 : }
1462 :
1463 : // Sanity: the layer that holds latest data for the DBDIR key should always be visible
1464 : // (just using this key as a key that will always exist for any layermap fixture)
1465 6 : let dbdir_layer = layer_map
1466 6 : .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
1467 6 : .unwrap();
1468 6 : assert!(matches!(
1469 6 : layer_visibilities.get(&dbdir_layer.layer).unwrap(),
1470 : LayerVisibilityHint::Visible
1471 : ));
1472 6 : }
1473 : }
|