Line data Source code
1 : //!
2 : //! The layer map tracks what layers exist in a timeline.
3 : //!
4 : //! When the timeline is first accessed, the server lists of all layer files
5 : //! in the timelines/<timeline_id> directory, and populates this map with
6 : //! ImageLayer and DeltaLayer structs corresponding to each file. When the first
7 : //! new WAL record is received, we create an InMemoryLayer to hold the incoming
8 : //! records. Now and then, in the checkpoint() function, the in-memory layer is
9 : //! are frozen, and it is split up into new image and delta layers and the
10 : //! corresponding files are written to disk.
11 : //!
12 : //! Design overview:
13 : //!
14 : //! The `search` method of the layer map is on the read critical path, so we've
15 : //! built an efficient data structure for fast reads, stored in `LayerMap::historic`.
16 : //! Other read methods are less critical but still impact performance of background tasks.
17 : //!
18 : //! This data structure relies on a persistent/immutable binary search tree. See the
19 : //! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
20 : //! Summary: A persistent/immutable BST (and persistent data structures in general) allows
21 : //! you to modify the tree in such a way that each modification creates a new "version"
22 : //! of the tree. When you modify it, you get a new version, but all previous versions are
23 : //! still accessible too. So if someone is still holding a reference to an older version,
24 : //! they continue to see the tree as it was then. The persistent BST stores all the
25 : //! different versions in an efficient way.
26 : //!
27 : //! Our persistent BST maintains a map of which layer file "covers" each key. It has only
28 : //! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property
29 : //! to handle the LSN dimension.
30 : //!
31 : //! To build the layer map, we insert each layer to the persistent BST in LSN.start order,
32 : //! starting from the oldest one. After each insertion, we grab a reference to that "version"
33 : //! of the tree, and store it in another tree, a BtreeMap keyed by the LSN. See
34 : //! `historic_layer_coverage.rs`.
35 : //!
36 : //! To search for a particular key-LSN pair, you first look up the right "version" in the
37 : //! BTreeMap. Then you search that version of the BST with the key.
38 : //!
39 : //! The persistent BST keeps all the versions, but there is no way to change the old versions
40 : //! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
41 : //! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
42 : //! to throw away most of the persistent BST and build a new one, starting from the oldest
43 : //! LSN. See [`LayerMap::flush_updates()`].
44 : //!
45 :
46 : mod historic_layer_coverage;
47 : mod layer_coverage;
48 :
49 : use crate::context::RequestContext;
50 : use crate::keyspace::KeyPartitioning;
51 : use crate::tenant::storage_layer::InMemoryLayer;
52 : use anyhow::Result;
53 : use pageserver_api::key::Key;
54 : use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
55 : use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
56 : use std::collections::{HashMap, VecDeque};
57 : use std::iter::Peekable;
58 : use std::ops::Range;
59 : use std::sync::Arc;
60 : use utils::lsn::Lsn;
61 :
62 : use historic_layer_coverage::BufferedHistoricLayerCoverage;
63 : pub use historic_layer_coverage::LayerKey;
64 :
65 : use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
66 :
67 : ///
68 : /// LayerMap tracks what layers exist on a timeline.
69 : ///
70 : #[derive(Default)]
71 : pub struct LayerMap {
72 : //
73 : // 'open_layer' holds the current InMemoryLayer that is accepting new
74 : // records. If it is None, 'next_open_layer_at' will be set instead, indicating
75 : // where the start LSN of the next InMemoryLayer that is to be created.
76 : //
77 : pub open_layer: Option<Arc<InMemoryLayer>>,
78 : pub next_open_layer_at: Option<Lsn>,
79 :
80 : ///
81 : /// Frozen layers, if any. Frozen layers are in-memory layers that
82 : /// are no longer added to, but haven't been written out to disk
83 : /// yet. They contain WAL older than the current 'open_layer' or
84 : /// 'next_open_layer_at', but newer than any historic layer.
85 : /// The frozen layers are in order from oldest to newest, so that
86 : /// the newest one is in the 'back' of the VecDeque, and the oldest
87 : /// in the 'front'.
88 : ///
89 : pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
90 :
91 : /// Index of the historic layers optimized for search
92 : historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,
93 :
94 : /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
95 : /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
96 : l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
97 : }
98 :
99 : /// The primary update API for the layer map.
100 : ///
101 : /// Batching historic layer insertions and removals is good for
102 : /// performance and this struct helps us do that correctly.
103 : #[must_use]
104 : pub struct BatchedUpdates<'a> {
105 : // While we hold this exclusive reference to the layer map the type checker
106 : // will prevent us from accidentally reading any unflushed updates.
107 : layer_map: &'a mut LayerMap,
108 : }
109 :
110 : /// Provide ability to batch more updates while hiding the read
111 : /// API so we don't accidentally read without flushing.
112 : impl BatchedUpdates<'_> {
113 : ///
114 : /// Insert an on-disk layer.
115 : ///
116 : // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
117 4824 : pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
118 4824 : self.layer_map.insert_historic_noflush(layer_desc)
119 4824 : }
120 :
121 : ///
122 : /// Remove an on-disk layer from the map.
123 : ///
124 : /// This should be called when the corresponding file on disk has been deleted.
125 : ///
126 486 : pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
127 486 : self.layer_map.remove_historic_noflush(layer_desc)
128 486 : }
129 :
130 : // We will flush on drop anyway, but this method makes it
131 : // more explicit that there is some work being done.
132 : /// Apply all updates
133 1888 : pub fn flush(self) {
134 1888 : // Flush happens on drop
135 1888 : }
136 : }
137 :
138 : // Ideally the flush() method should be called explicitly for more
139 : // controlled execution. But if we forget we'd rather flush on drop
140 : // than panic later or read without flushing.
141 : //
142 : // TODO maybe warn if flush hasn't explicitly been called
143 : impl Drop for BatchedUpdates<'_> {
144 1888 : fn drop(&mut self) {
145 1888 : self.layer_map.flush_updates();
146 1888 : }
147 : }
148 :
149 : /// Return value of LayerMap::search
150 : #[derive(Eq, PartialEq, Debug, Hash)]
151 : pub struct SearchResult {
152 : pub layer: Arc<PersistentLayerDesc>,
153 : pub lsn_floor: Lsn,
154 : }
155 :
156 : /// Return value of [`LayerMap::range_search`]
157 : ///
158 : /// Contains a mapping from a layer description to a keyspace
159 : /// accumulator that contains all the keys which intersect the layer
160 : /// from the original search space. Keys that were not found are accumulated
161 : /// in a separate key space accumulator.
162 : #[derive(Debug)]
163 : pub struct RangeSearchResult {
164 : pub found: HashMap<SearchResult, KeySpaceAccum>,
165 : pub not_found: KeySpaceAccum,
166 : }
167 :
168 : impl RangeSearchResult {
169 473906 : fn new() -> Self {
170 473906 : Self {
171 473906 : found: HashMap::new(),
172 473906 : not_found: KeySpaceAccum::new(),
173 473906 : }
174 473906 : }
175 : }
176 :
177 : /// Collector for results of range search queries on the LayerMap.
178 : /// It should be provided with two iterators for the delta and image coverage
179 : /// that contain all the changes for layers which intersect the range.
180 : struct RangeSearchCollector<Iter>
181 : where
182 : Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
183 : {
184 : delta_coverage: Peekable<Iter>,
185 : image_coverage: Peekable<Iter>,
186 : key_range: Range<Key>,
187 : end_lsn: Lsn,
188 :
189 : current_delta: Option<Arc<PersistentLayerDesc>>,
190 : current_image: Option<Arc<PersistentLayerDesc>>,
191 :
192 : result: RangeSearchResult,
193 : }
194 :
195 : #[derive(Debug)]
196 : enum NextLayerType {
197 : Delta(i128),
198 : Image(i128),
199 : Both(i128),
200 : }
201 :
202 : impl NextLayerType {
203 492532 : fn next_change_at_key(&self) -> Key {
204 492532 : match self {
205 144604 : NextLayerType::Delta(at) => Key::from_i128(*at),
206 4782 : NextLayerType::Image(at) => Key::from_i128(*at),
207 343146 : NextLayerType::Both(at) => Key::from_i128(*at),
208 : }
209 492532 : }
210 : }
211 :
212 : impl<Iter> RangeSearchCollector<Iter>
213 : where
214 : Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
215 : {
216 244921 : fn new(
217 244921 : key_range: Range<Key>,
218 244921 : end_lsn: Lsn,
219 244921 : delta_coverage: Iter,
220 244921 : image_coverage: Iter,
221 244921 : ) -> Self {
222 244921 : Self {
223 244921 : delta_coverage: delta_coverage.peekable(),
224 244921 : image_coverage: image_coverage.peekable(),
225 244921 : key_range,
226 244921 : end_lsn,
227 244921 : current_delta: None,
228 244921 : current_image: None,
229 244921 : result: RangeSearchResult::new(),
230 244921 : }
231 244921 : }
232 :
233 : /// Run the collector. Collection is implemented via a two pointer algorithm.
234 : /// One pointer tracks the start of the current range and the other tracks
235 : /// the beginning of the next range which will overlap with the next change
236 : /// in coverage across both image and delta.
237 244921 : fn collect(mut self) -> RangeSearchResult {
238 244921 : let next_layer_type = self.choose_next_layer_type();
239 242691 : let mut current_range_start = match next_layer_type {
240 : None => {
241 : // No changes for the range
242 2230 : self.pad_range(self.key_range.clone());
243 2230 : return self.result;
244 : }
245 242691 : Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
246 0 : // Changes only after the end of the range
247 0 : self.pad_range(self.key_range.clone());
248 0 : return self.result;
249 : }
250 242691 : Some(layer_type) => {
251 242691 : // Changes for the range exist. Record anything before the first
252 242691 : // coverage change as not found.
253 242691 : let coverage_start = layer_type.next_change_at_key();
254 242691 : let range_before = self.key_range.start..coverage_start;
255 242691 : self.pad_range(range_before);
256 242691 :
257 242691 : self.advance(&layer_type);
258 242691 : coverage_start
259 : }
260 : };
261 :
262 492532 : while current_range_start < self.key_range.end {
263 249841 : let next_layer_type = self.choose_next_layer_type();
264 249841 : match next_layer_type {
265 7150 : Some(t) => {
266 7150 : let current_range_end = t.next_change_at_key();
267 7150 : self.add_range(current_range_start..current_range_end);
268 7150 : current_range_start = current_range_end;
269 7150 :
270 7150 : self.advance(&t);
271 7150 : }
272 242691 : None => {
273 242691 : self.add_range(current_range_start..self.key_range.end);
274 242691 : current_range_start = self.key_range.end;
275 242691 : }
276 : }
277 : }
278 :
279 242691 : self.result
280 244921 : }
281 :
282 : /// Mark a range as not found (i.e. no layers intersect it)
283 245859 : fn pad_range(&mut self, key_range: Range<Key>) {
284 245859 : if !key_range.is_empty() {
285 4154 : self.result.not_found.add_range(key_range);
286 241705 : }
287 245859 : }
288 :
289 : /// Select the appropiate layer for the given range and update
290 : /// the collector.
291 249841 : fn add_range(&mut self, covered_range: Range<Key>) {
292 249841 : let selected = LayerMap::select_layer(
293 249841 : self.current_delta.clone(),
294 249841 : self.current_image.clone(),
295 249841 : self.end_lsn,
296 249841 : );
297 249841 :
298 249841 : match selected {
299 248903 : Some(search_result) => self
300 248903 : .result
301 248903 : .found
302 248903 : .entry(search_result)
303 248903 : .or_default()
304 248903 : .add_range(covered_range),
305 938 : None => self.pad_range(covered_range),
306 : }
307 249841 : }
308 :
309 : /// Move to the next coverage change.
310 249841 : fn advance(&mut self, layer_type: &NextLayerType) {
311 249841 : match layer_type {
312 73162 : NextLayerType::Delta(_) => {
313 73162 : let (_, layer) = self.delta_coverage.next().unwrap();
314 73162 : self.current_delta = layer;
315 73162 : }
316 2844 : NextLayerType::Image(_) => {
317 2844 : let (_, layer) = self.image_coverage.next().unwrap();
318 2844 : self.current_image = layer;
319 2844 : }
320 173835 : NextLayerType::Both(_) => {
321 173835 : let (_, image_layer) = self.image_coverage.next().unwrap();
322 173835 : let (_, delta_layer) = self.delta_coverage.next().unwrap();
323 173835 :
324 173835 : self.current_image = image_layer;
325 173835 : self.current_delta = delta_layer;
326 173835 : }
327 : }
328 249841 : }
329 :
330 : /// Pick the next coverage change: the one at the lesser key or both if they're alligned.
331 494762 : fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
332 494762 : let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
333 494762 : let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
334 494762 :
335 494762 : match (next_delta_at, next_image_at) {
336 244921 : (None, None) => None,
337 70640 : (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
338 2404 : (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
339 176797 : (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
340 440 : Some(NextLayerType::Image(*next_image_at))
341 : }
342 176357 : (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
343 2522 : Some(NextLayerType::Delta(*next_delta_at))
344 : }
345 173835 : (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
346 : }
347 494762 : }
348 : }
349 :
350 : impl LayerMap {
351 : ///
352 : /// Find the latest layer (by lsn.end) that covers the given
353 : /// 'key', with lsn.start < 'end_lsn'.
354 : ///
355 : /// The caller of this function is the page reconstruction
356 : /// algorithm looking for the next relevant delta layer, or
357 : /// the terminal image layer. The caller will pass the lsn_floor
358 : /// value as end_lsn in the next call to search.
359 : ///
360 : /// If there's an image layer exactly below the given end_lsn,
361 : /// search should return that layer regardless if there are
362 : /// overlapping deltas.
363 : ///
364 : /// If the latest layer is a delta and there is an overlapping
365 : /// image with it below, the lsn_floor returned should be right
366 : /// above that image so we don't skip it in the search. Otherwise
367 : /// the lsn_floor returned should be the bottom of the delta layer
368 : /// because we should make as much progress down the lsn axis
369 : /// as possible. It's fine if this way we skip some overlapping
370 : /// deltas, because the delta we returned would contain the same
371 : /// wal content.
372 : ///
373 : /// TODO: This API is convoluted and inefficient. If the caller
374 : /// makes N search calls, we'll end up finding the same latest
375 : /// image layer N times. We should either cache the latest image
376 : /// layer result, or simplify the api to `get_latest_image` and
377 : /// `get_latest_delta`, and only call `get_latest_image` once.
378 : ///
379 : /// NOTE: This only searches the 'historic' layers, *not* the
380 : /// 'open' and 'frozen' layers!
381 : ///
382 71982 : pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
383 71982 : let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
384 71982 : let latest_delta = version.delta_coverage.query(key.to_i128());
385 71982 : let latest_image = version.image_coverage.query(key.to_i128());
386 71982 :
387 71982 : Self::select_layer(latest_delta, latest_image, end_lsn)
388 71982 : }
389 :
390 321823 : fn select_layer(
391 321823 : delta_layer: Option<Arc<PersistentLayerDesc>>,
392 321823 : image_layer: Option<Arc<PersistentLayerDesc>>,
393 321823 : end_lsn: Lsn,
394 321823 : ) -> Option<SearchResult> {
395 321823 : assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
396 321823 : assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
397 :
398 321823 : match (delta_layer, image_layer) {
399 11598 : (None, None) => None,
400 36212 : (None, Some(image)) => {
401 36212 : let lsn_floor = image.get_lsn_range().start;
402 36212 : Some(SearchResult {
403 36212 : layer: image,
404 36212 : lsn_floor,
405 36212 : })
406 : }
407 77542 : (Some(delta), None) => {
408 77542 : let lsn_floor = delta.get_lsn_range().start;
409 77542 : Some(SearchResult {
410 77542 : layer: delta,
411 77542 : lsn_floor,
412 77542 : })
413 : }
414 196471 : (Some(delta), Some(image)) => {
415 196471 : let img_lsn = image.get_lsn_range().start;
416 196471 : let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
417 196471 : let image_exact_match = img_lsn + 1 == end_lsn;
418 196471 : if image_is_newer || image_exact_match {
419 30808 : Some(SearchResult {
420 30808 : layer: image,
421 30808 : lsn_floor: img_lsn,
422 30808 : })
423 : } else {
424 165663 : let lsn_floor =
425 165663 : std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
426 165663 : Some(SearchResult {
427 165663 : layer: delta,
428 165663 : lsn_floor,
429 165663 : })
430 : }
431 : }
432 : }
433 321823 : }
434 :
435 470366 : pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> RangeSearchResult {
436 470366 : let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) {
437 244921 : Some(version) => version,
438 : None => {
439 225445 : let mut result = RangeSearchResult::new();
440 225445 : result.not_found.add_range(key_range);
441 225445 : return result;
442 : }
443 : };
444 :
445 244921 : let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
446 244921 : let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
447 244921 : let image_changes = version.image_coverage.range_overlaps(&raw_range);
448 244921 :
449 244921 : let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
450 244921 : collector.collect()
451 470366 : }
452 :
453 : /// Start a batch of updates, applied on drop
454 1888 : pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
455 1888 : BatchedUpdates { layer_map: self }
456 1888 : }
457 :
458 : ///
459 : /// Insert an on-disk layer
460 : ///
461 : /// Helper function for BatchedUpdates::insert_historic
462 : ///
463 : /// TODO(chi): remove L generic so that we do not need to pass layer object.
464 4834 : pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
465 4834 : // TODO: See #3869, resulting #4088, attempted fix and repro #4094
466 4834 :
467 4834 : if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
468 996 : self.l0_delta_layers.push(layer_desc.clone().into());
469 3838 : }
470 :
471 4834 : self.historic.insert(
472 4834 : historic_layer_coverage::LayerKey::from(&layer_desc),
473 4834 : layer_desc.into(),
474 4834 : );
475 4834 : }
476 :
477 : ///
478 : /// Remove an on-disk layer from the map.
479 : ///
480 : /// Helper function for BatchedUpdates::remove_historic
481 : ///
482 486 : pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
483 486 : self.historic
484 486 : .remove(historic_layer_coverage::LayerKey::from(layer_desc));
485 486 : let layer_key = layer_desc.key();
486 486 : if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
487 404 : let len_before = self.l0_delta_layers.len();
488 404 : let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
489 5524 : l0_delta_layers.retain(|other| other.key() != layer_key);
490 404 : self.l0_delta_layers = l0_delta_layers;
491 404 : // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
492 404 : // there's a chance that the comparison fails at runtime due to it comparing (pointer,
493 404 : // vtable) pairs.
494 404 : assert_eq!(
495 404 : self.l0_delta_layers.len(),
496 404 : len_before - 1,
497 0 : "failed to locate removed historic layer from l0_delta_layers"
498 : );
499 82 : }
500 486 : }
501 :
502 : /// Helper function for BatchedUpdates::drop.
503 1890 : pub(self) fn flush_updates(&mut self) {
504 1890 : self.historic.rebuild();
505 1890 : }
506 :
507 : /// Is there a newer image layer for given key- and LSN-range? Or a set
508 : /// of image layers within the specified lsn range that cover the entire
509 : /// specified key range?
510 : ///
511 : /// This is used for garbage collection, to determine if an old layer can
512 : /// be deleted.
513 8 : pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
514 8 : if key.is_empty() {
515 : // Vacuously true. There's a newer image for all 0 of the kerys in the range.
516 0 : return true;
517 8 : }
518 :
519 8 : let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
520 8 : Some(v) => v,
521 0 : None => return false,
522 : };
523 :
524 8 : let start = key.start.to_i128();
525 8 : let end = key.end.to_i128();
526 8 :
527 10 : let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
528 10 : Some(layer) => layer.get_lsn_range().start >= lsn.start,
529 0 : None => false,
530 10 : };
531 :
532 : // Check the start is covered
533 8 : if !layer_covers(version.image_coverage.query(start)) {
534 6 : return false;
535 2 : }
536 :
537 : // Check after all changes of coverage
538 2 : for (_, change_val) in version.image_coverage.range(start..end) {
539 2 : if !layer_covers(change_val) {
540 0 : return false;
541 2 : }
542 : }
543 :
544 2 : true
545 8 : }
546 :
547 3478 : pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
548 3478 : self.historic.iter()
549 3478 : }
550 :
551 : /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
552 1073097 : pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
553 1073097 : where
554 1073097 : Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
555 1073097 : {
556 1073097 : if let Some(open) = &self.open_layer {
557 910700 : if pred(open) {
558 604205 : return Some(open.clone());
559 306495 : }
560 162397 : }
561 :
562 468892 : self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
563 1073097 : }
564 :
565 : ///
566 : /// Divide the whole given range of keys into sub-ranges based on the latest
567 : /// image layer that covers each range at the specified lsn (inclusive).
568 : /// This is used when creating new image layers.
569 14 : pub fn image_coverage(
570 14 : &self,
571 14 : key_range: &Range<Key>,
572 14 : lsn: Lsn,
573 14 : ) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
574 14 : let version = match self.historic.get().unwrap().get_version(lsn.0) {
575 14 : Some(v) => v,
576 0 : None => return vec![],
577 : };
578 :
579 14 : let start = key_range.start.to_i128();
580 14 : let end = key_range.end.to_i128();
581 14 :
582 14 : // Initialize loop variables
583 14 : let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
584 14 : let mut current_key = start;
585 14 : let mut current_val = version.image_coverage.query(start);
586 :
587 : // Loop through the change events and push intervals
588 14 : for (change_key, change_val) in version.image_coverage.range(start..end) {
589 0 : let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
590 0 : coverage.push((kr, current_val.take()));
591 0 : current_key = change_key;
592 0 : current_val.clone_from(&change_val);
593 0 : }
594 :
595 : // Add the final interval
596 14 : let kr = Key::from_i128(current_key)..Key::from_i128(end);
597 14 : coverage.push((kr, current_val.take()));
598 14 :
599 14 : coverage
600 14 : }
601 :
602 : /// Check if the key range resembles that of an L0 layer.
603 5656 : pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
604 5656 : is_delta_layer && key_range == &(Key::MIN..Key::MAX)
605 5656 : }
606 :
607 : /// This function determines which layers are counted in `count_deltas`:
608 : /// layers that should count towards deciding whether or not to reimage
609 : /// a certain partition range.
610 : ///
611 : /// There are two kinds of layers we currently consider reimage-worthy:
612 : ///
613 : /// Case 1: Non-L0 layers are currently reimage-worthy by default.
614 : /// TODO Some of these layers are very sparse and cover the entire key
615 : /// range. Replacing 256MB of data (or less!) with terabytes of
616 : /// images doesn't seem wise. We need a better heuristic, possibly
617 : /// based on some of these factors:
618 : /// a) whether this layer has any wal in this partition range
619 : /// b) the size of the layer
620 : /// c) the number of images needed to cover it
621 : /// d) the estimated time until we'll have to reimage over it for GC
622 : ///
623 : /// Case 2: Since L0 layers by definition cover the entire key space, we consider
624 : /// them reimage-worthy only when the entire key space can be covered by very few
625 : /// images (currently 1).
626 : /// TODO The optimal number should probably be slightly higher than 1, but to
627 : /// implement that we need to plumb a lot more context into this function
628 : /// than just the current partition_range.
629 0 : pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
630 0 : // Case 1
631 0 : if !Self::is_l0(&layer.key_range, layer.is_delta) {
632 0 : return true;
633 0 : }
634 0 :
635 0 : // Case 2
636 0 : if partition_range == &(Key::MIN..Key::MAX) {
637 0 : return true;
638 0 : }
639 0 :
640 0 : false
641 0 : }
642 :
643 : /// Count the height of the tallest stack of reimage-worthy deltas
644 : /// in this 2d region.
645 : ///
646 : /// If `limit` is provided we don't try to count above that number.
647 : ///
648 : /// This number is used to compute the largest number of deltas that
649 : /// we'll need to visit for any page reconstruction in this region.
650 : /// We use this heuristic to decide whether to create an image layer.
651 14 : pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
652 14 : // We get the delta coverage of the region, and for each part of the coverage
653 14 : // we recurse right underneath the delta. The recursion depth is limited by
654 14 : // the largest result this function could return, which is in practice between
655 14 : // 3 and 10 (since we usually try to create an image when the number gets larger).
656 14 :
657 14 : if lsn.is_empty() || key.is_empty() || limit == Some(0) {
658 0 : return 0;
659 14 : }
660 :
661 14 : let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
662 14 : Some(v) => v,
663 0 : None => return 0,
664 : };
665 :
666 14 : let start = key.start.to_i128();
667 14 : let end = key.end.to_i128();
668 14 :
669 14 : // Initialize loop variables
670 14 : let mut max_stacked_deltas = 0;
671 14 : let mut current_key = start;
672 14 : let mut current_val = version.delta_coverage.query(start);
673 :
674 : // Loop through the delta coverage and recurse on each part
675 14 : for (change_key, change_val) in version.delta_coverage.range(start..end) {
676 : // If there's a relevant delta in this part, add 1 and recurse down
677 0 : if let Some(val) = ¤t_val {
678 0 : if val.get_lsn_range().end > lsn.start {
679 0 : let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
680 0 : let lr = lsn.start..val.get_lsn_range().start;
681 0 : if !kr.is_empty() {
682 0 : let base_count = Self::is_reimage_worthy(val, key) as usize;
683 0 : let new_limit = limit.map(|l| l - base_count);
684 0 : let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
685 0 : max_stacked_deltas = std::cmp::max(
686 0 : max_stacked_deltas,
687 0 : base_count + max_stacked_deltas_underneath,
688 0 : );
689 0 : }
690 0 : }
691 0 : }
692 :
693 0 : current_key = change_key;
694 0 : current_val.clone_from(&change_val);
695 : }
696 :
697 : // Consider the last part
698 14 : if let Some(val) = ¤t_val {
699 0 : if val.get_lsn_range().end > lsn.start {
700 0 : let kr = Key::from_i128(current_key)..Key::from_i128(end);
701 0 : let lr = lsn.start..val.get_lsn_range().start;
702 0 :
703 0 : if !kr.is_empty() {
704 0 : let base_count = Self::is_reimage_worthy(val, key) as usize;
705 0 : let new_limit = limit.map(|l| l - base_count);
706 0 : let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
707 0 : max_stacked_deltas = std::cmp::max(
708 0 : max_stacked_deltas,
709 0 : base_count + max_stacked_deltas_underneath,
710 0 : );
711 0 : }
712 0 : }
713 14 : }
714 :
715 14 : max_stacked_deltas
716 14 : }
717 :
718 : /// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
719 : ///
720 : /// The `partition_range` argument is used as context for the reimage-worthiness decision.
721 : ///
722 : /// Used as a helper for correctness checks only. Performance not critical.
723 0 : pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range<Key>) -> usize {
724 0 : match self.search(key, lsn) {
725 0 : Some(search_result) => {
726 0 : if search_result.layer.is_incremental() {
727 0 : (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize)
728 0 : + self.get_difficulty(search_result.lsn_floor, key, partition_range)
729 : } else {
730 0 : 0
731 : }
732 : }
733 0 : None => 0,
734 : }
735 0 : }
736 :
737 : /// Used for correctness checking. Results are expected to be identical to
738 : /// self.get_difficulty_map. Assumes self.search is correct.
739 0 : pub fn get_difficulty_map_bruteforce(
740 0 : &self,
741 0 : lsn: Lsn,
742 0 : partitioning: &KeyPartitioning,
743 0 : ) -> Vec<usize> {
744 0 : // Looking at the difficulty as a function of key, it could only increase
745 0 : // when a delta layer starts or an image layer ends. Therefore it's sufficient
746 0 : // to check the difficulties at:
747 0 : // - the key.start for each non-empty part range
748 0 : // - the key.start for each delta
749 0 : // - the key.end for each image
750 0 : let keys_iter: Box<dyn Iterator<Item = Key>> = {
751 0 : let mut keys: Vec<Key> = self
752 0 : .iter_historic_layers()
753 0 : .map(|layer| {
754 0 : if layer.is_incremental() {
755 0 : layer.get_key_range().start
756 : } else {
757 0 : layer.get_key_range().end
758 : }
759 0 : })
760 0 : .collect();
761 0 : keys.sort();
762 0 : Box::new(keys.into_iter())
763 0 : };
764 0 : let mut keys_iter = keys_iter.peekable();
765 0 :
766 0 : // Iter the partition and keys together and query all the necessary
767 0 : // keys, computing the max difficulty for each part.
768 0 : partitioning
769 0 : .parts
770 0 : .iter()
771 0 : .map(|part| {
772 0 : let mut difficulty = 0;
773 : // Partition ranges are assumed to be sorted and disjoint
774 : // TODO assert it
775 0 : for range in &part.ranges {
776 0 : if !range.is_empty() {
777 0 : difficulty =
778 0 : std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range));
779 0 : }
780 0 : while let Some(key) = keys_iter.peek() {
781 0 : if key >= &range.end {
782 0 : break;
783 0 : }
784 0 : let key = keys_iter.next().unwrap();
785 0 : if key < range.start {
786 0 : continue;
787 0 : }
788 0 : difficulty =
789 0 : std::cmp::max(difficulty, self.get_difficulty(lsn, key, range));
790 : }
791 : }
792 0 : difficulty
793 0 : })
794 0 : .collect()
795 0 : }
796 :
797 : /// For each part of a keyspace partitioning, return the maximum number of layers
798 : /// that would be needed for page reconstruction in that part at the given LSN.
799 : ///
800 : /// If `limit` is provided we don't try to count above that number.
801 : ///
802 : /// This method is used to decide where to create new image layers. Computing the
803 : /// result for the entire partitioning at once allows this function to be more
804 : /// efficient, and further optimization is possible by using iterators instead,
805 : /// to allow early return.
806 : ///
807 : /// TODO actually use this method instead of count_deltas. Currently we only use
808 : /// it for benchmarks.
809 0 : pub fn get_difficulty_map(
810 0 : &self,
811 0 : lsn: Lsn,
812 0 : partitioning: &KeyPartitioning,
813 0 : limit: Option<usize>,
814 0 : ) -> Vec<usize> {
815 0 : // TODO This is a naive implementation. Perf improvements to do:
816 0 : // 1. Instead of calling self.image_coverage and self.count_deltas,
817 0 : // iterate the image and delta coverage only once.
818 0 : partitioning
819 0 : .parts
820 0 : .iter()
821 0 : .map(|part| {
822 0 : let mut difficulty = 0;
823 0 : for range in &part.ranges {
824 0 : if limit == Some(difficulty) {
825 0 : break;
826 0 : }
827 0 : for (img_range, last_img) in self.image_coverage(range, lsn) {
828 0 : if limit == Some(difficulty) {
829 0 : break;
830 0 : }
831 0 : let img_lsn = if let Some(last_img) = last_img {
832 0 : last_img.get_lsn_range().end
833 : } else {
834 0 : Lsn(0)
835 : };
836 :
837 0 : if img_lsn < lsn {
838 0 : let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
839 0 : difficulty = std::cmp::max(difficulty, num_deltas);
840 0 : }
841 : }
842 : }
843 0 : difficulty
844 0 : })
845 0 : .collect()
846 0 : }
847 :
848 : /// Return all L0 delta layers
849 370 : pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
850 370 : &self.l0_delta_layers
851 370 : }
852 :
853 : /// debugging function to print out the contents of the layer map
854 : #[allow(unused)]
855 2 : pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
856 2 : println!("Begin dump LayerMap");
857 2 :
858 2 : println!("open_layer:");
859 2 : if let Some(open_layer) = &self.open_layer {
860 0 : open_layer.dump(verbose, ctx).await?;
861 2 : }
862 :
863 2 : println!("frozen_layers:");
864 2 : for frozen_layer in self.frozen_layers.iter() {
865 0 : frozen_layer.dump(verbose, ctx).await?;
866 : }
867 :
868 2 : println!("historic_layers:");
869 12 : for desc in self.iter_historic_layers() {
870 12 : desc.dump();
871 12 : }
872 2 : println!("End dump LayerMap");
873 2 : Ok(())
874 2 : }
875 :
876 : /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
877 : /// where we expect to serve reads.
878 : ///
879 : /// This function is O(N) and should be called infrequently. The caller is responsible for
880 : /// looking up and updating the Layer objects for these layer descriptors.
881 200 : pub fn get_visibility(
882 200 : &self,
883 200 : mut read_points: Vec<Lsn>,
884 200 : ) -> (
885 200 : Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
886 200 : KeySpace,
887 200 : ) {
888 : // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
889 : // KeySpace is intended to be composed statically and iterated over.
890 : struct KeyShadow {
891 : // Map of range start to range end
892 : inner: RangeSetBlaze<i128>,
893 : }
894 :
895 : impl KeyShadow {
896 200 : fn new() -> Self {
897 200 : Self {
898 200 : inner: Default::default(),
899 200 : }
900 200 : }
901 :
902 1600 : fn contains(&self, range: Range<Key>) -> bool {
903 1600 : let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
904 1600 : self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
905 1600 : CheckSortedDisjoint::from([range_incl]),
906 1600 : ))
907 1600 : }
908 :
909 : /// Add the input range to the keys covered by self.
910 : ///
911 : /// Return true if inserting this range covered some keys that were previously not covered
912 1870 : fn cover(&mut self, insert: Range<Key>) -> bool {
913 1870 : let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
914 1870 : self.inner.ranges_insert(range_incl)
915 1870 : }
916 :
917 204 : fn reset(&mut self) {
918 204 : self.inner = Default::default();
919 204 : }
920 :
921 200 : fn to_keyspace(&self) -> KeySpace {
922 200 : let mut accum = KeySpaceAccum::new();
923 202 : for range_incl in self.inner.ranges() {
924 202 : let range = Range {
925 202 : start: Key::from_i128(*range_incl.start()),
926 202 : end: Key::from_i128(range_incl.end() + 1),
927 202 : };
928 202 : accum.add_range(range)
929 : }
930 :
931 200 : accum.to_keyspace()
932 200 : }
933 : }
934 :
935 : // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
936 : // and a ReadPoint
937 200 : read_points.sort_by_key(|rp| rp.0);
938 200 : let mut shadow = KeyShadow::new();
939 :
940 : // We will interleave all our read points and layers into a sorted collection
941 : enum Item {
942 : ReadPoint { lsn: Lsn },
943 : Layer(Arc<PersistentLayerDesc>),
944 : }
945 :
946 200 : let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
947 200 : items.extend(self.iter_historic_layers().map(Item::Layer));
948 200 : items.extend(
949 200 : read_points
950 200 : .into_iter()
951 204 : .map(|rp| Item::ReadPoint { lsn: rp }),
952 200 : );
953 200 :
954 200 : // Ordering: we want to iterate like this:
955 200 : // 1. Highest LSNs first
956 200 : // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
957 200 : // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
958 63492 : items.sort_by_key(|item| {
959 63492 : std::cmp::Reverse(match item {
960 63086 : Item::Layer(layer) => {
961 63086 : if layer.is_delta() {
962 29622 : (Lsn(layer.get_lsn_range().end.0 - 1), 0)
963 : } else {
964 33464 : (layer.image_layer_lsn(), 1)
965 : }
966 : }
967 406 : Item::ReadPoint { lsn } => (*lsn, 2),
968 : })
969 63492 : });
970 200 :
971 200 : let mut results = Vec::with_capacity(self.historic.len());
972 200 :
973 200 : let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
974 :
975 3874 : for item in items {
976 3674 : let (reached_lsn, is_readpoint) = match &item {
977 204 : Item::ReadPoint { lsn } => (lsn, true),
978 3470 : Item::Layer(layer) => (&layer.lsn_range.start, false),
979 : };
980 3674 : maybe_covered_deltas.retain(|d| {
981 106 : if *reached_lsn >= d.lsn_range.start && is_readpoint {
982 : // We encountered a readpoint within the delta layer: it is visible
983 :
984 2 : results.push((d.clone(), LayerVisibilityHint::Visible));
985 2 : false
986 104 : } else if *reached_lsn < d.lsn_range.start {
987 : // We passed the layer's range without encountering a read point: it is not visible
988 32 : results.push((d.clone(), LayerVisibilityHint::Covered));
989 32 : false
990 : } else {
991 : // We're still in the delta layer: continue iterating
992 72 : true
993 : }
994 3674 : });
995 3674 :
996 3674 : match item {
997 204 : Item::ReadPoint { lsn: _lsn } => {
998 204 : // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
999 204 : // to assume that the whole key range is visible at the branch point.
1000 204 : shadow.reset();
1001 204 : }
1002 3470 : Item::Layer(layer) => {
1003 3470 : let visibility = if layer.is_delta() {
1004 1600 : if shadow.contains(layer.get_key_range()) {
1005 : // If a layer isn't visible based on current state, we must defer deciding whether
1006 : // it is truly not visible until we have advanced past the delta's range: we might
1007 : // encounter another branch point within this delta layer's LSN range.
1008 38 : maybe_covered_deltas.push(layer);
1009 38 : continue;
1010 : } else {
1011 1562 : LayerVisibilityHint::Visible
1012 : }
1013 : } else {
1014 1870 : let modified = shadow.cover(layer.get_key_range());
1015 1870 : if modified {
1016 : // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
1017 1834 : LayerVisibilityHint::Visible
1018 : } else {
1019 : // An image layer in a region that was already covered
1020 36 : LayerVisibilityHint::Covered
1021 : }
1022 : };
1023 :
1024 3432 : results.push((layer, visibility));
1025 : }
1026 : }
1027 : }
1028 :
1029 : // Drain any remaining maybe_covered deltas
1030 200 : results.extend(
1031 200 : maybe_covered_deltas
1032 200 : .into_iter()
1033 200 : .map(|d| (d, LayerVisibilityHint::Covered)),
1034 200 : );
1035 200 :
1036 200 : (results, shadow.to_keyspace())
1037 200 : }
1038 : }
1039 :
1040 : #[cfg(test)]
1041 : mod tests {
1042 : use crate::tenant::{storage_layer::LayerName, IndexPart};
1043 : use pageserver_api::{
1044 : key::DBDIR_KEY,
1045 : keyspace::{KeySpace, KeySpaceRandomAccum},
1046 : };
1047 : use std::{collections::HashMap, path::PathBuf};
1048 : use utils::{
1049 : id::{TenantId, TimelineId},
1050 : shard::TenantShardId,
1051 : };
1052 :
1053 : use super::*;
1054 :
1055 : #[derive(Clone)]
1056 : struct LayerDesc {
1057 : key_range: Range<Key>,
1058 : lsn_range: Range<Lsn>,
1059 : is_delta: bool,
1060 : }
1061 :
1062 2 : fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
1063 2 : let mut layer_map = LayerMap::default();
1064 :
1065 12 : for layer in layers {
1066 10 : layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
1067 10 : layer.key_range,
1068 10 : layer.lsn_range,
1069 10 : layer.is_delta,
1070 10 : ));
1071 10 : }
1072 :
1073 2 : layer_map.flush_updates();
1074 2 : layer_map
1075 2 : }
1076 :
1077 3540 : fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
1078 3540 : assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
1079 3540 : let lhs: HashMap<SearchResult, KeySpace> = lhs
1080 3540 : .found
1081 3540 : .into_iter()
1082 8230 : .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
1083 3540 : .collect();
1084 3540 : let rhs: HashMap<SearchResult, KeySpace> = rhs
1085 3540 : .found
1086 3540 : .into_iter()
1087 8230 : .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
1088 3540 : .collect();
1089 3540 :
1090 3540 : assert_eq!(lhs, rhs);
1091 3540 : }
1092 :
1093 : #[cfg(test)]
1094 3540 : fn brute_force_range_search(
1095 3540 : layer_map: &LayerMap,
1096 3540 : key_range: Range<Key>,
1097 3540 : end_lsn: Lsn,
1098 3540 : ) -> RangeSearchResult {
1099 3540 : let mut range_search_result = RangeSearchResult::new();
1100 3540 :
1101 3540 : let mut key = key_range.start;
1102 75520 : while key != key_range.end {
1103 71980 : let res = layer_map.search(key, end_lsn);
1104 71980 : match res {
1105 61320 : Some(res) => {
1106 61320 : range_search_result
1107 61320 : .found
1108 61320 : .entry(res)
1109 61320 : .or_default()
1110 61320 : .add_key(key);
1111 61320 : }
1112 10660 : None => {
1113 10660 : range_search_result.not_found.add_key(key);
1114 10660 : }
1115 : }
1116 :
1117 71980 : key = key.next();
1118 : }
1119 :
1120 3540 : range_search_result
1121 3540 : }
1122 :
1123 : #[test]
1124 2 : fn ranged_search_on_empty_layer_map() {
1125 2 : let layer_map = LayerMap::default();
1126 2 : let range = Key::from_i128(100)..Key::from_i128(200);
1127 2 :
1128 2 : let res = layer_map.range_search(range.clone(), Lsn(100));
1129 2 : assert_eq!(
1130 2 : res.not_found.to_keyspace(),
1131 2 : KeySpace {
1132 2 : ranges: vec![range]
1133 2 : }
1134 2 : );
1135 2 : }
1136 :
1137 : #[test]
1138 2 : fn ranged_search() {
1139 2 : let layers = vec![
1140 2 : LayerDesc {
1141 2 : key_range: Key::from_i128(15)..Key::from_i128(50),
1142 2 : lsn_range: Lsn(0)..Lsn(5),
1143 2 : is_delta: false,
1144 2 : },
1145 2 : LayerDesc {
1146 2 : key_range: Key::from_i128(10)..Key::from_i128(20),
1147 2 : lsn_range: Lsn(5)..Lsn(20),
1148 2 : is_delta: true,
1149 2 : },
1150 2 : LayerDesc {
1151 2 : key_range: Key::from_i128(15)..Key::from_i128(25),
1152 2 : lsn_range: Lsn(20)..Lsn(30),
1153 2 : is_delta: true,
1154 2 : },
1155 2 : LayerDesc {
1156 2 : key_range: Key::from_i128(35)..Key::from_i128(40),
1157 2 : lsn_range: Lsn(25)..Lsn(35),
1158 2 : is_delta: true,
1159 2 : },
1160 2 : LayerDesc {
1161 2 : key_range: Key::from_i128(35)..Key::from_i128(40),
1162 2 : lsn_range: Lsn(35)..Lsn(40),
1163 2 : is_delta: false,
1164 2 : },
1165 2 : ];
1166 2 :
1167 2 : let layer_map = create_layer_map(layers.clone());
1168 122 : for start in 0..60 {
1169 3540 : for end in (start + 1)..60 {
1170 3540 : let range = Key::from_i128(start)..Key::from_i128(end);
1171 3540 : let result = layer_map.range_search(range.clone(), Lsn(100));
1172 3540 : let expected = brute_force_range_search(&layer_map, range, Lsn(100));
1173 3540 :
1174 3540 : assert_range_search_result_eq(result, expected);
1175 3540 : }
1176 : }
1177 2 : }
1178 :
1179 : #[test]
1180 2 : fn layer_visibility_basic() {
1181 2 : // A simple synthetic input, as a smoke test.
1182 2 : let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
1183 2 : let timeline_id = TimelineId::generate();
1184 2 : let mut layer_map = LayerMap::default();
1185 2 : let mut updates = layer_map.batch_update();
1186 :
1187 : const FAKE_LAYER_SIZE: u64 = 1024;
1188 :
1189 2 : let inject_delta = |updates: &mut BatchedUpdates,
1190 : key_start: i128,
1191 : key_end: i128,
1192 : lsn_start: u64,
1193 14 : lsn_end: u64| {
1194 14 : let desc = PersistentLayerDesc::new_delta(
1195 14 : tenant_shard_id,
1196 14 : timeline_id,
1197 14 : Range {
1198 14 : start: Key::from_i128(key_start),
1199 14 : end: Key::from_i128(key_end),
1200 14 : },
1201 14 : Range {
1202 14 : start: Lsn(lsn_start),
1203 14 : end: Lsn(lsn_end),
1204 14 : },
1205 14 : 1024,
1206 14 : );
1207 14 : updates.insert_historic(desc.clone());
1208 14 : desc
1209 14 : };
1210 :
1211 2 : let inject_image =
1212 12 : |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
1213 12 : let desc = PersistentLayerDesc::new_img(
1214 12 : tenant_shard_id,
1215 12 : timeline_id,
1216 12 : Range {
1217 12 : start: Key::from_i128(key_start),
1218 12 : end: Key::from_i128(key_end),
1219 12 : },
1220 12 : Lsn(lsn),
1221 12 : FAKE_LAYER_SIZE,
1222 12 : );
1223 12 : updates.insert_historic(desc.clone());
1224 12 : desc
1225 12 : };
1226 :
1227 : //
1228 : // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
1229 : // we expect to handle. You can follow these examples through in the same order as they would be processed
1230 : // by the function under test.
1231 : //
1232 :
1233 2 : let mut read_points = vec![Lsn(1000)];
1234 2 :
1235 2 : // A delta ahead of any image layer
1236 2 : let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
1237 2 :
1238 2 : // An image layer is visible and covers some layers beneath itself
1239 2 : let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
1240 2 :
1241 2 : // A delta layer covered by the image layer: should be covered
1242 2 : let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
1243 2 :
1244 2 : // A delta layer partially covered by an image layer: should be visible
1245 2 : let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
1246 2 :
1247 2 : // A delta layer not covered by an image layer: should be visible
1248 2 : let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
1249 2 :
1250 2 : // An image layer covered by the image layer above: should be covered
1251 2 : let covered_image = inject_image(&mut updates, 10, 20, 89);
1252 2 :
1253 2 : // An image layer partially covered by an image layer: should be visible
1254 2 : let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
1255 2 :
1256 2 : // An image layer not covered by an image layer: should be visible
1257 2 : let not_covered_image = inject_image(&mut updates, 1, 4, 89);
1258 2 :
1259 2 : // A read point: this will make subsequent layers below here visible, even if there are
1260 2 : // more recent layers covering them.
1261 2 : read_points.push(Lsn(80));
1262 2 :
1263 2 : // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
1264 2 : let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
1265 2 :
1266 2 : // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
1267 2 : // the read point should make it visible, even though its end LSN is covered
1268 2 : let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
1269 2 : let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
1270 2 : read_points.push(Lsn(65));
1271 2 : let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
1272 2 :
1273 2 : let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
1274 2 :
1275 2 : updates.flush();
1276 2 :
1277 2 : let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
1278 2 : let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
1279 2 :
1280 2 : assert_eq!(
1281 2 : layer_visibilities.get(&ahead_layer),
1282 2 : Some(&LayerVisibilityHint::Visible)
1283 2 : );
1284 2 : assert_eq!(
1285 2 : layer_visibilities.get(&visible_covering_img),
1286 2 : Some(&LayerVisibilityHint::Visible)
1287 2 : );
1288 2 : assert_eq!(
1289 2 : layer_visibilities.get(&covered_delta),
1290 2 : Some(&LayerVisibilityHint::Covered)
1291 2 : );
1292 2 : assert_eq!(
1293 2 : layer_visibilities.get(&partially_covered_delta),
1294 2 : Some(&LayerVisibilityHint::Visible)
1295 2 : );
1296 2 : assert_eq!(
1297 2 : layer_visibilities.get(¬_covered_delta),
1298 2 : Some(&LayerVisibilityHint::Visible)
1299 2 : );
1300 2 : assert_eq!(
1301 2 : layer_visibilities.get(&covered_image),
1302 2 : Some(&LayerVisibilityHint::Covered)
1303 2 : );
1304 2 : assert_eq!(
1305 2 : layer_visibilities.get(&partially_covered_image),
1306 2 : Some(&LayerVisibilityHint::Visible)
1307 2 : );
1308 2 : assert_eq!(
1309 2 : layer_visibilities.get(¬_covered_image),
1310 2 : Some(&LayerVisibilityHint::Visible)
1311 2 : );
1312 2 : assert_eq!(
1313 2 : layer_visibilities.get(&covered_delta_below_read_point),
1314 2 : Some(&LayerVisibilityHint::Visible)
1315 2 : );
1316 2 : assert_eq!(
1317 2 : layer_visibilities.get(&covering_img_between_read_points),
1318 2 : Some(&LayerVisibilityHint::Visible)
1319 2 : );
1320 2 : assert_eq!(
1321 2 : layer_visibilities.get(&covered_delta_between_read_points),
1322 2 : Some(&LayerVisibilityHint::Covered)
1323 2 : );
1324 2 : assert_eq!(
1325 2 : layer_visibilities.get(&covered_delta_intersects_read_point),
1326 2 : Some(&LayerVisibilityHint::Visible)
1327 2 : );
1328 2 : assert_eq!(
1329 2 : layer_visibilities.get(&visible_img_after_last_read_point),
1330 2 : Some(&LayerVisibilityHint::Visible)
1331 2 : );
1332 :
1333 : // Shadow should include all the images below the last read point
1334 2 : let expected_shadow = KeySpace {
1335 2 : ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
1336 2 : };
1337 2 : assert_eq!(shadow, expected_shadow);
1338 2 : }
1339 :
1340 2 : fn fixture_path(relative: &str) -> PathBuf {
1341 2 : PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
1342 2 : }
1343 :
1344 : #[test]
1345 2 : fn layer_visibility_realistic() {
1346 2 : // Load a large example layermap
1347 2 : let index_raw = std::fs::read_to_string(fixture_path(
1348 2 : "test_data/indices/mixed_workload/index_part.json",
1349 2 : ))
1350 2 : .unwrap();
1351 2 : let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
1352 2 :
1353 2 : let tenant_id = TenantId::generate();
1354 2 : let tenant_shard_id = TenantShardId::unsharded(tenant_id);
1355 2 : let timeline_id = TimelineId::generate();
1356 2 :
1357 2 : let mut layer_map = LayerMap::default();
1358 2 : let mut updates = layer_map.batch_update();
1359 3130 : for (layer_name, layer_metadata) in index.layer_metadata {
1360 3128 : let layer_desc = match layer_name {
1361 1604 : LayerName::Image(layer_name) => PersistentLayerDesc {
1362 1604 : key_range: layer_name.key_range.clone(),
1363 1604 : lsn_range: layer_name.lsn_as_range(),
1364 1604 : tenant_shard_id,
1365 1604 : timeline_id,
1366 1604 : is_delta: false,
1367 1604 : file_size: layer_metadata.file_size,
1368 1604 : },
1369 1524 : LayerName::Delta(layer_name) => PersistentLayerDesc {
1370 1524 : key_range: layer_name.key_range,
1371 1524 : lsn_range: layer_name.lsn_range,
1372 1524 : tenant_shard_id,
1373 1524 : timeline_id,
1374 1524 : is_delta: true,
1375 1524 : file_size: layer_metadata.file_size,
1376 1524 : },
1377 : };
1378 3128 : updates.insert_historic(layer_desc);
1379 : }
1380 2 : updates.flush();
1381 2 :
1382 2 : let read_points = vec![index.metadata.disk_consistent_lsn()];
1383 2 : let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
1384 3130 : for (layer_desc, visibility) in &layer_visibilities {
1385 3128 : tracing::info!("{layer_desc:?}: {visibility:?}");
1386 3128 : eprintln!("{layer_desc:?}: {visibility:?}");
1387 : }
1388 :
1389 : // The shadow should be non-empty, since there were some image layers
1390 2 : assert!(!shadow.ranges.is_empty());
1391 :
1392 : // At least some layers should be marked covered
1393 2 : assert!(layer_visibilities
1394 2 : .iter()
1395 38 : .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
1396 :
1397 2 : let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
1398 :
1399 : // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
1400 3130 : for (layer_desc, visible) in &layer_visibilities {
1401 3128 : let mut coverage = KeySpaceRandomAccum::new();
1402 3128 : let mut covered_by = Vec::new();
1403 :
1404 4892192 : for other_layer in layer_map.iter_historic_layers() {
1405 4892192 : if &other_layer == layer_desc {
1406 3128 : continue;
1407 4889064 : }
1408 4889064 : if !other_layer.is_delta()
1409 2507052 : && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
1410 1263512 : && other_layer.key_range.start <= layer_desc.key_range.end
1411 465304 : && layer_desc.key_range.start <= other_layer.key_range.end
1412 85646 : {
1413 85646 : coverage.add_range(other_layer.get_key_range());
1414 85646 : covered_by.push((*other_layer).clone());
1415 4803418 : }
1416 : }
1417 3128 : let coverage = coverage.to_keyspace();
1418 :
1419 3128 : let expect_visible = if coverage.ranges.len() == 1
1420 756 : && coverage.contains(&layer_desc.key_range.start)
1421 34 : && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
1422 : {
1423 20 : LayerVisibilityHint::Covered
1424 : } else {
1425 3108 : LayerVisibilityHint::Visible
1426 : };
1427 :
1428 3128 : if expect_visible != *visible {
1429 0 : eprintln!(
1430 0 : "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
1431 0 : layer_desc.key_range.start,
1432 0 : layer_desc.key_range.end,
1433 0 : layer_desc.lsn_range.start,
1434 0 : layer_desc.lsn_range.end,
1435 0 : layer_desc.is_delta()
1436 0 : );
1437 0 : if expect_visible == LayerVisibilityHint::Covered {
1438 0 : eprintln!("Covered by:");
1439 0 : for other in covered_by {
1440 0 : eprintln!(
1441 0 : " {}..{} @ {}",
1442 0 : other.get_key_range().start,
1443 0 : other.get_key_range().end,
1444 0 : other.image_layer_lsn()
1445 0 : );
1446 0 : }
1447 0 : if let Some(range) = coverage.ranges.first() {
1448 0 : eprintln!(
1449 0 : "Total coverage from contributing layers: {}..{}",
1450 0 : range.start, range.end
1451 0 : );
1452 0 : } else {
1453 0 : eprintln!(
1454 0 : "Total coverage from contributing layers: {:?}",
1455 0 : coverage.ranges
1456 0 : );
1457 0 : }
1458 0 : }
1459 3128 : }
1460 3128 : assert_eq!(expect_visible, *visible);
1461 : }
1462 :
1463 : // Sanity: the layer that holds latest data for the DBDIR key should always be visible
1464 : // (just using this key as a key that will always exist for any layermap fixture)
1465 2 : let dbdir_layer = layer_map
1466 2 : .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
1467 2 : .unwrap();
1468 2 : assert!(matches!(
1469 2 : layer_visibilities.get(&dbdir_layer.layer).unwrap(),
1470 : LayerVisibilityHint::Visible
1471 : ));
1472 2 : }
1473 : }
|