Line data Source code
1 : //! An LSM tree consists of multiple levels, each exponentially larger than the
2 : //! previous one, and each level consists of multiple "tiers". With tiered
3 : //! compaction, a level is compacted when it has accumulated more than N tiers,
4 : //! forming one tier on the next level.
5 : //!
6 : //! In the pageserver, we don't explicitly track the levels and tiers. Instead,
7 : //! we identify them by looking at the shapes of the layers. It's an easy task
8 : //! for a human, but it's not straightforward to come up with the exact
9 : //! rules, especially when there are cases like interrupted, half-finished
10 : //! compactions, or highly skewed data distributions that have let us "skip"
11 : //! some levels. It's not critical to classify all cases correctly; at worst we
12 : //! delay some compaction work and suffer from more read amplification, or we
13 : //! perform some unnecessary compaction work.
14 : //!
15 : //! `identify_level` performs that shape-matching.
16 : //!
17 : //! It returns a `Level` struct, which has a `depth()` function to count the number
18 : //! of "tiers" in the level. The tier count is the max depth of stacked layers
19 : //! within the level. That's a good measure, because the point of compacting is
20 : //! to reduce read amplification, and the depth is what determines that.
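 : //!
 : //! For example (a hypothetical layer stack): if a level contains three delta
 : //! layers that each cover the whole key range, a read of any key may have to
 : //! check all three, so the level's depth is 3. If the three deltas cover
 : //! disjoint key ranges instead, any given key is covered by at most one of
 : //! them, and the depth is 1.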
21 : //!
22 : //! One interesting effect of this is that if we generate very small delta
23 : //! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
24 : //! because they reach the target size, the L0 compaction will combine them into
25 : //! one larger file. But if the combined file is still smaller than the target
26 : //! file size, the file will still be considered part of L0 on the next
27 : //! iteration.
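 : //!
 : //! As a rough usage sketch (not a doctest; the surrounding compaction loop
 : //! and the `fanout` threshold are hypothetical here):
 : //!
 : //! ```ignore
 : //! let level = identify_level(all_layers, end_lsn, target_file_size).await?;
 : //! if let Some(level) = level {
 : //!     if level.depth() >= fanout {
 : //!         // Compact the layers in `level.layers`, producing one new tier
 : //!         // on the next level.
 : //!     }
 : //! }
 : //! ```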
28 :
29 : use anyhow::bail;
30 : use std::collections::BTreeSet;
31 : use std::ops::Range;
32 : use utils::lsn::Lsn;
33 :
34 : use crate::interface::*;
35 :
36 : use tracing::{info, trace};
37 :
38 : pub struct Level<L> {
39 : pub lsn_range: Range<Lsn>,
40 : pub layers: Vec<L>,
41 : }
42 :
43 : /// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
44 : /// no layers that cross the boundary LSN.
45 : ///
46 : /// A further restriction is that all layers in the returned partition cover at
47 : /// most `lsn_max_size` LSN bytes.
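 : ///
 : /// Returns `Ok(None)` if no level can be identified below `end_lsn`, for
 : /// example because there are no layers at or below it.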
48 6066 : pub async fn identify_level<K, L>(
49 6066 : all_layers: Vec<L>,
50 6066 : end_lsn: Lsn,
51 6066 : lsn_max_size: u64,
52 6066 : ) -> anyhow::Result<Option<Level<L>>>
53 6066 : where
54 6066 : K: CompactionKey,
55 6066 : L: CompactionLayer<K> + Clone,
56 6066 : {
57 6066 : // Filter out layers that are above `end_lsn`; they are completely irrelevant.
58 6066 : let mut layers = Vec::new();
59 42234 : for l in all_layers {
60 36174 : if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
61 : // shouldn't happen. Indicates that the caller passed a bogus
62 : // end_lsn.
63 6 : bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
64 36168 : }
65 36168 : // Include image layers sitting exactly at `end_lsn`.
66 36168 : let is_image = !l.is_delta();
67 36168 : if (is_image && l.lsn_range().start > end_lsn)
68 36168 : || (!is_image && l.lsn_range().start >= end_lsn)
69 : {
70 30 : continue;
71 36138 : }
72 36138 : layers.push(l);
73 : }
74 : // All the remaining layers either belong to this level, or are below it.
75 6060 : info!(
76 0 : "identify level at {}, size {}, num layers below: {}",
77 0 : end_lsn,
78 0 : lsn_max_size,
79 0 : layers.len()
80 : );
81 6060 : if layers.is_empty() {
82 540 : return Ok(None);
83 5520 : }
84 5520 :
85 5520 : // Walk the ranges in LSN order.
86 5520 : //
87 5520 : // ----- end_lsn
88 5520 : // |
89 5520 : // |
90 5520 : // v
91 5520 : //
92 61572 : layers.sort_by_key(|l| l.lsn_range().end);
93 5520 : let mut candidate_start_lsn = end_lsn;
94 5520 : let mut candidate_layers: Vec<L> = Vec::new();
95 5520 : let mut current_best_start_lsn = end_lsn;
96 5520 : let mut current_best_layers: Vec<L> = Vec::new();
97 5520 : let mut iter = layers.into_iter();
98 : loop {
99 14208 : let Some(l) = iter.next_back() else {
100 : // Reached the end. Accept the last candidate.
101 1692 : current_best_start_lsn = candidate_start_lsn;
102 1692 : current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
103 1692 : break;
104 : };
105 12516 : trace!(
106 0 : "inspecting {} for candidate {}, current best {}",
107 0 : l.short_id(),
108 : candidate_start_lsn,
109 : current_best_start_lsn
110 : );
111 :
112 12516 : let r = l.lsn_range();
113 12516 :
114 12516 : // Image layers don't restrict our choice of cutoff LSN
115 12516 : if l.is_delta() {
116 : // Is this candidate workable? In other words, are there any
117 : // delta layers that span across this LSN?
118 : //
119 : // Valid: Not valid:
120 : // + +
121 : // | | +
122 : // + <- candidate + | <- candidate
123 : // + +
124 : // |
125 : // +
126 12510 : if r.end <= candidate_start_lsn {
127 12420 : // Hooray, there are no crossing LSNs, and we have visited
128 12420 : // all the layers within candidate..end_lsn. The
129 12420 : // current candidate can be accepted.
130 12420 : current_best_start_lsn = r.end;
131 12420 : current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
132 12420 : candidate_start_lsn = r.start;
133 12420 : }
134 :
135 : // Is it small enough to be considered part of this level?
136 12510 : if r.end.0 - r.start.0 > lsn_max_size {
137 : // Too large, this layer belongs to next level. Stop.
138 3828 : trace!(
139 0 : "too large {}, size {} vs {}",
140 0 : l.short_id(),
141 0 : r.end.0 - r.start.0,
142 : lsn_max_size
143 : );
144 3828 : break;
145 8682 : }
146 8682 :
147 8682 : // If this crosses the candidate lsn, push it down.
148 8682 : if r.start < candidate_start_lsn {
149 12 : trace!(
150 0 : "layer {} prevents from stopping at {}",
151 0 : l.short_id(),
152 : candidate_start_lsn
153 : );
154 12 : candidate_start_lsn = r.start;
155 8670 : }
156 6 : }
157 :
158 : // Include this layer in our candidate
159 8688 : candidate_layers.push(l);
160 : }
161 :
162 5520 : Ok(if current_best_start_lsn == end_lsn {
163 : // empty level
164 1086 : None
165 : } else {
166 4434 : Some(Level {
167 4434 : lsn_range: current_best_start_lsn..end_lsn,
168 4434 : layers: current_best_layers,
169 4434 : })
170 : })
171 6066 : }
172 :
173 : impl<L> Level<L> {
174 : /// Count the number of deltas stacked on each other.
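 : ///
 : /// Only delta layers above the topmost image layer at a given key contribute
 : /// to the depth at that key. Single-key delta layers are ignored entirely,
 : /// since they cannot be split any further by key.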
175 4434 : pub fn depth<K>(&self) -> u64
176 4434 : where
177 4434 : K: CompactionKey,
178 4434 : L: CompactionLayer<K>,
179 4434 : {
180 4434 : struct Event<K> {
181 4434 : key: K,
182 4434 : layer_idx: usize,
183 4434 : start: bool,
184 4434 : }
185 4434 : let mut events: Vec<Event<K>> = Vec::new();
186 8670 : for (idx, l) in self.layers.iter().enumerate() {
187 8670 : let key_range = l.key_range();
188 8670 : if key_range.end == key_range.start.next() && l.is_delta() {
189 : // Ignore single-key delta layers: they can be stacked on top of each other,
190 : // since that is the only way to cut them further.
191 72 : continue;
192 8598 : }
193 8598 : events.push(Event {
194 8598 : key: l.key_range().start,
195 8598 : layer_idx: idx,
196 8598 : start: true,
197 8598 : });
198 8598 : events.push(Event {
199 8598 : key: l.key_range().end,
200 8598 : layer_idx: idx,
201 8598 : start: false,
202 8598 : });
203 : }
204 38802 : events.sort_by_key(|e| (e.key, e.start));
205 4434 :
206 4434 : // Sweep the key space left to right. Stop at each distinct key, and
207 4434 : // count the number of deltas on top of the highest image at that key.
208 4434 : //
209 4434 : // This is a little inefficient, as we walk through the active_set on
210 4434 : // every key. We could increment/decrement a counter on each step
211 4434 : // instead, but that'd require a bit more complex bookkeeping.
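 : //
 : // For example (hypothetical layout): with three delta layers A, B and C,
 : // where A and B overlap on some key range and C is disjoint from both, the
 : // active set at a key inside the overlap holds A and B (depth 2), while at
 : // any other covered key it holds a single delta (depth 1), so the result
 : // is 2.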
212 4434 : let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
213 4434 : let mut max_depth = 0;
214 4434 : let mut events_iter = events.iter().peekable();
215 21630 : while let Some(e) = events_iter.next() {
216 17196 : let l = &self.layers[e.layer_idx];
217 17196 : let is_image = !l.is_delta();
218 17196 :
219 17196 : // update the active set
220 17196 : if e.start {
221 8598 : active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
222 8598 : } else {
223 8598 : active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
224 8598 : }
225 :
226 : // recalculate depth if this was the last event at this point
227 17196 : let more_events_at_this_key = events_iter
228 17196 : .peek()
229 17196 : .map_or(false, |next_e| next_e.key == e.key);
230 17196 : if !more_events_at_this_key {
231 8932 : let mut active_depth = 0;
232 8932 : for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
233 8626 : if *is_image {
234 12 : break;
235 8614 : }
236 8614 : active_depth += 1;
237 : }
238 8932 : if active_depth > max_depth {
239 4444 : max_depth = active_depth;
240 4488 : }
241 8264 : }
242 : }
243 4434 : debug_assert_eq!(active_set, BTreeSet::new());
244 4434 : max_depth
245 4434 : }
246 : }
247 :
248 : #[cfg(test)]
249 : mod tests {
250 : use super::*;
251 : use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
252 : use std::sync::{Arc, Mutex};
253 :
254 120 : fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
255 120 : MockLayer::Delta(Arc::new(MockDeltaLayer {
256 120 : key_range,
257 120 : lsn_range,
258 120 : // identify_level() doesn't pay attention to the rest of the fields
259 120 : file_size: 0,
260 120 : deleted: Mutex::new(false),
261 120 : records: vec![],
262 120 : }))
263 120 : }
264 :
265 6 : fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
266 6 : MockLayer::Image(Arc::new(MockImageLayer {
267 6 : key_range,
268 6 : lsn_range: lsn..(lsn + 1),
269 6 : // identify_level() doesn't pay attention to the rest of the fields
270 6 : file_size: 0,
271 6 : deleted: Mutex::new(false),
272 6 : }))
273 6 : }
274 :
275 : #[tokio::test]
276 6 : async fn test_identify_level() -> anyhow::Result<()> {
277 6 : let layers = vec![
278 6 : delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
279 6 : delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
280 6 : delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
281 6 : delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
282 6 : delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
283 6 : delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
284 6 : ];
285 6 :
286 6 : // All layers fit in the max file size
287 6 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
288 6 : .await?
289 6 : .unwrap();
290 6 : assert_eq!(level.depth(), 6);
291 6 :
292 6 : // Same LSN, but with a smaller max file size. The second layer from the top
293 6 : // is larger and belongs to the next level.
294 6 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
295 6 : .await?
296 6 : .unwrap();
297 6 : assert_eq!(level.depth(), 1);
298 6 :
299 6 : // Call with a smaller LSN
300 6 : let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
301 6 : .await?
302 6 : .unwrap();
303 6 : assert_eq!(level.depth(), 2);
304 6 :
305 6 : // Call with an LSN that doesn't partition the space
306 6 : let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
307 6 : assert!(result.is_err());
308 6 : Ok(())
309 6 : }
310 :
311 : #[tokio::test]
312 6 : async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
313 6 : // The files' LSN ranges overlap, so even though more files would fit
314 6 : // under the max file size, they are not included in the level:
315 6 : // including them would force us to also include the oldest file, which
316 6 : // is larger.
317 6 : let layers = vec![
318 6 : delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
319 6 : delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
320 6 : delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
321 6 : delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
322 6 : delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
323 6 : ];
324 6 :
325 6 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
326 6 : .await?
327 6 : .unwrap();
328 6 : assert_eq!(level.depth(), 1);
329 6 :
330 6 : Ok(())
331 6 : }
332 :
333 : #[tokio::test]
334 6 : async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
335 6 : // The key ranges don't overlap, so depth is only 1.
336 6 : let layers = vec![
337 6 : delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
338 6 : delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
339 6 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
340 6 : ];
341 6 :
342 6 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
343 6 : .await?
344 6 : .unwrap();
345 6 : assert_eq!(level.layers.len(), 3);
346 6 : assert_eq!(level.depth(), 1);
347 6 :
348 6 : // Staggered. The 1st and 3rd layer don't overlap with each other.
349 6 : let layers = vec![
350 6 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
351 6 : delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
352 6 : delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
353 6 : ];
354 6 :
355 6 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
356 6 : .await?
357 6 : .unwrap();
358 6 : assert_eq!(level.layers.len(), 3);
359 6 : assert_eq!(level.depth(), 2);
360 6 : Ok(())
361 6 : }
362 :
363 : #[tokio::test]
364 6 : async fn test_depth_images() -> anyhow::Result<()> {
365 6 : let layers: Vec<MockLayer> = vec![
366 6 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
367 6 : delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
368 6 : delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
369 6 : // This covers the same key range as the 2nd delta layer. The depth
370 6 : // in that key range is therefore 0.
371 6 : image(1500..2500, Lsn(0x9000)),
372 6 : ];
373 6 :
374 6 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
375 6 : .await?
376 6 : .unwrap();
377 6 : assert_eq!(level.layers.len(), 4);
378 6 : assert_eq!(level.depth(), 1);
379 6 : Ok(())
380 6 : }
381 : }