Line data Source code
1 : //! An LSM tree consists of multiple levels, each exponentially larger than the
2 : //! previous level. And each level consists of multiple "tiers". With tiered
3 : //! compaction, a level is compacted when it has accumulated more than N tiers,
4 : //! forming one tier on the next level.
5 : //!
6 : //! In the pageserver, we don't explicitly track the levels and tiers. Instead,
7 : //! we identify them by looking at the shapes of the layers. It's an easy task
8 : //! for a human, but it's not straightforward to come up with the exact
9 : //! rules, especially in cases like interrupted, half-finished compactions or
10 : //! highly skewed data distributions that have led us to "skip" some levels.
11 : //! It's not critical to classify all cases correctly; at worst we delay some
12 : //! compaction work and suffer from more read amplification, or we
13 : //! perform some unnecessary compaction work.
14 : //!
15 : //! `identify_level` performs that shape-matching.
16 : //!
17 : //! It returns a `Level` struct, which has a `depth()` function to count the number
18 : //! of "tiers" in the level. The tier count is the max depth of stacked layers
19 : //! within the level. That's a good measure, because the point of compacting is
20 : //! to reduce read amplification, and the depth is what determines that.
21 : //!
22 : //! One interesting effect of this is that if we generate very small delta
23 : //! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
24 : //! because they reach the target size, the L0 compaction will combine them into
25 : //! one larger file. But if the combined file is still smaller than the target
26 : //! file size, the file will still be considered to be part of L0 at the next
27 : //! iteration.
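//!
//! As a rough sketch of how a caller might put these pieces together (the
//! `fanout` threshold and the `compact_level` call are hypothetical and not
//! part of this module):
//!
//! ```ignore
//! // Identify the level ending at `end_lsn`, considering only layers that
//! // cover at most `lsn_max_size` LSN bytes.
//! if let Some(level) = identify_level(all_layers, end_lsn, lsn_max_size).await? {
//!     // depth() is the number of delta "tiers" stacked on top of each other.
//!     // With tiered compaction, compact once enough tiers have accumulated.
//!     if level.depth() >= fanout {
//!         compact_level(&level).await?;
//!     }
//! }
//! ```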
28 :
29 : use std::collections::BTreeSet;
30 : use std::ops::Range;
31 :
32 : use anyhow::bail;
33 : use tracing::{info, trace};
34 : use utils::lsn::Lsn;
35 :
36 : use crate::interface::*;
37 :
38 : pub struct Level<L> {
39 : pub lsn_range: Range<Lsn>,
40 : pub layers: Vec<L>,
41 : }
42 :
43 : /// Identify an LSN < `end_lsn` that partitions the LSN space, so that no
44 : /// layers cross the boundary LSN.
45 : ///
46 : /// A further restriction is that all layers in the returned level cover at
47 : /// most `lsn_max_size` LSN bytes.
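///
/// For example (mirroring the unit tests below): with delta layers covering
/// 0x1000..0x2000, 0x2000..0x3000 and 0x5000..0x7000, an `end_lsn` of 0x3000
/// is a valid partition point, but an `end_lsn` of 0x6000 is rejected because
/// the 0x5000..0x7000 layer crosses it.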
48 1011 : pub async fn identify_level<K, L>(
49 1011 : all_layers: Vec<L>,
50 1011 : end_lsn: Lsn,
51 1011 : lsn_max_size: u64,
52 1011 : ) -> anyhow::Result<Option<Level<L>>>
53 1011 : where
54 1011 : K: CompactionKey,
55 1011 : L: CompactionLayer<K> + Clone,
56 1011 : {
57 1011 : // filter out layers that are above the `end_lsn`, they are completely irrelevant.
58 1011 : let mut layers = Vec::new();
59 7039 : for l in all_layers {
60 6029 : if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
61 : // shouldn't happen. Indicates that the caller passed a bogus
62 : // end_lsn.
63 1 : bail!(
64 1 : "identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}",
65 1 : end_lsn,
66 1 : l.short_id()
67 1 : );
68 6028 : }
69 6028 : // Include image layers sitting exactly at `end_lsn`.
70 6028 : let is_image = !l.is_delta();
71 6028 : if (is_image && l.lsn_range().start > end_lsn)
72 6028 : || (!is_image && l.lsn_range().start >= end_lsn)
73 : {
74 5 : continue;
75 6023 : }
76 6023 : layers.push(l);
77 : }
78 : // All the remaining layers either belong to this level, or are below it.
79 1010 : info!(
80 0 : "identify level at {}, size {}, num layers below: {}",
81 0 : end_lsn,
82 0 : lsn_max_size,
83 0 : layers.len()
84 : );
85 1010 : if layers.is_empty() {
86 90 : return Ok(None);
87 920 : }
88 920 :
89 920 : // Walk the ranges in LSN order.
90 920 : //
91 920 : // ----- end_lsn
92 920 : // |
93 920 : // |
94 920 : // v
95 920 : //
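    // The layers are sorted in ascending order of their end LSN, but consumed
    // from the back with `next_back()`, i.e. starting from the layer closest
    // to `end_lsn` and walking downwards.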
96 10262 : layers.sort_by_key(|l| l.lsn_range().end);
97 920 : let mut candidate_start_lsn = end_lsn;
98 920 : let mut candidate_layers: Vec<L> = Vec::new();
99 920 : let mut current_best_start_lsn = end_lsn;
100 920 : let mut current_best_layers: Vec<L> = Vec::new();
101 920 : let mut iter = layers.into_iter();
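    // `candidate_start_lsn` is the tentative bottom of the level; delta layers
    // that cross it push it further down. `current_best_*` records the last
    // cut that was confirmed to have no delta layer crossing it.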
102 : loop {
103 2368 : let Some(l) = iter.next_back() else {
104 : // Reached end. Accept the last candidate
105 282 : current_best_start_lsn = candidate_start_lsn;
106 282 : current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
107 282 : break;
108 : };
109 2086 : trace!(
110 0 : "inspecting {} for candidate {}, current best {}",
111 0 : l.short_id(),
112 : candidate_start_lsn,
113 : current_best_start_lsn
114 : );
115 :
116 2086 : let r = l.lsn_range();
117 2086 :
118 2086 : // Image layers don't restrict our choice of cutoff LSN
119 2086 : if l.is_delta() {
120 : // Is this candidate workable? In other words, are there any
121 : // delta layers that span across this LSN?
122 : //
123 : // Valid: Not valid:
124 : // + +
125 : // | | +
126 : // + <- candidate + | <- candidate
127 : // + +
128 : // |
129 : // +
130 2085 : if r.end <= candidate_start_lsn {
131 2070 : // Hooray, there are no crossing LSNs, and we have visited all
132 2070 : // the layers within candidate..end_lsn. The current candidate
133 2070 : // can be accepted.
134 2070 : current_best_start_lsn = r.end;
135 2070 : current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
136 2070 : candidate_start_lsn = r.start;
137 2070 : }
138 :
139 : // Is it small enough to be considered part of this level?
140 2085 : if r.end.0 - r.start.0 > lsn_max_size {
141 : // Too large, this layer belongs to next level. Stop.
142 638 : trace!(
143 0 : "too large {}, size {} vs {}",
144 0 : l.short_id(),
145 0 : r.end.0 - r.start.0,
146 : lsn_max_size
147 : );
148 638 : break;
149 1447 : }
150 1447 :
151 1447 : // If this crosses the candidate LSN, push it down.
152 1447 : if r.start < candidate_start_lsn {
153 2 : trace!(
154 0 : "layer {} prevents from stopping at {}",
155 0 : l.short_id(),
156 : candidate_start_lsn
157 : );
158 2 : candidate_start_lsn = r.start;
159 20 : }
160 1 : }
161 :
162 : // Include this layer in our candidate
163 1448 : candidate_layers.push(l);
164 : }
165 :
166 920 : Ok(if current_best_start_lsn == end_lsn {
167 : // empty level
168 181 : None
169 : } else {
170 739 : Some(Level {
171 739 : lsn_range: current_best_start_lsn..end_lsn,
172 739 : layers: current_best_layers,
173 739 : })
174 : })
175 8 : }
176 :
177 : impl<L> Level<L> {
178 : /// Count the number of deltas stacked on each other.
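    ///
    /// Only deltas above the newest image layer at a given key are counted:
    /// an image layer "resets" the depth for the key range it covers (see
    /// `test_depth_images` below).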
179 739 : pub fn depth<K>(&self) -> u64
180 739 : where
181 739 : K: CompactionKey,
182 739 : L: CompactionLayer<K>,
183 739 : {
184 : struct Event<K> {
185 : key: K,
186 : layer_idx: usize,
187 : start: bool,
188 : }
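        // Build one start and one end event per layer key range; sweeping
        // these in key order (below) tracks which layers overlap each key.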
189 739 : let mut events: Vec<Event<K>> = Vec::new();
190 1445 : for (idx, l) in self.layers.iter().enumerate() {
191 1445 : let key_range = l.key_range();
192 1445 : if key_range.end == key_range.start.next() && l.is_delta() {
193 : // Ignore single-key delta layers: they can legitimately be stacked on top of
194 : // each other, since that is the only way to cut them further.
195 12 : continue;
196 1433 : }
197 1433 : events.push(Event {
198 1433 : key: l.key_range().start,
199 1433 : layer_idx: idx,
200 1433 : start: true,
201 1433 : });
202 1433 : events.push(Event {
203 1433 : key: l.key_range().end,
204 1433 : layer_idx: idx,
205 1433 : start: false,
206 1433 : });
207 : }
208 6304 : events.sort_by_key(|e| (e.key, e.start));
209 739 :
210 739 : // Sweep the key space left to right. Stop at each distinct key, and
211 739 : // count the number of deltas on top of the highest image at that key.
212 739 : //
213 739 : // This is a little inefficient, as we walk through the active_set on
214 739 : // every key. We could increment/decrement a counter on each step
215 739 : // instead, but that'd require a bit more complex bookkeeping.
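        // Active layers at the current sweep position, keyed by
        // (end LSN, is_image, layer index), so that a reverse scan visits the
        // layers with the highest end LSN first.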
216 739 : let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
217 739 : let mut max_depth = 0;
218 739 : let mut events_iter = events.iter().peekable();
219 3605 : while let Some(e) = events_iter.next() {
220 2866 : let l = &self.layers[e.layer_idx];
221 2866 : let is_image = !l.is_delta();
222 2866 :
223 2866 : // update the active set
224 2866 : if e.start {
225 1433 : active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
226 1433 : } else {
227 1433 : active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
228 1433 : }
229 :
230 : // Recalculate depth if this was the last event at this key.
231 2866 : let more_events_at_this_key =
232 2866 : events_iter.peek().is_some_and(|next_e| next_e.key == e.key);
233 2866 : if !more_events_at_this_key {
234 1488 : let mut active_depth = 0;
235 1488 : for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
236 1437 : if *is_image {
237 2 : break;
238 1435 : }
239 1435 : active_depth += 1;
240 : }
241 1488 : if active_depth > max_depth {
242 740 : max_depth = active_depth;
243 740 : }
244 17 : }
245 : }
246 739 : debug_assert_eq!(active_set, BTreeSet::new());
247 739 : max_depth
248 739 : }
249 : }
250 :
251 : #[cfg(test)]
252 : mod tests {
253 : use std::sync::{Arc, Mutex};
254 :
255 : use super::*;
256 : use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
257 :
258 20 : fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
259 20 : MockLayer::Delta(Arc::new(MockDeltaLayer {
260 20 : key_range,
261 20 : lsn_range,
262 20 : // identify_level() doesn't pay attention to the rest of the fields
263 20 : file_size: 0,
264 20 : deleted: Mutex::new(false),
265 20 : records: vec![],
266 20 : }))
267 20 : }
268 :
269 1 : fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
270 1 : MockLayer::Image(Arc::new(MockImageLayer {
271 1 : key_range,
272 1 : lsn_range: lsn..(lsn + 1),
273 1 : // identify_level() doesn't pay attention to the rest of the fields
274 1 : file_size: 0,
275 1 : deleted: Mutex::new(false),
276 1 : }))
277 1 : }
278 :
279 : #[tokio::test]
280 1 : async fn test_identify_level() -> anyhow::Result<()> {
281 1 : let layers = vec![
282 1 : delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
283 1 : delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
284 1 : delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
285 1 : delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
286 1 : delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
287 1 : delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
288 1 : ];
289 1 :
290 1 : // All layers fit in the max file size
291 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
292 1 : .await?
293 1 : .unwrap();
294 1 : assert_eq!(level.depth(), 6);
295 1 :
296 1 : // Same LSN with smaller max file size. The second layer from the top is larger
297 1 : // and belongs to next level.
298 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
299 1 : .await?
300 1 : .unwrap();
301 1 : assert_eq!(level.depth(), 1);
302 1 :
303 1 : // Call with a smaller LSN
304 1 : let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
305 1 : .await?
306 1 : .unwrap();
307 1 : assert_eq!(level.depth(), 2);
308 1 :
309 1 : // Call with an LSN that doesn't partition the space
310 1 : let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
311 1 : assert!(result.is_err());
312 1 : Ok(())
313 1 : }
314 :
315 : #[tokio::test]
316 1 : async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
317 1 : // The files' LSN ranges overlap, so even though there are more files that
318 1 : // fit under the max file size, they are not included in the level: they
319 1 : // overlap in such a way that we'd also need to include the oldest file,
320 1 : // which is larger.
321 1 : let layers = vec![
322 1 : delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
323 1 : delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
324 1 : delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
325 1 : delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
326 1 : delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
327 1 : ];
328 1 :
329 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
330 1 : .await?
331 1 : .unwrap();
332 1 : assert_eq!(level.depth(), 1);
333 1 :
334 1 : Ok(())
335 1 : }
336 :
337 : #[tokio::test]
338 1 : async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
339 1 : // The key ranges don't overlap, so depth is only 1.
340 1 : let layers = vec![
341 1 : delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
342 1 : delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
343 1 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
344 1 : ];
345 1 :
346 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
347 1 : .await?
348 1 : .unwrap();
349 1 : assert_eq!(level.layers.len(), 3);
350 1 : assert_eq!(level.depth(), 1);
351 1 :
352 1 : // Staggered. The 1st and 3rd layer don't overlap with each other.
353 1 : let layers = vec![
354 1 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
355 1 : delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
356 1 : delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
357 1 : ];
358 1 :
359 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
360 1 : .await?
361 1 : .unwrap();
362 1 : assert_eq!(level.layers.len(), 3);
363 1 : assert_eq!(level.depth(), 2);
364 1 : Ok(())
365 1 : }
366 :
367 : #[tokio::test]
368 1 : async fn test_depth_images() -> anyhow::Result<()> {
369 1 : let layers: Vec<MockLayer> = vec![
370 1 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
371 1 : delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
372 1 : delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
373 1 : // This covers the same key range as the 2nd delta layer. The depth
374 1 : // in that key range is therefore 0.
375 1 : image(1500..2500, Lsn(0x9000)),
376 1 : ];
377 1 :
378 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
379 1 : .await?
380 1 : .unwrap();
381 1 : assert_eq!(level.layers.len(), 4);
382 1 : assert_eq!(level.depth(), 1);
383 1 : Ok(())
384 1 : }
385 : }