Line data Source code
1 : //! An LSM tree consists of multiple levels, each exponentially larger than the
2 : //! previous level. And each level consists of multiple "tiers". With tiered
3 : //! compaction, a level is compacted when it has accumulated more than N tiers,
4 : //! forming one tier on the next level.
5 : //!
6 : //! In the pageserver, we don't explicitly track the levels and tiers. Instead,
7 : //! we identify them by looking at the shapes of the layers. It's an easy task
8 : //! for a human, but it's not straightforward to come up with the exact
9 : //! rules, especially in cases like interrupted, half-finished compactions or
10 : //! highly skewed data distributions that have led us to "skip" some levels.
11 : //! It's not critical to classify all cases correctly; at worst we delay some
12 : //! compaction work and suffer from more read amplification, or we
13 : //! perform some unnecessary compaction work.
14 : //!
15 : //! `identify_level` performs that shape-matching.
16 : //!
17 : //! It returns a `Level` struct, which has a `depth()` function to count the number
18 : //! of "tiers" in the level. The tier count is the max depth of stacked layers
19 : //! within the level. That's a good measure, because the point of compacting is
20 : //! to reduce read amplification, and the depth is what determines that.
21 : //!
22 : //! One interesting effect of this is that if we generate very small delta
23 : //! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
24 : //! because they reach the target size, the L0 compaction will combine them into
25 : //! one larger file. But if the combined file is still smaller than the target
26 : //! file size, the file will still be considered to be part of L0 at the next
27 : //! iteration.
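//!
//! As a rough sketch of how a caller might put these pieces together (the
//! `fanout` threshold and the `compact_level` call are hypothetical and not
//! part of this module):
//!
//! ```ignore
//! // Identify the level ending at `end_lsn`, considering only layers that
//! // cover at most `lsn_max_size` LSN bytes.
//! if let Some(level) = identify_level(all_layers, end_lsn, lsn_max_size).await? {
//!     // depth() is the number of delta "tiers" stacked on top of each other.
//!     // With tiered compaction, compact once enough tiers have accumulated.
//!     if level.depth() >= fanout {
//!         compact_level(&level).await?;
//!     }
//! }
//! ```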
28 :
29 : use std::collections::BTreeSet;
30 : use std::ops::Range;
31 :
32 : use anyhow::bail;
33 : use tracing::{info, trace};
34 : use utils::lsn::Lsn;
35 :
36 : use crate::interface::*;
37 :
38 : pub struct Level<L> {
39 : pub lsn_range: Range<Lsn>,
40 : pub layers: Vec<L>,
41 : }
42 :
43 : /// Identify an LSN < `end_lsn` that partitions the LSN space, so that no
44 : /// layers cross the boundary LSN.
45 : ///
46 : /// A further restriction is that all layers in the returned level cover at
47 : /// most `lsn_max_size` LSN bytes.
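///
/// For example (mirroring the unit tests below): with delta layers covering
/// 0x1000..0x2000, 0x2000..0x3000 and 0x5000..0x7000, an `end_lsn` of 0x3000
/// is a valid partition point, but an `end_lsn` of 0x6000 is rejected because
/// the 0x5000..0x7000 layer crosses it.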
48 1011 : pub async fn identify_level<K, L>(
49 1011 : all_layers: Vec<L>,
50 1011 : end_lsn: Lsn,
51 1011 : lsn_max_size: u64,
52 1011 : ) -> anyhow::Result<Option<Level<L>>>
53 1011 : where
54 1011 : K: CompactionKey,
55 1011 : L: CompactionLayer<K> + Clone,
56 1011 : {
57 1011 : // filter out layers that are above the `end_lsn`, they are completely irrelevant.
58 1011 : let mut layers = Vec::new();
59 7039 : for l in all_layers {
60 6029 : if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
61 : // shouldn't happen. Indicates that the caller passed a bogus
62 : // end_lsn.
63 1 : bail!(
64 1 : "identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}",
65 1 : end_lsn,
66 1 : l.short_id()
67 1 : );
68 6028 : }
69 6028 : // Include image layers sitting exactly at `end_lsn`.
70 6028 : let is_image = !l.is_delta();
71 6028 : if (is_image && l.lsn_range().start > end_lsn)
72 6028 : || (!is_image && l.lsn_range().start >= end_lsn)
73 : {
74 5 : continue;
75 6023 : }
76 6023 : layers.push(l);
77 : }
78 : // All the remaining layers either belong to this level, or are below it.
79 1010 : info!(
80 0 : "identify level at {}, size {}, num layers below: {}",
81 0 : end_lsn,
82 0 : lsn_max_size,
83 0 : layers.len()
84 : );
85 1010 : if layers.is_empty() {
86 90 : return Ok(None);
87 920 : }
88 920 :
89 920 : // Walk the ranges in LSN order.
90 920 : //
91 920 : // ----- end_lsn
92 920 : // |
93 920 : // |
94 920 : // v
95 920 : //
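    // The layers are sorted in ascending order of their end LSN, but consumed
    // from the back with `next_back()`, i.e. starting from the layer closest
    // to `end_lsn` and walking downwards.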
96 10262 : layers.sort_by_key(|l| l.lsn_range().end);
97 920 : let mut candidate_start_lsn = end_lsn;
98 920 : let mut candidate_layers: Vec<L> = Vec::new();
99 920 : let mut current_best_start_lsn = end_lsn;
100 920 : let mut current_best_layers: Vec<L> = Vec::new();
101 920 : let mut iter = layers.into_iter();
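    // `candidate_start_lsn` is the tentative bottom of the level; delta layers
    // that cross it push it further down. `current_best_*` records the last
    // cut that was confirmed to have no delta layer crossing it.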
102 : loop {
103 2368 : let Some(l) = iter.next_back() else {
104 : // Reached end. Accept the last candidate
105 282 : current_best_start_lsn = candidate_start_lsn;
106 282 : current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
107 282 : break;
108 : };
109 2086 : trace!(
110 0 : "inspecting {} for candidate {}, current best {}",
111 0 : l.short_id(),
112 : candidate_start_lsn,
113 : current_best_start_lsn
114 : );
115 :
116 2086 : let r = l.lsn_range();
117 2086 :
118 2086 : // Image layers don't restrict our choice of cutoff LSN
119 2086 : if l.is_delta() {
120 : // Is this candidate workable? In other words, are there any
121 : // delta layers that span across this LSN?
122 : //
123 : // Valid: Not valid:
124 : // + +
125 : // | | +
126 : // + <- candidate + | <- candidate
127 : // + +
128 : // |
129 : // +
130 2085 : if r.end <= candidate_start_lsn {
131 2070 : // Hooray, there are no crossing LSNs, and we have visited all
132 2070 : // the layers within candidate..end_lsn. The current candidate
133 2070 : // can be accepted.
134 2070 : current_best_start_lsn = r.end;
135 2070 : current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
136 2070 : candidate_start_lsn = r.start;
137 2070 : }
138 :
139 : // Is it small enough to be considered part of this level?
140 2085 : if r.end.0 - r.start.0 > lsn_max_size {
141 : // Too large, this layer belongs to next level. Stop.
142 638 : trace!(
143 0 : "too large {}, size {} vs {}",
144 0 : l.short_id(),
145 0 : r.end.0 - r.start.0,
146 : lsn_max_size
147 : );
148 638 : break;
149 1447 : }
150 1447 :
151 1447 : // If this crosses the candidate LSN, push it down.
152 1447 : if r.start < candidate_start_lsn {
153 2 : trace!(
154 0 : "layer {} prevents from stopping at {}",
155 0 : l.short_id(),
156 : candidate_start_lsn
157 : );
158 2 : candidate_start_lsn = r.start;
159 20 : }
160 1 : }
161 :
162 : // Include this layer in our candidate
163 1448 : candidate_layers.push(l);
164 : }
165 :
166 920 : Ok(if current_best_start_lsn == end_lsn {
167 : // empty level
168 181 : None
169 : } else {
170 739 : Some(Level {
171 739 : lsn_range: current_best_start_lsn..end_lsn,
172 739 : layers: current_best_layers,
173 739 : })
174 : })
175 8 : }
176 :
177 : impl<L> Level<L> {
178 : /// Count the number of deltas stacked on each other.
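    ///
    /// Only deltas above the newest image layer at a given key are counted:
    /// an image layer "resets" the depth for the key range it covers (see
    /// `test_depth_images` below).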
179 739 : pub fn depth<K>(&self) -> u64
180 739 : where
181 739 : K: CompactionKey,
182 739 : L: CompactionLayer<K>,
183 739 : {
184 : struct Event<K> {
185 : key: K,
186 : layer_idx: usize,
187 : start: bool,
188 : }
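        // Build one start and one end event per layer key range; sweeping
        // these in key order (below) tracks which layers overlap each key.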
189 739 : let mut events: Vec<Event<K>> = Vec::new();
190 1445 : for (idx, l) in self.layers.iter().enumerate() {
191 1445 : let key_range = l.key_range();
192 1445 : if key_range.end == key_range.start.next() && l.is_delta() {
193 : // Ignore single-key delta layers: they can legitimately be stacked on top of
194 : // each other, since that is the only way to cut them further.
195 12 : continue;
196 1433 : }
197 1433 : events.push(Event {
198 1433 : key: l.key_range().start,
199 1433 : layer_idx: idx,
200 1433 : start: true,
201 1433 : });
202 1433 : events.push(Event {
203 1433 : key: l.key_range().end,
204 1433 : layer_idx: idx,
205 1433 : start: false,
206 1433 : });
207 : }
208 6304 : events.sort_by_key(|e| (e.key, e.start));
209 739 :
210 739 : // Sweep the key space left to right. Stop at each distinct key, and
211 739 : // count the number of deltas on top of the highest image at that key.
212 739 : //
213 739 : // This is a little inefficient, as we walk through the active_set on
214 739 : // every key. We could increment/decrement a counter on each step
215 739 : // instead, but that'd require a bit more complex bookkeeping.
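        // Active layers at the current sweep position, keyed by
        // (end LSN, is_image, layer index), so that a reverse scan visits the
        // layers with the highest end LSN first.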
216 739 : let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
217 739 : let mut max_depth = 0;
218 739 : let mut events_iter = events.iter().peekable();
219 3605 : while let Some(e) = events_iter.next() {
220 2866 : let l = &self.layers[e.layer_idx];
221 2866 : let is_image = !l.is_delta();
222 2866 :
223 2866 : // update the active set
224 2866 : if e.start {
225 1433 : active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
226 1433 : } else {
227 1433 : active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
228 1433 : }
229 :
230 : // Recalculate depth if this was the last event at this key.
231 2866 : let more_events_at_this_key =
232 2866 : events_iter.peek().is_some_and(|next_e| next_e.key == e.key);
233 2866 : if !more_events_at_this_key {
234 1488 : let mut active_depth = 0;
235 1488 : for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
236 1437 : if *is_image {
237 2 : break;
238 1435 : }
239 1435 : active_depth += 1;
240 : }
241 1488 : if active_depth > max_depth {
242 740 : max_depth = active_depth;
243 740 : }
244 17 : }
245 : }
246 739 : debug_assert_eq!(active_set, BTreeSet::new());
247 739 : max_depth
248 739 : }
249 : }
250 :
251 : #[cfg(test)]
252 : mod tests {
253 : use std::sync::{Arc, Mutex};
254 :
255 : use super::*;
256 : use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
257 :
258 20 : fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
259 20 : MockLayer::Delta(Arc::new(MockDeltaLayer {
260 20 : key_range,
261 20 : lsn_range,
262 20 : // identify_level() doesn't pay attention to the rest of the fields
263 20 : file_size: 0,
264 20 : deleted: Mutex::new(false),
265 20 : records: vec![],
266 20 : }))
267 20 : }
268 :
269 1 : fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
270 1 : MockLayer::Image(Arc::new(MockImageLayer {
271 1 : key_range,
272 1 : lsn_range: lsn..(lsn + 1),
273 1 : // identify_level() doesn't pay attention to the rest of the fields
274 1 : file_size: 0,
275 1 : deleted: Mutex::new(false),
276 1 : }))
277 1 : }
278 :
279 : #[tokio::test]
280 1 : async fn test_identify_level() -> anyhow::Result<()> {
281 1 : let layers = vec![
282 1 : delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
283 1 : delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
284 1 : delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
285 1 : delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
286 1 : delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
287 1 : delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
288 1 : ];
289 1 :
290 1 : // All layers fit in the max file size
291 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
292 1 : .await?
293 1 : .unwrap();
294 1 : assert_eq!(level.depth(), 6);
295 1 :
296 1 : // Same LSN with smaller max file size. The second layer from the top is larger
297 1 : // and belongs to next level.
298 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
299 1 : .await?
300 1 : .unwrap();
301 1 : assert_eq!(level.depth(), 1);
302 1 :
303 1 : // Call with a smaller LSN
304 1 : let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
305 1 : .await?
306 1 : .unwrap();
307 1 : assert_eq!(level.depth(), 2);
308 1 :
309 1 : // Call with an LSN that doesn't partition the space
310 1 : let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
311 1 : assert!(result.is_err());
312 1 : Ok(())
313 1 : }
314 :
315 : #[tokio::test]
316 1 : async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
317 1 : // The files' LSN ranges overlap, so even though there are more files that
318 1 : // fit under the max file size, they are not included in the level: they
319 1 : // overlap in such a way that we'd also need to include the oldest file,
320 1 : // which is larger.
321 1 : let layers = vec![
322 1 : delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
323 1 : delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
324 1 : delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
325 1 : delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
326 1 : delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
327 1 : ];
328 1 :
329 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
330 1 : .await?
331 1 : .unwrap();
332 1 : assert_eq!(level.depth(), 1);
333 1 :
334 1 : Ok(())
335 1 : }
336 :
337 : #[tokio::test]
338 1 : async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
339 1 : // The key ranges don't overlap, so depth is only 1.
340 1 : let layers = vec![
341 1 : delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
342 1 : delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
343 1 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
344 1 : ];
345 1 :
346 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
347 1 : .await?
348 1 : .unwrap();
349 1 : assert_eq!(level.layers.len(), 3);
350 1 : assert_eq!(level.depth(), 1);
351 1 :
352 1 : // Staggered. The 1st and 3rd layer don't overlap with each other.
353 1 : let layers = vec![
354 1 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
355 1 : delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
356 1 : delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
357 1 : ];
358 1 :
359 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
360 1 : .await?
361 1 : .unwrap();
362 1 : assert_eq!(level.layers.len(), 3);
363 1 : assert_eq!(level.depth(), 2);
364 1 : Ok(())
365 1 : }
366 :
367 : #[tokio::test]
368 1 : async fn test_depth_images() -> anyhow::Result<()> {
369 1 : let layers: Vec<MockLayer> = vec![
370 1 : delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
371 1 : delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
372 1 : delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
373 1 : // This covers the same key range as the 2nd delta layer. The depth
374 1 : // in that key range is therefore 0.
375 1 : image(1500..2500, Lsn(0x9000)),
376 1 : ];
377 1 :
378 1 : let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
379 1 : .await?
380 1 : .unwrap();
381 1 : assert_eq!(level.layers.len(), 4);
382 1 : assert_eq!(level.depth(), 1);
383 1 : Ok(())
384 1 : }
385 : }