LCOV - code coverage report
Current view: top level - pageserver/compaction/src - identify_levels.rs (source / functions) Coverage Total Hit
Test: 322b88762cba8ea666f63cda880cccab6936bf37.info Lines: 90.8 % 272 247
Test Date: 2024-02-29 11:57:12 Functions: 40.0 % 40 16

            Line data    Source code
       1              : //! An LSM tree consists of multiple levels, each exponentially larger than the
       2              : //! previous level. And each level consists of multiple "tiers". With tiered
       3              : //! compaction, a level is compacted when it has accumulated more than N tiers,
       4              : //! forming one tier on the next level.
       5              : //!
       6              : //! In the pageserver, we don't explicitly track the levels and tiers. Instead,
       7              : //! we identify them by looking at the shapes of the layers. It's an easy task
       8              : //! for a human, but it's not straightforward to come up with the exact
       9              : //! rules. Especially if there are cases like interrupted, half-finished
      10              : //! compactions, or highly skewed data distributions that have let us "skip"
      11              : //! some levels. It's not critical to classify all cases correctly; at worst we
      12              : //! delay some compaction work, and suffer from more read amplification, or we
      13              : //! perform some unnecessary compaction work.
      14              : //!
      15              : //! `identify_level` performs that shape-matching.
      16              : //!
      17              : //! It returns a Level struct, which has a `depth()` function to count the number
      18              : //! of "tiers" in the level. The tier count is the max depth of stacked layers
      19              : //! within the level. That's a good measure, because the point of compacting is
      20              : //! to reduce read amplification, and the depth is what determines that.
      21              : //!
      22              : //! One interesting effect of this is that if we generate very small delta
      23              : //! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
      24              : //! because they reach the target size, the L0 compaction will combine them to
      25              : //! one larger file. But if the combined file is still smaller than the target
      26              : //! file size, the file will still be considered to be part of L0 at the next
      27              : //! iteration.
      28              : 
      29              : use anyhow::bail;
      30              : use std::collections::BTreeSet;
      31              : use std::ops::Range;
      32              : use utils::lsn::Lsn;
      33              : 
      34              : use crate::interface::*;
      35              : 
      36              : use tracing::{info, trace};
      37              : 
      38              : pub struct Level<L> {
      39              :     pub lsn_range: Range<Lsn>,
      40              :     pub layers: Vec<L>,
      41              : }
      42              : 
      43              : /// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
      44              : /// no layers that cross the boundary LSN.
      45              : ///
      46              : /// A further restriction is that all layers in the returned partition cover at
      47              : /// most 'lsn_max_size' LSN bytes.
      48           16 : pub async fn identify_level<K, L>(
      49           16 :     all_layers: Vec<L>,
      50           16 :     end_lsn: Lsn,
      51           16 :     lsn_max_size: u64,
      52           16 : ) -> anyhow::Result<Option<Level<L>>>
      53           16 : where
      54           16 :     K: CompactionKey,
      55           16 :     L: CompactionLayer<K> + Clone,
      56           16 : {
      57           16 :     // filter out layers that are above the `end_lsn`, they are completely irrelevant.
      58           16 :     let mut layers = Vec::new();
      59           84 :     for l in all_layers {
      60           70 :         if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
      61              :             // shouldn't happen. Indicates that the caller passed a bogus
      62              :             // end_lsn.
      63            2 :             bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
      64           68 :         }
      65           68 :         // include image layers sitting exactly at `end_lsn`.
      66           68 :         let is_image = !l.is_delta();
      67           68 :         if (is_image && l.lsn_range().start > end_lsn)
      68           68 :             || (!is_image && l.lsn_range().start >= end_lsn)
      69              :         {
      70           10 :             continue;
      71           58 :         }
      72           58 :         layers.push(l);
      73              :     }
      74              :     // All the remaining layers either belong to this level, or are below it.
      75            0 :     info!(
      76            0 :         "identify level at {}, size {}, num layers below: {}",
      77            0 :         end_lsn,
      78            0 :         lsn_max_size,
      79            0 :         layers.len()
      80            0 :     );
      81           14 :     if layers.is_empty() {
      82            0 :         return Ok(None);
      83           14 :     }
      84           14 : 
      85           14 :     // Walk the ranges in LSN order.
      86           14 :     //
      87           14 :     // ----- end_lsn
      88           14 :     //  |
      89           14 :     //  |
      90           14 :     //  v
      91           14 :     //
      92          200 :     layers.sort_by_key(|l| l.lsn_range().end);
      93           14 :     let mut candidate_start_lsn = end_lsn;
      94           14 :     let mut candidate_layers: Vec<L> = Vec::new();
      95           14 :     let mut current_best_start_lsn = end_lsn;
      96           14 :     let mut current_best_layers: Vec<L> = Vec::new();
      97           14 :     let mut iter = layers.into_iter();
      98              :     loop {
      99           60 :         let Some(l) = iter.next_back() else {
     100              :             // Reached end. Accept the last candidate
     101           10 :             current_best_start_lsn = candidate_start_lsn;
     102           10 :             current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
     103           10 :             break;
     104              :         };
     105            0 :         trace!(
     106            0 :             "inspecting {} for candidate {}, current best {}",
     107            0 :             l.short_id(),
     108            0 :             candidate_start_lsn,
     109            0 :             current_best_start_lsn
     110            0 :         );
     111              : 
     112           50 :         let r = l.lsn_range();
     113           50 : 
     114           50 :         // Image layers don't restrict our choice of cutoff LSN
     115           50 :         if l.is_delta() {
     116              :             // Is this candidate workable? In other words, are there any
     117              :             // delta layers that span across this LSN?
     118              :             //
     119              :             // Valid:                 Not valid:
     120              :             //  +                     +
     121              :             //  |                     | +
     122              :             //  +  <- candidate       + |   <- candidate
     123              :             //     +                    +
     124              :             //     |
     125              :             //     +
     126           48 :             if r.end <= candidate_start_lsn {
     127           42 :                 // Hooray, there are no crossing LSNs. And we have visited
     128           42 :                 // through all the layers within candidate..end_lsn. The
     129           42 :                 // current candidate can be accepted.
     130           42 :                 current_best_start_lsn = r.end;
     131           42 :                 current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
     132           42 :                 candidate_start_lsn = r.start;
     133           42 :             }
     134              : 
     135              :             // Is it small enough to be considered part of this level?
     136           48 :             if r.end.0 - r.start.0 > lsn_max_size {
     137              :                 // Too large, this layer belongs to next level. Stop.
     138            0 :                 trace!(
     139            0 :                     "too large {}, size {} vs {}",
     140            0 :                     l.short_id(),
     141            0 :                     r.end.0 - r.start.0,
     142            0 :                     lsn_max_size
     143            0 :                 );
     144            4 :                 break;
     145           44 :             }
     146           44 : 
     147           44 :             // If this crosses the candidate lsn, push it down.
     148           44 :             if r.start < candidate_start_lsn {
     149            0 :                 trace!(
     150            0 :                     "layer {} prevents from stopping at {}",
     151            0 :                     l.short_id(),
     152            0 :                     candidate_start_lsn
     153            0 :                 );
     154            4 :                 candidate_start_lsn = r.start;
     155           40 :             }
     156            2 :         }
     157              : 
     158              :         // Include this layer in our candidate
     159           46 :         candidate_layers.push(l);
     160              :     }
     161              : 
     162           14 :     Ok(if current_best_start_lsn == end_lsn {
     163              :         // empty level
     164            0 :         None
     165              :     } else {
     166           14 :         Some(Level {
     167           14 :             lsn_range: current_best_start_lsn..end_lsn,
     168           14 :             layers: current_best_layers,
     169           14 :         })
     170              :     })
     171           16 : }
     172              : 
     173              : // helper struct used in depth()
     174              : struct Event<K> {
     175              :     key: K,
     176              :     layer_idx: usize,
     177              :     start: bool,
     178              : }
     179              : 
     180              : impl<L> Level<L> {
     181              :     /// Count the number of deltas stacked on each other.
     182           14 :     pub fn depth<K>(&self) -> u64
     183           14 :     where
     184           14 :         K: CompactionKey,
     185           14 :         L: CompactionLayer<K>,
     186           14 :     {
     187           14 :         let mut events: Vec<Event<K>> = Vec::new();
     188           40 :         for (idx, l) in self.layers.iter().enumerate() {
     189           40 :             events.push(Event {
     190           40 :                 key: l.key_range().start,
     191           40 :                 layer_idx: idx,
     192           40 :                 start: true,
     193           40 :             });
     194           40 :             events.push(Event {
     195           40 :                 key: l.key_range().end,
     196           40 :                 layer_idx: idx,
     197           40 :                 start: false,
     198           40 :             });
     199           40 :         }
     200          228 :         events.sort_by_key(|e| (e.key, e.start));
     201           14 : 
     202           14 :         // Sweep the key space left to right. Stop at each distinct key, and
     203           14 :         // count the number of deltas on top of the highest image at that key.
     204           14 :         //
     205           14 :         // This is a little inefficient, as we walk through the active_set on
     206           14 :         // every key. We could increment/decrement a counter on each step
     207           14 :         // instead, but that'd require a bit more complex bookkeeping.
     208           14 :         let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
     209           14 :         let mut max_depth = 0;
     210           14 :         let mut events_iter = events.iter().peekable();
     211           94 :         while let Some(e) = events_iter.next() {
     212           80 :             let l = &self.layers[e.layer_idx];
     213           80 :             let is_image = !l.is_delta();
     214           80 : 
     215           80 :             // update the active set
     216           80 :             if e.start {
     217           40 :                 active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
     218           40 :             } else {
     219           40 :                 active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
     220           40 :             }
     221              : 
     222              :             // recalculate depth if this was the last event at this point
     223           80 :             let more_events_at_this_key = events_iter
     224           80 :                 .peek()
     225           80 :                 .map_or(false, |next_e| next_e.key == e.key);
     226           80 :             if !more_events_at_this_key {
     227           46 :                 let mut active_depth = 0;
     228           46 :                 for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
     229           46 :                     if *is_image {
     230            4 :                         break;
     231           42 :                     }
     232           42 :                     active_depth += 1;
     233              :                 }
     234           46 :                 if active_depth > max_depth {
     235           16 :                     max_depth = active_depth;
     236           30 :                 }
     237           34 :             }
     238              :         }
     239           14 :         max_depth
     240           14 :     }
     241              : }
     242              : 
     243              : #[cfg(test)]
     244              : mod tests {
     245              :     use super::*;
     246              :     use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
     247              :     use std::sync::{Arc, Mutex};
     248              : 
     249           40 :     fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
     250           40 :         MockLayer::Delta(Arc::new(MockDeltaLayer {
     251           40 :             key_range,
     252           40 :             lsn_range,
     253           40 :             // identify_level() doesn't pay attention to the rest of the fields
     254           40 :             file_size: 0,
     255           40 :             deleted: Mutex::new(false),
     256           40 :             records: vec![],
     257           40 :         }))
     258           40 :     }
     259              : 
     260            2 :     fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
     261            2 :         MockLayer::Image(Arc::new(MockImageLayer {
     262            2 :             key_range,
     263            2 :             lsn_range: lsn..(lsn + 1),
     264            2 :             // identify_level() doesn't pay attention to the rest of the fields
     265            2 :             file_size: 0,
     266            2 :             deleted: Mutex::new(false),
     267            2 :         }))
     268            2 :     }
     269              : 
     270            2 :     #[tokio::test]
     271            2 :     async fn test_identify_level() -> anyhow::Result<()> {
     272            2 :         let layers = vec![
     273            2 :             delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
     274            2 :             delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
     275            2 :             delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
     276            2 :             delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
     277            2 :             delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
     278            2 :             delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
     279            2 :         ];
     280            2 : 
     281            2 :         // All layers fit in the max file size
     282            2 :         let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
     283            2 :             .await?
     284            2 :             .unwrap();
     285            2 :         assert_eq!(level.depth(), 6);
     286            2 : 
     287            2 :         // Same LSN with smaller max file size. The second layer from the top is larger
     288            2 :         // and belongs to next level.
     289            2 :         let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
     290            2 :             .await?
     291            2 :             .unwrap();
     292            2 :         assert_eq!(level.depth(), 1);
     293            2 : 
     294            2 :         // Call with a smaller LSN
     295            2 :         let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
     296            2 :             .await?
     297            2 :             .unwrap();
     298            2 :         assert_eq!(level.depth(), 2);
     299            2 : 
     300            2 :         // Call with an LSN that doesn't partition the space
     301            2 :         let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
     302            2 :         assert!(result.is_err());
     303            2 :         Ok(())
     304            2 :     }
     305              : 
     306            2 :     #[tokio::test]
     307            2 :     async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
     308            2 :         // The files' LSN ranges overlap, so even though there are more files that
     309            2 :         // fit under the file size, they are not included in the level because they
     310            2 :         // overlap so that we'd need to include the oldest file, too, which is
     311            2 :         // larger
     312            2 :         let layers = vec![
     313            2 :             delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
     314            2 :             delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
     315            2 :             delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
     316            2 :             delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
     317            2 :             delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
     318            2 :         ];
     319            2 : 
     320            2 :         let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
     321            2 :             .await?
     322            2 :             .unwrap();
     323            2 :         assert_eq!(level.depth(), 1);
     324            2 : 
     325            2 :         Ok(())
     326            2 :     }
     327              : 
     328            2 :     #[tokio::test]
     329            2 :     async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
     330            2 :         // The key ranges don't overlap, so depth is only 1.
     331            2 :         let layers = vec![
     332            2 :             delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
     333            2 :             delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
     334            2 :             delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
     335            2 :         ];
     336            2 : 
     337            2 :         let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
     338            2 :             .await?
     339            2 :             .unwrap();
     340            2 :         assert_eq!(level.layers.len(), 3);
     341            2 :         assert_eq!(level.depth(), 1);
     342            2 : 
     343            2 :         // Staggered. The 1st and 3rd layer don't overlap with each other.
     344            2 :         let layers = vec![
     345            2 :             delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
     346            2 :             delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
     347            2 :             delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
     348            2 :         ];
     349            2 : 
     350            2 :         let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
     351            2 :             .await?
     352            2 :             .unwrap();
     353            2 :         assert_eq!(level.layers.len(), 3);
     354            2 :         assert_eq!(level.depth(), 2);
     355            2 :         Ok(())
     356            2 :     }
     357              : 
     358            2 :     #[tokio::test]
     359            2 :     async fn test_depth_images() -> anyhow::Result<()> {
     360            2 :         let layers: Vec<MockLayer> = vec![
     361            2 :             delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
     362            2 :             delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
     363            2 :             delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
     364            2 :             // This covers the same key range as the 2nd delta layer. The depth
     365            2 :             // in that key range is therefore 0.
     366            2 :             image(1500..2500, Lsn(0x9000)),
     367            2 :         ];
     368            2 : 
     369            2 :         let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
     370            2 :             .await?
     371            2 :             .unwrap();
     372            2 :         assert_eq!(level.layers.len(), 4);
     373            2 :         assert_eq!(level.depth(), 1);
     374            2 :         Ok(())
     375            2 :     }
     376              : }
        

Generated by: LCOV version 2.1-beta