Line data Source code
1 : //! New compaction implementation. The algorithm itself is implemented in the
2 : //! compaction crate. This file implements the callbacks and structs that allow
3 : //! the algorithm to drive the process.
4 : //!
5 : //! The old legacy algorithm is implemented directly in `timeline.rs`.
6 :
7 : use std::collections::{BinaryHeap, HashSet};
8 : use std::ops::{Deref, Range};
9 : use std::sync::Arc;
10 :
11 : use super::layer_manager::LayerManager;
12 : use super::{
13 : CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
14 : RecordedDuration, Timeline,
15 : };
16 :
17 : use anyhow::{anyhow, bail, Context};
18 : use bytes::Bytes;
19 : use enumset::EnumSet;
20 : use fail::fail_point;
21 : use itertools::Itertools;
22 : use pageserver_api::key::KEY_SIZE;
23 : use pageserver_api::keyspace::ShardedRange;
24 : use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
25 : use serde::Serialize;
26 : use tokio_util::sync::CancellationToken;
27 : use tracing::{debug, info, info_span, trace, warn, Instrument};
28 : use utils::id::TimelineId;
29 :
30 : use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
31 : use crate::page_cache;
32 : use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
33 : use crate::tenant::remote_timeline_client::WaitCompletionError;
34 : use crate::tenant::storage_layer::merge_iterator::MergeIterator;
35 : use crate::tenant::storage_layer::split_writer::{
36 : SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
37 : };
38 : use crate::tenant::storage_layer::{
39 : AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
40 : };
41 : use crate::tenant::timeline::ImageLayerCreationOutcome;
42 : use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
43 : use crate::tenant::timeline::{Layer, ResidentLayer};
44 : use crate::tenant::DeltaLayer;
45 : use crate::virtual_file::{MaybeFatalIo, VirtualFile};
46 :
47 : use crate::keyspace::KeySpace;
48 : use crate::repository::{Key, Value};
49 : use crate::walrecord::NeonWalRecord;
50 :
51 : use utils::lsn::Lsn;
52 :
53 : use pageserver_compaction::helpers::overlaps_with;
54 : use pageserver_compaction::interface::*;
55 :
56 : use super::CompactionError;
57 :
58 : /// Maximum number of deltas before generating an image layer in bottom-most compaction.
59 : const COMPACTION_DELTA_THRESHOLD: usize = 5;
60 :
61 : /// The result of bottom-most compaction for a single key at each LSN.
62 : #[derive(Debug)]
63 : #[cfg_attr(test, derive(PartialEq))]
64 : pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>);
65 :
66 : /// The result of bottom-most compaction.
67 : #[derive(Debug)]
68 : #[cfg_attr(test, derive(PartialEq))]
69 : pub(crate) struct KeyHistoryRetention {
70 : /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN.
71 : pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>,
72 : /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN.
73 : pub(crate) above_horizon: KeyLogAtLsn,
74 : }
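// Illustrative example (hypothetical LSNs): with retain_lsns [0x10, 0x20] and a GC horizon of
// 0x30, `below_horizon` would hold one entry per cutoff (0x10, 0x20, 0x30), each carrying just
// the image or deltas needed to reconstruct the key's value at that LSN, while `above_horizon`
// keeps every record newer than 0x30 as-is.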
75 :
76 : impl KeyHistoryRetention {
77 : /// Hack: skip the delta layer if we need to produce a layer with the same key-lsn range.
78 : ///
79 : /// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
80 : /// For example, consider the case where a single delta with range [0x10,0x50) exists.
81 : /// And we have branches at LSN 0x10, 0x20, 0x30.
82 : /// Then we delete branch @ 0x20.
83 : /// Bottom-most compaction may now delete the delta [0x20,0x30).
84 : /// And that wouldn't change the shape of the layer.
85 : ///
86 : /// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
87 : ///
88 : /// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside.
89 114 : async fn discard_key(key: &PersistentLayerKey, tline: &Arc<Timeline>, dry_run: bool) -> bool {
90 114 : if dry_run {
91 0 : return true;
92 114 : }
93 114 : let guard = tline.layers.read().await;
94 114 : if !guard.contains_key(key) {
95 66 : return false;
96 48 : }
97 48 : let layer_generation = guard.get_from_key(key).metadata().generation;
98 48 : drop(guard);
99 48 : if layer_generation == tline.generation {
100 48 : info!(
101 : key=%key,
102 : ?layer_generation,
103 0 : "discard layer due to duplicated layer key in the same generation",
104 : );
105 48 : true
106 : } else {
107 0 : false
108 : }
109 114 : }
110 :
111 : /// Pipe a history of a single key to the writers.
112 : ///
113 : /// If `image_writer` is none, the images will be placed into the delta layers.
114 : /// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images.
115 : #[allow(clippy::too_many_arguments)]
116 1266 : async fn pipe_to(
117 1266 : self,
118 1266 : key: Key,
119 1266 : tline: &Arc<Timeline>,
120 1266 : delta_writer: &mut SplitDeltaLayerWriter,
121 1266 : mut image_writer: Option<&mut SplitImageLayerWriter>,
122 1266 : stat: &mut CompactionStatistics,
123 1266 : dry_run: bool,
124 1266 : ctx: &RequestContext,
125 1266 : ) -> anyhow::Result<()> {
126 1266 : let mut first_batch = true;
127 1266 : let discard = |key: &PersistentLayerKey| {
128 0 : let key = key.clone();
129 0 : async move { Self::discard_key(&key, tline, dry_run).await }
130 0 : };
131 4206 : for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
132 2940 : if first_batch {
133 1266 : if logs.len() == 1 && logs[0].1.is_image() {
134 1224 : let Value::Image(img) = &logs[0].1 else {
135 0 : unreachable!()
136 : };
137 1224 : stat.produce_image_key(img);
138 1224 : if let Some(image_writer) = image_writer.as_mut() {
139 1224 : image_writer
140 1224 : .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
141 1242 : .await?;
142 : } else {
143 0 : delta_writer
144 0 : .put_value_with_discard_fn(
145 0 : key,
146 0 : cutoff_lsn,
147 0 : Value::Image(img.clone()),
148 0 : tline,
149 0 : ctx,
150 0 : discard,
151 0 : )
152 0 : .await?;
153 : }
154 : } else {
155 84 : for (lsn, val) in logs {
156 42 : stat.produce_key(&val);
157 42 : delta_writer
158 42 : .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
159 0 : .await?;
160 : }
161 : }
162 1266 : first_batch = false;
163 : } else {
164 1920 : for (lsn, val) in logs {
165 246 : stat.produce_key(&val);
166 246 : delta_writer
167 246 : .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
168 0 : .await?;
169 : }
170 : }
171 : }
172 1266 : let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
173 1362 : for (lsn, val) in above_horizon_logs {
174 96 : stat.produce_key(&val);
175 96 : delta_writer
176 96 : .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
177 0 : .await?;
178 : }
179 1266 : Ok(())
180 1266 : }
181 : }
182 :
183 : #[derive(Debug, Serialize, Default)]
184 : struct CompactionStatisticsNumSize {
185 : num: u64,
186 : size: u64,
187 : }
188 :
189 : #[derive(Debug, Serialize, Default)]
190 : pub struct CompactionStatistics {
191 : delta_layer_visited: CompactionStatisticsNumSize,
192 : image_layer_visited: CompactionStatisticsNumSize,
193 : delta_layer_produced: CompactionStatisticsNumSize,
194 : image_layer_produced: CompactionStatisticsNumSize,
195 : num_delta_layer_discarded: usize,
196 : num_image_layer_discarded: usize,
197 : num_unique_keys_visited: usize,
198 : wal_keys_visited: CompactionStatisticsNumSize,
199 : image_keys_visited: CompactionStatisticsNumSize,
200 : wal_produced: CompactionStatisticsNumSize,
201 : image_produced: CompactionStatisticsNumSize,
202 : }
203 :
204 : impl CompactionStatistics {
205 2058 : fn estimated_size_of_value(val: &Value) -> usize {
206 798 : match val {
207 1260 : Value::Image(img) => img.len(),
208 0 : Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
209 798 : _ => std::mem::size_of::<NeonWalRecord>(),
210 : }
211 2058 : }
212 3288 : fn estimated_size_of_key() -> usize {
213 3288 : KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
214 3288 : }
215 138 : fn visit_delta_layer(&mut self, size: u64) {
216 138 : self.delta_layer_visited.num += 1;
217 138 : self.delta_layer_visited.size += size;
218 138 : }
219 108 : fn visit_image_layer(&mut self, size: u64) {
220 108 : self.image_layer_visited.num += 1;
221 108 : self.image_layer_visited.size += size;
222 108 : }
223 1266 : fn on_unique_key_visited(&mut self) {
224 1266 : self.num_unique_keys_visited += 1;
225 1266 : }
226 420 : fn visit_wal_key(&mut self, val: &Value) {
227 420 : self.wal_keys_visited.num += 1;
228 420 : self.wal_keys_visited.size +=
229 420 : Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
230 420 : }
231 1260 : fn visit_image_key(&mut self, val: &Value) {
232 1260 : self.image_keys_visited.num += 1;
233 1260 : self.image_keys_visited.size +=
234 1260 : Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
235 1260 : }
236 384 : fn produce_key(&mut self, val: &Value) {
237 384 : match val {
238 6 : Value::Image(img) => self.produce_image_key(img),
239 378 : Value::WalRecord(_) => self.produce_wal_key(val),
240 : }
241 384 : }
242 378 : fn produce_wal_key(&mut self, val: &Value) {
243 378 : self.wal_produced.num += 1;
244 378 : self.wal_produced.size +=
245 378 : Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
246 378 : }
247 1230 : fn produce_image_key(&mut self, val: &Bytes) {
248 1230 : self.image_produced.num += 1;
249 1230 : self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
250 1230 : }
251 24 : fn discard_delta_layer(&mut self) {
252 24 : self.num_delta_layer_discarded += 1;
253 24 : }
254 24 : fn discard_image_layer(&mut self) {
255 24 : self.num_image_layer_discarded += 1;
256 24 : }
257 30 : fn produce_delta_layer(&mut self, size: u64) {
258 30 : self.delta_layer_produced.num += 1;
259 30 : self.delta_layer_produced.size += size;
260 30 : }
261 36 : fn produce_image_layer(&mut self, size: u64) {
262 36 : self.image_layer_produced.num += 1;
263 36 : self.image_layer_produced.size += size;
264 36 : }
265 : }
266 :
267 : impl Timeline {
268 : /// TODO: cancellation
269 : ///
270 : /// Returns whether the compaction has pending tasks.
271 1092 : pub(crate) async fn compact_legacy(
272 1092 : self: &Arc<Self>,
273 1092 : cancel: &CancellationToken,
274 1092 : flags: EnumSet<CompactFlags>,
275 1092 : ctx: &RequestContext,
276 1092 : ) -> Result<bool, CompactionError> {
277 1092 : if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
278 0 : self.compact_with_gc(cancel, flags, ctx)
279 0 : .await
280 0 : .map_err(CompactionError::Other)?;
281 0 : return Ok(false);
282 1092 : }
283 1092 :
284 1092 : if flags.contains(CompactFlags::DryRun) {
285 0 : return Err(CompactionError::Other(anyhow!(
286 0 : "dry-run mode is not supported for legacy compaction for now"
287 0 : )));
288 1092 : }
289 1092 :
290 1092 : // High level strategy for compaction / image creation:
291 1092 : //
292 1092 : // 1. First, calculate the desired "partitioning" of the
293 1092 : // currently in-use key space. The goal is to partition the
294 1092 : // key space into roughly fixed-size chunks, but also take into
295 1092 : // account any existing image layers, and try to align the
296 1092 : // chunk boundaries with the existing image layers to avoid
297 1092 : // too much churn. Also try to align chunk boundaries with
298 1092 : // relation boundaries. In principle, we don't know about
299 1092 : // relation boundaries here, we just deal with key-value
300 1092 : // pairs, and the code in pgdatadir_mapping.rs knows how to
301 1092 : // map relations into key-value pairs. But in practice we know
302 1092 : // that 'field6' is the block number, and the fields 1-5
303 1092 : // identify a relation. This is just an optimization,
304 1092 : // though.
305 1092 : //
306 1092 : // 2. Once we know the partitioning, for each partition,
307 1092 : // decide if it's time to create a new image layer. The
308 1092 : // criterion is: has there been too much "churn" since the last
309 1092 : // image layer? "Churn" is a fuzzy concept; it's a
310 1092 : // combination of too many delta files, or too much WAL in
311 1092 : // total in the delta files. Or perhaps: whether creating an image
312 1092 : // file would allow us to delete some older files.
313 1092 : //
314 1092 : // 3. After that, we compact all level0 delta files if there
315 1092 : // are too many of them. While compacting, we also garbage
316 1092 : // collect any page versions that are no longer needed because
317 1092 : // of the new image layers we created in step 2.
318 1092 : //
319 1092 : // TODO: This high level strategy hasn't been implemented yet.
320 1092 : // Below are functions compact_level0() and create_image_layers()
321 1092 : // but they are a bit ad hoc and don't quite work like it's explained
322 1092 : // above. Rewrite it.
323 1092 :
324 1092 : // Is the timeline being deleted?
325 1092 : if self.is_stopping() {
326 0 : trace!("Dropping out of compaction on timeline shutdown");
327 0 : return Err(CompactionError::ShuttingDown);
328 1092 : }
329 1092 :
330 1092 : let target_file_size = self.get_checkpoint_distance();
331 :
332 : // Define partitioning schema if needed
333 :
334 : // FIXME: the match should only cover repartitioning, not the next steps
335 1092 : let (partition_count, has_pending_tasks) = match self
336 1092 : .repartition(
337 1092 : self.get_last_record_lsn(),
338 1092 : self.get_compaction_target_size(),
339 1092 : flags,
340 1092 : ctx,
341 1092 : )
342 48084 : .await
343 : {
344 1092 : Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
345 1092 : // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
346 1092 : let image_ctx = RequestContextBuilder::extend(ctx)
347 1092 : .access_stats_behavior(AccessStatsBehavior::Skip)
348 1092 : .build();
349 1092 :
350 1092 : // 2. Compact
351 1092 : let timer = self.metrics.compact_time_histo.start_timer();
352 137498 : let fully_compacted = self.compact_level0(target_file_size, ctx).await?;
353 1092 : timer.stop_and_record();
354 1092 :
355 1092 : let mut partitioning = dense_partitioning;
356 1092 : partitioning
357 1092 : .parts
358 1092 : .extend(sparse_partitioning.into_dense().parts);
359 1092 :
360 1092 : // 3. Create new image layers for partitions that have been modified
361 1092 : // "enough". Skip image layer creation if L0 compaction cannot keep up.
362 1092 : if fully_compacted {
363 1092 : let image_layers = self
364 1092 : .create_image_layers(
365 1092 : &partitioning,
366 1092 : lsn,
367 1092 : if flags.contains(CompactFlags::ForceImageLayerCreation) {
368 42 : ImageLayerCreationMode::Force
369 : } else {
370 1050 : ImageLayerCreationMode::Try
371 : },
372 1092 : &image_ctx,
373 : )
374 40976 : .await?;
375 :
376 1092 : self.upload_new_image_layers(image_layers)?;
377 : } else {
378 0 : info!("skipping image layer generation due to L0 compaction did not include all layers.");
379 : }
380 1092 : (partitioning.parts.len(), !fully_compacted)
381 : }
382 0 : Err(err) => {
383 0 : // no partitioning? This is normal, if the timeline was just created
384 0 : // as an empty timeline. Also in unit tests, when we use the timeline
385 0 : // as a simple key-value store, ignoring the datadir layout. Log the
386 0 : // error but continue.
387 0 : //
388 0 : // Suppress error when it's due to cancellation
389 0 : if !self.cancel.is_cancelled() {
390 0 : tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
391 0 : }
392 0 : (1, false)
393 : }
394 : };
395 :
396 1092 : if self.shard_identity.count >= ShardCount::new(2) {
397 : // Limit the number of layer rewrites to the number of partitions: this means its
398 : // runtime should be comparable to a full round of image layer creations, rather than
399 : // being potentially much longer.
400 0 : let rewrite_max = partition_count;
401 0 :
402 0 : self.compact_shard_ancestors(rewrite_max, ctx).await?;
403 1092 : }
404 :
405 1092 : Ok(has_pending_tasks)
406 1092 : }
407 :
408 : /// Check for layers that are eligible to be rewritten:
409 : /// - Shard splitting: after a shard split, rewrite ancestor layers beyond pitr_interval, so that
410 : /// we don't indefinitely retain keys in this shard that aren't needed.
411 : /// - For future use: layers beyond pitr_interval that are in formats we would
412 : /// rather not maintain compatibility with indefinitely.
413 : ///
414 : /// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound
415 : /// how much work it will try to do in each compaction pass.
416 0 : async fn compact_shard_ancestors(
417 0 : self: &Arc<Self>,
418 0 : rewrite_max: usize,
419 0 : ctx: &RequestContext,
420 0 : ) -> Result<(), CompactionError> {
421 0 : let mut drop_layers = Vec::new();
422 0 : let mut layers_to_rewrite: Vec<Layer> = Vec::new();
423 0 :
424 0 : // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
425 0 : // layer is behind this Lsn, it indicates that the layer is being retained beyond the
426 0 : // pitr_interval, for example because a branchpoint references it.
427 0 : //
428 0 : // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
429 0 : // are rewriting layers.
430 0 : let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
431 0 :
432 0 : tracing::info!(
433 0 : "latest_gc_cutoff: {}, pitr cutoff {}",
434 0 : *latest_gc_cutoff,
435 0 : self.gc_info.read().unwrap().cutoffs.time
436 : );
437 :
438 0 : let layers = self.layers.read().await;
439 0 : for layer_desc in layers.layer_map()?.iter_historic_layers() {
440 0 : let layer = layers.get_from_desc(&layer_desc);
441 0 : if layer.metadata().shard.shard_count == self.shard_identity.count {
442 : // This layer does not belong to a historic ancestor, no need to re-image it.
443 0 : continue;
444 0 : }
445 0 :
446 0 : // This layer was created on an ancestor shard: check if it contains any data for this shard.
447 0 : let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity);
448 0 : let layer_local_page_count = sharded_range.page_count();
449 0 : let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range());
450 0 : if layer_local_page_count == 0 {
451 : // This ancestral layer only covers keys that belong to other shards.
452 : // We include the full metadata in the log: if we had some critical bug that caused
453 : // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
454 0 : info!(%layer, old_metadata=?layer.metadata(),
455 0 : "dropping layer after shard split, contains no keys for this shard.",
456 : );
457 :
458 0 : if cfg!(debug_assertions) {
459 : // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being
460 : // wrong. If ShardedRange claims the local page count is zero, then no keys in this layer
461 : // should be !is_key_disposable()
462 0 : let range = layer_desc.get_key_range();
463 0 : let mut key = range.start;
464 0 : while key < range.end {
465 0 : debug_assert!(self.shard_identity.is_key_disposable(&key));
466 0 : key = key.next();
467 : }
468 0 : }
469 :
470 0 : drop_layers.push(layer);
471 0 : continue;
472 0 : } else if layer_local_page_count != u32::MAX
473 0 : && layer_local_page_count == layer_raw_page_count
474 : {
475 0 : debug!(%layer,
476 0 : "layer is entirely shard local ({} keys), no need to filter it",
477 : layer_local_page_count
478 : );
479 0 : continue;
480 0 : }
481 0 :
482 0 : // Don't bother re-writing a layer unless it will at least halve its size
483 0 : if layer_local_page_count != u32::MAX
484 0 : && layer_local_page_count > layer_raw_page_count / 2
485 : {
486 0 : debug!(%layer,
487 0 : "layer is already mostly local ({}/{}), not rewriting",
488 : layer_local_page_count,
489 : layer_raw_page_count
490 : );
491 0 : }
492 :
493 : // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
494 : // without incurring the I/O cost of a rewrite.
495 0 : if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
496 0 : debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
497 0 : layer_desc.get_lsn_range().end, *latest_gc_cutoff);
498 0 : continue;
499 0 : }
500 0 :
501 0 : if layer_desc.is_delta() {
502 : // We do not yet implement rewrite of delta layers
503 0 : debug!(%layer, "Skipping rewrite of delta layer");
504 0 : continue;
505 0 : }
506 0 :
507 0 : // Only rewrite layers if their generations differ. This guarantees:
508 0 : // - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
509 0 : // - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
510 0 : if layer.metadata().generation == self.generation {
511 0 : debug!(%layer, "Skipping rewrite, is not from old generation");
512 0 : continue;
513 0 : }
514 0 :
515 0 : if layers_to_rewrite.len() >= rewrite_max {
516 0 : tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
517 0 : layers_to_rewrite.len()
518 : );
519 0 : continue;
520 0 : }
521 0 :
522 0 : // Fall through: all our conditions for doing a rewrite passed.
523 0 : layers_to_rewrite.push(layer);
524 : }
525 :
526 : // Drop read lock on layer map before we start doing time-consuming I/O
527 0 : drop(layers);
528 0 :
529 0 : let mut replace_image_layers = Vec::new();
530 :
531 0 : for layer in layers_to_rewrite {
532 0 : tracing::info!(layer=%layer, "Rewriting layer after shard split...");
533 0 : let mut image_layer_writer = ImageLayerWriter::new(
534 0 : self.conf,
535 0 : self.timeline_id,
536 0 : self.tenant_shard_id,
537 0 : &layer.layer_desc().key_range,
538 0 : layer.layer_desc().image_layer_lsn(),
539 0 : ctx,
540 0 : )
541 0 : .await
542 0 : .map_err(CompactionError::Other)?;
543 :
544 : // Safety of layer rewrites:
545 : // - We are writing to a different local file path than we are reading from, so the old Layer
546 : // cannot interfere with the new one.
547 : // - In the page cache, contents for a particular VirtualFile are stored with a file_id that
548 : // is different for two layers with the same name (in `ImageLayerInner::new` we always
549 : // acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk
550 : // reading the index from one layer file, and then data blocks from the rewritten layer file.
551 : // - Any readers that have a reference to the old layer will keep it alive until they are done
552 : // with it. If they are trying to promote from remote storage, that will fail, but this is the same
553 : // as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
554 : // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
555 : // - GC, which at worst witnesses us "undelete" a layer that they just deleted.
556 : // - ingestion, which only inserts layers, therefore cannot collide with us.
557 0 : let resident = layer.download_and_keep_resident().await?;
558 :
559 0 : let keys_written = resident
560 0 : .filter(&self.shard_identity, &mut image_layer_writer, ctx)
561 0 : .await?;
562 :
563 0 : if keys_written > 0 {
564 0 : let new_layer = image_layer_writer
565 0 : .finish(self, ctx)
566 0 : .await
567 0 : .map_err(CompactionError::Other)?;
568 0 : tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
569 0 : layer.metadata().file_size,
570 0 : new_layer.metadata().file_size);
571 :
572 0 : replace_image_layers.push((layer, new_layer));
573 0 : } else {
574 0 : // Drop the old layer. Usually for this case we would already have noticed that
575 0 : // the layer has no data for us with the ShardedRange check above, but this handles the case where the rewrite still produced no keys for this shard.
576 0 : drop_layers.push(layer);
577 0 : }
578 : }
579 :
580 : // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
581 : // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
582 : // to remote index) and be removed. This is inefficient but safe.
583 : fail::fail_point!("compact-shard-ancestors-localonly");
584 :
585 : // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
586 0 : self.rewrite_layers(replace_image_layers, drop_layers)
587 0 : .await?;
588 :
589 : fail::fail_point!("compact-shard-ancestors-enqueued");
590 :
591 : // We wait for all uploads to complete before finishing this compaction stage. This is not
592 : // necessary for correctness, but it simplifies testing, and avoids proceeding with another
593 : // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
594 : // load.
595 0 : match self.remote_client.wait_completion().await {
596 0 : Ok(()) => (),
597 0 : Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
598 : Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
599 0 : return Err(CompactionError::ShuttingDown)
600 : }
601 : }
602 :
603 : fail::fail_point!("compact-shard-ancestors-persistent");
604 :
605 0 : Ok(())
606 0 : }
607 :
608 : /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
609 : /// an image layer between them and the most recent readable LSN (branch point or tip of timeline). The
610 : /// purpose of the visibility hint is to record which layers need to be available to service reads.
611 : ///
612 : /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
613 : /// that we know won't be needed for reads.
614 2148 : pub(super) async fn update_layer_visibility(
615 2148 : &self,
616 2148 : ) -> Result<(), super::layer_manager::Shutdown> {
617 2148 : let head_lsn = self.get_last_record_lsn();
618 :
619 : // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
620 : // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
621 : // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
622 : // they will be subject to L0->L1 compaction in the near future.
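// Illustrative example (hypothetical LSNs): with one child branch at 0x40 and head_lsn at 0x80,
// `readable_points` becomes [0x40, 0x80]; a historic layer that is shadowed by an image layer at
// every readable point is not needed to serve reads and gets a non-visible hint.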
623 2148 : let layer_manager = self.layers.read().await;
624 2148 : let layer_map = layer_manager.layer_map()?;
625 :
626 2148 : let readable_points = {
627 2148 : let children = self.gc_info.read().unwrap().retain_lsns.clone();
628 2148 :
629 2148 : let mut readable_points = Vec::with_capacity(children.len() + 1);
630 2148 : for (child_lsn, _child_timeline_id) in &children {
631 0 : readable_points.push(*child_lsn);
632 0 : }
633 2148 : readable_points.push(head_lsn);
634 2148 : readable_points
635 2148 : };
636 2148 :
637 2148 : let (layer_visibility, covered) = layer_map.get_visibility(readable_points);
638 10218 : for (layer_desc, visibility) in layer_visibility {
639 8070 : // FIXME: a more efficiency bulk zip() through the layers rather than NlogN getting each one
640 8070 : let layer = layer_manager.get_from_desc(&layer_desc);
641 8070 : layer.set_visibility(visibility);
642 8070 : }
643 :
644 : // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
645 : // avoid assuming that everything at a branch point is visible.
646 2148 : drop(covered);
647 2148 : Ok(())
648 2148 : }
649 :
650 : /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
651 : /// Level 1 files. Returns whether the L0 layers are fully compacted.
652 1092 : async fn compact_level0(
653 1092 : self: &Arc<Self>,
654 1092 : target_file_size: u64,
655 1092 : ctx: &RequestContext,
656 1092 : ) -> Result<bool, CompactionError> {
657 : let CompactLevel0Phase1Result {
658 1092 : new_layers,
659 1092 : deltas_to_compact,
660 1092 : fully_compacted,
661 : } = {
662 1092 : let phase1_span = info_span!("compact_level0_phase1");
663 1092 : let ctx = ctx.attached_child();
664 1092 : let mut stats = CompactLevel0Phase1StatsBuilder {
665 1092 : version: Some(2),
666 1092 : tenant_id: Some(self.tenant_shard_id),
667 1092 : timeline_id: Some(self.timeline_id),
668 1092 : ..Default::default()
669 1092 : };
670 1092 :
671 1092 : let begin = tokio::time::Instant::now();
672 1092 : let phase1_layers_locked = self.layers.read().await;
673 1092 : let now = tokio::time::Instant::now();
674 1092 : stats.read_lock_acquisition_micros =
675 1092 : DurationRecorder::Recorded(RecordedDuration(now - begin), now);
676 1092 : self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
677 1092 : .instrument(phase1_span)
678 137497 : .await?
679 : };
680 :
681 1092 : if new_layers.is_empty() && deltas_to_compact.is_empty() {
682 : // nothing to do
683 1008 : return Ok(true);
684 84 : }
685 84 :
686 84 : self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
687 0 : .await?;
688 84 : Ok(fully_compacted)
689 1092 : }
690 :
691 : /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
692 1092 : async fn compact_level0_phase1<'a>(
693 1092 : self: &'a Arc<Self>,
694 1092 : guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
695 1092 : mut stats: CompactLevel0Phase1StatsBuilder,
696 1092 : target_file_size: u64,
697 1092 : ctx: &RequestContext,
698 1092 : ) -> Result<CompactLevel0Phase1Result, CompactionError> {
699 1092 : stats.read_lock_held_spawn_blocking_startup_micros =
700 1092 : stats.read_lock_acquisition_micros.till_now(); // set by caller
701 1092 : let layers = guard.layer_map()?;
702 1092 : let level0_deltas = layers.level0_deltas();
703 1092 : stats.level0_deltas_count = Some(level0_deltas.len());
704 1092 :
705 1092 : // Only compact if enough layers have accumulated.
706 1092 : let threshold = self.get_compaction_threshold();
707 1092 : if level0_deltas.is_empty() || level0_deltas.len() < threshold {
708 1008 : debug!(
709 0 : level0_deltas = level0_deltas.len(),
710 0 : threshold, "too few deltas to compact"
711 : );
712 1008 : return Ok(CompactLevel0Phase1Result::default());
713 84 : }
714 84 :
715 84 : let mut level0_deltas = level0_deltas
716 84 : .iter()
717 1206 : .map(|x| guard.get_from_desc(x))
718 84 : .collect::<Vec<_>>();
719 84 :
720 84 : // Gather the files to compact in this iteration.
721 84 : //
722 84 : // Start with the oldest Level 0 delta file, and collect any other
723 84 : // level 0 files that form a contiguous sequence, such that the end
724 84 : // LSN of previous file matches the start LSN of the next file.
725 84 : //
726 84 : // Note that if the files don't form such a sequence, we might
727 84 : // "compact" just a single file. That's a bit pointless, but it allows
728 84 : // us to get rid of the level 0 file, and compact the other files on
729 84 : // the next iteration. This could probably be made smarter, but such
730 84 : // "gaps" in the sequence of level 0 files should only happen in case
731 84 : // of a crash, partial download from cloud storage, or something like
732 84 : // that, so it's not a big deal in practice.
733 2244 : level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
734 84 : let mut level0_deltas_iter = level0_deltas.iter();
735 84 :
736 84 : let first_level0_delta = level0_deltas_iter.next().unwrap();
737 84 : let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
738 84 : let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
739 84 :
740 84 : // Accumulate the size of layers in `deltas_to_compact`
741 84 : let mut deltas_to_compact_bytes = 0;
742 84 :
743 84 : // Under normal circumstances, we will accumulate up to compaction_interval L0s of size
744 84 : // checkpoint_distance each. To avoid edge cases using extra system resources, bound our
745 84 : // work in this function to only operate on this much delta data at once.
746 84 : //
747 84 : // Take the max of the configured value & the default, so that tests that configure tiny values
748 84 : // can still use a sensible amount of memory, but if a deployed system configures bigger values we
749 84 : // still let them compact a full stack of L0s in one go.
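// Worked example (hypothetical settings): with compaction_threshold = 10 and
// checkpoint_distance = 256 MiB, this bounds a single pass to 10 * 256 MiB = 2560 MiB of L0 delta data.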
750 84 : let delta_size_limit = std::cmp::max(
751 84 : self.get_compaction_threshold(),
752 84 : DEFAULT_COMPACTION_THRESHOLD,
753 84 : ) as u64
754 84 : * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
755 84 :
756 84 : let mut fully_compacted = true;
757 84 :
758 84 : deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
759 1206 : for l in level0_deltas_iter {
760 1122 : let lsn_range = &l.layer_desc().lsn_range;
761 1122 :
762 1122 : if lsn_range.start != prev_lsn_end {
763 0 : break;
764 1122 : }
765 1122 : deltas_to_compact.push(l.download_and_keep_resident().await?);
766 1122 : deltas_to_compact_bytes += l.metadata().file_size;
767 1122 : prev_lsn_end = lsn_range.end;
768 1122 :
769 1122 : if deltas_to_compact_bytes >= delta_size_limit {
770 0 : info!(
771 0 : l0_deltas_selected = deltas_to_compact.len(),
772 0 : l0_deltas_total = level0_deltas.len(),
773 0 : "L0 compaction picker hit max delta layer size limit: {}",
774 : delta_size_limit
775 : );
776 0 : fully_compacted = false;
777 0 :
778 0 : // Proceed with compaction, but only a subset of L0s
779 0 : break;
780 1122 : }
781 : }
782 84 : let lsn_range = Range {
783 84 : start: deltas_to_compact
784 84 : .first()
785 84 : .unwrap()
786 84 : .layer_desc()
787 84 : .lsn_range
788 84 : .start,
789 84 : end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
790 84 : };
791 84 :
792 84 : info!(
793 0 : "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
794 0 : lsn_range.start,
795 0 : lsn_range.end,
796 0 : deltas_to_compact.len(),
797 0 : level0_deltas.len()
798 : );
799 :
800 1206 : for l in deltas_to_compact.iter() {
801 1206 : info!("compact includes {l}");
802 : }
803 :
804 : // We don't need the original list of layers anymore. Drop it so that
805 : // we don't accidentally use it later in the function.
806 84 : drop(level0_deltas);
807 84 :
808 84 : stats.read_lock_held_prerequisites_micros = stats
809 84 : .read_lock_held_spawn_blocking_startup_micros
810 84 : .till_now();
811 :
812 : // TODO: replace with streaming k-merge
813 84 : let all_keys = {
814 84 : let mut all_keys = Vec::new();
815 1206 : for l in deltas_to_compact.iter() {
816 1206 : if self.cancel.is_cancelled() {
817 0 : return Err(CompactionError::ShuttingDown);
818 1206 : }
819 7083 : all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
820 : }
821 : // The current stdlib sorting implementation is designed in a way where it is
822 : // particularly fast where the slice is made up of sorted sub-ranges.
823 13295398 : all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
824 84 : all_keys
825 84 : };
826 84 :
827 84 : stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
828 :
829 : // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
830 : //
831 : // A hole is a key range for which this compaction doesn't have any WAL records.
832 : // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range,
833 : // cover the hole, but actually don't contain any WAL records for that key range.
834 : // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`).
835 : // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records.
836 : //
837 : // The algorithm chooses holes as follows.
838 : // - Slide a 2-element window over the keys in key order to get the hole range (=distance between two keys).
839 : // - Filter: min threshold on range length
840 : // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data)
841 : //
842 : // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451
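// Worked example (hypothetical coverage sizes): with max_holes = 3 and candidate holes of
// coverage size 5, 3, 9 and 4 (all passing the filters below), the heap ends up holding {4, 5, 9}:
// because `Ord` for `Hole` is reversed, the `BinaryHeap` acts as a min-heap on coverage size, so
// `heap.pop()` evicts the smallest hole whenever the heap grows past max_holes.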
843 : #[derive(PartialEq, Eq)]
844 : struct Hole {
845 : key_range: Range<Key>,
846 : coverage_size: usize,
847 : }
848 84 : let holes: Vec<Hole> = {
849 : use std::cmp::Ordering;
850 : impl Ord for Hole {
851 0 : fn cmp(&self, other: &Self) -> Ordering {
852 0 : self.coverage_size.cmp(&other.coverage_size).reverse()
853 0 : }
854 : }
855 : impl PartialOrd for Hole {
856 0 : fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
857 0 : Some(self.cmp(other))
858 0 : }
859 : }
860 84 : let max_holes = deltas_to_compact.len();
861 84 : let last_record_lsn = self.get_last_record_lsn();
862 84 : let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
863 84 : let min_hole_coverage_size = 3; // TODO: something more flexible?
864 84 : // min-heap (reserve space for one more element added before eviction)
865 84 : let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
866 84 : let mut prev: Option<Key> = None;
867 :
868 6192114 : for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
869 6192114 : if let Some(prev_key) = prev {
870 : // Just a first, fast filter: do not create hole entries for metadata keys. The last hole in the
871 : // compaction is the gap between data key and metadata keys.
872 6192030 : if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
873 0 : && !Key::is_metadata_key(&prev_key)
874 : {
875 0 : let key_range = prev_key..next_key;
876 0 : // Measuring a hole by simply subtracting the i128 representations of the key range boundaries
877 0 : // doesn't make much sense, because the largest holes would correspond to field1/field2 changes.
878 0 : // We are mostly interested in eliminating holes which cause generation of excessive image layers.
879 0 : // That is why it is better to measure the size of a hole as the number of covering image layers.
880 0 : let coverage_size =
881 0 : layers.image_coverage(&key_range, last_record_lsn).len();
882 0 : if coverage_size >= min_hole_coverage_size {
883 0 : heap.push(Hole {
884 0 : key_range,
885 0 : coverage_size,
886 0 : });
887 0 : if heap.len() > max_holes {
888 0 : heap.pop(); // remove smallest hole
889 0 : }
890 0 : }
891 6192030 : }
892 84 : }
893 6192114 : prev = Some(next_key.next());
894 : }
895 84 : let mut holes = heap.into_vec();
896 84 : holes.sort_unstable_by_key(|hole| hole.key_range.start);
897 84 : holes
898 84 : };
899 84 : stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
900 84 : drop_rlock(guard);
901 84 :
902 84 : if self.cancel.is_cancelled() {
903 0 : return Err(CompactionError::ShuttingDown);
904 84 : }
905 84 :
906 84 : stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
907 :
908 : // This iterator walks through all key-value pairs from all the layers
909 : // we're compacting, in key, LSN order.
910 : // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
911 : // then the Value::Image is ordered before Value::WalRecord.
912 : //
913 : // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
914 : // option and validation code once we've reached confidence.
915 : enum AllValuesIter<'a> {
916 : PageCachedBlobIo {
917 : all_keys_iter: VecIter<'a>,
918 : },
919 : StreamingKmergeBypassingPageCache {
920 : merge_iter: MergeIterator<'a>,
921 : },
922 : ValidatingStreamingKmergeBypassingPageCache {
923 : mode: CompactL0BypassPageCacheValidation,
924 : merge_iter: MergeIterator<'a>,
925 : all_keys_iter: VecIter<'a>,
926 : },
927 : }
928 : type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
929 : impl AllValuesIter<'_> {
930 6192198 : async fn next_all_keys_iter(
931 6192198 : iter: &mut VecIter<'_>,
932 6192198 : ctx: &RequestContext,
933 6192198 : ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
934 : let Some(DeltaEntry {
935 6192114 : key,
936 6192114 : lsn,
937 6192114 : val: value_ref,
938 : ..
939 6192198 : }) = iter.next()
940 : else {
941 84 : return Ok(None);
942 : };
943 6192114 : let value = value_ref.load(ctx).await?;
944 6192114 : Ok(Some((*key, *lsn, value)))
945 6192198 : }
946 6192198 : async fn next(
947 6192198 : &mut self,
948 6192198 : ctx: &RequestContext,
949 6192198 : ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
950 6192198 : match self {
951 0 : AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
952 0 : Self::next_all_keys_iter(iter, ctx).await
953 : }
954 0 : AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
955 6192198 : AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
956 : // advance both iterators
957 6192198 : let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
958 6192198 : let merge_iter_item = merge_iter.next().await;
959 : // compare results & log warnings as needed
960 : macro_rules! rate_limited_warn {
961 : ($($arg:tt)*) => {{
962 : if cfg!(debug_assertions) || cfg!(feature = "testing") {
963 : warn!($($arg)*);
964 : panic!("CompactL0BypassPageCacheValidation failure, check logs");
965 : }
966 : use once_cell::sync::Lazy;
967 : use utils::rate_limit::RateLimit;
968 : use std::sync::Mutex;
969 : use std::time::Duration;
970 : static LOGGED: Lazy<Mutex<RateLimit>> =
971 0 : Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
972 : let mut rate_limit = LOGGED.lock().unwrap();
973 0 : rate_limit.call(|| {
974 0 : warn!($($arg)*);
975 0 : });
976 : }}
977 : }
978 6192198 : match (&all_keys_iter_item, &merge_iter_item) {
979 0 : (Err(_), Err(_)) => {
980 0 : // don't bother asserting equivality of the errors
981 0 : }
982 0 : (Err(all_keys), Ok(merge)) => {
983 0 : rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
984 0 : },
985 0 : (Ok(all_keys), Err(merge)) => {
986 0 : rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
987 0 : },
988 84 : (Ok(None), Ok(None)) => { }
989 0 : (Ok(Some(all_keys)), Ok(None)) => {
990 0 : rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
991 0 : }
992 0 : (Ok(None), Ok(Some(merge))) => {
993 0 : rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
994 0 : }
995 6192114 : (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
996 6192114 : match mode {
997 : // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
998 : CompactL0BypassPageCacheValidation::KeyLsn => {
999 0 : let all_keys = (all_keys_key, all_keys_lsn);
1000 0 : let merge = (merge_key, merge_lsn);
1001 0 : if all_keys != merge {
1002 0 : rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
1003 0 : }
1004 : }
1005 : CompactL0BypassPageCacheValidation::KeyLsnValue => {
1006 6192114 : let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
1007 6192114 : let merge = (merge_key, merge_lsn, merge_value);
1008 6192114 : if all_keys != merge {
1009 0 : rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
1010 6192114 : }
1011 : }
1012 : }
1013 : }
1014 : }
1015 : // in case of mismatch, trust the legacy all_keys_iter_item
1016 6192198 : all_keys_iter_item
1017 6192198 : }.instrument(info_span!("next")).await
1018 : }
1019 6192198 : }
1020 : }
1021 84 : let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
1022 0 : CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
1023 0 : all_keys_iter: all_keys.iter(),
1024 0 : },
1025 84 : CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
1026 84 : let merge_iter = {
1027 84 : let mut deltas = Vec::with_capacity(deltas_to_compact.len());
1028 1206 : for l in deltas_to_compact.iter() {
1029 1206 : let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
1030 1206 : deltas.push(l);
1031 : }
1032 84 : MergeIterator::create(&deltas, &[], ctx)
1033 84 : };
1034 84 : match validate {
1035 0 : None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
1036 84 : Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
1037 84 : mode: validate.clone(),
1038 84 : merge_iter,
1039 84 : all_keys_iter: all_keys.iter(),
1040 84 : },
1041 : }
1042 : }
1043 : };
1044 :
1045 : // This iterator walks through all keys and is needed to calculate size used by each key
1046 84 : let mut all_keys_iter = all_keys
1047 84 : .iter()
1048 6192114 : .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
1049 6192030 : .coalesce(|mut prev, cur| {
1050 6192030 : // Coalesce keys that belong to the same key pair.
1051 6192030 : // This ensures that compaction doesn't put them
1052 6192030 : // into different layer files.
1053 6192030 : // Still limit this by the target file size,
1054 6192030 : // so that we keep the size of the files in
1055 6192030 : // check.
1056 6192030 : if prev.0 == cur.0 && prev.2 < target_file_size {
1057 120114 : prev.2 += cur.2;
1058 120114 : Ok(prev)
1059 : } else {
1060 6071916 : Err((prev, cur))
1061 : }
1062 6192030 : });
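// Illustrative (hypothetical entries): (K1, 0x10, 3000) and (K1, 0x20, 4000) coalesce into
// (K1, 0x10, 7000) as long as the accumulated size is still below `target_file_size`; an entry
// for a different key, or one arriving after the limit is reached, starts a new group.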
1063 84 :
1064 84 : // Merge the contents of all the input delta layers into a new set
1065 84 : // of delta layers, based on the current partitioning.
1066 84 : //
1067 84 : // We split the new delta layers on the key dimension. We iterate through the key space and, for each key, check whether adding the next key to the current output layer would cause the layer to become too large. If so, flush the current output layer and start a new one.
1068 84 : // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
1069 84 : // would be too large. In that case, we also split on the LSN dimension.
1070 84 : //
1071 84 : // LSN
1072 84 : // ^
1073 84 : // |
1074 84 : // | +-----------+ +--+--+--+--+
1075 84 : // | | | | | | | |
1076 84 : // | +-----------+ | | | | |
1077 84 : // | | | | | | | |
1078 84 : // | +-----------+ ==> | | | | |
1079 84 : // | | | | | | | |
1080 84 : // | +-----------+ | | | | |
1081 84 : // | | | | | | | |
1082 84 : // | +-----------+ +--+--+--+--+
1083 84 : // |
1084 84 : // +--------------> key
1085 84 : //
1086 84 : //
1087 84 : // If one key (X) has a lot of page versions:
1088 84 : //
1089 84 : // LSN
1090 84 : // ^
1091 84 : // | (X)
1092 84 : // | +-----------+ +--+--+--+--+
1093 84 : // | | | | | | | |
1094 84 : // | +-----------+ | | +--+ |
1095 84 : // | | | | | | | |
1096 84 : // | +-----------+ ==> | | | | |
1097 84 : // | | | | | +--+ |
1098 84 : // | +-----------+ | | | | |
1099 84 : // | | | | | | | |
1100 84 : // | +-----------+ +--+--+--+--+
1101 84 : // |
1102 84 : // +--------------> key
1103 84 : // TODO: this actually divides the layers into fixed-size chunks, not
1104 84 : // based on the partitioning.
1105 84 : //
1106 84 : // TODO: we should also opportunistically materialize and
1107 84 : // garbage collect what we can.
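// Sketch of the split decision implemented below (illustrative summary, not exhaustive): a new
// output delta layer is started whenever `written_size + key_values_total_size` would exceed
// `target_file_size`, when the key range crosses one of the pre-computed holes, or when a single
// key has so many versions that we fall back to splitting on the LSN dimension
// (`dup_start_lsn..dup_end_lsn`).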
1108 84 : let mut new_layers = Vec::new();
1109 84 : let mut prev_key: Option<Key> = None;
1110 84 : let mut writer: Option<DeltaLayerWriter> = None;
1111 84 : let mut key_values_total_size = 0u64;
1112 84 : let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
1113 84 : let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
1114 84 : let mut next_hole = 0; // index of next hole in holes vector
1115 84 :
1116 84 : let mut keys = 0;
1117 :
1118 6192198 : while let Some((key, lsn, value)) = all_values_iter
1119 6192198 : .next(ctx)
1120 118085 : .await
1121 6192198 : .map_err(CompactionError::Other)?
1122 : {
1123 6192114 : keys += 1;
1124 6192114 :
1125 6192114 : if keys % 32_768 == 0 && self.cancel.is_cancelled() {
1126 : // avoid hitting the cancellation token on every key. in benches, we end up
1127 : // shuffling an order of million keys per layer, this means we'll check it
1128 : // around tens of times per layer.
1129 0 : return Err(CompactionError::ShuttingDown);
1130 6192114 : }
1131 6192114 :
1132 6192114 : let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
1133 6192114 : // We need to check key boundaries once we reach next key or end of layer with the same key
1134 6192114 : if !same_key || lsn == dup_end_lsn {
1135 6072000 : let mut next_key_size = 0u64;
1136 6072000 : let is_dup_layer = dup_end_lsn.is_valid();
1137 6072000 : dup_start_lsn = Lsn::INVALID;
1138 6072000 : if !same_key {
1139 6072000 : dup_end_lsn = Lsn::INVALID;
1140 6072000 : }
1141 : // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
1142 6072000 : for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
1143 6072000 : next_key_size = next_size;
1144 6072000 : if key != next_key {
1145 6071916 : if dup_end_lsn.is_valid() {
1146 0 : // We are writting segment with duplicates:
1147 0 : // place all remaining values of this key in separate segment
1148 0 : dup_start_lsn = dup_end_lsn; // new segments starts where old stops
1149 0 : dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
1150 6071916 : }
1151 6071916 : break;
1152 84 : }
1153 84 : key_values_total_size += next_size;
1154 84 : // Check if it is time to split segment: if total keys size is larger than target file size.
1155 84 : // We need to avoid generation of empty segments if next_size > target_file_size.
1156 84 : if key_values_total_size > target_file_size && lsn != next_lsn {
1157 : // Split key between multiple layers: such layer can contain only single key
1158 0 : dup_start_lsn = if dup_end_lsn.is_valid() {
1159 0 : dup_end_lsn // new segment with duplicates starts where old one stops
1160 : } else {
1161 0 : lsn // start with the first LSN for this key
1162 : };
1163 0 : dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
1164 0 : break;
1165 84 : }
1166 : }
1167 : // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
1168 6072000 : if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
1169 0 : dup_start_lsn = dup_end_lsn;
1170 0 : dup_end_lsn = lsn_range.end;
1171 6072000 : }
1172 6072000 : if writer.is_some() {
1173 6071916 : let written_size = writer.as_mut().unwrap().size();
1174 6071916 : let contains_hole =
1175 6071916 : next_hole < holes.len() && key >= holes[next_hole].key_range.end;
1176 : // check if key cause layer overflow or contains hole...
1177 6071916 : if is_dup_layer
1178 6071916 : || dup_end_lsn.is_valid()
1179 6071916 : || written_size + key_values_total_size > target_file_size
1180 6071076 : || contains_hole
1181 : {
1182 : // ... if so, flush previous layer and prepare to write new one
1183 840 : let (desc, path) = writer
1184 840 : .take()
1185 840 : .unwrap()
1186 840 : .finish(prev_key.unwrap().next(), ctx)
1187 2160 : .await
1188 840 : .map_err(CompactionError::Other)?;
1189 840 : let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
1190 840 : .map_err(CompactionError::Other)?;
1191 :
1192 840 : new_layers.push(new_delta);
1193 840 : writer = None;
1194 840 :
1195 840 : if contains_hole {
1196 0 : // skip hole
1197 0 : next_hole += 1;
1198 840 : }
1199 6071076 : }
1200 84 : }
1201 : // Remember size of key value because at next iteration we will access next item
1202 6072000 : key_values_total_size = next_key_size;
1203 120114 : }
1204 6192114 : fail_point!("delta-layer-writer-fail-before-finish", |_| {
1205 0 : Err(CompactionError::Other(anyhow::anyhow!(
1206 0 : "failpoint delta-layer-writer-fail-before-finish"
1207 0 : )))
1208 6192114 : });
1209 :
1210 6192114 : if !self.shard_identity.is_key_disposable(&key) {
1211 6192114 : if writer.is_none() {
1212 924 : if self.cancel.is_cancelled() {
1213 : // to be somewhat responsive to cancellation, check for each new layer
1214 0 : return Err(CompactionError::ShuttingDown);
1215 924 : }
1216 : // Create writer if not initialized yet
1217 924 : writer = Some(
1218 : DeltaLayerWriter::new(
1219 924 : self.conf,
1220 924 : self.timeline_id,
1221 924 : self.tenant_shard_id,
1222 924 : key,
1223 924 : if dup_end_lsn.is_valid() {
1224 : // this is a layer containing slice of values of the same key
1225 0 : debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
1226 0 : dup_start_lsn..dup_end_lsn
1227 : } else {
1228 924 : debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
1229 924 : lsn_range.clone()
1230 : },
1231 924 : ctx,
1232 : )
1233 462 : .await
1234 924 : .map_err(CompactionError::Other)?,
1235 : );
1236 :
1237 924 : keys = 0;
1238 6191190 : }
1239 :
1240 6192114 : writer
1241 6192114 : .as_mut()
1242 6192114 : .unwrap()
1243 6192114 : .put_value(key, lsn, value, ctx)
1244 3653 : .await
1245 6192114 : .map_err(CompactionError::Other)?;
1246 : } else {
1247 0 : debug!(
1248 0 : "Dropping key {} during compaction (it belongs on shard {:?})",
1249 0 : key,
1250 0 : self.shard_identity.get_shard_number(&key)
1251 : );
1252 : }
1253 :
1254 6192114 : if !new_layers.is_empty() {
1255 59358 : fail_point!("after-timeline-compacted-first-L1");
1256 6132756 : }
1257 :
1258 6192114 : prev_key = Some(key);
1259 : }
1260 84 : if let Some(writer) = writer {
1261 84 : let (desc, path) = writer
1262 84 : .finish(prev_key.unwrap().next(), ctx)
1263 5970 : .await
1264 84 : .map_err(CompactionError::Other)?;
1265 84 : let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
1266 84 : .map_err(CompactionError::Other)?;
1267 84 : new_layers.push(new_delta);
1268 0 : }
1269 :
1270 : // Sync layers
1271 84 : if !new_layers.is_empty() {
1272 : // Print a warning if the created layer is larger than double the target size
1273 : // Add two pages for potential overhead. This should in theory be already
1274 : // accounted for in the target calculation, but for very small targets,
1275 : // we still might easily hit the limit otherwise.
1276 84 : let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
1277 924 : for layer in new_layers.iter() {
1278 924 : if layer.layer_desc().file_size > warn_limit {
1279 0 : warn!(
1280 : %layer,
1281 0 : "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
1282 : );
1283 924 : }
1284 : }
1285 :
1286 : // The writer.finish() above already did the fsync of the inodes.
1287 : // We just need to fsync the directory in which these inodes are linked,
1288 : // which we know to be the timeline directory.
1289 : //
1290 : // We use fatal_err() below because after writer.finish() returns with success,
1291 : // the in-memory state of the filesystem already has the layer file in its final place,
1292 : // and subsequent pageserver code could think it's durable while it really isn't.
1293 84 : let timeline_dir = VirtualFile::open(
1294 84 : &self
1295 84 : .conf
1296 84 : .timeline_path(&self.tenant_shard_id, &self.timeline_id),
1297 84 : ctx,
1298 84 : )
1299 42 : .await
1300 84 : .fatal_err("VirtualFile::open for timeline dir fsync");
1301 84 : timeline_dir
1302 84 : .sync_all()
1303 42 : .await
1304 84 : .fatal_err("VirtualFile::sync_all timeline dir");
1305 0 : }
1306 :
1307 84 : stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
1308 84 : stats.new_deltas_count = Some(new_layers.len());
1309 924 : stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
1310 84 :
1311 84 : match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
1312 84 : .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
1313 : {
1314 84 : Ok(stats_json) => {
1315 84 : info!(
1316 0 : stats_json = stats_json.as_str(),
1317 0 : "compact_level0_phase1 stats available"
1318 : )
1319 : }
1320 0 : Err(e) => {
1321 0 : warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
1322 : }
1323 : }
1324 :
1325 : // Without this, rustc complains about deltas_to_compact still
1326 : // being borrowed when we `.into_iter()` below.
1327 84 : drop(all_values_iter);
1328 84 :
1329 84 : Ok(CompactLevel0Phase1Result {
1330 84 : new_layers,
1331 84 : deltas_to_compact: deltas_to_compact
1332 84 : .into_iter()
1333 1206 : .map(|x| x.drop_eviction_guard())
1334 84 : .collect::<Vec<_>>(),
1335 84 : fully_compacted,
1336 84 : })
1337 1092 : }
1338 : }
1339 :
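// Illustrative sketch (not part of the original code): the durability pattern that
// compact_level0_phase1 follows above, expressed with std::fs instead of the
// pageserver's VirtualFile. The helper name is hypothetical; the point is that after
// a new layer file has been fsync'ed, the directory that links it must be fsync'ed
// as well so the directory entry survives a crash (on Linux, a directory can be
// opened read-only and sync_all()'ed for this purpose).
#[allow(dead_code)]
fn fsync_file_and_parent_dir(file_path: &std::path::Path) -> std::io::Result<()> {
    // Make the file contents and metadata durable.
    std::fs::File::open(file_path)?.sync_all()?;
    // Make the directory entry durable by fsync'ing the parent directory.
    let parent = file_path.parent().ok_or_else(|| {
        std::io::Error::new(std::io::ErrorKind::Other, "file has no parent directory")
    })?;
    std::fs::File::open(parent)?.sync_all()?;
    Ok(())
}
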
1340 : #[derive(Default)]
1341 : struct CompactLevel0Phase1Result {
1342 : new_layers: Vec<ResidentLayer>,
1343 : deltas_to_compact: Vec<Layer>,
1344 : // Whether we have included all L0 layers, or selected only part of them due to the
1345 : // L0 compaction size limit.
1346 : fully_compacted: bool,
1347 : }
1348 :
1349 : #[derive(Default)]
1350 : struct CompactLevel0Phase1StatsBuilder {
1351 : version: Option<u64>,
1352 : tenant_id: Option<TenantShardId>,
1353 : timeline_id: Option<TimelineId>,
1354 : read_lock_acquisition_micros: DurationRecorder,
1355 : read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
1356 : read_lock_held_key_sort_micros: DurationRecorder,
1357 : read_lock_held_prerequisites_micros: DurationRecorder,
1358 : read_lock_held_compute_holes_micros: DurationRecorder,
1359 : read_lock_drop_micros: DurationRecorder,
1360 : write_layer_files_micros: DurationRecorder,
1361 : level0_deltas_count: Option<usize>,
1362 : new_deltas_count: Option<usize>,
1363 : new_deltas_size: Option<u64>,
1364 : }
1365 :
1366 : #[derive(serde::Serialize)]
1367 : struct CompactLevel0Phase1Stats {
1368 : version: u64,
1369 : tenant_id: TenantShardId,
1370 : timeline_id: TimelineId,
1371 : read_lock_acquisition_micros: RecordedDuration,
1372 : read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
1373 : read_lock_held_key_sort_micros: RecordedDuration,
1374 : read_lock_held_prerequisites_micros: RecordedDuration,
1375 : read_lock_held_compute_holes_micros: RecordedDuration,
1376 : read_lock_drop_micros: RecordedDuration,
1377 : write_layer_files_micros: RecordedDuration,
1378 : level0_deltas_count: usize,
1379 : new_deltas_count: usize,
1380 : new_deltas_size: u64,
1381 : }
1382 :
1383 : impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
1384 : type Error = anyhow::Error;
1385 :
1386 84 : fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
1387 84 : Ok(Self {
1388 84 : version: value.version.ok_or_else(|| anyhow!("version not set"))?,
1389 84 : tenant_id: value
1390 84 : .tenant_id
1391 84 : .ok_or_else(|| anyhow!("tenant_id not set"))?,
1392 84 : timeline_id: value
1393 84 : .timeline_id
1394 84 : .ok_or_else(|| anyhow!("timeline_id not set"))?,
1395 84 : read_lock_acquisition_micros: value
1396 84 : .read_lock_acquisition_micros
1397 84 : .into_recorded()
1398 84 : .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
1399 84 : read_lock_held_spawn_blocking_startup_micros: value
1400 84 : .read_lock_held_spawn_blocking_startup_micros
1401 84 : .into_recorded()
1402 84 : .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
1403 84 : read_lock_held_key_sort_micros: value
1404 84 : .read_lock_held_key_sort_micros
1405 84 : .into_recorded()
1406 84 : .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
1407 84 : read_lock_held_prerequisites_micros: value
1408 84 : .read_lock_held_prerequisites_micros
1409 84 : .into_recorded()
1410 84 : .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
1411 84 : read_lock_held_compute_holes_micros: value
1412 84 : .read_lock_held_compute_holes_micros
1413 84 : .into_recorded()
1414 84 : .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
1415 84 : read_lock_drop_micros: value
1416 84 : .read_lock_drop_micros
1417 84 : .into_recorded()
1418 84 : .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
1419 84 : write_layer_files_micros: value
1420 84 : .write_layer_files_micros
1421 84 : .into_recorded()
1422 84 : .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
1423 84 : level0_deltas_count: value
1424 84 : .level0_deltas_count
1425 84 : .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
1426 84 : new_deltas_count: value
1427 84 : .new_deltas_count
1428 84 : .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
1429 84 : new_deltas_size: value
1430 84 : .new_deltas_size
1431 84 : .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
1432 : })
1433 84 : }
1434 : }
1435 :
1436 0 : #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
1437 : #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
1438 : pub enum CompactL0Phase1ValueAccess {
1439 : /// The old way.
1440 : PageCachedBlobIo,
1441 : /// The new way.
1442 : StreamingKmerge {
1443 : /// If set, we run both the old way and the new way and validate that
1444 : /// their results are identical (=> [`CompactL0BypassPageCacheValidation`]).
1445 : /// If the validation fails, we
1446 : /// - in tests: fail with a panic, or
1447 : /// - in prod: log a rate-limited warning and use the old way's results.
1448 : ///
1449 : /// If not set, we only run the new way and trust its results.
1450 : validate: Option<CompactL0BypassPageCacheValidation>,
1451 : },
1452 : }
1453 :
1454 : /// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
1455 0 : #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
1456 : #[serde(rename_all = "kebab-case")]
1457 : pub enum CompactL0BypassPageCacheValidation {
1458 : /// Validate that the series of (key, lsn) pairs is the same.
1459 : KeyLsn,
1460 : /// Validate that the entire output of old and new way is identical.
1461 : KeyLsnValue,
1462 : }
1463 :
1464 : impl Default for CompactL0Phase1ValueAccess {
1465 666 : fn default() -> Self {
1466 666 : CompactL0Phase1ValueAccess::StreamingKmerge {
1467 666 : // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
1468 666 : validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
1469 666 : }
1470 666 : }
1471 : }
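
// Illustrative sketch (not part of the original file): a small test showing the JSON
// wire format implied by the serde attributes on the two enums above. The module and
// test names are hypothetical, and the exact field ordering of the serialized string
// is an assumption about serde_json's output for internally tagged enums.
#[cfg(test)]
mod compact_l0_value_access_serde_sketch {
    use super::*;

    #[test]
    fn streaming_kmerge_round_trips() {
        let access = CompactL0Phase1ValueAccess::StreamingKmerge {
            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
        };
        // `tag = "mode"` + `rename_all = "kebab-case"` produce a tagged object.
        let json = serde_json::to_string(&access).unwrap();
        assert_eq!(
            json,
            r#"{"mode":"streaming-kmerge","validate":"key-lsn-value"}"#
        );
        // The same representation deserializes back to the original value.
        let parsed: CompactL0Phase1ValueAccess = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed, access);
    }
}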
1472 :
1473 : impl Timeline {
1474 : /// Entry point for new tiered compaction algorithm.
1475 : ///
1476 : /// All the real work is done in the pageserver_compaction
1477 : /// crate. The code here would apply to any algorithm that implements the
1478 : /// same interface, but tiered is the only one at the moment.
1479 : ///
1480 : /// TODO: cancellation
1481 0 : pub(crate) async fn compact_tiered(
1482 0 : self: &Arc<Self>,
1483 0 : _cancel: &CancellationToken,
1484 0 : ctx: &RequestContext,
1485 0 : ) -> Result<(), CompactionError> {
1486 0 : let fanout = self.get_compaction_threshold() as u64;
1487 0 : let target_file_size = self.get_checkpoint_distance();
1488 :
1489 : // Find the top of the historical layers
1490 0 : let end_lsn = {
1491 0 : let guard = self.layers.read().await;
1492 0 : let layers = guard.layer_map()?;
1493 :
1494 0 : let l0_deltas = layers.level0_deltas();
1495 0 :
1496 0 : // As an optimization, if we find that there are too few L0 layers,
1497 0 : // bail out early. We know that the compaction algorithm would do
1498 0 : // nothing in that case.
1499 0 : if l0_deltas.len() < fanout as usize {
1500 : // doesn't need compacting
1501 0 : return Ok(());
1502 0 : }
1503 0 : l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap()
1504 0 : };
1505 0 :
1506 0 : // Is the timeline being deleted?
1507 0 : if self.is_stopping() {
1508 0 : trace!("Dropping out of compaction on timeline shutdown");
1509 0 : return Err(CompactionError::ShuttingDown);
1510 0 : }
1511 :
1512 0 : let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
1513 : // TODO(chi): ignore sparse_keyspace for now, compact it in the future.
1514 0 : let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
1515 0 :
1516 0 : pageserver_compaction::compact_tiered::compact_tiered(
1517 0 : &mut adaptor,
1518 0 : end_lsn,
1519 0 : target_file_size,
1520 0 : fanout,
1521 0 : ctx,
1522 0 : )
1523 0 : .await
1524 : // TODO: compact_tiered needs to return CompactionError
1525 0 : .map_err(CompactionError::Other)?;
1526 :
1527 0 : adaptor.flush_updates().await?;
1528 0 : Ok(())
1529 0 : }
1530 :
1531 : /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns.
1532 : ///
1533 : /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon.
1534 : /// For now, it requires that `accumulated_values` contain the full history of the key (i.e., the record with the lowest LSN is
1535 : /// an image or a WAL record not requiring a base image). This restriction will be removed once we implement gc-compaction on branches.
1536 : ///
1537 : /// The function returns the deltas and the base image that need to be placed at each of the retain LSNs. For example, given:
1538 : ///
1539 : /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60
1540 : /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3
1541 : ///
1542 : /// The function will produce:
1543 : ///
1544 : /// ```plain
1545 : /// 0x20(retain_lsn) -> img=AB@0x20 always produce a single image below the lowest retain LSN
1546 : /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40] two deltas since the last base image, keeping the deltas
1547 : /// 0x50(horizon) -> deltas=[ABCDE@0x50] three deltas since the last base image, generate an image but put it in the delta
1548 : /// above_horizon -> deltas=[+F@0x60] full history above the horizon
1549 : /// ```
1550 : ///
1551 : /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
1552 1290 : pub(crate) async fn generate_key_retention(
1553 1290 : self: &Arc<Timeline>,
1554 1290 : key: Key,
1555 1290 : full_history: &[(Key, Lsn, Value)],
1556 1290 : horizon: Lsn,
1557 1290 : retain_lsn_below_horizon: &[Lsn],
1558 1290 : delta_threshold_cnt: usize,
1559 1290 : base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
1560 1290 : ) -> anyhow::Result<KeyHistoryRetention> {
1561 1290 : // Pre-checks for the invariants
1562 1290 : if cfg!(debug_assertions) {
1563 3120 : for (log_key, _, _) in full_history {
1564 1830 : assert_eq!(log_key, &key, "mismatched key");
1565 : }
1566 1290 : for i in 1..full_history.len() {
1567 540 : assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN");
1568 540 : if full_history[i - 1].1 == full_history[i].1 {
1569 0 : assert!(
1570 0 : matches!(full_history[i - 1].2, Value::Image(_)),
1571 0 : "unordered delta/image, or duplicated delta"
1572 : );
1573 540 : }
1574 : }
1575 : // There used to be an assertion that, when there is no base image, the first
1576 : // record in the history is `will_init`, but it was removed.
1577 : // This is explained in the test cases for generate_key_retention.
1578 : // Search "incomplete history" for more information.
1579 3000 : for lsn in retain_lsn_below_horizon {
1580 1710 : assert!(lsn < &horizon, "retain lsn must be below horizon")
1581 : }
1582 1290 : for i in 1..retain_lsn_below_horizon.len() {
1583 834 : assert!(
1584 834 : retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i],
1585 0 : "unordered LSN"
1586 : );
1587 : }
1588 0 : }
1589 1290 : let has_ancestor = base_img_from_ancestor.is_some();
1590 : // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
1591 : // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
1592 1290 : let (mut split_history, lsn_split_points) = {
1593 1290 : let mut split_history = Vec::new();
1594 1290 : split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new);
1595 1290 : let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1);
1596 3000 : for lsn in retain_lsn_below_horizon {
1597 1710 : lsn_split_points.push(*lsn);
1598 1710 : }
1599 1290 : lsn_split_points.push(horizon);
1600 1290 : let mut current_idx = 0;
1601 3120 : for item @ (_, lsn, _) in full_history {
1602 2316 : while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
1603 486 : current_idx += 1;
1604 486 : }
1605 1830 : split_history[current_idx].push(item);
1606 : }
1607 1290 : (split_history, lsn_split_points)
1608 : };
1609 : // Step 2: filter out duplicated records due to the k-merge of image/delta layers
1610 5580 : for split_for_lsn in &mut split_history {
1611 4290 : let mut prev_lsn = None;
1612 4290 : let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len());
1613 4290 : for record @ (_, lsn, _) in std::mem::take(split_for_lsn) {
1614 1830 : if let Some(prev_lsn) = &prev_lsn {
1615 198 : if *prev_lsn == lsn {
1616 : // The case that we have an LSN with both data from the delta layer and the image layer. As
1617 : // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
1618 : // drop this delta and keep the image.
1619 : //
1620 : // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
1621 : // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
1622 : // dropped.
1623 : //
1624 : // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta
1625 : // threshold, we could have kept delta instead to save space. This is an optimization for the future.
1626 0 : continue;
1627 198 : }
1628 1632 : }
1629 1830 : prev_lsn = Some(lsn);
1630 1830 : new_split_for_lsn.push(record);
1631 : }
1632 4290 : *split_for_lsn = new_split_for_lsn;
1633 : }
1634 : // Step 3: generate images when necessary
1635 1290 : let mut retention = Vec::with_capacity(split_history.len());
1636 1290 : let mut records_since_last_image = 0;
1637 1290 : let batch_cnt = split_history.len();
1638 1290 : assert!(
1639 1290 : batch_cnt >= 2,
1640 0 : "should have at least below + above horizon batches"
1641 : );
1642 1290 : let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
1643 1290 : if let Some((key, lsn, img)) = base_img_from_ancestor {
1644 54 : replay_history.push((key, lsn, Value::Image(img)));
1645 1236 : }
1646 :
1647 : /// Generate debug information for the replay history
1648 0 : fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String {
1649 0 : use std::fmt::Write;
1650 0 : let mut output = String::new();
1651 0 : if let Some((key, _, _)) = replay_history.first() {
1652 0 : write!(output, "key={} ", key).unwrap();
1653 0 : let mut cnt = 0;
1654 0 : for (_, lsn, val) in replay_history {
1655 0 : if val.is_image() {
1656 0 : write!(output, "i@{} ", lsn).unwrap();
1657 0 : } else if val.will_init() {
1658 0 : write!(output, "di@{} ", lsn).unwrap();
1659 0 : } else {
1660 0 : write!(output, "d@{} ", lsn).unwrap();
1661 0 : }
1662 0 : cnt += 1;
1663 0 : if cnt >= 128 {
1664 0 : write!(output, "... and more").unwrap();
1665 0 : break;
1666 0 : }
1667 : }
1668 0 : } else {
1669 0 : write!(output, "<no history>").unwrap();
1670 0 : }
1671 0 : output
1672 0 : }
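
// Example of the trace format produced by generate_history_trace above
// (key and LSNs are hypothetical): "key=<key-hex> i@0/169F060 d@0/16A01E8 d@0/16A0338 ..."
// where `i@` marks an image, `di@` a will_init delta, and `d@` a plain delta.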
1673 :
1674 0 : fn generate_debug_trace(
1675 0 : replay_history: Option<&[(Key, Lsn, Value)]>,
1676 0 : full_history: &[(Key, Lsn, Value)],
1677 0 : lsns: &[Lsn],
1678 0 : horizon: Lsn,
1679 0 : ) -> String {
1680 0 : use std::fmt::Write;
1681 0 : let mut output = String::new();
1682 0 : if let Some(replay_history) = replay_history {
1683 0 : writeln!(
1684 0 : output,
1685 0 : "replay_history: {}",
1686 0 : generate_history_trace(replay_history)
1687 0 : )
1688 0 : .unwrap();
1689 0 : } else {
1690 0 : writeln!(output, "replay_history: <disabled>",).unwrap();
1691 0 : }
1692 0 : writeln!(
1693 0 : output,
1694 0 : "full_history: {}",
1695 0 : generate_history_trace(full_history)
1696 0 : )
1697 0 : .unwrap();
1698 0 : writeln!(
1699 0 : output,
1700 0 : "when processing: [{}] horizon={}",
1701 0 : lsns.iter().map(|l| format!("{l}")).join(","),
1702 0 : horizon
1703 0 : )
1704 0 : .unwrap();
1705 0 : output
1706 0 : }
1707 :
1708 4290 : for (i, split_for_lsn) in split_history.into_iter().enumerate() {
1709 : // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
1710 4290 : records_since_last_image += split_for_lsn.len();
1711 4290 : let generate_image = if i == 0 && !has_ancestor {
1712 : // We always generate images for the first batch (below horizon / lowest retain_lsn)
1713 1236 : true
1714 3054 : } else if i == batch_cnt - 1 {
1715 : // Do not generate images for the last batch (above horizon)
1716 1290 : false
1717 1764 : } else if records_since_last_image >= delta_threshold_cnt {
1718 : // Generate images when there are too many records
1719 18 : true
1720 : } else {
1721 1746 : false
1722 : };
1723 4290 : replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone()));
1724 : // Only retain the items after the last image record
1725 5274 : for idx in (0..replay_history.len()).rev() {
1726 5274 : if replay_history[idx].2.will_init() {
1727 4290 : replay_history = replay_history[idx..].to_vec();
1728 4290 : break;
1729 984 : }
1730 : }
1731 4290 : if let Some((_, _, val)) = replay_history.first() {
1732 4290 : if !val.will_init() {
1733 0 : return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
1734 0 : || {
1735 0 : generate_debug_trace(
1736 0 : Some(&replay_history),
1737 0 : full_history,
1738 0 : retain_lsn_below_horizon,
1739 0 : horizon,
1740 0 : )
1741 0 : },
1742 0 : );
1743 4290 : }
1744 0 : }
1745 4290 : if generate_image && records_since_last_image > 0 {
1746 1254 : records_since_last_image = 0;
1747 1254 : let replay_history_for_debug = if cfg!(debug_assertions) {
1748 1254 : Some(replay_history.clone())
1749 : } else {
1750 0 : None
1751 : };
1752 1254 : let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
1753 1254 : let history = std::mem::take(&mut replay_history);
1754 1254 : let mut img = None;
1755 1254 : let mut records = Vec::with_capacity(history.len());
1756 1254 : if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
1757 1254 : img = Some((*lsn, val.clone()));
1758 1254 : for (_, lsn, val) in history.into_iter().skip(1) {
1759 102 : let Value::WalRecord(rec) = val else {
1760 0 : return Err(anyhow::anyhow!(
1761 0 : "invalid record, first record is image, expect walrecords"
1762 0 : ))
1763 0 : .with_context(|| {
1764 0 : generate_debug_trace(
1765 0 : replay_history_for_debug_ref,
1766 0 : full_history,
1767 0 : retain_lsn_below_horizon,
1768 0 : horizon,
1769 0 : )
1770 0 : });
1771 : };
1772 102 : records.push((lsn, rec));
1773 : }
1774 : } else {
1775 0 : for (_, lsn, val) in history.into_iter() {
1776 0 : let Value::WalRecord(rec) = val else {
1777 0 : return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord"))
1778 0 : .with_context(|| generate_debug_trace(
1779 0 : replay_history_for_debug_ref,
1780 0 : full_history,
1781 0 : retain_lsn_below_horizon,
1782 0 : horizon,
1783 0 : ));
1784 : };
1785 0 : records.push((lsn, rec));
1786 : }
1787 : }
1788 1254 : records.reverse();
1789 1254 : let state = ValueReconstructState { img, records };
1790 1254 : let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
1791 1254 : let img = self.reconstruct_value(key, request_lsn, state).await?;
1792 1254 : replay_history.push((key, request_lsn, Value::Image(img.clone())));
1793 1254 : retention.push(vec![(request_lsn, Value::Image(img))]);
1794 3036 : } else {
1795 3036 : let deltas = split_for_lsn
1796 3036 : .iter()
1797 3036 : .map(|(_, lsn, value)| (*lsn, value.clone()))
1798 3036 : .collect_vec();
1799 3036 : retention.push(deltas);
1800 3036 : }
1801 : }
1802 1290 : let mut result = Vec::with_capacity(retention.len());
1803 1290 : assert_eq!(retention.len(), lsn_split_points.len() + 1);
1804 4290 : for (idx, logs) in retention.into_iter().enumerate() {
1805 4290 : if idx == lsn_split_points.len() {
1806 1290 : return Ok(KeyHistoryRetention {
1807 1290 : below_horizon: result,
1808 1290 : above_horizon: KeyLogAtLsn(logs),
1809 1290 : });
1810 3000 : } else {
1811 3000 : result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
1812 3000 : }
1813 : }
1814 0 : unreachable!("key retention is empty")
1815 1290 : }
1816 :
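/// Illustrative sketch (not part of the original implementation): the core of
/// Step 1 in `generate_key_retention` above, as a standalone helper. `history`
/// is assumed to be sorted by LSN, and `split_points` to be the sorted retain
/// LSNs followed by the GC horizon; each record lands in the first bucket whose
/// split point is >= its LSN, and anything above the horizon falls into the
/// final bucket.
#[allow(dead_code)]
fn split_history_by_lsn_sketch<T: Clone>(
    history: &[(Lsn, T)],
    split_points: &[Lsn],
) -> Vec<Vec<(Lsn, T)>> {
    let mut buckets: Vec<Vec<(Lsn, T)>> = vec![Vec::new(); split_points.len() + 1];
    let mut current_idx = 0;
    for (lsn, value) in history {
        // Advance past split points that this LSN already exceeds.
        while current_idx < split_points.len() && *lsn > split_points[current_idx] {
            current_idx += 1;
        }
        buckets[current_idx].push((*lsn, value.clone()));
    }
    buckets
}
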
1817 : /// An experimental compaction building block that combines compaction with garbage collection.
1818 : ///
1819 : /// The current implementation picks all delta + image layers that are below or intersecting with
1820 : /// the GC horizon, without considering retain_lsns. Then, it does a full compaction over all these delta
1821 : /// layers and image layers, which generates image layers at the gc horizon, drops deltas below the gc horizon,
1822 : /// and creates delta layers with all deltas >= the gc horizon.
1823 78 : pub(crate) async fn compact_with_gc(
1824 78 : self: &Arc<Self>,
1825 78 : cancel: &CancellationToken,
1826 78 : flags: EnumSet<CompactFlags>,
1827 78 : ctx: &RequestContext,
1828 78 : ) -> anyhow::Result<()> {
1829 78 : use std::collections::BTreeSet;
1830 78 :
1831 78 : // Block other compaction/GC tasks from running for now. GC-compaction could run along
1832 78 : // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
1833 78 : // Note that we already acquired the compaction lock when the outer `compact` function gets called.
1834 78 :
1835 78 : let gc_lock = async {
1836 : tokio::select! {
1837 : guard = self.gc_lock.lock() => Ok(guard),
1838 : // TODO: refactor to CompactionError to correctly pass cancelled error
1839 : _ = cancel.cancelled() => Err(anyhow!("cancelled")),
1840 : }
1841 78 : };
1842 :
1843 78 : let gc_lock = crate::timed(
1844 78 : gc_lock,
1845 78 : "acquires gc lock",
1846 78 : std::time::Duration::from_secs(5),
1847 78 : )
1848 0 : .await?;
1849 :
1850 78 : let dry_run = flags.contains(CompactFlags::DryRun);
1851 78 :
1852 78 : info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
1853 :
1854 : scopeguard::defer! {
1855 : info!("done enhanced gc bottom-most compaction");
1856 : };
1857 :
1858 78 : let mut stat = CompactionStatistics::default();
1859 :
1860 : // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
1861 : // The layer selection has the following properties:
1862 : // 1. If a layer is in the selection, all layers below it are in the selection.
1863 : // 2. It follows from (1) that, for each key in the layer selection, the value can be reconstructed using only the layers in the selection.
1864 78 : let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
1865 78 : let guard = self.layers.read().await;
1866 78 : let layers = guard.layer_map()?;
1867 78 : let gc_info = self.gc_info.read().unwrap();
1868 78 : let mut retain_lsns_below_horizon = Vec::new();
1869 78 : let gc_cutoff = gc_info.cutoffs.select_min();
1870 102 : for (lsn, _timeline_id) in &gc_info.retain_lsns {
1871 102 : if lsn < &gc_cutoff {
1872 102 : retain_lsns_below_horizon.push(*lsn);
1873 102 : }
1874 : }
1875 78 : for lsn in gc_info.leases.keys() {
1876 0 : if lsn < &gc_cutoff {
1877 0 : retain_lsns_below_horizon.push(*lsn);
1878 0 : }
1879 : }
1880 78 : let mut selected_layers = Vec::new();
1881 78 : drop(gc_info);
1882 : // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
1883 78 : let Some(max_layer_lsn) = layers
1884 78 : .iter_historic_layers()
1885 300 : .filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
1886 246 : .map(|desc| desc.get_lsn_range().end)
1887 78 : .max()
1888 : else {
1889 0 : info!("no layers to compact with gc");
1890 0 : return Ok(());
1891 : };
1892 : // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
1893 : // layers to compact.
1894 300 : for desc in layers.iter_historic_layers() {
1895 300 : if desc.get_lsn_range().end <= max_layer_lsn {
1896 246 : selected_layers.push(guard.get_from_desc(&desc));
1897 246 : }
1898 : }
1899 78 : if selected_layers.is_empty() {
1900 0 : info!("no layers to compact with gc");
1901 0 : return Ok(());
1902 78 : }
1903 78 : retain_lsns_below_horizon.sort();
1904 78 : (selected_layers, gc_cutoff, retain_lsns_below_horizon)
1905 : };
1906 78 : let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
1907 6 : Lsn(self.ancestor_lsn.0 + 1)
1908 : } else {
1909 72 : let res = retain_lsns_below_horizon
1910 72 : .first()
1911 72 : .copied()
1912 72 : .unwrap_or(gc_cutoff);
1913 72 : if cfg!(debug_assertions) {
1914 72 : assert_eq!(
1915 72 : res,
1916 72 : retain_lsns_below_horizon
1917 72 : .iter()
1918 72 : .min()
1919 72 : .copied()
1920 72 : .unwrap_or(gc_cutoff)
1921 72 : );
1922 0 : }
1923 72 : res
1924 : };
1925 78 : info!(
1926 0 : "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
1927 0 : layer_selection.len(),
1928 : gc_cutoff,
1929 : lowest_retain_lsn
1930 : );
1931 : // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
1932 : // Also, verify that the layer map can be split by drawing a horizontal line at every LSN start/end split point.
1933 78 : let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
1934 324 : for layer in &layer_selection {
1935 246 : let desc = layer.layer_desc();
1936 246 : if desc.is_delta() {
1937 : // ignore single-key layer files
1938 138 : if desc.key_range.start.next() != desc.key_range.end {
1939 114 : let lsn_range = &desc.lsn_range;
1940 114 : lsn_split_point.insert(lsn_range.start);
1941 114 : lsn_split_point.insert(lsn_range.end);
1942 114 : }
1943 138 : stat.visit_delta_layer(desc.file_size());
1944 108 : } else {
1945 108 : stat.visit_image_layer(desc.file_size());
1946 108 : }
1947 : }
1948 324 : for layer in &layer_selection {
1949 246 : let desc = layer.layer_desc();
1950 246 : let key_range = &desc.key_range;
1951 246 : if desc.is_delta() && key_range.start.next() != key_range.end {
1952 114 : let lsn_range = desc.lsn_range.clone();
1953 114 : let intersects = lsn_split_point.range(lsn_range).collect_vec();
1954 114 : if intersects.len() > 1 {
1955 0 : bail!(
1956 0 : "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
1957 0 : desc.key(),
1958 0 : intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
1959 0 : );
1960 114 : }
1961 132 : }
1962 : }
1963 : // The maximum LSN we are processing in this compaction loop
1964 78 : let end_lsn = layer_selection
1965 78 : .iter()
1966 246 : .map(|l| l.layer_desc().lsn_range.end)
1967 78 : .max()
1968 78 : .unwrap();
1969 78 : // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) because it will then be recognized
1970 78 : // as an L0 layer.
1971 78 : let hack_end_key = Key::NON_L0_MAX;
1972 78 : let mut delta_layers = Vec::new();
1973 78 : let mut image_layers = Vec::new();
1974 78 : let mut downloaded_layers = Vec::new();
1975 324 : for layer in &layer_selection {
1976 246 : let resident_layer = layer.download_and_keep_resident().await?;
1977 246 : downloaded_layers.push(resident_layer);
1978 : }
1979 324 : for resident_layer in &downloaded_layers {
1980 246 : if resident_layer.layer_desc().is_delta() {
1981 138 : let layer = resident_layer.get_as_delta(ctx).await?;
1982 138 : delta_layers.push(layer);
1983 108 : } else {
1984 108 : let layer = resident_layer.get_as_image(ctx).await?;
1985 108 : image_layers.push(layer);
1986 : }
1987 : }
1988 78 : let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
1989 78 : // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
1990 78 : // Data of the same key.
1991 78 : let mut accumulated_values = Vec::new();
1992 78 : let mut last_key: Option<Key> = None;
1993 :
1994 : // Only create image layers when there are no ancestor branches. TODO: create a covering image layer
1995 : // when some condition is met.
1996 78 : let mut image_layer_writer = if self.ancestor_timeline.is_none() {
1997 : Some(
1998 72 : SplitImageLayerWriter::new(
1999 72 : self.conf,
2000 72 : self.timeline_id,
2001 72 : self.tenant_shard_id,
2002 72 : Key::MIN,
2003 72 : lowest_retain_lsn,
2004 72 : self.get_compaction_target_size(),
2005 72 : ctx,
2006 72 : )
2007 36 : .await?,
2008 : )
2009 : } else {
2010 6 : None
2011 : };
2012 :
2013 78 : let mut delta_layer_writer = SplitDeltaLayerWriter::new(
2014 78 : self.conf,
2015 78 : self.timeline_id,
2016 78 : self.tenant_shard_id,
2017 78 : Key::MIN,
2018 78 : lowest_retain_lsn..end_lsn,
2019 78 : self.get_compaction_target_size(),
2020 78 : ctx,
2021 78 : )
2022 39 : .await?;
2023 :
2024 : /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
2025 : ///
2026 : /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
2027 : /// is needed for reconstruction. This should be fixed in the future.
2028 : ///
2029 : /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
2030 : /// images.
2031 1266 : async fn get_ancestor_image(
2032 1266 : tline: &Arc<Timeline>,
2033 1266 : key: Key,
2034 1266 : ctx: &RequestContext,
2035 1266 : ) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
2036 1266 : if tline.ancestor_timeline.is_none() {
2037 1224 : return Ok(None);
2038 42 : };
2039 : // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
2040 : // as much existing code as possible.
2041 42 : let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
2042 42 : Ok(Some((key, tline.ancestor_lsn, img)))
2043 1266 : }
2044 :
2045 : // Actually, we can decide not to write to the image layer at all at this point because
2046 : // the key and LSN range are determined. However, to keep things simple here, we still
2047 : // create this writer, and discard the writer in the end.
2048 :
2049 1758 : while let Some((key, lsn, val)) = merge_iter.next().await? {
2050 1680 : if cancel.is_cancelled() {
2051 0 : return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
2052 1680 : }
2053 1680 : match val {
2054 1260 : Value::Image(_) => stat.visit_image_key(&val),
2055 420 : Value::WalRecord(_) => stat.visit_wal_key(&val),
2056 : }
2057 1680 : if last_key.is_none() || last_key.as_ref() == Some(&key) {
2058 492 : if last_key.is_none() {
2059 78 : last_key = Some(key);
2060 414 : }
2061 492 : accumulated_values.push((key, lsn, val));
2062 : } else {
2063 1188 : let last_key = last_key.as_mut().unwrap();
2064 1188 : stat.on_unique_key_visited();
2065 1188 : let retention = self
2066 1188 : .generate_key_retention(
2067 1188 : *last_key,
2068 1188 : &accumulated_values,
2069 1188 : gc_cutoff,
2070 1188 : &retain_lsns_below_horizon,
2071 1188 : COMPACTION_DELTA_THRESHOLD,
2072 1188 : get_ancestor_image(self, *last_key, ctx).await?,
2073 : )
2074 0 : .await?;
2075 : // Put the image into the image layer. Currently we have a single big layer for the compaction.
2076 1188 : retention
2077 1188 : .pipe_to(
2078 1188 : *last_key,
2079 1188 : self,
2080 1188 : &mut delta_layer_writer,
2081 1188 : image_layer_writer.as_mut(),
2082 1188 : &mut stat,
2083 1188 : dry_run,
2084 1188 : ctx,
2085 1188 : )
2086 1170 : .await?;
2087 1188 : accumulated_values.clear();
2088 1188 : *last_key = key;
2089 1188 : accumulated_values.push((key, lsn, val));
2090 : }
2091 : }
2092 :
2093 78 : let last_key = last_key.expect("no keys produced during compaction");
2094 78 : // TODO: move this part to the loop body
2095 78 : stat.on_unique_key_visited();
2096 78 : let retention = self
2097 78 : .generate_key_retention(
2098 78 : last_key,
2099 78 : &accumulated_values,
2100 78 : gc_cutoff,
2101 78 : &retain_lsns_below_horizon,
2102 78 : COMPACTION_DELTA_THRESHOLD,
2103 78 : get_ancestor_image(self, last_key, ctx).await?,
2104 : )
2105 0 : .await?;
2106 : // Put the image into the image layer. Currently we have a single big layer for the compaction.
2107 78 : retention
2108 78 : .pipe_to(
2109 78 : last_key,
2110 78 : self,
2111 78 : &mut delta_layer_writer,
2112 78 : image_layer_writer.as_mut(),
2113 78 : &mut stat,
2114 78 : dry_run,
2115 78 : ctx,
2116 78 : )
2117 72 : .await?;
2118 :
2119 114 : let discard = |key: &PersistentLayerKey| {
2120 114 : let key = key.clone();
2121 114 : async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
2122 114 : };
2123 :
2124 78 : let produced_image_layers = if let Some(writer) = image_layer_writer {
2125 72 : if !dry_run {
2126 60 : writer
2127 60 : .finish_with_discard_fn(self, ctx, hack_end_key, discard)
2128 72 : .await?
2129 : } else {
2130 12 : let (layers, _) = writer.take()?;
2131 12 : assert!(layers.is_empty(), "image layers produced in dry run mode?");
2132 12 : Vec::new()
2133 : }
2134 : } else {
2135 6 : Vec::new()
2136 : };
2137 :
2138 78 : let produced_delta_layers = if !dry_run {
2139 66 : delta_layer_writer
2140 66 : .finish_with_discard_fn(self, ctx, hack_end_key, discard)
2141 81 : .await?
2142 : } else {
2143 12 : let (layers, _) = delta_layer_writer.take()?;
2144 12 : assert!(layers.is_empty(), "delta layers produced in dry run mode?");
2145 12 : Vec::new()
2146 : };
2147 :
2148 78 : let mut compact_to = Vec::new();
2149 78 : let mut keep_layers = HashSet::new();
2150 78 : let produced_delta_layers_len = produced_delta_layers.len();
2151 78 : let produced_image_layers_len = produced_image_layers.len();
2152 132 : for action in produced_delta_layers {
2153 54 : match action {
2154 30 : SplitWriterResult::Produced(layer) => {
2155 30 : stat.produce_delta_layer(layer.layer_desc().file_size());
2156 30 : compact_to.push(layer);
2157 30 : }
2158 24 : SplitWriterResult::Discarded(l) => {
2159 24 : keep_layers.insert(l);
2160 24 : stat.discard_delta_layer();
2161 24 : }
2162 : }
2163 : }
2164 138 : for action in produced_image_layers {
2165 60 : match action {
2166 36 : SplitWriterResult::Produced(layer) => {
2167 36 : stat.produce_image_layer(layer.layer_desc().file_size());
2168 36 : compact_to.push(layer);
2169 36 : }
2170 24 : SplitWriterResult::Discarded(l) => {
2171 24 : keep_layers.insert(l);
2172 24 : stat.discard_image_layer();
2173 24 : }
2174 : }
2175 : }
2176 78 : let mut layer_selection = layer_selection;
2177 246 : layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
2178 78 :
2179 78 : info!(
2180 0 : "gc-compaction statistics: {}",
2181 0 : serde_json::to_string(&stat)?
2182 : );
2183 :
2184 78 : if dry_run {
2185 12 : return Ok(());
2186 66 : }
2187 66 :
2188 66 : info!(
2189 0 : "produced {} delta layers and {} image layers, {} layers are kept",
2190 0 : produced_delta_layers_len,
2191 0 : produced_image_layers_len,
2192 0 : layer_selection.len()
2193 : );
2194 :
2195 : // Step 3: Place back to the layer map.
2196 : {
2197 66 : let mut guard = self.layers.write().await;
2198 66 : guard
2199 66 : .open_mut()?
2200 66 : .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
2201 66 : };
2202 66 : self.remote_client
2203 66 : .schedule_compaction_update(&layer_selection, &compact_to)?;
2204 :
2205 66 : drop(gc_lock);
2206 66 :
2207 66 : Ok(())
2208 78 : }
2209 : }
2210 :
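// Illustrative sketch (not part of the original code): the "horizontal line"
// precondition checked inside compact_with_gc above, expressed over plain LSN
// ranges. The split points are the start/end LSNs of every multi-key delta layer
// in the selection; the selection is acceptable only if no such layer's LSN range
// contains more than one split point (a layer's own start always counts as one).
#[allow(dead_code)]
fn lsn_ranges_are_splittable_sketch(delta_lsn_ranges: &[Range<Lsn>]) -> bool {
    use std::collections::BTreeSet;

    let mut split_points = BTreeSet::new();
    for range in delta_lsn_ranges {
        split_points.insert(range.start);
        split_points.insert(range.end);
    }
    // `BTreeSet::range(start..end)` is end-exclusive, matching the check above.
    delta_lsn_ranges
        .iter()
        .all(|range| split_points.range(range.clone()).count() <= 1)
}
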
2211 : struct TimelineAdaptor {
2212 : timeline: Arc<Timeline>,
2213 :
2214 : keyspace: (Lsn, KeySpace),
2215 :
2216 : new_deltas: Vec<ResidentLayer>,
2217 : new_images: Vec<ResidentLayer>,
2218 : layers_to_delete: Vec<Arc<PersistentLayerDesc>>,
2219 : }
2220 :
2221 : impl TimelineAdaptor {
2222 0 : pub fn new(timeline: &Arc<Timeline>, keyspace: (Lsn, KeySpace)) -> Self {
2223 0 : Self {
2224 0 : timeline: timeline.clone(),
2225 0 : keyspace,
2226 0 : new_images: Vec::new(),
2227 0 : new_deltas: Vec::new(),
2228 0 : layers_to_delete: Vec::new(),
2229 0 : }
2230 0 : }
2231 :
2232 0 : pub async fn flush_updates(&mut self) -> Result<(), CompactionError> {
2233 0 : let layers_to_delete = {
2234 0 : let guard = self.timeline.layers.read().await;
2235 0 : self.layers_to_delete
2236 0 : .iter()
2237 0 : .map(|x| guard.get_from_desc(x))
2238 0 : .collect::<Vec<Layer>>()
2239 0 : };
2240 0 : self.timeline
2241 0 : .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
2242 0 : .await?;
2243 :
2244 0 : self.timeline
2245 0 : .upload_new_image_layers(std::mem::take(&mut self.new_images))?;
2246 :
2247 0 : self.new_deltas.clear();
2248 0 : self.layers_to_delete.clear();
2249 0 : Ok(())
2250 0 : }
2251 : }
2252 :
2253 : #[derive(Clone)]
2254 : struct ResidentDeltaLayer(ResidentLayer);
2255 : #[derive(Clone)]
2256 : struct ResidentImageLayer(ResidentLayer);
2257 :
2258 : impl CompactionJobExecutor for TimelineAdaptor {
2259 : type Key = crate::repository::Key;
2260 :
2261 : type Layer = OwnArc<PersistentLayerDesc>;
2262 : type DeltaLayer = ResidentDeltaLayer;
2263 : type ImageLayer = ResidentImageLayer;
2264 :
2265 : type RequestContext = crate::context::RequestContext;
2266 :
2267 0 : fn get_shard_identity(&self) -> &ShardIdentity {
2268 0 : self.timeline.get_shard_identity()
2269 0 : }
2270 :
2271 0 : async fn get_layers(
2272 0 : &mut self,
2273 0 : key_range: &Range<Key>,
2274 0 : lsn_range: &Range<Lsn>,
2275 0 : _ctx: &RequestContext,
2276 0 : ) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
2277 0 : self.flush_updates().await?;
2278 :
2279 0 : let guard = self.timeline.layers.read().await;
2280 0 : let layer_map = guard.layer_map()?;
2281 :
2282 0 : let result = layer_map
2283 0 : .iter_historic_layers()
2284 0 : .filter(|l| {
2285 0 : overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range)
2286 0 : })
2287 0 : .map(OwnArc)
2288 0 : .collect();
2289 0 : Ok(result)
2290 0 : }
2291 :
2292 0 : async fn get_keyspace(
2293 0 : &mut self,
2294 0 : key_range: &Range<Key>,
2295 0 : lsn: Lsn,
2296 0 : _ctx: &RequestContext,
2297 0 : ) -> anyhow::Result<Vec<Range<Key>>> {
2298 0 : if lsn == self.keyspace.0 {
2299 0 : Ok(pageserver_compaction::helpers::intersect_keyspace(
2300 0 : &self.keyspace.1.ranges,
2301 0 : key_range,
2302 0 : ))
2303 : } else {
2304 : // The current compaction implementation only ever requests the key space
2305 : // at the compaction end LSN.
2306 0 : anyhow::bail!("keyspace not available for requested lsn");
2307 : }
2308 0 : }
2309 :
2310 0 : async fn downcast_delta_layer(
2311 0 : &self,
2312 0 : layer: &OwnArc<PersistentLayerDesc>,
2313 0 : ) -> anyhow::Result<Option<ResidentDeltaLayer>> {
2314 0 : // this is a lot more complex than a simple downcast...
2315 0 : if layer.is_delta() {
2316 0 : let l = {
2317 0 : let guard = self.timeline.layers.read().await;
2318 0 : guard.get_from_desc(layer)
2319 : };
2320 0 : let result = l.download_and_keep_resident().await?;
2321 :
2322 0 : Ok(Some(ResidentDeltaLayer(result)))
2323 : } else {
2324 0 : Ok(None)
2325 : }
2326 0 : }
2327 :
2328 0 : async fn create_image(
2329 0 : &mut self,
2330 0 : lsn: Lsn,
2331 0 : key_range: &Range<Key>,
2332 0 : ctx: &RequestContext,
2333 0 : ) -> anyhow::Result<()> {
2334 0 : Ok(self.create_image_impl(lsn, key_range, ctx).await?)
2335 0 : }
2336 :
2337 0 : async fn create_delta(
2338 0 : &mut self,
2339 0 : lsn_range: &Range<Lsn>,
2340 0 : key_range: &Range<Key>,
2341 0 : input_layers: &[ResidentDeltaLayer],
2342 0 : ctx: &RequestContext,
2343 0 : ) -> anyhow::Result<()> {
2344 0 : debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
2345 :
2346 0 : let mut all_entries = Vec::new();
2347 0 : for dl in input_layers.iter() {
2348 0 : all_entries.extend(dl.load_keys(ctx).await?);
2349 : }
2350 :
2351 : // The current stdlib sorting implementation is designed such that it is
2352 : // particularly fast when the slice is made up of sorted sub-ranges.
2353 0 : all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
2354 :
2355 0 : let mut writer = DeltaLayerWriter::new(
2356 0 : self.timeline.conf,
2357 0 : self.timeline.timeline_id,
2358 0 : self.timeline.tenant_shard_id,
2359 0 : key_range.start,
2360 0 : lsn_range.clone(),
2361 0 : ctx,
2362 0 : )
2363 0 : .await?;
2364 :
2365 0 : let mut dup_values = 0;
2366 0 :
2367 0 : // This iterator walks through all key-value pairs from all the layers
2368 0 : // we're compacting, in key, LSN order.
2369 0 : let mut prev: Option<(Key, Lsn)> = None;
2370 : for &DeltaEntry {
2371 0 : key, lsn, ref val, ..
2372 0 : } in all_entries.iter()
2373 : {
2374 0 : if prev == Some((key, lsn)) {
2375 : // This is a duplicate. Skip it.
2376 : //
2377 : // It can happen if compaction is interrupted after writing some
2378 : // layers but not all, and we are compacting the range again.
2379 : // The calculations in the algorithm assume that there are no
2380 : // duplicates, so the math on targeted file size is likely off,
2381 : // and we will create smaller files than expected.
2382 0 : dup_values += 1;
2383 0 : continue;
2384 0 : }
2385 :
2386 0 : let value = val.load(ctx).await?;
2387 :
2388 0 : writer.put_value(key, lsn, value, ctx).await?;
2389 :
2390 0 : prev = Some((key, lsn));
2391 : }
2392 :
2393 0 : if dup_values > 0 {
2394 0 : warn!("delta layer created with {} duplicate values", dup_values);
2395 0 : }
2396 :
2397 0 : fail_point!("delta-layer-writer-fail-before-finish", |_| {
2398 0 : Err(anyhow::anyhow!(
2399 0 : "failpoint delta-layer-writer-fail-before-finish"
2400 0 : ))
2401 0 : });
2402 :
2403 0 : let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
2404 0 : let new_delta_layer =
2405 0 : Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
2406 :
2407 0 : self.new_deltas.push(new_delta_layer);
2408 0 : Ok(())
2409 0 : }
2410 :
2411 0 : async fn delete_layer(
2412 0 : &mut self,
2413 0 : layer: &OwnArc<PersistentLayerDesc>,
2414 0 : _ctx: &RequestContext,
2415 0 : ) -> anyhow::Result<()> {
2416 0 : self.layers_to_delete.push(layer.clone().0);
2417 0 : Ok(())
2418 0 : }
2419 : }
2420 :
2421 : impl TimelineAdaptor {
2422 0 : async fn create_image_impl(
2423 0 : &mut self,
2424 0 : lsn: Lsn,
2425 0 : key_range: &Range<Key>,
2426 0 : ctx: &RequestContext,
2427 0 : ) -> Result<(), CreateImageLayersError> {
2428 0 : let timer = self.timeline.metrics.create_images_time_histo.start_timer();
2429 :
2430 0 : let image_layer_writer = ImageLayerWriter::new(
2431 0 : self.timeline.conf,
2432 0 : self.timeline.timeline_id,
2433 0 : self.timeline.tenant_shard_id,
2434 0 : key_range,
2435 0 : lsn,
2436 0 : ctx,
2437 0 : )
2438 0 : .await?;
2439 :
2440 0 : fail_point!("image-layer-writer-fail-before-finish", |_| {
2441 0 : Err(CreateImageLayersError::Other(anyhow::anyhow!(
2442 0 : "failpoint image-layer-writer-fail-before-finish"
2443 0 : )))
2444 0 : });
2445 :
2446 0 : let keyspace = KeySpace {
2447 0 : ranges: self.get_keyspace(key_range, lsn, ctx).await?,
2448 : };
2449 : // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
2450 0 : let start = Key::MIN;
2451 : let ImageLayerCreationOutcome {
2452 0 : image,
2453 : next_start_key: _,
2454 0 : } = self
2455 0 : .timeline
2456 0 : .create_image_layer_for_rel_blocks(
2457 0 : &keyspace,
2458 0 : image_layer_writer,
2459 0 : lsn,
2460 0 : ctx,
2461 0 : key_range.clone(),
2462 0 : start,
2463 0 : )
2464 0 : .await?;
2465 :
2466 0 : if let Some(image_layer) = image {
2467 0 : self.new_images.push(image_layer);
2468 0 : }
2469 :
2470 0 : timer.stop_and_record();
2471 0 :
2472 0 : Ok(())
2473 0 : }
2474 : }
2475 :
2476 : impl CompactionRequestContext for crate::context::RequestContext {}
2477 :
2478 : #[derive(Debug, Clone)]
2479 : pub struct OwnArc<T>(pub Arc<T>);
2480 :
2481 : impl<T> Deref for OwnArc<T> {
2482 : type Target = <Arc<T> as Deref>::Target;
2483 0 : fn deref(&self) -> &Self::Target {
2484 0 : &self.0
2485 0 : }
2486 : }
2487 :
2488 : impl<T> AsRef<T> for OwnArc<T> {
2489 0 : fn as_ref(&self) -> &T {
2490 0 : self.0.as_ref()
2491 0 : }
2492 : }
2493 :
2494 : impl CompactionLayer<Key> for OwnArc<PersistentLayerDesc> {
2495 0 : fn key_range(&self) -> &Range<Key> {
2496 0 : &self.key_range
2497 0 : }
2498 0 : fn lsn_range(&self) -> &Range<Lsn> {
2499 0 : &self.lsn_range
2500 0 : }
2501 0 : fn file_size(&self) -> u64 {
2502 0 : self.file_size
2503 0 : }
2504 0 : fn short_id(&self) -> std::string::String {
2505 0 : self.as_ref().short_id().to_string()
2506 0 : }
2507 0 : fn is_delta(&self) -> bool {
2508 0 : self.as_ref().is_delta()
2509 0 : }
2510 : }
2511 :
2512 : impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
2513 0 : fn key_range(&self) -> &Range<Key> {
2514 0 : &self.layer_desc().key_range
2515 0 : }
2516 0 : fn lsn_range(&self) -> &Range<Lsn> {
2517 0 : &self.layer_desc().lsn_range
2518 0 : }
2519 0 : fn file_size(&self) -> u64 {
2520 0 : self.layer_desc().file_size
2521 0 : }
2522 0 : fn short_id(&self) -> std::string::String {
2523 0 : self.layer_desc().short_id().to_string()
2524 0 : }
2525 0 : fn is_delta(&self) -> bool {
2526 0 : true
2527 0 : }
2528 : }
2529 :
2530 : use crate::tenant::timeline::DeltaEntry;
2531 :
2532 : impl CompactionLayer<Key> for ResidentDeltaLayer {
2533 0 : fn key_range(&self) -> &Range<Key> {
2534 0 : &self.0.layer_desc().key_range
2535 0 : }
2536 0 : fn lsn_range(&self) -> &Range<Lsn> {
2537 0 : &self.0.layer_desc().lsn_range
2538 0 : }
2539 0 : fn file_size(&self) -> u64 {
2540 0 : self.0.layer_desc().file_size
2541 0 : }
2542 0 : fn short_id(&self) -> std::string::String {
2543 0 : self.0.layer_desc().short_id().to_string()
2544 0 : }
2545 0 : fn is_delta(&self) -> bool {
2546 0 : true
2547 0 : }
2548 : }
2549 :
2550 : impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
2551 : type DeltaEntry<'a> = DeltaEntry<'a>;
2552 :
2553 0 : async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
2554 0 : self.0.load_keys(ctx).await
2555 0 : }
2556 : }
2557 :
2558 : impl CompactionLayer<Key> for ResidentImageLayer {
2559 0 : fn key_range(&self) -> &Range<Key> {
2560 0 : &self.0.layer_desc().key_range
2561 0 : }
2562 0 : fn lsn_range(&self) -> &Range<Lsn> {
2563 0 : &self.0.layer_desc().lsn_range
2564 0 : }
2565 0 : fn file_size(&self) -> u64 {
2566 0 : self.0.layer_desc().file_size
2567 0 : }
2568 0 : fn short_id(&self) -> std::string::String {
2569 0 : self.0.layer_desc().short_id().to_string()
2570 0 : }
2571 0 : fn is_delta(&self) -> bool {
2572 0 : false
2573 0 : }
2574 : }
2575 : impl CompactionImageLayer<TimelineAdaptor> for ResidentImageLayer {}
|