LCOV - 49aa928ec5b4b510172d8b5c6d154da28e70a46c.info - pageserver/src/tenant/timeline/compaction.rs

LCOV - code coverage report

Current view:	top level - pageserver/src/tenant/timeline - compaction.rs (source / functions)		Coverage	Total	Hit
Test:	49aa928ec5b4b510172d8b5c6d154da28e70a46c.info	Lines:	62.3 %	1826	1138
Test Date:	2024-11-13 18:23:39	Functions:	43.3 %	134	58

            Line data    Source code

       1              : //! New compaction implementation. The algorithm itself is implemented in the
       2              : //! compaction crate. This file implements the callbacks and structs that allow
       3              : //! the algorithm to drive the process.
       4              : //!
       5              : //! The old legacy algorithm is implemented directly in `timeline.rs`.
       6              : 
       7              : use std::collections::{BinaryHeap, HashMap, HashSet};
       8              : use std::ops::{Deref, Range};
       9              : use std::sync::Arc;
      10              : 
      11              : use super::layer_manager::LayerManager;
      12              : use super::{
      13              :     CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
      14              :     RecordedDuration, Timeline,
      15              : };
      16              : 
      17              : use anyhow::{anyhow, bail, Context};
      18              : use bytes::Bytes;
      19              : use enumset::EnumSet;
      20              : use fail::fail_point;
      21              : use itertools::Itertools;
      22              : use pageserver_api::key::KEY_SIZE;
      23              : use pageserver_api::keyspace::ShardedRange;
      24              : use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
      25              : use serde::Serialize;
      26              : use tokio_util::sync::CancellationToken;
      27              : use tracing::{debug, info, info_span, trace, warn, Instrument};
      28              : use utils::id::TimelineId;
      29              : 
      30              : use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
      31              : use crate::page_cache;
      32              : use crate::statvfs::Statvfs;
      33              : use crate::tenant::checks::check_valid_layermap;
      34              : use crate::tenant::remote_timeline_client::WaitCompletionError;
      35              : use crate::tenant::storage_layer::batch_split_writer::{
      36              :     BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
      37              : };
      38              : use crate::tenant::storage_layer::filter_iterator::FilterIterator;
      39              : use crate::tenant::storage_layer::merge_iterator::MergeIterator;
      40              : use crate::tenant::storage_layer::{
      41              :     AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
      42              : };
      43              : use crate::tenant::timeline::ImageLayerCreationOutcome;
      44              : use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
      45              : use crate::tenant::timeline::{Layer, ResidentLayer};
      46              : use crate::tenant::{DeltaLayer, MaybeOffloaded};
      47              : use crate::virtual_file::{MaybeFatalIo, VirtualFile};
      48              : use pageserver_api::config::tenant_conf_defaults::{
      49              :     DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
      50              : };
      51              : 
      52              : use pageserver_api::key::Key;
      53              : use pageserver_api::keyspace::KeySpace;
      54              : use pageserver_api::record::NeonWalRecord;
      55              : use pageserver_api::value::Value;
      56              : 
      57              : use utils::lsn::Lsn;
      58              : 
      59              : use pageserver_compaction::helpers::{fully_contains, overlaps_with};
      60              : use pageserver_compaction::interface::*;
      61              : 
      62              : use super::CompactionError;
      63              : 
      64              : /// Maximum number of deltas before generating an image layer in bottom-most compaction.
      65              : const COMPACTION_DELTA_THRESHOLD: usize = 5;
      66              : 
      67              : pub struct GcCompactionJobDescription {
      68              :     /// All layers to read in the compaction job
      69              :     selected_layers: Vec<Layer>,
      70              :     /// GC cutoff of the job
      71              :     gc_cutoff: Lsn,
      72              :     /// LSNs to retain for the job
      73              :     retain_lsns_below_horizon: Vec<Lsn>,
      74              :     /// Maximum layer LSN processed in this compaction
      75              :     max_layer_lsn: Lsn,
      76              :     /// Only compact layers overlapping with this range
      77              :     compaction_key_range: Range<Key>,
      78              :     /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
      79              :     /// This field is here solely for debugging. The field will not be read once the compaction
      80              :     /// description is generated.
      81              :     rewrite_layers: Vec<Arc<PersistentLayerDesc>>,
      82              : }
      83              : 
      84              : /// The result of bottom-most compaction for a single key at each LSN.
      85              : #[derive(Debug)]
      86              : #[cfg_attr(test, derive(PartialEq))]
      87              : pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>);
      88              : 
      89              : /// The result of bottom-most compaction.
      90              : #[derive(Debug)]
      91              : #[cfg_attr(test, derive(PartialEq))]
      92              : pub(crate) struct KeyHistoryRetention {
      93              :     /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN.
      94              :     pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>,
      95              :     /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN.
      96              :     pub(crate) above_horizon: KeyLogAtLsn,
      97              : }
      98              : 
      99              : impl KeyHistoryRetention {
     100              :     /// Hack: skip delta layer if we need to produce a layer of the same key-lsn.
     101              :     ///
     102              :     /// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
     103              :     /// For example, consider the case where a single delta with range [0x10,0x50) exists.
     104              :     /// And we have branches at LSN 0x10, 0x20, 0x30.
     105              :     /// Then we delete branch @ 0x20.
     106              :     /// Bottom-most compaction may now delete the delta [0x20,0x30).
     107              :     /// And that wouldn't change the shape of the layer.
     108              :     ///
     109              :     /// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
     110              :     ///
     111              :     /// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside.
     112           58 :     async fn discard_key(key: &PersistentLayerKey, tline: &Arc<Timeline>, dry_run: bool) -> bool {
     113           58 :         if dry_run {
     114            0 :             return true;
     115           58 :         }
     116           58 :         let guard = tline.layers.read().await;
     117           58 :         if !guard.contains_key(key) {
     118           38 :             return false;
     119           20 :         }
     120           20 :         let layer_generation = guard.get_from_key(key).metadata().generation;
     121           20 :         drop(guard);
     122           20 :         if layer_generation == tline.generation {
     123           20 :             info!(
     124              :                 key=%key,
     125              :                 ?layer_generation,
     126            0 :                 "discard layer due to duplicated layer key in the same generation",
     127              :             );
     128           20 :             true
     129              :         } else {
     130            0 :             false
     131              :         }
     132           58 :     }
     133              : 
     134              :     /// Pipe a history of a single key to the writers.
     135              :     ///
     136              :     /// If `image_writer` is none, the images will be placed into the delta layers.
     137              :     /// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images.
     138              :     #[allow(clippy::too_many_arguments)]
     139          530 :     async fn pipe_to(
     140          530 :         self,
     141          530 :         key: Key,
     142          530 :         delta_writer: &mut SplitDeltaLayerWriter,
     143          530 :         mut image_writer: Option<&mut SplitImageLayerWriter>,
     144          530 :         stat: &mut CompactionStatistics,
     145          530 :         ctx: &RequestContext,
     146          530 :     ) -> anyhow::Result<()> {
     147          530 :         let mut first_batch = true;
     148         1658 :         for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
     149         1128 :             if first_batch {
     150          530 :                 if logs.len() == 1 && logs[0].1.is_image() {
     151          516 :                     let Value::Image(img) = &logs[0].1 else {
     152            0 :                         unreachable!()
     153              :                     };
     154          516 :                     stat.produce_image_key(img);
     155          516 :                     if let Some(image_writer) = image_writer.as_mut() {
     156          516 :                         image_writer.put_image(key, img.clone(), ctx).await?;
     157              :                     } else {
     158            0 :                         delta_writer
     159            0 :                             .put_value(key, cutoff_lsn, Value::Image(img.clone()), ctx)
     160            0 :                             .await?;
     161              :                     }
     162              :                 } else {
     163           28 :                     for (lsn, val) in logs {
     164           14 :                         stat.produce_key(&val);
     165           14 :                         delta_writer.put_value(key, lsn, val, ctx).await?;
     166              :                     }
     167              :                 }
     168          530 :                 first_batch = false;
     169              :             } else {
     170          684 :                 for (lsn, val) in logs {
     171           86 :                     stat.produce_key(&val);
     172           86 :                     delta_writer.put_value(key, lsn, val, ctx).await?;
     173              :                 }
     174              :             }
     175              :         }
     176          530 :         let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
     177          568 :         for (lsn, val) in above_horizon_logs {
     178           38 :             stat.produce_key(&val);
     179           38 :             delta_writer.put_value(key, lsn, val, ctx).await?;
     180              :         }
     181          530 :         Ok(())
     182          530 :     }
     183              : }
     184              : 
     185              : #[derive(Debug, Serialize, Default)]
     186              : struct CompactionStatisticsNumSize {
     187              :     num: u64,
     188              :     size: u64,
     189              : }
     190              : 
     191              : #[derive(Debug, Serialize, Default)]
     192              : pub struct CompactionStatistics {
     193              :     delta_layer_visited: CompactionStatisticsNumSize,
     194              :     image_layer_visited: CompactionStatisticsNumSize,
     195              :     delta_layer_produced: CompactionStatisticsNumSize,
     196              :     image_layer_produced: CompactionStatisticsNumSize,
     197              :     num_delta_layer_discarded: usize,
     198              :     num_image_layer_discarded: usize,
     199              :     num_unique_keys_visited: usize,
     200              :     wal_keys_visited: CompactionStatisticsNumSize,
     201              :     image_keys_visited: CompactionStatisticsNumSize,
     202              :     wal_produced: CompactionStatisticsNumSize,
     203              :     image_produced: CompactionStatisticsNumSize,
     204              : }
     205              : 
     206              : impl CompactionStatistics {
     207          846 :     fn estimated_size_of_value(val: &Value) -> usize {
     208          304 :         match val {
     209          542 :             Value::Image(img) => img.len(),
     210            0 :             Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
     211          304 :             _ => std::mem::size_of::<NeonWalRecord>(),
     212              :         }
     213          846 :     }
     214         1372 :     fn estimated_size_of_key() -> usize {
     215         1372 :         KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
     216         1372 :     }
     217           62 :     fn visit_delta_layer(&mut self, size: u64) {
     218           62 :         self.delta_layer_visited.num += 1;
     219           62 :         self.delta_layer_visited.size += size;
     220           62 :     }
     221           58 :     fn visit_image_layer(&mut self, size: u64) {
     222           58 :         self.image_layer_visited.num += 1;
     223           58 :         self.image_layer_visited.size += size;
     224           58 :     }
     225          530 :     fn on_unique_key_visited(&mut self) {
     226          530 :         self.num_unique_keys_visited += 1;
     227          530 :     }
     228          176 :     fn visit_wal_key(&mut self, val: &Value) {
     229          176 :         self.wal_keys_visited.num += 1;
     230          176 :         self.wal_keys_visited.size +=
     231          176 :             Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
     232          176 :     }
     233          542 :     fn visit_image_key(&mut self, val: &Value) {
     234          542 :         self.image_keys_visited.num += 1;
     235          542 :         self.image_keys_visited.size +=
     236          542 :             Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
     237          542 :     }
     238          138 :     fn produce_key(&mut self, val: &Value) {
     239          138 :         match val {
     240           10 :             Value::Image(img) => self.produce_image_key(img),
     241          128 :             Value::WalRecord(_) => self.produce_wal_key(val),
     242              :         }
     243          138 :     }
     244          128 :     fn produce_wal_key(&mut self, val: &Value) {
     245          128 :         self.wal_produced.num += 1;
     246          128 :         self.wal_produced.size +=
     247          128 :             Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
     248          128 :     }
     249          526 :     fn produce_image_key(&mut self, val: &Bytes) {
     250          526 :         self.image_produced.num += 1;
     251          526 :         self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
     252          526 :     }
     253           12 :     fn discard_delta_layer(&mut self) {
     254           12 :         self.num_delta_layer_discarded += 1;
     255           12 :     }
     256            8 :     fn discard_image_layer(&mut self) {
     257            8 :         self.num_image_layer_discarded += 1;
     258            8 :     }
     259           12 :     fn produce_delta_layer(&mut self, size: u64) {
     260           12 :         self.delta_layer_produced.num += 1;
     261           12 :         self.delta_layer_produced.size += size;
     262           12 :     }
     263           26 :     fn produce_image_layer(&mut self, size: u64) {
     264           26 :         self.image_layer_produced.num += 1;
     265           26 :         self.image_layer_produced.size += size;
     266           26 :     }
     267              : }
     268              : 
     269              : impl Timeline {
     270              :     /// TODO: cancellation
     271              :     ///
     272              :     /// Returns whether the compaction has pending tasks.
     273          364 :     pub(crate) async fn compact_legacy(
     274          364 :         self: &Arc<Self>,
     275          364 :         cancel: &CancellationToken,
     276          364 :         flags: EnumSet<CompactFlags>,
     277          364 :         ctx: &RequestContext,
     278          364 :     ) -> Result<bool, CompactionError> {
     279          364 :         if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
     280            0 :             self.compact_with_gc(cancel, flags, ctx)
     281            0 :                 .await
     282            0 :                 .map_err(CompactionError::Other)?;
     283            0 :             return Ok(false);
     284          364 :         }
     285          364 : 
     286          364 :         if flags.contains(CompactFlags::DryRun) {
     287            0 :             return Err(CompactionError::Other(anyhow!(
     288            0 :                 "dry-run mode is not supported for legacy compaction for now"
     289            0 :             )));
     290          364 :         }
     291          364 : 
     292          364 :         // High level strategy for compaction / image creation:
     293          364 :         //
     294          364 :         // 1. First, calculate the desired "partitioning" of the
     295          364 :         // currently in-use key space. The goal is to partition the
     296          364 :         // key space into roughly fixed-size chunks, but also take into
     297          364 :         // account any existing image layers, and try to align the
     298          364 :         // chunk boundaries with the existing image layers to avoid
     299          364 :         // too much churn. Also try to align chunk boundaries with
     300          364 :         // relation boundaries.  In principle, we don't know about
     301          364 :         // relation boundaries here, we just deal with key-value
     302          364 :         // pairs, and the code in pgdatadir_mapping.rs knows how to
     303          364 :         // map relations into key-value pairs. But in practice we know
     304          364 :         // that 'field6' is the block number, and the fields 1-5
     305          364 :         // identify a relation. This is just an optimization,
     306          364 :         // though.
     307          364 :         //
     308          364 :         // 2. Once we know the partitioning, for each partition,
     309          364 :         // decide if it's time to create a new image layer. The
     310          364 :         // criterion is: there has been too much "churn" since the last
     311          364 :         // image layer? The "churn" is a fuzzy concept, it's a
     312          364 :         // combination of too many delta files, or too much WAL in
     313          364 :         // total in the delta file. Or perhaps: if creating an image
     314          364 :         // file would allow to delete some older files.
     315          364 :         //
     316          364 :         // 3. After that, we compact all level0 delta files if there
     317          364 :         // are too many of them.  While compacting, we also garbage
     318          364 :         // collect any page versions that are no longer needed because
     319          364 :         // of the new image layers we created in step 2.
     320          364 :         //
     321          364 :         // TODO: This high level strategy hasn't been implemented yet.
     322          364 :         // Below are functions compact_level0() and create_image_layers()
     323          364 :         // but they are a bit ad hoc and don't quite work like it's explained
     324          364 :         // above. Rewrite it.
     325          364 : 
     326          364 :         // Is the timeline being deleted?
     327          364 :         if self.is_stopping() {
     328            0 :             trace!("Dropping out of compaction on timeline shutdown");
     329            0 :             return Err(CompactionError::ShuttingDown);
     330          364 :         }
     331          364 : 
     332          364 :         let target_file_size = self.get_checkpoint_distance();
     333              : 
     334              :         // Define partitioning schema if needed
     335              : 
     336              :         // FIXME: the match should only cover repartitioning, not the next steps
     337          364 :         let (partition_count, has_pending_tasks) = match self
     338          364 :             .repartition(
     339          364 :                 self.get_last_record_lsn(),
     340          364 :                 self.get_compaction_target_size(),
     341          364 :                 flags,
     342          364 :                 ctx,
     343          364 :             )
     344        15721 :             .await
     345              :         {
     346          364 :             Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
     347          364 :                 // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
     348          364 :                 let image_ctx = RequestContextBuilder::extend(ctx)
     349          364 :                     .access_stats_behavior(AccessStatsBehavior::Skip)
     350          364 :                     .build();
     351          364 : 
     352          364 :                 // 2. Compact
     353          364 :                 let timer = self.metrics.compact_time_histo.start_timer();
     354          364 :                 let fully_compacted = self
     355          364 :                     .compact_level0(
     356          364 :                         target_file_size,
     357          364 :                         flags.contains(CompactFlags::ForceL0Compaction),
     358          364 :                         ctx,
     359          364 :                     )
     360         9899 :                     .await?;
     361          364 :                 timer.stop_and_record();
     362          364 : 
     363          364 :                 let mut partitioning = dense_partitioning;
     364          364 :                 partitioning
     365          364 :                     .parts
     366          364 :                     .extend(sparse_partitioning.into_dense().parts);
     367          364 : 
     368          364 :                 // 3. Create new image layers for partitions that have been modified
     369          364 :                 // "enough". Skip image layer creation if L0 compaction cannot keep up.
     370          364 :                 if fully_compacted {
     371          364 :                     let image_layers = self
     372          364 :                         .create_image_layers(
     373          364 :                             &partitioning,
     374          364 :                             lsn,
     375          364 :                             if flags.contains(CompactFlags::ForceImageLayerCreation) {
     376           14 :                                 ImageLayerCreationMode::Force
     377              :                             } else {
     378          350 :                                 ImageLayerCreationMode::Try
     379              :                             },
     380          364 :                             &image_ctx,
     381              :                         )
     382        11439 :                         .await?;
     383              : 
     384          364 :                     self.upload_new_image_layers(image_layers)?;
     385              :                 } else {
     386            0 :                     info!("skipping image layer generation due to L0 compaction did not include all layers.");
     387              :                 }
     388          364 :                 (partitioning.parts.len(), !fully_compacted)
     389              :             }
     390            0 :             Err(err) => {
     391            0 :                 // no partitioning? This is normal, if the timeline was just created
     392            0 :                 // as an empty timeline. Also in unit tests, when we use the timeline
     393            0 :                 // as a simple key-value store, ignoring the datadir layout. Log the
     394            0 :                 // error but continue.
     395            0 :                 //
     396            0 :                 // Suppress error when it's due to cancellation
     397            0 :                 if !self.cancel.is_cancelled() && !err.is_cancelled() {
     398            0 :                     tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
     399            0 :                 }
     400            0 :                 (1, false)
     401              :             }
     402              :         };
     403              : 
     404          364 :         if self.shard_identity.count >= ShardCount::new(2) {
     405              :             // Limit the number of layer rewrites to the number of partitions: this means its
     406              :             // runtime should be comparable to a full round of image layer creations, rather than
     407              :             // being potentially much longer.
     408            0 :             let rewrite_max = partition_count;
     409            0 : 
     410            0 :             self.compact_shard_ancestors(rewrite_max, ctx).await?;
     411          364 :         }
     412              : 
     413          364 :         Ok(has_pending_tasks)
     414          364 :     }
     415              : 
     416              :     /// Check for layers that are eligible to be rewritten:
     417              :     /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
     418              :     ///   we don't indefinitely retain keys in this shard that aren't needed.
     419              :     /// - For future use: layers beyond pitr_interval that are in formats we would
     420              :     ///   rather not maintain compatibility with indefinitely.
     421              :     ///
     422              :     /// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound
     423              :     /// how much work it will try to do in each compaction pass.
     424            0 :     async fn compact_shard_ancestors(
     425            0 :         self: &Arc<Self>,
     426            0 :         rewrite_max: usize,
     427            0 :         ctx: &RequestContext,
     428            0 :     ) -> Result<(), CompactionError> {
     429            0 :         let mut drop_layers = Vec::new();
     430            0 :         let mut layers_to_rewrite: Vec<Layer> = Vec::new();
     431            0 : 
     432            0 :         // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
     433            0 :         // layer is behind this Lsn, it indicates that the layer is being retained beyond the
     434            0 :         // pitr_interval, for example because a branchpoint references it.
     435            0 :         //
     436            0 :         // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
     437            0 :         // are rewriting layers.
     438            0 :         let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
     439            0 : 
     440            0 :         tracing::info!(
     441            0 :             "latest_gc_cutoff: {}, pitr cutoff {}",
     442            0 :             *latest_gc_cutoff,
     443            0 :             self.gc_info.read().unwrap().cutoffs.time
     444              :         );
     445              : 
     446            0 :         let layers = self.layers.read().await;
     447            0 :         for layer_desc in layers.layer_map()?.iter_historic_layers() {
     448            0 :             let layer = layers.get_from_desc(&layer_desc);
     449            0 :             if layer.metadata().shard.shard_count == self.shard_identity.count {
     450              :                 // This layer does not belong to a historic ancestor, no need to re-image it.
     451            0 :                 continue;
     452            0 :             }
     453            0 : 
     454            0 :             // This layer was created on an ancestor shard: check if it contains any data for this shard.
     455            0 :             let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity);
     456            0 :             let layer_local_page_count = sharded_range.page_count();
     457            0 :             let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range());
     458            0 :             if layer_local_page_count == 0 {
     459              :                 // This ancestral layer only covers keys that belong to other shards.
     460              :                 // We include the full metadata in the log: if we had some critical bug that caused
     461              :                 // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
     462            0 :                 info!(%layer, old_metadata=?layer.metadata(),
     463            0 :                     "dropping layer after shard split, contains no keys for this shard.",
     464              :                 );
     465              : 
     466            0 :                 if cfg!(debug_assertions) {
     467              :                     // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being
     468              :                     // wrong.  If ShardedRange claims the local page count is zero, then no keys in this layer
     469              :                     // should be !is_key_disposable()
     470            0 :                     let range = layer_desc.get_key_range();
     471            0 :                     let mut key = range.start;
     472            0 :                     while key < range.end {
     473            0 :                         debug_assert!(self.shard_identity.is_key_disposable(&key));
     474            0 :                         key = key.next();
     475              :                     }
     476            0 :                 }
     477              : 
     478            0 :                 drop_layers.push(layer);
     479            0 :                 continue;
     480            0 :             } else if layer_local_page_count != u32::MAX
     481            0 :                 && layer_local_page_count == layer_raw_page_count
     482              :             {
     483            0 :                 debug!(%layer,
     484            0 :                     "layer is entirely shard local ({} keys), no need to filter it",
     485              :                     layer_local_page_count
     486              :                 );
     487            0 :                 continue;
     488            0 :             }
     489            0 : 
     490            0 :             // Don't bother re-writing a layer unless it will at least halve its size
     491            0 :             if layer_local_page_count != u32::MAX
     492            0 :                 && layer_local_page_count > layer_raw_page_count / 2
     493              :             {
     494            0 :                 debug!(%layer,
     495            0 :                     "layer is already mostly local ({}/{}), not rewriting",
     496              :                     layer_local_page_count,
     497              :                     layer_raw_page_count
     498              :                 );
     499            0 :             }
     500              : 
     501              :             // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
     502              :             // without incurring the I/O cost of a rewrite.
     503            0 :             if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
     504            0 :                 debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
     505            0 :                     layer_desc.get_lsn_range().end, *latest_gc_cutoff);
     506            0 :                 continue;
     507            0 :             }
     508            0 : 
     509            0 :             if layer_desc.is_delta() {
     510              :                 // We do not yet implement rewrite of delta layers
     511            0 :                 debug!(%layer, "Skipping rewrite of delta layer");
     512            0 :                 continue;
     513            0 :             }
     514            0 : 
     515            0 :             // Only rewrite layers if their generations differ.  This guarantees:
     516            0 :             //  - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
     517            0 :             //  - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
     518            0 :             if layer.metadata().generation == self.generation {
     519            0 :                 debug!(%layer, "Skipping rewrite, is not from old generation");
     520            0 :                 continue;
     521            0 :             }
     522            0 : 
     523            0 :             if layers_to_rewrite.len() >= rewrite_max {
     524            0 :                 tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
     525            0 :                     layers_to_rewrite.len()
     526              :                 );
     527            0 :                 continue;
     528            0 :             }
     529            0 : 
     530            0 :             // Fall through: all our conditions for doing a rewrite passed.
     531            0 :             layers_to_rewrite.push(layer);
     532              :         }
     533              : 
     534              :         // Drop read lock on layer map before we start doing time-consuming I/O
     535            0 :         drop(layers);
     536            0 : 
     537            0 :         let mut replace_image_layers = Vec::new();
     538              : 
     539            0 :         for layer in layers_to_rewrite {
     540            0 :             tracing::info!(layer=%layer, "Rewriting layer after shard split...");
     541            0 :             let mut image_layer_writer = ImageLayerWriter::new(
     542            0 :                 self.conf,
     543            0 :                 self.timeline_id,
     544            0 :                 self.tenant_shard_id,
     545            0 :                 &layer.layer_desc().key_range,
     546            0 :                 layer.layer_desc().image_layer_lsn(),
     547            0 :                 ctx,
     548            0 :             )
     549            0 :             .await
     550            0 :             .map_err(CompactionError::Other)?;
     551              : 
     552              :             // Safety of layer rewrites:
     553              :             // - We are writing to a different local file path than we are reading from, so the old Layer
     554              :             //   cannot interfere with the new one.
     555              :             // - In the page cache, contents for a particular VirtualFile are stored with a file_id that
     556              :             //   is different for two layers with the same name (in `ImageLayerInner::new` we always
     557              :             //   acquire a fresh id from [`crate::page_cache::next_file_id`].  So readers do not risk
     558              :             //   reading the index from one layer file, and then data blocks from the rewritten layer file.
     559              :             // - Any readers that have a reference to the old layer will keep it alive until they are done
     560              :             //   with it. If they are trying to promote from remote storage, that will fail, but this is the same
     561              :             //   as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
     562              :             // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
     563              :             //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
     564              :             //    - ingestion, which only inserts layers, therefore cannot collide with us.
     565            0 :             let resident = layer.download_and_keep_resident().await?;
     566              : 
     567            0 :             let keys_written = resident
     568            0 :                 .filter(&self.shard_identity, &mut image_layer_writer, ctx)
     569            0 :                 .await?;
     570              : 
     571            0 :             if keys_written > 0 {
     572            0 :                 let (desc, path) = image_layer_writer
     573            0 :                     .finish(ctx)
     574            0 :                     .await
     575            0 :                     .map_err(CompactionError::Other)?;
     576            0 :                 let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
     577            0 :                     .map_err(CompactionError::Other)?;
     578            0 :                 tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
     579            0 :                     layer.metadata().file_size,
     580            0 :                     new_layer.metadata().file_size);
     581              : 
     582            0 :                 replace_image_layers.push((layer, new_layer));
     583            0 :             } else {
     584            0 :                 // Drop the old layer.  Usually for this case we would already have noticed that
     585            0 :                 // the layer has no data for us with the ShardedRange check above, but the filter just wrote zero keys for this shard, so drop it here.
     586            0 :                 drop_layers.push(layer);
     587            0 :             }
     588              :         }
     589              : 
     590              :         // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
     591              :         // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
     592              :         // to remote index) and be removed. This is inefficient but safe.
     593            0 :         fail::fail_point!("compact-shard-ancestors-localonly");
     594            0 : 
     595            0 :         // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
     596            0 :         self.rewrite_layers(replace_image_layers, drop_layers)
     597            0 :             .await?;
     598              : 
     599            0 :         fail::fail_point!("compact-shard-ancestors-enqueued");
     600            0 : 
     601            0 :         // We wait for all uploads to complete before finishing this compaction stage.  This is not
     602            0 :         // necessary for correctness, but it simplifies testing, and avoids proceeding with another
     603            0 :         // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
     604            0 :         // load.
     605            0 :         match self.remote_client.wait_completion().await {
     606            0 :             Ok(()) => (),
     607            0 :             Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
     608              :             Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
     609            0 :                 return Err(CompactionError::ShuttingDown)
     610              :             }
     611              :         }
     612              : 
     613            0 :         fail::fail_point!("compact-shard-ancestors-persistent");
     614            0 : 
     615            0 :         Ok(())
     616            0 :     }
     617              : 
     618              :     /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
     619              :     /// an image layer between them and the most recent readable LSN (branch point or tip of timeline).  The
     620              :     /// purpose of the visibility hint is to record which layers need to be available to service reads.
     621              :     ///
     622              :     /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
     623              :     /// that we know won't be needed for reads.  The only error returned is the `Shutdown` propagated from `layer_map()`.
     624          196 :     pub(super) async fn update_layer_visibility(
     625          196 :         &self,
     626          196 :     ) -> Result<(), super::layer_manager::Shutdown> {
     627          196 :         let head_lsn = self.get_last_record_lsn(); // always included as a readable point (pushed last, below)
     628              : 
     629              :         // We will sweep through layers in reverse-LSN order.  We only do historic layers.  L0 deltas
     630              :         // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
     631              :         // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
     632              :         // they will be subject to L0->L1 compaction in the near future.
     633          196 :         let layer_manager = self.layers.read().await;
     634          196 :         let layer_map = layer_manager.layer_map()?;
     635              : 
     636          196 :         let readable_points = {
     637          196 :             let children = self.gc_info.read().unwrap().retain_lsns.clone(); // clone so the temporary gc_info read guard is dropped at the end of this statement
     638          196 : 
     639          196 :             let mut readable_points = Vec::with_capacity(children.len() + 1); // +1 for head_lsn
     640          196 :             for (child_lsn, _child_timeline_id, is_offloaded) in &children {
     641            0 :                 if *is_offloaded == MaybeOffloaded::Yes { // offloaded children do not contribute a readable point
     642            0 :                     continue;
     643            0 :                 }
     644            0 :                 readable_points.push(*child_lsn);
     645              :             }
     646          196 :             readable_points.push(head_lsn);
     647          196 :             readable_points
     648          196 :         };
     649          196 : 
     650          196 :         let (layer_visibility, covered) = layer_map.get_visibility(readable_points);
     651          512 :         for (layer_desc, visibility) in layer_visibility {
     652          316 :             // FIXME: a more efficient bulk zip() through the layers rather than NlogN getting each one
     653          316 :             let layer = layer_manager.get_from_desc(&layer_desc);
     654          316 :             layer.set_visibility(visibility);
     655          316 :         }
     656              : 
     657              :         // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
     658              :         // avoid assuming that everything at a branch point is visible.  Until then `covered` is unused and simply dropped.
     659          196 :         drop(covered);
     660          196 :         Ok(())
     661          196 :     }
     662              : 
     663              :     /// Collect a bunch of Level 0 layer files, and compact and reshuffle them
     664              :     /// as Level 1 files. Returns `Ok(true)` when phase 1 produced no new layers and selected no deltas; otherwise returns phase 1's `fully_compacted` flag once `finish_compact_batch` has succeeded.
     665          364 :     async fn compact_level0(
     666          364 :         self: &Arc<Self>,
     667          364 :         target_file_size: u64,
     668          364 :         force_compaction_ignore_threshold: bool,
     669          364 :         ctx: &RequestContext,
     670          364 :     ) -> Result<bool, CompactionError> {
     671              :         let CompactLevel0Phase1Result {
     672          364 :             new_layers,
     673          364 :             deltas_to_compact,
     674          364 :             fully_compacted,
     675              :         } = {
     676          364 :             let phase1_span = info_span!("compact_level0_phase1");
     677          364 :             let ctx = ctx.attached_child(); // shadows the caller's ctx within this block only
     678          364 :             let mut stats = CompactLevel0Phase1StatsBuilder {
     679          364 :                 version: Some(2),
     680          364 :                 tenant_id: Some(self.tenant_shard_id),
     681          364 :                 timeline_id: Some(self.timeline_id),
     682          364 :                 ..Default::default()
     683          364 :             };
     684          364 : 
     685          364 :             let begin = tokio::time::Instant::now(); // timestamp taken before acquiring the layers read lock
     686          364 :             let phase1_layers_locked = self.layers.read().await;
     687          364 :             let now = tokio::time::Instant::now();
     688          364 :             stats.read_lock_acquisition_micros =
     689          364 :                 DurationRecorder::Recorded(RecordedDuration(now - begin), now); // how long acquiring the read lock took
     690          364 :             self.compact_level0_phase1(
     691          364 :                 phase1_layers_locked, // the read guard is moved into phase 1
     692          364 :                 stats,
     693          364 :                 target_file_size,
     694          364 :                 force_compaction_ignore_threshold,
     695          364 :                 &ctx,
     696          364 :             )
     697          364 :             .instrument(phase1_span)
     698         9897 :             .await?
     699              :         };
     700              : 
     701          364 :         if new_layers.is_empty() && deltas_to_compact.is_empty() {
     702              :             // nothing to do: phase 1 neither wrote new layers nor picked any deltas
     703          336 :             return Ok(true);
     704           28 :         }
     705           28 : 
     706           28 :         self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) // NOTE(review): the empty list is presumably "new image layers" — confirm against finish_compact_batch
     707            1 :             .await?;
     708           28 :         Ok(fully_compacted)
     709          364 :     }
     710              : 
     711              :     /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
     712          364 :     async fn compact_level0_phase1<'a>(
     713          364 :         self: &'a Arc<Self>,
     714          364 :         guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
     715          364 :         mut stats: CompactLevel0Phase1StatsBuilder,
     716          364 :         target_file_size: u64,
     717          364 :         force_compaction_ignore_threshold: bool,
     718          364 :         ctx: &RequestContext,
     719          364 :     ) -> Result<CompactLevel0Phase1Result, CompactionError> {
     720          364 :         stats.read_lock_held_spawn_blocking_startup_micros =
     721          364 :             stats.read_lock_acquisition_micros.till_now(); // set by caller
     722          364 :         let layers = guard.layer_map()?;
     723          364 :         let level0_deltas = layers.level0_deltas();
     724          364 :         stats.level0_deltas_count = Some(level0_deltas.len());
     725          364 : 
     726          364 :         // Only compact if enough layers have accumulated.
     727          364 :         let threshold = self.get_compaction_threshold();
     728          364 :         if level0_deltas.is_empty() || level0_deltas.len() < threshold {
     729          336 :             if force_compaction_ignore_threshold {
     730            0 :                 if !level0_deltas.is_empty() {
     731            0 :                     info!(
     732            0 :                         level0_deltas = level0_deltas.len(),
     733            0 :                         threshold, "too few deltas to compact, but forcing compaction"
     734              :                     );
     735              :                 } else {
     736            0 :                     info!(
     737            0 :                         level0_deltas = level0_deltas.len(),
     738            0 :                         threshold, "too few deltas to compact, cannot force compaction"
     739              :                     );
     740            0 :                     return Ok(CompactLevel0Phase1Result::default());
     741              :                 }
     742              :             } else {
     743          336 :                 debug!(
     744            0 :                     level0_deltas = level0_deltas.len(),
     745            0 :                     threshold, "too few deltas to compact"
     746              :                 );
     747          336 :                 return Ok(CompactLevel0Phase1Result::default());
     748              :             }
     749           28 :         }
     750              : 
     751           28 :         let mut level0_deltas = level0_deltas
     752           28 :             .iter()
     753          402 :             .map(|x| guard.get_from_desc(x))
     754           28 :             .collect::<Vec<_>>();
     755           28 : 
     756           28 :         // Gather the files to compact in this iteration.
     757           28 :         //
     758           28 :         // Start with the oldest Level 0 delta file, and collect any other
     759           28 :         // level 0 files that form a contiguous sequence, such that the end
     760           28 :         // LSN of previous file matches the start LSN of the next file.
     761           28 :         //
     762           28 :         // Note that if the files don't form such a sequence, we might
     763           28 :         // "compact" just a single file. That's a bit pointless, but it allows
     764           28 :         // us to get rid of the level 0 file, and compact the other files on
     765           28 :         // the next iteration. This could probably made smarter, but such
     766           28 :         // "gaps" in the sequence of level 0 files should only happen in case
     767           28 :         // of a crash, partial download from cloud storage, or something like
     768           28 :         // that, so it's not a big deal in practice.
     769          748 :         level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
     770           28 :         let mut level0_deltas_iter = level0_deltas.iter();
     771           28 : 
     772           28 :         let first_level0_delta = level0_deltas_iter.next().unwrap();
     773           28 :         let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
     774           28 :         let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
     775           28 : 
     776           28 :         // Accumulate the size of layers in `deltas_to_compact`
     777           28 :         let mut deltas_to_compact_bytes = 0;
     778           28 : 
     779           28 :         // Under normal circumstances, we will accumulate up to compaction_interval L0s of size
     780           28 :         // checkpoint_distance each.  To avoid edge cases using extra system resources, bound our
     781           28 :         // work in this function to only operate on this much delta data at once.
     782           28 :         //
     783           28 :         // Take the max of the configured value & the default, so that tests that configure tiny values
     784           28 :         // can still use a sensible amount of memory, but if a deployed system configures bigger values we
     785           28 :         // still let them compact a full stack of L0s in one go.
     786           28 :         let delta_size_limit = std::cmp::max(
     787           28 :             self.get_compaction_threshold(),
     788           28 :             DEFAULT_COMPACTION_THRESHOLD,
     789           28 :         ) as u64
     790           28 :             * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
     791           28 : 
     792           28 :         let mut fully_compacted = true;
     793           28 : 
     794           28 :         deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
     795          402 :         for l in level0_deltas_iter {
     796          374 :             let lsn_range = &l.layer_desc().lsn_range;
     797          374 : 
     798          374 :             if lsn_range.start != prev_lsn_end {
     799            0 :                 break;
     800          374 :             }
     801          374 :             deltas_to_compact.push(l.download_and_keep_resident().await?);
     802          374 :             deltas_to_compact_bytes += l.metadata().file_size;
     803          374 :             prev_lsn_end = lsn_range.end;
     804          374 : 
     805          374 :             if deltas_to_compact_bytes >= delta_size_limit {
     806            0 :                 info!(
     807            0 :                     l0_deltas_selected = deltas_to_compact.len(),
     808            0 :                     l0_deltas_total = level0_deltas.len(),
     809            0 :                     "L0 compaction picker hit max delta layer size limit: {}",
     810              :                     delta_size_limit
     811              :                 );
     812            0 :                 fully_compacted = false;
     813            0 : 
     814            0 :                 // Proceed with compaction, but only a subset of L0s
     815            0 :                 break;
     816          374 :             }
     817              :         }
     818           28 :         let lsn_range = Range {
     819           28 :             start: deltas_to_compact
     820           28 :                 .first()
     821           28 :                 .unwrap()
     822           28 :                 .layer_desc()
     823           28 :                 .lsn_range
     824           28 :                 .start,
     825           28 :             end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
     826           28 :         };
     827           28 : 
     828           28 :         info!(
     829            0 :             "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
     830            0 :             lsn_range.start,
     831            0 :             lsn_range.end,
     832            0 :             deltas_to_compact.len(),
     833            0 :             level0_deltas.len()
     834              :         );
     835              : 
     836          402 :         for l in deltas_to_compact.iter() {
     837          402 :             info!("compact includes {l}");
     838              :         }
     839              : 
     840              :         // We don't need the original list of layers anymore. Drop it so that
     841              :         // we don't accidentally use it later in the function.
     842           28 :         drop(level0_deltas);
     843           28 : 
     844           28 :         stats.read_lock_held_prerequisites_micros = stats
     845           28 :             .read_lock_held_spawn_blocking_startup_micros
     846           28 :             .till_now();
     847              : 
     848              :         // TODO: replace with streaming k-merge
     849           28 :         let all_keys = {
     850           28 :             let mut all_keys = Vec::new();
     851          402 :             for l in deltas_to_compact.iter() {
     852          402 :                 if self.cancel.is_cancelled() {
     853            0 :                     return Err(CompactionError::ShuttingDown);
     854          402 :                 }
     855          402 :                 let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
     856          402 :                 let keys = delta
     857          402 :                     .index_entries(ctx)
     858         2124 :                     .await
     859          402 :                     .map_err(CompactionError::Other)?;
     860          402 :                 all_keys.extend(keys);
     861              :             }
     862              :             // The current stdlib sorting implementation is designed in a way where it is
     863              :             // particularly fast where the slice is made up of sorted sub-ranges.
     864      4423784 :             all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
     865           28 :             all_keys
     866           28 :         };
     867           28 : 
     868           28 :         stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
     869              : 
     870              :         // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
     871              :         //
     872              :         // A hole is a key range for which this compaction doesn't have any WAL records.
     873              :         // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range,
     874              :         // cover the hole, but actually don't contain any WAL records for that key range.
     875              :         // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`).
     876              :         // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records.
     877              :         //
     878              :         // The algorithm chooses holes as follows.
     879              :         // - Slide a 2-window over the keys in key order to get the hole range (=distance between two keys).
     880              :         // - Filter: min threshold on range length
     881              :         // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data)
     882              :         //
     883              :         // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451
     884              :         #[derive(PartialEq, Eq)]
     885              :         struct Hole {
     886              :             key_range: Range<Key>,
     887              :             coverage_size: usize,
     888              :         }
     889           28 :         let holes: Vec<Hole> = {
     890              :             use std::cmp::Ordering;
     891              :             impl Ord for Hole {
     892            0 :                 fn cmp(&self, other: &Self) -> Ordering {
     893            0 :                     self.coverage_size.cmp(&other.coverage_size).reverse()
     894            0 :                 }
     895              :             }
     896              :             impl PartialOrd for Hole {
     897            0 :                 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
     898            0 :                     Some(self.cmp(other))
     899            0 :                 }
     900              :             }
     901           28 :             let max_holes = deltas_to_compact.len();
     902           28 :             let last_record_lsn = self.get_last_record_lsn();
     903           28 :             let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
     904           28 :             let min_hole_coverage_size = 3; // TODO: something more flexible?
     905           28 :                                             // min-heap (reserve space for one more element added before eviction)
     906           28 :             let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
     907           28 :             let mut prev: Option<Key> = None;
     908              : 
     909      2064038 :             for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
     910      2064038 :                 if let Some(prev_key) = prev {
     911              :                     // just first fast filter, do not create hole entries for metadata keys. The last hole in the
     912              :                     // compaction is the gap between data key and metadata keys.
     913      2064010 :                     if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
     914            0 :                         && !Key::is_metadata_key(&prev_key)
     915              :                     {
     916            0 :                         let key_range = prev_key..next_key;
     917            0 :                         // Measuring a hole by simply subtracting the i128 representations of the key range boundaries
     918            0 :                         // does not make much sense, because the largest holes would correspond to field1/field2 changes.
     919            0 :                         // We are mostly interested in eliminating holes which cause generation of excessive image layers,
     920            0 :                         // which is why it is better to measure the size of a hole as the number of covering image layers.
     921            0 :                         let coverage_size =
     922            0 :                             layers.image_coverage(&key_range, last_record_lsn).len();
     923            0 :                         if coverage_size >= min_hole_coverage_size {
     924            0 :                             heap.push(Hole {
     925            0 :                                 key_range,
     926            0 :                                 coverage_size,
     927            0 :                             });
     928            0 :                             if heap.len() > max_holes {
     929            0 :                                 heap.pop(); // remove smallest hole
     930            0 :                             }
     931            0 :                         }
     932      2064010 :                     }
     933           28 :                 }
     934      2064038 :                 prev = Some(next_key.next());
     935              :             }
     936           28 :             let mut holes = heap.into_vec();
     937           28 :             holes.sort_unstable_by_key(|hole| hole.key_range.start);
     938           28 :             holes
     939           28 :         };
     940           28 :         stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
     941           28 :         drop_rlock(guard);
     942           28 : 
     943           28 :         if self.cancel.is_cancelled() {
     944            0 :             return Err(CompactionError::ShuttingDown);
     945           28 :         }
     946           28 : 
     947           28 :         stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
     948              : 
     949              :         // This iterator walks through all key-value pairs from all the layers
     950              :         // we're compacting, in key, LSN order.
     951              :         // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
     952              :         // then the Value::Image is ordered before Value::WalRecord.
     953           28 :         let mut all_values_iter = {
     954           28 :             let mut deltas = Vec::with_capacity(deltas_to_compact.len());
     955          402 :             for l in deltas_to_compact.iter() {
     956          402 :                 let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
     957          402 :                 deltas.push(l);
     958              :             }
     959           28 :             MergeIterator::create(&deltas, &[], ctx)
     960           28 :         };
     961           28 : 
     962           28 :         // This iterator walks through all keys and is needed to calculate size used by each key
     963           28 :         let mut all_keys_iter = all_keys
     964           28 :             .iter()
     965      2064038 :             .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
     966      2064010 :             .coalesce(|mut prev, cur| {
     967      2064010 :                 // Coalesce keys that belong to the same key pair.
     968      2064010 :                 // This ensures that compaction doesn't put them
     969      2064010 :                 // into different layer files.
     970      2064010 :                 // Still limit this by the target file size,
     971      2064010 :                 // so that we keep the size of the files in
     972      2064010 :                 // check.
     973      2064010 :                 if prev.0 == cur.0 && prev.2 < target_file_size {
     974        40038 :                     prev.2 += cur.2;
     975        40038 :                     Ok(prev)
     976              :                 } else {
     977      2023972 :                     Err((prev, cur))
     978              :                 }
     979      2064010 :             });
     980           28 : 
     981           28 :         // Merge the contents of all the input delta layers into a new set
     982           28 :         // of delta layers, based on the current partitioning.
     983           28 :         //
     984           28 :         // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one.
     985           28 :         // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
     986           28 :         // would be too large. In that case, we also split on the LSN dimension.
     987           28 :         //
     988           28 :         // LSN
     989           28 :         //  ^
     990           28 :         //  |
     991           28 :         //  | +-----------+            +--+--+--+--+
     992           28 :         //  | |           |            |  |  |  |  |
     993           28 :         //  | +-----------+            |  |  |  |  |
     994           28 :         //  | |           |            |  |  |  |  |
     995           28 :         //  | +-----------+     ==>    |  |  |  |  |
     996           28 :         //  | |           |            |  |  |  |  |
     997           28 :         //  | +-----------+            |  |  |  |  |
     998           28 :         //  | |           |            |  |  |  |  |
     999           28 :         //  | +-----------+            +--+--+--+--+
    1000           28 :         //  |
    1001           28 :         //  +--------------> key
    1002           28 :         //
    1003           28 :         //
    1004           28 :         // If one key (X) has a lot of page versions:
    1005           28 :         //
    1006           28 :         // LSN
    1007           28 :         //  ^
    1008           28 :         //  |                                 (X)
    1009           28 :         //  | +-----------+            +--+--+--+--+
    1010           28 :         //  | |           |            |  |  |  |  |
    1011           28 :         //  | +-----------+            |  |  +--+  |
    1012           28 :         //  | |           |            |  |  |  |  |
    1013           28 :         //  | +-----------+     ==>    |  |  |  |  |
    1014           28 :         //  | |           |            |  |  +--+  |
    1015           28 :         //  | +-----------+            |  |  |  |  |
    1016           28 :         //  | |           |            |  |  |  |  |
    1017           28 :         //  | +-----------+            +--+--+--+--+
    1018           28 :         //  |
    1019           28 :         //  +--------------> key
    1020           28 :         // TODO: this actually divides the layers into fixed-size chunks, not
    1021           28 :         // based on the partitioning.
    1022           28 :         //
    1023           28 :         // TODO: we should also opportunistically materialize and
    1024           28 :         // garbage collect what we can.
    1025           28 :         let mut new_layers = Vec::new();
    1026           28 :         let mut prev_key: Option<Key> = None;
    1027           28 :         let mut writer: Option<DeltaLayerWriter> = None;
    1028           28 :         let mut key_values_total_size = 0u64;
    1029           28 :         let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
    1030           28 :         let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
    1031           28 :         let mut next_hole = 0; // index of next hole in holes vector
    1032           28 : 
    1033           28 :         let mut keys = 0;
    1034              : 
    1035      2064066 :         while let Some((key, lsn, value)) = all_values_iter
    1036      2064066 :             .next()
    1037         3425 :             .await
    1038      2064066 :             .map_err(CompactionError::Other)?
    1039              :         {
    1040      2064038 :             keys += 1;
    1041      2064038 : 
    1042      2064038 :             if keys % 32_768 == 0 && self.cancel.is_cancelled() {
    1043              :                 // avoid hitting the cancellation token on every key. in benches, we end up
    1044              :                 // shuffling on the order of a million keys per layer, which means we'll check it
    1045              :                 // around tens of times per layer.
    1046            0 :                 return Err(CompactionError::ShuttingDown);
    1047      2064038 :             }
    1048      2064038 : 
    1049      2064038 :             let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
    1050      2064038 :             // We need to check key boundaries once we reach next key or end of layer with the same key
    1051      2064038 :             if !same_key || lsn == dup_end_lsn {
    1052      2024000 :                 let mut next_key_size = 0u64;
    1053      2024000 :                 let is_dup_layer = dup_end_lsn.is_valid();
    1054      2024000 :                 dup_start_lsn = Lsn::INVALID;
    1055      2024000 :                 if !same_key {
    1056      2024000 :                     dup_end_lsn = Lsn::INVALID;
    1057      2024000 :                 }
    1058              :                 // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
    1059      2024000 :                 for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
    1060      2024000 :                     next_key_size = next_size;
    1061      2024000 :                     if key != next_key {
    1062      2023972 :                         if dup_end_lsn.is_valid() {
    1063            0 :                             // We are writing a segment with duplicates:
    1064            0 :                             // place all remaining values of this key in separate segment
    1065            0 :                             dup_start_lsn = dup_end_lsn; // new segments starts where old stops
    1066            0 :                             dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
    1067      2023972 :                         }
    1068      2023972 :                         break;
    1069           28 :                     }
    1070           28 :                     key_values_total_size += next_size;
    1071           28 :                     // Check if it is time to split segment: if total keys size is larger than target file size.
    1072           28 :                     // We need to avoid generation of empty segments if next_size > target_file_size.
    1073           28 :                     if key_values_total_size > target_file_size && lsn != next_lsn {
    1074              :                         // Split key between multiple layers: such layer can contain only single key
    1075            0 :                         dup_start_lsn = if dup_end_lsn.is_valid() {
    1076            0 :                             dup_end_lsn // new segment with duplicates starts where old one stops
    1077              :                         } else {
    1078            0 :                             lsn // start with the first LSN for this key
    1079              :                         };
    1080            0 :                         dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
    1081            0 :                         break;
    1082           28 :                     }
    1083              :                 }
    1084              :                 // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
    1085      2024000 :                 if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
    1086            0 :                     dup_start_lsn = dup_end_lsn;
    1087            0 :                     dup_end_lsn = lsn_range.end;
    1088      2024000 :                 }
    1089      2024000 :                 if writer.is_some() {
    1090      2023972 :                     let written_size = writer.as_mut().unwrap().size();
    1091      2023972 :                     let contains_hole =
    1092      2023972 :                         next_hole < holes.len() && key >= holes[next_hole].key_range.end;
    1093              :                     // check if key causes layer overflow or contains hole...
    1094      2023972 :                     if is_dup_layer
    1095      2023972 :                         || dup_end_lsn.is_valid()
    1096      2023972 :                         || written_size + key_values_total_size > target_file_size
    1097      2023692 :                         || contains_hole
    1098              :                     {
    1099              :                         // ... if so, flush previous layer and prepare to write new one
    1100          280 :                         let (desc, path) = writer
    1101          280 :                             .take()
    1102          280 :                             .unwrap()
    1103          280 :                             .finish(prev_key.unwrap().next(), ctx)
    1104          711 :                             .await
    1105          280 :                             .map_err(CompactionError::Other)?;
    1106          280 :                         let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
    1107          280 :                             .map_err(CompactionError::Other)?;
    1108              : 
    1109          280 :                         new_layers.push(new_delta);
    1110          280 :                         writer = None;
    1111          280 : 
    1112          280 :                         if contains_hole {
    1113            0 :                             // skip hole
    1114            0 :                             next_hole += 1;
    1115          280 :                         }
    1116      2023692 :                     }
    1117           28 :                 }
    1118              :                 // Remember size of key value because at next iteration we will access next item
    1119      2024000 :                 key_values_total_size = next_key_size;
    1120        40038 :             }
    1121      2064038 :             fail_point!("delta-layer-writer-fail-before-finish", |_| {
    1122            0 :                 Err(CompactionError::Other(anyhow::anyhow!(
    1123            0 :                     "failpoint delta-layer-writer-fail-before-finish"
    1124            0 :                 )))
    1125      2064038 :             });
    1126              : 
    1127      2064038 :             if !self.shard_identity.is_key_disposable(&key) {
    1128      2064038 :                 if writer.is_none() {
    1129          308 :                     if self.cancel.is_cancelled() {
    1130              :                         // to be somewhat responsive to cancellation, check for each new layer
    1131            0 :                         return Err(CompactionError::ShuttingDown);
    1132          308 :                     }
    1133              :                     // Create writer if not initialized yet
    1134          308 :                     writer = Some(
    1135              :                         DeltaLayerWriter::new(
    1136          308 :                             self.conf,
    1137          308 :                             self.timeline_id,
    1138          308 :                             self.tenant_shard_id,
    1139          308 :                             key,
    1140          308 :                             if dup_end_lsn.is_valid() {
    1141              :                                 // this is a layer containing slice of values of the same key
    1142            0 :                                 debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
    1143            0 :                                 dup_start_lsn..dup_end_lsn
    1144              :                             } else {
    1145          308 :                                 debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
    1146          308 :                                 lsn_range.clone()
    1147              :                             },
    1148          308 :                             ctx,
    1149              :                         )
    1150          154 :                         .await
    1151          308 :                         .map_err(CompactionError::Other)?,
    1152              :                     );
    1153              : 
    1154          308 :                     keys = 0;
    1155      2063730 :                 }
    1156              : 
    1157      2064038 :                 writer
    1158      2064038 :                     .as_mut()
    1159      2064038 :                     .unwrap()
    1160      2064038 :                     .put_value(key, lsn, value, ctx)
    1161         1225 :                     .await
    1162      2064038 :                     .map_err(CompactionError::Other)?;
    1163              :             } else {
    1164            0 :                 debug!(
    1165            0 :                     "Dropping key {} during compaction (it belongs on shard {:?})",
    1166            0 :                     key,
    1167            0 :                     self.shard_identity.get_shard_number(&key)
    1168              :                 );
    1169              :             }
    1170              : 
    1171      2064038 :             if !new_layers.is_empty() {
    1172        19786 :                 fail_point!("after-timeline-compacted-first-L1");
    1173      2044252 :             }
    1174              : 
    1175      2064038 :             prev_key = Some(key);
    1176              :         }
    1177           28 :         if let Some(writer) = writer {
    1178           28 :             let (desc, path) = writer
    1179           28 :                 .finish(prev_key.unwrap().next(), ctx)
    1180         1988 :                 .await
    1181           28 :                 .map_err(CompactionError::Other)?;
    1182           28 :             let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
    1183           28 :                 .map_err(CompactionError::Other)?;
    1184           28 :             new_layers.push(new_delta);
    1185            0 :         }
    1186              : 
    1187              :         // Sync layers
    1188           28 :         if !new_layers.is_empty() {
    1189              :             // Print a warning if the created layer is larger than double the target size
    1190              :             // Add two pages for potential overhead. This should in theory be already
    1191              :             // accounted for in the target calculation, but for very small targets,
    1192              :             // we still might easily hit the limit otherwise.
    1193           28 :             let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
    1194          308 :             for layer in new_layers.iter() {
    1195          308 :                 if layer.layer_desc().file_size > warn_limit {
    1196            0 :                     warn!(
    1197              :                         %layer,
    1198            0 :                         "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
    1199              :                     );
    1200          308 :                 }
    1201              :             }
    1202              : 
    1203              :             // The writer.finish() above already did the fsync of the inodes.
    1204              :             // We just need to fsync the directory in which these inodes are linked,
    1205              :             // which we know to be the timeline directory.
    1206              :             //
    1207              :             // We use fatal_err() below because after writer.finish() returns with success,
    1208              :             // the in-memory state of the filesystem already has the layer file in its final place,
    1209              :             // and subsequent pageserver code could think it's durable while it really isn't.
    1210           28 :             let timeline_dir = VirtualFile::open(
    1211           28 :                 &self
    1212           28 :                     .conf
    1213           28 :                     .timeline_path(&self.tenant_shard_id, &self.timeline_id),
    1214           28 :                 ctx,
    1215           28 :             )
    1216           14 :             .await
    1217           28 :             .fatal_err("VirtualFile::open for timeline dir fsync");
    1218           28 :             timeline_dir
    1219           28 :                 .sync_all()
    1220           14 :                 .await
    1221           28 :                 .fatal_err("VirtualFile::sync_all timeline dir");
    1222            0 :         }
    1223              : 
    1224           28 :         stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
    1225           28 :         stats.new_deltas_count = Some(new_layers.len());
    1226          308 :         stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
    1227           28 : 
    1228           28 :         match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
    1229           28 :             .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
    1230              :         {
    1231           28 :             Ok(stats_json) => {
    1232           28 :                 info!(
    1233            0 :                     stats_json = stats_json.as_str(),
    1234            0 :                     "compact_level0_phase1 stats available"
    1235              :                 )
    1236              :             }
    1237            0 :             Err(e) => {
    1238            0 :                 warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
    1239              :             }
    1240              :         }
    1241              : 
    1242              :         // Without this, rustc complains about deltas_to_compact still
    1243              :         // being borrowed when we `.into_iter()` below.
    1244           28 :         drop(all_values_iter);
    1245           28 : 
    1246           28 :         Ok(CompactLevel0Phase1Result {
    1247           28 :             new_layers,
    1248           28 :             deltas_to_compact: deltas_to_compact
    1249           28 :                 .into_iter()
    1250          402 :                 .map(|x| x.drop_eviction_guard())
    1251           28 :                 .collect::<Vec<_>>(),
    1252           28 :             fully_compacted,
    1253           28 :         })
    1254          364 :     }
    1255              : }
    1256              : 
    1257              : #[derive(Default)]
    1258              : struct CompactLevel0Phase1Result {
    1259              :     new_layers: Vec<ResidentLayer>,
    1260              :     deltas_to_compact: Vec<Layer>,
    1261              :     // Whether we have included all L0 layers, or selected only part of them due to the
    1262              :     // L0 compaction size limit.
    1263              :     fully_compacted: bool,
    1264              : }
    1265              : 
    1266              : #[derive(Default)]
    1267              : struct CompactLevel0Phase1StatsBuilder {
    1268              :     version: Option<u64>,
    1269              :     tenant_id: Option<TenantShardId>,
    1270              :     timeline_id: Option<TimelineId>,
    1271              :     read_lock_acquisition_micros: DurationRecorder,
    1272              :     read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
    1273              :     read_lock_held_key_sort_micros: DurationRecorder,
    1274              :     read_lock_held_prerequisites_micros: DurationRecorder,
    1275              :     read_lock_held_compute_holes_micros: DurationRecorder,
    1276              :     read_lock_drop_micros: DurationRecorder,
    1277              :     write_layer_files_micros: DurationRecorder,
    1278              :     level0_deltas_count: Option<usize>,
    1279              :     new_deltas_count: Option<usize>,
    1280              :     new_deltas_size: Option<u64>,
    1281              : }
    1282              : 
    1283              : #[derive(serde::Serialize)]
    1284              : struct CompactLevel0Phase1Stats {
    1285              :     version: u64,
    1286              :     tenant_id: TenantShardId,
    1287              :     timeline_id: TimelineId,
    1288              :     read_lock_acquisition_micros: RecordedDuration,
    1289              :     read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
    1290              :     read_lock_held_key_sort_micros: RecordedDuration,
    1291              :     read_lock_held_prerequisites_micros: RecordedDuration,
    1292              :     read_lock_held_compute_holes_micros: RecordedDuration,
    1293              :     read_lock_drop_micros: RecordedDuration,
    1294              :     write_layer_files_micros: RecordedDuration,
    1295              :     level0_deltas_count: usize,
    1296              :     new_deltas_count: usize,
    1297              :     new_deltas_size: u64,
    1298              : }
    1299              : 
    1300              : impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
    1301              :     type Error = anyhow::Error;
    1302              : 
    1303           28 :     fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
    1304           28 :         Ok(Self {
    1305           28 :             version: value.version.ok_or_else(|| anyhow!("version not set"))?,
    1306           28 :             tenant_id: value
    1307           28 :                 .tenant_id
    1308           28 :                 .ok_or_else(|| anyhow!("tenant_id not set"))?,
    1309           28 :             timeline_id: value
    1310           28 :                 .timeline_id
    1311           28 :                 .ok_or_else(|| anyhow!("timeline_id not set"))?,
    1312           28 :             read_lock_acquisition_micros: value
    1313           28 :                 .read_lock_acquisition_micros
    1314           28 :                 .into_recorded()
    1315           28 :                 .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
    1316           28 :             read_lock_held_spawn_blocking_startup_micros: value
    1317           28 :                 .read_lock_held_spawn_blocking_startup_micros
    1318           28 :                 .into_recorded()
    1319           28 :                 .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
    1320           28 :             read_lock_held_key_sort_micros: value
    1321           28 :                 .read_lock_held_key_sort_micros
    1322           28 :                 .into_recorded()
    1323           28 :                 .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
    1324           28 :             read_lock_held_prerequisites_micros: value
    1325           28 :                 .read_lock_held_prerequisites_micros
    1326           28 :                 .into_recorded()
    1327           28 :                 .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
    1328           28 :             read_lock_held_compute_holes_micros: value
    1329           28 :                 .read_lock_held_compute_holes_micros
    1330           28 :                 .into_recorded()
    1331           28 :                 .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
    1332           28 :             read_lock_drop_micros: value
    1333           28 :                 .read_lock_drop_micros
    1334           28 :                 .into_recorded()
    1335           28 :                 .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
    1336           28 :             write_layer_files_micros: value
    1337           28 :                 .write_layer_files_micros
    1338           28 :                 .into_recorded()
    1339           28 :                 .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
    1340           28 :             level0_deltas_count: value
    1341           28 :                 .level0_deltas_count
    1342           28 :                 .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
    1343           28 :             new_deltas_count: value
    1344           28 :                 .new_deltas_count
    1345           28 :                 .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
    1346           28 :             new_deltas_size: value
    1347           28 :                 .new_deltas_size
    1348           28 :                 .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
    1349              :         })
    1350           28 :     }
    1351              : }
    1352              : 
    1353              : impl Timeline {
    1354              :     /// Entry point for new tiered compaction algorithm.
    1355              :     ///
    1356              :     /// All the real work is in the implementation in the pageserver_compaction
    1357              :     /// crate. The code here would apply to any algorithm implemented by the
    1358              :     /// same interface, but tiered is the only one at the moment.
    1359              :     ///
    1360              :     /// TODO: cancellation
    1361            0 :     pub(crate) async fn compact_tiered(
    1362            0 :         self: &Arc<Self>,
    1363            0 :         _cancel: &CancellationToken,
    1364            0 :         ctx: &RequestContext,
    1365            0 :     ) -> Result<(), CompactionError> {
    1366            0 :         let fanout = self.get_compaction_threshold() as u64;
    1367            0 :         let target_file_size = self.get_checkpoint_distance();
    1368              : 
    1369              :         // Find the top of the historical layers
    1370            0 :         let end_lsn = {
    1371            0 :             let guard = self.layers.read().await;
    1372            0 :             let layers = guard.layer_map()?;
    1373              : 
    1374            0 :             let l0_deltas = layers.level0_deltas();
    1375            0 : 
    1376            0 :             // As an optimization, if we find that there are too few L0 layers,
    1377            0 :             // bail out early. We know that the compaction algorithm would do
    1378            0 :             // nothing in that case.
    1379            0 :             if l0_deltas.len() < fanout as usize {
    1380              :                 // doesn't need compacting
    1381            0 :                 return Ok(());
    1382            0 :             }
    1383            0 :             l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap()
    1384            0 :         };
    1385            0 : 
    1386            0 :         // Is the timeline being deleted?
    1387            0 :         if self.is_stopping() {
    1388            0 :             trace!("Dropping out of compaction on timeline shutdown");
    1389            0 :             return Err(CompactionError::ShuttingDown);
    1390            0 :         }
    1391              : 
    1392            0 :         let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
    1393              :         // TODO(chi): ignore sparse_keyspace for now, compact it in the future.
    1394            0 :         let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
    1395            0 : 
    1396            0 :         pageserver_compaction::compact_tiered::compact_tiered(
    1397            0 :             &mut adaptor,
    1398            0 :             end_lsn,
    1399            0 :             target_file_size,
    1400            0 :             fanout,
    1401            0 :             ctx,
    1402            0 :         )
    1403            0 :         .await
    1404              :         // TODO: compact_tiered needs to return CompactionError
    1405            0 :         .map_err(CompactionError::Other)?;
    1406              : 
    1407            0 :         adaptor.flush_updates().await?;
    1408            0 :         Ok(())
    1409            0 :     }
    1410              : 
    1411              :     /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns.
    1412              :     ///
    1413              :     /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon.
    1414              :     /// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is
    1415              :     /// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch.
    1416              :     ///
    1417              :     /// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have:
    1418              :     ///
    1419              :     /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60
    1420              :     /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3
    1421              :     ///
    1422              :     /// The function will produce:
    1423              :     ///
    1424              :     /// ```plain
    1425              :     /// 0x20(retain_lsn) -> img=AB@0x20                  always produce a single image below the lowest retain LSN
    1426              :     /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40]    two deltas since the last base image, keeping the deltas
    1427              :     /// 0x50(horizon)    -> deltas=[ABCDE@0x50]          three deltas since the last base image, generate an image but put it in the delta
    1428              :     /// above_horizon    -> deltas=[+F@0x60]             full history above the horizon
    1429              :     /// ```
    1430              :     ///
    1431              :     /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
    1432          538 :     pub(crate) async fn generate_key_retention(
    1433          538 :         self: &Arc<Timeline>,
    1434          538 :         key: Key,
    1435          538 :         full_history: &[(Key, Lsn, Value)],
    1436          538 :         horizon: Lsn,
    1437          538 :         retain_lsn_below_horizon: &[Lsn],
    1438          538 :         delta_threshold_cnt: usize,
    1439          538 :         base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
    1440          538 :     ) -> anyhow::Result<KeyHistoryRetention> {
    1441          538 :         // Pre-checks for the invariants
    1442          538 :         if cfg!(debug_assertions) {
    1443         1306 :             for (log_key, _, _) in full_history {
    1444          768 :                 assert_eq!(log_key, &key, "mismatched key");
    1445              :             }
    1446          538 :             for i in 1..full_history.len() {
    1447          230 :                 assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN");
    1448          230 :                 if full_history[i - 1].1 == full_history[i].1 {
    1449            0 :                     assert!(
    1450            0 :                         matches!(full_history[i - 1].2, Value::Image(_)),
    1451            0 :                         "unordered delta/image, or duplicated delta"
    1452              :                     );
    1453          230 :                 }
    1454              :             }
    1455              :             // There was an assertion for no base image that checks if the first
    1456              :             // record in the history is `will_init` before, but it was removed.
    1457              :             // This is explained in the test cases for generate_key_retention.
    1458              :             // Search "incomplete history" for more information.
    1459         1148 :             for lsn in retain_lsn_below_horizon {
    1460          610 :                 assert!(lsn < &horizon, "retain lsn must be below horizon")
    1461              :             }
    1462          538 :             for i in 1..retain_lsn_below_horizon.len() {
    1463          278 :                 assert!(
    1464          278 :                     retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i],
    1465            0 :                     "unordered LSN"
    1466              :                 );
    1467              :             }
    1468            0 :         }
    1469          538 :         let has_ancestor = base_img_from_ancestor.is_some();
    1470              :         // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
    1471              :         // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
    1472          538 :         let (mut split_history, lsn_split_points) = {
    1473          538 :             let mut split_history = Vec::new();
    1474          538 :             split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new);
    1475          538 :             let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1);
    1476         1148 :             for lsn in retain_lsn_below_horizon {
    1477          610 :                 lsn_split_points.push(*lsn);
    1478          610 :             }
    1479          538 :             lsn_split_points.push(horizon);
    1480          538 :             let mut current_idx = 0;
    1481         1306 :             for item @ (_, lsn, _) in full_history {
    1482          944 :                 while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
    1483          176 :                     current_idx += 1;
    1484          176 :                 }
    1485          768 :                 split_history[current_idx].push(item);
    1486              :             }
    1487          538 :             (split_history, lsn_split_points)
    1488              :         };
    1489              :         // Step 2: filter out duplicated records due to the k-merge of image/delta layers
    1490         2224 :         for split_for_lsn in &mut split_history {
    1491         1686 :             let mut prev_lsn = None;
    1492         1686 :             let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len());
    1493         1686 :             for record @ (_, lsn, _) in std::mem::take(split_for_lsn) {
    1494          768 :                 if let Some(prev_lsn) = &prev_lsn {
    1495          106 :                     if *prev_lsn == lsn {
    1496              :                         // The case that we have an LSN with both data from the delta layer and the image layer. As
    1497              :                         // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
    1498              :                         // drop this delta and keep the image.
    1499              :                         //
    1500              :                         // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
    1501              :                         // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
    1502              :                         // dropped.
    1503              :                         //
    1504              :                         // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta
    1505              :                         // threshold, we could have kept delta instead to save space. This is an optimization for the future.
    1506            0 :                         continue;
    1507          106 :                     }
    1508          662 :                 }
    1509          768 :                 prev_lsn = Some(lsn);
    1510          768 :                 new_split_for_lsn.push(record);
    1511              :             }
    1512         1686 :             *split_for_lsn = new_split_for_lsn;
    1513              :         }
    1514              :         // Step 3: generate images when necessary
    1515          538 :         let mut retention = Vec::with_capacity(split_history.len());
    1516          538 :         let mut records_since_last_image = 0;
    1517          538 :         let batch_cnt = split_history.len();
    1518          538 :         assert!(
    1519          538 :             batch_cnt >= 2,
    1520            0 :             "should have at least below + above horizon batches"
    1521              :         );
    1522          538 :         let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
    1523          538 :         if let Some((key, lsn, img)) = base_img_from_ancestor {
    1524           18 :             replay_history.push((key, lsn, Value::Image(img)));
    1525          520 :         }
    1526              : 
    1527              :         /// Generate debug information for the replay history
    1528            0 :         fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String {
    1529              :             use std::fmt::Write;
    1530            0 :             let mut output = String::new();
    1531            0 :             if let Some((key, _, _)) = replay_history.first() {
    1532            0 :                 write!(output, "key={} ", key).unwrap();
    1533            0 :                 let mut cnt = 0;
    1534            0 :                 for (_, lsn, val) in replay_history {
    1535            0 :                     if val.is_image() {
    1536            0 :                         write!(output, "i@{} ", lsn).unwrap();
    1537            0 :                     } else if val.will_init() {
    1538            0 :                         write!(output, "di@{} ", lsn).unwrap();
    1539            0 :                     } else {
    1540            0 :                         write!(output, "d@{} ", lsn).unwrap();
    1541            0 :                     }
    1542            0 :                     cnt += 1;
    1543            0 :                     if cnt >= 128 {
    1544            0 :                         write!(output, "... and more").unwrap();
    1545            0 :                         break;
    1546            0 :                     }
    1547              :                 }
    1548            0 :             } else {
    1549            0 :                 write!(output, "<no history>").unwrap();
    1550            0 :             }
    1551            0 :             output
    1552            0 :         }
    1553              : 
    1554            0 :         fn generate_debug_trace(
    1555            0 :             replay_history: Option<&[(Key, Lsn, Value)]>,
    1556            0 :             full_history: &[(Key, Lsn, Value)],
    1557            0 :             lsns: &[Lsn],
    1558            0 :             horizon: Lsn,
    1559            0 :         ) -> String {
    1560              :             use std::fmt::Write;
    1561            0 :             let mut output = String::new();
    1562            0 :             if let Some(replay_history) = replay_history {
    1563            0 :                 writeln!(
    1564            0 :                     output,
    1565            0 :                     "replay_history: {}",
    1566            0 :                     generate_history_trace(replay_history)
    1567            0 :                 )
    1568            0 :                 .unwrap();
    1569            0 :             } else {
    1570            0 :                 writeln!(output, "replay_history: <disabled>",).unwrap();
    1571            0 :             }
    1572            0 :             writeln!(
    1573            0 :                 output,
    1574            0 :                 "full_history: {}",
    1575            0 :                 generate_history_trace(full_history)
    1576            0 :             )
    1577            0 :             .unwrap();
    1578            0 :             writeln!(
    1579            0 :                 output,
    1580            0 :                 "when processing: [{}] horizon={}",
    1581            0 :                 lsns.iter().map(|l| format!("{l}")).join(","),
    1582            0 :                 horizon
    1583            0 :             )
    1584            0 :             .unwrap();
    1585            0 :             output
    1586            0 :         }
    1587              : 
    1588         1686 :         for (i, split_for_lsn) in split_history.into_iter().enumerate() {
    1589              :             // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
    1590         1686 :             records_since_last_image += split_for_lsn.len();
    1591         1686 :             let generate_image = if i == 0 && !has_ancestor {
    1592              :                 // We always generate images for the first batch (below horizon / lowest retain_lsn)
    1593          520 :                 true
    1594         1166 :             } else if i == batch_cnt - 1 {
    1595              :                 // Do not generate images for the last batch (above horizon)
    1596          538 :                 false
    1597          628 :             } else if records_since_last_image >= delta_threshold_cnt {
    1598              :                 // Generate images when there are too many records
    1599            6 :                 true
    1600              :             } else {
    1601          622 :                 false
    1602              :             };
    1603         1686 :             replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone()));
    1604              :             // Only retain the items after the last image record
    1605         2030 :             for idx in (0..replay_history.len()).rev() {
    1606         2030 :                 if replay_history[idx].2.will_init() {
    1607         1686 :                     replay_history = replay_history[idx..].to_vec();
    1608         1686 :                     break;
    1609          344 :                 }
    1610              :             }
    1611         1686 :             if let Some((_, _, val)) = replay_history.first() {
    1612         1686 :                 if !val.will_init() {
    1613            0 :                     return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
    1614            0 :                         || {
    1615            0 :                             generate_debug_trace(
    1616            0 :                                 Some(&replay_history),
    1617            0 :                                 full_history,
    1618            0 :                                 retain_lsn_below_horizon,
    1619            0 :                                 horizon,
    1620            0 :                             )
    1621            0 :                         },
    1622            0 :                     );
    1623         1686 :                 }
    1624            0 :             }
    1625         1686 :             if generate_image && records_since_last_image > 0 {
    1626          526 :                 records_since_last_image = 0;
    1627          526 :                 let replay_history_for_debug = if cfg!(debug_assertions) {
    1628          526 :                     Some(replay_history.clone())
    1629              :                 } else {
    1630            0 :                     None
    1631              :                 };
    1632          526 :                 let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
    1633          526 :                 let history = std::mem::take(&mut replay_history);
    1634          526 :                 let mut img = None;
    1635          526 :                 let mut records = Vec::with_capacity(history.len());
    1636          526 :                 if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
    1637          504 :                     img = Some((*lsn, val.clone()));
    1638          504 :                     for (_, lsn, val) in history.into_iter().skip(1) {
    1639           34 :                         let Value::WalRecord(rec) = val else {
    1640            0 :                             return Err(anyhow::anyhow!(
    1641            0 :                                 "invalid record, first record is image, expect walrecords"
    1642            0 :                             ))
    1643            0 :                             .with_context(|| {
    1644            0 :                                 generate_debug_trace(
    1645            0 :                                     replay_history_for_debug_ref,
    1646            0 :                                     full_history,
    1647            0 :                                     retain_lsn_below_horizon,
    1648            0 :                                     horizon,
    1649            0 :                                 )
    1650            0 :                             });
    1651              :                         };
    1652           34 :                         records.push((lsn, rec));
    1653              :                     }
    1654              :                 } else {
    1655           36 :                     for (_, lsn, val) in history.into_iter() {
    1656           36 :                         let Value::WalRecord(rec) = val else {
    1657            0 :                             return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord"))
    1658            0 :                                 .with_context(|| generate_debug_trace(
    1659            0 :                                     replay_history_for_debug_ref,
    1660            0 :                                     full_history,
    1661            0 :                                     retain_lsn_below_horizon,
    1662            0 :                                     horizon,
    1663            0 :                                 ));
    1664              :                         };
    1665           36 :                         records.push((lsn, rec));
    1666              :                     }
    1667              :                 }
    1668          526 :                 records.reverse();
    1669          526 :                 let state = ValueReconstructState { img, records };
    1670          526 :                 let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
    1671          526 :                 let img = self.reconstruct_value(key, request_lsn, state).await?;
    1672          526 :                 replay_history.push((key, request_lsn, Value::Image(img.clone())));
    1673          526 :                 retention.push(vec![(request_lsn, Value::Image(img))]);
    1674         1160 :             } else {
    1675         1160 :                 let deltas = split_for_lsn
    1676         1160 :                     .iter()
    1677         1160 :                     .map(|(_, lsn, value)| (*lsn, value.clone()))
    1678         1160 :                     .collect_vec();
    1679         1160 :                 retention.push(deltas);
    1680         1160 :             }
    1681              :         }
    1682          538 :         let mut result = Vec::with_capacity(retention.len());
    1683          538 :         assert_eq!(retention.len(), lsn_split_points.len() + 1);
    1684         1686 :         for (idx, logs) in retention.into_iter().enumerate() {
    1685         1686 :             if idx == lsn_split_points.len() {
    1686          538 :                 return Ok(KeyHistoryRetention {
    1687          538 :                     below_horizon: result,
    1688          538 :                     above_horizon: KeyLogAtLsn(logs),
    1689          538 :                 });
    1690         1148 :             } else {
    1691         1148 :                 result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
    1692         1148 :             }
    1693              :         }
    1694            0 :         unreachable!("key retention is empty")
    1695          538 :     }
    1696              : 
    1697              :     /// Check how much space is left on the disk
    1698           40 :     async fn check_available_space(self: &Arc<Self>) -> anyhow::Result<u64> {
    1699           40 :         let tenants_dir = self.conf.tenants_path();
    1700              : 
    1701           40 :         let stat = Statvfs::get(&tenants_dir, None)
    1702           40 :             .context("statvfs failed, presumably directory got unlinked")?;
    1703              : 
    1704           40 :         let (avail_bytes, _) = stat.get_avail_total_bytes();
    1705           40 : 
    1706           40 :         Ok(avail_bytes)
    1707           40 :     }
    1708              : 
    1709              :     /// Check if the compaction can proceed safely without running out of space. We assume the size
    1710              :     /// upper bound of the produced files of a compaction job is the same as all layers involved in
    1711              :     /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a
    1712              :     /// compaction.
    1713           40 :     async fn check_compaction_space(
    1714           40 :         self: &Arc<Self>,
    1715           40 :         layer_selection: &[Layer],
    1716           40 :     ) -> anyhow::Result<()> {
    1717           40 :         let available_space = self.check_available_space().await?;
    1718           40 :         let mut remote_layer_size = 0;
    1719           40 :         let mut all_layer_size = 0;
    1720          160 :         for layer in layer_selection {
    1721          120 :             let needs_download = layer.needs_download().await?;
    1722          120 :             if needs_download.is_some() {
    1723            0 :                 remote_layer_size += layer.layer_desc().file_size;
    1724          120 :             }
    1725          120 :             all_layer_size += layer.layer_desc().file_size;
    1726              :         }
    1727           40 :         let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
    1728           40 :         if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
    1729              :         {
    1730            0 :             return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
    1731            0 :                 available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size));
    1732           40 :         }
    1733           40 :         Ok(())
    1734           40 :     }
    1735              : 
    1736           30 :     pub(crate) async fn compact_with_gc(
    1737           30 :         self: &Arc<Self>,
    1738           30 :         cancel: &CancellationToken,
    1739           30 :         flags: EnumSet<CompactFlags>,
    1740           30 :         ctx: &RequestContext,
    1741           30 :     ) -> anyhow::Result<()> {
    1742           30 :         self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx)
    1743          708 :             .await
    1744           30 :     }
    1745              : 
    1746              :     /// An experimental compaction building block that combines compaction with garbage collection.
    1747              :     ///
    1748              :     /// The current implementation picks all delta + image layers that are below or intersecting with
    1749              :     /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
    1750              :     /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
    1751              :     /// and create delta layers with all deltas >= gc horizon.
    1752              :     ///
    1753              :     /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
    1754              :     /// Partial compaction will read and process all layers overlapping with the key range, even if it might
    1755              :     /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
    1756              :     /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
    1757              :     /// `Key::MIN..Key::MAX` to the function indicates a full compaction, though technically, `Key::MAX` is not
    1758              :     /// part of the range.
    1759           40 :     pub(crate) async fn partial_compact_with_gc(
    1760           40 :         self: &Arc<Self>,
    1761           40 :         compaction_key_range: Range<Key>,
    1762           40 :         cancel: &CancellationToken,
    1763           40 :         flags: EnumSet<CompactFlags>,
    1764           40 :         ctx: &RequestContext,
    1765           40 :     ) -> anyhow::Result<()> {
    1766           40 :         // Block other compaction/GC tasks from running for now. GC-compaction could run along
    1767           40 :         // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
    1768           40 :         // Note that we already acquired the compaction lock when the outer `compact` function gets called.
    1769           40 : 
    1770           40 :         let gc_lock = async {
    1771           40 :             tokio::select! {
    1772           40 :                 guard = self.gc_lock.lock() => Ok(guard),
    1773              :                 // TODO: refactor to CompactionError to correctly pass cancelled error
    1774           40 :                 _ = cancel.cancelled() => Err(anyhow!("cancelled")),
    1775              :             }
    1776           40 :         };
    1777              : 
    1778           40 :         let gc_lock = crate::timed(
    1779           40 :             gc_lock,
    1780           40 :             "acquires gc lock",
    1781           40 :             std::time::Duration::from_secs(5),
    1782           40 :         )
    1783            1 :         .await?;
    1784              : 
    1785           40 :         let dry_run = flags.contains(CompactFlags::DryRun);
    1786           40 : 
    1787           40 :         if compaction_key_range == (Key::MIN..Key::MAX) {
    1788           30 :             info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
    1789              :         } else {
    1790           10 :             info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
    1791              :         }
    1792              : 
    1793           40 :         scopeguard::defer! {
    1794           40 :             info!("done enhanced gc bottom-most compaction");
    1795           40 :         };
    1796           40 : 
    1797           40 :         let mut stat = CompactionStatistics::default();
    1798              : 
    1799              :         // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
    1800              :         // The layer selection has the following properties:
    1801              :         // 1. If a layer is in the selection, all layers below it are in the selection.
    1802              :         // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
    1803           40 :         let job_desc = {
    1804           40 :             let guard = self.layers.read().await;
    1805           40 :             let layers = guard.layer_map()?;
    1806           40 :             let gc_info = self.gc_info.read().unwrap();
    1807           40 :             let mut retain_lsns_below_horizon = Vec::new();
    1808           40 :             let gc_cutoff = gc_info.cutoffs.select_min();
    1809           44 :             for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
    1810           44 :                 if lsn < &gc_cutoff {
    1811           44 :                     retain_lsns_below_horizon.push(*lsn);
    1812           44 :                 }
    1813              :             }
    1814           40 :             for lsn in gc_info.leases.keys() {
    1815            0 :                 if lsn < &gc_cutoff {
    1816            0 :                     retain_lsns_below_horizon.push(*lsn);
    1817            0 :                 }
    1818              :             }
    1819           40 :             let mut selected_layers: Vec<Layer> = Vec::new();
    1820           40 :             drop(gc_info);
    1821              :             // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
    1822           40 :             let Some(max_layer_lsn) = layers
    1823           40 :                 .iter_historic_layers()
    1824          182 :                 .filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
    1825          150 :                 .map(|desc| desc.get_lsn_range().end)
    1826           40 :                 .max()
    1827              :             else {
    1828            0 :                 info!("no layers to compact with gc");
    1829            0 :                 return Ok(());
    1830              :             };
    1831              :             // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
    1832              :             // layers to compact.
    1833           40 :             let mut rewrite_layers = Vec::new();
    1834          182 :             for desc in layers.iter_historic_layers() {
    1835          182 :                 if desc.get_lsn_range().end <= max_layer_lsn
    1836          150 :                     && overlaps_with(&desc.get_key_range(), &compaction_key_range)
    1837              :                 {
    1838              :                     // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
    1839              :                     // even if it might contain extra keys
    1840          120 :                     selected_layers.push(guard.get_from_desc(&desc));
    1841          120 :                     // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
    1842          120 :                     // to overlap image layers)
    1843          120 :                     if desc.is_delta()
    1844           62 :                         && !fully_contains(&compaction_key_range, &desc.get_key_range())
    1845            2 :                     {
    1846            2 :                         rewrite_layers.push(desc);
    1847          118 :                     }
    1848           62 :                 }
    1849              :             }
    1850           40 :             if selected_layers.is_empty() {
    1851            0 :                 info!("no layers to compact with gc");
    1852            0 :                 return Ok(());
    1853           40 :             }
    1854           40 :             retain_lsns_below_horizon.sort();
    1855           40 :             GcCompactionJobDescription {
    1856           40 :                 selected_layers,
    1857           40 :                 gc_cutoff,
    1858           40 :                 retain_lsns_below_horizon,
    1859           40 :                 max_layer_lsn,
    1860           40 :                 compaction_key_range,
    1861           40 :                 rewrite_layers,
    1862           40 :             }
    1863              :         };
    1864           40 :         let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
    1865            2 :             Lsn(self.ancestor_lsn.0 + 1)
    1866              :         } else {
    1867           38 :             let res = job_desc
    1868           38 :                 .retain_lsns_below_horizon
    1869           38 :                 .first()
    1870           38 :                 .copied()
    1871           38 :                 .unwrap_or(job_desc.gc_cutoff);
    1872           38 :             if cfg!(debug_assertions) {
    1873           38 :                 assert_eq!(
    1874           38 :                     res,
    1875           38 :                     job_desc
    1876           38 :                         .retain_lsns_below_horizon
    1877           38 :                         .iter()
    1878           38 :                         .min()
    1879           38 :                         .copied()
    1880           38 :                         .unwrap_or(job_desc.gc_cutoff)
    1881           38 :                 );
    1882            0 :             }
    1883           38 :             res
    1884              :         };
    1885           40 :         info!(
    1886            0 :             "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
    1887            0 :             job_desc.selected_layers.len(),
    1888            0 :             job_desc.rewrite_layers.len(),
    1889              :             job_desc.max_layer_lsn,
    1890              :             job_desc.gc_cutoff,
    1891              :             lowest_retain_lsn,
    1892              :             job_desc.compaction_key_range.start,
    1893              :             job_desc.compaction_key_range.end
    1894              :         );
    1895              : 
    1896          160 :         for layer in &job_desc.selected_layers {
    1897          120 :             debug!("read layer: {}", layer.layer_desc().key());
    1898              :         }
    1899           42 :         for layer in &job_desc.rewrite_layers {
    1900            2 :             debug!("rewrite layer: {}", layer.key());
    1901              :         }
    1902              : 
    1903           40 :         self.check_compaction_space(&job_desc.selected_layers)
    1904           91 :             .await?;
    1905              : 
    1906              :         // Generate statistics for the compaction
    1907          160 :         for layer in &job_desc.selected_layers {
    1908          120 :             let desc = layer.layer_desc();
    1909          120 :             if desc.is_delta() {
    1910           62 :                 stat.visit_delta_layer(desc.file_size());
    1911           62 :             } else {
    1912           58 :                 stat.visit_image_layer(desc.file_size());
    1913           58 :             }
    1914              :         }
    1915              : 
    1916              :         // Step 1: construct a k-merge iterator over all layers.
    1917              :         // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
    1918           40 :         let layer_names = job_desc
    1919           40 :             .selected_layers
    1920           40 :             .iter()
    1921          120 :             .map(|layer| layer.layer_desc().layer_name())
    1922           40 :             .collect_vec();
    1923           40 :         if let Some(err) = check_valid_layermap(&layer_names) {
    1924            0 :             warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
    1925           40 :         }
    1926              :         // The maximum LSN we are processing in this compaction loop
    1927           40 :         let end_lsn = job_desc
    1928           40 :             .selected_layers
    1929           40 :             .iter()
    1930          120 :             .map(|l| l.layer_desc().lsn_range.end)
    1931           40 :             .max()
    1932           40 :             .unwrap();
    1933           40 :         let mut delta_layers = Vec::new();
    1934           40 :         let mut image_layers = Vec::new();
    1935           40 :         let mut downloaded_layers = Vec::new();
    1936          160 :         for layer in &job_desc.selected_layers {
    1937          120 :             let resident_layer = layer.download_and_keep_resident().await?;
    1938          120 :             downloaded_layers.push(resident_layer);
    1939              :         }
    1940          160 :         for resident_layer in &downloaded_layers {
    1941          120 :             if resident_layer.layer_desc().is_delta() {
    1942           62 :                 let layer = resident_layer.get_as_delta(ctx).await?;
    1943           62 :                 delta_layers.push(layer);
    1944              :             } else {
    1945           58 :                 let layer = resident_layer.get_as_image(ctx).await?;
    1946           58 :                 image_layers.push(layer);
    1947              :             }
    1948              :         }
    1949           40 :         let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
    1950           40 :         let mut merge_iter = FilterIterator::create(
    1951           40 :             MergeIterator::create(&delta_layers, &image_layers, ctx),
    1952           40 :             dense_ks,
    1953           40 :             sparse_ks,
    1954           40 :         )?;
    1955              : 
    1956              :         // Step 2: Produce images+deltas.
    1957           40 :         let mut accumulated_values = Vec::new();
    1958           40 :         let mut last_key: Option<Key> = None;
    1959              : 
    1960              :         // Only create image layers when there is no ancestor branch. TODO: create a covering image layer
    1961              :         // when some condition is met.
    1962           40 :         let mut image_layer_writer = if self.ancestor_timeline.is_none() {
    1963              :             Some(
    1964           38 :                 SplitImageLayerWriter::new(
    1965           38 :                     self.conf,
    1966           38 :                     self.timeline_id,
    1967           38 :                     self.tenant_shard_id,
    1968           38 :                     job_desc.compaction_key_range.start,
    1969           38 :                     lowest_retain_lsn,
    1970           38 :                     self.get_compaction_target_size(),
    1971           38 :                     ctx,
    1972           38 :                 )
    1973           19 :                 .await?,
    1974              :             )
    1975              :         } else {
    1976            2 :             None
    1977              :         };
    1978              : 
    1979           40 :         let mut delta_layer_writer = SplitDeltaLayerWriter::new(
    1980           40 :             self.conf,
    1981           40 :             self.timeline_id,
    1982           40 :             self.tenant_shard_id,
    1983           40 :             lowest_retain_lsn..end_lsn,
    1984           40 :             self.get_compaction_target_size(),
    1985           40 :         )
    1986            0 :         .await?;
    1987              : 
    1988              :         #[derive(Default)]
    1989              :         struct RewritingLayers {
    1990              :             before: Option<DeltaLayerWriter>,
    1991              :             after: Option<DeltaLayerWriter>,
    1992              :         }
    1993           40 :         let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();
    1994              : 
    1995              :         /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
    1996              :         ///
    1997              :         /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
    1998              :         /// is needed for reconstruction. This should be fixed in the future.
    1999              :         ///
    2000              :         /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
    2001              :         /// images.
    2002          530 :         async fn get_ancestor_image(
    2003          530 :             tline: &Arc<Timeline>,
    2004          530 :             key: Key,
    2005          530 :             ctx: &RequestContext,
    2006          530 :         ) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
    2007          530 :             if tline.ancestor_timeline.is_none() {
    2008          516 :                 return Ok(None);
    2009           14 :             };
    2010              :             // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
    2011              :             // as much existing code as possible.
    2012           14 :             let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
    2013           14 :             Ok(Some((key, tline.ancestor_lsn, img)))
    2014          530 :         }
    2015              : 
    2016              :         // Actually, we can decide not to write to the image layer at all at this point because
    2017              :         // the key and LSN range are determined. However, to keep things simple here, we still
    2018              :         // create this writer, and discard the writer in the end.
    2019              : 
    2020          822 :         while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
    2021          782 :             if cancel.is_cancelled() {
    2022            0 :                 return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
    2023          782 :             }
    2024          782 :             if !job_desc.compaction_key_range.contains(&key) {
    2025           64 :                 if !desc.is_delta {
    2026           60 :                     continue;
    2027            4 :                 }
    2028            4 :                 let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default();
    2029            4 :                 let rewriter = if key < job_desc.compaction_key_range.start {
    2030            0 :                     if rewriter.before.is_none() {
    2031            0 :                         rewriter.before = Some(
    2032            0 :                             DeltaLayerWriter::new(
    2033            0 :                                 self.conf,
    2034            0 :                                 self.timeline_id,
    2035            0 :                                 self.tenant_shard_id,
    2036            0 :                                 desc.key_range.start,
    2037            0 :                                 desc.lsn_range.clone(),
    2038            0 :                                 ctx,
    2039            0 :                             )
    2040            0 :                             .await?,
    2041              :                         );
    2042            0 :                     }
    2043            0 :                     rewriter.before.as_mut().unwrap()
    2044            4 :                 } else if key >= job_desc.compaction_key_range.end {
    2045            4 :                     if rewriter.after.is_none() {
    2046            2 :                         rewriter.after = Some(
    2047            2 :                             DeltaLayerWriter::new(
    2048            2 :                                 self.conf,
    2049            2 :                                 self.timeline_id,
    2050            2 :                                 self.tenant_shard_id,
    2051            2 :                                 job_desc.compaction_key_range.end,
    2052            2 :                                 desc.lsn_range.clone(),
    2053            2 :                                 ctx,
    2054            2 :                             )
    2055            1 :                             .await?,
    2056              :                         );
    2057            2 :                     }
    2058            4 :                     rewriter.after.as_mut().unwrap()
    2059              :                 } else {
    2060            0 :                     unreachable!()
    2061              :                 };
    2062            4 :                 rewriter.put_value(key, lsn, val, ctx).await?;
    2063            4 :                 continue;
    2064          718 :             }
    2065          718 :             match val {
    2066          542 :                 Value::Image(_) => stat.visit_image_key(&val),
    2067          176 :                 Value::WalRecord(_) => stat.visit_wal_key(&val),
    2068              :             }
    2069          718 :             if last_key.is_none() || last_key.as_ref() == Some(&key) {
    2070          228 :                 if last_key.is_none() {
    2071           40 :                     last_key = Some(key);
    2072          188 :                 }
    2073          228 :                 accumulated_values.push((key, lsn, val));
    2074              :             } else {
    2075          490 :                 let last_key: &mut Key = last_key.as_mut().unwrap();
    2076          490 :                 stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
    2077          490 :                 let retention = self
    2078          490 :                     .generate_key_retention(
    2079          490 :                         *last_key,
    2080          490 :                         &accumulated_values,
    2081          490 :                         job_desc.gc_cutoff,
    2082          490 :                         &job_desc.retain_lsns_below_horizon,
    2083          490 :                         COMPACTION_DELTA_THRESHOLD,
    2084          490 :                         get_ancestor_image(self, *last_key, ctx).await?,
    2085              :                     )
    2086            0 :                     .await?;
    2087          490 :                 retention
    2088          490 :                     .pipe_to(
    2089          490 :                         *last_key,
    2090          490 :                         &mut delta_layer_writer,
    2091          490 :                         image_layer_writer.as_mut(),
    2092          490 :                         &mut stat,
    2093          490 :                         ctx,
    2094          490 :                     )
    2095          492 :                     .await?;
    2096          490 :                 accumulated_values.clear();
    2097          490 :                 *last_key = key;
    2098          490 :                 accumulated_values.push((key, lsn, val));
    2099              :             }
    2100              :         }
    2101              : 
    2102              :         // TODO: move the below part to the loop body
    2103           40 :         let last_key = last_key.expect("no keys produced during compaction");
    2104           40 :         stat.on_unique_key_visited();
    2105              : 
    2106           40 :         let retention = self
    2107           40 :             .generate_key_retention(
    2108           40 :                 last_key,
    2109           40 :                 &accumulated_values,
    2110           40 :                 job_desc.gc_cutoff,
    2111           40 :                 &job_desc.retain_lsns_below_horizon,
    2112           40 :                 COMPACTION_DELTA_THRESHOLD,
    2113           40 :                 get_ancestor_image(self, last_key, ctx).await?,
    2114              :             )
    2115            0 :             .await?;
    2116           40 :         retention
    2117           40 :             .pipe_to(
    2118           40 :                 last_key,
    2119           40 :                 &mut delta_layer_writer,
    2120           40 :                 image_layer_writer.as_mut(),
    2121           40 :                 &mut stat,
    2122           40 :                 ctx,
    2123           40 :             )
    2124           38 :             .await?;
    2125              :         // end: move the above part to the loop body
    2126              : 
    2127           40 :         let mut rewrote_delta_layers = Vec::new();
    2128           42 :         for (key, writers) in delta_layer_rewriters {
    2129            2 :             if let Some(delta_writer_before) = writers.before {
    2130            0 :                 let (desc, path) = delta_writer_before
    2131            0 :                     .finish(job_desc.compaction_key_range.start, ctx)
    2132            0 :                     .await?;
    2133            0 :                 let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
    2134            0 :                 rewrote_delta_layers.push(layer);
    2135            2 :             }
    2136            2 :             if let Some(delta_writer_after) = writers.after {
    2137            5 :                 let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
    2138            2 :                 let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
    2139            2 :                 rewrote_delta_layers.push(layer);
    2140            0 :             }
    2141              :         }
    2142              : 
    2143           58 :         let discard = |key: &PersistentLayerKey| {
    2144           58 :             let key = key.clone();
    2145           58 :             async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
    2146           58 :         };
    2147              : 
    2148           40 :         let produced_image_layers = if let Some(writer) = image_layer_writer {
    2149           38 :             if !dry_run {
    2150           34 :                 let end_key = job_desc.compaction_key_range.end;
    2151           34 :                 writer
    2152           34 :                     .finish_with_discard_fn(self, ctx, end_key, discard)
    2153           52 :                     .await?
    2154              :             } else {
    2155            4 :                 drop(writer);
    2156            4 :                 Vec::new()
    2157              :             }
    2158              :         } else {
    2159            2 :             Vec::new()
    2160              :         };
    2161              : 
    2162           40 :         let produced_delta_layers = if !dry_run {
    2163           36 :             delta_layer_writer
    2164           36 :                 .finish_with_discard_fn(self, ctx, discard)
    2165           30 :                 .await?
    2166              :         } else {
    2167            4 :             drop(delta_layer_writer);
    2168            4 :             Vec::new()
    2169              :         };
    2170              : 
    2171              :         // TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if
    2172              :         // compaction is cancelled at this point, we might have some layers that are not cleaned up.
    2173           40 :         let mut compact_to = Vec::new();
    2174           40 :         let mut keep_layers = HashSet::new();
    2175           40 :         let produced_delta_layers_len = produced_delta_layers.len();
    2176           40 :         let produced_image_layers_len = produced_image_layers.len();
    2177           64 :         for action in produced_delta_layers {
    2178           24 :             match action {
    2179           12 :                 BatchWriterResult::Produced(layer) => {
    2180           12 :                     if cfg!(debug_assertions) {
    2181           12 :                         info!("produced delta layer: {}", layer.layer_desc().key());
    2182            0 :                     }
    2183           12 :                     stat.produce_delta_layer(layer.layer_desc().file_size());
    2184           12 :                     compact_to.push(layer);
    2185              :                 }
    2186           12 :                 BatchWriterResult::Discarded(l) => {
    2187           12 :                     if cfg!(debug_assertions) {
    2188           12 :                         info!("discarded delta layer: {}", l);
    2189            0 :                     }
    2190           12 :                     keep_layers.insert(l);
    2191           12 :                     stat.discard_delta_layer();
    2192              :                 }
    2193              :             }
    2194              :         }
    2195           42 :         for layer in &rewrote_delta_layers {
    2196            2 :             debug!(
    2197            0 :                 "produced rewritten delta layer: {}",
    2198            0 :                 layer.layer_desc().key()
    2199              :             );
    2200              :         }
    2201           40 :         compact_to.extend(rewrote_delta_layers);
    2202           74 :         for action in produced_image_layers {
    2203           34 :             match action {
    2204           26 :                 BatchWriterResult::Produced(layer) => {
    2205           26 :                     debug!("produced image layer: {}", layer.layer_desc().key());
    2206           26 :                     stat.produce_image_layer(layer.layer_desc().file_size());
    2207           26 :                     compact_to.push(layer);
    2208              :                 }
    2209            8 :                 BatchWriterResult::Discarded(l) => {
    2210            8 :                     debug!("discarded image layer: {}", l);
    2211            8 :                     keep_layers.insert(l);
    2212            8 :                     stat.discard_image_layer();
    2213              :                 }
    2214              :             }
    2215              :         }
    2216              : 
    2217           40 :         let mut layer_selection = job_desc.selected_layers;
    2218              : 
    2219              :         // Partial compaction might select more data than it processes, e.g., if
    2220              :         // the compaction_key_range only partially overlaps:
    2221              :         //
    2222              :         //         [---compaction_key_range---]
    2223              :         //   [---A----][----B----][----C----][----D----]
    2224              :         //
    2225              :         // For delta layers, we will rewrite the layers so that it is cut exactly at
    2226              :         // the compaction key range, so we can always discard them. However, for image
    2227              :         // layers, as we do not rewrite them for now, we need to handle them differently.
    2228              :         // Assume image layers  A, B, C, D are all in the `layer_selection`.
    2229              :         //
    2230              :         // The created image layers contain whatever is needed from B, C, and from
    2231              :         // `----]` of A, and from  `[---` of D.
    2232              :         //
    2233              :         // In contrast, `[---A` and `D----]` have not been processed, so, we must
    2234              :         // keep that data.
    2235              :         //
    2236              :         // The solution for now is to keep A and D completely if they are image layers.
    2237              :         // (layer_selection is what we'll remove from the layer map, so, retain what
    2238              :         // is _not_ fully covered by compaction_key_range).
    2239          160 :         for layer in &layer_selection {
    2240          120 :             if !layer.layer_desc().is_delta() {
    2241           58 :                 if !overlaps_with(
    2242           58 :                     &layer.layer_desc().key_range,
    2243           58 :                     &job_desc.compaction_key_range,
    2244           58 :                 ) {
    2245            0 :                     bail!("violated constraint: image layer outside of compaction key range");
    2246           58 :                 }
    2247           58 :                 if !fully_contains(
    2248           58 :                     &job_desc.compaction_key_range,
    2249           58 :                     &layer.layer_desc().key_range,
    2250           58 :                 ) {
    2251            8 :                     keep_layers.insert(layer.layer_desc().key());
    2252           50 :                 }
    2253           62 :             }
    2254              :         }
    2255              : 
    2256          120 :         layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
    2257           40 : 
    2258           40 :         info!(
    2259            0 :             "gc-compaction statistics: {}",
    2260            0 :             serde_json::to_string(&stat)?
    2261              :         );
    2262              : 
    2263           40 :         if dry_run {
    2264            4 :             return Ok(());
    2265           36 :         }
    2266           36 : 
    2267           36 :         info!(
    2268            0 :             "produced {} delta layers and {} image layers, {} layers are kept",
    2269            0 :             produced_delta_layers_len,
    2270            0 :             produced_image_layers_len,
    2271            0 :             layer_selection.len()
    2272              :         );
    2273              : 
    2274              :         // Step 3: Place back to the layer map.
    2275              :         {
    2276              :             // TODO: sanity check if the layer map is valid (i.e., should not have overlaps)
    2277           36 :             let mut guard = self.layers.write().await;
    2278           36 :             guard
    2279           36 :                 .open_mut()?
    2280           36 :                 .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
    2281           36 :         };
    2282           36 :         self.remote_client
    2283           36 :             .schedule_compaction_update(&layer_selection, &compact_to)?;
    2284              : 
    2285           36 :         drop(gc_lock);
    2286           36 : 
    2287           36 :         Ok(())
    2288           40 :     }
    2289              : }
    2290              : 
    2291              : struct TimelineAdaptor {
    2292              :     timeline: Arc<Timeline>,
    2293              : 
    2294              :     keyspace: (Lsn, KeySpace),
    2295              : 
    2296              :     new_deltas: Vec<ResidentLayer>,
    2297              :     new_images: Vec<ResidentLayer>,
    2298              :     layers_to_delete: Vec<Arc<PersistentLayerDesc>>,
    2299              : }
    2300              : 
    2301              : impl TimelineAdaptor {
    2302            0 :     pub fn new(timeline: &Arc<Timeline>, keyspace: (Lsn, KeySpace)) -> Self {
    2303            0 :         Self {
    2304            0 :             timeline: timeline.clone(),
    2305            0 :             keyspace,
    2306            0 :             new_images: Vec::new(),
    2307            0 :             new_deltas: Vec::new(),
    2308            0 :             layers_to_delete: Vec::new(),
    2309            0 :         }
    2310            0 :     }
    2311              : 
    2312            0 :     pub async fn flush_updates(&mut self) -> Result<(), CompactionError> {
    2313            0 :         let layers_to_delete = {
    2314            0 :             let guard = self.timeline.layers.read().await;
    2315            0 :             self.layers_to_delete
    2316            0 :                 .iter()
    2317            0 :                 .map(|x| guard.get_from_desc(x))
    2318            0 :                 .collect::<Vec<Layer>>()
    2319            0 :         };
    2320            0 :         self.timeline
    2321            0 :             .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
    2322            0 :             .await?;
    2323              : 
    2324            0 :         self.timeline
    2325            0 :             .upload_new_image_layers(std::mem::take(&mut self.new_images))?;
    2326              : 
    2327            0 :         self.new_deltas.clear();
    2328            0 :         self.layers_to_delete.clear();
    2329            0 :         Ok(())
    2330            0 :     }
    2331              : }
    2332              : 
    2333              : #[derive(Clone)]
    2334              : struct ResidentDeltaLayer(ResidentLayer);
    2335              : #[derive(Clone)]
    2336              : struct ResidentImageLayer(ResidentLayer);
    2337              : 
    2338              : impl CompactionJobExecutor for TimelineAdaptor {
    2339              :     type Key = pageserver_api::key::Key;
    2340              : 
    2341              :     type Layer = OwnArc<PersistentLayerDesc>;
    2342              :     type DeltaLayer = ResidentDeltaLayer;
    2343              :     type ImageLayer = ResidentImageLayer;
    2344              : 
    2345              :     type RequestContext = crate::context::RequestContext;
    2346              : 
    2347            0 :     fn get_shard_identity(&self) -> &ShardIdentity {
    2348            0 :         self.timeline.get_shard_identity()
    2349            0 :     }
    2350              : 
    2351            0 :     async fn get_layers(
    2352            0 :         &mut self,
    2353            0 :         key_range: &Range<Key>,
    2354            0 :         lsn_range: &Range<Lsn>,
    2355            0 :         _ctx: &RequestContext,
    2356            0 :     ) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
    2357            0 :         self.flush_updates().await?;
    2358              : 
    2359            0 :         let guard = self.timeline.layers.read().await;
    2360            0 :         let layer_map = guard.layer_map()?;
    2361              : 
    2362            0 :         let result = layer_map
    2363            0 :             .iter_historic_layers()
    2364            0 :             .filter(|l| {
    2365            0 :                 overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range)
    2366            0 :             })
    2367            0 :             .map(OwnArc)
    2368            0 :             .collect();
    2369            0 :         Ok(result)
    2370            0 :     }
    2371              : 
    2372            0 :     async fn get_keyspace(
    2373            0 :         &mut self,
    2374            0 :         key_range: &Range<Key>,
    2375            0 :         lsn: Lsn,
    2376            0 :         _ctx: &RequestContext,
    2377            0 :     ) -> anyhow::Result<Vec<Range<Key>>> {
    2378            0 :         if lsn == self.keyspace.0 {
    2379            0 :             Ok(pageserver_compaction::helpers::intersect_keyspace(
    2380            0 :                 &self.keyspace.1.ranges,
    2381            0 :                 key_range,
    2382            0 :             ))
    2383              :         } else {
    2384              :             // The current compaction implementation only ever requests the key space
    2385              :             // at the compaction end LSN.
    2386            0 :             anyhow::bail!("keyspace not available for requested lsn");
    2387              :         }
    2388            0 :     }
    2389              : 
    2390            0 :     async fn downcast_delta_layer(
    2391            0 :         &self,
    2392            0 :         layer: &OwnArc<PersistentLayerDesc>,
    2393            0 :     ) -> anyhow::Result<Option<ResidentDeltaLayer>> {
    2394            0 :         // this is a lot more complex than a simple downcast...
    2395            0 :         if layer.is_delta() {
    2396            0 :             let l = {
    2397            0 :                 let guard = self.timeline.layers.read().await;
    2398            0 :                 guard.get_from_desc(layer)
    2399              :             };
    2400            0 :             let result = l.download_and_keep_resident().await?;
    2401              : 
    2402            0 :             Ok(Some(ResidentDeltaLayer(result)))
    2403              :         } else {
    2404            0 :             Ok(None)
    2405              :         }
    2406            0 :     }
    2407              : 
    2408            0 :     async fn create_image(
    2409            0 :         &mut self,
    2410            0 :         lsn: Lsn,
    2411            0 :         key_range: &Range<Key>,
    2412            0 :         ctx: &RequestContext,
    2413            0 :     ) -> anyhow::Result<()> {
    2414            0 :         Ok(self.create_image_impl(lsn, key_range, ctx).await?)
    2415            0 :     }
    2416              : 
    2417            0 :     async fn create_delta(
    2418            0 :         &mut self,
    2419            0 :         lsn_range: &Range<Lsn>,
    2420            0 :         key_range: &Range<Key>,
    2421            0 :         input_layers: &[ResidentDeltaLayer],
    2422            0 :         ctx: &RequestContext,
    2423            0 :     ) -> anyhow::Result<()> {
    2424            0 :         debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
    2425              : 
    2426            0 :         let mut all_entries = Vec::new();
    2427            0 :         for dl in input_layers.iter() {
    2428            0 :             all_entries.extend(dl.load_keys(ctx).await?);
    2429              :         }
    2430              : 
    2431              :         // The current stdlib sorting implementation is particularly fast when the
    2432              :         // slice is made up of already-sorted sub-ranges, as is the case here.
    2433            0 :         all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
    2434              : 
    2435            0 :         let mut writer = DeltaLayerWriter::new(
    2436            0 :             self.timeline.conf,
    2437            0 :             self.timeline.timeline_id,
    2438            0 :             self.timeline.tenant_shard_id,
    2439            0 :             key_range.start,
    2440            0 :             lsn_range.clone(),
    2441            0 :             ctx,
    2442            0 :         )
    2443            0 :         .await?;
    2444              : 
    2445            0 :         let mut dup_values = 0;
    2446            0 : 
    2447            0 :         // Walk through all key-value pairs from all the layers we're
    2448            0 :         // compacting, in (key, LSN) order.
    2449            0 :         let mut prev: Option<(Key, Lsn)> = None;
    2450              :         for &DeltaEntry {
    2451            0 :             key, lsn, ref val, ..
    2452            0 :         } in all_entries.iter()
    2453              :         {
    2454            0 :             if prev == Some((key, lsn)) {
    2455              :                 // This is a duplicate. Skip it.
    2456              :                 //
    2457              :                 // It can happen if compaction is interrupted after writing some
    2458              :                 // layers but not all, and we are compacting the range again.
    2459              :                 // The calculations in the algorithm assume that there are no
    2460              :                 // duplicates, so the math on targeted file size is likely off,
    2461              :                 // and we will create smaller files than expected.
    2462            0 :                 dup_values += 1;
    2463            0 :                 continue;
    2464            0 :             }
    2465              : 
    2466            0 :             let value = val.load(ctx).await?;
    2467              : 
    2468            0 :             writer.put_value(key, lsn, value, ctx).await?;
    2469              : 
    2470            0 :             prev = Some((key, lsn));
    2471              :         }
    2472              : 
    2473            0 :         if dup_values > 0 {
    2474            0 :             warn!("delta layer created with {} duplicate values", dup_values);
    2475            0 :         }
    2476              : 
    2477            0 :         fail_point!("delta-layer-writer-fail-before-finish", |_| {
    2478            0 :             Err(anyhow::anyhow!(
    2479            0 :                 "failpoint delta-layer-writer-fail-before-finish"
    2480            0 :             ))
    2481            0 :         });
    2482              : 
    2483            0 :         let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
    2484            0 :         let new_delta_layer =
    2485            0 :             Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
    2486              : 
    2487            0 :         self.new_deltas.push(new_delta_layer);
    2488            0 :         Ok(())
    2489            0 :     }
    2490              : 
    2491            0 :     async fn delete_layer(
    2492            0 :         &mut self,
    2493            0 :         layer: &OwnArc<PersistentLayerDesc>,
    2494            0 :         _ctx: &RequestContext,
    2495            0 :     ) -> anyhow::Result<()> {
    2496            0 :         self.layers_to_delete.push(layer.clone().0);
    2497            0 :         Ok(())
    2498            0 :     }
    2499              : }
    2500              : 
    2501              : impl TimelineAdaptor {
    2502            0 :     async fn create_image_impl(
    2503            0 :         &mut self,
    2504            0 :         lsn: Lsn,
    2505            0 :         key_range: &Range<Key>,
    2506            0 :         ctx: &RequestContext,
    2507            0 :     ) -> Result<(), CreateImageLayersError> {
    2508            0 :         let timer = self.timeline.metrics.create_images_time_histo.start_timer();
    2509              : 
    2510            0 :         let image_layer_writer = ImageLayerWriter::new(
    2511            0 :             self.timeline.conf,
    2512            0 :             self.timeline.timeline_id,
    2513            0 :             self.timeline.tenant_shard_id,
    2514            0 :             key_range,
    2515            0 :             lsn,
    2516            0 :             ctx,
    2517            0 :         )
    2518            0 :         .await?;
    2519              : 
    2520            0 :         fail_point!("image-layer-writer-fail-before-finish", |_| {
    2521            0 :             Err(CreateImageLayersError::Other(anyhow::anyhow!(
    2522            0 :                 "failpoint image-layer-writer-fail-before-finish"
    2523            0 :             )))
    2524            0 :         });
    2525              : 
    2526            0 :         let keyspace = KeySpace {
    2527            0 :             ranges: self.get_keyspace(key_range, lsn, ctx).await?,
    2528              :         };
    2529              :         // TODO: set a proper (stateful) start key. (The original note about create_image_layer_for_rel_blocks is cut off here; for now `Key::MIN` is always passed.)
    2530            0 :         let start = Key::MIN;
    2531              :         let ImageLayerCreationOutcome {
    2532            0 :             image,
    2533              :             next_start_key: _,
    2534            0 :         } = self
    2535            0 :             .timeline
    2536            0 :             .create_image_layer_for_rel_blocks(
    2537            0 :                 &keyspace,
    2538            0 :                 image_layer_writer,
    2539            0 :                 lsn,
    2540            0 :                 ctx,
    2541            0 :                 key_range.clone(),
    2542            0 :                 start,
    2543            0 :             )
    2544            0 :             .await?;
    2545              : 
    2546            0 :         if let Some(image_layer) = image {
    2547            0 :             self.new_images.push(image_layer);
    2548            0 :         }
    2549              : 
    2550            0 :         timer.stop_and_record();
    2551            0 : 
    2552            0 :         Ok(())
    2553            0 :     }
    2554              : }
    2555              : 
    2556              : impl CompactionRequestContext for crate::context::RequestContext {}
    2557              : 
    2558              : #[derive(Debug, Clone)]
    2559              : pub struct OwnArc<T>(pub Arc<T>);
    2560              : 
    2561              : impl<T> Deref for OwnArc<T> {
    2562              :     type Target = <Arc<T> as Deref>::Target;
    2563            0 :     fn deref(&self) -> &Self::Target {
    2564            0 :         &self.0
    2565            0 :     }
    2566              : }
    2567              : 
    2568              : impl<T> AsRef<T> for OwnArc<T> {
    2569            0 :     fn as_ref(&self) -> &T {
    2570            0 :         self.0.as_ref()
    2571            0 :     }
    2572              : }
    2573              : 
    2574              : impl CompactionLayer<Key> for OwnArc<PersistentLayerDesc> {
    2575            0 :     fn key_range(&self) -> &Range<Key> {
    2576            0 :         &self.key_range
    2577            0 :     }
    2578            0 :     fn lsn_range(&self) -> &Range<Lsn> {
    2579            0 :         &self.lsn_range
    2580            0 :     }
    2581            0 :     fn file_size(&self) -> u64 {
    2582            0 :         self.file_size
    2583            0 :     }
    2584            0 :     fn short_id(&self) -> std::string::String {
    2585            0 :         self.as_ref().short_id().to_string()
    2586            0 :     }
    2587            0 :     fn is_delta(&self) -> bool {
    2588            0 :         self.as_ref().is_delta()
    2589            0 :     }
    2590              : }
    2591              : 
    2592              : impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
    2593            0 :     fn key_range(&self) -> &Range<Key> {
    2594            0 :         &self.layer_desc().key_range
    2595            0 :     }
    2596            0 :     fn lsn_range(&self) -> &Range<Lsn> {
    2597            0 :         &self.layer_desc().lsn_range
    2598            0 :     }
    2599            0 :     fn file_size(&self) -> u64 {
    2600            0 :         self.layer_desc().file_size
    2601            0 :     }
    2602            0 :     fn short_id(&self) -> std::string::String {
    2603            0 :         self.layer_desc().short_id().to_string()
    2604            0 :     }
    2605            0 :     fn is_delta(&self) -> bool {
    2606            0 :         true
    2607            0 :     }
    2608              : }
    2609              : 
    2610              : use crate::tenant::timeline::DeltaEntry;
    2611              : 
    2612              : impl CompactionLayer<Key> for ResidentDeltaLayer {
    2613            0 :     fn key_range(&self) -> &Range<Key> {
    2614            0 :         &self.0.layer_desc().key_range
    2615            0 :     }
    2616            0 :     fn lsn_range(&self) -> &Range<Lsn> {
    2617            0 :         &self.0.layer_desc().lsn_range
    2618            0 :     }
    2619            0 :     fn file_size(&self) -> u64 {
    2620            0 :         self.0.layer_desc().file_size
    2621            0 :     }
    2622            0 :     fn short_id(&self) -> std::string::String {
    2623            0 :         self.0.layer_desc().short_id().to_string()
    2624            0 :     }
    2625            0 :     fn is_delta(&self) -> bool {
    2626            0 :         true
    2627            0 :     }
    2628              : }
    2629              : 
    2630              : impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
    2631              :     type DeltaEntry<'a> = DeltaEntry<'a>;
    2632              : 
    2633            0 :     async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
    2634            0 :         self.0.get_as_delta(ctx).await?.index_entries(ctx).await
    2635            0 :     }
    2636              : }
    2637              : 
    2638              : impl CompactionLayer<Key> for ResidentImageLayer {
    2639            0 :     fn key_range(&self) -> &Range<Key> {
    2640            0 :         &self.0.layer_desc().key_range
    2641            0 :     }
    2642            0 :     fn lsn_range(&self) -> &Range<Lsn> {
    2643            0 :         &self.0.layer_desc().lsn_range
    2644            0 :     }
    2645            0 :     fn file_size(&self) -> u64 {
    2646            0 :         self.0.layer_desc().file_size
    2647            0 :     }
    2648            0 :     fn short_id(&self) -> std::string::String {
    2649            0 :         self.0.layer_desc().short_id().to_string()
    2650            0 :     }
    2651            0 :     fn is_delta(&self) -> bool {
    2652            0 :         false
    2653            0 :     }
    2654              : }
    2655              : impl CompactionImageLayer<TimelineAdaptor> for ResidentImageLayer {}

Generated by: LCOV version 2.1-beta