LCOV - code coverage report
Current view: top level - pageserver/pagebench/src/cmd - ondemand_download_churn.rs (source / functions) Coverage Total Hit
Test: 6df3fc19ec669bcfbbf9aba41d1338898d24eaa0.info Lines: 0.0 % 230 0
Test Date: 2025-03-12 18:28:53 Functions: 0.0 % 23 0

            Line data    Source code
       1              : use std::f64;
       2              : use std::num::NonZeroUsize;
       3              : use std::sync::Arc;
       4              : use std::sync::atomic::{AtomicU64, Ordering};
       5              : use std::time::{Duration, Instant};
       6              : 
       7              : use pageserver_api::models::HistoricLayerInfo;
       8              : use pageserver_api::shard::TenantShardId;
       9              : use pageserver_client::mgmt_api;
      10              : use rand::seq::SliceRandom;
      11              : use tokio::sync::{OwnedSemaphorePermit, mpsc};
      12              : use tokio::task::JoinSet;
      13              : use tokio_util::sync::CancellationToken;
      14              : use tracing::{debug, info};
      15              : use utils::id::{TenantTimelineId, TimelineId};
      16              : 
      17              : /// Evict & on-demand download random layers.
      18              : #[derive(clap::Parser)]
      19              : pub(crate) struct Args {
                            // Handed to the management API client as its endpoint.
      20              :     #[clap(long, default_value = "http://localhost:9898")]
      21            0 :     mgmt_api_endpoint: String,
                            // Handed to the management API client as its optional JWT
                            // (presumably used for authentication -- confirm against the client).
      22              :     #[clap(long)]
      23              :     pageserver_jwt: Option<String>,
                            // If set, the shared cancellation token is cancelled after this duration.
                            // If unset, no cancellation timer is started.
      24              :     #[clap(long)]
      25              :     runtime: Option<humantime::Duration>,
                            // Number of `timeline_actor` tasks spawned for each discovered target.
      26              :     #[clap(long, default_value = "1")]
      27            0 :     tasks_per_target: NonZeroUsize,
                            // Number of semaphore permits created by each `timeline_actor` task.
                            // NOTE(review): the semaphore is per task, not per target, so with
                            // `tasks_per_target > 1` a target can see more concurrent operations
                            // than this value.
      28              :     #[clap(long, default_value = "1")]
      29            0 :     concurrency_per_target: NonZeroUsize,
      30              :     /// Passed to target discovery as `limit_to_first_n_targets`.
      31              :     #[clap(long)]
      32              :     limit_to_first_n_targets: Option<usize>,
      33              :     /// Before starting the benchmark, live-reconfigure the pageserver to use the given
      34              :     /// [`pageserver_api::models::virtual_file::IoEngineKind`].
      35              :     #[clap(long)]
      36              :     set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
                            // Passed to target discovery as `targets`.
      37            0 :     targets: Option<Vec<TenantTimelineId>>,
      38              : }
      39              : 
                        /// Synchronous entry point.
                        ///
                        /// Builds a multi-threaded Tokio runtime, spawns [`main_impl`] onto it and blocks
                        /// until that task has finished.
                        ///
                        /// # Errors
                        ///
                        /// Returns an error only if the runtime cannot be built.
                        ///
                        /// # Panics
                        ///
                        /// Panics if the spawned task fails to join or if [`main_impl`] returns an error:
                        /// both results are unwrapped instead of being propagated.
      40            0 : pub(crate) fn main(args: Args) -> anyhow::Result<()> {
      41            0 :     let rt = tokio::runtime::Builder::new_multi_thread()
      42            0 :         .enable_all()
      43            0 :         .build()?;
      44            0 :     let task = rt.spawn(main_impl(args));
                            // First unwrap: join result of the spawned task. Second unwrap: result of `main_impl`.
      45            0 :     rt.block_on(task).unwrap().unwrap();
      46            0 :     Ok(())
      47            0 : }
      48              : 
      49            0 : #[derive(serde::Serialize)]
                        /// Final summary of a run; `main_impl` serializes it as pretty-printed JSON and
                        /// prints it to stdout.
                        ///
                        /// The counters are read from the accumulated totals, `runtime` is the measured
                        /// wall-clock duration of the run.
      50              : struct Output {
      51              :     downloads_count: u64,
      52              :     downloads_bytes: u64,
      53              :     evictions_count: u64,
      54              :     timeline_restarts: u64,
                            // Time between the start instant taken before tasks are spawned and the
                            // moment all tasks have been joined.
      55              :     #[serde(with = "humantime_serde")]
      56              :     runtime: Duration,
      57              : }
      58              : 
      59              : #[derive(Debug, Default)]
                        /// Set of atomic counters shared between tasks via `Arc`.
                        ///
                        /// `main_impl` keeps two instances: one that the actors write to and that the
                        /// reporter task drains on every reporting interval, and one that accumulates the
                        /// drained values for the final [`Output`].
      60              : struct LiveStats {
      61              :     evictions_count: AtomicU64,
      62              :     downloads_count: AtomicU64,
      63              :     downloads_bytes: AtomicU64,
      64              :     timeline_restarts: AtomicU64,
      65              : }
      66              : 
      67              : impl LiveStats {
                            /// Counts one eviction request (relaxed atomic increment).
      68            0 :     fn eviction_done(&self) {
      69            0 :         self.evictions_count.fetch_add(1, Ordering::Relaxed);
      70            0 :     }
                            /// Counts one download request and adds `size` to the downloaded byte total.
                            ///
                            /// The two counters are updated by separate relaxed operations, so a concurrent
                            /// reader may observe one update without the other.
      71            0 :     fn download_done(&self, size: u64) {
      72            0 :         self.downloads_count.fetch_add(1, Ordering::Relaxed);
      73            0 :         self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
      74            0 :     }
                            /// Counts one (re)start of a timeline's set of layer actors.
      75            0 :     fn timeline_restart_done(&self) {
      76            0 :         self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
      77            0 :     }
      78              : }
      79              : 
                        /// Runs the benchmark.
                        ///
                        /// Steps, in order:
                        /// 1. create the management API client and, if requested, set the IO engine;
                        /// 2. discover the target timelines;
                        /// 3. spawn a reporter task plus `tasks_per_target` [`timeline_actor`] tasks per
                        ///    target, all sharing one cancellation token;
                        /// 4. if `args.runtime` is set, spawn a timer that cancels the token;
                        /// 5. wait for all tasks in the join set, then print an [`Output`] as JSON.
                        ///
                        /// If `args.runtime` is `None`, nothing in this function cancels the token.
                        ///
                        /// # Errors
                        ///
                        /// Propagates failures from client construction, from setting the IO engine and
                        /// from target discovery.
                        ///
                        /// # Panics
                        ///
                        /// Panics if joining any spawned task yields an error, or if serializing the
                        /// output fails.
      80            0 : async fn main_impl(args: Args) -> anyhow::Result<()> {
                            // Leaked on purpose: yields a `&'static Args` that can be passed to every
                            // spawned `timeline_actor`.
      81            0 :     let args: &'static Args = Box::leak(Box::new(args));
      82              : 
      83            0 :     let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
      84            0 :         args.mgmt_api_endpoint.clone(),
      85            0 :         args.pageserver_jwt.as_deref(),
      86            0 :         None, // TODO: support ssl_ca_file for https APIs in pagebench.
      87            0 :     )?);
      88              : 
      89            0 :     if let Some(engine_str) = &args.set_io_engine {
      90            0 :         mgmt_api_client.put_io_engine(engine_str).await?;
      91            0 :     }
      92              : 
      93              :     // discover targets
      94            0 :     let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
      95            0 :         &mgmt_api_client,
      96            0 :         crate::util::cli::targets::Spec {
      97            0 :             limit_to_first_n_targets: args.limit_to_first_n_targets,
      98            0 :             targets: args.targets.clone(),
      99            0 :         },
     100            0 :     )
     101            0 :     .await?;
     102              : 
     103            0 :     let token = CancellationToken::new();
     104            0 :     let mut tasks = JoinSet::new();
     105            0 : 
                            // `periodic_stats` is what the actors write to; the reporter drains it into
                            // `total_stats`, which is only read for the final output.
     106            0 :     let periodic_stats = Arc::new(LiveStats::default());
     107            0 :     let total_stats = Arc::new(LiveStats::default());
     108            0 : 
     109            0 :     let start = Instant::now();
                            // Reporter task: about once per second, swap the periodic counters to zero,
                            // add the swapped-out values to the totals and log per-second rates.
     110            0 :     tasks.spawn({
     111            0 :         let periodic_stats = Arc::clone(&periodic_stats);
     112            0 :         let total_stats = Arc::clone(&total_stats);
     113            0 :         let cloned_token = token.clone();
     114            0 :         async move {
     115            0 :             let mut last_at = Instant::now();
     116              :             loop {
                                        // Cancellation is only observed here, before the sleep, so the
                                        // reporter exits at the next iteration after the token is cancelled.
     117            0 :                 if cloned_token.is_cancelled() {
     118            0 :                     return;
     119            0 :                 }
     120            0 :                 tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
     121            0 :                 let now = Instant::now();
                                        // Use the measured interval rather than assuming exactly one second.
     122            0 :                 let delta: Duration = now - last_at;
     123            0 :                 last_at = now;
     124            0 : 
     125            0 :                 let LiveStats {
     126            0 :                     evictions_count,
     127            0 :                     downloads_count,
     128            0 :                     downloads_bytes,
     129            0 :                     timeline_restarts,
     130            0 :                 } = &*periodic_stats;
                                        // Each counter is swapped independently; the four values are not
                                        // one atomic snapshot.
     131            0 :                 let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
     132            0 :                 let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
     133            0 :                 let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
     134            0 :                 let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
     135            0 : 
     136            0 :                 total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
     137            0 :                 total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
     138            0 :                 total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
     139            0 :                 total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
     140            0 : 
     141            0 :                 let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
     142            0 :                 let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
     143            0 :                 let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
     144            0 : 
                                        // `timeline_restarts` is logged as a raw count for the interval,
                                        // not as a rate.
     145            0 :                 info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
     146              :             }
     147            0 :         }
     148            0 :     });
     149              : 
     150            0 :     for tl in timelines {
     151            0 :         for _ in 0..args.tasks_per_target.get() {
     152            0 :             tasks.spawn(timeline_actor(
     153            0 :                 args,
     154            0 :                 Arc::clone(&mgmt_api_client),
     155            0 :                 tl,
     156            0 :                 Arc::clone(&periodic_stats),
     157            0 :                 token.clone(),
     158            0 :             ));
     159            0 :         }
     160              :     }
                            // Optional timer; it is spawned outside the join set, so the join loop
                            // below does not wait on it.
     161            0 :     if let Some(runtime) = args.runtime {
     162            0 :         tokio::spawn(async move {
     163            0 :             tokio::time::sleep(runtime.into()).await;
     164            0 :             token.cancel();
     165            0 :         });
     166            0 :     }
     167              : 
                            // Wait for the reporter and all timeline actors; a join error panics here.
     168            0 :     while let Some(res) = tasks.join_next().await {
     169            0 :         res.unwrap();
     170            0 :     }
     171            0 :     let end = Instant::now();
     172            0 :     let duration: Duration = end - start;
     173            0 : 
                            // NOTE(review): totals are only updated by the reporter task. Anything the
                            // actors record in `periodic_stats` after the reporter's last swap is not
                            // included in the output below -- confirm whether that is acceptable.
     174            0 :     let output = {
     175            0 :         let LiveStats {
     176            0 :             evictions_count,
     177            0 :             downloads_count,
     178            0 :             downloads_bytes,
     179            0 :             timeline_restarts,
     180            0 :         } = &*total_stats;
     181            0 :         Output {
     182            0 :             downloads_count: downloads_count.load(Ordering::Relaxed),
     183            0 :             downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
     184            0 :             evictions_count: evictions_count.load(Ordering::Relaxed),
     185            0 :             timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
     186            0 :             runtime: duration,
     187            0 :         }
     188            0 :     };
     189            0 :     let output = serde_json::to_string_pretty(&output).unwrap();
     190            0 :     println!("{output}");
     191            0 : 
     192            0 :     Ok(())
     193            0 : }
     194              : 
                        /// Drives evictions and on-demand downloads for one timeline until `token` is
                        /// cancelled.
                        ///
                        /// Each iteration of the outer loop is one "restart": it fetches the timeline's
                        /// layer map info, spawns one [`layer_actor`] per historic layer (each with its own
                        /// capacity-1 channel), and records a timeline restart in `live_stats`. The inner
                        /// loop then repeatedly acquires a concurrency permit and hands it to a randomly
                        /// chosen layer actor. When any layer actor exits, the remaining ones are shut
                        /// down and the outer loop restarts from a fresh layer map.
                        ///
                        /// # Panics
                        ///
                        /// - if fetching the layer map info fails;
                        /// - if the timeline has no historic layers (the join set is then empty and the
                        ///   assertion in the inner loop fails);
                        /// - if a chosen layer actor's channel is closed.
     195            0 : async fn timeline_actor(
     196            0 :     args: &'static Args,
     197            0 :     mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
     198            0 :     timeline: TenantTimelineId,
     199            0 :     live_stats: Arc<LiveStats>,
     200            0 :     token: CancellationToken,
     201            0 : ) {
     202            0 :     // TODO: support sharding
     203            0 :     let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
     204              : 
                            // State of one restart: the running layer actors, the sending half of each
                            // actor's channel, and the semaphore limiting in-flight operations.
     205              :     struct Timeline {
     206              :         joinset: JoinSet<()>,
     207              :         layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
     208              :         concurrency: Arc<tokio::sync::Semaphore>,
     209              :     }
     210            0 :     while !token.is_cancelled() {
     211            0 :         debug!("restarting timeline");
     212            0 :         let layer_map_info = mgmt_api_client
     213            0 :             .layer_map_info(tenant_shard_id, timeline.timeline_id)
     214            0 :             .await
     215            0 :             .unwrap();
                                // A fresh semaphore is created on every restart of every task.
     216            0 :         let concurrency = Arc::new(tokio::sync::Semaphore::new(
     217            0 :             args.concurrency_per_target.get(),
     218            0 :         ));
     219            0 : 
     220            0 :         let mut joinset = JoinSet::new();
     221            0 :         let layers = layer_map_info
     222            0 :             .historic_layers
     223            0 :             .into_iter()
     224            0 :             .map(|historic_layer| {
                                        // Capacity 1: at most one permit can be queued per layer actor.
     225            0 :                 let (tx, rx) = mpsc::channel(1);
     226            0 :                 joinset.spawn(layer_actor(
     227            0 :                     tenant_shard_id,
     228            0 :                     timeline.timeline_id,
     229            0 :                     historic_layer,
     230            0 :                     rx,
     231            0 :                     Arc::clone(&mgmt_api_client),
     232            0 :                     Arc::clone(&live_stats),
     233            0 :                 ));
     234            0 :                 tx
     235            0 :             })
     236            0 :             .collect::<Vec<_>>();
     237            0 : 
                                // Shadows the `timeline` parameter for the rest of this iteration.
     238            0 :         let mut timeline = Timeline {
     239            0 :             joinset,
     240            0 :             layers,
     241            0 :             concurrency,
     242            0 :         };
     243            0 : 
     244            0 :         live_stats.timeline_restart_done();
     245              : 
     246            0 :         while !token.is_cancelled() {
     247            0 :             assert!(!timeline.joinset.is_empty());
                                    // `layer_actor` returns on purpose when an operation reports `false`,
                                    // so despite the log message this branch is the normal restart path.
     248            0 :             if let Some(res) = timeline.joinset.try_join_next() {
     249            0 :                 debug!(?res, "a layer actor exited, should not happen");
     250            0 :                 timeline.joinset.shutdown().await;
     251            0 :                 break;
     252            0 :             }
     253              : 
                                    // Wrapped in `Option` so the permit can be moved into `try_send` and put
                                    // back when the chosen channel is full.
     254            0 :             let mut permit = Some(
     255            0 :                 Arc::clone(&timeline.concurrency)
     256            0 :                     .acquire_owned()
     257            0 :                     .await
     258            0 :                     .unwrap(),
     259              :             );
     260              : 
                                    // Retry with a new random layer until one accepts the permit. This loop
                                    // has no await point and does not check the cancellation token.
     261              :             loop {
     262            0 :                 let layer_tx = {
     263            0 :                     let mut rng = rand::thread_rng();
     264            0 :                     timeline.layers.choose_mut(&mut rng).expect("no layers")
     265            0 :                 };
     266            0 :                 match layer_tx.try_send(permit.take().unwrap()) {
     267            0 :                     Ok(_) => break,
     268            0 :                     Err(e) => match e {
     269            0 :                         mpsc::error::TrySendError::Full(back) => {
     270            0 :                             // TODO: retrying introduces bias away from slow downloaders
     271            0 :                             permit.replace(back);
     272            0 :                         }
                                                // NOTE(review): a layer actor that returned after the
                                                // `try_join_next` check above has dropped its receiver; if
                                                // that layer is picked here, this panics instead of
                                                // restarting the timeline -- confirm whether intended.
     273            0 :                         mpsc::error::TrySendError::Closed(_) => panic!(),
     274              :                     },
     275              :                 }
     276              :             }
     277              :         }
     278              :     }
     279            0 : }
     280              : 
                        /// Performs one eviction or on-demand download of `layer` per permit received on
                        /// `rx`.
                        ///
                        /// The action is chosen from the locally tracked state: a layer considered remote
                        /// is downloaded, otherwise it is evicted. After a successful operation the local
                        /// remote flag is flipped, so the actor alternates between the two actions. The
                        /// received permit is held for the duration of the operation and dropped at the
                        /// end of the loop iteration.
                        ///
                        /// The actor returns when the channel is closed, or when the management API
                        /// reports `false` for an operation (taken to mean the local view of the layer is
                        /// out of date).
                        ///
                        /// # Panics
                        ///
                        /// Panics if an eviction or download request returns an error.
     281            0 : async fn layer_actor(
     282            0 :     tenant_shard_id: TenantShardId,
     283            0 :     timeline_id: TimelineId,
     284            0 :     mut layer: HistoricLayerInfo,
     285            0 :     mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
     286            0 :     mgmt_api_client: Arc<mgmt_api::Client>,
     287            0 :     live_stats: Arc<LiveStats>,
     288            0 : ) {
     289              :     #[derive(Clone, Copy)]
     290              :     enum Action {
     291              :         Evict,
     292              :         OnDemandDownload,
     293              :     }
     294              : 
                            // `_permit` is bound (not discarded) so it stays alive until the end of the
                            // iteration.
     295            0 :     while let Some(_permit) = rx.recv().await {
     296            0 :         let action = if layer.is_remote() {
     297            0 :             Action::OnDemandDownload
     298              :         } else {
     299            0 :             Action::Evict
     300              :         };
     301              : 
                                // NOTE(review): in both arms the stats are recorded before `did_it` is
                                // checked, so requests that report `false` are counted too.
     302            0 :         let did_it = match action {
     303              :             Action::Evict => {
     304            0 :                 let did_it = mgmt_api_client
     305            0 :                     .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
     306            0 :                     .await
     307            0 :                     .unwrap();
     308            0 :                 live_stats.eviction_done();
     309            0 :                 did_it
     310              :             }
     311              :             Action::OnDemandDownload => {
     312            0 :                 let did_it = mgmt_api_client
     313            0 :                     .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
     314            0 :                     .await
     315            0 :                     .unwrap();
     316            0 :                 live_stats.download_done(layer.layer_file_size());
     317            0 :                 did_it
     318              :             }
     319              :         };
                                // Returning ends this actor; `timeline_actor` reacts to an exited layer
                                // actor by fetching the layer map again.
     320            0 :         if !did_it {
     321            0 :             debug!("local copy of layer map appears out of sync, re-downloading");
     322            0 :             return;
     323            0 :         }
     324            0 :         debug!("did it");
                                // Keep the local view in step with the operation that just succeeded.
     325            0 :         layer.set_remote(match action {
     326            0 :             Action::Evict => true,
     327            0 :             Action::OnDemandDownload => false,
     328              :         });
     329              :     }
     330            0 : }
        

Generated by: LCOV version 2.1-beta