LCOV - code coverage report
Current view: top level - pageserver/src - virtual_file.rs (source / functions) Coverage Total Hit
Test: 2b0730d767f560e20b6748f57465922aa8bb805e.info Lines: 93.0 % 995 925
Test Date: 2024-09-25 14:04:07 Functions: 91.2 % 204 186

            Line data    Source code
       1              : //! VirtualFile is like a normal File, but it's not bound directly to
       2              : //! a file descriptor.
       3              : //!
       4              : //! Instead, the file is opened when it's read from,
       5              : //! and if too many files are open globally in the system, least-recently
       6              : //! used ones are closed.
       7              : //!
       8              : //! To track which files have been recently used, we use the clock algorithm
       9              : //! with a 'recently_used' flag on each slot.
      10              : //!
      11              : //! This is similar to PostgreSQL's virtual file descriptor facility in
      12              : //! src/backend/storage/file/fd.c
      13              : //!
      14              : use crate::context::RequestContext;
      15              : use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
      16              : 
      17              : use crate::page_cache::{PageWriteGuard, PAGE_SZ};
      18              : use crate::tenant::TENANTS_SEGMENT_NAME;
      19              : use camino::{Utf8Path, Utf8PathBuf};
      20              : use once_cell::sync::OnceCell;
      21              : use owned_buffers_io::io_buf_ext::FullSlice;
      22              : use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
      23              : use pageserver_api::shard::TenantShardId;
      24              : use std::fs::File;
      25              : use std::io::{Error, ErrorKind, Seek, SeekFrom};
      26              : use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
      27              : 
      28              : use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
      29              : use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
      30              : use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
      31              : use tokio::time::Instant;
      32              : 
      33              : pub use pageserver_api::models::virtual_file as api;
      34              : pub(crate) mod io_engine;
      35              : pub use io_engine::feature_test as io_engine_feature_test;
      36              : pub use io_engine::io_engine_for_bench;
      37              : pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
      38              : mod metadata;
      39              : mod open_options;
      40              : use self::owned_buffers_io::write::OwnedAsyncWriter;
      41              : pub(crate) use api::DirectIoMode;
      42              : pub(crate) use io_engine::IoEngineKind;
      43              : pub(crate) use metadata::Metadata;
      44              : pub(crate) use open_options::*;
      45              : 
      46              : pub(crate) mod owned_buffers_io {
      47              :     //! Abstractions for IO with owned buffers.
      48              :     //!
      49              :     //! Not actually tied to [`crate::virtual_file`] specifically, but it's the primary
      50              :     //! reason we need this abstraction.
      51              :     //!
      52              :     //! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`,
      53              :     //! but for the time being we're proving out the primitives in the neon.git repo
      54              :     //! for faster iteration.
      55              : 
      56              :     pub(crate) mod io_buf_ext;
      57              :     pub(crate) mod slice;
      58              :     pub(crate) mod write;
      59              :     pub(crate) mod util {
      60              :         pub(crate) mod size_tracking_writer;
      61              :     }
      62              : }
      63              : 
      64              : ///
      65              : /// A virtual file descriptor. You can use this just like std::fs::File, but internally
      66              : /// the underlying file is closed if the system is low on file descriptors,
      67              : /// and re-opened when it's accessed again.
      68              : ///
      69              : /// Like with std::fs::File, multiple threads can read/write the file concurrently,
      70              : /// holding just a shared reference to the same VirtualFile, using the read_at() / write_at()
      71              : /// functions from the FileExt trait. But the functions from the Read/Write/Seek traits
      72              : /// require a mutable reference, because they modify the "current position".
      73              : ///
      74              : /// Each VirtualFile has a physical file descriptor in the global OPEN_FILES array, at the
      75              : /// slot that 'handle' points to, if the underlying file is currently open. If it's not
      76              : /// currently open, the 'handle' can still point to the slot where it was last kept. The
      77              : /// 'tag' field is used to detect whether the handle still is valid or not.
      78              : ///
      79              : #[derive(Debug)]
      80              : pub struct VirtualFile {
      81              :     /// Lazy handle to the global file descriptor cache. The slot that this points to
      82              :     /// might contain our File, or it may be empty, or it may contain a File that
      83              :     /// belongs to a different VirtualFile.
      84              :     handle: RwLock<SlotHandle>,
      85              : 
      86              :     /// Current file position
      87              :     pos: u64,
      88              : 
      89              :     /// File path and options to use to open it.
      90              :     ///
      91              :     /// Note: this only contains the options needed to re-open it. For example,
      92              :     /// if a new file is created, we only pass the create flag when it's initially
      93              :     /// opened, in the VirtualFile::create() function, and strip the flag before
      94              :     /// storing it here.
      95              :     pub path: Utf8PathBuf,
      96              :     open_options: OpenOptions,
      97              : 
      98              :     // These are strings because we only use them for metrics, and those expect strings.
      99              :     // It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
     100              :     // strings.
     101              :     tenant_id: String,
     102              :     shard_id: String,
     103              :     timeline_id: String,
     104              : }
     105              : 
     106              : #[derive(Debug, PartialEq, Clone, Copy)]
     107              : struct SlotHandle {
     108              :     /// Index into OPEN_FILES.slots of the slot this handle refers to.
     109              :     index: usize,
     110              : 
     111              :     /// Value of 'tag' in the slot. If slot's tag doesn't match, then the slot has
     112              :     /// been recycled and no longer contains the FD for this virtual file.
     113              :     tag: u64,
     114              : }
     115              : 
     116              : /// OPEN_FILES is the global array that holds the physical file descriptors that
     117              : /// are currently open. Each slot in the array is protected by a separate lock,
     118              : /// so that different files can be accessed independently. The lock must be held
     119              : /// in write mode to replace the slot with a different file, but read mode
     120              : /// is enough to operate on the file, whether you're reading or writing to it.
     121              : ///
     122              : /// OPEN_FILES starts in uninitialized state, and it's initialized by
     123              : /// the virtual_file::init() function. It must be called exactly once at page
     124              : /// server startup.
     125              : static OPEN_FILES: OnceCell<OpenFiles> = OnceCell::new();
     126              : 
     127              : struct OpenFiles {
     128              :     slots: &'static [Slot],
     129              : 
     130              :     /// Clock arm for the clock algorithm: an ever-increasing counter, used modulo `slots.len()`.
     131              :     next: AtomicUsize,
     132              : }
     133              : 
     134              : struct Slot {
     135              :     inner: RwLock<SlotInner>,
     136              : 
     137              :     /// Has this slot been used since the last clock sweep? Cleared by the sweep in `find_victim_slot`.
     138              :     recently_used: AtomicBool,
     139              : }
     140              : 
     141              : struct SlotInner {
     142              :     /// Counter that's incremented every time a different file is stored here.
     143              :     /// To avoid the ABA problem.
     144              :     tag: u64,
     145              : 
     146              :     /// The underlying file descriptor, or `None` if the slot currently holds no open file.
     147              :     file: Option<OwnedFd>,
     148              : }
     149              : 
     150              : /// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
     151              : struct PageWriteGuardBuf {
     152              :     page: PageWriteGuard<'static>,
     153              : }
     154              : // SAFETY: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
     155              : // and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
     156              : // Page cache pages are zero-initialized, so, wrt uninitialized memory we're good.
     157              : // (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.)
     158              : unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
     159       385525 :     fn stable_ptr(&self) -> *const u8 {
     160       385525 :         self.page.as_ptr()
     161       385525 :     }
     162       722982 :     fn bytes_init(&self) -> usize {
     163       722982 :         self.page.len()
     164       722982 :     }
     165       289389 :     fn bytes_total(&self) -> usize {
     166       289389 :         self.page.len()
     167       289389 :     }
     168              : }
     169              : // SAFETY: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access,
     170              : // hence it's safe to hand out the `stable_mut_ptr()`.
     171              : unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
     172       144531 :     fn stable_mut_ptr(&mut self) -> *mut u8 {
     173       144531 :         self.page.as_mut_ptr()
     174       144531 :     }
     175              : 
     176        96463 :     unsafe fn set_init(&mut self, pos: usize) {
     177        96463 :         // No state to update since bytes_init() == bytes_total(); we only sanity-check that `pos` is in bounds.
     178        96463 :         assert!(pos <= self.page.len());
     179        96463 :     }
     180              : }
     181              : 
     182              : impl OpenFiles {
     183              :     /// Find a slot to use, evicting an existing file descriptor if needed.
     184              :     ///
     185              :     /// On return, we hold a write lock on the slot, its 'tag' has been bumped, and
     186              :     /// recently_used has been set. It's all ready for reuse.
     187       589777 :     async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
     188       589777 :         //
     189       589777 :         // Run the clock algorithm to find a slot to replace.
     190       589777 :         //
     191       589777 :         let num_slots = self.slots.len();
     192       589777 :         let mut retries = 0;
     193              :         let mut slot;
     194              :         let mut slot_guard;
     195              :         let index;
     196              :         loop {
     197      7958170 :             let next = self.next.fetch_add(1, Ordering::AcqRel) % num_slots;
     198      7958170 :             slot = &self.slots[next];
     199      7958170 : 
     200      7958170 :             // If the recently_used flag on this slot is set, clear it and continue
     201      7958170 :             // the clock sweep. Otherwise try to use this slot. If we cannot acquire
     202      7958170 :             // the lock, also continue the clock sweep.
     203      7958170 :             //
     204      7958170 :             // We only continue in this manner for a while, though. If we loop
     205      7958170 :             // through the array twice without finding a victim, just pick the
     206      7958170 :             // next slot and wait until we can reuse it. This way, we avoid
     207      7958170 :             // spinning in the extreme case that all the slots are busy with an
     208      7958170 :             // I/O operation.
     209      7958170 :             if retries < num_slots * 2 {
     210      7648192 :                 if !slot.recently_used.swap(false, Ordering::Release) {
     211      7021149 :                     if let Ok(guard) = slot.inner.try_write() {
     212       279799 :                         slot_guard = guard;
     213       279799 :                         index = next;
     214       279799 :                         break;
     215      6741350 :                     }
     216       627043 :                 }
     217      7368393 :                 retries += 1;
     218              :             } else {
     219       309978 :                 slot_guard = slot.inner.write().await;
     220       309978 :                 index = next;
     221       309978 :                 break;
     222              :             }
     223              :         }
     224              : 
     225              :         //
     226              :         // We now have the victim slot locked. If it was in use previously, close the
     227              :         // old file.
     228              :         //
     229       589777 :         if let Some(old_file) = slot_guard.file.take() {
     230       575556 :             // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
     231       575556 :             // distinguish the two.
     232       575556 :             STORAGE_IO_TIME_METRIC
     233       575556 :                 .get(StorageIoOperation::CloseByReplace)
     234       575556 :                 .observe_closure_duration(|| drop(old_file));
     235       575556 :         }
     236              : 
     237              :         // Prepare the slot for reuse and return it
     238       589777 :         slot_guard.tag += 1;
     239       589777 :         slot.recently_used.store(true, Ordering::Relaxed);
     240       589777 :         (
     241       589777 :             SlotHandle {
     242       589777 :                 index,
     243       589777 :                 tag: slot_guard.tag,
     244       589777 :             },
     245       589777 :             slot_guard,
     246       589777 :         )
     247       589777 :     }
     248              : }
     249              : 
     250              : /// Identify error types that should always terminate the process.  Other
     251              : /// error types may be eligible for retry.
     252            6 : pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
     253              :     use nix::errno::Errno::*;
     254            6 :     match e.raw_os_error().map(nix::errno::from_i32) {
     255              :         Some(EIO) => {
     256              :             // Terminate on EIO because we no longer trust the device to store
     257              :             // data safely, or to uphold persistence guarantees on fsync.
     258            0 :             true
     259              :         }
     260              :         Some(EROFS) => {
     261              :             // Terminate on EROFS because a filesystem is usually remounted
     262              :             // readonly when it has experienced some critical issue, so the same
     263              :             // logic as EIO applies.
     264            0 :             true
     265              :         }
     266              :         Some(EACCES) => {
     267              :             // Terminate on EACCES because we should always have permissions
     268              :             // for our own data dir: if we don't, then we can't do our job and
     269              :             // need administrative intervention to fix permissions.  Terminating
     270              :             // is the best way to make sure we stop cleanly rather than going
     271              :             // into infinite retry loops, and will make it clear to the outside
     272              :             // world that we need help.
     273            0 :             true
     274              :         }
     275              :         _ => {
     276              :             // Treat all other local file I/O errors as retryable.  This includes:
     277              :             // - ENOSPC: we stay up and wait for eviction to free some space
     278              :             // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
     279              :             // - WriteZero, Interrupted: these are used internally by VirtualFile
     280            6 :             false
     281              :         }
     282              :     }
     283            6 : }
     284              : 
     285              : /// Call this when the local filesystem gives us an error with an external
     286              : /// cause: this includes EIO, EROFS, and EACCES: all these indicate either
     287              : /// bad storage or bad configuration, and we can't fix that from inside
     288              : /// a running process. Logs the error together with `context`, then aborts; never returns.
     289            0 : pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
     290            0 :     tracing::error!("Fatal I/O error: {e}: {context}");
     291            0 :     std::process::abort();
     292              : }
     293              : 
     294              : pub(crate) trait MaybeFatalIo<T> { // Helpers that terminate the process on fatal I/O errors.
     295              :     fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>; // `context` is included in the fatal log message
     296              :     fn fatal_err(self, context: &str) -> T; // `context` is included in the fatal log message
     297              : }
     298              : 
     299              : impl<T> MaybeFatalIo<T> for std::io::Result<T> {
     300              :     /// Terminate the process if the result is an error of a fatal type, else pass it through
     301              :     ///
     302              :     /// This is appropriate for writes, where we typically want to die on EIO/EACCES etc, but
     303              :     /// not on ENOSPC.
     304      3407577 :     fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
     305      3407577 :         if let Err(e) = &self {
     306            6 :             if is_fatal_io_error(e) {
     307            0 :                 on_fatal_io_error(e, context);
     308            6 :             }
     309      3407571 :         }
     310      3407577 :         self
     311      3407577 :     }
     312              : 
     313              :     /// Terminate the process on any I/O error.
     314              :     ///
     315              :     /// This is appropriate for reads on files that we know exist: they should always work.
     316         6114 :     fn fatal_err(self, context: &str) -> T {
     317         6114 :         match self {
     318         6114 :             Ok(v) => v,
     319            0 :             Err(e) => {
     320            0 :                 on_fatal_io_error(&e, context);
     321              :             }
     322              :         }
     323         6114 :     }
     324              : }
     325              : 
     326              : /// Observe duration for the given storage I/O operation
     327              : ///
     328              : /// Unlike `observe_closure_duration`, this supports async,
     329              : /// where "support" means that we measure wall clock time. An early return from the body (e.g. `?`) skips the observation.
     330              : macro_rules! observe_duration {
     331              :     ($op:expr, $($body:tt)*) => {{
     332              :         let instant = Instant::now();
     333              :         let result = $($body)*;
     334              :         let elapsed = instant.elapsed().as_secs_f64();
     335              :         STORAGE_IO_TIME_METRIC
     336              :             .get($op)
     337              :             .observe(elapsed);
     338              :         result
     339              :     }}
     340              : }
     341              : 
     342              : macro_rules! with_file { // Lock the file via `$this.lock_file()`, bind the guard to `$ident`, and time `$body` as `$op`.
     343              :     ($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
     344              :         let $ident = $this.lock_file().await?; // `?`: a lock_file() error returns from the enclosing fn; locking is not timed
     345              :         observe_duration!($op, $($body)*)
     346              :     }};
     347              :     ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{ // same, but with a mutable guard binding
     348              :         let mut $ident = $this.lock_file().await?;
     349              :         observe_duration!($op, $($body)*)
     350              :     }};
     351              : }
     352              : 
     353              : impl VirtualFile {
     354              :     /// Open a file in read-only mode, like File::open. Delegates to [`Self::open_with_options`].
     355         6594 :     pub async fn open<P: AsRef<Utf8Path>>(
     356         6594 :         path: P,
     357         6594 :         ctx: &RequestContext,
     358         6594 :     ) -> Result<VirtualFile, std::io::Error> {
     359         6594 :         Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
     360         6594 :     }
     361              : 
     362              :     /// Create a new file for writing. If the file exists, it will be truncated.
     363              :     /// Like File::create. Delegates to [`Self::open_with_options`].
     364         4371 :     pub async fn create<P: AsRef<Utf8Path>>(
     365         4371 :         path: P,
     366         4371 :         ctx: &RequestContext,
     367         4371 :     ) -> Result<VirtualFile, std::io::Error> {
     368         4371 :         Self::open_with_options(
     369         4371 :             path.as_ref(),
     370         4371 :             OpenOptions::new().write(true).create(true).truncate(true),
     371         4371 :             ctx,
     372         4371 :         )
     373         2254 :         .await
     374         4371 :     }
     375              : 
     376              :     /// Open a file with given options.
     377              :     ///
     378              :     /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
     379              :     /// they will be applied also when the file is subsequently re-opened, not only
     380              :     /// on the first time. Make sure that's sane!
     381        17517 :     pub async fn open_with_options<P: AsRef<Utf8Path>>(
     382        17517 :         path: P,
     383        17517 :         open_options: &OpenOptions,
     384        17517 :         _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
     385        17517 :     ) -> Result<VirtualFile, std::io::Error> {
     386        17517 :         let path_ref = path.as_ref();
     387        17517 :         let path_str = path_ref.to_string();
     388        17517 :         let parts = path_str.split('/').collect::<Vec<&str>>();
     389        17517 :         let (tenant_id, shard_id, timeline_id) = // metric labels, parsed from `.../<TENANTS_SEGMENT_NAME>/<tenant_shard_id>/<unchecked>/<timeline_id>/<file>`
     390        17517 :             if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
     391        13035 :                 let tenant_shard_part = parts[parts.len() - 4];
     392        13035 :                 let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
     393        13035 :                     Ok(tenant_shard_id) => (
     394        13035 :                         tenant_shard_id.tenant_id.to_string(),
     395        13035 :                         format!("{}", tenant_shard_id.shard_slug()),
     396        13035 :                     ),
     397              :                     Err(_) => {
     398              :                         // Malformed path: this ID is just for observability, so tolerate it
     399              :                         // and pass through
     400            0 :                         (tenant_shard_part.to_string(), "*".to_string())
     401              :                     }
     402              :                 };
     403        13035 :                 (tenant_id, shard_id, parts[parts.len() - 2].to_string())
     404              :             } else {
     405         4482 :                 ("*".to_string(), "*".to_string(), "*".to_string())
     406              :             };
     407        17517 :         let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
     408              : 
     409              :         // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
     410              :         // where our caller doesn't get to use the returned VirtualFile before its
     411              :         // slot gets re-used by someone else.
     412        17517 :         let file = observe_duration!(StorageIoOperation::Open, {
     413        17517 :             open_options.open(path_ref.as_std_path()).await?
     414              :         });
     415              : 
     416              :         // Strip the create, create_new and truncate options, so that re-opening
     417              :         // the file later neither creates nor truncates it.
     418              :         //
     419              :         // It would perhaps be nicer to check just for the read and write flags
     420              :         // explicitly, but OpenOptions doesn't contain any functions to read flags,
     421        17517 :         let mut reopen_options = open_options.clone(); // (... only functions to set them.)
     422        17517 :         reopen_options.create(false);
     423        17517 :         reopen_options.create_new(false);
     424        17517 :         reopen_options.truncate(false);
     425        17517 : 
     426        17517 :         let vfile = VirtualFile {
     427        17517 :             handle: RwLock::new(handle),
     428        17517 :             pos: 0,
     429        17517 :             path: path_ref.to_path_buf(),
     430        17517 :             open_options: reopen_options,
     431        17517 :             tenant_id,
     432        17517 :             shard_id,
     433        17517 :             timeline_id,
     434        17517 :         };
     435        17517 : 
     436        17517 :         // TODO: Under pressure, it's likely the slot will get re-used and
     437        17517 :         // the underlying file closed before the caller gets around to using it.
     438        17517 :         // => https://github.com/neondatabase/neon/issues/6065
     439        17517 :         slot_guard.file.replace(file);
     440        17517 : 
     441        17517 :         Ok(vfile)
     442        17517 :     }
     443              : 
     444              :     /// Async version of [`::utils::crashsafe::overwrite`].
     445              :     ///
     446              :     /// # NB:
     447              :     ///
     448              :     /// Doesn't actually use the [`VirtualFile`] file descriptor cache, although
     449              :     /// it did at an earlier time.
     450              :     /// And it will use this module's [`io_engine`] in the near future, so, leaving it here.
     451           84 :     pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
     452           84 :         final_path: Utf8PathBuf,
     453           84 :         tmp_path: Utf8PathBuf,
     454           84 :         content: B,
     455           84 :     ) -> std::io::Result<()> {
     456           84 :         // TODO: use tokio_epoll_uring if configured as `io_engine`.
     457           84 :         // See https://github.com/neondatabase/neon/issues/6663
     458           84 : 
     459           84 :         tokio::task::spawn_blocking(move || {
     460           84 :             let slice_storage;
     461           84 :             let content_len = content.bytes_init(); // only the initialized prefix of the buffer is written
     462           84 :             let content = if content.bytes_init() > 0 {
     463           84 :                 slice_storage = Some(content.slice(0..content_len));
     464           84 :                 slice_storage.as_deref().expect("just set it to Some()")
     465              :             } else {
     466            0 :                 &[]
     467              :             };
     468           84 :             utils::crashsafe::overwrite(&final_path, &tmp_path, content)
     469           84 :         })
     470           84 :         .await
     471           84 :         .expect("blocking task is never aborted")
     472           84 :     }
     473              : 
     474              :     /// Call File::sync_all() on the underlying File, via the configured io_engine. Timed as `StorageIoOperation::Fsync`.
     475         8139 :     pub async fn sync_all(&self) -> Result<(), Error> {
     476         8139 :         with_file!(self, StorageIoOperation::Fsync, |file_guard| {
     477         8139 :             let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
     478         8139 :             res
     479              :         })
     480         8139 :     }
     481              : 
     482              :     /// Call File::sync_data() on the underlying File. NB: timed under the same `StorageIoOperation::Fsync` metric as `sync_all`.
     483            0 :     pub async fn sync_data(&self) -> Result<(), Error> {
     484            0 :         with_file!(self, StorageIoOperation::Fsync, |file_guard| {
     485            0 :             let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
     486            0 :             res
     487              :         })
     488            0 :     }
     489              : 
     490         5130 :     pub async fn metadata(&self) -> Result<Metadata, Error> { // Query metadata of the underlying file via the configured io_engine.
     491         5130 :         with_file!(self, StorageIoOperation::Metadata, |file_guard| {
     492         5130 :             let (_file_guard, res) = io_engine::get().metadata(file_guard).await;
     493         5130 :             res
     494              :         })
     495         5130 :     }
     496              : 
     497              :     /// Helper function internal to `VirtualFile` that looks up the underlying File,
     498              :     /// opens it and evicts some other File if necessary. The passed parameter is
     499              :     /// assumed to be a function available for the physical `File`.
     500              :     ///
     501              :     /// We are doing it via a macro as Rust doesn't support async closures that
     502              :     /// take on parameters with lifetimes.
     503      5707256 :     async fn lock_file(&self) -> Result<FileGuard, Error> {
     504      5707256 :         let open_files = get_open_files();
     505              : 
     506       572260 :         let mut handle_guard = {
     507              :             // Read the cached slot handle, and see if the slot that it points to still
     508              :             // contains our File.
     509              :             //
     510              :             // We only need to hold the handle lock while we read the current handle. If
     511              :             // another thread closes the file and recycles the slot for a different file,
     512              :             // we will notice that the handle we read is no longer valid and retry.
     513      5707256 :             let mut handle = *self.handle.read().await;
     514              :             loop {
     515              :                 // Check if the slot contains our File
     516              :                 {
     517      6024445 :                     let slot = &open_files.slots[handle.index];
     518      6024445 :                     let slot_guard = slot.inner.read().await;
     519      6024445 :                     if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
     520              :                         // Found a cached file descriptor.
     521      5134996 :                         slot.recently_used.store(true, Ordering::Relaxed);
     522      5134996 :                         return Ok(FileGuard { slot_guard });
     523       889449 :                     }
     524              :                 }
     525              : 
     526              :                 // The slot didn't contain our File. We will have to open it ourselves,
     527              :                 // but before that, grab a write lock on handle in the VirtualFile, so
     528              :                 // that no other thread will try to concurrently open the same file.
     529       889449 :                 let handle_guard = self.handle.write().await;
     530              : 
     531              :                 // If another thread changed the handle while we were not holding the lock,
     532              :                 // then the handle might now be valid again. Loop back to retry.
     533       889449 :                 if *handle_guard != handle {
     534       317189 :                     handle = *handle_guard;
     535       317189 :                     continue;
     536       572260 :                 }
     537       572260 :                 break handle_guard;
     538              :             }
     539              :         };
     540              : 
     541              :         // We need to open the file ourselves. The handle in the VirtualFile is
     542              :         // now locked in write-mode. Find a free slot to put it in.
     543       572260 :         let (handle, mut slot_guard) = open_files.find_victim_slot().await;
     544              : 
     545              :         // Re-open the physical file.
     546              :         // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
     547              :         // case from StorageIoOperation::Open. This helps with identifying thrashing
     548              :         // of the virtual file descriptor cache.
     549       572260 :         let file = observe_duration!(StorageIoOperation::OpenAfterReplace, {
     550       572260 :             self.open_options.open(self.path.as_std_path()).await?
     551              :         });
     552              : 
     553              :         // Store the File in the slot and update the handle in the VirtualFile
     554              :         // to point to it.
     555       572260 :         slot_guard.file.replace(file);
     556       572260 : 
     557       572260 :         *handle_guard = handle;
     558       572260 : 
     559       572260 :         return Ok(FileGuard {
     560       572260 :             slot_guard: slot_guard.downgrade(),
     561       572260 :         });
     562      5707256 :     }
     563              : 
     564          612 :     pub fn remove(self) {
     565          612 :         let path = self.path.clone();
     566          612 :         drop(self);
     567          612 :         std::fs::remove_file(path).expect("failed to remove the virtual file");
     568          612 :     }
     569              : 
     570        16038 :     pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
     571        16038 :         match pos {
     572        16008 :             SeekFrom::Start(offset) => {
     573        16008 :                 self.pos = offset;
     574        16008 :             }
     575           12 :             SeekFrom::End(offset) => {
     576           12 :                 self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard
     577           12 :                     .with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))?
     578              :             }
     579           18 :             SeekFrom::Current(offset) => {
     580           18 :                 let pos = self.pos as i128 + offset as i128;
     581           18 :                 if pos < 0 {
     582            6 :                     return Err(Error::new(
     583            6 :                         ErrorKind::InvalidInput,
     584            6 :                         "offset would be negative",
     585            6 :                     ));
     586           12 :                 }
     587           12 :                 if pos > u64::MAX as i128 {
     588            0 :                     return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
     589           12 :                 }
     590           12 :                 self.pos = pos as u64;
     591              :             }
     592              :         }
     593        16026 :         Ok(self.pos)
     594        16038 :     }
     595              : 
     596              :     /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`.
     597              :     ///
     598              :     /// The returned `Slice<Buf>` is equivalent to the input `slice`, i.e., it's the same view into the same buffer.
     599      2285132 :     pub async fn read_exact_at<Buf>(
     600      2285132 :         &self,
     601      2285132 :         slice: Slice<Buf>,
     602      2285132 :         offset: u64,
     603      2285132 :         ctx: &RequestContext,
     604      2285132 :     ) -> Result<Slice<Buf>, Error>
     605      2285132 :     where
     606      2285132 :         Buf: IoBufMut + Send,
     607      2285132 :     {
     608      2285132 :         let assert_we_return_original_bounds = if cfg!(debug_assertions) {
     609      2285132 :             Some((slice.stable_ptr() as usize, slice.bytes_total()))
     610              :         } else {
     611            0 :             None
     612              :         };
     613              : 
     614      2285132 :         let original_bounds = slice.bounds();
     615      2285132 :         let (buf, res) =
     616      2813739 :             read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await;
     617      2285132 :         let res = res.map(|_| buf.slice(original_bounds));
     618              : 
     619      2285132 :         if let Some(original_bounds) = assert_we_return_original_bounds {
     620      2285132 :             if let Ok(slice) = &res {
     621      2285132 :                 let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total());
     622      2285132 :                 assert_eq!(original_bounds, returned_bounds);
     623            0 :             }
     624            0 :         }
     625              : 
     626      2285132 :         res
     627      2285132 :     }
     628              : 
     629              :     /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
     630        96463 :     pub async fn read_exact_at_page(
     631        96463 :         &self,
     632        96463 :         page: PageWriteGuard<'static>,
     633        96463 :         offset: u64,
     634        96463 :         ctx: &RequestContext,
     635        96463 :     ) -> Result<PageWriteGuard<'static>, Error> {
     636        96463 :         let buf = PageWriteGuardBuf { page }.slice_full();
     637        96463 :         debug_assert_eq!(buf.bytes_total(), PAGE_SZ);
     638        96463 :         self.read_exact_at(buf, offset, ctx)
     639        66016 :             .await
     640        96463 :             .map(|slice| slice.into_inner().page)
     641        96463 :     }
     642              : 
     643              :     // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
     644           12 :     pub async fn write_all_at<Buf: IoBuf + Send>(
     645           12 :         &self,
     646           12 :         buf: FullSlice<Buf>,
     647           12 :         mut offset: u64,
     648           12 :         ctx: &RequestContext,
     649           12 :     ) -> (FullSlice<Buf>, Result<(), Error>) {
     650           12 :         let buf = buf.into_raw_slice();
     651           12 :         let bounds = buf.bounds();
     652           12 :         let restore =
     653           12 :             |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds));
     654           12 :         let mut buf = buf;
     655           24 :         while !buf.is_empty() {
     656           12 :             let (tmp, res) = self.write_at(FullSlice::must_new(buf), offset, ctx).await;
     657           12 :             buf = tmp.into_raw_slice();
     658            0 :             match res {
     659              :                 Ok(0) => {
     660            0 :                     return (
     661            0 :                         restore(buf),
     662            0 :                         Err(Error::new(
     663            0 :                             std::io::ErrorKind::WriteZero,
     664            0 :                             "failed to write whole buffer",
     665            0 :                         )),
     666            0 :                     );
     667              :                 }
     668           12 :                 Ok(n) => {
     669           12 :                     buf = buf.slice(n..);
     670           12 :                     offset += n as u64;
     671           12 :                 }
     672            0 :                 Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
     673            0 :                 Err(e) => return (restore(buf), Err(e)),
     674              :             }
     675              :         }
     676           12 :         (restore(buf), Ok(()))
     677           12 :     }
     678              : 
     679              :     /// Writes `buf` to the file at the current offset.
     680              :     ///
     681              :     /// Panics if there is an uninitialized range in `buf`, as that is most likely a bug in the caller.
     682      3407613 :     pub async fn write_all<Buf: IoBuf + Send>(
     683      3407613 :         &mut self,
     684      3407613 :         buf: FullSlice<Buf>,
     685      3407613 :         ctx: &RequestContext,
     686      3407613 :     ) -> (FullSlice<Buf>, Result<usize, Error>) {
     687      3407613 :         let buf = buf.into_raw_slice();
     688      3407613 :         let bounds = buf.bounds();
     689      3407613 :         let restore =
     690      3407613 :             |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds));
     691      3407613 :         let nbytes = buf.len();
     692      3407613 :         let mut buf = buf;
     693      6815106 :         while !buf.is_empty() {
     694      3407499 :             let (tmp, res) = self.write(FullSlice::must_new(buf), ctx).await;
     695      3407499 :             buf = tmp.into_raw_slice();
     696            6 :             match res {
     697              :                 Ok(0) => {
     698            0 :                     return (
     699            0 :                         restore(buf),
     700            0 :                         Err(Error::new(
     701            0 :                             std::io::ErrorKind::WriteZero,
     702            0 :                             "failed to write whole buffer",
     703            0 :                         )),
     704            0 :                     );
     705              :                 }
     706      3407493 :                 Ok(n) => {
     707      3407493 :                     buf = buf.slice(n..);
     708      3407493 :                 }
     709            6 :                 Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
     710            6 :                 Err(e) => return (restore(buf), Err(e)),
     711              :             }
     712              :         }
     713      3407607 :         (restore(buf), Ok(nbytes))
     714      3407613 :     }
     715              : 
     716      3407499 :     async fn write<B: IoBuf + Send>(
     717      3407499 :         &mut self,
     718      3407499 :         buf: FullSlice<B>,
     719      3407499 :         ctx: &RequestContext,
     720      3407499 :     ) -> (FullSlice<B>, Result<usize, std::io::Error>) {
     721      3407499 :         let pos = self.pos;
     722      3407499 :         let (buf, res) = self.write_at(buf, pos, ctx).await;
     723      3407499 :         let n = match res {
     724      3407493 :             Ok(n) => n,
     725            6 :             Err(e) => return (buf, Err(e)),
     726              :         };
     727      3407493 :         self.pos += n as u64;
     728      3407493 :         (buf, Ok(n))
     729      3407499 :     }
     730              : 
     731      2286464 :     pub(crate) async fn read_at<Buf>(
     732      2286464 :         &self,
     733      2286464 :         buf: tokio_epoll_uring::Slice<Buf>,
     734      2286464 :         offset: u64,
     735      2286464 :         _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
     736      2286464 :     ) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
     737      2286464 :     where
     738      2286464 :         Buf: tokio_epoll_uring::IoBufMut + Send,
     739      2286464 :     {
     740      2286464 :         let file_guard = match self.lock_file().await {
     741      2286464 :             Ok(file_guard) => file_guard,
     742            0 :             Err(e) => return (buf, Err(e)),
     743              :         };
     744              : 
     745      2286464 :         observe_duration!(StorageIoOperation::Read, {
     746      2286464 :             let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
     747      2286464 :             if let Ok(size) = res {
     748      2286458 :                 STORAGE_IO_SIZE
     749      2286458 :                     .with_label_values(&[
     750      2286458 :                         "read",
     751      2286458 :                         &self.tenant_id,
     752      2286458 :                         &self.shard_id,
     753      2286458 :                         &self.timeline_id,
     754      2286458 :                     ])
     755      2286458 :                     .add(size as i64);
     756      2286458 :             }
     757      2286464 :             (buf, res)
     758              :         })
     759      2286464 :     }
     760              : 
     761              :     /// The function aborts the process if the error is fatal.
     762      3407511 :     async fn write_at<B: IoBuf + Send>(
     763      3407511 :         &self,
     764      3407511 :         buf: FullSlice<B>,
     765      3407511 :         offset: u64,
     766      3407511 :         _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
     767      3407511 :     ) -> (FullSlice<B>, Result<usize, Error>) {
     768      3407511 :         let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
     769      3407511 :         let result = result.maybe_fatal_err("write_at");
     770      3407511 :         (slice, result)
     771      3407511 :     }
     772              : 
     773      3407511 :     async fn write_at_inner<B: IoBuf + Send>(
     774      3407511 :         &self,
     775      3407511 :         buf: FullSlice<B>,
     776      3407511 :         offset: u64,
     777      3407511 :         _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
     778      3407511 :     ) -> (FullSlice<B>, Result<usize, Error>) {
     779      3407511 :         let file_guard = match self.lock_file().await {
     780      3407511 :             Ok(file_guard) => file_guard,
     781            0 :             Err(e) => return (buf, Err(e)),
     782              :         };
     783      3407511 :         observe_duration!(StorageIoOperation::Write, {
     784      3407511 :             let ((_file_guard, buf), result) =
     785      3407511 :                 io_engine::get().write_at(file_guard, offset, buf).await;
     786      3407511 :             if let Ok(size) = result {
     787      3407505 :                 STORAGE_IO_SIZE
     788      3407505 :                     .with_label_values(&[
     789      3407505 :                         "write",
     790      3407505 :                         &self.tenant_id,
     791      3407505 :                         &self.shard_id,
     792      3407505 :                         &self.timeline_id,
     793      3407505 :                     ])
     794      3407505 :                     .add(size as i64);
     795      3407505 :             }
     796      3407511 :             (buf, result)
     797              :         })
     798      3407511 :     }
     799              : }
     800              : 
     801              : // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
     802      2285156 : pub async fn read_exact_at_impl<Buf, F, Fut>(
     803      2285156 :     mut buf: tokio_epoll_uring::Slice<Buf>,
     804      2285156 :     mut offset: u64,
     805      2285156 :     mut read_at: F,
     806      2285156 : ) -> (Buf, std::io::Result<()>)
     807      2285156 : where
     808      2285156 :     Buf: IoBufMut + Send,
     809      2285156 :     F: FnMut(tokio_epoll_uring::Slice<Buf>, u64) -> Fut,
     810      2285156 :     Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<Buf>, std::io::Result<usize>)>,
     811      2285156 : {
     812      4570318 :     while buf.bytes_total() != 0 {
     813              :         let res;
     814      2813739 :         (buf, res) = read_at(buf, offset).await;
     815            0 :         match res {
     816            6 :             Ok(0) => break,
     817      2285162 :             Ok(n) => {
     818      2285162 :                 buf = buf.slice(n..);
     819      2285162 :                 offset += n as u64;
     820      2285162 :             }
     821            0 :             Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
     822            0 :             Err(e) => return (buf.into_inner(), Err(e)),
     823              :         }
     824              :     }
     825              :     // NB: don't use `buf.is_empty()` here; it is from the
     826              :     // `impl Deref for Slice { Target = [u8] }`; the &[u8]
     827              :     // returned by it only covers the initialized portion of `buf`.
     828              :     // Whereas we're interested in ensuring that we filled the entire
     829              :     // buffer that the user passed in.
     830      2285156 :     if buf.bytes_total() != 0 {
     831            6 :         (
     832            6 :             buf.into_inner(),
     833            6 :             Err(std::io::Error::new(
     834            6 :                 std::io::ErrorKind::UnexpectedEof,
     835            6 :                 "failed to fill whole buffer",
     836            6 :             )),
     837            6 :         )
     838              :     } else {
     839      2285150 :         assert_eq!(buf.len(), buf.bytes_total());
     840      2285150 :         (buf.into_inner(), Ok(()))
     841              :     }
     842      2285156 : }
     843              : 
     844              : #[cfg(test)]
     845              : mod test_read_exact_at_impl {
     846              : 
     847              :     use std::{collections::VecDeque, sync::Arc};
     848              : 
     849              :     use tokio_epoll_uring::{BoundedBuf, BoundedBufMut};
     850              : 
     851              :     use super::read_exact_at_impl;
     852              : 
     853              :     struct Expectation {
     854              :         offset: u64,
     855              :         bytes_total: usize,
     856              :         result: std::io::Result<Vec<u8>>,
     857              :     }
     858              :     struct MockReadAt {
     859              :         expectations: VecDeque<Expectation>,
     860              :     }
     861              : 
     862              :     impl MockReadAt {
     863           36 :         async fn read_at(
     864           36 :             &mut self,
     865           36 :             mut buf: tokio_epoll_uring::Slice<Vec<u8>>,
     866           36 :             offset: u64,
     867           36 :         ) -> (tokio_epoll_uring::Slice<Vec<u8>>, std::io::Result<usize>) {
     868           36 :             let exp = self
     869           36 :                 .expectations
     870           36 :                 .pop_front()
     871           36 :                 .expect("read_at called but we have no expectations left");
     872           36 :             assert_eq!(exp.offset, offset);
     873           36 :             assert_eq!(exp.bytes_total, buf.bytes_total());
     874           36 :             match exp.result {
     875           36 :                 Ok(bytes) => {
     876           36 :                     assert!(bytes.len() <= buf.bytes_total());
     877           36 :                     buf.put_slice(&bytes);
     878           36 :                     (buf, Ok(bytes.len()))
     879              :                 }
     880            0 :                 Err(e) => (buf, Err(e)),
     881              :             }
     882           36 :         }
     883              :     }
     884              : 
     885              :     impl Drop for MockReadAt {
     886           24 :         fn drop(&mut self) {
     887           24 :             assert_eq!(self.expectations.len(), 0);
     888           24 :         }
     889              :     }
     890              : 
     891              :     #[tokio::test]
     892            6 :     async fn test_basic() {
     893            6 :         let buf = Vec::with_capacity(5).slice_full();
     894            6 :         let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
     895            6 :             expectations: VecDeque::from(vec![Expectation {
     896            6 :                 offset: 0,
     897            6 :                 bytes_total: 5,
     898            6 :                 result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
     899            6 :             }]),
     900            6 :         }));
     901            6 :         let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
     902            6 :             let mock_read_at = Arc::clone(&mock_read_at);
     903            6 :             async move { mock_read_at.lock().await.read_at(buf, offset).await }
     904            6 :         })
     905            6 :         .await;
     906            6 :         assert!(res.is_ok());
     907            6 :         assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
     908            6 :     }
     909              : 
     910              :     #[tokio::test]
     911            6 :     async fn test_empty_buf_issues_no_syscall() {
     912            6 :         let buf = Vec::new().slice_full();
     913            6 :         let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
     914            6 :             expectations: VecDeque::new(),
     915            6 :         }));
     916            6 :         let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
     917            0 :             let mock_read_at = Arc::clone(&mock_read_at);
     918            6 :             async move { mock_read_at.lock().await.read_at(buf, offset).await }
     919            6 :         })
     920            6 :         .await;
     921            6 :         assert!(res.is_ok());
     922            6 :     }
     923              : 
     924              :     #[tokio::test]
     925            6 :     async fn test_two_read_at_calls_needed_until_buf_filled() {
     926            6 :         let buf = Vec::with_capacity(4).slice_full();
     927            6 :         let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
     928            6 :             expectations: VecDeque::from(vec![
     929            6 :                 Expectation {
     930            6 :                     offset: 0,
     931            6 :                     bytes_total: 4,
     932            6 :                     result: Ok(vec![b'a', b'b']),
     933            6 :                 },
     934            6 :                 Expectation {
     935            6 :                     offset: 2,
     936            6 :                     bytes_total: 2,
     937            6 :                     result: Ok(vec![b'c', b'd']),
     938            6 :                 },
     939            6 :             ]),
     940            6 :         }));
     941           12 :         let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
     942           12 :             let mock_read_at = Arc::clone(&mock_read_at);
     943           12 :             async move { mock_read_at.lock().await.read_at(buf, offset).await }
     944           12 :         })
     945            6 :         .await;
     946            6 :         assert!(res.is_ok());
     947            6 :         assert_eq!(buf, vec![b'a', b'b', b'c', b'd']);
     948            6 :     }
     949              : 
     950              :     #[tokio::test]
     951            6 :     async fn test_eof_before_buffer_full() {
     952            6 :         let buf = Vec::with_capacity(3).slice_full();
     953            6 :         let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
     954            6 :             expectations: VecDeque::from(vec![
     955            6 :                 Expectation {
     956            6 :                     offset: 0,
     957            6 :                     bytes_total: 3,
     958            6 :                     result: Ok(vec![b'a']),
     959            6 :                 },
     960            6 :                 Expectation {
     961            6 :                     offset: 1,
     962            6 :                     bytes_total: 2,
     963            6 :                     result: Ok(vec![b'b']),
     964            6 :                 },
     965            6 :                 Expectation {
     966            6 :                     offset: 2,
     967            6 :                     bytes_total: 1,
     968            6 :                     result: Ok(vec![]),
     969            6 :                 },
     970            6 :             ]),
     971            6 :         }));
     972           18 :         let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
     973           18 :             let mock_read_at = Arc::clone(&mock_read_at);
     974           18 :             async move { mock_read_at.lock().await.read_at(buf, offset).await }
     975           18 :         })
     976            6 :         .await;
     977            6 :         let Err(err) = res else {
     978            6 :             panic!("should return an error");
     979            6 :         };
     980            6 :         assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof);
     981            6 :         assert_eq!(format!("{err}"), "failed to fill whole buffer");
     982            6 :         // buffer contents on error are unspecified
     983            6 :     }
     984              : }
     985              : 
     986              : struct FileGuard {
     987              :     slot_guard: RwLockReadGuard<'static, SlotInner>,
     988              : }
     989              : 
     990              : impl AsRef<OwnedFd> for FileGuard {
     991      5707256 :     fn as_ref(&self) -> &OwnedFd {
     992      5707256 :         // This unwrap is safe because we only create `FileGuard`s
     993      5707256 :         // if we know that the file is Some.
     994      5707256 :         self.slot_guard.file.as_ref().unwrap()
     995      5707256 :     }
     996              : }
     997              : 
     998              : impl FileGuard {
     999              :     /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
    1000      2853431 :     fn with_std_file<F, R>(&self, with: F) -> R
    1001      2853431 :     where
    1002      2853431 :         F: FnOnce(&File) -> R,
    1003      2853431 :     {
    1004      2853431 :         // SAFETY:
    1005      2853431 :         // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
    1006      2853431 :         // - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut`
    1007      2853431 :         let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
    1008      2853431 :         let res = with(&file);
    1009      2853431 :         let _ = file.into_raw_fd();
    1010      2853431 :         res
    1011      2853431 :     }
    1012              :     /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
    1013           12 :     fn with_std_file_mut<F, R>(&mut self, with: F) -> R
    1014           12 :     where
    1015           12 :         F: FnOnce(&mut File) -> R,
    1016           12 :     {
    1017           12 :         // SAFETY:
    1018           12 :         // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
    1019           12 :         // - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd
    1020           12 :         let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
    1021           12 :         let res = with(&mut file);
    1022           12 :         let _ = file.into_raw_fd();
    1023           12 :         res
    1024           12 :     }
    1025              : }
    1026              : 
    1027              : impl tokio_epoll_uring::IoFd for FileGuard {
    1028      2853813 :     unsafe fn as_fd(&self) -> RawFd {
    1029      2853813 :         let owned_fd: &OwnedFd = self.as_ref();
    1030      2853813 :         owned_fd.as_raw_fd()
    1031      2853813 :     }
    1032              : }
    1033              : 
    1034              : #[cfg(test)]
    1035              : impl VirtualFile {
    1036        62748 :     pub(crate) async fn read_blk(
    1037        62748 :         &self,
    1038        62748 :         blknum: u32,
    1039        62748 :         ctx: &RequestContext,
    1040        62748 :     ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
    1041              :         use crate::page_cache::PAGE_SZ;
    1042        62748 :         let slice = Vec::with_capacity(PAGE_SZ).slice_full();
    1043        62748 :         assert_eq!(slice.bytes_total(), PAGE_SZ);
    1044        62748 :         let slice = self
    1045        62748 :             .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx)
    1046        31857 :             .await?;
    1047        62748 :         Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner()))
    1048        62748 :     }
    1049              : 
    1050          672 :     async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
    1051          672 :         let mut tmp = vec![0; 128];
    1052              :         loop {
    1053         1332 :             let slice = tmp.slice(..128);
    1054         1332 :             let (slice, res) = self.read_at(slice, self.pos, ctx).await;
    1055            6 :             match res {
    1056          666 :                 Ok(0) => return Ok(()),
    1057          660 :                 Ok(n) => {
    1058          660 :                     self.pos += n as u64;
    1059          660 :                     buf.extend_from_slice(&slice[..n]);
    1060          660 :                 }
    1061            6 :                 Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
    1062            6 :                 Err(e) => return Err(e),
    1063              :             }
    1064          660 :             tmp = slice.into_inner();
    1065              :         }
    1066          672 :     }
    1067              : }
    1068              : 
    1069              : impl Drop for VirtualFile {
    1070              :     /// If a VirtualFile is dropped, close the underlying file if it was open.
    1071        15161 :     fn drop(&mut self) {
    1072        15161 :         let handle = self.handle.get_mut();
    1073              : 
    1074        15161 :         fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
    1075        15161 :             if slot_guard.tag == tag {
    1076        13481 :                 slot.recently_used.store(false, Ordering::Relaxed);
    1077              :                 // For comparison: there is also the operation "close-by-replace", for closes
    1078              :                 // done on eviction.
    1079        13481 :                 if let Some(fd) = slot_guard.file.take() {
    1080        13481 :                     STORAGE_IO_TIME_METRIC
    1081        13481 :                         .get(StorageIoOperation::Close)
    1082        13481 :                         .observe_closure_duration(|| drop(fd));
    1083        13481 :                 }
    1084         1680 :             }
    1085        15161 :         }
    1086              : 
    1087              :         // We don't have async drop so we cannot directly await the lock here.
    1088              :         // Instead, first do a best-effort attempt at closing the underlying
    1089              :         // file descriptor by using `try_write`, and if that fails, spawn
    1090              :         // a tokio task to do it asynchronously: we just want it to be
    1091              :         // cleaned up eventually.
    1092              :         // Most of the time, the `try_write` should succeed though,
    1093              :         // as we have `&mut self` access. In other words, if the slot
    1094              :         // is still occupied by our file, there should be no access from
    1095              :         // other I/O operations; the only other possible place to lock
    1096              :         // the slot is the clock algorithm looking for free slots.
    1097        15161 :         let slot = &get_open_files().slots[handle.index];
    1098        15161 :         if let Ok(slot_guard) = slot.inner.try_write() {
    1099        15161 :             clean_slot(slot, slot_guard, handle.tag);
    1100        15161 :         } else {
    1101            0 :             let tag = handle.tag;
    1102            0 :             tokio::spawn(async move {
    1103            0 :                 let slot_guard = slot.inner.write().await;
    1104            0 :                 clean_slot(slot, slot_guard, tag);
    1105            0 :             });
    1106            0 :         };
    1107        15161 :     }
    1108              : }
    1109              : 
    1110              : impl OwnedAsyncWriter for VirtualFile {
    1111              :     #[inline(always)]
    1112        19791 :     async fn write_all<Buf: IoBuf + Send>(
    1113        19791 :         &mut self,
    1114        19791 :         buf: FullSlice<Buf>,
    1115        19791 :         ctx: &RequestContext,
    1116        19791 :     ) -> std::io::Result<(usize, FullSlice<Buf>)> {
    1117        19791 :         let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
    1118        19791 :         res.map(move |v| (v, buf))
    1119        19791 :     }
    1120              : }
    1121              : 
    1122              : impl OpenFiles {
    1123          600 :     fn new(num_slots: usize) -> OpenFiles {
    1124          600 :         let mut slots = Box::new(Vec::with_capacity(num_slots));
    1125         6000 :         for _ in 0..num_slots {
    1126         6000 :             let slot = Slot {
    1127         6000 :                 recently_used: AtomicBool::new(false),
    1128         6000 :                 inner: RwLock::new(SlotInner { tag: 0, file: None }),
    1129         6000 :             };
    1130         6000 :             slots.push(slot);
    1131         6000 :         }
    1132              : 
    1133          600 :         OpenFiles {
    1134          600 :             next: AtomicUsize::new(0),
    1135          600 :             slots: Box::leak(slots),
    1136          600 :         }
    1137          600 :     }
    1138              : }
    1139              : 
    1140              : ///
    1141              : /// Initialize the virtual file module. This must be called once at page
    1142              : /// server startup.
    1143              : ///
    1144              : #[cfg(not(test))]
    1145            0 : pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) {
    1146            0 :     if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
    1147            0 :         panic!("virtual_file::init called twice");
    1148            0 :     }
    1149            0 :     if set_io_buffer_alignment(io_buffer_alignment).is_err() {
    1150            0 :         panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
    1151            0 :     }
    1152            0 :     io_engine::init(engine);
    1153            0 :     crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
    1154            0 : }
    1155              : 
    1156              : const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
    1157              : 
    1158              : // Get a handle to the global slots array.
    1159      5739934 : fn get_open_files() -> &'static OpenFiles {
    1160      5739934 :     //
    1161      5739934 :     // In unit tests, page server startup doesn't happen and no one calls
    1162      5739934 :     // virtual_file::init(). Initialize it here, with a small array.
    1163      5739934 :     //
    1164      5739934 :     // This applies to the virtual file tests below, but all other unit
    1165      5739934 :     // tests too, so the virtual file facility is always usable in
    1166      5739934 :     // unit tests.
    1167      5739934 :     //
    1168      5739934 :     if cfg!(test) {
    1169      5739934 :         OPEN_FILES.get_or_init(|| OpenFiles::new(TEST_MAX_FILE_DESCRIPTORS))
    1170              :     } else {
    1171            0 :         OPEN_FILES.get().expect("virtual_file::init not called yet")
    1172              :     }
    1173      5739934 : }
    1174              : 
    1175              : static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
    1176              : 
    1177              : /// Returns true if `x` is zero or a power of two.
    1178      1265282 : fn is_zero_or_power_of_two(x: usize) -> bool {
    1179      1265282 :     (x == 0) || ((x & (x - 1)) == 0)
    1180      1265282 : }
    1181              : 
    1182              : #[allow(unused)]
    1183            0 : pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
    1184            0 :     if is_zero_or_power_of_two(align) {
    1185            0 :         IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
    1186            0 :         Ok(())
    1187              :     } else {
    1188            0 :         Err(align)
    1189              :     }
    1190            0 : }
    1191              : 
    1192              : /// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
    1193              : ///
    1194              : /// This function should be used to check the raw config value.
    1195      1265282 : pub(crate) fn get_io_buffer_alignment_raw() -> usize {
    1196      1265282 :     let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
    1197      1265282 : 
    1198      1265282 :     if cfg!(test) {
    1199      1265282 :         let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
    1200      1265282 :         if let Some(test_align) = utils::env::var(env_var_name) {
    1201      1265282 :             if is_zero_or_power_of_two(test_align) {
    1202      1265282 :                 test_align
    1203              :             } else {
    1204            0 :                 panic!("IO buffer alignment ({test_align}) is not a power of two");
    1205              :             }
    1206              :         } else {
    1207            0 :             align
    1208              :         }
    1209              :     } else {
    1210            0 :         align
    1211              :     }
    1212      1265282 : }
    1213              : 
    1214              : /// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
    1215              : ///
    1216              : /// This function should be used for getting the actual alignment value to use.
    1217       624179 : pub(crate) fn get_io_buffer_alignment() -> usize {
    1218       624179 :     let align = get_io_buffer_alignment_raw();
    1219       624179 :     align.max(1)
    1220       624179 : }
    1221              : 
    1222              : #[cfg(test)]
    1223              : mod tests {
    1224              :     use crate::context::DownloadBehavior;
    1225              :     use crate::task_mgr::TaskKind;
    1226              : 
    1227              :     use super::*;
    1228              :     use owned_buffers_io::io_buf_ext::IoBufExt;
    1229              :     use owned_buffers_io::slice::SliceMutExt;
    1230              :     use rand::seq::SliceRandom;
    1231              :     use rand::thread_rng;
    1232              :     use rand::Rng;
    1233              :     use std::io::Write;
    1234              :     use std::os::unix::fs::FileExt;
    1235              :     use std::sync::Arc;
    1236              : 
    1237              :     enum MaybeVirtualFile {
    1238              :         VirtualFile(VirtualFile),
    1239              :         File(File),
    1240              :     }
    1241              : 
    1242              :     impl From<VirtualFile> for MaybeVirtualFile {
    1243           18 :         fn from(vf: VirtualFile) -> Self {
    1244           18 :             MaybeVirtualFile::VirtualFile(vf)
    1245           18 :         }
    1246              :     }
    1247              : 
    1248              :     impl MaybeVirtualFile {
    1249         1212 :         async fn read_exact_at(
    1250         1212 :             &self,
    1251         1212 :             mut slice: tokio_epoll_uring::Slice<Vec<u8>>,
    1252         1212 :             offset: u64,
    1253         1212 :             ctx: &RequestContext,
    1254         1212 :         ) -> Result<tokio_epoll_uring::Slice<Vec<u8>>, Error> {
    1255         1212 :             match self {
    1256          608 :                 MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await,
    1257          606 :                 MaybeVirtualFile::File(file) => {
    1258          606 :                     let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed();
    1259          606 :                     file.read_exact_at(rust_slice, offset).map(|()| slice)
    1260              :                 }
    1261              :             }
    1262         1212 :         }
    1263           24 :         async fn write_all_at<Buf: IoBuf + Send>(
    1264           24 :             &self,
    1265           24 :             buf: FullSlice<Buf>,
    1266           24 :             offset: u64,
    1267           24 :             ctx: &RequestContext,
    1268           24 :         ) -> Result<(), Error> {
    1269           24 :             match self {
    1270           12 :                 MaybeVirtualFile::VirtualFile(file) => {
    1271           12 :                     let (_buf, res) = file.write_all_at(buf, offset, ctx).await;
    1272           12 :                     res
    1273              :                 }
    1274           12 :                 MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset),
    1275              :             }
    1276           24 :         }
    1277          108 :         async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
    1278          108 :             match self {
    1279           54 :                 MaybeVirtualFile::VirtualFile(file) => file.seek(pos).await,
    1280           54 :                 MaybeVirtualFile::File(file) => file.seek(pos),
    1281              :             }
    1282          108 :         }
    1283           24 :         async fn write_all<Buf: IoBuf + Send>(
    1284           24 :             &mut self,
    1285           24 :             buf: FullSlice<Buf>,
    1286           24 :             ctx: &RequestContext,
    1287           24 :         ) -> Result<(), Error> {
    1288           24 :             match self {
    1289           12 :                 MaybeVirtualFile::VirtualFile(file) => {
    1290           12 :                     let (_buf, res) = file.write_all(buf, ctx).await;
    1291           12 :                     res.map(|_| ())
    1292              :                 }
    1293           12 :                 MaybeVirtualFile::File(file) => file.write_all(&buf[..]),
    1294              :             }
    1295           24 :         }
    1296              : 
    1297              :         // Helper function to slurp contents of a file, starting at the current position,
    1298              :         // into a string
    1299         1326 :         async fn read_string(&mut self, ctx: &RequestContext) -> Result<String, Error> {
    1300              :             use std::io::Read;
    1301         1326 :             let mut buf = String::new();
    1302         1326 :             match self {
    1303          672 :                 MaybeVirtualFile::VirtualFile(file) => {
    1304          672 :                     let mut buf = Vec::new();
    1305          678 :                     file.read_to_end(&mut buf, ctx).await?;
    1306          666 :                     return Ok(String::from_utf8(buf).unwrap());
    1307              :                 }
    1308          654 :                 MaybeVirtualFile::File(file) => {
    1309          654 :                     file.read_to_string(&mut buf)?;
    1310              :                 }
    1311              :             }
    1312          648 :             Ok(buf)
    1313         1326 :         }
    1314              : 
    1315              :         // Helper function to slurp a portion of a file into a string
    1316         1212 :         async fn read_string_at(
    1317         1212 :             &mut self,
    1318         1212 :             pos: u64,
    1319         1212 :             len: usize,
    1320         1212 :             ctx: &RequestContext,
    1321         1212 :         ) -> Result<String, Error> {
    1322         1212 :             let slice = Vec::with_capacity(len).slice_full();
    1323         1212 :             assert_eq!(slice.bytes_total(), len);
    1324         1212 :             let slice = self.read_exact_at(slice, pos, ctx).await?;
    1325         1212 :             let vec = slice.into_inner();
    1326         1212 :             assert_eq!(vec.len(), len);
    1327         1212 :             Ok(String::from_utf8(vec).unwrap())
    1328         1212 :         }
    1329              :     }
    1330              : 
    1331              :     #[tokio::test]
    1332            6 :     async fn test_virtual_files() -> anyhow::Result<()> {
    1333            6 :         // The real work is done in the test_files() helper function. This
    1334            6 :         // allows us to run the same set of tests against a native File, and
    1335            6 :         // VirtualFile. We trust the native Files and wouldn't need to test them,
    1336            6 :         // but this allows us to verify that the operations return the same
    1337            6 :         // results with VirtualFiles as with native Files. (Except that with
    1338            6 :         // native files, you will run out of file descriptors if the ulimit
    1339            6 :         // is low enough.)
    1340            6 :         struct A;
    1341            6 : 
    1342            6 :         impl Adapter for A {
    1343          618 :             async fn open(
    1344          618 :                 path: Utf8PathBuf,
    1345          618 :                 opts: OpenOptions,
    1346          618 :                 ctx: &RequestContext,
    1347          618 :             ) -> Result<MaybeVirtualFile, anyhow::Error> {
    1348          618 :                 let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?;
    1349          618 :                 Ok(MaybeVirtualFile::VirtualFile(vf))
    1350          618 :             }
    1351            6 :         }
    1352         1592 :         test_files::<A>("virtual_files").await
    1353            6 :     }
    1354              : 
    1355              :     #[tokio::test]
    1356            6 :     async fn test_physical_files() -> anyhow::Result<()> {
    1357            6 :         struct B;
    1358            6 : 
    1359            6 :         impl Adapter for B {
    1360          618 :             async fn open(
    1361          618 :                 path: Utf8PathBuf,
    1362          618 :                 opts: OpenOptions,
    1363          618 :                 _ctx: &RequestContext,
    1364          618 :             ) -> Result<MaybeVirtualFile, anyhow::Error> {
    1365            6 :                 Ok(MaybeVirtualFile::File({
    1366          618 :                     let owned_fd = opts.open(path.as_std_path()).await?;
    1367          618 :                     File::from(owned_fd)
    1368            6 :                 }))
    1369          618 :             }
    1370            6 :         }
    1371            6 : 
    1372          312 :         test_files::<B>("physical_files").await
    1373            6 :     }
    1374              : 
    1375              :     /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition
    1376              :     /// 2024 is not yet out with new lifetime capture or outlives rules, this is an async function
    1377              :     /// in a trait, which already benefits from the new lifetime capture rules.
    1378              :     trait Adapter {
    1379              :         async fn open(
    1380              :             path: Utf8PathBuf,
    1381              :             opts: OpenOptions,
    1382              :             ctx: &RequestContext,
    1383              :         ) -> Result<MaybeVirtualFile, anyhow::Error>;
    1384              :     }
    1385              : 
    1386           12 :     async fn test_files<A>(testname: &str) -> anyhow::Result<()>
    1387           12 :     where
    1388           12 :         A: Adapter,
    1389           12 :     {
    1390           12 :         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
    1391           12 :         let testdir = crate::config::PageServerConf::test_repo_dir(testname);
    1392           12 :         std::fs::create_dir_all(&testdir)?;
    1393              : 
    1394           12 :         let path_a = testdir.join("file_a");
    1395           12 :         let mut file_a = A::open(
    1396           12 :             path_a.clone(),
    1397           12 :             OpenOptions::new()
    1398           12 :                 .write(true)
    1399           12 :                 .create(true)
    1400           12 :                 .truncate(true)
    1401           12 :                 .to_owned(),
    1402           12 :             &ctx,
    1403           12 :         )
    1404           12 :         .await?;
    1405           12 :         file_a
    1406           12 :             .write_all(b"foobar".to_vec().slice_len(), &ctx)
    1407            3 :             .await?;
    1408              : 
    1409              :         // cannot read from a file opened in write-only mode
    1410           12 :         let _ = file_a.read_string(&ctx).await.unwrap_err();
    1411              : 
    1412              :         // Close the file and re-open for reading
    1413           12 :         let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
    1414              : 
    1415              :         // cannot write to a file opened in read-only mode
    1416           12 :         let _ = file_a
    1417           12 :             .write_all(b"bar".to_vec().slice_len(), &ctx)
    1418            3 :             .await
    1419           12 :             .unwrap_err();
    1420           12 : 
    1421           12 :         // Try simple read
    1422           12 :         assert_eq!("foobar", file_a.read_string(&ctx).await?);
    1423              : 
    1424              :         // It's positioned at the EOF now.
    1425           12 :         assert_eq!("", file_a.read_string(&ctx).await?);
    1426              : 
    1427              :         // Test seeks.
    1428           12 :         assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
    1429           12 :         assert_eq!("oobar", file_a.read_string(&ctx).await?);
    1430              : 
    1431           12 :         assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4);
    1432           12 :         assert_eq!("ar", file_a.read_string(&ctx).await?);
    1433              : 
    1434           12 :         assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
    1435           12 :         assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3);
    1436           12 :         assert_eq!("bar", file_a.read_string(&ctx).await?);
    1437              : 
    1438           12 :         assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1);
    1439           12 :         assert_eq!("oobar", file_a.read_string(&ctx).await?);
    1440              : 
    1441              :         // Test erroneous seeks to before byte 0
    1442           12 :         file_a.seek(SeekFrom::End(-7)).await.unwrap_err();
    1443           12 :         assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
    1444           12 :         file_a.seek(SeekFrom::Current(-2)).await.unwrap_err();
    1445           12 : 
    1446           12 :         // the erroneous seek should have left the position unchanged
    1447           12 :         assert_eq!("oobar", file_a.read_string(&ctx).await?);
    1448              : 
    1449              :         // Create another test file, and try FileExt functions on it.
    1450           12 :         let path_b = testdir.join("file_b");
    1451           12 :         let mut file_b = A::open(
    1452           12 :             path_b.clone(),
    1453           12 :             OpenOptions::new()
    1454           12 :                 .read(true)
    1455           12 :                 .write(true)
    1456           12 :                 .create(true)
    1457           12 :                 .truncate(true)
    1458           12 :                 .to_owned(),
    1459           12 :             &ctx,
    1460           12 :         )
    1461            6 :         .await?;
    1462           12 :         file_b
    1463           12 :             .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx)
    1464            3 :             .await?;
    1465           12 :         file_b
    1466           12 :             .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx)
    1467            3 :             .await?;
    1468              : 
    1469           12 :         assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
    1470              : 
    1471              :         // Open a lot of files, enough to cause some evictions. (Or to be precise,
    1472              :         // open the same file many times. The effect is the same.)
    1473              :         //
    1474              :         // leave file_a positioned at offset 1 before we start
    1475           12 :         assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
    1476              : 
    1477           12 :         let mut vfiles = Vec::new();
    1478         1212 :         for _ in 0..100 {
    1479         1200 :             let mut vfile = A::open(
    1480         1200 :                 path_b.clone(),
    1481         1200 :                 OpenOptions::new().read(true).to_owned(),
    1482         1200 :                 &ctx,
    1483         1200 :             )
    1484          600 :             .await?;
    1485         1200 :             assert_eq!("FOOBAR", vfile.read_string(&ctx).await?);
    1486         1200 :             vfiles.push(vfile);
    1487              :         }
    1488              : 
    1489              :         // make sure we opened enough files to definitely cause evictions.
    1490           12 :         assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
    1491              : 
    1492              :         // The underlying file descriptor for 'file_a' should be closed now. Try to read
    1493              :         // from it again. We left the file positioned at offset 1 above.
    1494           12 :         assert_eq!("oobar", file_a.read_string(&ctx).await?);
    1495              : 
    1496              :         // Check that all the other FDs still work too. Use them in random order for
    1497              :         // good measure.
    1498           12 :         vfiles.as_mut_slice().shuffle(&mut thread_rng());
    1499         1200 :         for vfile in vfiles.iter_mut() {
    1500         1200 :             assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?);
    1501              :         }
    1502              : 
    1503           12 :         Ok(())
    1504           12 :     }
    1505              : 
    1506              :     /// Test using VirtualFiles from many threads concurrently. This tests both using
    1507              :     /// a lot of VirtualFiles concurrently, causing evictions, and also using the same
    1508              :     /// VirtualFile from multiple threads concurrently.
    1509              :     #[tokio::test]
    1510            6 :     async fn test_vfile_concurrency() -> Result<(), Error> {
    1511            6 :         const SIZE: usize = 8 * 1024;
    1512            6 :         const VIRTUAL_FILES: usize = 100;
    1513            6 :         const THREADS: usize = 100;
    1514            6 :         const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
    1515            6 : 
    1516            6 :         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
    1517            6 :         let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
    1518            6 :         std::fs::create_dir_all(&testdir)?;
    1519            6 : 
    1520            6 :         // Create a test file.
    1521            6 :         let test_file_path = testdir.join("concurrency_test_file");
    1522            6 :         {
    1523            6 :             let file = File::create(&test_file_path)?;
    1524            6 :             file.write_all_at(&SAMPLE, 0)?;
    1525            6 :         }
    1526            6 : 
    1527            6 :         // Open the file many times.
    1528            6 :         let mut files = Vec::new();
    1529          606 :         for _ in 0..VIRTUAL_FILES {
    1530          600 :             let f = VirtualFile::open_with_options(
    1531          600 :                 &test_file_path,
    1532          600 :                 OpenOptions::new().read(true),
    1533          600 :                 &ctx,
    1534          600 :             )
    1535          303 :             .await?;
    1536          600 :             files.push(f);
    1537            6 :         }
    1538            6 :         let files = Arc::new(files);
    1539            6 : 
    1540            6 :         // Launch many threads, and use the virtual files concurrently in random order.
    1541            6 :         let rt = tokio::runtime::Builder::new_multi_thread()
    1542            6 :             .worker_threads(THREADS)
    1543            6 :             .thread_name("test_vfile_concurrency thread")
    1544            6 :             .build()
    1545            6 :             .unwrap();
    1546            6 :         let mut hdls = Vec::new();
    1547          606 :         for _threadno in 0..THREADS {
    1548          600 :             let files = files.clone();
    1549          600 :             let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
    1550          600 :             let hdl = rt.spawn(async move {
    1551          600 :                 let mut buf = vec![0u8; SIZE];
    1552          600 :                 let mut rng = rand::rngs::OsRng;
    1553       600000 :                 for _ in 1..1000 {
    1554       599400 :                     let f = &files[rng.gen_range(0..files.len())];
    1555       599400 :                     buf = f
    1556       599400 :                         .read_exact_at(buf.slice_full(), 0, &ctx)
    1557      1940053 :                         .await
    1558       599400 :                         .unwrap()
    1559       599400 :                         .into_inner();
    1560       599400 :                     assert!(buf == SAMPLE);
    1561            6 :                 }
    1562          600 :             });
    1563          600 :             hdls.push(hdl);
    1564          600 :         }
    1565          606 :         for hdl in hdls {
    1566          600 :             hdl.await?;
    1567            6 :         }
    1568            6 :         std::mem::forget(rt);
    1569            6 : 
    1570            6 :         Ok(())
    1571            6 :     }
    1572              : 
    1573              :     #[tokio::test]
    1574            6 :     async fn test_atomic_overwrite_basic() {
    1575            6 :         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
    1576            6 :         let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
    1577            6 :         std::fs::create_dir_all(&testdir).unwrap();
    1578            6 : 
    1579            6 :         let path = testdir.join("myfile");
    1580            6 :         let tmp_path = testdir.join("myfile.tmp");
    1581            6 : 
    1582            6 :         VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
    1583            6 :             .await
    1584            6 :             .unwrap();
    1585            6 :         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
    1586            6 :         let post = file.read_string(&ctx).await.unwrap();
    1587            6 :         assert_eq!(post, "foo");
    1588            6 :         assert!(!tmp_path.exists());
    1589            6 :         drop(file);
    1590            6 : 
    1591            6 :         VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
    1592            6 :             .await
    1593            6 :             .unwrap();
    1594            6 :         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
    1595            6 :         let post = file.read_string(&ctx).await.unwrap();
    1596            6 :         assert_eq!(post, "bar");
    1597            6 :         assert!(!tmp_path.exists());
    1598            6 :         drop(file);
    1599            6 :     }
    1600              : 
    1601              :     #[tokio::test]
    1602            6 :     async fn test_atomic_overwrite_preexisting_tmp() {
    1603            6 :         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
    1604            6 :         let testdir =
    1605            6 :             crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
    1606            6 :         std::fs::create_dir_all(&testdir).unwrap();
    1607            6 : 
    1608            6 :         let path = testdir.join("myfile");
    1609            6 :         let tmp_path = testdir.join("myfile.tmp");
    1610            6 : 
    1611            6 :         std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
    1612            6 :         assert!(tmp_path.exists());
    1613            6 : 
    1614            6 :         VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
    1615            6 :             .await
    1616            6 :             .unwrap();
    1617            6 : 
    1618            6 :         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
    1619            6 :         let post = file.read_string(&ctx).await.unwrap();
    1620            6 :         assert_eq!(post, "foo");
    1621            6 :         assert!(!tmp_path.exists());
    1622            6 :         drop(file);
    1623            6 :     }
    1624              : }
        

Generated by: LCOV version 2.1-beta