Line data Source code
1 : //!
2 : //! Global page cache
3 : //!
4 : //! The page cache uses up most of the memory in the page server. It is shared
5 : //! by all tenants, and it is used to store different kinds of pages. Sharing
6 : //! the cache allows memory to be dynamically allocated where it's needed the
7 : //! most.
8 : //!
9 : //! The page cache consists of fixed-size buffers, 8 kB each to match the
10 : //! PostgreSQL buffer size, and a Slot struct for each buffer to contain
11 : //! information about what's stored in the buffer.
12 : //!
13 : //! # Types Of Pages
14 : //!
15 : //! [`PageCache`] only supports immutable pages.
16 : //! Hence there is no need to worry about coherency.
17 : //!
18 : //! Currently, only one type of page is supported:
19 : //!
20 : //! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
21 : //!
22 : //! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but it's append-only.
23 : //! It uses the page cache only for the blocks that are already fully written and immutable.
24 : //!
25 : //! # Filling The Page Cache
26 : //!
27 : //! The page cache maps from a cache key to a buffer slot.
28 : //! The cache key uniquely identifies the piece of data that is being cached.
29 : //!
30 : //! The cache key for **immutable file** pages is a [`FileId`] plus a block number.
31 : //! Users of the page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following (a sketch follows the list):
32 : //! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
33 : //! * Get a [`FileId`] using [`next_file_id`].
34 : //! * Use the mechanism to associate the on-disk file with the returned [`FileId`].
35 : //! * Use [`PageCache::read_immutable_buf`] to get a [`ReadBufResult`].
36 : //! * If the page was already cached, it'll be the [`ReadBufResult::Found`] variant that contains
37 : //! a read guard for the page. Just use it.
38 : //! * If the page was not cached, it'll be the [`ReadBufResult::NotFound`] variant that contains
39 : //! a write guard for the page. Fill the page with the contents of the on-disk file.
40 : //! Then call [`PageWriteGuard::mark_valid`] to mark the page as valid.
41 : //! `mark_valid` downgrades the write guard to a [`PageReadGuard`] for the same page,
42 : //! so the freshly filled page can be used right away, without a second
43 : //! [`PageCache::read_immutable_buf`] lookup.
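//!
//! A hedged sketch of that flow. `my_file` and `read_block_from_disk` are
//! hypothetical placeholders for the caller's own file handle and fill logic,
//! and `ctx` is a [`RequestContext`] assumed to be in scope:
//!
//! ```ignore
//! // Once, when the on-disk file is created/opened:
//! let file_id = page_cache::next_file_id();
//!
//! // For every block read that should go through the cache:
//! let cache = page_cache::get();
//! let read_guard = match cache.read_immutable_buf(file_id, blkno, ctx).await? {
//!     // Cache hit: the guard derefs to the page contents (`&[u8; PAGE_SZ]`).
//!     ReadBufResult::Found(read_guard) => read_guard,
//!     // Cache miss: fill the buffer from the on-disk file, then mark it valid,
//!     // which downgrades the write guard to a read guard for the same page.
//!     ReadBufResult::NotFound(mut write_guard) => {
//!         read_block_from_disk(&my_file, blkno, &mut *write_guard)?;
//!         write_guard.mark_valid()
//!     }
//! };
//! ```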
44 : //!
45 : //! # Locking
46 : //!
47 : //! There are two levels of locking involved: There's one lock for the "mapping"
48 : //! from the cache key (a [`FileId`] and block number) to the buffer
49 : //! slot, and a separate lock on each slot. To read or write the contents of a
50 : //! slot, you must hold the lock on the slot in read or write mode,
51 : //! respectively. To change the mapping of a slot, i.e. to evict a page or to
52 : //! assign a buffer for a page, you must hold the mapping lock and the lock on
53 : //! the slot at the same time.
54 : //!
55 : //! Whenever you need to hold both locks simultaneously, the slot lock must be
56 : //! acquired first. This consistent ordering avoids deadlocks. To look up a page
57 : //! in the cache, you would first look up the mapping, while holding the mapping
58 : //! lock, and then lock the slot. You must release the mapping lock in between,
59 : //! to obey the lock ordering and avoid deadlock.
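//!
//! A simplified sketch of that lookup protocol (it uses private fields for
//! illustration; the real implementation is `try_lock_for_read` below):
//!
//! ```ignore
//! // 1. Find the slot index under the mapping lock, then release the mapping lock.
//! let slot_idx = {
//!     let map = cache.immutable_page_map.read().unwrap();
//!     map.get(&(file_id, blkno)).copied()
//! }; // mapping lock dropped here, before the slot lock is taken
//!
//! if let Some(slot_idx) = slot_idx {
//!     // 2. Lock the slot.
//!     let inner = cache.slots[slot_idx].inner.read().await;
//!     // 3. Re-check the key: the slot may have been evicted and reused for a
//!     //    different page while no lock was held.
//!     if inner.key.as_ref() == Some(&cache_key) {
//!         /* cache hit */
//!     }
//! }
//! ```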
60 : //!
61 : //! A slot can momentarily have invalid contents, even if it's already been
62 : //! inserted to the mapping, but you must hold the write-lock on the slot until
63 : //! the contents are valid. If you need to release the lock without initializing
64 : //! the contents, you must remove the mapping first. We make that easy for
65 : //! callers with [`PageWriteGuard`]: the caller must explicitly call [`PageWriteGuard::mark_valid`]
66 : //! after it has initialized the contents. If the guard is dropped without calling `mark_valid()`,
67 : //! the mapping is automatically removed and the slot is marked free.
68 : //!
69 :
70 : use std::{
71 : collections::{hash_map::Entry, HashMap},
72 : sync::{
73 : atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
74 : Arc, Weak,
75 : },
76 : time::Duration,
77 : };
78 :
79 : use anyhow::Context;
80 : use once_cell::sync::OnceCell;
81 :
82 : use crate::{
83 : context::RequestContext,
84 : metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
85 : };
86 :
87 : static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
88 : const TEST_PAGE_CACHE_SIZE: usize = 50;
89 :
90 : ///
91 : /// Initialize the page cache. This must be called once at page server startup.
92 : ///
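/// A minimal sketch of the intended startup sequence (`conf.page_cache_size` is an
/// illustrative name for the configured size, not necessarily a real config field):
///
/// ```ignore
/// page_cache::init(conf.page_cache_size);
/// let cache = page_cache::get(); // usable from any tenant / task from here on
/// ```
///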
93 0 : pub fn init(size: usize) {
94 0 : if PAGE_CACHE.set(PageCache::new(size)).is_err() {
95 0 : panic!("page cache already initialized");
96 0 : }
97 0 : }
98 :
99 : ///
100 : /// Get a handle to the page cache.
101 : ///
102 1652447 : pub fn get() -> &'static PageCache {
103 1652447 : //
104 1652447 : // In unit tests, page server startup doesn't happen and no one calls
105 1652447 : // page_cache::init(). Initialize it here with a tiny cache, so that the
106 1652447 : // page cache is usable in unit tests.
107 1652447 : //
108 1652447 : if cfg!(test) {
109 1652447 : PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
110 : } else {
111 0 : PAGE_CACHE.get().expect("page cache not initialized")
112 : }
113 1652447 : }
114 :
115 : pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
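/// Ceiling for a slot's usage count in the Clock replacement scheme: every access
/// bumps the count up to this value, and every pass of the eviction sweep decrements
/// it, so a recently busy page survives several sweeps before it becomes an eviction
/// candidate.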
116 : const MAX_USAGE_COUNT: u8 = 5;
117 :
118 : /// See module-level comment.
119 : #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
120 : pub struct FileId(u64);
121 :
122 : static NEXT_ID: AtomicU64 = AtomicU64::new(1);
123 :
124 : /// See module-level comment.
125 7290 : pub fn next_file_id() -> FileId {
126 7290 : FileId(NEXT_ID.fetch_add(1, Ordering::Relaxed))
127 7290 : }
128 :
129 : ///
130 : /// CacheKey uniquely identifies a "thing" to cache in the page cache.
131 : ///
132 : #[derive(Debug, PartialEq, Eq, Clone)]
133 : #[allow(clippy::enum_variant_names)]
134 : enum CacheKey {
135 : ImmutableFilePage { file_id: FileId, blkno: u32 },
136 : }
137 :
138 : struct Slot {
139 : inner: tokio::sync::RwLock<SlotInner>,
140 : usage_count: AtomicU8,
141 : }
142 :
143 : struct SlotInner {
144 : key: Option<CacheKey>,
145 : // for `coalesce_readers_permit`
146 : permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
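// Points into the single page buffer that `PageCache::new` allocates and leaks
// for the lifetime of the process.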
147 : buf: &'static mut [u8; PAGE_SZ],
148 : }
149 :
150 : impl Slot {
151 : /// Increment usage count on the buffer, with ceiling at MAX_USAGE_COUNT.
152 1556103 : fn inc_usage_count(&self) {
153 1556103 : let _ = self
154 1556103 : .usage_count
155 1556103 : .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
156 1556103 : if val == MAX_USAGE_COUNT {
157 1484602 : None
158 : } else {
159 71501 : Some(val + 1)
160 : }
161 1556103 : });
162 1556103 : }
163 :
164 : /// Decrement usage count on the buffer, unless it's already zero. Returns
165 : /// the old usage count.
166 257793 : fn dec_usage_count(&self) -> u8 {
167 257793 : let count_res =
168 257793 : self.usage_count
169 257793 : .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
170 257793 : if val == 0 {
171 96344 : None
172 : } else {
173 161449 : Some(val - 1)
174 : }
175 257793 : });
176 257793 :
177 257793 : match count_res {
178 161449 : Ok(usage_count) => usage_count,
179 96344 : Err(usage_count) => usage_count,
180 : }
181 257793 : }
182 :
183 : /// Sets the usage count to a specific value.
184 96344 : fn set_usage_count(&self, count: u8) {
185 96344 : self.usage_count.store(count, Ordering::Relaxed);
186 96344 : }
187 : }
188 :
189 : impl SlotInner {
190 : /// If there is already a reader, drop our permit and share its permit, just like we share read access.
191 1556103 : fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
192 1556103 : let mut guard = self.permit.lock().unwrap();
193 1556103 : if let Some(existing_permit) = guard.upgrade() {
194 0 : drop(guard);
195 0 : drop(permit);
196 0 : existing_permit
197 : } else {
198 1556103 : let permit = Arc::new(permit);
199 1556103 : *guard = Arc::downgrade(&permit);
200 1556103 : permit
201 : }
202 1556103 : }
203 : }
204 :
205 : pub struct PageCache {
206 : immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
207 :
208 : /// The actual buffers with their metadata.
209 : slots: Box<[Slot]>,
210 :
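/// One permit per slot: a permit is acquired before a lookup and held for as long as
/// the resulting [`PageReadGuard`] / [`PageWriteGuard`] is alive. See
/// `try_get_pinned_slot_permit` and the comments in `find_victim` for how this bounds
/// the number of concurrently pinned slots.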
211 : pinned_slots: Arc<tokio::sync::Semaphore>,
212 :
213 : /// Index of the next candidate to evict, for the Clock replacement algorithm.
214 : /// This is interpreted modulo the page cache size.
215 : next_evict_slot: AtomicUsize,
216 :
217 : size_metrics: &'static PageCacheSizeMetrics,
218 : }
219 :
220 : struct PinnedSlotsPermit {
221 : _permit: tokio::sync::OwnedSemaphorePermit,
222 : }
223 :
224 : ///
225 : /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
226 : /// until the guard is dropped.
227 : ///
228 : pub struct PageReadGuard<'i> {
229 : _permit: Arc<PinnedSlotsPermit>,
230 : slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
231 : }
232 :
233 : impl std::ops::Deref for PageReadGuard<'_> {
234 : type Target = [u8; PAGE_SZ];
235 :
236 1652687 : fn deref(&self) -> &Self::Target {
237 1652687 : self.slot_guard.buf
238 1652687 : }
239 : }
240 :
241 : impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
242 0 : fn as_ref(&self) -> &[u8; PAGE_SZ] {
243 0 : self.slot_guard.buf
244 0 : }
245 : }
246 :
247 : ///
248 : /// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
249 : /// until the guard is dropped.
250 : ///
251 : /// Counterintuitively, this is used even for a read, if the requested page is not
252 : /// currently found in the page cache. In that case, the caller of lock_for_read()
253 : /// is expected to fill in the page contents and call mark_valid().
254 : pub struct PageWriteGuard<'i> {
255 : state: PageWriteGuardState<'i>,
256 : }
257 :
258 : enum PageWriteGuardState<'i> {
259 : Invalid {
260 : inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
261 : _permit: PinnedSlotsPermit,
262 : },
263 : Downgraded,
264 : }
265 :
266 : impl std::ops::DerefMut for PageWriteGuard<'_> {
267 144593 : fn deref_mut(&mut self) -> &mut Self::Target {
268 144593 : match &mut self.state {
269 144593 : PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
270 0 : PageWriteGuardState::Downgraded => unreachable!(),
271 : }
272 144593 : }
273 : }
274 :
275 : impl std::ops::Deref for PageWriteGuard<'_> {
276 : type Target = [u8; PAGE_SZ];
277 :
278 1590061 : fn deref(&self) -> &Self::Target {
279 1590061 : match &self.state {
280 1590061 : PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
281 0 : PageWriteGuardState::Downgraded => unreachable!(),
282 : }
283 1590061 : }
284 : }
285 :
286 : impl<'a> PageWriteGuard<'a> {
287 : /// Mark that the buffer contents are now valid and downgrade this write guard to a [`PageReadGuard`].
288 : #[must_use]
289 96344 : pub fn mark_valid(mut self) -> PageReadGuard<'a> {
290 96344 : let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
291 96344 : match prev {
292 96344 : PageWriteGuardState::Invalid { inner, _permit } => {
293 96344 : assert!(inner.key.is_some());
294 96344 : PageReadGuard {
295 96344 : _permit: Arc::new(_permit),
296 96344 : slot_guard: inner.downgrade(),
297 96344 : }
298 : }
299 0 : PageWriteGuardState::Downgraded => unreachable!(),
300 : }
301 96344 : }
302 : }
303 :
304 : impl Drop for PageWriteGuard<'_> {
305 : ///
306 : /// If the buffer was allocated for a page that was not already in the
307 : /// cache, but the lock_for_read() caller dropped the guard without
308 : /// initializing it, remove the mapping from the page cache.
309 : ///
310 96344 : fn drop(&mut self) {
311 96344 : match &mut self.state {
312 0 : PageWriteGuardState::Invalid { inner, _permit } => {
313 0 : assert!(inner.key.is_some());
314 0 : let self_key = inner.key.as_ref().unwrap();
315 0 : PAGE_CACHE.get().unwrap().remove_mapping(self_key);
316 0 : inner.key = None;
317 : }
318 96344 : PageWriteGuardState::Downgraded => {}
319 : }
320 96344 : }
321 : }
322 :
323 : /// lock_for_read() return value
324 : pub enum ReadBufResult<'a> {
325 : Found(PageReadGuard<'a>),
326 : NotFound(PageWriteGuard<'a>),
327 : }
328 :
329 : impl PageCache {
330 1652447 : pub async fn read_immutable_buf(
331 1652447 : &self,
332 1652447 : file_id: FileId,
333 1652447 : blkno: u32,
334 1652447 : ctx: &RequestContext,
335 1652447 : ) -> anyhow::Result<ReadBufResult> {
336 1652447 : self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx)
337 14965 : .await
338 1652447 : }
339 :
340 : //
341 : // Section 2: Internal interface functions for lookup/update.
342 : //
343 : // To add support for a new kind of "thing" to cache, you will need
344 : // to add public interface routines above, and code to deal with the
345 : // "mappings" after this section. But the routines in this section should
346 : // not require changes.
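//
// For illustration only (`FooId`, `FooFilePage`, and `read_foo_buf` do not exist in
// this crate), adding a new kind of cached page would look roughly like this:
//
//     enum CacheKey {
//         ImmutableFilePage { file_id: FileId, blkno: u32 },
//         FooFilePage { foo_id: FooId, blkno: u32 }, // new variant
//     }
//
//     pub async fn read_foo_buf(
//         &self,
//         foo_id: FooId,
//         blkno: u32,
//         ctx: &RequestContext,
//     ) -> anyhow::Result<ReadBufResult> {
//         self.lock_for_read(&(CacheKey::FooFilePage { foo_id, blkno }), ctx).await
//     }
//
// plus a `foo_page_map` field on `PageCache` and matching arms in
// `search_mapping`, `remove_mapping`, and `try_insert_mapping`.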
347 :
348 1652447 : async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
349 1652447 : match tokio::time::timeout(
350 1652447 : // Choose small timeout, neon_smgr does its own retries.
351 1652447 : // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
352 1652447 : Duration::from_secs(10),
353 1652447 : Arc::clone(&self.pinned_slots).acquire_owned(),
354 1652447 : )
355 6921 : .await
356 : {
357 1652447 : Ok(res) => Ok(PinnedSlotsPermit {
358 1652447 : _permit: res.expect("this semaphore is never closed"),
359 1652447 : }),
360 0 : Err(_timeout) => {
361 0 : crate::metrics::page_cache_errors_inc(
362 0 : crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
363 0 : );
364 0 : anyhow::bail!("timeout: there were page guards alive for all page cache slots")
365 : }
366 : }
367 1652447 : }
368 :
369 : /// Look up a page in the cache.
370 : ///
371 1652447 : async fn try_lock_for_read(
372 1652447 : &self,
373 1652447 : cache_key: &CacheKey,
374 1652447 : permit: &mut Option<PinnedSlotsPermit>,
375 1652447 : ) -> Option<PageReadGuard> {
376 1652447 : if let Some(slot_idx) = self.search_mapping(cache_key) {
377 : // The page was found in the mapping. Lock the slot, and re-check
378 : // that it's still what we expected (because we released the mapping
379 : // lock already, another thread could have evicted the page)
380 1556103 : let slot = &self.slots[slot_idx];
381 1556103 : let inner = slot.inner.read().await;
382 1556103 : if inner.key.as_ref() == Some(cache_key) {
383 1556103 : slot.inc_usage_count();
384 1556103 : return Some(PageReadGuard {
385 1556103 : _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
386 1556103 : slot_guard: inner,
387 1556103 : });
388 0 : }
389 96344 : }
390 96344 : None
391 1652447 : }
392 :
393 : /// Return a locked buffer for given block.
394 : ///
395 : /// Like try_lock_for_read(), if the search criteria is not exact and the
396 : /// page is already found in the cache, *cache_key is updated.
397 : ///
398 : /// If the page is not found in the cache, this allocates a new buffer for
399 : /// it. The caller may then initialize the buffer with the contents, and
400 : /// call mark_valid().
401 : ///
402 : /// Example usage:
403 : ///
404 : /// ```ignore
405 : /// let cache = page_cache::get();
406 : ///
407 : /// match cache.lock_for_read(&key) {
408 : /// ReadBufResult::Found(read_guard) => {
409 : /// // The page was found in cache. Use it
410 : /// },
411 : /// ReadBufResult::NotFound(write_guard) => {
412 : /// // The page was not found in cache. Read it from disk into the
413 : /// // buffer.
414 : /// //read_my_page_from_disk(write_guard);
415 : ///
416 : /// // The buffer contents are now valid. Tell the page cache.
417 : /// write_guard.mark_valid();
418 : /// },
419 : /// }
420 : /// ```
421 : ///
422 1652447 : async fn lock_for_read(
423 1652447 : &self,
424 1652447 : cache_key: &CacheKey,
425 1652447 : ctx: &RequestContext,
426 1652447 : ) -> anyhow::Result<ReadBufResult> {
427 1652447 : let mut permit = Some(self.try_get_pinned_slot_permit().await?);
428 :
429 1652447 : let (read_access, hit) = match cache_key {
430 1652447 : CacheKey::ImmutableFilePage { .. } => (
431 1652447 : &crate::metrics::PAGE_CACHE
432 1652447 : .for_ctx(ctx)
433 1652447 : .read_accesses_immutable,
434 1652447 : &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
435 1652447 : ),
436 1652447 : };
437 1652447 : read_access.inc();
438 1652447 :
439 1652447 : let mut is_first_iteration = true;
440 : loop {
441 : // First check if the key already exists in the cache.
442 1652447 : if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
443 1556103 : debug_assert!(permit.is_none());
444 1556103 : if is_first_iteration {
445 1556103 : hit.inc();
446 1556103 : }
447 1556103 : return Ok(ReadBufResult::Found(read_guard));
448 96344 : }
449 96344 : debug_assert!(permit.is_some());
450 96344 : is_first_iteration = false;
451 :
452 : // Not found. Find a victim buffer
453 96344 : let (slot_idx, mut inner) = self
454 96344 : .find_victim(permit.as_ref().unwrap())
455 0 : .await
456 96344 : .context("Failed to find evict victim")?;
457 :
458 : // Insert mapping for this. At this point, we may find that another
459 : // thread did the same thing concurrently. In that case, we evicted
460 : // our victim buffer unnecessarily. Put it into the free list and
461 : // continue with the slot that the other thread chose.
462 96344 : if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
463 : // TODO: put to free list
464 :
465 : // We now just loop back to start from beginning. This is not
466 : // optimal, we'll perform the lookup in the mapping again, which
467 : // is not really necessary because we already got
468 : // 'existing_slot_idx'. But this shouldn't happen often enough
469 : // to matter much.
470 0 : continue;
471 96344 : }
472 96344 :
473 96344 : // Make the slot ready
474 96344 : let slot = &self.slots[slot_idx];
475 96344 : inner.key = Some(cache_key.clone());
476 96344 : slot.set_usage_count(1);
477 96344 :
478 96344 : debug_assert!(
479 : {
480 96344 : let guard = inner.permit.lock().unwrap();
481 96344 : guard.upgrade().is_none()
482 : },
483 0 : "we hold a write lock, so, no one else should have a permit"
484 : );
485 :
486 96344 : return Ok(ReadBufResult::NotFound(PageWriteGuard {
487 96344 : state: PageWriteGuardState::Invalid {
488 96344 : _permit: permit.take().unwrap(),
489 96344 : inner,
490 96344 : },
491 96344 : }));
492 : }
493 1652447 : }
494 :
495 : //
496 : // Section 3: Mapping functions
497 : //
498 :
499 : /// Search for a page in the cache using the given search key.
500 : ///
501 : /// Returns the slot index, if any.
502 : ///
503 : /// NOTE: We don't hold any lock on the mapping on return, so the slot might
504 : /// get recycled for an unrelated page immediately after this function
505 : /// returns. The caller is responsible for re-checking that the slot still
506 : /// contains the page with the same key before using it.
507 : ///
508 1652447 : fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
509 1652447 : match cache_key {
510 1652447 : CacheKey::ImmutableFilePage { file_id, blkno } => {
511 1652447 : let map = self.immutable_page_map.read().unwrap();
512 1652447 : Some(*map.get(&(*file_id, *blkno))?)
513 : }
514 : }
515 1652447 : }
516 :
517 : ///
518 : /// Remove mapping for given key.
519 : ///
520 91904 : fn remove_mapping(&self, old_key: &CacheKey) {
521 91904 : match old_key {
522 91904 : CacheKey::ImmutableFilePage { file_id, blkno } => {
523 91904 : let mut map = self.immutable_page_map.write().unwrap();
524 91904 : map.remove(&(*file_id, *blkno))
525 91904 : .expect("could not find old key in mapping");
526 91904 : self.size_metrics.current_bytes_immutable.sub_page_sz(1);
527 91904 : }
528 91904 : }
529 91904 : }
530 :
531 : ///
532 : /// Insert mapping for given key.
533 : ///
534 : /// If a mapping already existed for the given key, returns the slot index
535 : /// of the existing mapping and leaves it untouched.
536 96344 : fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
537 96344 : match new_key {
538 96344 : CacheKey::ImmutableFilePage { file_id, blkno } => {
539 96344 : let mut map = self.immutable_page_map.write().unwrap();
540 96344 : match map.entry((*file_id, *blkno)) {
541 0 : Entry::Occupied(entry) => Some(*entry.get()),
542 96344 : Entry::Vacant(entry) => {
543 96344 : entry.insert(slot_idx);
544 96344 : self.size_metrics.current_bytes_immutable.add_page_sz(1);
545 96344 : None
546 : }
547 : }
548 : }
549 : }
550 96344 : }
551 :
552 : //
553 : // Section 4: Misc internal helpers
554 : //
555 :
556 : /// Find a slot to evict.
557 : ///
558 : /// On return, the slot is empty and write-locked.
559 96344 : async fn find_victim(
560 96344 : &self,
561 96344 : _permit_witness: &PinnedSlotsPermit,
562 96344 : ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
563 96344 : let iter_limit = self.slots.len() * 10;
564 96344 : let mut iters = 0;
565 : loop {
566 257793 : iters += 1;
567 257793 : let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
568 257793 :
569 257793 : let slot = &self.slots[slot_idx];
570 257793 :
571 257793 : if slot.dec_usage_count() == 0 {
572 96344 : let mut inner = match slot.inner.try_write() {
573 96344 : Ok(inner) => inner,
574 0 : Err(_err) => {
575 0 : if iters > iter_limit {
576 : // NB: Even with the permits, there's no hard guarantee that we will find a slot with
577 : // any particular number of iterations: other threads might race ahead and acquire and
578 : // release pins just as we're scanning the array.
579 : //
580 : // Imagine that nslots is 2, and as starting point, usage_count==1 on all
581 : // slots. There are two threads running concurrently, A and B. A has just
582 : // acquired the permit from the semaphore.
583 : //
584 : // A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
585 : // B: Acquire permit.
586 : // B: Look at slot 2, decrement its usage_count to zero and continue the search
587 : // B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
588 : // B: Release pin and permit again
589 : // B: Acquire permit.
590 : // B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
591 : // B: Release pin and permit again
592 : //
593 : // Now we're back in the starting situation that both slots have
594 : // usage_count 1, but A has now been through one iteration of the
595 : // find_victim() loop. This can repeat indefinitely and on each
596 : // iteration, A's iteration count increases by one.
597 : //
598 : // So, even though the semaphore for the permits is fair, the victim search
599 : // itself happens in parallel and is not fair.
600 : // Hence even with a permit, a task can theoretically be starved.
601 : // To avoid this, we'd need tokio to give priority to tasks that are holding
602 : // permits for longer.
603 : // Note that just yielding to tokio during iteration without such
604 : // priority boosting is likely counter-productive. We'd just give more opportunities
605 : // for B to bump usage count, further starving A.
606 0 : page_cache_eviction_metrics::observe(
607 0 : page_cache_eviction_metrics::Outcome::ItersExceeded {
608 0 : iters: iters.try_into().unwrap(),
609 0 : },
610 0 : );
611 0 : anyhow::bail!("exceeded evict iter limit");
612 0 : }
613 0 : continue;
614 : }
615 : };
616 96344 : if let Some(old_key) = &inner.key {
617 91904 : // remove mapping for old buffer
618 91904 : self.remove_mapping(old_key);
619 91904 : inner.key = None;
620 91904 : page_cache_eviction_metrics::observe(
621 91904 : page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
622 91904 : iters: iters.try_into().unwrap(),
623 91904 : },
624 91904 : );
625 91904 : } else {
626 4440 : page_cache_eviction_metrics::observe(
627 4440 : page_cache_eviction_metrics::Outcome::FoundSlotUnused {
628 4440 : iters: iters.try_into().unwrap(),
629 4440 : },
630 4440 : );
631 4440 : }
632 96344 : return Ok((slot_idx, inner));
633 161449 : }
634 : }
635 96344 : }
636 :
637 : /// Initialize a new page cache
638 : ///
639 : /// This should be called only once at page server startup.
640 258 : fn new(num_pages: usize) -> Self {
641 258 : assert!(num_pages > 0, "page cache size must be > 0");
642 :
643 : // We could use Vec::leak here, but that potentially also leaks
644 : // uninitialized reserved capacity. With into_boxed_slice and Box::leak
645 : // this is avoided.
646 258 : let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
647 258 :
648 258 : let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
649 258 : size_metrics.max_bytes.set_page_sz(num_pages);
650 258 : size_metrics.current_bytes_immutable.set_page_sz(0);
651 258 :
652 258 : let slots = page_buffer
653 258 : .chunks_exact_mut(PAGE_SZ)
654 12900 : .map(|chunk| {
655 12900 : let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
656 12900 :
657 12900 : Slot {
658 12900 : inner: tokio::sync::RwLock::new(SlotInner {
659 12900 : key: None,
660 12900 : buf,
661 12900 : permit: std::sync::Mutex::new(Weak::new()),
662 12900 : }),
663 12900 : usage_count: AtomicU8::new(0),
664 12900 : }
665 12900 : })
666 258 : .collect();
667 258 :
668 258 : Self {
669 258 : immutable_page_map: Default::default(),
670 258 : slots,
671 258 : next_evict_slot: AtomicUsize::new(0),
672 258 : size_metrics,
673 258 : pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
674 258 : }
675 258 : }
676 : }
677 :
678 : trait PageSzBytesMetric {
679 : fn set_page_sz(&self, count: usize);
680 : fn add_page_sz(&self, count: usize);
681 : fn sub_page_sz(&self, count: usize);
682 : }
683 :
684 : #[inline(always)]
685 188764 : fn count_times_page_sz(count: usize) -> u64 {
686 188764 : u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
687 188764 : }
688 :
689 : impl PageSzBytesMetric for metrics::UIntGauge {
690 516 : fn set_page_sz(&self, count: usize) {
691 516 : self.set(count_times_page_sz(count));
692 516 : }
693 96344 : fn add_page_sz(&self, count: usize) {
694 96344 : self.add(count_times_page_sz(count));
695 96344 : }
696 91904 : fn sub_page_sz(&self, count: usize) {
697 91904 : self.sub(count_times_page_sz(count));
698 91904 : }
699 : }