//! Implementation of an append-only file data structure,
//! used to keep in-memory layers spilled to disk.
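//!
//! A minimal usage sketch (mirroring the unit tests at the bottom of this
//! file; `conf`, `gate`, and `ctx` are assumed to exist in scope, and error
//! handling is elided):
//!
//! ```ignore
//! let mut file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, &gate, &ctx).await?;
//! // The returned offset can be used to build an index over the written values.
//! let off = file.write_raw(b"some value", &ctx).await?;
//! // Reads transparently combine the on-disk part and the writer's in-memory buffers.
//! let (slice, nread) = file
//!     .read_exact_at_eof_ok(off, IoBufferMut::with_capacity(10).slice_full(), &ctx)
//!     .await?;
//! ```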

use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
use crate::virtual_file::owned_buffers_io::write::Buffer;
use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
use camino::Utf8PathBuf;
use num_traits::Num;
use pageserver_api::shard::TenantShardId;
use tokio_epoll_uring::{BoundedBuf, Slice};
use tracing::error;

use std::io;
use std::sync::atomic::AtomicU64;
use std::sync::Arc;
use utils::id::TimelineId;

pub struct EphemeralFile {
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
    page_cache_file_id: page_cache::FileId,
    bytes_written: u64,
    buffered_writer: owned_buffers_io::write::BufferedWriter<IoBufferMut, VirtualFile>,
    /// The gate guard is held for as long as we need to do operations in the path (delete on drop).
    _gate_guard: utils::sync::gate::GateGuard,
}

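// Capacity of each of the buffered writer's two buffers: the mutable buffer
// that absorbs writes, and the most recently submitted buffer that may still
// be in flight to disk ("maybe flushed"). See `read_exact_at_eof_ok` below.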
const TAIL_SZ: usize = 64 * 1024;

impl EphemeralFile {
    pub async fn create(
        conf: &PageServerConf,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        gate: &utils::sync::gate::Gate,
        ctx: &RequestContext,
    ) -> anyhow::Result<EphemeralFile> {
        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
        let filename_disambiguator =
            NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        let filename = conf
            .timeline_path(&tenant_shard_id, &timeline_id)
            .join(Utf8PathBuf::from(format!(
                "ephemeral-{filename_disambiguator}"
            )));

        let file = Arc::new(
            VirtualFile::open_with_options_v2(
                &filename,
                virtual_file::OpenOptions::new()
                    .read(true)
                    .write(true)
                    .create(true),
                ctx,
            )
            .await?,
        );

        let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore

        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
            page_cache_file_id,
            bytes_written: 0,
            buffered_writer: owned_buffers_io::write::BufferedWriter::new(
                file,
                || IoBufferMut::with_capacity(TAIL_SZ),
                gate.enter()?,
                ctx,
            ),
            _gate_guard: gate.enter()?,
        })
    }
}

impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // Unlink the file.
        // We are clear to do this because we have entered a gate.
        let path = self.buffered_writer.as_inner().path();
        let res = std::fs::remove_file(path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
                // Never log NotFound errors: we cannot do anything about them; on detach,
                // the tenant directory is already gone.
                //
                // NotFound might also be related to https://github.com/neondatabase/neon/issues/2442
                error!("could not remove ephemeral file '{path}': {e}");
            }
        }
    }
}

impl EphemeralFile {
    pub(crate) fn len(&self) -> u64 {
        self.bytes_written
    }

    pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
        self.page_cache_file_id
    }

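    /// Loads the entire file into one freshly allocated buffer, going through
    /// the same read path as `read_exact_at_eof_ok` (disk plus in-memory buffers).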
    pub(crate) async fn load_to_io_buf(
        &self,
        ctx: &RequestContext,
    ) -> Result<IoBufferMut, io::Error> {
        let size = self.len().into_usize();
        let buf = IoBufferMut::with_capacity(size);
        let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?;
        assert_eq!(nread, size);
        let buf = slice.into_inner();
        assert_eq!(buf.len(), nread);
        assert_eq!(buf.capacity(), size, "we shouldn't be reallocating");
        Ok(buf)
    }

    /// Returns the offset at which the first byte of the input was written, for use
    /// in constructing indices over the written value.
    ///
    /// Panics if the write is short because there's no way we can recover from that.
    /// TODO: make upstack handle this as an error.
    pub(crate) async fn write_raw(
        &mut self,
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> std::io::Result<u64> {
        let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?;
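        // Writes through the buffered writer may hand back a flush-control handle.
        // This convenience variant releases it right away, letting the flush
        // proceed; the `_controlled` variant below hands it to the caller instead
        // (the tests use it to step through flush stages).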
        if let Some(control) = control {
            control.release().await;
        }
        Ok(pos)
    }

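    /// Like [`Self::write_raw`], but additionally returns the
    /// [`owned_buffers_io::write::FlushControl`] handle if this write started a
    /// flush, so the caller can observe and step the flush (see the tests below).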
    async fn write_raw_controlled(
        &mut self,
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> std::io::Result<(u64, Option<owned_buffers_io::write::FlushControl>)> {
        let pos = self.bytes_written;

        let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
            std::io::Error::new(
                std::io::ErrorKind::Other,
                format!(
                    "write would grow EphemeralFile beyond u64::MAX: len={pos} written={srcbuf_len}",
                    srcbuf_len = srcbuf.len(),
                ),
            )
        })?;

        // Write the payload
        let (nwritten, control) = self
            .buffered_writer
            .write_buffered_borrowed_controlled(srcbuf, ctx)
            .await?;
        assert_eq!(
            nwritten,
            srcbuf.len(),
            "buffered writer has no short writes"
        );

        self.bytes_written = new_bytes_written;

        Ok((pos, control))
    }
}

impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
    async fn read_exact_at_eof_ok<B: IoBufAlignedMut + Send>(
        &self,
        start: u64,
        dst: tokio_epoll_uring::Slice<B>,
        ctx: &RequestContext,
    ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
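        // Snapshot the writer's state: how many bytes have been submitted to the
        // underlying file, plus the two in-memory buffers (the mutable one that
        // is still absorbing writes, and the previously full one that may or may
        // not have been flushed to disk yet).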
        let submitted_offset = self.buffered_writer.bytes_submitted();

        let mutable = self.buffered_writer.inspect_mutable();
        let mutable = &mutable[0..mutable.pending()];

        let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();

        let dst_cap = dst.bytes_total().into_u64();
        let end = {
            // saturating_add is correct here: the max file size is u64::MAX, so
            // if start + dst_cap > u64::MAX, we know the read will be short anyway.
            let mut end: u64 = start.saturating_add(dst_cap);
            if end > self.bytes_written {
                end = self.bytes_written;
            }
            end
        };

        // A range with inclusive start and exclusive end, whose length
        // saturates to zero when start > end.
        #[derive(Debug)]
        struct Range<N>(N, N);
        impl<N: Num + Clone + Copy + PartialOrd + Ord> Range<N> {
            fn len(&self) -> N {
                if self.0 > self.1 {
                    N::zero()
                } else {
                    self.1 - self.0
                }
            }
        }

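        // Split the logical range [start, end) into up to three disjoint pieces,
        // matching where the bytes currently live: already written to disk, in the
        // submitted-but-maybe-not-yet-flushed buffer, or in the mutable buffer.
        // Each piece is then materialized into the corresponding part of `dst` below.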
        let (written_range, maybe_flushed_range) = {
            if maybe_flushed.is_some() {
                // [ written ][ maybe_flushed ][ mutable ]
                //             <- TAIL_SZ -><- TAIL_SZ ->
                //                          ^
                //                          `submitted_offset`
                // <++++++ on disk +++++++????????????????>
                (
                    Range(
                        start,
                        std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)),
                    ),
                    Range(
                        std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)),
                        std::cmp::min(end, submitted_offset),
                    ),
                )
            } else {
                // [ written ][ mutable ]
                //             <- TAIL_SZ ->
                //             ^
                //             `submitted_offset`
                // <++++++ on disk +++++++++++++++++++++++>
                (
                    Range(start, std::cmp::min(end, submitted_offset)),
                    // zero len
                    Range(submitted_offset, u64::MIN),
                )
            }
        };

        let mutable_range = Range(std::cmp::max(start, submitted_offset), end);

        let dst = if written_range.len() > 0 {
            let file: &VirtualFile = self.buffered_writer.as_inner();
            let bounds = dst.bounds();
            let slice = file
                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
                .await?;
            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
        } else {
            dst
        };

        let dst = if maybe_flushed_range.len() > 0 {
            let offset_in_buffer = maybe_flushed_range
                .0
                .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64))
                .unwrap()
                .into_usize();
            // We checked above that the buffer is Some.
            let maybe_flushed = maybe_flushed.unwrap();
            let to_copy = &maybe_flushed
                [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())];
            let bounds = dst.bounds();
            let mut view = dst.slice({
                let start = written_range.len().into_usize();
                let end = start
                    .checked_add(maybe_flushed_range.len().into_usize())
                    .unwrap();
                start..end
            });
            view.as_mut_rust_slice_full_zeroed()
                .copy_from_slice(to_copy);
            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
        } else {
            dst
        };

        let dst = if mutable_range.len() > 0 {
            let offset_in_buffer = mutable_range
                .0
                .checked_sub(submitted_offset)
                .unwrap()
                .into_usize();
            let to_copy =
                &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
            let bounds = dst.bounds();
            let mut view = dst.slice({
                let start =
                    written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
                let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
                start..end
            });
            view.as_mut_rust_slice_full_zeroed()
                .copy_from_slice(to_copy);
            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
        } else {
            dst
        };

        // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs

        Ok((dst, (end - start).into_usize()))
    }
}

/// Does the given filename look like an ephemeral file?
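///
/// For example:
///
/// ```ignore
/// assert!(is_ephemeral_file("ephemeral-42"));
/// assert!(!is_ephemeral_file("other-file"));
/// ```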
pub fn is_ephemeral_file(filename: &str) -> bool {
    if let Some(rest) = filename.strip_prefix("ephemeral-") {
        // The disambiguator is generated by a u64 counter (`NEXT_FILENAME` above).
        rest.parse::<u64>().is_ok()
    } else {
        false
    }
}

#[cfg(test)]
mod tests {
    use rand::Rng;

    use super::*;
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;
    use std::fs;
    use std::str::FromStr;

    fn harness(
        test_name: &str,
    ) -> Result<
        (
            &'static PageServerConf,
            TenantShardId,
            TimelineId,
            RequestContext,
        ),
        io::Error,
    > {
        let repo_dir = PageServerConf::test_repo_dir(test_name);
        let _ = fs::remove_dir_all(&repo_dir);
        let conf = PageServerConf::dummy_conf(repo_dir);
        // Make a static copy of the config. This can never be freed, but that's
        // OK in a test.
        let conf: &'static PageServerConf = Box::leak(Box::new(conf));

        let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap();
        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
        fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;

        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        Ok((conf, tenant_shard_id, timeline_id, ctx))
    }

    #[tokio::test]
    async fn ephemeral_file_holds_gate_open() {
        const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);

        let (conf, tenant_id, timeline_id, ctx) =
            harness("ephemeral_file_holds_gate_open").unwrap();

        let gate = utils::sync::gate::Gate::default();

        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
            .await
            .unwrap();

        let mut closing = tokio::task::spawn(async move {
            gate.close().await;
        });

        // The gate is entered until the ephemeral file is dropped.
        // Do not start paused: tokio-epoll-uring has a sleep loop.
        tokio::time::pause();
        tokio::time::timeout(FOREVER, &mut closing)
            .await
            .expect_err("closing cannot complete before dropping");

        // This is a requirement of the reset_tenant functionality: we have to be able to restart a
        // tenant fast, and for that, we need all tenant_dir operations to be guarded by entering a gate.
        drop(file);

        tokio::time::timeout(FOREVER, &mut closing)
            .await
            .expect("closing completes right away")
            .expect("closing does not panic");
    }

    #[tokio::test]
    async fn test_ephemeral_file_basics() {
        let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();

        let gate = utils::sync::gate::Gate::default();

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
            .await
            .unwrap();

        let mutable = file.buffered_writer.inspect_mutable();
        let cap = mutable.capacity();
        let align = mutable.align();

        let write_nbytes = cap * 2 + cap / 2;

        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
            .take(write_nbytes)
            .collect();

        let mut value_offsets = Vec::new();
        for range in (0..write_nbytes)
            .step_by(align)
            .map(|start| start..(start + align).min(write_nbytes))
        {
            let off = file.write_raw(&content[range], &ctx).await.unwrap();
            value_offsets.push(off);
        }

        assert_eq!(file.len() as usize, write_nbytes);
        for (i, range) in (0..write_nbytes)
            .step_by(align)
            .map(|start| start..(start + align).min(write_nbytes))
            .enumerate()
        {
            assert_eq!(value_offsets[i], range.start.into_u64());
            let buf = IoBufferMut::with_capacity(range.len());
            let (buf_slice, nread) = file
                .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx)
                .await
                .unwrap();
            let buf = buf_slice.into_inner();
            assert_eq!(nread, range.len());
            assert_eq!(&buf, &content[range]);
        }

        let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap();
        assert!(file_contents == content[0..cap * 2]);

        let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
        assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);

        let mutable_buffer_contents = file.buffered_writer.inspect_mutable();
        assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
    }

    #[tokio::test]
    async fn test_flushes_do_happen() {
        let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();

        let gate = utils::sync::gate::Gate::default();

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
            .await
            .unwrap();

        // The mutable buffer and the maybe_flushed buffer each hold `cap` bytes.
        let cap = file.buffered_writer.inspect_mutable().capacity();

        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
            .take(cap * 2 + cap / 2)
            .collect();

        file.write_raw(&content, &ctx).await.unwrap();

        // Assert the state is as this test expects it to be.
        assert_eq!(
            &file.load_to_io_buf(&ctx).await.unwrap(),
            &content[0..cap * 2 + cap / 2]
        );
        let md = file.buffered_writer.as_inner().path().metadata().unwrap();
        assert_eq!(
            md.len(),
            2 * cap.into_u64(),
            "buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
        );
        assert_eq!(
            &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
            &content[cap..cap * 2]
        );
        assert_eq!(
            &file.buffered_writer.inspect_mutable()[0..cap / 2],
            &content[cap * 2..cap * 2 + cap / 2]
        );
    }

    #[tokio::test]
    async fn test_read_split_across_file_and_buffer() {
        // This test exercises the logic on the read path that splits the logical read
        // into a read from the flushed part (= the file) and a copy from the buffered writer's buffers.
        //
        // This test builds on the assertions in test_flushes_do_happen.

        let (conf, tenant_id, timeline_id, ctx) =
            harness("test_read_split_across_file_and_buffer").unwrap();

        let gate = utils::sync::gate::Gate::default();

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
            .await
            .unwrap();

        let mutable = file.buffered_writer.inspect_mutable();
        let cap = mutable.capacity();
        let align = mutable.align();
        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
            .take(cap * 2 + cap / 2)
            .collect();

        let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap();

        let test_read = |start: usize, len: usize| {
            let file = &file;
            let ctx = &ctx;
            let content = &content;
            async move {
                let (buf, nread) = file
                    .read_exact_at_eof_ok(
                        start.into_u64(),
                        IoBufferMut::with_capacity(len).slice_full(),
                        ctx,
                    )
                    .await
                    .unwrap();
                assert_eq!(nread, len);
                assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
            }
        };

        let test_read_all_offset_combinations = || {
            async move {
                // completely within the file range
                test_read(align, align).await;
                // border onto the edge of the file
                test_read(cap - align, align).await;
                // read across file and buffer
                test_read(cap - align, 2 * align).await;
                // start at the beginning of the maybe_flushed buffer
                test_read(cap, align).await;
                // completely within the maybe_flushed buffer
                test_read(cap + align, align).await;
                // border onto the edge of the maybe_flushed buffer
                test_read(cap * 2 - align, align).await;
                // read across maybe_flushed and mutable buffer
                test_read(cap * 2 - align, 2 * align).await;
                // read across all three segments
                test_read(cap - align, cap + 2 * align).await;
                // completely within the mutable buffer
                test_read(cap * 2 + align, align).await;
            }
        };

        assert!(align < cap, "test assumption");
        assert!(cap % align == 0);

        // Test reads at different flush stages.
        let not_started = control.unwrap().into_not_started();
        test_read_all_offset_combinations().await;
        let in_progress = not_started.ready_to_flush();
        test_read_all_offset_combinations().await;
        in_progress.wait_until_flush_is_done().await;
        test_read_all_offset_combinations().await;
    }
}