Line data Source code
1 : use std::borrow::Cow;
2 : use std::fs::{self, File};
3 : use std::io::{self, Write};
4 : use std::os::fd::AsRawFd;
5 :
6 : use camino::{Utf8Path, Utf8PathBuf};
7 :
8 : /// Similar to [`std::fs::create_dir`], except we fsync the
9 : /// created directory and its parent.
10 895 : pub fn create_dir(path: impl AsRef<Utf8Path>) -> io::Result<()> {
11 895 : let path = path.as_ref();
12 895 :
13 895 : fs::create_dir(path)?;
14 893 : fsync_file_and_parent(path)?;
15 893 : Ok(())
16 3 : }
17 :
18 : /// Similar to [`std::fs::create_dir_all`], except we fsync all
19 : /// newly created directories and the pre-existing parent.
20 5 : pub fn create_dir_all(path: impl AsRef<Utf8Path>) -> io::Result<()> {
21 5 : let mut path = path.as_ref();
22 5 :
23 5 : let mut dirs_to_create = Vec::new();
24 :
25 : // Figure out which directories we need to create.
26 : loop {
27 8 : match path.metadata() {
28 4 : Ok(metadata) if metadata.is_dir() => break,
29 : Ok(_) => {
30 1 : return Err(io::Error::new(
31 1 : io::ErrorKind::AlreadyExists,
32 1 : format!("non-directory found in path: {path}"),
33 1 : ));
34 : }
35 4 : Err(ref e) if e.kind() == io::ErrorKind::NotFound => {}
36 1 : Err(e) => return Err(e),
37 : }
38 :
39 3 : dirs_to_create.push(path);
40 3 :
41 3 : match path.parent() {
42 3 : Some(parent) => path = parent,
43 : None => {
44 0 : return Err(io::Error::new(
45 0 : io::ErrorKind::InvalidInput,
46 0 : format!("can't find parent of path '{path}'"),
47 0 : ));
48 : }
49 : }
50 : }
51 :
52 : // Create directories from parent to child.
53 3 : for &path in dirs_to_create.iter().rev() {
54 3 : fs::create_dir(path)?;
55 : }
56 :
57 : // Fsync the created directories from child to parent.
58 3 : for &path in dirs_to_create.iter() {
59 3 : fsync(path)?;
60 : }
61 :
62 : // If we created any new directories, fsync the parent.
63 3 : if !dirs_to_create.is_empty() {
64 2 : fsync(path)?;
65 1 : }
66 :
67 3 : Ok(())
68 5 : }
69 :
70 : /// Adds a suffix to the file(directory) name, either appending the suffix to the end of its extension,
71 : /// or if there's no extension, creates one and puts a suffix there.
72 6383 : pub fn path_with_suffix_extension(
73 6383 : original_path: impl AsRef<Utf8Path>,
74 6383 : suffix: &str,
75 6383 : ) -> Utf8PathBuf {
76 6383 : let new_extension = match original_path.as_ref().extension() {
77 3101 : Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
78 3282 : None => Cow::Borrowed(suffix),
79 : };
80 6383 : original_path.as_ref().with_extension(new_extension)
81 6383 : }
82 :
83 893 : pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
84 893 : let parent = file_path.parent().ok_or_else(|| {
85 0 : io::Error::new(
86 0 : io::ErrorKind::Other,
87 0 : format!("File {file_path:?} has no parent"),
88 0 : )
89 893 : })?;
90 :
91 893 : fsync(file_path)?;
92 893 : fsync(parent)?;
93 893 : Ok(())
94 893 : }
95 :
96 1791 : pub fn fsync(path: &Utf8Path) -> io::Result<()> {
97 1791 : File::open(path)
98 1791 : .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}")))
99 1791 : .and_then(|file| {
100 1791 : file.sync_all().map_err(|e| {
101 0 : io::Error::new(
102 0 : e.kind(),
103 0 : format!("Failed to sync file {path:?} data and metadata: {e}"),
104 0 : )
105 1791 : })
106 1791 : })
107 1791 : .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
108 1791 : }
109 :
110 52 : pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Error> {
111 52 : tokio::fs::File::open(path.as_ref()).await?.sync_all().await
112 0 : }
113 :
114 56 : pub async fn fsync_async_opt(
115 56 : path: impl AsRef<Utf8Path>,
116 56 : do_fsync: bool,
117 56 : ) -> Result<(), std::io::Error> {
118 56 : if do_fsync {
119 48 : fsync_async(path.as_ref()).await?;
120 0 : }
121 56 : Ok(())
122 0 : }
123 :
124 : /// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After
125 : /// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the
126 : /// same file system.
127 : ///
128 : /// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to
129 : /// make the rename durable. This sequence ensures the target file will never be incomplete.
130 : ///
131 : /// Postgres also:
132 : ///
133 : /// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing
134 : /// file survives a crash. Current callers don't need this as it should already be fsynced if
135 : /// durability is needed.
136 : ///
137 : /// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g.
138 : /// NFS), but not on Linux with most common file systems like ext4 (which we currently use).
139 : ///
140 : /// An audit of 8 other databases found that none fsynced the file after a rename:
141 : /// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837180535>
142 : ///
143 : /// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs:
144 : /// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837926218>
145 : ///
146 : /// virtual_file.rs has similar code, but it doesn't use vfs.
147 : ///
148 : /// Useful links: <https://lwn.net/Articles/457667/>
149 : /// <https://www.postgresql.org/message-id/flat/56583BDD.9060302%402ndquadrant.com>
150 : /// <https://thunk.org/tytso/blog/2009/03/15/dont-fear-the-fsync/>
151 28 : pub async fn durable_rename(
152 28 : old_path: impl AsRef<Utf8Path>,
153 28 : new_path: impl AsRef<Utf8Path>,
154 28 : do_fsync: bool,
155 28 : ) -> io::Result<()> {
156 28 : // first fsync the file
157 28 : fsync_async_opt(old_path.as_ref(), do_fsync).await?;
158 :
159 : // Time to do the real deal.
160 28 : tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
161 :
162 : // Now fsync the parent
163 28 : let parent = match new_path.as_ref().parent() {
164 28 : Some(p) => p,
165 0 : None => Utf8Path::new("./"), // assume current dir if there is no parent
166 : };
167 28 : fsync_async_opt(parent, do_fsync).await?;
168 :
169 28 : Ok(())
170 0 : }
171 :
172 : /// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`].
173 : ///
174 : /// The file is first written to the specified `tmp_path`, and in a second
175 : /// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
176 : /// and atomic rename guarantee that, if we crash at any point, there will never
177 : /// be a partially written file at `final_path` (but maybe at `tmp_path`).
178 : ///
179 : /// Callers are responsible for serializing calls of this function for a given `final_path`.
180 : /// If they don't, there may be an error due to conflicting `tmp_path`, or there will
181 : /// be no error and the content of `final_path` will be the "winner" caller's `content`.
182 : /// I.e., the atomticity guarantees still hold.
183 56 : pub fn overwrite(
184 56 : final_path: &Utf8Path,
185 56 : tmp_path: &Utf8Path,
186 56 : content: &[u8],
187 56 : ) -> std::io::Result<()> {
188 56 : let Some(final_path_parent) = final_path.parent() else {
189 0 : return Err(std::io::Error::from_raw_os_error(
190 0 : nix::errno::Errno::EINVAL as i32,
191 0 : ));
192 : };
193 56 : std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
194 56 : let mut file = std::fs::OpenOptions::new()
195 56 : .write(true)
196 56 : // Use `create_new` so that, if we race with ourselves or something else,
197 56 : // we bail out instead of causing damage.
198 56 : .create_new(true)
199 56 : .open(tmp_path)?;
200 56 : file.write_all(content)?;
201 56 : file.sync_all()?;
202 56 : drop(file); // don't keep the fd open for longer than we have to
203 56 :
204 56 : std::fs::rename(tmp_path, final_path)?;
205 :
206 56 : let final_parent_dirfd = std::fs::OpenOptions::new()
207 56 : .read(true)
208 56 : .open(final_path_parent)?;
209 :
210 56 : final_parent_dirfd.sync_all()?;
211 56 : Ok(())
212 56 : }
213 :
214 : /// Syncs the filesystem for the given file descriptor.
215 : #[cfg_attr(target_os = "macos", allow(unused_variables))]
216 0 : pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
217 : // Linux guarantees durability for syncfs.
218 : // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
219 : #[cfg(target_os = "linux")]
220 0 : {
221 0 : use anyhow::Context;
222 0 : nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
223 : }
224 : #[cfg(target_os = "macos")]
225 : {
226 : // macOS is not a production platform for Neon, don't even bother.
227 : }
228 : #[cfg(not(any(target_os = "linux", target_os = "macos")))]
229 : {
230 : compile_error!("Unsupported OS");
231 : }
232 0 : Ok(())
233 0 : }
234 :
#[cfg(test)]
mod tests {

    use super::*;

    /// `create_dir` creates exactly one level: it fails on an existing
    /// directory and on a path whose parent is missing.
    #[test]
    fn test_create_dir_fsyncd() {
        let tmp = camino_tempfile::tempdir().unwrap();
        let root = tmp.path();

        // The temporary directory itself already exists.
        let already_exists = create_dir(root).unwrap_err();
        assert_eq!(already_exists.kind(), io::ErrorKind::AlreadyExists);

        // A direct child can be created.
        create_dir(root.join("child")).unwrap();

        // A grandchild cannot, because its parent does not exist.
        let missing_parent = create_dir(root.join("child1").join("child2")).unwrap_err();
        assert_eq!(missing_parent.kind(), io::ErrorKind::NotFound);
    }

    /// `create_dir_all` accepts existing directories, creates nested ones,
    /// and rejects paths that run through a regular file.
    #[test]
    fn test_create_dir_all_fsyncd() {
        let tmp = camino_tempfile::tempdir().unwrap();
        let root = tmp.path();

        // Already-existing directory: nothing to do, no error.
        create_dir_all(root).unwrap();

        // One missing level.
        let single = root.join("child");
        assert!(!single.exists());
        create_dir_all(&single).unwrap();
        assert!(single.exists());

        // Two missing levels.
        let nested = root.join("child1").join("child2");
        assert!(!nested.exists());
        create_dir_all(&nested).unwrap();
        assert!(nested.exists());

        // The target itself is a regular file.
        let regular_file = root.join("file");
        std::fs::write(&regular_file, b"").unwrap();

        let on_file = create_dir_all(&regular_file).unwrap_err();
        assert_eq!(on_file.kind(), io::ErrorKind::AlreadyExists);

        // A regular file in the middle of the path.
        let below_file = regular_file.join("folder");
        create_dir_all(below_file).unwrap_err();
    }

    /// Table of `(input path, suffix, expected result)` cases.
    #[test]
    fn test_path_with_suffix_extension() {
        let cases = [
            ("/foo/bar", "temp", "/foo/bar.temp"),
            ("/foo/bar", "temp.temp", "/foo/bar.temp.temp"),
            ("/foo/bar.baz", "temp.temp", "/foo/bar.baz.temp.temp"),
            ("/foo/bar.baz", ".temp", "/foo/bar.baz..temp"),
            ("/foo/bar/dir/", ".temp", "/foo/bar/dir..temp"),
        ];

        for &(input, suffix, expected) in &cases {
            let p = Utf8PathBuf::from(input);
            assert_eq!(
                path_with_suffix_extension(p, suffix).to_string().as_str(),
                expected
            );
        }
    }
}
|