Line data Source code
1 : //! Abstraction to create & read pidfiles.
2 : //!
3 : //! A pidfile is a file in the filesystem that stores a process's PID.
4 : //! Its purpose is to implement a singleton behavior where only
5 : //! one process of some "kind" is supposed to be running at a given time.
6 : //! The "kind" is identified by the pidfile.
7 : //!
8 : //! During process startup, the process that is supposed to be a singleton
9 : //! must [claim][`claim_for_current_process`] the pidfile first.
10 : //! If that is unsuccessful, the process must not act as the singleton, i.e.,
11 : //! it must not access any of the resources that only the singleton may access.
12 : //!
13 : //! A common need is to signal a running singleton process, e.g., to make
14 : //! it shut down and exit.
15 : //! For that, we have to [`read`] the pidfile. The result of the `read` operation
16 : //! tells us if there is any singleton process, and if so, what PID it has.
17 : //! We can then proceed to signal it, although some caveats still apply.
18 : //! Read the function-level documentation of [`read`] for that.
19 : //!
20 : //! ## Never Remove Pidfiles
21 : //!
22 : //! It would be natural to assume that the process who claimed the pidfile
23 : //! should remove it upon exit to avoid leaving a stale pidfile in place.
24 : //! However, we already have a reliable way to detect staleness of the pidfile,
25 : //! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it.
26 : //!
27 : //! And further, removing pidfiles would introduce a **catastrophic race condition**
28 : //! where two processes are running that are supposed to be singletons.
29 : //! Suppose we were to remove our pidfile during process shutdown.
30 : //! Here is how the race plays out:
31 : //! - Suppose we have a service called `myservice` with pidfile `myservice.pid`.
32 : //! - Process `A` starts to shut down.
33 : //! - Process `B` is just starting up
34 : //! - It `open("myservice.pid", O_WRONLY|O_CREAT)` the file
35 : //! - It blocks on `flock`
36 : //! - Process `A` removes the pidfile as the last step of its shutdown procedure
37 : //! - `unlink("myservice.pid")`
38 : //! - Process `A` exits
39 : //! - This releases its `flock` and unblocks `B`
40 : //! - Process `B` still has the file descriptor for `myservice.pid` open
41 : //! - Process `B` writes its PID into `myservice.pid`.
42 : //! - But the `myservice.pid` file has been unlinked, so, there is no `myservice.pid`
43 : //! in the directory.
44 : //! - Process `C` starts
45 : //! - It `open("myservice.pid", O_WRONLY|O_CREAT)` which creates a new file (new inode)
46 : //! - It `flock`s the file, which, since it's a different file, does not block
47 : //! - It writes its PID into the file
48 : //!
49 : //! At this point, `B` and `C` are running, which is hazardous.
50 : //! Moral of the story: don't unlink pidfiles, ever.
51 :
52 : use std::ops::Deref;
53 :
54 : use anyhow::Context;
55 : use camino::Utf8Path;
56 : use nix::unistd::Pid;
57 :
58 : use crate::lock_file::{self, LockFileRead};
59 :
60 : /// Keeps a claim on a pidfile alive until it is dropped.
61 : /// Returned by [`claim_for_current_process`].
62 : #[must_use]
63 : pub struct PidFileGuard(lock_file::LockFileGuard);
64 :
65 : impl Deref for PidFileGuard {
66 : type Target = lock_file::LockFileGuard;
67 :
68 0 : fn deref(&self) -> &Self::Target {
69 0 : &self.0
70 0 : }
71 : }
72 :
73 : /// Try to claim `path` as a pidfile for the current process.
74 : ///
75 : /// If another process has already claimed the pidfile, and it is still running,
76 : /// this function returns ane error.
77 : /// Otherwise, the function `flock`s the file and updates its contents to the
78 : /// current process's PID.
79 : /// If the update fails, the flock is released and an error returned.
80 : /// On success, the function returns a [`PidFileGuard`] to keep the flock alive.
81 : ///
82 : /// ### Maintaining A Claim
83 : ///
84 : /// It is the caller's responsibility to maintain the claim.
85 : /// The claim ends as soon as the returned guard object is dropped.
86 : /// To maintain the claim for the remaining lifetime of the current process,
87 : /// use [`std::mem::forget`] or similar.
88 0 : pub fn claim_for_current_process(path: &Utf8Path) -> anyhow::Result<PidFileGuard> {
89 0 : let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
90 : // if any of the next steps fail, we drop the file descriptor and thereby release the lock
91 0 : let guard = unwritten_lock_file
92 0 : .write_content(Pid::this().to_string())
93 0 : .context("write pid to lock file")?;
94 0 : Ok(PidFileGuard(guard))
95 0 : }
96 :
97 : /// Returned by [`read`].
98 : pub enum PidFileRead {
99 : /// No file exists at the given path.
100 : NotExist,
101 : /// The given pidfile is currently not claimed by any process.
102 : /// To determine this, the [`read`] operation acquired
103 : /// an exclusive flock on the file. The lock is still held and responsibility
104 : /// to release it is returned through the guard object.
105 : /// Before releasing it, other [`claim_for_current_process`] or [`read`] calls
106 : /// will fail.
107 : ///
108 : /// ### Caveats
109 : ///
110 : /// Do not unlink the pidfile from the filesystem. See module-comment for why.
111 : NotHeldByAnyProcess(PidFileGuard),
112 : /// The given pidfile is still claimed by another process whose PID is given
113 : /// as part of this variant.
114 : ///
115 : /// ### Caveats
116 : ///
117 : /// 1. The other process might exit at any time, turning the given PID stale.
118 : /// 2. There is a small window in which `claim_for_current_process` has already
119 : /// locked the file but not yet updates its contents. [`read`] will return
120 : /// this variant here, but with the old file contents, i.e., a stale PID.
121 : ///
122 : /// The kernel is free to recycle PID once it has been `wait(2)`ed upon by
123 : /// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
124 : /// system call on it, bears the risk of killing an unrelated process.
125 : /// This is an inherent limitation of using pidfiles.
126 : /// The only race-free solution is to have a supervisor-process with a lifetime
127 : /// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
128 : LockedByOtherProcess(Pid),
129 : }
130 :
131 : /// Try to read the file at the given path as a pidfile that was previously created
132 : /// through [`claim_for_current_process`].
133 : ///
134 : /// On success, this function returns a [`PidFileRead`].
135 : /// Check its docs for a description of the meaning of its different variants.
136 0 : pub fn read(pidfile: &Utf8Path) -> anyhow::Result<PidFileRead> {
137 0 : let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
138 0 : let ret = match res {
139 0 : LockFileRead::NotExist => PidFileRead::NotExist,
140 0 : LockFileRead::NotHeldByAnyProcess(guard, _) => {
141 0 : PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard))
142 : }
143 : LockFileRead::LockedByOtherProcess {
144 0 : not_locked_file: _not_locked_file,
145 0 : content,
146 0 : } => {
147 0 : // XXX the read races with the write in claim_pid_file_for_pid().
148 0 : // But pids are smaller than a page, so the kernel page cache will lock for us.
149 0 : // The only problem is that we might get the old contents here.
150 0 : // Can only fix that by implementing some scheme that downgrades the
151 0 : // exclusive lock to shared lock in claim_pid_file_for_pid().
152 0 : PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?)
153 : }
154 : };
155 0 : Ok(ret)
156 0 : }
157 :
158 0 : fn parse_pidfile_content(content: &str) -> anyhow::Result<Pid> {
159 0 : let pid: i32 = content
160 0 : .parse()
161 0 : .map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?;
162 0 : if pid < 1 {
163 0 : anyhow::bail!("bad value in pidfile '{pid}'");
164 0 : }
165 0 : Ok(Pid::from_raw(pid))
166 0 : }
|