Line data Source code
1 : //! Abstraction to create & read pidfiles.
2 : //!
3 : //! A pidfile is a file in the filesystem that stores a process's PID.
4 : //! Its purpose is to implement a singleton behavior where only
5 : //! one process of some "kind" is supposed to be running at a given time.
6 : //! The "kind" is identified by the pidfile.
7 : //!
8 : //! During process startup, the process that is supposed to be a singleton
9 : //! must [claim][`claim_for_current_process`] the pidfile first.
10 : //! If that is unsuccessful, the process must not act as the singleton, i.e.,
11 : //! it must not access any of the resources that only the singleton may access.
12 : //!
13 : //! A common need is to signal a running singleton process, e.g., to make
14 : //! it shut down and exit.
15 : //! For that, we have to [`read`] the pidfile. The result of the `read` operation
16 : //! tells us if there is any singleton process, and if so, what PID it has.
17 : //! We can then proceed to signal it, although some caveats still apply.
18 : //! Read the function-level documentation of [`read`] for that.
19 : //!
20 : //! ## Never Remove Pidfiles
21 : //!
22 : //! It would be natural to assume that the process who claimed the pidfile
23 : //! should remove it upon exit to avoid leaving a stale pidfile in place.
24 : //! However, we already have a reliable way to detect staleness of the pidfile,
25 : //! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it.
26 : //!
27 : //! And further, removing pidfiles would introduce a **catastrophic race condition**
28 : //! where two processes are running that are supposed to be singletons.
29 : //! Suppose we were to remove our pidfile during process shutdown.
30 : //! Here is how the race plays out:
31 : //! - Suppose we have a service called `myservice` with pidfile `myservice.pid`.
32 : //! - Process `A` starts to shut down.
33 : //! - Process `B` is just starting up
34 : //! - It `open("myservice.pid", O_WRONLY|O_CREAT)` the file
35 : //! - It blocks on `flock`
36 : //! - Process `A` removes the pidfile as the last step of its shutdown procedure
37 : //! - `unlink("myservice.pid")`
38 : //! - Process `A` exits
39 : //! - This releases its `flock` and unblocks `B`
40 : //! - Process `B` still has the file descriptor for `myservice.pid` open
41 : //! - Process `B` writes its PID into `myservice.pid`.
42 : //! - But the `myservice.pid` file has been unlinked, so there is no `myservice.pid`
43 : //! in the directory anymore.
44 : //! - Process `C` starts
45 : //! - It `open("myservice.pid", O_WRONLY|O_CREAT)` which creates a new file (new inode)
46 : //! - It `flock`s the file, which, since it's a different file, does not block
47 : //! - It writes its PID into the file
48 : //!
49 : //! At this point, `B` and `C` are running, which is hazardous.
50 : //! Moral of the story: don't unlink pidfiles, ever.
51 :
52 : use std::{ops::Deref, path::Path};
53 :
54 : use anyhow::Context;
55 : use nix::unistd::Pid;
56 :
57 : use crate::lock_file::{self, LockFileRead};
58 :
59 : /// Keeps a claim on a pidfile alive until it is dropped.
60 : /// Returned by [`claim_for_current_process`].
61 : #[must_use]
62 : pub struct PidFileGuard(lock_file::LockFileGuard);
63 :
64 : impl Deref for PidFileGuard {
65 : type Target = lock_file::LockFileGuard;
66 :
67 0 : fn deref(&self) -> &Self::Target {
68 0 : &self.0
69 0 : }
70 : }
71 :
72 : /// Try to claim `path` as a pidfile for the current process.
73 : ///
74 : /// If another process has already claimed the pidfile, and it is still running,
75 : /// this function returns ane error.
76 : /// Otherwise, the function `flock`s the file and updates its contents to the
77 : /// current process's PID.
78 : /// If the update fails, the flock is released and an error returned.
79 : /// On success, the function returns a [`PidFileGuard`] to keep the flock alive.
80 : ///
81 : /// ### Maintaining A Claim
82 : ///
83 : /// It is the caller's responsibility to maintain the claim.
84 : /// The claim ends as soon as the returned guard object is dropped.
85 : /// To maintain the claim for the remaining lifetime of the current process,
86 : /// use [`std::mem::forget`] or similar.
87 1092 : pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
88 1092 : let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
89 : // if any of the next steps fail, we drop the file descriptor and thereby release the lock
90 1092 : let guard = unwritten_lock_file
91 1092 : .write_content(Pid::this().to_string())
92 1092 : .context("write pid to lock file")?;
93 1092 : Ok(PidFileGuard(guard))
94 1092 : }
95 :
96 : /// Returned by [`read`].
97 : pub enum PidFileRead {
98 : /// No file exists at the given path.
99 : NotExist,
100 : /// The given pidfile is currently not claimed by any process.
101 : /// To determine this, the [`read`] operation acquired
102 : /// an exclusive flock on the file. The lock is still held and responsibility
103 : /// to release it is returned through the guard object.
104 : /// Before releasing it, other [`claim_for_current_process`] or [`read`] calls
105 : /// will fail.
106 : ///
107 : /// ### Caveats
108 : ///
109 : /// Do not unlink the pidfile from the filesystem. See module-comment for why.
110 : NotHeldByAnyProcess(PidFileGuard),
111 : /// The given pidfile is still claimed by another process whose PID is given
112 : /// as part of this variant.
113 : ///
114 : /// ### Caveats
115 : ///
116 : /// 1. The other process might exit at any time, turning the given PID stale.
117 : /// 2. There is a small window in which `claim_for_current_process` has already
118 : /// locked the file but not yet updates its contents. [`read`] will return
119 : /// this variant here, but with the old file contents, i.e., a stale PID.
120 : ///
121 : /// The kernel is free to recycle PID once it has been `wait(2)`ed upon by
122 : /// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
123 : /// system call on it, bears the risk of killing an unrelated process.
124 : /// This is an inherent limitation of using pidfiles.
125 : /// The only race-free solution is to have a supervisor-process with a lifetime
126 : /// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
127 : LockedByOtherProcess(Pid),
128 : }
129 :
130 : /// Try to read the file at the given path as a pidfile that was previously created
131 : /// through [`claim_for_current_process`].
132 : ///
133 : /// On success, this function returns a [`PidFileRead`].
134 : /// Check its docs for a description of the meaning of its different variants.
135 2224 : pub fn read(pidfile: &Path) -> anyhow::Result<PidFileRead> {
136 2224 : let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
137 2224 : let ret = match res {
138 0 : LockFileRead::NotExist => PidFileRead::NotExist,
139 38 : LockFileRead::NotHeldByAnyProcess(guard, _) => {
140 38 : PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard))
141 : }
142 : LockFileRead::LockedByOtherProcess {
143 2186 : not_locked_file: _not_locked_file,
144 2186 : content,
145 2186 : } => {
146 2186 : // XXX the read races with the write in claim_pid_file_for_pid().
147 2186 : // But pids are smaller than a page, so the kernel page cache will lock for us.
148 2186 : // The only problem is that we might get the old contents here.
149 2186 : // Can only fix that by implementing some scheme that downgrades the
150 2186 : // exclusive lock to shared lock in claim_pid_file_for_pid().
151 2186 : PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?)
152 : }
153 : };
154 2224 : Ok(ret)
155 2224 : }
156 :
157 2186 : fn parse_pidfile_content(content: &str) -> anyhow::Result<Pid> {
158 2186 : let pid: i32 = content
159 2186 : .parse()
160 2186 : .map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?;
161 2186 : if pid < 1 {
162 0 : anyhow::bail!("bad value in pidfile '{pid}'");
163 2186 : }
164 2186 : Ok(Pid::from_raw(pid))
165 2186 : }
|