Line data Source code
1 : use tracing::instrument;
2 : use tracing::{error, info};
3 :
4 : use crate::metrics::WalRedoKillCause;
5 : use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
6 :
7 : use std::io;
8 : use std::process::Command;
9 :
10 : use std::ops::DerefMut;
11 :
12 : use std::ops::Deref;
13 :
14 : use std::process::Child;
15 :
16 : use pageserver_api::shard::TenantShardId;
17 :
18 : /// Wrapper type around `std::process::Child` which guarantees that the child
19 : /// will be killed and waited-for by this process before being dropped.
20 : pub(crate) struct NoLeakChild {
21 : pub(crate) tenant_id: TenantShardId,
22 : pub(crate) child: Option<Child>,
23 : }
24 :
25 : impl Deref for NoLeakChild {
26 : type Target = Child;
27 :
28 72 : fn deref(&self) -> &Self::Target {
29 72 : self.child.as_ref().expect("must not use from drop")
30 72 : }
31 : }
32 :
33 : impl DerefMut for NoLeakChild {
34 72 : fn deref_mut(&mut self) -> &mut Self::Target {
35 72 : self.child.as_mut().expect("must not use from drop")
36 72 : }
37 : }
38 :
39 : impl NoLeakChild {
40 24 : pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
41 24 : let child = command.spawn()?;
42 24 : Ok(NoLeakChild {
43 24 : tenant_id,
44 24 : child: Some(child),
45 24 : })
46 24 : }
47 :
48 24 : pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) {
49 24 : let child = match self.child.take() {
50 24 : Some(child) => child,
51 0 : None => return,
52 : };
53 24 : Self::kill_and_wait_impl(child, cause);
54 24 : }
55 :
56 24 : #[instrument(skip_all, fields(pid=child.id(), ?cause))]
57 : pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
58 : scopeguard::defer! {
59 : WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
60 : }
61 : let res = child.kill();
62 : if let Err(e) = res {
63 : // This branch is very unlikely because:
64 : // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
65 : // - This is the only place that calls .kill()
66 : // - We consume `self`, so, .kill() can't be called twice.
67 : // - If the process exited by itself or was killed by someone else,
68 : // .kill() will still succeed because we haven't wait()'ed yet.
69 : //
70 : // So, if we arrive here, we have really no idea what happened,
71 : // whether the PID stored in self.child is still valid, etc.
72 : // If this function were fallible, we'd return an error, but
73 : // since it isn't, all we can do is log an error and proceed
74 : // with the wait().
75 : error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
76 : }
77 :
78 : match child.wait() {
79 : Ok(exit_status) => {
80 : info!(exit_status = %exit_status, "wait successful");
81 : }
82 : Err(e) => {
83 : error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
84 : }
85 : }
86 : }
87 : }
88 :
89 : impl Drop for NoLeakChild {
90 24 : fn drop(&mut self) {
91 24 : let child = match self.child.take() {
92 0 : Some(child) => child,
93 24 : None => return,
94 : };
95 0 : let tenant_shard_id = self.tenant_id;
96 0 : // Offload the kill+wait of the child process into the background.
97 0 : // If someone stops the runtime, we'll leak the child process.
98 0 : // We can ignore that case because we only stop the runtime on pageserver exit.
99 0 : tokio::runtime::Handle::current().spawn(async move {
100 0 : tokio::task::spawn_blocking(move || {
101 : // Intentionally don't inherit the tracing context from whoever is dropping us.
102 : // This thread here is going to outlive of our dropper.
103 0 : let span = tracing::info_span!(
104 : "walredo",
105 : tenant_id = %tenant_shard_id.tenant_id,
106 0 : shard_id = %tenant_shard_id.shard_slug()
107 : );
108 0 : let _entered = span.enter();
109 0 : Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
110 0 : })
111 0 : .await
112 0 : });
113 24 : }
114 : }
115 :
116 : pub(crate) trait NoLeakChildCommandExt {
117 : fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
118 : }
119 :
120 : impl NoLeakChildCommandExt for Command {
121 24 : fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
122 24 : NoLeakChild::spawn(tenant_id, self)
123 24 : }
124 : }
|