Line data Source code
1 : //! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific
2 : //! handling in case the instance can't be launched.
3 : //!
4 : //! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation
5 : //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series.
6 : //! See <https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391> for more details.
7 :
8 : use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
9 : use std::sync::Arc;
10 :
11 : use tokio_util::sync::CancellationToken;
12 : use tracing::{error, info, info_span, warn, Instrument};
13 : use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
14 :
15 : use tokio_epoll_uring::{System, SystemHandle};
16 :
17 : use crate::virtual_file::on_fatal_io_error;
18 :
19 : use crate::metrics::tokio_epoll_uring as metrics;
20 :
21 : #[derive(Clone)]
22 : struct ThreadLocalState(Arc<ThreadLocalStateInner>);
23 :
24 : struct ThreadLocalStateInner {
25 : cell: tokio::sync::OnceCell<SystemHandle>,
26 : launch_attempts: AtomicU32,
27 : /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
28 : thread_local_state_id: u64,
29 : }
30 :
31 : impl ThreadLocalState {
32 393 : pub fn new() -> Self {
33 393 : Self(Arc::new(ThreadLocalStateInner {
34 393 : cell: tokio::sync::OnceCell::default(),
35 393 : launch_attempts: AtomicU32::new(0),
36 393 : thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed),
37 393 : }))
38 393 : }
39 :
40 393 : pub fn make_id_string(&self) -> String {
41 393 : format!("{}", self.0.thread_local_state_id)
42 393 : }
43 : }
44 :
/// Process-wide counter that hands out a distinct id to every [`ThreadLocalState`]
/// via `fetch_add`; starts at zero.
static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0);
46 :
47 : thread_local! {
48 : static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new();
49 : }
50 :
51 : /// Panics if we cannot [`System::launch`].
52 1040546 : pub async fn thread_local_system() -> Handle {
53 1040546 : let fake_cancel = CancellationToken::new();
54 1040546 : loop {
55 1040546 : let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone());
56 1040546 : let inner = &thread_local_state.0;
57 1040546 : let get_or_init_res = inner
58 1040546 : .cell
59 1040546 : .get_or_try_init(|| async {
60 393 : let attempt_no = inner
61 393 : .launch_attempts
62 393 : .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
63 393 : let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no);
64 393 : async {
65 393 : // Rate-limit retries per thread-local.
66 393 : // NB: doesn't yield to executor at attempt_no=0.
67 393 : utils::backoff::exponential_backoff(
68 393 : attempt_no,
69 393 : DEFAULT_BASE_BACKOFF_SECONDS,
70 393 : DEFAULT_MAX_BACKOFF_SECONDS,
71 393 : &fake_cancel,
72 393 : )
73 0 : .await;
74 393 : let res = System::launch()
75 : // this might move us to another executor thread => loop outside the get_or_try_init, not inside it
76 393 : .await;
77 0 : match res {
78 393 : Ok(system) => {
79 393 : info!("successfully launched system");
80 393 : metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
81 393 : Ok(system)
82 : }
83 0 : Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
84 0 : warn!("not enough locked memory to tokio-epoll-uring, will retry");
85 0 : info_span!("stats").in_scope(|| {
86 0 : emit_launch_failure_process_stats();
87 0 : });
88 0 : metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
89 0 : Err(())
90 : }
91 : // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
92 : // This is equivalent to a fatal IO error.
93 0 : Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => {
94 0 : error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process");
95 0 : info_span!("stats").in_scope(|| {
96 0 : emit_launch_failure_process_stats();
97 0 : });
98 0 : on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring");
99 : },
100 : }
101 393 : }
102 393 : .instrument(span)
103 393 : .await
104 1040546 : })
105 393 : .await;
106 1040546 : if get_or_init_res.is_ok() {
107 1040546 : return Handle(thread_local_state);
108 0 : }
109 : }
110 1040546 : }
111 :
112 0 : fn emit_launch_failure_process_stats() {
113 0 : // tokio-epoll-uring stats
114 0 : // vmlck + rlimit
115 0 : // number of threads
116 0 : // rss / system memory usage generally
117 0 :
118 0 : let tokio_epoll_uring::metrics::Metrics {
119 0 : systems_created,
120 0 : systems_destroyed,
121 0 : } = tokio_epoll_uring::metrics::global();
122 0 : info!(systems_created, systems_destroyed, "tokio-epoll-uring");
123 :
124 0 : match procfs::process::Process::myself() {
125 0 : Ok(myself) => {
126 0 : match myself.limits() {
127 0 : Ok(limits) => {
128 0 : info!(?limits.max_locked_memory, "/proc/self/limits");
129 : }
130 0 : Err(error) => {
131 0 : info!(%error, "no limit stats due to error");
132 : }
133 : }
134 :
135 0 : match myself.status() {
136 0 : Ok(status) => {
137 0 : let procfs::process::Status {
138 0 : vmsize,
139 0 : vmlck,
140 0 : vmpin,
141 0 : vmrss,
142 0 : rssanon,
143 0 : rssfile,
144 0 : rssshmem,
145 0 : vmdata,
146 0 : vmstk,
147 0 : vmexe,
148 0 : vmlib,
149 0 : vmpte,
150 0 : threads,
151 0 : ..
152 0 : } = status;
153 0 : info!(
154 : vmsize,
155 : vmlck,
156 : vmpin,
157 : vmrss,
158 : rssanon,
159 : rssfile,
160 : rssshmem,
161 : vmdata,
162 : vmstk,
163 : vmexe,
164 : vmlib,
165 : vmpte,
166 : threads,
167 0 : "/proc/self/status"
168 : );
169 : }
170 0 : Err(error) => {
171 0 : info!(%error, "no status status due to error");
172 : }
173 : }
174 : }
175 0 : Err(error) => {
176 0 : info!(%error, "no process stats due to error");
177 : }
178 : };
179 0 : }
180 :
181 : #[derive(Clone)]
182 : pub struct Handle(ThreadLocalState);
183 :
184 : impl std::ops::Deref for Handle {
185 : type Target = SystemHandle;
186 :
187 1040546 : fn deref(&self) -> &Self::Target {
188 1040546 : self.0
189 1040546 : .0
190 1040546 : .cell
191 1040546 : .get()
192 1040546 : .expect("must be already initialized when using this")
193 1040546 : }
194 : }
|