Line data Source code
1 : //
2 : // Main entry point for the safekeeper executable
3 : //
4 : use anyhow::{bail, Context, Result};
5 : use clap::Parser;
6 : use futures::future::BoxFuture;
7 : use futures::stream::FuturesUnordered;
8 : use futures::{FutureExt, StreamExt};
9 : use remote_storage::RemoteStorageConfig;
10 : use tokio::runtime::Handle;
11 : use tokio::signal::unix::{signal, SignalKind};
12 : use tokio::task::JoinError;
13 : use toml_edit::Document;
14 :
15 : use std::fs::{self, File};
16 : use std::io::{ErrorKind, Write};
17 : use std::path::{Path, PathBuf};
18 : use std::str::FromStr;
19 : use std::sync::Arc;
20 : use std::time::Duration;
21 : use storage_broker::Uri;
22 : use tokio::sync::mpsc;
23 :
24 : use tracing::*;
25 : use utils::pid_file;
26 :
27 : use metrics::set_build_info_metric;
28 : use safekeeper::defaults::{
29 : DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
30 : DEFAULT_PG_LISTEN_ADDR,
31 : };
32 : use safekeeper::wal_service;
33 : use safekeeper::GlobalTimelines;
34 : use safekeeper::SafeKeeperConf;
35 : use safekeeper::{broker, WAL_SERVICE_RUNTIME};
36 : use safekeeper::{control_file, BROKER_RUNTIME};
37 : use safekeeper::{http, WAL_REMOVER_RUNTIME};
38 : use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
39 : use safekeeper::{wal_backup, HTTP_RUNTIME};
40 : use storage_broker::DEFAULT_ENDPOINT;
41 : use utils::auth::{JwtAuth, Scope};
42 : use utils::{
43 : id::NodeId,
44 : logging::{self, LogFormat},
45 : project_git_version,
46 : sentry_init::init_sentry,
47 : tcp_listener,
48 : };
49 :
50 : const PID_FILE_NAME: &str = "safekeeper.pid";
51 : const ID_FILE_NAME: &str = "safekeeper.id";
52 :
53 : project_git_version!(GIT_VERSION);
54 :
55 : const ABOUT: &str = r#"
56 : A fleet of safekeepers is responsible for reliably storing WAL received from
57 : compute, passing it through consensus (mitigating potential compute
58 : split-brain), and serving the hardened part further downstream to pageserver(s).
59 : "#;
60 :
61 1035 : #[derive(Parser)]
62 : #[command(name = "Neon safekeeper", version = GIT_VERSION, about = ABOUT, long_about = None)]
63 : struct Args {
64 : /// Path to the safekeeper data directory.
65 : #[arg(short = 'D', long, default_value = "./")]
66 0 : datadir: PathBuf,
67 : /// Safekeeper node id.
68 : #[arg(long)]
69 : id: Option<u64>,
70 : /// Initialize safekeeper with given id and exit.
71 : #[arg(long)]
72 0 : init: bool,
73 0 : /// Listen endpoint for receiving/sending WAL in the form host:port.
74 : #[arg(short, long, default_value = DEFAULT_PG_LISTEN_ADDR)]
75 0 : listen_pg: String,
76 : /// Listen endpoint for receiving/sending WAL in the form host:port allowing
77 : /// only tenant scoped auth tokens. Pointless if auth is disabled.
78 : #[arg(long, default_value = None, verbatim_doc_comment)]
79 : listen_pg_tenant_only: Option<String>,
80 : /// Listen http endpoint for management and metrics in the form host:port.
81 : #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
82 0 : listen_http: String,
83 : /// Advertised endpoint for receiving/sending WAL in the form host:port. If not
84 : /// specified, listen_pg is used to advertise instead.
85 : #[arg(long, default_value = None)]
86 : advertise_pg: Option<String>,
87 : /// Availability zone of the safekeeper.
88 : #[arg(long)]
89 : availability_zone: Option<String>,
90 : /// Do not wait for changes to be written safely to disk. Unsafe.
91 : #[arg(short, long)]
92 0 : no_sync: bool,
93 0 : /// Dump control file at path specified by this argument and exit.
94 : #[arg(long)]
95 : dump_control_file: Option<PathBuf>,
96 : /// Broker endpoint for storage nodes coordination in the form
97 : /// http[s]://host:port. With the https scheme a TLS connection is
98 : /// established; plaintext otherwise.
99 : #[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)]
100 0 : broker_endpoint: Uri,
101 0 : /// Broker keepalive interval.
102 : #[arg(long, value_parser= humantime::parse_duration, default_value = storage_broker::DEFAULT_KEEPALIVE_INTERVAL)]
103 0 : broker_keepalive_interval: Duration,
104 : /// Peer safekeeper is considered dead after not receiving heartbeats from
105 : /// it during this period passed as a human readable duration.
106 : #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)]
107 0 : heartbeat_timeout: Duration,
108 : /// Remote storage configuration for WAL backup (offloading to s3) as TOML
109 : /// inline table, e.g.
110 : /// {"max_concurrent_syncs" = 17, "max_sync_errors" = 13, "bucket_name" = "<BUCKETNAME>", "bucket_region" = "<REGION>", "concurrency_limit" = 119}
111 : /// Safekeeper offloads WAL to
112 : /// [prefix_in_bucket/]<tenant_id>/<timeline_id>/<segment_file>, mirroring
113 : /// structure on the file system.
114 : #[arg(long, value_parser = parse_remote_storage, verbatim_doc_comment)]
115 : remote_storage: Option<RemoteStorageConfig>,
116 : /// Safekeeper won't be elected for WAL offloading if it lags behind by more than this many bytes.
117 518 : #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
118 0 : max_offloader_lag: u64,
119 0 : /// Maximum number of WAL segments to offload to remote storage in parallel.
120 : #[arg(long, default_value = "5")]
121 0 : wal_backup_parallel_jobs: usize,
122 0 : /// Disable WAL backup to s3. When disabled, safekeeper removes WAL without
123 : /// regard to the WAL backup horizon.
124 : #[arg(long)]
125 0 : disable_wal_backup: bool,
126 0 : /// If given, enables auth on incoming connections to WAL service endpoint
127 : /// (--listen-pg). The value is the path to a .pem public key used to
128 : /// validate JWT tokens. An empty string is allowed and disables
129 : /// auth.
130 : #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
131 : pg_auth_public_key_path: Option<PathBuf>,
132 : /// If given, enables auth on incoming connections to tenant only WAL
133 : /// service endpoint (--listen-pg-tenant-only). The value is the path to a
134 : /// .pem public key used to validate JWT tokens. An empty string is
135 : /// allowed and disables auth.
136 : #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
137 : pg_tenant_only_auth_public_key_path: Option<PathBuf>,
138 : /// If given, enables auth on incoming connections to http management
139 : /// service endpoint (--listen-http). The value is the path to a .pem public
140 : /// key used to validate JWT tokens. An empty string is allowed and
141 : /// disables auth.
142 : #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
143 : http_auth_public_key_path: Option<PathBuf>,
144 : /// Format for logging, either 'plain' or 'json'.
145 : #[arg(long, default_value = "plain")]
146 0 : log_format: String,
147 : /// Run everything in single threaded current thread runtime, might be
148 : /// useful for debugging.
149 : #[arg(long)]
150 0 : current_thread_runtime: bool,
151 0 : }
152 :
153 : // Like PathBufValueParser, but allows empty string.
154 63 : fn opt_pathbuf_parser(s: &str) -> Result<PathBuf, String> {
155 63 : Ok(PathBuf::from_str(s).unwrap())
156 63 : }
157 :
158 : #[tokio::main(flavor = "current_thread")]
159 517 : async fn main() -> anyhow::Result<()> {
160 517 : // We want to allow multiple occurrences of the same arg (taking the last) so
161 517 : // that neon_local can generate a command with defaults + overrides without
162 517 : // hitting an 'argument cannot be used multiple times' error. This seems to be
163 517 : // impossible with the pure derive API, so convert the struct to a Command, modify it,
164 517 : // parse the arguments, and then fill the struct back in.
165 517 : let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
166 517 : let mut matches = cmd.get_matches();
167 517 : let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
168 :
169 : // I failed to modify opt_pathbuf_parser to return Option<PathBuf> in a
170 : // reasonable time, so turn an empty string into None after the fact.
171 517 : if let Some(pb) = &args.pg_auth_public_key_path {
172 20 : if pb.as_os_str().is_empty() {
173 1 : args.pg_auth_public_key_path = None;
174 19 : }
175 497 : }
176 517 : if let Some(pb) = &args.pg_tenant_only_auth_public_key_path {
177 20 : if pb.as_os_str().is_empty() {
178 0 : args.pg_tenant_only_auth_public_key_path = None;
179 20 : }
180 497 : }
181 517 : if let Some(pb) = &args.http_auth_public_key_path {
182 20 : if pb.as_os_str().is_empty() {
183 2 : args.http_auth_public_key_path = None;
184 18 : }
185 497 : }
186 :
187 517 : if let Some(addr) = args.dump_control_file {
188 0 : let state = control_file::FileStorage::load_control_file(addr)?;
189 0 : let json = serde_json::to_string(&state)?;
190 0 : print!("{json}");
191 0 : return Ok(());
192 517 : }
193 517 :
194 517 : // important to keep the order of:
195 517 : // 1. init logging
196 517 : // 2. tracing panic hook
197 517 : // 3. sentry
198 517 : logging::init(
199 517 : LogFormat::from_config(&args.log_format)?,
200 517 : logging::TracingErrorLayerEnablement::Disabled,
201 0 : )?;
202 517 : logging::replace_panic_hook_with_tracing_panic_hook().forget();
203 517 : info!("version: {GIT_VERSION}");
204 :
205 517 : let args_workdir = &args.datadir;
206 517 : let workdir = args_workdir.canonicalize().with_context(|| {
207 0 : format!("Failed to get the absolute path for input workdir {args_workdir:?}")
208 517 : })?;
209 :
210 : // Change into the data directory.
211 517 : std::env::set_current_dir(&workdir)?;
212 :
213 : // Set or read our ID.
214 517 : let id = set_id(&workdir, args.id.map(NodeId))?;
215 517 : if args.init {
216 0 : return Ok(());
217 517 : }
218 :
219 517 : let pg_auth = match args.pg_auth_public_key_path.as_ref() {
220 : None => {
221 498 : info!("pg auth is disabled");
222 498 : None
223 : }
224 19 : Some(path) => {
225 19 : info!("loading pg auth JWT key from {}", path.display());
226 : Some(Arc::new(
227 19 : JwtAuth::from_key_path(path).context("failed to load the auth key")?,
228 : ))
229 : }
230 : };
231 517 : let pg_tenant_only_auth = match args.pg_tenant_only_auth_public_key_path.as_ref() {
232 : None => {
233 497 : info!("pg tenant only auth is disabled");
234 497 : None
235 : }
236 20 : Some(path) => {
237 20 : info!(
238 20 : "loading pg tenant only auth JWT key from {}",
239 20 : path.display()
240 20 : );
241 : Some(Arc::new(
242 20 : JwtAuth::from_key_path(path).context("failed to load the auth key")?,
243 : ))
244 : }
245 : };
246 517 : let http_auth = match args.http_auth_public_key_path.as_ref() {
247 : None => {
248 499 : info!("http auth is disabled");
249 499 : None
250 : }
251 18 : Some(path) => {
252 18 : info!("loading http auth JWT key from {}", path.display());
253 : Some(Arc::new(
254 18 : JwtAuth::from_key_path(path).context("failed to load the auth key")?,
255 : ))
256 : }
257 : };
258 :
259 517 : let conf = SafeKeeperConf {
260 517 : workdir,
261 517 : my_id: id,
262 517 : listen_pg_addr: args.listen_pg,
263 517 : listen_pg_addr_tenant_only: args.listen_pg_tenant_only,
264 517 : listen_http_addr: args.listen_http,
265 517 : advertise_pg_addr: args.advertise_pg,
266 517 : availability_zone: args.availability_zone,
267 517 : no_sync: args.no_sync,
268 517 : broker_endpoint: args.broker_endpoint,
269 517 : broker_keepalive_interval: args.broker_keepalive_interval,
270 517 : heartbeat_timeout: args.heartbeat_timeout,
271 517 : remote_storage: args.remote_storage,
272 517 : max_offloader_lag_bytes: args.max_offloader_lag,
273 517 : wal_backup_enabled: !args.disable_wal_backup,
274 517 : backup_parallel_jobs: args.wal_backup_parallel_jobs,
275 517 : pg_auth,
276 517 : pg_tenant_only_auth,
277 517 : http_auth,
278 517 : current_thread_runtime: args.current_thread_runtime,
279 517 : };
280 517 :
281 517 : // initialize sentry if SENTRY_DSN is provided
282 517 : let _sentry_guard = init_sentry(
283 517 : Some(GIT_VERSION.into()),
284 517 : &[("node_id", &conf.my_id.to_string())],
285 517 : );
286 1034 : start_safekeeper(conf).await
287 : }
288 :
289 : /// Result of joining any of the main tasks: the outer error means the task failed to
290 : /// complete, e.g. panicked; the inner one is the error produced by the task itself.
291 : type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
292 :
293 517 : async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
294 517 : // Prevent running multiple safekeepers on the same directory
295 517 : let lock_file_path = conf.workdir.join(PID_FILE_NAME);
296 517 : let lock_file =
297 517 : pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
298 517 : info!("claimed pid file at {lock_file_path:?}");
299 :
300 : // Ensure that the lock file is held even if the main thread of the process panics:
301 : // we need to release the lock file only when the current process is gone.
302 517 : std::mem::forget(lock_file);
303 :
304 517 : info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
305 517 : let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
306 0 : error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
307 0 : e
308 517 : })?;
309 :
310 517 : let pg_listener_tenant_only =
311 517 : if let Some(listen_pg_addr_tenant_only) = &conf.listen_pg_addr_tenant_only {
312 517 : info!(
313 517 : "starting safekeeper tenant scoped WAL service on {}",
314 517 : listen_pg_addr_tenant_only
315 517 : );
316 517 : let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
317 0 : error!(
318 0 : "failed to bind to address {}: {}",
319 0 : listen_pg_addr_tenant_only, e
320 0 : );
321 0 : e
322 517 : })?;
323 517 : Some(listener)
324 : } else {
325 0 : None
326 : };
327 :
328 517 : info!(
329 517 : "starting safekeeper HTTP service on {}",
330 517 : conf.listen_http_addr
331 517 : );
332 517 : let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
333 0 : error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
334 0 : e
335 517 : })?;
336 :
337 : // Register metrics collector for active timelines. It's important to do this
338 : // after daemonizing, otherwise process collector will be upset.
339 517 : let timeline_collector = safekeeper::metrics::TimelineCollector::new();
340 517 : metrics::register_internal(Box::new(timeline_collector))?;
341 :
342 517 : let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
343 517 :
344 517 : // Keep handles to the main tasks so we can die if any of them disappears.
345 517 : let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
346 517 : FuturesUnordered::new();
347 517 :
348 517 : // Start the wal backup launcher before loading timelines, as we'll notify it
349 517 : // through the channel about timelines which need offloading; not draining
350 517 : // the channel would cause a deadlock.
351 517 : let current_thread_rt = conf
352 517 : .current_thread_runtime
353 517 : .then(|| Handle::try_current().expect("no runtime in main"));
354 517 : let conf_ = conf.clone();
355 517 : let wal_backup_handle = current_thread_rt
356 517 : .as_ref()
357 517 : .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
358 517 : .spawn(wal_backup::wal_backup_launcher_task_main(
359 517 : conf_,
360 517 : wal_backup_launcher_rx,
361 517 : ))
362 517 : .map(|res| ("WAL backup launcher".to_owned(), res));
363 517 : tasks_handles.push(Box::pin(wal_backup_handle));
364 517 :
365 517 : // Load all timelines from disk to memory.
366 517 : GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
367 :
368 517 : let conf_ = conf.clone();
369 517 : // Run everything in current thread rt, if asked.
370 517 : if conf.current_thread_runtime {
371 0 : info!("running in current thread runtime");
372 517 : }
373 :
374 517 : let wal_service_handle = current_thread_rt
375 517 : .as_ref()
376 517 : .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
377 517 : .spawn(wal_service::task_main(
378 517 : conf_,
379 517 : pg_listener,
380 517 : Scope::SafekeeperData,
381 517 : ))
382 517 : // wrap with task name for error reporting
383 517 : .map(|res| ("WAL service main".to_owned(), res));
384 517 : tasks_handles.push(Box::pin(wal_service_handle));
385 :
386 517 : if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
387 517 : let conf_ = conf.clone();
388 517 : let wal_service_handle = current_thread_rt
389 517 : .as_ref()
390 517 : .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
391 517 : .spawn(wal_service::task_main(
392 517 : conf_,
393 517 : pg_listener_tenant_only,
394 517 : Scope::Tenant,
395 517 : ))
396 517 : // wrap with task name for error reporting
397 517 : .map(|res| ("WAL service tenant only main".to_owned(), res));
398 517 : tasks_handles.push(Box::pin(wal_service_handle));
399 517 : }
400 :
401 517 : let conf_ = conf.clone();
402 517 : let http_handle = current_thread_rt
403 517 : .as_ref()
404 517 : .unwrap_or_else(|| HTTP_RUNTIME.handle())
405 517 : .spawn(http::task_main(conf_, http_listener))
406 517 : .map(|res| ("HTTP service main".to_owned(), res));
407 517 : tasks_handles.push(Box::pin(http_handle));
408 517 :
409 517 : let conf_ = conf.clone();
410 517 : let broker_task_handle = current_thread_rt
411 517 : .as_ref()
412 517 : .unwrap_or_else(|| BROKER_RUNTIME.handle())
413 517 : .spawn(broker::task_main(conf_).instrument(info_span!("broker")))
414 517 : .map(|res| ("broker main".to_owned(), res));
415 517 : tasks_handles.push(Box::pin(broker_task_handle));
416 517 :
417 517 : let conf_ = conf.clone();
418 517 : let wal_remover_handle = current_thread_rt
419 517 : .as_ref()
420 517 : .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
421 517 : .spawn(remove_wal::task_main(conf_))
422 517 : .map(|res| ("WAL remover".to_owned(), res));
423 517 : tasks_handles.push(Box::pin(wal_remover_handle));
424 517 :
425 517 : set_build_info_metric(GIT_VERSION);
426 :
427 : // TODO: update tokio-stream, convert to a real async Stream with
428 : // SignalStream, map it to obtain the missing signal name, and combine the streams into a
429 : // single stream we can easily wait on.
430 517 : let mut sigquit_stream = signal(SignalKind::quit())?;
431 517 : let mut sigint_stream = signal(SignalKind::interrupt())?;
432 517 : let mut sigterm_stream = signal(SignalKind::terminate())?;
433 :
434 517 : tokio::select! {
435 0 : Some((task_name, res)) = tasks_handles.next()=> {
436 0 : error!("{} task failed: {:?}, exiting", task_name, res);
437 : std::process::exit(1);
438 : }
439 : // On any shutdown signal, log its receipt and exit. Additionally, handling
440 : // SIGQUIT prevents a coredump.
441 411 : _ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"),
442 0 : _ = sigint_stream.recv() => info!("received SIGINT, terminating"),
443 106 : _ = sigterm_stream.recv() => info!("received SIGTERM, terminating")
444 :
445 : };
446 517 : std::process::exit(0);
447 0 : }
448 :
449 : /// Determine safekeeper id.
450 517 : fn set_id(workdir: &Path, given_id: Option<NodeId>) -> Result<NodeId> {
451 517 : let id_file_path = workdir.join(ID_FILE_NAME);
452 517 :
453 517 : let my_id: NodeId;
454 517 : // If a file with the ID exists, read it in; otherwise use the one passed.
455 517 : match fs::read(&id_file_path) {
456 87 : Ok(id_serialized) => {
457 87 : my_id = NodeId(
458 87 : std::str::from_utf8(&id_serialized)
459 87 : .context("failed to parse safekeeper id")?
460 87 : .parse()
461 87 : .context("failed to parse safekeeper id")?,
462 : );
463 87 : if let Some(given_id) = given_id {
464 87 : if given_id != my_id {
465 0 : bail!(
466 0 : "safekeeper already initialized with id {}, can't set {}",
467 0 : my_id,
468 0 : given_id
469 0 : );
470 87 : }
471 0 : }
472 87 : info!("safekeeper ID {}", my_id);
473 : }
474 430 : Err(error) => match error.kind() {
475 : ErrorKind::NotFound => {
476 430 : my_id = if let Some(given_id) = given_id {
477 430 : given_id
478 : } else {
479 0 : bail!("safekeeper id is not specified");
480 : };
481 430 : let mut f = File::create(&id_file_path)
482 430 : .with_context(|| format!("Failed to create id file at {id_file_path:?}"))?;
483 430 : f.write_all(my_id.to_string().as_bytes())?;
484 430 : f.sync_all()?;
485 430 : info!("initialized safekeeper id {}", my_id);
486 : }
487 : _ => {
488 0 : return Err(error.into());
489 : }
490 : },
491 : }
492 517 : Ok(my_id)
493 517 : }
494 :
495 : // Parse RemoteStorageConfig from a TOML table.
496 36 : fn parse_remote_storage(storage_conf: &str) -> anyhow::Result<RemoteStorageConfig> {
497 36 : // The toml crate doesn't consider a plain inline table a valid document, so wrap it in a key to parse it
498 36 : let storage_conf_toml = format!("remote_storage = {storage_conf}");
499 36 : let parsed_toml = storage_conf_toml.parse::<Document>()?; // parse
500 36 : let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again
501 36 : RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| {
502 36 : // XXX: Don't print the original toml here; there might be some sensitive data in it
503 36 : parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config")
504 36 : })
505 36 : }
506 :
507 1 : #[test]
508 1 : fn verify_cli() {
509 1 : use clap::CommandFactory;
510 1 : Args::command().debug_assert()
511 1 : }