//!
//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
//! `ExecStart` option. It will handle all the `Neon` specifics during compute node
//! initialization:
//! - `compute_ctl` accepts the cluster (compute node) specification as a JSON file.
//! - Every start is a fresh start, so the data directory is removed and
//!   initialized again on each run.
//! - If `remote_extension_config` is provided, it will be used to fetch the list of
//!   extensions and download `shared_preload_libraries` from remote storage.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get the commit LSN.
//! - Get a `basebackup` from the pageserver, using the LSN returned in the previous step.
//! - Try to start `postgres` and wait until it is ready to accept connections.
//! - Check and alter/drop/create roles and databases.
//! - Hang waiting on the `postmaster` process to exit.
//!
//! Also `compute_ctl` spawns two separate service threads:
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
//!   into the shared `ComputeNode`;
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and
//!   last-activity requests.
//!
//! If the `AUTOSCALING` environment variable is set, `compute_ctl` will start the
//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
//! downscaling and requests immediate upscaling under resource pressure.
//!
//! Usage example:
//! ```sh
//! compute_ctl -D /var/db/postgres/compute \
//!             -C 'postgresql://cloud_admin@localhost/postgres' \
//!             -S /var/db/postgres/specs/current.json \
//!             -b /usr/local/bin/postgres \
//!             -r http://pg-ext-s3-gateway \
//! ```
//!
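//! If no spec file is available at startup, the spec can instead be fetched from the
//! control plane via `--compute-id` and `--control-plane-uri`. An illustrative
//! invocation follows; the compute id and control-plane URI are placeholders and
//! deployment-specific:
//! ```sh
//! compute_ctl -D /var/db/postgres/compute \
//!             -C 'postgresql://cloud_admin@localhost/postgres' \
//!             -i compute-example-1234 \
//!             -p http://control-plane.local \
//!             -b /usr/local/bin/postgres
//! ```
//!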
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::atomic::Ordering;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
use std::{thread, time::Duration};

use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info};
use url::Url;

use compute_api::responses::ComputeStatus;

use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;

// This is an arbitrary build tag. It is fine as a default / for testing purposes
// when the `BUILD_TAG` environment variable is not set.
const BUILD_TAG_DEFAULT: &str = "latest";

fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

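    // Handle termination signals in a dedicated thread: forward them to the child
    // processes and exit (see `handle_exit_signal` below).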
    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
        for sig in signals.forever() {
            handle_exit_signal(sig);
        }
    });

    let build_tag = option_env!("BUILD_TAG")
        .unwrap_or(BUILD_TAG_DEFAULT)
        .to_string();
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
    let pgbin_default = String::from("postgres");
    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);

    let ext_remote_storage = matches
        .get_one::<String>("remote-ext-config")
        // Compatibility hack: if the control plane specified any remote-ext-config,
        // use the default value for the extension storage proxy gateway.
        // Remove this once the control plane is updated to pass the gateway URL.
        .map(|conf| {
            if conf.starts_with("http") {
                conf.trim_end_matches('/')
            } else {
                "http://pg-ext-s3-gateway"
            }
        });

    let http_port = *matches
        .get_one::<u16>("http-port")
        .expect("http-port is required");
    let pgdata = matches
        .get_one::<String>("pgdata")
        .expect("PGDATA path is required");
    let connstr = matches
        .get_one::<String>("connstr")
        .expect("Postgres connection string is required");
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");

    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
    //
    // This is used to propagate the context for the 'start_compute' operation
    // from the neon control plane. This allows linking together the wider
    // 'start_compute' operation that creates the compute container, with the
    // startup actions here within the container.
    //
    // There is no standard for passing context in env variables, but a lot of
    // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
    // https://github.com/open-telemetry/opentelemetry-specification/issues/740
    //
    // Switch to the startup context here, and exit it once the startup has
    // completed and Postgres is up and running.
    //
    // If this pod is pre-created without binding it to any particular endpoint
    // yet, this isn't the right place to enter the startup context. In that
    // case, the control plane should pass the tracing context as part of the
    // /configure API call.
    //
    // NOTE: This is supposed to only cover the *startup* actions. Once
    // postgres is configured and up-and-running, we exit this span. Any other
    // actions that are performed on incoming HTTP requests, for example, are
    // performed in separate spans.
    //
    // XXX: If the pod is restarted, we perform the startup actions in the same
    // context as the original startup actions, which probably doesn't make
    // sense.
    let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
    if let Ok(val) = std::env::var("TRACEPARENT") {
        startup_tracing_carrier.insert("traceparent".to_string(), val);
    }
    if let Ok(val) = std::env::var("TRACESTATE") {
        startup_tracing_carrier.insert("tracestate".to_string(), val);
    }
    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
        use opentelemetry::sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
            .extract(&startup_tracing_carrier)
            .attach();
        info!("startup tracing context attached");
        Some(guard)
    } else {
        None
    };

    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

    let spec;
    let mut live_config_allowed = false;
    match spec_json {
        // First, try to get the cluster spec from the CLI argument.
        Some(json) => {
            info!("got spec from cli argument {}", json);
            spec = Some(serde_json::from_str(json)?);
        }
        None => {
            // Second, try to read it from a file, if a path is provided.
            if let Some(sp) = spec_path {
                let path = Path::new(sp);
                let file = File::open(path)?;
                spec = Some(serde_json::from_reader(file)?);
                live_config_allowed = true;
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
                    live_config_allowed = true;
                    spec = match get_spec_from_control_plane(cp_base, id) {
                        Ok(s) => s,
                        Err(e) => {
                            error!("cannot get response from control plane: {}", e);
                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
                        }
                    };
                } else {
                    panic!("must specify both --control-plane-uri and --compute-id or none");
                }
            } else {
                panic!(
                    "compute spec should be provided in one of the following ways: \
                    --spec OR --spec-path OR --control-plane-uri and --compute-id"
                );
            }
        }
    };

    let mut new_state = ComputeState::new();
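    // Track whether a spec was provided at startup. If not (e.g. this is a pooled
    // compute that has not been bound to an endpoint yet), we prewarm Postgres
    // and wait for the spec to arrive via the HTTP API.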
    let spec_set;

    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
        info!("new pspec.spec: {:?}", pspec.spec);
        new_state.pspec = Some(pspec);
        spec_set = true;
    } else {
        spec_set = false;
    }
    let compute_node = ComputeNode {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
    };
    let compute = Arc::new(compute_node);

    // If this is a pooled VM, prewarm before starting the HTTP server and becoming
    // available for binding. Prewarming helps Postgres start quicker later,
    // because QEMU will already have its memory allocated from the host, and
    // the necessary binaries will already be cached.
    if !spec_set {
        compute.prewarm_postgres()?;
    }

    // Launch the HTTP service first, so that we can serve control-plane
    // requests while configuration is still in progress.
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

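    // The extension server reuses the HTTP API port.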
    let extension_server_port: u16 = http_port;

    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");

        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();

            if state.status == ComputeStatus::ConfigurationPending {
                info!("got spec, continue configuration");
                // Spec is already set by the http server handler.
                break;
            }
        }
    }

    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();

    // Record for how long we slept waiting for the spec.
    state.metrics.wait_for_spec_ms = Utc::now()
        .signed_duration_since(state.start_time)
        .to_std()
        .unwrap()
        .as_millis() as u64;
    // Reset the start time to the actual start of the configuration, so that
    // the total startup time is measured properly at the end.
    state.start_time = Utc::now();

    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();

    info!(
        "running compute with features: {:?}",
        state.pspec.as_ref().unwrap().spec.features
    );
    drop(state);

    // Launch the remaining service threads.
    let _monitor_handle = launch_monitor(&compute);
    let _configurator_handle = launch_configurator(&compute);

    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:#}", err);
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
            // Notify others that Postgres failed to start. When configuring an
            // empty compute, the API handler is likely still waiting for a compute
            // state change. This tells it that the compute is now in the Failed
            // state, so the control plane learns about the failure earlier and
            // records a proper error instead of a timeout.
            compute.state_changed.notify_all();
            drop(state); // unlock
            delay_exit = true;
            None
        }
    };

    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
    // because it requires cgroups.
    cfg_if::cfg_if! {
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
            let vm_monitor_addr = matches
                .get_one::<String>("vm-monitor-addr")
                .expect("--vm-monitor-addr should always be set because it has a default arg");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and,
            // if you start a task in it, it won't be dropped. However, make it
            // in the outermost scope just to be safe.
            let rt = if env::var_os("AUTOSCALING").is_some() {
                Some(
                    tokio::runtime::Builder::new_multi_thread()
                        .worker_threads(4)
                        .enable_all()
                        .build()
                        .expect("failed to create tokio runtime for monitor")
                )
            } else {
                None
            };

            // This token is used internally by the monitor to clean up all threads
            let token = CancellationToken::new();

            let vm_monitor = &rt.as_ref().map(|rt| {
                rt.spawn(vm_monitor::start(
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
                        addr: vm_monitor_addr.clone(),
                    })),
                    token.clone(),
                ))
            });
        }
    }

    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    if let Some((mut pg, logs_handle)) = pg {
        // Startup is finished, exit the startup tracing span
        drop(startup_context_guard);

        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
        PG_PID.store(0, Ordering::SeqCst);

        // Process has exited, so we can join the logs thread.
        let _ = logs_handle
            .join()
            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));

        info!("Postgres exited with code {}, shutting down", ecode);
        exit_code = ecode.code()
    }

    // Terminate the vm_monitor so it releases the file watcher on
    // /sys/fs/cgroup/neon-postgres.
    // Note: the vm-monitor only runs on linux because it requires cgroups.
    cfg_if::cfg_if! {
        if #[cfg(target_os = "linux")] {
            if let Some(handle) = vm_monitor {
                // Kills all threads spawned by the monitor
                token.cancel();
                // Kills the actual task running the monitor
                handle.abort();

                // If handle is some, rt must have been used to produce it, and
                // hence is also some
                rt.unwrap().shutdown_timeout(Duration::from_secs(2));
            }
        }
    }

    // Maybe sync safekeepers again, to speed up the next startup.
    let compute_state = compute.state.lock().unwrap().clone();
    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
    if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
        info!("syncing safekeepers on shutdown");
        let storage_auth_token = pspec.storage_auth_token.clone();
        let lsn = compute.sync_safekeepers(storage_auth_token)?;
        info!("synced safekeepers at lsn {lsn}");
    }

    if let Err(err) = compute.check_for_core_dumps() {
        error!("error while checking for core dumps: {err:?}");
    }

    // If launch failed, keep serving HTTP requests for a while, so the cloud
    // control plane can get the actual error.
    if delay_exit {
        info!("giving control plane 30s to collect the error before shutdown");
        thread::sleep(Duration::from_secs(30));
    }

    // Shut down the trace pipeline gracefully, so that it has a chance to send any
    // pending traces before we exit. Shutting down the OTEL tracing provider may
    // hang for quite some time, see, for example:
    // - https://github.com/open-telemetry/opentelemetry-rust/issues/868
    // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636
    //
    // Yet, we want computes to shut down fast enough, as we may need a new one
    // for the same timeline ASAP. So wait no longer than 2s for the shutdown to
    // complete, then just error out and exit the main thread.
    info!("shutting down tracing");
    let (sender, receiver) = mpsc::channel();
    let _ = thread::spawn(move || {
        tracing_utils::shutdown_tracing();
        sender.send(()).ok()
    });
    let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000));
    if shutdown_res.is_err() {
        error!("timed out while shutting down tracing, exiting anyway");
    }

    info!("shutting down");
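    // Use Postgres's exit code if it exited on its own; otherwise (e.g. startup
    // failed, or it was terminated by a signal) exit with 1.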
    exit(exit_code.unwrap_or(1))
}

fn cli() -> clap::Command {
    // Env variable is set by `cargo`
    let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
    clap::Command::new("compute_ctl")
        .version(version)
        .arg(
            Arg::new("http-port")
                .long("http-port")
                .value_name("HTTP_PORT")
                .default_value("3080")
                .value_parser(clap::value_parser!(u16))
                .required(false),
        )
        .arg(
            Arg::new("connstr")
                .short('C')
                .long("connstr")
                .value_name("DATABASE_URL")
                .required(true),
        )
        .arg(
            Arg::new("pgdata")
                .short('D')
                .long("pgdata")
                .value_name("DATADIR")
                .required(true),
        )
        .arg(
            Arg::new("pgbin")
                .short('b')
                .long("pgbin")
                .default_value("postgres")
                .value_name("POSTGRES_PATH"),
        )
        .arg(
            Arg::new("spec")
                .short('s')
                .long("spec")
                .value_name("SPEC_JSON"),
        )
        .arg(
            Arg::new("spec-path")
                .short('S')
                .long("spec-path")
                .value_name("SPEC_PATH"),
        )
        .arg(
            Arg::new("compute-id")
                .short('i')
                .long("compute-id")
                .value_name("COMPUTE_ID"),
        )
        .arg(
            Arg::new("control-plane-uri")
                .short('p')
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
        .arg(
            Arg::new("remote-ext-config")
                .short('r')
                .long("remote-ext-config")
                .value_name("REMOTE_EXT_CONFIG"),
        )
        // TODO(fprasx): we currently have default arguments because the cloud PR
        // to pass them in hasn't been merged yet. We should get rid of them once
        // the PR is merged.
        .arg(
            Arg::new("vm-monitor-addr")
                .long("vm-monitor-addr")
                .default_value("0.0.0.0:10301")
                .value_name("VM_MONITOR_ADDR"),
        )
        .arg(
            Arg::new("cgroup")
                .long("cgroup")
                .default_value("neon-postgres")
                .value_name("CGROUP"),
        )
        .arg(
            Arg::new("filecache-connstr")
                .long("filecache-connstr")
                .default_value(
                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
                )
                .value_name("FILECACHE_CONNSTR"),
        )
}

/// When `compute_ctl` is killed, also send a termination signal to `sync-safekeepers`
/// to prevent it from leaking. TODO: it would be better to convert `compute_ctl` to
/// async and simply wait for termination, which would then be easy.
fn handle_exit_signal(sig: i32) {
    info!("received {sig} termination signal");
    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
    if ss_pid != 0 {
        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
        kill(ss_pid, Signal::SIGTERM).ok();
    }
    let pg_pid = PG_PID.load(Ordering::SeqCst);
    if pg_pid != 0 {
        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
        kill(pg_pid, Signal::SIGTERM).ok();
    }
    exit(1);
}

#[test]
fn verify_cli() {
    cli().debug_assert()
}