Line data Source code
1 : use crate::{
2 : background_process,
3 : local_env::{LocalEnv, NeonStorageControllerConf},
4 : };
5 : use camino::{Utf8Path, Utf8PathBuf};
6 : use pageserver_api::{
7 : controller_api::{
8 : NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse,
9 : TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
10 : },
11 : models::{
12 : TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
13 : },
14 : shard::{ShardStripeSize, TenantShardId},
15 : };
16 : use pageserver_client::mgmt_api::ResponseErrorMessageExt;
17 : use postgres_backend::AuthType;
18 : use reqwest::Method;
19 : use serde::{de::DeserializeOwned, Deserialize, Serialize};
20 : use std::{fs, str::FromStr, time::Duration};
21 : use tokio::process::Command;
22 : use tracing::instrument;
23 : use url::Url;
24 : use utils::{
25 : auth::{encode_from_key_file, Claims, Scope},
26 : id::{NodeId, TenantId},
27 : };
28 :
/// Handle used to start, stop and send HTTP requests to the local `storage_controller` process
/// and the postgres instance it uses for persistence.
///
/// All fields are filled in by `StorageController::from_env`.
29 : pub struct StorageController {
/// Clone of the environment this handle was built from.
30 : env: LocalEnv,
/// `host:port` taken from `env.control_plane_api`; passed to the binary as `-l` in `start`.
31 : listen: String,
/// `<base_data_dir>/attachments.json`; passed to the binary as `-p` in `start`.
32 : path: Utf8PathBuf,
/// Raw private key bytes, present only when the first pageserver uses `AuthType::NeonJWT`.
/// Used to mint JWT tokens in `start` and `dispatch`.
33 : private_key: Option<Vec<u8>>,
/// Public key text, present only when the first pageserver uses `AuthType::NeonJWT`.
/// Passed to the binary as `--public-key` in `start`.
34 : public_key: Option<String>,
/// Port of the captive postgres: the control plane API port plus one.
35 : postgres_port: u16,
/// HTTP client used by `dispatch`.
36 : client: reqwest::Client,
/// Clone of `env.storage_controller`.
37 : config: NeonStorageControllerConf,
38 : }
39 :
/// Process name given to `background_process::start_process` and `stop_process` for the
/// storage controller binary.
40 : const COMMAND: &str = "storage_controller";
41 :
/// Postgres major version tried first by `get_pg_dir` when locating the postgres install.
42 : const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
43 :
/// JSON body that `StorageController::attach_hook` posts to `debug/v1/attach-hook`.
44 0 : #[derive(Serialize, Deserialize)]
45 : pub struct AttachHookRequest {
/// Tenant shard the request refers to.
46 : pub tenant_shard_id: TenantShardId,
/// Target node; `attach_hook` always sends `Some(pageserver_id)`.
/// NOTE(review): presumably `None` means "detach" — confirm against the server side.
47 : pub node_id: Option<NodeId>,
/// `attach_hook` always sends `None` here.
48 : pub generation_override: Option<i32>,
49 : }
50 :
/// JSON response of `debug/v1/attach-hook`, as decoded by `StorageController::attach_hook`.
51 0 : #[derive(Serialize, Deserialize)]
52 : pub struct AttachHookResponse {
/// Returned unchanged by `attach_hook`.
/// NOTE(review): presumably the generation assigned to the attachment — confirm.
53 : pub gen: Option<u32>,
54 : }
55 :
/// JSON body that `StorageController::inspect` posts to `debug/v1/inspect`.
56 0 : #[derive(Serialize, Deserialize)]
57 : pub struct InspectRequest {
/// Tenant shard to inspect.
58 : pub tenant_shard_id: TenantShardId,
59 : }
60 :
/// JSON response of `debug/v1/inspect`, as decoded by `StorageController::inspect`.
61 0 : #[derive(Serialize, Deserialize)]
62 : pub struct InspectResponse {
/// Returned unchanged by `inspect`.
/// NOTE(review): presumably `(generation, node)` of the current attachment — confirm.
63 : pub attachment: Option<(u32, NodeId)>,
64 : }
65 :
66 : impl StorageController {
/// Build a controller handle from the local environment.
///
/// Derives the listen address and the captive postgres port from `env.control_plane_api`, and
/// loads JWT key material when the first configured pageserver uses `AuthType::NeonJWT`.
///
/// # Panics
///
/// Panics if `base_data_dir` is not valid UTF-8, if `control_plane_api` is unset or has no
/// host or port, if no pageserver is configured, if the key files cannot be read, or if the
/// HTTP client cannot be constructed.
67 0 : pub fn from_env(env: &LocalEnv) -> Self {
// Path later passed to the binary as the `-p` argument in `start`.
68 0 : let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
69 0 : .unwrap()
70 0 : .join("attachments.json");
71 0 :
72 0 : // Makes no sense to construct this if pageservers aren't going to use it: assume
73 0 : // pageservers have control plane API set
74 0 : let listen_url = env.control_plane_api.clone().unwrap();
75 0 :
// Only host and port of the URL are kept; any path component is dropped.
76 0 : let listen = format!(
77 0 : "{}:{}",
78 0 : listen_url.host_str().unwrap(),
79 0 : listen_url.port().unwrap()
80 0 : );
81 0 :
82 0 : // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
83 0 : // port, for use by our captive postgres.
// NOTE(review): `+ 1` on a `u16` overflows if the API port is 65535 — confirm this cannot occur.
84 0 : let postgres_port = listen_url
85 0 : .port()
86 0 : .expect("Control plane API setting should always have a port")
87 0 : + 1;
88 0 :
89 0 : // Assume all pageservers have symmetric auth configuration: this service
90 0 : // expects to use one JWT token to talk to all of them.
91 0 : let ps_conf = env
92 0 : .pageservers
93 0 : .first()
94 0 : .expect("Config is validated to contain at least one pageserver");
// Only the first pageserver's HTTP auth type decides whether keys are loaded.
95 0 : let (private_key, public_key) = match ps_conf.http_auth_type {
96 0 : AuthType::Trust => (None, None),
97 : AuthType::NeonJWT => {
98 0 : let private_key_path = env.get_private_key_path();
99 0 : let private_key = fs::read(private_key_path).expect("failed to read private key");
100 0 :
101 0 : // If pageserver auth is enabled, this implicitly enables auth for this service,
102 0 : // using the same credentials.
// The public key location is fixed to `<base_data_dir>/auth_public_key.pem`.
103 0 : let public_key_path =
104 0 : camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
105 0 : .unwrap();
106 :
107 : // This service takes keys as a string rather than as a path to a file/dir: read the key into memory.
108 0 : let public_key = if std::fs::metadata(&public_key_path)
109 0 : .expect("Can't stat public key")
110 0 : .is_dir()
111 : {
112 : // Our config may specify a directory: this is for the pageserver's ability to handle multiple
113 : // keys. We only use one key at a time, so, arbitrarily load the first one in the directory.
// "First" is whatever entry `read_dir` yields first; no sorting is applied.
114 0 : let mut dir =
115 0 : std::fs::read_dir(&public_key_path).expect("Can't readdir public key path");
116 0 : let dent = dir
117 0 : .next()
118 0 : .expect("Empty key dir")
119 0 : .expect("Error reading key dir");
120 0 :
121 0 : std::fs::read_to_string(dent.path()).expect("Can't read public key")
122 : } else {
123 0 : std::fs::read_to_string(&public_key_path).expect("Can't read public key")
124 : };
125 0 : (Some(private_key), Some(public_key))
126 : }
127 : };
128 :
129 0 : Self {
130 0 : env: env.clone(),
131 0 : path,
132 0 : listen,
133 0 : private_key,
134 0 : public_key,
135 0 : postgres_port,
136 0 : client: reqwest::ClientBuilder::new()
137 0 : .build()
138 0 : .expect("Failed to construct http client"),
139 0 : config: env.storage_controller.clone(),
140 0 : }
141 0 : }
142 :
/// PID file of the storage controller process: `<base_data_dir>/storage_controller.pid`.
///
/// # Panics
///
/// Panics if the resulting path is not valid UTF-8.
143 0 : fn pid_file(&self) -> Utf8PathBuf {
144 0 : Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
145 0 : .expect("non-Unicode path")
146 0 : }
147 :
148 : /// PIDFile for the postgres instance used to store storage controller state
/// (`<base_data_dir>/storage_controller_postgres.pid`).
///
/// # Panics
///
/// Panics if the resulting path is not valid UTF-8.
149 0 : fn postgres_pid_file(&self) -> Utf8PathBuf {
150 0 : Utf8PathBuf::from_path_buf(
151 0 : self.env
152 0 : .base_data_dir
153 0 : .join("storage_controller_postgres.pid"),
154 0 : )
155 0 : .expect("non-Unicode path")
156 0 : }
157 :
158 : /// Find the directory containing postgres subdirectories, such as `bin` and `lib`
159 : ///
160 : /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
161 : /// to other versions if that one isn't found. Some automated tests create circumstances
162 : /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
///
/// `dir_name` is the subdirectory to look for; the first existing candidate is returned.
///
/// # Errors
///
/// Fails if `LocalEnv::pg_dir` fails, if the existence check itself errors, or if none of the
/// candidate versions has the directory.
///
/// # Panics
///
/// Panics if a candidate path is not valid UTF-8.
163 0 : async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
// Candidates in priority order: the preferred version first, then older ones.
164 0 : let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
165 :
166 0 : for v in prefer_versions {
167 0 : let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
168 0 : if tokio::fs::try_exists(&path).await? {
169 0 : return Ok(path);
170 0 : }
171 : }
172 :
173 : // Fall through
174 0 : anyhow::bail!(
175 0 : "Postgres directory '{}' not found in {}",
176 0 : dir_name,
177 0 : self.env.pg_distrib_dir.display(),
178 0 : );
179 0 : }
180 :
/// Locate the postgres `bin` directory; see `get_pg_dir` for the version fallback and errors.
181 0 : pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
182 0 : self.get_pg_dir("bin").await
183 0 : }
184 :
/// Locate the postgres `lib` directory; see `get_pg_dir` for the version fallback and errors.
185 0 : pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> {
186 0 : self.get_pg_dir("lib").await
187 0 : }
188 :
189 : /// Readiness check for our postgres process
///
/// Runs `<pg_bin_dir>/pg_isready -h localhost -p <postgres_port>` and returns `Ok(true)` when
/// the command exits successfully, `Ok(false)` otherwise.
///
/// # Errors
///
/// Fails if the command cannot be spawned or waited on.
190 0 : async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
191 0 : let bin_path = pg_bin_dir.join("pg_isready");
192 0 : let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
193 0 : let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
194 :
195 0 : Ok(exitcode.success())
196 0 : }
197 :
198 : /// Create our database if it doesn't exist, and run migrations.
199 : ///
200 : /// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
201 : /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
202 : /// who just want to run `cargo neon_local` without knowing about diesel.
203 : ///
204 : /// Returns the database url
///
/// NOTE(review): the body below only invokes `createdb`; no migration step is visible here.
/// Presumably migrations are applied elsewhere (e.g. by the controller binary) — confirm and
/// adjust the summary line if so.
///
/// # Errors
///
/// Fails if the postgres `bin` directory cannot be located, or if `createdb` exits with a
/// failure whose stderr does not contain "already exists".
///
/// # Panics
///
/// Panics if `createdb` cannot be spawned or if its stderr is not valid UTF-8.
205 0 : pub async fn setup_database(&self) -> anyhow::Result<String> {
206 0 : const DB_NAME: &str = "storage_controller";
207 0 : let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
208 :
209 0 : let pg_bin_dir = self.get_pg_bin_dir().await?;
210 0 : let createdb_path = pg_bin_dir.join("createdb");
211 0 : let output = Command::new(&createdb_path)
212 0 : .args([
213 0 : "-h",
214 0 : "localhost",
215 0 : "-p",
216 0 : &format!("{}", self.postgres_port),
217 0 : DB_NAME,
218 0 : ])
219 0 : .output()
220 0 : .await
221 0 : .expect("Failed to spawn createdb");
222 0 :
// A failure caused by the database already existing is treated as success; this is detected
// by matching on the stderr text.
223 0 : if !output.status.success() {
224 0 : let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb");
225 0 : if stderr.contains("already exists") {
226 0 : tracing::info!("Database {DB_NAME} already exists");
227 : } else {
228 0 : anyhow::bail!("createdb failed with status {}: {stderr}", output.status);
229 : }
230 0 : }
231 :
232 0 : Ok(database_url)
233 0 : }
234 :
/// Start the captive postgres instance, then the storage controller process.
///
/// Steps, in order:
/// 1. If `<base_data_dir>/storage_controller_db` does not exist, run `initdb` and write a
///    minimal `postgresql.conf`.
/// 2. Start postgres through `pg_ctl`, using `pg_isready` as the readiness probe.
/// 3. Call `setup_database` to make sure the database exists.
/// 4. Launch the controller binary, using `ready()` as the readiness probe.
///
/// `retry_timeout` is forwarded to both `background_process::start_process` calls.
///
/// # Errors
///
/// Fails if the postgres directories cannot be located, if `initdb` exits unsuccessfully, if
/// the config file cannot be written, or if either `start_process` call or `setup_database`
/// fails.
///
/// # Panics
///
/// Panics if `base_data_dir` is not valid UTF-8, if `initdb` cannot be spawned, or if the JWT
/// token cannot be generated.
235 0 : pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
236 0 : // Start a vanilla Postgres process used by the storage controller for persistence.
237 0 : let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
238 0 : .unwrap()
239 0 : .join("storage_controller_db");
240 0 : let pg_bin_dir = self.get_pg_bin_dir().await?;
241 0 : let pg_lib_dir = self.get_pg_lib_dir().await?;
242 0 : let pg_log_path = pg_data_path.join("postgres.log");
243 0 :
// First start only: the existence of the data directory is the "already initialised" marker.
244 0 : if !tokio::fs::try_exists(&pg_data_path).await? {
245 : // Initialize empty database
246 0 : let initdb_path = pg_bin_dir.join("initdb");
// Both LD_LIBRARY_PATH and DYLD_LIBRARY_PATH are set to the postgres lib directory.
247 0 : let mut child = Command::new(&initdb_path)
248 0 : .envs(vec![
249 0 : ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
250 0 : ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
251 0 : ])
252 0 : .args(["-D", pg_data_path.as_ref()])
253 0 : .spawn()
254 0 : .expect("Failed to spawn initdb");
255 0 : let status = child.wait().await?;
256 0 : if !status.success() {
257 0 : anyhow::bail!("initdb failed with status {status}");
258 0 : }
259 0 :
260 0 : // Write a minimal config file:
261 0 : // - Specify the port, since this is chosen dynamically
262 0 : // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
263 0 : // the storage controller we don't want a slow local disk to interfere with that.
// This overwrites the `postgresql.conf` in the freshly initialised data directory.
264 0 : tokio::fs::write(
265 0 : &pg_data_path.join("postgresql.conf"),
266 0 : format!("port = {}\nfsync=off\n", self.postgres_port),
267 0 : )
268 0 : .await?;
269 0 : };
270 :
271 0 : println!("Starting storage controller database...");
// pg_ctl arguments: `-w` wait for the operation, `-D` data directory, `-l` server log file.
272 0 : let db_start_args = [
273 0 : "-w",
274 0 : "-D",
275 0 : pg_data_path.as_ref(),
276 0 : "-l",
277 0 : pg_log_path.as_ref(),
278 0 : "start",
279 0 : ];
280 0 :
281 0 : background_process::start_process(
282 0 : "storage_controller_db",
283 0 : &self.env.base_data_dir,
284 0 : pg_bin_dir.join("pg_ctl").as_std_path(),
285 0 : db_start_args,
286 0 : vec![
287 0 : ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
288 0 : ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
289 0 : ],
290 0 : background_process::InitialPidFile::Create(self.postgres_pid_file()),
291 0 : retry_timeout,
292 0 : || self.pg_isready(&pg_bin_dir),
293 0 : )
294 0 : .await?;
295 :
296 : // Run migrations on every startup, in case something changed.
// NOTE(review): `setup_database` as written only ensures the database exists — confirm where
// migrations actually run.
297 0 : let database_url = self.setup_database().await?;
298 :
// Base arguments for the controller binary; optional flags are appended below.
299 0 : let mut args = vec![
300 0 : "-l",
301 0 : &self.listen,
302 0 : "-p",
303 0 : self.path.as_ref(),
304 0 : "--dev",
305 0 : "--database-url",
306 0 : &database_url,
307 0 : "--max-unavailable-interval",
308 0 : &humantime::Duration::from(self.config.max_unavailable).to_string(),
309 0 : ]
310 0 : .into_iter()
311 0 : .map(|s| s.to_string())
312 0 : .collect::<Vec<_>>();
// With auth enabled, hand the controller a token carrying the `PageServerApi` scope.
313 0 : if let Some(private_key) = &self.private_key {
314 0 : let claims = Claims::new(None, Scope::PageServerApi);
315 0 : let jwt_token =
316 0 : encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
317 0 : args.push(format!("--jwt-token={jwt_token}"));
318 0 : }
319 :
// The key text is wrapped in literal double-quote characters inside the argument value.
// NOTE(review): presumably the controller strips these quotes when parsing — confirm.
320 0 : if let Some(public_key) = &self.public_key {
321 0 : args.push(format!("--public-key=\"{public_key}\""));
322 0 : }
323 :
324 0 : if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
325 0 : args.push(format!(
326 0 : "--compute-hook-url={control_plane_compute_hook_api}"
327 0 : ));
328 0 : }
329 :
330 0 : if let Some(split_threshold) = self.config.split_threshold.as_ref() {
331 0 : args.push(format!("--split-threshold={split_threshold}"))
332 0 : }
333 :
334 0 : args.push(format!(
335 0 : "--neon-local-repo-dir={}",
336 0 : self.env.base_data_dir.display()
337 0 : ));
338 0 :
339 0 : background_process::start_process(
340 0 : COMMAND,
341 0 : &self.env.base_data_dir,
342 0 : &self.env.storage_controller_bin(),
343 0 : args,
344 0 : vec![
345 0 : ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
346 0 : ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
347 0 : ],
348 0 : background_process::InitialPidFile::Create(self.pid_file()),
349 0 : retry_timeout,
// Readiness probe: any error from `ready()` is reported as "not ready yet", never as a
// failure of the probe itself.
350 0 : || async {
351 0 : match self.ready().await {
352 0 : Ok(_) => Ok(true),
353 0 : Err(_) => Ok(false),
354 0 : }
355 0 : },
356 0 : )
357 0 : .await?;
358 :
359 0 : Ok(())
360 0 : }
361 :
/// Stop the storage controller process, then its postgres instance.
///
/// `immediate` is forwarded to `background_process::stop_process` for the controller; postgres
/// is stopped with a plain `pg_ctl stop`.
///
/// # Errors
///
/// Fails if stopping the controller fails, if the postgres `bin` directory cannot be located,
/// if `pg_ctl` cannot be spawned, or if `pg_ctl stop` fails while `pg_ctl status` does not
/// report "not running".
362 0 : pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
363 0 : background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
364 :
365 0 : let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
366 0 : let pg_bin_dir = self.get_pg_bin_dir().await?;
367 :
368 0 : println!("Stopping storage controller database...");
// `to_string_lossy` is used here, so a non-UTF-8 data path is passed with replacement characters.
369 0 : let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
370 0 : let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
371 0 : .args(pg_stop_args)
372 0 : .spawn()?
373 0 : .wait()
374 0 : .await?;
// On failure, ask `pg_ctl status` whether postgres was simply not running.
375 0 : if !stop_status.success() {
376 0 : let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
377 0 : let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
378 0 : .args(pg_status_args)
379 0 : .spawn()?
380 0 : .wait()
381 0 : .await?;
382 :
383 : // pg_ctl status returns this exit code if postgres is not running: in this case it is
384 : // fine that stop failed. Otherwise it is an error that stop failed.
385 : const PG_STATUS_NOT_RUNNING: i32 = 3;
386 0 : if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
387 0 : println!("Storage controller database is already stopped");
388 0 : return Ok(());
389 : } else {
390 0 : anyhow::bail!("Failed to stop storage controller database: {stop_status}")
391 : }
392 0 : }
393 0 :
394 0 : Ok(())
395 0 : }
396 :
/// Pick the JWT claims required for a request path, based on its first segment.
///
/// `path` is expected without a leading slash (e.g. `control/v1/node`); the text before the
/// first `/` (or the whole string if there is none) selects the result:
/// - `status`, `ready`: `Ok(None)`, no token is attached
/// - `control`, `debug`: claims with `Scope::Admin`
/// - `v1`: claims with `Scope::PageServerApi`
///
/// # Errors
///
/// Returns an error for any other first segment.
397 0 : fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
398 0 : let category = match path.find('/') {
399 0 : Some(idx) => &path[..idx],
400 0 : None => path,
401 : };
402 :
403 0 : match category {
404 0 : "status" | "ready" => Ok(None),
405 0 : "control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))),
406 0 : "v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))),
407 0 : _ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)),
408 : }
409 0 : }
410 :
411 : /// Simple HTTP request wrapper for calling into storage controller
///
/// Sends `method` to `http://<host>:<port>/<path>`, where host and port come from
/// `env.control_plane_api`. `body`, when present, is sent as JSON. The response body is decoded
/// as JSON into `RS`.
///
/// When a private key is loaded, a `Bearer` token is attached with the claims chosen by
/// `get_claims_for_path`.
///
/// # Errors
///
/// Fails if the claims cannot be determined or encoded, if sending fails, if
/// `error_from_body` reports an error, or if the response body cannot be decoded.
///
/// # Panics
///
/// Panics if `control_plane_api` is unset or lacks a host or port, or if the assembled URL
/// does not parse.
412 0 : async fn dispatch<RQ, RS>(
413 0 : &self,
414 0 : method: reqwest::Method,
415 0 : path: String,
416 0 : body: Option<RQ>,
417 0 : ) -> anyhow::Result<RS>
418 0 : where
419 0 : RQ: Serialize + Sized,
420 0 : RS: DeserializeOwned + Sized,
421 0 : {
422 0 : // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
423 0 : // for general purpose API access.
// The "strip" is done by rebuilding the URL from host and port only, then appending `path`.
424 0 : let listen_url = self.env.control_plane_api.clone().unwrap();
425 0 : let url = Url::from_str(&format!(
426 0 : "http://{}:{}/{path}",
427 0 : listen_url.host_str().unwrap(),
428 0 : listen_url.port().unwrap()
429 0 : ))
430 0 : .unwrap();
431 0 :
432 0 : let mut builder = self.client.request(method, url);
433 0 : if let Some(body) = body {
434 0 : builder = builder.json(&body)
435 0 : }
// Auth is only attempted when a private key was loaded in `from_env`.
// NOTE(review): the two `println!` calls below write to stdout on every authenticated
// request; they look like leftover debugging output — confirm whether they are intended.
436 0 : if let Some(private_key) = &self.private_key {
437 0 : println!("Getting claims for path {}", path);
438 0 : if let Some(required_claims) = Self::get_claims_for_path(&path)? {
439 0 : println!("Got claims {:?} for path {}", required_claims, path);
440 0 : let jwt_token = encode_from_key_file(&required_claims, private_key)?;
441 0 : builder = builder.header(
442 0 : reqwest::header::AUTHORIZATION,
443 0 : format!("Bearer {jwt_token}"),
444 0 : );
445 0 : }
446 0 : }
447 :
448 0 : let response = builder.send().await?;
// NOTE(review): `error_from_body` comes from `ResponseErrorMessageExt`; presumably it turns
// non-success HTTP statuses into errors carrying the body text — confirm.
449 0 : let response = response.error_from_body().await?;
450 :
451 0 : Ok(response
452 0 : .json()
453 0 : .await
454 0 : .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
455 0 : }
456 :
457 : /// Call into the attach_hook API, for use before handing out attachments to pageservers
///
/// Posts an `AttachHookRequest` to `debug/v1/attach-hook` with `node_id` set to
/// `pageserver_id` and no generation override, and returns the `gen` field of the response.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
458 0 : #[instrument(skip(self))]
459 : pub async fn attach_hook(
460 : &self,
461 : tenant_shard_id: TenantShardId,
462 : pageserver_id: NodeId,
463 : ) -> anyhow::Result<Option<u32>> {
464 : let request = AttachHookRequest {
465 : tenant_shard_id,
466 : node_id: Some(pageserver_id),
467 : generation_override: None,
468 : };
469 :
470 : let response = self
471 : .dispatch::<_, AttachHookResponse>(
472 : Method::POST,
473 : "debug/v1/attach-hook".to_string(),
474 : Some(request),
475 : )
476 : .await?;
477 :
478 : Ok(response.gen)
479 : }
480 :
/// Post an `InspectRequest` for `tenant_shard_id` to `debug/v1/inspect` and return the
/// `attachment` field of the response.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
481 0 : #[instrument(skip(self))]
482 : pub async fn inspect(
483 : &self,
484 : tenant_shard_id: TenantShardId,
485 : ) -> anyhow::Result<Option<(u32, NodeId)>> {
486 : let request = InspectRequest { tenant_shard_id };
487 :
488 : let response = self
489 : .dispatch::<_, InspectResponse>(
490 : Method::POST,
491 : "debug/v1/inspect".to_string(),
492 : Some(request),
493 : )
494 : .await?;
495 :
496 : Ok(response.attachment)
497 : }
498 :
/// Post `req` to `v1/tenant` and return the decoded `TenantCreateResponse`.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
499 0 : #[instrument(skip(self))]
500 : pub async fn tenant_create(
501 : &self,
502 : req: TenantCreateRequest,
503 : ) -> anyhow::Result<TenantCreateResponse> {
504 : self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
505 : .await
506 : }
507 :
/// Send a body-less POST to `debug/v1/tenant/{tenant_id}/import` and return the decoded
/// `TenantCreateResponse`.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
508 0 : #[instrument(skip(self))]
509 : pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
510 : self.dispatch::<(), TenantCreateResponse>(
511 : Method::POST,
512 : format!("debug/v1/tenant/{tenant_id}/import"),
513 : None,
514 : )
515 : .await
516 : }
517 :
/// Send a GET to `debug/v1/tenant/{tenant_id}/locate` and return the decoded
/// `TenantLocateResponse`.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
518 0 : #[instrument(skip(self))]
519 : pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
520 : self.dispatch::<(), _>(
521 : Method::GET,
522 : format!("debug/v1/tenant/{tenant_id}/locate"),
523 : None,
524 : )
525 : .await
526 : }
527 :
/// Send a PUT to `control/v1/tenant/{tenant_shard_id}/migrate` carrying a
/// `TenantShardMigrateRequest` for `node_id`, and return the decoded response.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
528 0 : #[instrument(skip(self))]
529 : pub async fn tenant_migrate(
530 : &self,
531 : tenant_shard_id: TenantShardId,
532 : node_id: NodeId,
533 : ) -> anyhow::Result<TenantShardMigrateResponse> {
534 : self.dispatch(
535 : Method::PUT,
536 : format!("control/v1/tenant/{tenant_shard_id}/migrate"),
537 : Some(TenantShardMigrateRequest {
538 : tenant_shard_id,
539 : node_id,
540 : }),
541 : )
542 : .await
543 : }
544 :
/// Send a PUT to `control/v1/tenant/{tenant_id}/shard_split` carrying a
/// `TenantShardSplitRequest` built from `new_shard_count` and `new_stripe_size`, and return the
/// decoded response.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
545 0 : #[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
546 : pub async fn tenant_split(
547 : &self,
548 : tenant_id: TenantId,
549 : new_shard_count: u8,
550 : new_stripe_size: Option<ShardStripeSize>,
551 : ) -> anyhow::Result<TenantShardSplitResponse> {
552 : self.dispatch(
553 : Method::PUT,
554 : format!("control/v1/tenant/{tenant_id}/shard_split"),
555 : Some(TenantShardSplitRequest {
556 : new_shard_count,
557 : new_stripe_size,
558 : }),
559 : )
560 : .await
561 : }
562 :
/// Post `req` to `control/v1/node`.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
563 0 : #[instrument(skip_all, fields(node_id=%req.node_id))]
564 : pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
565 : self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
566 : .await
567 : }
568 :
/// Send `req` as a PUT to `control/v1/node/{node_id}/config`, where `node_id` is taken from
/// `req.node_id`.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
569 0 : #[instrument(skip_all, fields(node_id=%req.node_id))]
570 : pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
571 : self.dispatch::<_, ()>(
572 : Method::PUT,
573 : format!("control/v1/node/{}/config", req.node_id),
574 : Some(req),
575 : )
576 : .await
577 : }
578 :
/// Send a body-less GET to `ready`; `Ok(())` means the request and response decoding succeeded.
///
/// `start` uses this as the readiness probe for the controller process. No token is attached,
/// since `get_claims_for_path` returns no claims for `ready`.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
579 0 : #[instrument(skip(self))]
580 : pub async fn ready(&self) -> anyhow::Result<()> {
581 : self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
582 : .await
583 : }
584 :
/// Post `req` to `v1/tenant/{tenant_id}/timeline` and return the decoded `TimelineInfo`.
///
/// # Errors
///
/// Propagates any error from `dispatch`.
585 0 : #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))]
586 : pub async fn tenant_timeline_create(
587 : &self,
588 : tenant_id: TenantId,
589 : req: TimelineCreateRequest,
590 : ) -> anyhow::Result<TimelineInfo> {
591 : self.dispatch(
592 : Method::POST,
593 : format!("v1/tenant/{tenant_id}/timeline"),
594 : Some(req),
595 : )
596 : .await
597 : }
598 : }
|