LCOV - code coverage report
Current view: top level - libs/pageserver_api/src - controller_api.rs (source / functions) Coverage Total Hit
Test: 727bdccc1d7d53837da843959afb612f56da4e79.info Lines: 28.7 % 115 33
Test Date: 2025-01-30 15:18:43 Functions: 2.3 % 303 7

            Line data    Source code
       1              : use std::collections::{HashMap, HashSet};
       2              : use std::fmt::Display;
       3              : use std::str::FromStr;
       4              : use std::time::{Duration, Instant};
       5              : 
       6              : /// Request/response types for the storage controller
       7              : /// API (`/control/v1` prefix).  Implemented by the server
       8              : /// in [`storage_controller::http`]
       9              : use serde::{Deserialize, Serialize};
      10              : use utils::id::{NodeId, TenantId};
      11              : 
      12              : use crate::models::PageserverUtilization;
      13              : use crate::{
      14              :     models::{ShardParameters, TenantConfig},
      15              :     shard::{ShardStripeSize, TenantShardId},
      16              : };
      17              : 
      18            2 : #[derive(Serialize, Deserialize, Debug)]
      19              : #[serde(deny_unknown_fields)]
      20              : pub struct TenantCreateRequest {
      21              :     pub new_tenant_id: TenantShardId,
      22              :     #[serde(default)]
      23              :     #[serde(skip_serializing_if = "Option::is_none")]
      24              :     pub generation: Option<u32>,
      25              : 
      26              :     // If omitted, create a single shard with TenantShardId::unsharded()
      27              :     #[serde(default)]
      28              :     #[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
      29              :     pub shard_parameters: ShardParameters,
      30              : 
      31              :     #[serde(default)]
      32              :     #[serde(skip_serializing_if = "Option::is_none")]
      33              :     pub placement_policy: Option<PlacementPolicy>,
      34              : 
      35              :     #[serde(flatten)]
      36              :     pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
      37              : }
      38              : 
      39            0 : #[derive(Serialize, Deserialize)]
      40              : pub struct TenantCreateResponseShard {
      41              :     pub shard_id: TenantShardId,
      42              :     pub node_id: NodeId,
      43              :     pub generation: u32,
      44              : }
      45              : 
      46            0 : #[derive(Serialize, Deserialize)]
      47              : pub struct TenantCreateResponse {
      48              :     pub shards: Vec<TenantCreateResponseShard>,
      49              : }
      50              : 
      51            0 : #[derive(Serialize, Deserialize, Debug, Clone)]
      52              : pub struct NodeRegisterRequest {
      53              :     pub node_id: NodeId,
      54              : 
      55              :     pub listen_pg_addr: String,
      56              :     pub listen_pg_port: u16,
      57              : 
      58              :     pub listen_http_addr: String,
      59              :     pub listen_http_port: u16,
      60              : 
      61              :     pub availability_zone_id: AvailabilityZone,
      62              : }
      63              : 
      64            0 : #[derive(Serialize, Deserialize)]
      65              : pub struct NodeConfigureRequest {
      66              :     pub node_id: NodeId,
      67              : 
      68              :     pub availability: Option<NodeAvailabilityWrapper>,
      69              :     pub scheduling: Option<NodeSchedulingPolicy>,
      70              : }
      71              : 
      72            0 : #[derive(Serialize, Deserialize)]
      73              : pub struct TenantPolicyRequest {
      74              :     pub placement: Option<PlacementPolicy>,
      75              :     pub scheduling: Option<ShardSchedulingPolicy>,
      76              : }
      77              : 
      78            0 : #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)]
      79              : pub struct AvailabilityZone(pub String);
      80              : 
      81              : impl Display for AvailabilityZone {
      82          300 :     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
      83          300 :         write!(f, "{}", self.0)
      84          300 :     }
      85              : }
      86              : 
      87            0 : #[derive(Serialize, Deserialize)]
      88              : pub struct ShardsPreferredAzsRequest {
      89              :     #[serde(flatten)]
      90              :     pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
      91              : }
      92              : 
      93            0 : #[derive(Serialize, Deserialize)]
      94              : pub struct ShardsPreferredAzsResponse {
      95              :     pub updated: Vec<TenantShardId>,
      96              : }
      97              : 
      98            0 : #[derive(Serialize, Deserialize, Debug)]
      99              : pub struct TenantLocateResponseShard {
     100              :     pub shard_id: TenantShardId,
     101              :     pub node_id: NodeId,
     102              : 
     103              :     pub listen_pg_addr: String,
     104              :     pub listen_pg_port: u16,
     105              : 
     106              :     pub listen_http_addr: String,
     107              :     pub listen_http_port: u16,
     108              : }
     109              : 
     110            0 : #[derive(Serialize, Deserialize)]
     111              : pub struct TenantLocateResponse {
     112              :     pub shards: Vec<TenantLocateResponseShard>,
     113              :     pub shard_params: ShardParameters,
     114              : }
     115              : 
     116            0 : #[derive(Serialize, Deserialize, Debug)]
     117              : pub struct TenantDescribeResponse {
     118              :     pub tenant_id: TenantId,
     119              :     pub shards: Vec<TenantDescribeResponseShard>,
     120              :     pub stripe_size: ShardStripeSize,
     121              :     pub policy: PlacementPolicy,
     122              :     pub config: TenantConfig,
     123              : }
     124              : 
     125            0 : #[derive(Serialize, Deserialize, Debug)]
     126              : pub struct NodeShardResponse {
     127              :     pub node_id: NodeId,
     128              :     pub shards: Vec<NodeShard>,
     129              : }
     130              : 
     131            0 : #[derive(Serialize, Deserialize, Debug)]
     132              : pub struct NodeShard {
     133              :     pub tenant_shard_id: TenantShardId,
     134              :     /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
     135              :     pub is_observed_secondary: Option<bool>,
     136              :     /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
     137              :     pub is_intended_secondary: Option<bool>,
     138              : }
     139              : 
     140            0 : #[derive(Serialize, Deserialize)]
     141              : pub struct NodeDescribeResponse {
     142              :     pub id: NodeId,
     143              : 
     144              :     pub availability: NodeAvailabilityWrapper,
     145              :     pub scheduling: NodeSchedulingPolicy,
     146              : 
     147              :     pub availability_zone_id: String,
     148              : 
     149              :     pub listen_http_addr: String,
     150              :     pub listen_http_port: u16,
     151              : 
     152              :     pub listen_pg_addr: String,
     153              :     pub listen_pg_port: u16,
     154              : }
     155              : 
     156            0 : #[derive(Serialize, Deserialize, Debug)]
     157              : pub struct TenantDescribeResponseShard {
     158              :     pub tenant_shard_id: TenantShardId,
     159              : 
     160              :     pub node_attached: Option<NodeId>,
     161              :     pub node_secondary: Vec<NodeId>,
     162              : 
     163              :     pub last_error: String,
     164              : 
     165              :     /// A task is currently running to reconcile this tenant's intent state with the state on pageservers
     166              :     pub is_reconciling: bool,
     167              :     /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
     168              :     pub is_pending_compute_notification: bool,
     169              :     /// A shard split is currently underway
     170              :     pub is_splitting: bool,
     171              : 
     172              :     pub scheduling_policy: ShardSchedulingPolicy,
     173              : 
     174              :     pub preferred_az_id: Option<String>,
     175              : }
     176              : 
     177              : /// Migration request for a given tenant shard to a given node.
     178              : ///
     179              : /// Explicitly migrating a particular shard is a low-level operation.
     180              : /// TODO: higher level "Reschedule tenant" operation where the request
     181              : /// specifies some constraints, e.g. asking it to get off particular node(s)
     182            0 : #[derive(Serialize, Deserialize, Debug)]
     183              : pub struct TenantShardMigrateRequest {
     184              :     pub node_id: NodeId,
     185              : }
     186              : 
     187              : #[derive(Serialize, Clone, Debug)]
     188              : #[serde(into = "NodeAvailabilityWrapper")]
     189              : pub enum NodeAvailability {
     190              :     // Normal, happy state
     191              :     Active(PageserverUtilization),
     192              :     // Node is warming up, but we expect it to become available soon. Covers
     193              :     // the time span between the re-attach response being composed on the storage controller
     194              :     // and the first successful heartbeat after the processing of the re-attach response
     195              :     // finishes on the pageserver.
     196              :     WarmingUp(Instant),
     197              :     // Offline: Tenants shouldn't try to attach here, but they may assume that their
     198              :     // secondary locations on this node still exist.  Newly added nodes are in this
     199              :     // state until we successfully contact them.
     200              :     Offline,
     201              : }
     202              : 
     203              : impl PartialEq for NodeAvailability {
     204            0 :     fn eq(&self, other: &Self) -> bool {
     205              :         use NodeAvailability::*;
     206            0 :         matches!(
     207            0 :             (self, other),
     208              :             (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_))
     209              :         )
     210            0 :     }
     211              : }
     212              : 
     213              : impl Eq for NodeAvailability {}
     214              : 
     215              : // This wrapper provides serde functionality and it should only be used to
     216              : // communicate with external callers which don't know or care about the
     217              : // utilisation score of the pageserver it is targeting.
     218            0 : #[derive(Serialize, Deserialize, Clone, Copy, Debug)]
     219              : pub enum NodeAvailabilityWrapper {
     220              :     Active,
     221              :     WarmingUp,
     222              :     Offline,
     223              : }
     224              : 
     225              : impl From<NodeAvailabilityWrapper> for NodeAvailability {
     226            0 :     fn from(val: NodeAvailabilityWrapper) -> Self {
     227            0 :         match val {
     228              :             // Assume the worst utilisation score to begin with. It will later be updated by
     229              :             // the heartbeats.
     230              :             NodeAvailabilityWrapper::Active => {
     231            0 :                 NodeAvailability::Active(PageserverUtilization::full())
     232              :             }
     233            0 :             NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
     234            0 :             NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
     235              :         }
     236            0 :     }
     237              : }
     238              : 
     239              : impl From<NodeAvailability> for NodeAvailabilityWrapper {
     240            0 :     fn from(val: NodeAvailability) -> Self {
     241            0 :         match val {
     242            0 :             NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
     243            0 :             NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp,
     244            0 :             NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
     245              :         }
     246            0 :     }
     247              : }
     248              : 
     249              : /// Scheduling policy enables us to selectively disable some automatic actions that the
     250              : /// controller performs on a tenant shard. This is only set to a non-default value by
     251              : /// human intervention, and it is reset to the default value (Active) when the tenant's
     252              : /// placement policy is modified away from Attached.
     253              : ///
     254              : /// The typical use of a non-Active scheduling policy is one of:
     255              : /// - Pinning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
     256              : /// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
     257              : ///
     258              : /// If you're not sure which policy to use to pin a shard to its current location, you probably
     259              : /// want Pause.
     260            0 : #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
     261              : pub enum ShardSchedulingPolicy {
     262              :     // Normal mode: the tenant's scheduled locations may be updated at will, including
     263              :     // for non-essential optimization.
     264              :     Active,
     265              : 
     266              :     // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
     267              :     // For example, this still permits a shard's attachment location to change to a secondary in
     268              :     // response to a node failure, or to assign a new secondary if a node was removed.
     269              :     Essential,
     270              : 
     271              :     // No scheduling: leave the shard running wherever it currently is.  Even if the shard is
     272              :     // unavailable, it will not be rescheduled to another node.
     273              :     Pause,
     274              : 
     275              :     // No reconciling: we will make no location_conf API calls to pageservers at all.  If the
     276              :     // shard is unavailable, it stays that way.  If a node fails, this shard doesn't get failed over.
     277              :     Stop,
     278              : }
     279              : 
     280              : impl Default for ShardSchedulingPolicy {
     281        12839 :     fn default() -> Self {
     282        12839 :         Self::Active
     283        12839 :     }
     284              : }
     285              : 
     286            0 : #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
     287              : pub enum NodeSchedulingPolicy {
     288              :     Active,
     289              :     Filling,
     290              :     Pause,
     291              :     PauseForRestart,
     292              :     Draining,
     293              : }
     294              : 
     295              : impl FromStr for NodeSchedulingPolicy {
     296              :     type Err = anyhow::Error;
     297              : 
     298            0 :     fn from_str(s: &str) -> Result<Self, Self::Err> {
     299            0 :         match s {
     300            0 :             "active" => Ok(Self::Active),
     301            0 :             "filling" => Ok(Self::Filling),
     302            0 :             "pause" => Ok(Self::Pause),
     303            0 :             "pause_for_restart" => Ok(Self::PauseForRestart),
     304            0 :             "draining" => Ok(Self::Draining),
     305            0 :             _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
     306              :         }
     307            0 :     }
     308              : }
     309              : 
     310              : impl From<NodeSchedulingPolicy> for String {
     311            0 :     fn from(value: NodeSchedulingPolicy) -> String {
     312              :         use NodeSchedulingPolicy::*;
     313            0 :         match value {
     314            0 :             Active => "active",
     315            0 :             Filling => "filling",
     316            0 :             Pause => "pause",
     317            0 :             PauseForRestart => "pause_for_restart",
     318            0 :             Draining => "draining",
     319              :         }
     320            0 :         .to_string()
     321            0 :     }
     322              : }
     323              : 
     324            0 : #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
     325              : pub enum SkSchedulingPolicy {
     326              :     Active,
     327              :     Pause,
     328              :     Decomissioned,
     329              : }
     330              : 
     331              : impl FromStr for SkSchedulingPolicy {
     332              :     type Err = anyhow::Error;
     333              : 
     334            0 :     fn from_str(s: &str) -> Result<Self, Self::Err> {
     335            0 :         Ok(match s {
     336            0 :             "active" => Self::Active,
     337            0 :             "pause" => Self::Pause,
     338            0 :             "decomissioned" => Self::Decomissioned,
     339              :             _ => {
     340            0 :                 return Err(anyhow::anyhow!(
     341            0 :                     "Unknown scheduling policy '{s}', try active,pause,decomissioned"
     342            0 :                 ))
     343              :             }
     344              :         })
     345            0 :     }
     346              : }
     347              : 
     348              : impl From<SkSchedulingPolicy> for String {
     349            0 :     fn from(value: SkSchedulingPolicy) -> String {
     350              :         use SkSchedulingPolicy::*;
     351            0 :         match value {
     352            0 :             Active => "active",
     353            0 :             Pause => "pause",
     354            0 :             Decomissioned => "decomissioned",
     355              :         }
     356            0 :         .to_string()
     357            0 :     }
     358              : }
     359              : 
     360              : /// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
     361              : /// to create secondary locations.
     362            2 : #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
     363              : pub enum PlacementPolicy {
     364              :     /// Normal live state: one attached pageserver and zero or more secondaries.
     365              :     Attached(usize),
     366              :     /// Create one secondary mode location. This is useful when onboarding
     367              :     /// a tenant, or for an idle tenant that we might want to bring online quickly.
     368              :     Secondary,
     369              : 
     370              :     /// Do not attach to any pageservers.  This is appropriate for tenants that
     371              :     /// have been idle for a long time, where we do not mind some delay in making
     372              :     /// them available in future.
     373              :     Detached,
     374              : }
     375              : 
     376              : impl PlacementPolicy {
     377           43 :     pub fn want_secondaries(&self) -> usize {
     378           43 :         match self {
     379           43 :             PlacementPolicy::Attached(secondary_count) => *secondary_count,
     380            0 :             PlacementPolicy::Secondary => 1,
     381            0 :             PlacementPolicy::Detached => 0,
     382              :         }
     383           43 :     }
     384              : }
     385              : 
     386            0 : #[derive(Serialize, Deserialize, Debug)]
     387              : pub struct TenantShardMigrateResponse {}
     388              : 
     389              : /// Metadata health record posted from scrubber.
     390            0 : #[derive(Serialize, Deserialize, Debug)]
     391              : pub struct MetadataHealthRecord {
     392              :     pub tenant_shard_id: TenantShardId,
     393              :     pub healthy: bool,
     394              :     pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
     395              : }
     396              : 
     397            0 : #[derive(Serialize, Deserialize, Debug)]
     398              : pub struct MetadataHealthUpdateRequest {
     399              :     pub healthy_tenant_shards: HashSet<TenantShardId>,
     400              :     pub unhealthy_tenant_shards: HashSet<TenantShardId>,
     401              : }
     402              : 
     403            0 : #[derive(Serialize, Deserialize, Debug)]
     404              : pub struct MetadataHealthUpdateResponse {}
     405              : 
     406            0 : #[derive(Serialize, Deserialize, Debug)]
     407              : pub struct MetadataHealthListUnhealthyResponse {
     408              :     pub unhealthy_tenant_shards: Vec<TenantShardId>,
     409              : }
     410              : 
     411            0 : #[derive(Serialize, Deserialize, Debug)]
     412              : pub struct MetadataHealthListOutdatedRequest {
     413              :     #[serde(with = "humantime_serde")]
     414              :     pub not_scrubbed_for: Duration,
     415              : }
     416              : 
     417            0 : #[derive(Serialize, Deserialize, Debug)]
     418              : pub struct MetadataHealthListOutdatedResponse {
     419              :     pub health_records: Vec<MetadataHealthRecord>,
     420              : }
     421              : 
     422              : /// Publicly exposed safekeeper description
     423            0 : #[derive(Serialize, Deserialize, Clone)]
     424              : pub struct SafekeeperDescribeResponse {
     425              :     pub id: NodeId,
     426              :     pub region_id: String,
     427              :     /// 1 is special, it means just created (not currently posted to storcon).
     428              :     /// Zero or negative is not really expected.
     429              :     /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag.
     430              :     pub version: i64,
     431              :     pub host: String,
     432              :     pub port: i32,
     433              :     pub http_port: i32,
     434              :     pub availability_zone_id: String,
     435              :     pub scheduling_policy: SkSchedulingPolicy,
     436              : }
     437              : 
     438            0 : #[derive(Serialize, Deserialize, Clone)]
     439              : pub struct SafekeeperSchedulingPolicyRequest {
     440              :     pub scheduling_policy: SkSchedulingPolicy,
     441              : }
     442              : 
     443              : #[cfg(test)]
     444              : mod test {
     445              :     use super::*;
     446              :     use serde_json;
     447              : 
     448              :     /// Check stability of PlacementPolicy's serialization
     449              :     #[test]
     450            1 :     fn placement_policy_encoding() -> anyhow::Result<()> {
     451            1 :         let v = PlacementPolicy::Attached(1);
     452            1 :         let encoded = serde_json::to_string(&v)?;
     453            1 :         assert_eq!(encoded, "{\"Attached\":1}");
     454            1 :         assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
     455              : 
     456            1 :         let v = PlacementPolicy::Detached;
     457            1 :         let encoded = serde_json::to_string(&v)?;
     458            1 :         assert_eq!(encoded, "\"Detached\"");
     459            1 :         assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
     460            1 :         Ok(())
     461            1 :     }
     462              : 
     463              :     #[test]
     464            1 :     fn test_reject_unknown_field() {
     465            1 :         let id = TenantId::generate();
     466            1 :         let create_request = serde_json::json!({
     467            1 :             "new_tenant_id": id.to_string(),
     468            1 :             "unknown_field": "unknown_value".to_string(),
     469            1 :         });
     470            1 :         let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
     471            1 :         assert!(
     472            1 :             err.to_string().contains("unknown field `unknown_field`"),
     473            0 :             "expect unknown field `unknown_field` error, got: {}",
     474              :             err
     475              :         );
     476            1 :     }
     477              : }
        

Generated by: LCOV version 2.1-beta