LCOV - 1e20c4f2b28aa592527961bb32170ebbd2c9172f.info - storage

LCOV - code coverage report

Current view:	top level - storage_controller/src - scheduler.rs (source / functions)		Coverage	Total	Hit
Test:	1e20c4f2b28aa592527961bb32170ebbd2c9172f.info	Lines:	89.3 %	954	852
Test Date:	2025-07-16 12:29:03	Functions:	81.0 %	79	64

            Line data    Source code

       1              : use std::collections::HashMap;
       2              : use std::fmt::Debug;
       3              : 
       4              : use http_utils::error::ApiError;
       5              : use itertools::Itertools;
       6              : use pageserver_api::controller_api::AvailabilityZone;
       7              : use pageserver_api::models::PageserverUtilization;
       8              : use serde::Serialize;
       9              : use utils::id::NodeId;
      10              : 
      11              : use crate::metrics::NodeLabelGroup;
      12              : use crate::node::Node;
      13              : use crate::tenant_shard::TenantShard;
      14              : 
      15              : /// Scenarios in which we cannot find a suitable location for a tenant shard
      16              : #[derive(thiserror::Error, Debug)]
      17              : pub enum ScheduleError {
      18              :     #[error("No pageservers found")]
      19              :     NoPageservers,
      20              :     #[error("No pageserver found matching constraint")]
      21              :     ImpossibleConstraint,
      22              : }
      23              : 
      24              : impl From<ScheduleError> for ApiError {
      25            0 :     fn from(value: ScheduleError) -> Self {
      26            0 :         ApiError::Conflict(format!("Scheduling error: {value}"))
      27            0 :     }
      28              : }
      29              : 
      30              : #[derive(Serialize)]
      31              : pub enum MaySchedule {
      32              :     Yes(PageserverUtilization),
      33              :     No,
      34              : }
      35              : 
      36              : #[derive(Serialize)]
      37              : pub(crate) struct SchedulerNode {
      38              :     /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
      39              :     shard_count: usize,
      40              :     /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
      41              :     attached_shard_count: usize,
      42              :     /// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node
      43              :     /// is in their preferred AZ (i.e. this is their 'home' location)
      44              :     home_shard_count: usize,
      45              :     /// Availability zone id in which the node resides
      46              :     az: AvailabilityZone,
      47              : 
      48              :     /// Whether this node is currently eligible to have new shards scheduled (this is derived
      49              :     /// from a node's availability state and scheduling policy).
      50              :     may_schedule: MaySchedule,
      51              : }
      52              : 
      53              : pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
      54              :     fn generate(
      55              :         node_id: &NodeId,
      56              :         node: &mut SchedulerNode,
      57              :         preferred_az: &Option<AvailabilityZone>,
      58              :         context: &ScheduleContext,
      59              :     ) -> Option<Self>;
      60              : 
      61              :     /// Return a score that drops any components based on node utilization: this is useful
      62              :     /// for finding scores for scheduling optimisation, when we want to avoid rescheduling
      63              :     /// shards due to e.g. disk usage, to avoid flapping.
      64              :     fn for_optimization(&self) -> Self;
      65              : 
      66              :     fn is_overloaded(&self) -> bool;
      67              :     fn node_id(&self) -> NodeId;
      68              : }
      69              : 
      70              : pub(crate) trait ShardTag {
      71              :     type Score: NodeSchedulingScore;
      72              : }
      73              : 
      74              : pub(crate) struct AttachedShardTag {}
      75              : impl ShardTag for AttachedShardTag {
      76              :     type Score = NodeAttachmentSchedulingScore;
      77              : }
      78              : 
      79              : pub(crate) struct SecondaryShardTag {}
      80              : impl ShardTag for SecondaryShardTag {
      81              :     type Score = NodeSecondarySchedulingScore;
      82              : }
      83              : 
      84              : #[derive(PartialEq, Eq, Debug, Clone, Copy)]
      85              : enum AzMatch {
      86              :     Yes,
      87              :     No,
      88              :     Unknown,
      89              : }
      90              : 
      91              : impl AzMatch {
      92        91487 :     fn new(node_az: &AvailabilityZone, shard_preferred_az: Option<&AvailabilityZone>) -> Self {
      93        91315 :         match shard_preferred_az {
      94        91315 :             Some(preferred_az) if preferred_az == node_az => Self::Yes,
      95        52686 :             Some(_preferred_az) => Self::No,
      96          172 :             None => Self::Unknown,
      97              :         }
      98        91487 :     }
      99              : }
     100              : 
     101              : #[derive(PartialEq, Eq, Debug, Clone, Copy)]
     102              : struct AttachmentAzMatch(AzMatch);
     103              : 
     104              : impl Ord for AttachmentAzMatch {
     105        64985 :     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
     106              :         // Lower scores indicate a more suitable node.
     107              :         // Note that we prefer a node for which we don't have
     108              :         // info to a node which we are certain doesn't match the
     109              :         // preferred AZ of the shard.
     110       129970 :         let az_match_score = |az_match: &AzMatch| match az_match {
     111        63743 :             AzMatch::Yes => 0,
     112          176 :             AzMatch::Unknown => 1,
     113        66051 :             AzMatch::No => 2,
     114       129970 :         };
     115              : 
     116        64985 :         az_match_score(&self.0).cmp(&az_match_score(&other.0))
     117        64985 :     }
     118              : }
     119              : 
     120              : impl PartialOrd for AttachmentAzMatch {
     121        64985 :     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
     122        64985 :         Some(self.cmp(other))
     123        64985 :     }
     124              : }
     125              : 
     126              : #[derive(PartialEq, Eq, Debug, Clone, Copy)]
     127              : struct SecondaryAzMatch(AzMatch);
     128              : 
     129              : impl Ord for SecondaryAzMatch {
     130        36220 :     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
     131              :         // Lower scores indicate a more suitable node.
     132              :         // For secondary locations we wish to avoid the preferred AZ
     133              :         // of the shard.
     134        72440 :         let az_match_score = |az_match: &AzMatch| match az_match {
     135        50467 :             AzMatch::No => 0,
     136           22 :             AzMatch::Unknown => 1,
     137        21951 :             AzMatch::Yes => 2,
     138        72440 :         };
     139              : 
     140        36220 :         az_match_score(&self.0).cmp(&az_match_score(&other.0))
     141        36220 :     }
     142              : }
     143              : 
     144              : impl PartialOrd for SecondaryAzMatch {
     145        36216 :     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
     146        36216 :         Some(self.cmp(other))
     147        36216 :     }
     148              : }
     149              : 
     150              : /// Scheduling score of a given node for shard attachments.
     151              : /// Lower scores indicate more suitable nodes.
     152              : /// Ordering is given by member declaration order (top to bottom).
     153              : #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
     154              : pub(crate) struct NodeAttachmentSchedulingScore {
     155              :     /// Flag indicating whether this node matches the preferred AZ
     156              :     /// of the shard. As the first field, this takes precedence over all
     157              :     /// other score components: nodes in the matching AZ are considered first.
     158              :     az_match: AttachmentAzMatch,
     159              :     /// The number of shards belonging to the tenant currently being
     160              :     /// scheduled that are attached to this node.
     161              :     affinity_score: AffinityScore,
     162              :     /// Utilisation score that combines shard count and disk utilisation
     163              :     utilization_score: u64,
     164              :     /// Total number of shards attached to this node. When nodes have identical utilisation, this
     165              :     /// acts as an anti-affinity between attached shards.
     166              :     total_attached_shard_count: usize,
     167              :     /// Convenience to make selection deterministic in tests and empty systems
     168              :     node_id: NodeId,
     169              : }
     170              : 
     171              : impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
     172        52291 :     fn generate(
     173        52291 :         node_id: &NodeId,
     174        52291 :         node: &mut SchedulerNode,
     175        52291 :         preferred_az: &Option<AvailabilityZone>,
     176        52291 :         context: &ScheduleContext,
     177        52291 :     ) -> Option<Self> {
     178        52291 :         let utilization = match &mut node.may_schedule {
     179        52289 :             MaySchedule::Yes(u) => u,
     180              :             MaySchedule::No => {
     181            2 :                 return None;
     182              :             }
     183              :         };
     184              : 
     185        52289 :         Some(Self {
     186        52289 :             affinity_score: context
     187        52289 :                 .nodes
     188        52289 :                 .get(node_id)
     189        52289 :                 .copied()
     190        52289 :                 .unwrap_or(AffinityScore::FREE),
     191        52289 :             az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
     192        52289 :             utilization_score: utilization.cached_score(),
     193        52289 :             total_attached_shard_count: node.attached_shard_count,
     194        52289 :             node_id: *node_id,
     195        52289 :         })
     196        52291 :     }
     197              : 
     198              :     /// For use in scheduling optimisation, where we only want to consider the aspects
     199              :     /// of the score that can only be resolved by moving things (such as inter-shard affinity
     200              :     /// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which
     201              :     /// can fluctuate for other reasons)
     202          118 :     fn for_optimization(&self) -> Self {
     203          118 :         Self {
     204          118 :             utilization_score: 0,
     205          118 :             total_attached_shard_count: 0,
     206          118 :             node_id: NodeId(0),
     207          118 :             ..*self
     208          118 :         }
     209          118 :     }
     210              : 
     211        52170 :     fn is_overloaded(&self) -> bool {
     212        52170 :         PageserverUtilization::is_overloaded(self.utilization_score)
     213        52170 :     }
     214              : 
     215        12880 :     fn node_id(&self) -> NodeId {
     216        12880 :         self.node_id
     217        12880 :     }
     218              : }
     219              : 
     220              : /// Scheduling score of a given node for shard secondaries.
     221              : /// Lower scores indicate more suitable nodes.
     222              : /// Ordering is given by member declaration order (top to bottom).
     223              : #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
     224              : pub(crate) struct NodeSecondarySchedulingScore {
     225              :     /// Flag indicating whether this node matches the preferred AZ
     226              :     /// of the shard. For secondary locations we wish to avoid nodes in
     227              :     /// the preferred AZ of the shard, since that's where the attached location
     228              :     /// should be scheduled and having the secondary in the same AZ is bad for HA.
     229              :     az_match: SecondaryAzMatch,
     230              :     /// The number of shards belonging to the tenant currently being
     231              :     /// scheduled that are attached to this node.
     232              :     affinity_score: AffinityScore,
     233              :     /// Utilisation score that combines shard count and disk utilisation
     234              :     utilization_score: u64,
     235              :     /// Anti-affinity with other non-home locations: this gives the behavior that secondaries
     236              :     /// will spread out across the nodes in an AZ.
     237              :     total_non_home_shard_count: usize,
     238              :     /// Convenience to make selection deterministic in tests and empty systems
     239              :     node_id: NodeId,
     240              : }
     241              : 
     242              : impl NodeSchedulingScore for NodeSecondarySchedulingScore {
     243        39199 :     fn generate(
     244        39199 :         node_id: &NodeId,
     245        39199 :         node: &mut SchedulerNode,
     246        39199 :         preferred_az: &Option<AvailabilityZone>,
     247        39199 :         context: &ScheduleContext,
     248        39199 :     ) -> Option<Self> {
     249        39199 :         let utilization = match &mut node.may_schedule {
     250        39198 :             MaySchedule::Yes(u) => u,
     251              :             MaySchedule::No => {
     252            1 :                 return None;
     253              :             }
     254              :         };
     255              : 
     256        39198 :         Some(Self {
     257        39198 :             az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
     258        39198 :             affinity_score: context
     259        39198 :                 .nodes
     260        39198 :                 .get(node_id)
     261        39198 :                 .copied()
     262        39198 :                 .unwrap_or(AffinityScore::FREE),
     263        39198 :             utilization_score: utilization.cached_score(),
     264        39198 :             total_non_home_shard_count: (node.shard_count - node.home_shard_count),
     265        39198 :             node_id: *node_id,
     266        39198 :         })
     267        39199 :     }
     268              : 
     269           14 :     fn for_optimization(&self) -> Self {
     270           14 :         Self {
     271           14 :             utilization_score: 0,
     272           14 :             total_non_home_shard_count: 0,
     273           14 :             node_id: NodeId(0),
     274           14 :             ..*self
     275           14 :         }
     276           14 :     }
     277              : 
     278        39173 :     fn is_overloaded(&self) -> bool {
     279        39173 :         PageserverUtilization::is_overloaded(self.utilization_score)
     280        39173 :     }
     281              : 
     282        12847 :     fn node_id(&self) -> NodeId {
     283        12847 :         self.node_id
     284        12847 :     }
     285              : }
     286              : 
     287              : impl PartialEq for SchedulerNode {
     288            3 :     fn eq(&self, other: &Self) -> bool {
     289            3 :         let may_schedule_matches = matches!(
     290            3 :             (&self.may_schedule, &other.may_schedule),
     291              :             (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
     292              :         );
     293              : 
     294            3 :         may_schedule_matches
     295            3 :             && self.shard_count == other.shard_count
     296            3 :             && self.attached_shard_count == other.attached_shard_count
     297            3 :             && self.az == other.az
     298            3 :     }
     299              : }
     300              : 
     301              : impl Eq for SchedulerNode {}
     302              : 
     303              : /// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
     304              : /// on which to run.
     305              : ///
     306              : /// The type has no persistent state of its own: this is all populated at startup.  The Serialize
     307              : /// impl is only for debug dumps.
     308              : #[derive(Serialize)]
     309              : pub(crate) struct Scheduler {
     310              :     nodes: HashMap<NodeId, SchedulerNode>,
     311              : }
     312              : 
     313              : /// Score for soft constraint scheduling: lower scores are preferred to higher scores.
     314              : ///
     315              : /// For example, we may set an affinity score based on the number of shards from the same
     316              : /// tenant already on a node, to implicitly prefer to balance out shards.
     317              : #[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
     318              : pub(crate) struct AffinityScore(pub(crate) usize);
     319              : 
     320              : impl AffinityScore {
     321              :     /// If we have no anti-affinity at all toward a node, this is its score.  It means
     322              :     /// the scheduler has a free choice amongst nodes with this score, and may pick a node
     323              :     /// based on other information such as total utilization.
     324              :     pub(crate) const FREE: Self = Self(0);
     325              : 
     326        25783 :     pub(crate) fn inc(&mut self) {
     327        25783 :         self.0 += 1;
     328        25783 :     }
     329              : 
     330          126 :     pub(crate) fn dec(&mut self) {
     331          126 :         self.0 -= 1;
     332          126 :     }
     333              : }
     334              : 
     335              : impl std::ops::Add for AffinityScore {
     336              :     type Output = Self;
     337              : 
     338            0 :     fn add(self, rhs: Self) -> Self::Output {
     339            0 :         Self(self.0 + rhs.0)
     340            0 :     }
     341              : }
     342              : 
     343              : /// Hint for whether this is a sincere attempt to schedule, or a speculative
     344              : /// check for where we _would_ schedule (done during optimization)
     345              : #[derive(Debug, Clone)]
     346              : pub(crate) enum ScheduleMode {
     347              :     Normal,
     348              :     Speculative,
     349              : }
     350              : 
     351              : impl Default for ScheduleMode {
     352         5334 :     fn default() -> Self {
     353         5334 :         Self::Normal
     354         5334 :     }
     355              : }
     356              : 
     357              : // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
     358              : // it for many shards in the same tenant.
     359              : #[derive(Debug, Default, Clone)]
     360              : pub(crate) struct ScheduleContext {
     361              :     /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
     362              :     pub(crate) nodes: HashMap<NodeId, AffinityScore>,
     363              : 
     364              :     pub(crate) mode: ScheduleMode,
     365              : }
     366              : 
     367              : impl ScheduleContext {
     368            3 :     pub(crate) fn new(mode: ScheduleMode) -> Self {
     369            3 :         Self {
     370            3 :             nodes: HashMap::new(),
     371            3 :             mode,
     372            3 :         }
     373            3 :     }
     374              : 
     375              :     /// Input is a list of nodes we would like to avoid using again within this context.  The more
     376              :     /// times a node is passed into this call, the less inclined we are to use it.
     377        12900 :     pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
     378        38683 :         for node_id in nodes {
     379        25783 :             let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
     380        25783 :             entry.inc()
     381              :         }
     382        12900 :     }
     383              : 
     384              :     /// Remove `shard`'s contributions to this context.  This is useful when considering scheduling
     385              :     /// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location.
     386           63 :     pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self {
     387           63 :         let mut new_context = self.clone();
     388              : 
     389           63 :         if let Some(attached) = shard.intent.get_attached() {
     390           60 :             if let Some(score) = new_context.nodes.get_mut(attached) {
     391           60 :                 score.dec();
     392           60 :             }
     393            3 :         }
     394              : 
     395           68 :         for secondary in shard.intent.get_secondary() {
     396           68 :             if let Some(score) = new_context.nodes.get_mut(secondary) {
     397           66 :                 score.dec();
     398           66 :             }
     399              :         }
     400              : 
     401           63 :         new_context
     402           63 :     }
     403              : 
     404              :     /// For test, track the sum of AffinityScore values, which is effectively how many
     405              :     /// attached or secondary locations have been registered with this context.
     406              :     #[cfg(test)]
     407            3 :     pub(crate) fn location_count(&self) -> usize {
     408            3 :         self.nodes.values().map(|i| i.0).sum()
     409            3 :     }
     410              : }
     411              : 
     412              : pub(crate) enum RefCountUpdate<'a> {
     413              :     PromoteSecondary,
     414              :     Attach,
     415              :     Detach,
     416              :     DemoteAttached,
     417              :     AddSecondary,
     418              :     RemoveSecondary,
     419              :     ChangePreferredAzFrom(Option<&'a AvailabilityZone>),
     420              : }
     421              : 
     422              : impl Scheduler {
     423           70 :     pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
     424           70 :         let mut scheduler_nodes = HashMap::new();
     425          137 :         for node in nodes {
     426           67 :             scheduler_nodes.insert(
     427           67 :                 node.get_id(),
     428           67 :                 SchedulerNode {
     429           67 :                     shard_count: 0,
     430           67 :                     attached_shard_count: 0,
     431           67 :                     home_shard_count: 0,
     432           67 :                     may_schedule: node.may_schedule(),
     433           67 :                     az: node.get_availability_zone_id().clone(),
     434           67 :                 },
     435           67 :             );
     436           67 :         }
     437              : 
     438           70 :         Self {
     439           70 :             nodes: scheduler_nodes,
     440           70 :         }
     441           70 :     }
     442              : 
     443              :     /// For debug/support: check that our internal statistics are in sync with the state of
     444              :     /// the nodes & tenant shards.
     445              :     ///
     446              :     /// If anything is inconsistent, log details and return an error.
     447            1 :     pub(crate) fn consistency_check<'a>(
     448            1 :         &self,
     449            1 :         nodes: impl Iterator<Item = &'a Node>,
     450            1 :         shards: impl Iterator<Item = &'a TenantShard>,
     451            1 :     ) -> anyhow::Result<()> {
     452            1 :         let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
     453            4 :         for node in nodes {
     454            3 :             expect_nodes.insert(
     455            3 :                 node.get_id(),
     456            3 :                 SchedulerNode {
     457            3 :                     shard_count: 0,
     458            3 :                     attached_shard_count: 0,
     459            3 :                     home_shard_count: 0,
     460            3 :                     may_schedule: node.may_schedule(),
     461            3 :                     az: node.get_availability_zone_id().clone(),
     462            3 :                 },
     463            3 :             );
     464            3 :         }
     465              : 
     466            2 :         for shard in shards {
     467            1 :             if let Some(node_id) = shard.intent.get_attached() {
     468            1 :                 match expect_nodes.get_mut(node_id) {
     469            1 :                     Some(node) => {
     470            1 :                         node.shard_count += 1;
     471            1 :                         node.attached_shard_count += 1;
     472            1 :                         if Some(&node.az) == shard.preferred_az() {
     473            0 :                             node.home_shard_count += 1;
     474            1 :                         }
     475              :                     }
     476            0 :                     None => anyhow::bail!(
     477            0 :                         "Tenant {} references nonexistent node {}",
     478              :                         shard.tenant_shard_id,
     479              :                         node_id
     480              :                     ),
     481              :                 }
     482            0 :             }
     483              : 
     484            1 :             for node_id in shard.intent.get_secondary() {
     485            1 :                 match expect_nodes.get_mut(node_id) {
     486            1 :                     Some(node) => {
     487            1 :                         node.shard_count += 1;
     488            1 :                         if Some(&node.az) == shard.preferred_az() {
     489            0 :                             node.home_shard_count += 1;
     490            1 :                         }
     491              :                     }
     492            0 :                     None => anyhow::bail!(
     493            0 :                         "Tenant {} references nonexistent node {}",
     494              :                         shard.tenant_shard_id,
     495              :                         node_id
     496              :                     ),
     497              :                 }
     498              :             }
     499              :         }
     500              : 
     501            4 :         for (node_id, expect_node) in &expect_nodes {
     502            3 :             let Some(self_node) = self.nodes.get(node_id) else {
     503            0 :                 anyhow::bail!("Node {node_id} not found in Self")
     504              :             };
     505              : 
     506            3 :             if self_node != expect_node {
     507            0 :                 tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
     508            0 :                 tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
     509            0 :                 tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
     510              : 
     511            0 :                 anyhow::bail!("Inconsistent state on {node_id}");
     512            3 :             }
     513              :         }
     514              : 
     515            1 :         if expect_nodes.len() != self.nodes.len() {
     516              :             // We just checked that all the expected nodes are present.  If the lengths don't match,
     517              :             // it means that we have nodes in Self that are unexpected.
     518            0 :             for node_id in self.nodes.keys() {
     519            0 :                 if !expect_nodes.contains_key(node_id) {
     520            0 :                     anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
     521            0 :                 }
     522              :             }
     523            1 :         }
     524              : 
     525            1 :         Ok(())
     526            1 :     }
     527              : 
     528              :     /// Update the reference counts of a node. These reference counts are used to guide scheduling
     529              :     /// decisions, not for memory management: they represent the number of tenant shards whose IntentState
     530              :     /// targets this node and the number of tenant shards whose IntentState is attached to this
     531              :     /// node.
     532              :     ///
     533              :     /// It is an error to call this for a node that is not known to the scheduler (i.e. one that was
     534              :     /// not passed into [`Self::new`] or [`Self::node_upsert`])
     535        51432 :     pub(crate) fn update_node_ref_counts(
     536        51432 :         &mut self,
     537        51432 :         node_id: NodeId,
     538        51432 :         preferred_az: Option<&AvailabilityZone>,
     539        51432 :         update: RefCountUpdate,
     540        51432 :     ) {
     541        51432 :         let Some(node) = self.nodes.get_mut(&node_id) else {
     542            0 :             debug_assert!(false);
     543            0 :             tracing::error!("Scheduler missing node {node_id}");
     544            0 :             return;
     545              :         };
     546              : 
     547        51432 :         let is_home_az = Some(&node.az) == preferred_az;
     548              : 
     549        51432 :         match update {
     550            7 :             RefCountUpdate::PromoteSecondary => {
     551            7 :                 node.attached_shard_count += 1;
     552            7 :             }
     553              :             RefCountUpdate::Attach => {
     554        12859 :                 node.shard_count += 1;
     555        12859 :                 node.attached_shard_count += 1;
     556        12859 :                 if is_home_az {
     557        12819 :                     node.home_shard_count += 1;
     558        12819 :                 }
     559              :             }
     560              :             RefCountUpdate::Detach => {
     561        12857 :                 node.shard_count -= 1;
     562        12857 :                 node.attached_shard_count -= 1;
     563        12857 :                 if is_home_az {
     564        12819 :                     node.home_shard_count -= 1;
     565        12819 :                 }
     566              :             }
     567            8 :             RefCountUpdate::DemoteAttached => {
     568            8 :                 node.attached_shard_count -= 1;
     569            8 :             }
     570              :             RefCountUpdate::AddSecondary => {
     571        12848 :                 node.shard_count += 1;
     572        12848 :                 if is_home_az {
     573            6 :                     node.home_shard_count += 1;
     574        12842 :                 }
     575              :             }
     576              :             RefCountUpdate::RemoveSecondary => {
     577        12849 :                 node.shard_count -= 1;
     578        12849 :                 if is_home_az {
     579            5 :                     node.home_shard_count -= 1;
     580        12844 :                 }
     581              :             }
     582            4 :             RefCountUpdate::ChangePreferredAzFrom(old_az) => {
     583            4 :                 if Some(&node.az) == old_az {
     584            2 :                     node.home_shard_count -= 1;
     585            2 :                 }
     586            4 :                 if is_home_az {
     587            1 :                     node.home_shard_count += 1;
     588            3 :                 }
     589              :             }
     590              :         }
     591              : 
     592              :         // Maybe update PageserverUtilization
     593        51432 :         match update {
     594              :             RefCountUpdate::AddSecondary | RefCountUpdate::Attach => {
     595              :                 // Referencing the node: if this takes our shard_count above the utilization structure's
     596              :                 // shard count, then artificially bump it: this ensures that the scheduler immediately
     597              :                 // recognizes that this node has more work on it, without waiting for the next heartbeat
     598              :                 // to update the utilization.
     599        25707 :                 if let MaySchedule::Yes(utilization) = &mut node.may_schedule {
     600        25707 :                     utilization.adjust_shard_count_max(node.shard_count as u32);
     601        25707 :                 }
     602              :             }
     603              :             RefCountUpdate::PromoteSecondary
     604              :             | RefCountUpdate::Detach
     605              :             | RefCountUpdate::RemoveSecondary
     606              :             | RefCountUpdate::DemoteAttached
     607        25725 :             | RefCountUpdate::ChangePreferredAzFrom(_) => {
     608        25725 :                 // De-referencing the node: leave the utilization's shard_count at a stale higher
     609        25725 :                 // value until some future heartbeat after we have physically removed this shard
     610        25725 :                 // from the node: this prevents the scheduler over-optimistically trying to schedule
     611        25725 :                 // more work onto the node before earlier detaches are done.
     612        25725 :             }
     613              :         }
     614        51432 :     }
     615              : 
     616              :     // Check if the number of shards attached to a given node is lagging below
     617              :     // the cluster average. If that's the case, the node should be filled.
     618            0 :     pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize {
     619            0 :         let Some(node) = self.nodes.get(&node_id) else {
     620            0 :             debug_assert!(false);
     621            0 :             tracing::error!("Scheduler missing node {node_id}");
     622            0 :             return 0;
     623              :         };
     624            0 :         assert!(!self.nodes.is_empty());
     625            0 :         let expected_attached_shards_per_node = self.expected_attached_shard_count();
     626              : 
     627            0 :         for (node_id, node) in self.nodes.iter() {
     628            0 :             tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
     629              :         }
     630              : 
     631            0 :         expected_attached_shards_per_node.saturating_sub(node.attached_shard_count)
     632            0 :     }
     633              : 
     634            0 :     pub(crate) fn expected_attached_shard_count(&self) -> usize {
     635            0 :         let total_attached_shards: usize =
     636            0 :             self.nodes.values().map(|n| n.attached_shard_count).sum();
     637              : 
     638            0 :         assert!(!self.nodes.is_empty());
     639            0 :         total_attached_shards / self.nodes.len()
     640            0 :     }
     641              : 
     642            0 :     pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> {
     643            0 :         self.nodes
     644            0 :             .iter()
     645            0 :             .map(|(node_id, stats)| (*node_id, stats.attached_shard_count))
     646            0 :             .sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse())
     647            0 :             .collect()
     648            0 :     }
     649              : 
     650          215 :     pub(crate) fn node_upsert(&mut self, node: &Node) {
     651              :         use std::collections::hash_map::Entry::*;
     652          215 :         match self.nodes.entry(node.get_id()) {
     653            4 :             Occupied(mut entry) => {
     654              :                 // Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values
     655              :                 // to account for any shards scheduled on the controller but not yet visible to the pageserver.
     656            4 :                 let mut may_schedule = node.may_schedule();
     657            4 :                 match &mut may_schedule {
     658            2 :                     MaySchedule::Yes(utilization) => {
     659            2 :                         utilization.adjust_shard_count_max(entry.get().shard_count as u32);
     660            2 :                     }
     661            2 :                     MaySchedule::No => { // Nothing to tweak
     662            2 :                     }
     663              :                 }
     664              : 
     665            4 :                 entry.get_mut().may_schedule = may_schedule;
     666              :             }
     667          211 :             Vacant(entry) => {
     668          211 :                 entry.insert(SchedulerNode {
     669          211 :                     shard_count: 0,
     670          211 :                     attached_shard_count: 0,
     671          211 :                     home_shard_count: 0,
     672          211 :                     may_schedule: node.may_schedule(),
     673          211 :                     az: node.get_availability_zone_id().clone(),
     674          211 :                 });
     675          211 :             }
     676              :         }
     677          215 :     }
     678              : 
     679            0 :     pub(crate) fn node_remove(&mut self, node_id: NodeId) {
     680            0 :         if self.nodes.remove(&node_id).is_none() {
     681            0 :             tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
     682            0 :         }
     683            0 :     }
     684              : 
     685              :     /// Calculate a single node's score, used in optimizer logic to compare specific
     686              :     /// nodes' scores.
     687          146 :     pub(crate) fn compute_node_score<Score>(
     688          146 :         &mut self,
     689          146 :         node_id: NodeId,
     690          146 :         preferred_az: &Option<AvailabilityZone>,
     691          146 :         context: &ScheduleContext,
     692          146 :     ) -> Option<Score>
     693          146 :     where
     694          146 :         Score: NodeSchedulingScore,
     695              :     {
     696          146 :         self.nodes
     697          146 :             .get_mut(&node_id)
     698          146 :             .and_then(|node| Score::generate(&node_id, node, preferred_az, context))
     699          146 :     }
     700              : 
     701              :     /// Compute a scheduling score for each node that the scheduler knows of
     702              :     /// minus a set of hard excluded nodes.
     703        25727 :     fn compute_node_scores<Score>(
     704        25727 :         &mut self,
     705        25727 :         hard_exclude: &[NodeId],
     706        25727 :         preferred_az: &Option<AvailabilityZone>,
     707        25727 :         context: &ScheduleContext,
     708        25727 :     ) -> Vec<Score>
     709        25727 :     where
     710        25727 :         Score: NodeSchedulingScore,
     711              :     {
     712        25727 :         self.nodes
     713        25727 :             .iter_mut()
     714       104191 :             .filter_map(|(k, v)| {
     715       104191 :                 if hard_exclude.contains(k) {
     716        12847 :                     None
     717              :                 } else {
     718        91344 :                     Score::generate(k, v, preferred_az, context)
     719              :                 }
     720       104191 :             })
     721        25727 :             .collect()
     722        25727 :     }
     723              : 
     724              :     /// hard_exclude: it is forbidden to use nodes in this list, typically because they
     725              :     /// are already in use by this shard -- we use this to avoid picking the same node
     726              :     /// as both attached and secondary location.  This is a hard constraint: if we cannot
     727              :     /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
     728              :     ///
     729              :     /// context: we prefer to avoid using nodes identified in the context, according
     730              :     /// to their anti-affinity score.  We use this to prefer to avoid placing shards in
     731              :     /// the same tenant on the same node.  This is a soft constraint: the context will never
     732              :     /// cause us to fail to schedule a shard.
     733        25727 :     pub(crate) fn schedule_shard<Tag: ShardTag>(
     734        25727 :         &mut self,
     735        25727 :         hard_exclude: &[NodeId],
     736        25727 :         preferred_az: &Option<AvailabilityZone>,
     737        25727 :         context: &ScheduleContext,
     738        25727 :     ) -> Result<NodeId, ScheduleError> {
     739        25727 :         if self.nodes.is_empty() {
     740            0 :             return Err(ScheduleError::NoPageservers);
     741        25727 :         }
     742              : 
     743        25727 :         let mut scores =
     744        25727 :             self.compute_node_scores::<Tag::Score>(hard_exclude, preferred_az, context);
     745              : 
     746              :         // Exclude nodes whose utilization is critically high, if there are alternatives available.  This will
     747              :         // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
     748              :         // we may place shards in the same tenant together on the same pageserver if all other pageservers are
     749              :         // overloaded.
     750        25727 :         let non_overloaded_scores = scores
     751        25727 :             .iter()
     752        91343 :             .filter(|i| !i.is_overloaded())
     753        25727 :             .copied()
     754        25727 :             .collect::<Vec<_>>();
     755        25727 :         if !non_overloaded_scores.is_empty() {
     756        25727 :             scores = non_overloaded_scores;
     757        25727 :         }
     758              : 
     759              :         // Sort the nodes by score. The one with the lowest score will be the preferred node.
     760              :         // Refer to [`NodeAttachmentSchedulingScore`] for attached locations and
     761              :         // [`NodeSecondarySchedulingScore`] for secondary locations to understand how the nodes
     762              :         // are ranked.
     763        25727 :         scores.sort();
     764              : 
     765        25727 :         if scores.is_empty() {
     766              :             // After applying constraints, no pageservers were left.
     767            0 :             if !matches!(context.mode, ScheduleMode::Speculative) {
     768              :                 // If this was not a speculative attempt, log details to understand why we couldn't
     769              :                 // schedule: this may help an engineer understand if some nodes are marked offline
     770              :                 // in a way that's preventing progress.
     771            0 :                 tracing::info!(
     772            0 :                     "Scheduling failure, while excluding {hard_exclude:?}, node states:"
     773              :                 );
     774            0 :                 for (node_id, node) in &self.nodes {
     775            0 :                     tracing::info!(
     776            0 :                         "Node {node_id}: may_schedule={} shards={}",
     777            0 :                         !matches!(node.may_schedule, MaySchedule::No),
     778              :                         node.shard_count
     779              :                     );
     780              :                 }
     781            0 :             }
     782            0 :             return Err(ScheduleError::ImpossibleConstraint);
     783        25727 :         }
     784              : 
     785              :         // Lowest score wins
     786        25727 :         let node_id = scores.first().unwrap().node_id();
     787              : 
     788        25727 :         if !matches!(context.mode, ScheduleMode::Speculative) {
     789        25727 :             tracing::info!(
     790            0 :                 "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})",
     791            0 :                 scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>(),
     792              :                 preferred_az,
     793              :             );
     794            0 :         }
     795              : 
     796              :         // Note that we do not update shard count here to reflect the scheduling: that
     797              :         // is IntentState's job when the scheduled location is used.
     798              : 
     799        25727 :         Ok(node_id)
     800        25727 :     }
     801              : 
     802              :     /// Selects any available node. This is suitable for performing background work (e.g. S3
     803              :     /// deletions).
     804            0 :     pub(crate) fn any_available_node(&mut self) -> Result<NodeId, ScheduleError> {
     805            0 :         self.schedule_shard::<AttachedShardTag>(&[], &None, &ScheduleContext::default())
     806            0 :     }
     807              : 
     808              :     /// For choosing which AZ to schedule a new shard into, use this.  It will return the
     809              :     /// AZ with the lowest number of shards currently scheduled in this AZ as their home
     810              :     /// location.
     811              :     ///
     812              :     /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
     813              :     /// node, because while tenants start out single sharded, when they grow and undergo
     814              :     /// shard-split, they will occupy space on many nodes within an AZ.  It is important
     815              :     /// that we pick the AZ in a way that balances this _future_ load.
     816              :     ///
     817              :     /// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by
     818              :     /// nodes' utilization scores.
     819          303 :     pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
     820          303 :         if self.nodes.is_empty() {
     821            0 :             return None;
     822          303 :         }
     823              : 
     824              :         #[derive(Default)]
     825              :         struct AzScore {
     826              :             home_shard_count: usize,
     827              :             scheduleable: bool,
     828              :             node_count: usize,
     829              :         }
     830              : 
     831          303 :         let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
     832         1818 :         for node in self.nodes.values() {
     833         1818 :             let az = azs.entry(&node.az).or_default();
     834         1818 :             az.home_shard_count += node.home_shard_count;
     835         1818 :             az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
     836         1818 :             az.node_count += 1;
     837              :         }
     838              : 
     839              :         // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
     840              :         // all nodes are overloaded or otherwise unschedulable).
     841          303 :         if azs.values().any(|i| i.scheduleable) {
     842          303 :             azs.retain(|_, i| i.scheduleable);
     843            0 :         }
     844              : 
     845              :         // We will multiply up shard counts by the max node count for scoring, before dividing
     846              :         // by each AZ's own node count, to get a normalized score that doesn't collapse to zero
     847              :         // when the absolute shard count is less than the node count.
     848          303 :         let max_node_count = azs.values().map(|i| i.node_count).max().unwrap_or(0);
     849              : 
     850              :         // Find the AZ with the lowest number of shards currently allocated
     851              :         Some(
     852          303 :             azs.into_iter()
     853          906 :                 .min_by_key(|i| {
     854          906 :                     (
     855          906 :                         (i.1.home_shard_count * max_node_count) / i.1.node_count,
     856          906 :                         i.0,
     857          906 :                     )
     858          906 :                 })
     859          303 :                 .unwrap()
     860              :                 .0
     861          303 :                 .clone(),
     862              :         )
     863          303 :     }
     864              : 
     865            9 :     pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option<AvailabilityZone> {
     866            9 :         self.nodes.get(node_id).map(|n| n.az.clone())
     867            9 :     }
     868              : 
     869              :     /// For use when choosing a preferred secondary location: filter out nodes that are not
     870              :     /// available, and gather their AZs.
     871        12832 :     pub(crate) fn filter_usable_nodes(
     872        12832 :         &self,
     873        12832 :         nodes: &[NodeId],
     874        12832 :     ) -> Vec<(NodeId, Option<AvailabilityZone>)> {
     875        12832 :         nodes
     876        12832 :             .iter()
     877        12832 :             .filter_map(|node_id| {
     878            3 :                 let node = self
     879            3 :                     .nodes
     880            3 :                     .get(node_id)
     881            3 :                     .expect("Referenced nodes always exist");
     882            3 :                 if matches!(node.may_schedule, MaySchedule::Yes(_)) {
     883            2 :                     Some((*node_id, Some(node.az.clone())))
     884              :                 } else {
     885            1 :                     None
     886              :                 }
     887            3 :             })
     888        12832 :             .collect()
     889        12832 :     }
     890              : 
     891              :     /// Unit test access to internal state
     892              :     #[cfg(test)]
     893           19 :     pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
     894           19 :         self.nodes.get(&node_id).unwrap().shard_count
     895           19 :     }
     896              : 
     897              :     #[cfg(test)]
     898           19 :     pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
     899           19 :         self.nodes.get(&node_id).unwrap().attached_shard_count
     900           19 :     }
     901              : 
     902              :     /// Some metrics that we only calculate periodically: this is simpler than
     903              :     /// rigorously updating them on every change.
     904            0 :     pub(crate) fn update_metrics(&self) {
     905            0 :         for (node_id, node) in &self.nodes {
     906            0 :             let node_id_str = format!("{node_id}");
     907            0 :             let label_group = NodeLabelGroup {
     908            0 :                 az: &node.az.0,
     909            0 :                 node_id: &node_id_str,
     910            0 :             };
     911            0 : 
     912            0 :             crate::metrics::METRICS_REGISTRY
     913            0 :                 .metrics_group
     914            0 :                 .storage_controller_node_shards
     915            0 :                 .set(label_group.clone(), node.shard_count as i64);
     916            0 : 
     917            0 :             crate::metrics::METRICS_REGISTRY
     918            0 :                 .metrics_group
     919            0 :                 .storage_controller_node_attached_shards
     920            0 :                 .set(label_group.clone(), node.attached_shard_count as i64);
     921            0 : 
     922            0 :             crate::metrics::METRICS_REGISTRY
     923            0 :                 .metrics_group
     924            0 :                 .storage_controller_node_home_shards
     925            0 :                 .set(label_group.clone(), node.home_shard_count as i64);
     926            0 :         }
     927            0 :     }
     928              : }
     929              : 
     930              : #[cfg(test)]
     931              : pub(crate) mod test_utils {
     932              : 
     933              :     use std::collections::HashMap;
     934              : 
     935              :     use pageserver_api::controller_api::{AvailabilityZone, NodeAvailability};
     936              :     use pageserver_api::models::utilization::test_utilization;
     937              :     use utils::id::NodeId;
     938              : 
     939              :     use crate::node::Node;
     940              : 
     941              :     /// Test helper: synthesize the requested number of nodes, all in active state.
     942              :     ///
     943              :     /// Node IDs start at one.
     944              :     ///
     945              :     /// The `azs` argument specifies the list of availability zones which will be assigned
     946              :     /// to nodes in round-robin fashion. If empty, a default AZ is assigned.
     947           70 :     pub(crate) fn make_test_nodes(n: u64, azs: &[AvailabilityZone]) -> HashMap<NodeId, Node> {
     948           70 :         let mut az_iter = azs.iter().cycle();
     949              : 
     950           70 :         (1..n + 1)
     951          278 :             .map(|i| {
     952          278 :                 (NodeId(i), {
     953          278 :                     let mut node = Node::new(
     954          278 :                         NodeId(i),
     955          278 :                         format!("httphost-{i}"),
     956          278 :                         80 + i as u16,
     957          278 :                         None,
     958          278 :                         format!("pghost-{i}"),
     959          278 :                         5432 + i as u16,
     960          278 :                         Some(format!("grpchost-{i}")),
     961          278 :                         Some(51051 + i as u16),
     962          278 :                         az_iter
     963          278 :                             .next()
     964          278 :                             .cloned()
     965          278 :                             .unwrap_or(AvailabilityZone("test-az".to_string())),
     966              :                         false,
     967              :                     )
     968          278 :                     .unwrap();
     969          278 :                     node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
     970          278 :                     assert!(node.is_available());
     971          278 :                     node
     972              :                 })
     973          278 :             })
     974           70 :             .collect()
     975           70 :     }
     976              : }
     977              : 
     978              : #[cfg(test)]
     979              : mod tests {
     980              :     use pageserver_api::controller_api::NodeAvailability;
     981              :     use pageserver_api::models::utilization::test_utilization;
     982              :     use pageserver_api::shard::ShardIdentity;
     983              :     use utils::id::TenantId;
     984              :     use utils::shard::{ShardCount, ShardNumber, TenantShardId};
     985              : 
     986              :     use super::*;
     987              :     use crate::tenant_shard::IntentState;
     988              :     #[test]
     989            1 :     fn scheduler_basic() -> anyhow::Result<()> {
     990            1 :         let nodes = test_utils::make_test_nodes(2, &[]);
     991              : 
     992            1 :         let mut scheduler = Scheduler::new(nodes.values());
     993            1 :         let mut t1_intent = IntentState::new(None);
     994            1 :         let mut t2_intent = IntentState::new(None);
     995              : 
     996            1 :         let context = ScheduleContext::default();
     997              : 
     998            1 :         let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &None, &context)?;
     999            1 :         t1_intent.set_attached(&mut scheduler, Some(scheduled));
    1000            1 :         let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &None, &context)?;
    1001            1 :         t2_intent.set_attached(&mut scheduler, Some(scheduled));
    1002              : 
    1003            1 :         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
    1004            1 :         assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
    1005              : 
    1006            1 :         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
    1007            1 :         assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
    1008              : 
    1009            1 :         let scheduled = scheduler.schedule_shard::<AttachedShardTag>(
    1010            1 :             &t1_intent.all_pageservers(),
    1011            1 :             &None,
    1012            1 :             &context,
    1013            0 :         )?;
    1014            1 :         t1_intent.push_secondary(&mut scheduler, scheduled);
    1015              : 
    1016            1 :         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
    1017            1 :         assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
    1018              : 
    1019            1 :         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
    1020            1 :         assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
    1021              : 
    1022            1 :         t1_intent.clear(&mut scheduler);
    1023            1 :         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
    1024            1 :         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
    1025              : 
    1026            1 :         let total_attached = scheduler.get_node_attached_shard_count(NodeId(1))
    1027            1 :             + scheduler.get_node_attached_shard_count(NodeId(2));
    1028            1 :         assert_eq!(total_attached, 1);
    1029              : 
    1030            1 :         if cfg!(debug_assertions) {
    1031              :             // Dropping an IntentState without clearing it causes a panic in debug mode,
    1032              :             // because we have failed to properly update scheduler shard counts.
    1033            1 :             let result = std::panic::catch_unwind(move || {
    1034            1 :                 drop(t2_intent);
    1035            1 :             });
    1036            1 :             assert!(result.is_err());
    1037              :         } else {
    1038            0 :             t2_intent.clear(&mut scheduler);
    1039              : 
    1040            0 :             assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
    1041            0 :             assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0);
    1042              : 
    1043            0 :             assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0);
    1044            0 :             assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
    1045              :         }
    1046              : 
    1047            1 :         Ok(())
    1048            1 :     }
    1049              : 
    1050              :     #[test]
    1051              :     /// Test the PageserverUtilization's contribution to scheduling algorithm
    1052            1 :     fn scheduler_utilization() {
    1053            1 :         let mut nodes = test_utils::make_test_nodes(3, &[]);
    1054            1 :         let mut scheduler = Scheduler::new(nodes.values());
    1055              : 
    1056              :         // Need to keep these alive because they contribute to shard counts via RAII
    1057            1 :         let mut scheduled_intents = Vec::new();
    1058              : 
    1059            1 :         let empty_context = ScheduleContext::default();
    1060              : 
    1061           11 :         fn assert_scheduler_chooses(
    1062           11 :             expect_node: NodeId,
    1063           11 :             scheduled_intents: &mut Vec<IntentState>,
    1064           11 :             scheduler: &mut Scheduler,
    1065           11 :             context: &ScheduleContext,
    1066           11 :         ) {
    1067           11 :             let scheduled = scheduler
    1068           11 :                 .schedule_shard::<AttachedShardTag>(&[], &None, context)
    1069           11 :                 .unwrap();
    1070           11 :             let mut intent = IntentState::new(None);
    1071           11 :             intent.set_attached(scheduler, Some(scheduled));
    1072           11 :             scheduled_intents.push(intent);
    1073           11 :             assert_eq!(scheduled, expect_node);
    1074           11 :         }
    1075              : 
    1076              :         // Independent schedule calls onto empty nodes should round-robin, because each node's
    1077              :         // utilization's shard count is updated inline.  The order is deterministic because when all other factors are
    1078              :         // equal, we order by node ID.
    1079            1 :         assert_scheduler_chooses(
    1080            1 :             NodeId(1),
    1081            1 :             &mut scheduled_intents,
    1082            1 :             &mut scheduler,
    1083            1 :             &empty_context,
    1084              :         );
    1085            1 :         assert_scheduler_chooses(
    1086            1 :             NodeId(2),
    1087            1 :             &mut scheduled_intents,
    1088            1 :             &mut scheduler,
    1089            1 :             &empty_context,
    1090              :         );
    1091            1 :         assert_scheduler_chooses(
    1092            1 :             NodeId(3),
    1093            1 :             &mut scheduled_intents,
    1094            1 :             &mut scheduler,
    1095            1 :             &empty_context,
    1096              :         );
    1097              : 
    1098              :         // Manually setting utilization higher should cause schedule calls to round-robin the other nodes
    1099              :         // which have equal utilization.
    1100            1 :         nodes
    1101            1 :             .get_mut(&NodeId(1))
    1102            1 :             .unwrap()
    1103            1 :             .set_availability(NodeAvailability::Active(test_utilization::simple(
    1104            1 :                 10,
    1105            1 :                 1024 * 1024 * 1024,
    1106            1 :             )));
    1107            1 :         scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
    1108              : 
    1109            1 :         assert_scheduler_chooses(
    1110            1 :             NodeId(2),
    1111            1 :             &mut scheduled_intents,
    1112            1 :             &mut scheduler,
    1113            1 :             &empty_context,
    1114              :         );
    1115            1 :         assert_scheduler_chooses(
    1116            1 :             NodeId(3),
    1117            1 :             &mut scheduled_intents,
    1118            1 :             &mut scheduler,
    1119            1 :             &empty_context,
    1120              :         );
    1121            1 :         assert_scheduler_chooses(
    1122            1 :             NodeId(2),
    1123            1 :             &mut scheduled_intents,
    1124            1 :             &mut scheduler,
    1125            1 :             &empty_context,
    1126              :         );
    1127            1 :         assert_scheduler_chooses(
    1128            1 :             NodeId(3),
    1129            1 :             &mut scheduled_intents,
    1130            1 :             &mut scheduler,
    1131            1 :             &empty_context,
    1132              :         );
    1133              : 
    1134              :         // The scheduler should prefer nodes with lower affinity score,
    1135              :         // even if they have higher utilization (as long as they aren't utilized at >100%)
    1136            1 :         let mut context_prefer_node1 = ScheduleContext::default();
    1137            1 :         context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]);
    1138            1 :         assert_scheduler_chooses(
    1139            1 :             NodeId(1),
    1140            1 :             &mut scheduled_intents,
    1141            1 :             &mut scheduler,
    1142            1 :             &context_prefer_node1,
    1143              :         );
    1144            1 :         assert_scheduler_chooses(
    1145            1 :             NodeId(1),
    1146            1 :             &mut scheduled_intents,
    1147            1 :             &mut scheduler,
    1148            1 :             &context_prefer_node1,
    1149              :         );
    1150              : 
    1151              :         // If a node is over-utilized, it will not be used even if affinity scores prefer it
    1152            1 :         nodes
    1153            1 :             .get_mut(&NodeId(1))
    1154            1 :             .unwrap()
    1155            1 :             .set_availability(NodeAvailability::Active(test_utilization::simple(
    1156            1 :                 20000,
    1157            1 :                 1024 * 1024 * 1024,
    1158            1 :             )));
    1159            1 :         scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
    1160            1 :         assert_scheduler_chooses(
    1161            1 :             NodeId(2),
    1162            1 :             &mut scheduled_intents,
    1163            1 :             &mut scheduler,
    1164            1 :             &context_prefer_node1,
    1165              :         );
    1166            1 :         assert_scheduler_chooses(
    1167            1 :             NodeId(3),
    1168            1 :             &mut scheduled_intents,
    1169            1 :             &mut scheduler,
    1170            1 :             &context_prefer_node1,
    1171              :         );
    1172              : 
    1173           12 :         for mut intent in scheduled_intents {
    1174           11 :             intent.clear(&mut scheduler);
    1175           11 :         }
    1176            1 :     }
    1177              : 
    1178              :     #[test]
    1179              :     /// A simple test that showcases AZ-aware scheduling and its interaction with
    1180              :     /// affinity scores.
    1181            1 :     fn az_scheduling() {
    1182            1 :         let az_a_tag = AvailabilityZone("az-a".to_string());
    1183            1 :         let az_b_tag = AvailabilityZone("az-b".to_string());
    1184              : 
    1185            1 :         let nodes = test_utils::make_test_nodes(3, &[az_a_tag.clone(), az_b_tag.clone()]);
    1186            1 :         let mut scheduler = Scheduler::new(nodes.values());
    1187              : 
    1188              :         // Need to keep these alive because they contribute to shard counts via RAII
    1189            1 :         let mut scheduled_intents = Vec::new();
    1190              : 
    1191            1 :         let mut context = ScheduleContext::default();
    1192              : 
    1193            4 :         fn assert_scheduler_chooses<Tag: ShardTag>(
    1194            4 :             expect_node: NodeId,
    1195            4 :             preferred_az: Option<AvailabilityZone>,
    1196            4 :             scheduled_intents: &mut Vec<IntentState>,
    1197            4 :             scheduler: &mut Scheduler,
    1198            4 :             context: &mut ScheduleContext,
    1199            4 :         ) {
    1200            4 :             let scheduled = scheduler
    1201            4 :                 .schedule_shard::<Tag>(&[], &preferred_az, context)
    1202            4 :                 .unwrap();
    1203            4 :             let mut intent = IntentState::new(preferred_az.clone());
    1204            4 :             intent.set_attached(scheduler, Some(scheduled));
    1205            4 :             scheduled_intents.push(intent);
    1206            4 :             assert_eq!(scheduled, expect_node);
    1207              : 
    1208            4 :             context.avoid(&[scheduled]);
    1209            4 :         }
    1210              : 
    1211            1 :         assert_scheduler_chooses::<AttachedShardTag>(
    1212            1 :             NodeId(1),
    1213            1 :             Some(az_a_tag.clone()),
    1214            1 :             &mut scheduled_intents,
    1215            1 :             &mut scheduler,
    1216            1 :             &mut context,
    1217              :         );
    1218              : 
    1219              :         // Node 2 and 3 have affinity score equal to 0, but node 3
    1220              :         // is in "az-a" so we prefer that.
    1221            1 :         assert_scheduler_chooses::<AttachedShardTag>(
    1222            1 :             NodeId(3),
    1223            1 :             Some(az_a_tag.clone()),
    1224            1 :             &mut scheduled_intents,
    1225            1 :             &mut scheduler,
    1226            1 :             &mut context,
    1227              :         );
    1228              : 
    1229              :         // Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id.
    1230            1 :         assert_scheduler_chooses::<AttachedShardTag>(
    1231            1 :             NodeId(1),
    1232            1 :             Some(az_a_tag.clone()),
    1233            1 :             &mut scheduled_intents,
    1234            1 :             &mut scheduler,
    1235            1 :             &mut context,
    1236              :         );
    1237              : 
    1238              :         // Avoid nodes in "az-a" for the secondary location.
    1239            1 :         assert_scheduler_chooses::<SecondaryShardTag>(
    1240            1 :             NodeId(2),
    1241            1 :             Some(az_a_tag.clone()),
    1242            1 :             &mut scheduled_intents,
    1243            1 :             &mut scheduler,
    1244            1 :             &mut context,
    1245              :         );
    1246              : 
    1247            5 :         for mut intent in scheduled_intents {
    1248            4 :             intent.clear(&mut scheduler);
    1249            4 :         }
    1250            1 :     }
    1251              : 
    1252              :     #[test]
    1253            1 :     fn az_scheduling_for_new_tenant() {
    1254            1 :         let az_a_tag = AvailabilityZone("az-a".to_string());
    1255            1 :         let az_b_tag = AvailabilityZone("az-b".to_string());
    1256            1 :         let nodes = test_utils::make_test_nodes(
    1257              :             6,
    1258            1 :             &[
    1259            1 :                 az_a_tag.clone(),
    1260            1 :                 az_a_tag.clone(),
    1261            1 :                 az_a_tag.clone(),
    1262            1 :                 az_b_tag.clone(),
    1263            1 :                 az_b_tag.clone(),
    1264            1 :                 az_b_tag.clone(),
    1265            1 :             ],
    1266              :         );
    1267              : 
    1268            1 :         let mut scheduler = Scheduler::new(nodes.values());
    1269              : 
    1270              :         /// Force the `home_shard_count` of a node directly: this is the metric used
    1271              :         /// by the scheduler when picking AZs.
    1272            3 :         fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) {
    1273            3 :             let node = scheduler.nodes.get_mut(&node_id).unwrap();
    1274            3 :             node.home_shard_count = shard_count;
    1275            3 :         }
    1276              : 
    1277              :         // Initial empty state.  Scores are tied, scheduler prefers lower AZ ID.
    1278            1 :         assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
    1279              : 
    1280              :         // Home shard count is higher in AZ A, so AZ B will be preferred
    1281            1 :         set_shard_count(&mut scheduler, NodeId(1), 10);
    1282            1 :         assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
    1283              : 
    1284              :         // Total home shard count is higher in AZ B, so we revert to preferring AZ A
    1285            1 :         set_shard_count(&mut scheduler, NodeId(4), 6);
    1286            1 :         set_shard_count(&mut scheduler, NodeId(5), 6);
    1287            1 :         assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
    1288            1 :     }
    1289              : 
    1290              :     /// Test that when selecting AZs for many new tenants, we get the expected balance across nodes
    1291              :     #[test]
    1292            1 :     fn az_selection_many() {
    1293            1 :         let az_a_tag = AvailabilityZone("az-a".to_string());
    1294            1 :         let az_b_tag = AvailabilityZone("az-b".to_string());
    1295            1 :         let az_c_tag = AvailabilityZone("az-c".to_string());
    1296            1 :         let nodes = test_utils::make_test_nodes(
    1297              :             6,
    1298            1 :             &[
    1299            1 :                 az_a_tag.clone(),
    1300            1 :                 az_b_tag.clone(),
    1301            1 :                 az_c_tag.clone(),
    1302            1 :                 az_a_tag.clone(),
    1303            1 :                 az_b_tag.clone(),
    1304            1 :                 az_c_tag.clone(),
    1305            1 :             ],
    1306              :         );
    1307              : 
    1308            1 :         let mut scheduler = Scheduler::new(nodes.values());
    1309              : 
    1310              :         // We should get 1/6th of these on each node, give or take a few...
    1311            1 :         let total_tenants = 300;
    1312              : 
    1313              :         // ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot
    1314              :         // on one AZ before correcting itself.  This is because we select the 'home' AZ based on
    1315              :         // an AZ-wide metric, but we select the location for secondaries on a purely node-based
    1316              :         // metric (while excluding the home AZ).
    1317            1 :         let grace = 3;
    1318              : 
    1319            1 :         let mut scheduled_shards = Vec::new();
    1320          300 :         for _i in 0..total_tenants {
    1321          300 :             let preferred_az = scheduler.get_az_for_new_tenant().unwrap();
    1322              : 
    1323          300 :             let mut node_home_counts = scheduler
    1324          300 :                 .nodes
    1325          300 :                 .iter()
    1326         1800 :                 .map(|(node_id, node)| (node_id, node.home_shard_count))
    1327          300 :                 .collect::<Vec<_>>();
    1328          300 :             node_home_counts.sort_by_key(|i| i.0);
    1329          300 :             eprintln!("Selected {preferred_az}, vs nodes {node_home_counts:?}");
    1330              : 
    1331          300 :             let tenant_shard_id = TenantShardId {
    1332          300 :                 tenant_id: TenantId::generate(),
    1333          300 :                 shard_number: ShardNumber(0),
    1334          300 :                 shard_count: ShardCount(1),
    1335          300 :             };
    1336              : 
    1337          300 :             let shard_identity = ShardIdentity::new(
    1338          300 :                 tenant_shard_id.shard_number,
    1339          300 :                 tenant_shard_id.shard_count,
    1340          300 :                 pageserver_api::shard::ShardStripeSize(1),
    1341              :             )
    1342          300 :             .unwrap();
    1343          300 :             let mut shard = TenantShard::new(
    1344          300 :                 tenant_shard_id,
    1345          300 :                 shard_identity,
    1346          300 :                 pageserver_api::controller_api::PlacementPolicy::Attached(1),
    1347          300 :                 Some(preferred_az),
    1348              :             );
    1349              : 
    1350          300 :             let mut context = ScheduleContext::default();
    1351          300 :             shard.schedule(&mut scheduler, &mut context).unwrap();
    1352          300 :             eprintln!("Scheduled shard at {:?}", shard.intent);
    1353              : 
    1354          300 :             scheduled_shards.push(shard);
    1355              :         }
    1356              : 
    1357            7 :         for (node_id, node) in &scheduler.nodes {
    1358            6 :             eprintln!(
    1359            6 :                 "Node {}: {} {} {}",
    1360            6 :                 node_id, node.shard_count, node.attached_shard_count, node.home_shard_count
    1361            6 :             );
    1362            6 :         }
    1363              : 
    1364            6 :         for node in scheduler.nodes.values() {
    1365            6 :             assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace);
    1366              :         }
    1367              : 
    1368          301 :         for mut shard in scheduled_shards {
    1369          300 :             shard.intent.clear(&mut scheduler);
    1370          300 :         }
    1371            1 :     }
    1372              : 
    1373              :     #[test]
    1374              :     /// Make sure that when we have an odd number of nodes and an even number of shards, we still
    1375              :     /// get scheduling stability.
    1376            1 :     fn odd_nodes_stability() {
    1377            1 :         let az_a = AvailabilityZone("az-a".to_string());
    1378            1 :         let az_b = AvailabilityZone("az-b".to_string());
    1379              : 
    1380            1 :         let nodes = test_utils::make_test_nodes(
    1381              :             10,
    1382            1 :             &[
    1383            1 :                 az_a.clone(),
    1384            1 :                 az_a.clone(),
    1385            1 :                 az_a.clone(),
    1386            1 :                 az_a.clone(),
    1387            1 :                 az_a.clone(),
    1388            1 :                 az_b.clone(),
    1389            1 :                 az_b.clone(),
    1390            1 :                 az_b.clone(),
    1391            1 :                 az_b.clone(),
    1392            1 :                 az_b.clone(),
    1393            1 :             ],
    1394              :         );
    1395            1 :         let mut scheduler = Scheduler::new(nodes.values());
    1396              : 
    1397              :         // Need to keep these alive because they contribute to shard counts via RAII
    1398            1 :         let mut scheduled_shards = Vec::new();
    1399              : 
    1400            1 :         let mut context = ScheduleContext::default();
    1401              : 
    1402            8 :         fn schedule_shard(
    1403            8 :             tenant_shard_id: TenantShardId,
    1404            8 :             expect_attached: NodeId,
    1405            8 :             expect_secondary: NodeId,
    1406            8 :             scheduled_shards: &mut Vec<TenantShard>,
    1407            8 :             scheduler: &mut Scheduler,
    1408            8 :             preferred_az: Option<AvailabilityZone>,
    1409            8 :             context: &mut ScheduleContext,
    1410            8 :         ) {
    1411            8 :             let shard_identity = ShardIdentity::new(
    1412            8 :                 tenant_shard_id.shard_number,
    1413            8 :                 tenant_shard_id.shard_count,
    1414            8 :                 pageserver_api::shard::ShardStripeSize(1),
    1415              :             )
    1416            8 :             .unwrap();
    1417            8 :             let mut shard = TenantShard::new(
    1418            8 :                 tenant_shard_id,
    1419            8 :                 shard_identity,
    1420            8 :                 pageserver_api::controller_api::PlacementPolicy::Attached(1),
    1421            8 :                 preferred_az,
    1422              :             );
    1423              : 
    1424            8 :             shard.schedule(scheduler, context).unwrap();
    1425              : 
    1426            8 :             assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
    1427            8 :             assert_eq!(
    1428            8 :                 shard.intent.get_secondary().first().unwrap(),
    1429            8 :                 &expect_secondary
    1430              :             );
    1431              : 
    1432            8 :             scheduled_shards.push(shard);
    1433            8 :         }
    1434              : 
    1435            1 :         let tenant_id = TenantId::generate();
    1436              : 
    1437            1 :         schedule_shard(
    1438            1 :             TenantShardId {
    1439            1 :                 tenant_id,
    1440            1 :                 shard_number: ShardNumber(0),
    1441            1 :                 shard_count: ShardCount(8),
    1442            1 :             },
    1443            1 :             NodeId(1),
    1444            1 :             NodeId(6),
    1445            1 :             &mut scheduled_shards,
    1446            1 :             &mut scheduler,
    1447            1 :             Some(az_a.clone()),
    1448            1 :             &mut context,
    1449              :         );
    1450              : 
    1451            1 :         schedule_shard(
    1452            1 :             TenantShardId {
    1453            1 :                 tenant_id,
    1454            1 :                 shard_number: ShardNumber(1),
    1455            1 :                 shard_count: ShardCount(8),
    1456            1 :             },
    1457            1 :             NodeId(2),
    1458            1 :             NodeId(7),
    1459            1 :             &mut scheduled_shards,
    1460            1 :             &mut scheduler,
    1461            1 :             Some(az_a.clone()),
    1462            1 :             &mut context,
    1463              :         );
    1464              : 
    1465            1 :         schedule_shard(
    1466            1 :             TenantShardId {
    1467            1 :                 tenant_id,
    1468            1 :                 shard_number: ShardNumber(2),
    1469            1 :                 shard_count: ShardCount(8),
    1470            1 :             },
    1471            1 :             NodeId(3),
    1472            1 :             NodeId(8),
    1473            1 :             &mut scheduled_shards,
    1474            1 :             &mut scheduler,
    1475            1 :             Some(az_a.clone()),
    1476            1 :             &mut context,
    1477              :         );
    1478              : 
    1479            1 :         schedule_shard(
    1480            1 :             TenantShardId {
    1481            1 :                 tenant_id,
    1482            1 :                 shard_number: ShardNumber(3),
    1483            1 :                 shard_count: ShardCount(8),
    1484            1 :             },
    1485            1 :             NodeId(4),
    1486            1 :             NodeId(9),
    1487            1 :             &mut scheduled_shards,
    1488            1 :             &mut scheduler,
    1489            1 :             Some(az_a.clone()),
    1490            1 :             &mut context,
    1491              :         );
    1492              : 
    1493            1 :         schedule_shard(
    1494            1 :             TenantShardId {
    1495            1 :                 tenant_id,
    1496            1 :                 shard_number: ShardNumber(4),
    1497            1 :                 shard_count: ShardCount(8),
    1498            1 :             },
    1499            1 :             NodeId(5),
    1500            1 :             NodeId(10),
    1501            1 :             &mut scheduled_shards,
    1502            1 :             &mut scheduler,
    1503            1 :             Some(az_a.clone()),
    1504            1 :             &mut context,
    1505              :         );
    1506              : 
    1507            1 :         schedule_shard(
    1508            1 :             TenantShardId {
    1509            1 :                 tenant_id,
    1510            1 :                 shard_number: ShardNumber(5),
    1511            1 :                 shard_count: ShardCount(8),
    1512            1 :             },
    1513            1 :             NodeId(1),
    1514            1 :             NodeId(6),
    1515            1 :             &mut scheduled_shards,
    1516            1 :             &mut scheduler,
    1517            1 :             Some(az_a.clone()),
    1518            1 :             &mut context,
    1519              :         );
    1520              : 
    1521            1 :         schedule_shard(
    1522            1 :             TenantShardId {
    1523            1 :                 tenant_id,
    1524            1 :                 shard_number: ShardNumber(6),
    1525            1 :                 shard_count: ShardCount(8),
    1526            1 :             },
    1527            1 :             NodeId(2),
    1528            1 :             NodeId(7),
    1529            1 :             &mut scheduled_shards,
    1530            1 :             &mut scheduler,
    1531            1 :             Some(az_a.clone()),
    1532            1 :             &mut context,
    1533              :         );
    1534              : 
    1535            1 :         schedule_shard(
    1536            1 :             TenantShardId {
    1537            1 :                 tenant_id,
    1538            1 :                 shard_number: ShardNumber(7),
    1539            1 :                 shard_count: ShardCount(8),
    1540            1 :             },
    1541            1 :             NodeId(3),
    1542            1 :             NodeId(8),
    1543            1 :             &mut scheduled_shards,
    1544            1 :             &mut scheduler,
    1545            1 :             Some(az_a.clone()),
    1546            1 :             &mut context,
    1547              :         );
    1548              : 
    1549              :         // Assert that the optimizer suggests no changes, i.e. our initial scheduling was stable.
    1550            9 :         for shard in &scheduled_shards {
    1551            8 :             assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None);
    1552              :         }
    1553              : 
    1554            9 :         for mut shard in scheduled_shards {
    1555            8 :             shard.intent.clear(&mut scheduler);
    1556            8 :         }
    1557            1 :     }
    1558              : 
    1559              :     #[test]
    1560            1 :     fn change_preferred_az() {
    1561            1 :         let az_a = AvailabilityZone("az-a".to_string());
    1562            1 :         let az_b = AvailabilityZone("az-b".to_string());
    1563              : 
    1564              :         // 2 nodes: 1 az_a and 1 az_b.
    1565            1 :         let nodes = test_utils::make_test_nodes(2, &[az_a.clone(), az_b.clone()]);
    1566            1 :         let mut scheduler = Scheduler::new(nodes.values());
    1567              : 
    1568            1 :         let tenant_shard_id = TenantShardId {
    1569            1 :             tenant_id: TenantId::generate(),
    1570            1 :             shard_number: ShardNumber(0),
    1571            1 :             shard_count: ShardCount(1),
    1572            1 :         };
    1573            1 :         let shard_identity = ShardIdentity::new(
    1574            1 :             tenant_shard_id.shard_number,
    1575            1 :             tenant_shard_id.shard_count,
    1576            1 :             pageserver_api::shard::ShardStripeSize(1),
    1577              :         )
    1578            1 :         .unwrap();
    1579              :         // 1 attached and 1 secondary.
    1580            1 :         let mut shard = TenantShard::new(
    1581            1 :             tenant_shard_id,
    1582            1 :             shard_identity,
    1583            1 :             pageserver_api::controller_api::PlacementPolicy::Attached(1),
    1584            1 :             Some(az_a.clone()),
    1585              :         );
    1586              : 
    1587            1 :         let mut context = ScheduleContext::default();
    1588            1 :         shard.schedule(&mut scheduler, &mut context).unwrap();
    1589            1 :         eprintln!("Scheduled shard at {:?}", shard.intent);
    1590              : 
    1591            2 :         for node in scheduler.nodes.values() {
    1592              :             // Only 2 nodes, one tenant shard should be scheduled on each of them.
    1593            2 :             assert_eq!(node.shard_count, 1);
    1594            2 :             if node.az == az_a {
    1595            1 :                 assert_eq!(node.home_shard_count, 1);
    1596              :             } else {
    1597            1 :                 assert_eq!(node.home_shard_count, 0);
    1598              :             }
    1599              :         }
    1600              : 
    1601            1 :         shard.set_preferred_az(&mut scheduler, Some(az_b.clone()));
    1602              :         // Home AZ flipped.
    1603            2 :         for node in scheduler.nodes.values() {
    1604            2 :             assert_eq!(node.shard_count, 1);
    1605            2 :             if node.az == az_a {
    1606            1 :                 assert_eq!(node.home_shard_count, 0);
    1607              :             } else {
    1608            1 :                 assert_eq!(node.home_shard_count, 1);
    1609              :             }
    1610              :         }
    1611              : 
    1612            1 :         shard.set_preferred_az(&mut scheduler, None);
    1613              :         // No home AZ.
    1614            2 :         for node in scheduler.nodes.values() {
    1615            2 :             assert_eq!(node.shard_count, 1);
    1616            2 :             assert_eq!(node.home_shard_count, 0);
    1617              :         }
    1618              : 
    1619            1 :         shard.intent.clear(&mut scheduler);
    1620            1 :     }
    1621              : }

Generated by: LCOV version 2.1-beta