|             Line data    Source code 
       1              : use std::time::SystemTime;
       2              : 
       3              : use utils::serde_percent::Percent;
       4              : use utils::serde_system_time;
       5              : 
       6              : /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
       7              : /// the next tenant.
       8              : ///
       9              : /// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
      10              : ///
      11              : /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
      12              : /// not handle full u64 values properly.
      13            0 : #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
      14              : pub struct PageserverUtilization {
      15              :     /// Used disk space (physical, ground truth from statfs())
      16              :     #[serde(serialize_with = "ser_saturating_u63")]
      17              :     pub disk_usage_bytes: u64,
      18              :     /// Free disk space
      19              :     #[serde(serialize_with = "ser_saturating_u63")]
      20              :     pub free_space_bytes: u64,
      21              : 
      22              :     /// Wanted disk space, based on the tenant shards currently present on this pageserver: this
      23              :     /// is like disk_usage_bytes, but it is stable and does not change with the cache state of
      24              :     /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
      25              :     /// there, or may be unrealistically low if the pageserver has attached tenants which haven't
      26              :     /// downloaded layers yet.
      27              :     #[serde(serialize_with = "ser_saturating_u63", default)]
      28              :     pub disk_wanted_bytes: u64,
      29              : 
      30              :     // What proportion of total disk space will this pageserver use before it starts evicting data?
      31              :     #[serde(default = "unity_percent")]
      32              :     pub disk_usable_pct: Percent,
      33              : 
      34              :     // How many shards are currently on this node?
      35              :     #[serde(default)]
      36              :     pub shard_count: u32,
      37              : 
      38              :     // How many shards should this node be able to handle at most?
      39              :     #[serde(default)]
      40              :     pub max_shard_count: u32,
      41              : 
      42              :     /// Cached result of [`Self::score`]
      43              :     pub utilization_score: Option<u64>,
      44              : 
      45              :     /// When was this snapshot captured, pageserver local time.
      46              :     ///
      47              :     /// Use millis to give confidence that the value is regenerated often enough.
      48              :     pub captured_at: serde_system_time::SystemTime,
      49              : }
      50              : 
      51            0 : fn unity_percent() -> Percent {
      52            0 :     Percent::new(0).unwrap()
      53            0 : }
      54              : 
      55              : pub type RawScore = u64;
      56              : 
      57              : impl PageserverUtilization {
      58              :     const UTILIZATION_FULL: u64 = 1000000;
      59              : 
      60              :     /// Calculate a utilization score.  The result is to be inrepreted as a fraction of
      61              :     /// Self::UTILIZATION_FULL.
      62              :     ///
      63              :     /// Lower values are more affine to scheduling more work on this node.
      64              :     /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
      65              :     /// - 0.0 represents an empty node.
      66              :     /// - Negative values are forbidden
      67              :     /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
      68              :     ///   layer eviction.
      69        25837 :     pub fn score(&self) -> RawScore {
      70        25837 :         let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
      71        25837 :             * self.disk_usable_pct.get() as u64)
      72        25837 :             / 100;
      73        25837 :         let disk_utilization_score =
      74        25837 :             self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
      75              : 
      76        25837 :         let shard_utilization_score =
      77        25837 :             self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
      78        25837 :         std::cmp::max(disk_utilization_score, shard_utilization_score)
      79        25837 :     }
      80              : 
      81        91487 :     pub fn cached_score(&mut self) -> RawScore {
      82        91487 :         match self.utilization_score {
      83              :             None => {
      84        25837 :                 let s = self.score();
      85        25837 :                 self.utilization_score = Some(s);
      86        25837 :                 s
      87              :             }
      88        65650 :             Some(s) => s,
      89              :         }
      90        91487 :     }
      91              : 
      92              :     /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
      93              :     /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
      94              :     ///
      95              :     /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
      96              :     /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
      97        91343 :     pub fn is_overloaded(score: RawScore) -> bool {
      98              :         // Why the factor of two?  This is unscientific but reflects behavior of real systems:
      99              :         // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
     100              :         //   startup and housekeeping jobs nice and responsive.  We can go to double this limit if needed
     101              :         //   until some more nodes are deployed.
     102              :         // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
     103              :         //   hold its biggest timeline fully on disk, which is tends to be an over estimate when
     104              :         //   some tenants are very idle and have dropped layers from disk.  In practice going up to
     105              :         //   double is generally better than giving up and scheduling in a sub-optimal AZ.
     106        91343 :         score >= 2 * Self::UTILIZATION_FULL
     107        91343 :     }
     108              : 
     109        25709 :     pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
     110        25709 :         if self.shard_count < shard_count {
     111        25705 :             self.shard_count = shard_count;
     112        25705 : 
     113        25705 :             // Dirty cache: this will be calculated next time someone retrives the score
     114        25705 :             self.utilization_score = None;
     115        25705 :         }
     116        25709 :     }
     117              : 
     118              :     /// A utilization structure that has a full utilization score: use this as a placeholder when
     119              :     /// you need a utilization but don't have real values yet.
     120            0 :     pub fn full() -> Self {
     121            0 :         Self {
     122            0 :             disk_usage_bytes: 1,
     123            0 :             free_space_bytes: 0,
     124            0 :             disk_wanted_bytes: 1,
     125            0 :             disk_usable_pct: Percent::new(100).unwrap(),
     126            0 :             shard_count: 1,
     127            0 :             max_shard_count: 1,
     128            0 :             utilization_score: Some(Self::UTILIZATION_FULL),
     129            0 :             captured_at: serde_system_time::SystemTime(SystemTime::now()),
     130            0 :         }
     131            0 :     }
     132              : }
     133              : 
     134              : /// Test helper
     135              : pub mod test_utilization {
     136              :     use std::time::SystemTime;
     137              : 
     138              :     use utils::serde_percent::Percent;
     139              :     use utils::serde_system_time::{self};
     140              : 
     141              :     use super::PageserverUtilization;
     142              : 
     143              :     // Parameters of the imaginary node used for test utilization instances
     144              :     const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
     145              :     const TEST_SHARDS_MAX: u32 = 1000;
     146              : 
     147              :     /// Unit test helper.  Unconditionally compiled because cfg(test) doesn't carry across crates.  Do
     148              :     /// not abuse this function from non-test code.
     149              :     ///
     150              :     /// Emulates a node with a 1000 shard limit and a 1TB disk.
     151          280 :     pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
     152          280 :         PageserverUtilization {
     153          280 :             disk_usage_bytes: disk_wanted_bytes,
     154          280 :             free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
     155          280 :             disk_wanted_bytes,
     156          280 :             disk_usable_pct: Percent::new(100).unwrap(),
     157          280 :             shard_count,
     158          280 :             max_shard_count: TEST_SHARDS_MAX,
     159          280 :             utilization_score: None,
     160          280 :             captured_at: serde_system_time::SystemTime(SystemTime::now()),
     161          280 :         }
     162          280 :     }
     163              : }
     164              : 
     165              : /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
     166              : ///
     167              : /// Instead of newtype, use this because a newtype would get require handling deserializing values
     168              : /// with the highest bit set which is properly parsed by serde formats, but would create a
     169              : /// conundrum on how to handle and again serialize such values at type level. It will be a few
     170              : /// years until we can use more than `i64::MAX` bytes on a disk.
     171            3 : fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
     172              :     const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
     173              : 
     174            3 :     let value = (*value).min(MAX_FORMAT_INT64);
     175              : 
     176            3 :     serializer.serialize_u64(value)
     177            3 : }
     178              : 
     179              : #[cfg(test)]
     180              : mod tests {
     181              :     use std::time::Duration;
     182              : 
     183              :     use super::*;
     184              : 
     185              :     #[test]
     186            1 :     fn u64_max_is_serialized_as_u63_max() {
     187            1 :         let doc = PageserverUtilization {
     188            1 :             disk_usage_bytes: u64::MAX,
     189            1 :             free_space_bytes: 0,
     190            1 :             disk_wanted_bytes: u64::MAX,
     191            1 :             utilization_score: Some(13),
     192            1 :             disk_usable_pct: Percent::new(90).unwrap(),
     193            1 :             shard_count: 100,
     194            1 :             max_shard_count: 200,
     195            1 :             captured_at: serde_system_time::SystemTime(
     196            1 :                 std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
     197            1 :             ),
     198            1 :         };
     199              : 
     200            1 :         let s = serde_json::to_string(&doc).unwrap();
     201              : 
     202            1 :         let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}";
     203              : 
     204            1 :         assert_eq!(s, expected);
     205            1 :     }
     206              : }
         |