Line data Source code
1 : use std::time::SystemTime;
2 :
3 : use utils::serde_percent::Percent;
4 : use utils::serde_system_time;
5 :
6 : /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
7 : /// the next tenant.
8 : ///
9 : /// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
10 : ///
11 : /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
12 : /// not handle full u64 values properly.
13 3 : #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
14 : pub struct PageserverUtilization {
15 : /// Used disk space (physical, ground truth from statfs())
16 : #[serde(serialize_with = "ser_saturating_u63")]
17 : pub disk_usage_bytes: u64,
18 : /// Free disk space
19 : #[serde(serialize_with = "ser_saturating_u63")]
20 : pub free_space_bytes: u64,
21 :
22 : /// Wanted disk space, based on the tenant shards currently present on this pageserver: this
23 : /// is like disk_usage_bytes, but it is stable and does not change with the cache state of
24 : /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
25 : /// there, or may be unrealistically low if the pageserver has attached tenants which haven't
26 : /// downloaded layers yet.
27 : #[serde(serialize_with = "ser_saturating_u63", default)]
28 : pub disk_wanted_bytes: u64,
29 :
30 : // What proportion of total disk space will this pageserver use before it starts evicting data?
31 : #[serde(default = "unity_percent")]
32 : pub disk_usable_pct: Percent,
33 :
34 : // How many shards are currently on this node?
35 : #[serde(default)]
36 : pub shard_count: u32,
37 :
38 : // How many shards should this node be able to handle at most?
39 : #[serde(default)]
40 : pub max_shard_count: u32,
41 :
42 : /// Cached result of [`Self::score`]
43 : pub utilization_score: Option<u64>,
44 :
45 : /// When was this snapshot captured, pageserver local time.
46 : ///
47 : /// Use millis to give confidence that the value is regenerated often enough.
48 : pub captured_at: serde_system_time::SystemTime,
49 : }
50 :
51 0 : fn unity_percent() -> Percent {
52 0 : Percent::new(0).unwrap()
53 0 : }
54 :
55 : pub type RawScore = u64;
56 :
57 : impl PageserverUtilization {
58 : const UTILIZATION_FULL: u64 = 1000000;
59 :
60 : /// Calculate a utilization score. The result is to be inrepreted as a fraction of
61 : /// Self::UTILIZATION_FULL.
62 : ///
63 : /// Lower values are more affine to scheduling more work on this node.
64 : /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
65 : /// - 0.0 represents an empty node.
66 : /// - Negative values are forbidden
67 : /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
68 : /// layer eviction.
69 25837 : pub fn score(&self) -> RawScore {
70 25837 : let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
71 25837 : * self.disk_usable_pct.get() as u64)
72 25837 : / 100;
73 25837 : let disk_utilization_score =
74 25837 : self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
75 25837 :
76 25837 : let shard_utilization_score =
77 25837 : self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
78 25837 : std::cmp::max(disk_utilization_score, shard_utilization_score)
79 25837 : }
80 :
81 91487 : pub fn cached_score(&mut self) -> RawScore {
82 91487 : match self.utilization_score {
83 : None => {
84 25837 : let s = self.score();
85 25837 : self.utilization_score = Some(s);
86 25837 : s
87 : }
88 65650 : Some(s) => s,
89 : }
90 91487 : }
91 :
92 : /// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
93 : /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
94 : ///
95 : /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
96 : /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
97 91343 : pub fn is_overloaded(score: RawScore) -> bool {
98 91343 : // Why the factor of two? This is unscientific but reflects behavior of real systems:
99 91343 : // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
100 91343 : // startup and housekeeping jobs nice and responsive. We can go to double this limit if needed
101 91343 : // until some more nodes are deployed.
102 91343 : // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
103 91343 : // hold its biggest timeline fully on disk, which is tends to be an over estimate when
104 91343 : // some tenants are very idle and have dropped layers from disk. In practice going up to
105 91343 : // double is generally better than giving up and scheduling in a sub-optimal AZ.
106 91343 : score >= 2 * Self::UTILIZATION_FULL
107 91343 : }
108 :
109 25709 : pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
110 25709 : if self.shard_count < shard_count {
111 25705 : self.shard_count = shard_count;
112 25705 :
113 25705 : // Dirty cache: this will be calculated next time someone retrives the score
114 25705 : self.utilization_score = None;
115 25705 : }
116 25709 : }
117 :
118 : /// A utilization structure that has a full utilization score: use this as a placeholder when
119 : /// you need a utilization but don't have real values yet.
120 0 : pub fn full() -> Self {
121 0 : Self {
122 0 : disk_usage_bytes: 1,
123 0 : free_space_bytes: 0,
124 0 : disk_wanted_bytes: 1,
125 0 : disk_usable_pct: Percent::new(100).unwrap(),
126 0 : shard_count: 1,
127 0 : max_shard_count: 1,
128 0 : utilization_score: Some(Self::UTILIZATION_FULL),
129 0 : captured_at: serde_system_time::SystemTime(SystemTime::now()),
130 0 : }
131 0 : }
132 : }
133 :
134 : /// Test helper
135 : pub mod test_utilization {
136 : use std::time::SystemTime;
137 :
138 : use utils::serde_percent::Percent;
139 : use utils::serde_system_time::{self};
140 :
141 : use super::PageserverUtilization;
142 :
143 : // Parameters of the imaginary node used for test utilization instances
144 : const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
145 : const TEST_SHARDS_MAX: u32 = 1000;
146 :
147 : /// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
148 : /// not abuse this function from non-test code.
149 : ///
150 : /// Emulates a node with a 1000 shard limit and a 1TB disk.
151 280 : pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
152 280 : PageserverUtilization {
153 280 : disk_usage_bytes: disk_wanted_bytes,
154 280 : free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
155 280 : disk_wanted_bytes,
156 280 : disk_usable_pct: Percent::new(100).unwrap(),
157 280 : shard_count,
158 280 : max_shard_count: TEST_SHARDS_MAX,
159 280 : utilization_score: None,
160 280 : captured_at: serde_system_time::SystemTime(SystemTime::now()),
161 280 : }
162 280 : }
163 : }
164 :
165 : /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
166 : ///
167 : /// Instead of newtype, use this because a newtype would get require handling deserializing values
168 : /// with the highest bit set which is properly parsed by serde formats, but would create a
169 : /// conundrum on how to handle and again serialize such values at type level. It will be a few
170 : /// years until we can use more than `i64::MAX` bytes on a disk.
171 3 : fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
172 : const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
173 :
174 3 : let value = (*value).min(MAX_FORMAT_INT64);
175 3 :
176 3 : serializer.serialize_u64(value)
177 3 : }
178 :
#[cfg(test)]
mod tests {
    use std::time::Duration;

    use super::*;

    /// Byte counts above `i64::MAX` must come out of the JSON serializer clamped to `i64::MAX`.
    #[test]
    fn u64_max_is_serialized_as_u63_max() {
        let captured_at = serde_system_time::SystemTime(
            std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
        );

        let doc = PageserverUtilization {
            disk_usage_bytes: u64::MAX,
            free_space_bytes: 0,
            disk_wanted_bytes: u64::MAX,
            utilization_score: Some(13),
            disk_usable_pct: Percent::new(90).unwrap(),
            shard_count: 100,
            max_shard_count: 200,
            captured_at,
        };

        // One piece per field, in declaration order; the pieces concatenate to a single line.
        let expected = concat!(
            r#"{"disk_usage_bytes":9223372036854775807,"#,
            r#""free_space_bytes":0,"#,
            r#""disk_wanted_bytes":9223372036854775807,"#,
            r#""disk_usable_pct":90,"#,
            r#""shard_count":100,"#,
            r#""max_shard_count":200,"#,
            r#""utilization_score":13,"#,
            r#""captured_at":"2024-02-21T10:02:59.000Z"}"#,
        );

        let serialized = serde_json::to_string(&doc).unwrap();

        assert_eq!(serialized, expected);
    }
}
|