Line data Source code
1 : use std::time::SystemTime;
2 : use utils::{serde_percent::Percent, serde_system_time};
3 :
4 : /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
5 : /// the next tenant.
6 : ///
7 : /// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
8 : ///
9 : /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
10 : /// not handle full u64 values properly.
11 3 : #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
12 : pub struct PageserverUtilization {
13 : /// Used disk space (physical, ground truth from statfs())
14 : #[serde(serialize_with = "ser_saturating_u63")]
15 : pub disk_usage_bytes: u64,
16 : /// Free disk space
17 : #[serde(serialize_with = "ser_saturating_u63")]
18 : pub free_space_bytes: u64,
19 :
20 : /// Wanted disk space, based on the tenant shards currently present on this pageserver: this
21 : /// is like disk_usage_bytes, but it is stable and does not change with the cache state of
22 : /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
23 : /// there, or may be unrealistically low if the pageserver has attached tenants which haven't
24 : /// downloaded layers yet.
25 : #[serde(serialize_with = "ser_saturating_u63", default)]
26 : pub disk_wanted_bytes: u64,
27 :
28 : // What proportion of total disk space will this pageserver use before it starts evicting data?
29 : #[serde(default = "unity_percent")]
30 : pub disk_usable_pct: Percent,
31 :
32 : // How many shards are currently on this node?
33 : #[serde(default)]
34 : pub shard_count: u32,
35 :
36 : // How many shards should this node be able to handle at most?
37 : #[serde(default)]
38 : pub max_shard_count: u32,
39 :
40 : /// Cached result of [`Self::score`]
41 : pub utilization_score: Option<u64>,
42 :
43 : /// When was this snapshot captured, pageserver local time.
44 : ///
45 : /// Use millis to give confidence that the value is regenerated often enough.
46 : pub captured_at: serde_system_time::SystemTime,
47 : }
48 :
49 0 : fn unity_percent() -> Percent {
50 0 : Percent::new(0).unwrap()
51 0 : }
52 :
/// Utilization score as returned by `PageserverUtilization::score`: an unsigned value that is
/// interpreted as a fraction of the "fully utilized" constant, lower meaning more headroom.
pub type RawScore = u64;
54 :
55 : impl PageserverUtilization {
56 : const UTILIZATION_FULL: u64 = 1000000;
57 :
58 : /// Calculate a utilization score. The result is to be inrepreted as a fraction of
59 : /// Self::UTILIZATION_FULL.
60 : ///
61 : /// Lower values are more affine to scheduling more work on this node.
62 : /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
63 : /// - 0.0 represents an empty node.
64 : /// - Negative values are forbidden
65 : /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
66 : /// layer eviction.
67 41 : pub fn score(&self) -> RawScore {
68 41 : let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
69 41 : * self.disk_usable_pct.get() as u64)
70 41 : / 100;
71 41 : let disk_utilization_score =
72 41 : self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
73 41 :
74 41 : let shard_utilization_score =
75 41 : self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
76 41 : std::cmp::max(disk_utilization_score, shard_utilization_score)
77 41 : }
78 :
79 84 : pub fn cached_score(&mut self) -> RawScore {
80 84 : match self.utilization_score {
81 : None => {
82 41 : let s = self.score();
83 41 : self.utilization_score = Some(s);
84 41 : s
85 : }
86 43 : Some(s) => s,
87 : }
88 84 : }
89 :
90 : /// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
91 : /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
92 : ///
93 : /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
94 : /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
95 84 : pub fn is_overloaded(score: RawScore) -> bool {
96 84 : // Why the factor of two? This is unscientific but reflects behavior of real systems:
97 84 : // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
98 84 : // startup and housekeeping jobs nice and responsive. We can go to double this limit if needed
99 84 : // until some more nodes are deployed.
100 84 : // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
101 84 : // hold its biggest timeline fully on disk, which is tends to be an over estimate when
102 84 : // some tenants are very idle and have dropped layers from disk. In practice going up to
103 84 : // double is generally better than giving up and scheduling in a sub-optimal AZ.
104 84 : score >= 2 * Self::UTILIZATION_FULL
105 84 : }
106 :
107 43 : pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
108 43 : if self.shard_count < shard_count {
109 39 : self.shard_count = shard_count;
110 39 :
111 39 : // Dirty cache: this will be calculated next time someone retrives the score
112 39 : self.utilization_score = None;
113 39 : }
114 43 : }
115 :
116 : /// A utilization structure that has a full utilization score: use this as a placeholder when
117 : /// you need a utilization but don't have real values yet.
118 0 : pub fn full() -> Self {
119 0 : Self {
120 0 : disk_usage_bytes: 1,
121 0 : free_space_bytes: 0,
122 0 : disk_wanted_bytes: 1,
123 0 : disk_usable_pct: Percent::new(100).unwrap(),
124 0 : shard_count: 1,
125 0 : max_shard_count: 1,
126 0 : utilization_score: Some(Self::UTILIZATION_FULL),
127 0 : captured_at: serde_system_time::SystemTime(SystemTime::now()),
128 0 : }
129 0 : }
130 : }
131 :
132 : /// Test helper
133 : pub mod test_utilization {
134 : use super::PageserverUtilization;
135 : use std::time::SystemTime;
136 : use utils::{
137 : serde_percent::Percent,
138 : serde_system_time::{self},
139 : };
140 :
141 : // Parameters of the imaginary node used for test utilization instances
142 : const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
143 : const TEST_SHARDS_MAX: u32 = 1000;
144 :
145 : /// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
146 : /// not abuse this function from non-test code.
147 : ///
148 : /// Emulates a node with a 1000 shard limit and a 1TB disk.
149 27 : pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
150 27 : PageserverUtilization {
151 27 : disk_usage_bytes: disk_wanted_bytes,
152 27 : free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
153 27 : disk_wanted_bytes,
154 27 : disk_usable_pct: Percent::new(100).unwrap(),
155 27 : shard_count,
156 27 : max_shard_count: TEST_SHARDS_MAX,
157 27 : utilization_score: None,
158 27 : captured_at: serde_system_time::SystemTime(SystemTime::now()),
159 27 : }
160 27 : }
161 : }
162 :
163 : /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
164 : ///
165 : /// Instead of newtype, use this because a newtype would get require handling deserializing values
166 : /// with the highest bit set which is properly parsed by serde formats, but would create a
167 : /// conundrum on how to handle and again serialize such values at type level. It will be a few
168 : /// years until we can use more than `i64::MAX` bytes on a disk.
169 3 : fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
170 : const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
171 :
172 3 : let value = (*value).min(MAX_FORMAT_INT64);
173 3 :
174 3 : serializer.serialize_u64(value)
175 3 : }
176 :
#[cfg(test)]
mod tests {
    use std::time::Duration;

    use super::*;

    /// Byte counts above `i64::MAX` must be emitted as `i64::MAX`, and fields must appear in
    /// declaration order.
    #[test]
    fn u64_max_is_serialized_as_u63_max() {
        // Fixed timestamp so that the expected document is stable.
        let captured_at = serde_system_time::SystemTime(
            std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
        );

        let doc = PageserverUtilization {
            disk_usage_bytes: u64::MAX,
            free_space_bytes: 0,
            disk_wanted_bytes: u64::MAX,
            disk_usable_pct: Percent::new(90).unwrap(),
            shard_count: 100,
            max_shard_count: 200,
            utilization_score: Some(13),
            captured_at,
        };

        let s = serde_json::to_string(&doc).unwrap();

        let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}";

        assert_eq!(s, expected);
    }
}
|