Line data Source code
1 : //! See `pageserver_api::shard` for description on sharding.
2 :
3 : use std::{ops::RangeInclusive, str::FromStr};
4 :
5 : use hex::FromHex;
6 : use serde::{Deserialize, Serialize};
7 :
8 : use crate::id::TenantId;
9 :
10 0 : #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
11 : pub struct ShardNumber(pub u8);
12 :
13 0 : #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
14 : pub struct ShardCount(pub u8);
15 :
16 : /// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
17 : /// when we need to know which shard we're dealing with, but do not need to know the full
18 : /// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
19 : /// the fully qualified TenantShardId.
20 : #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
21 : pub struct ShardIndex {
22 : pub shard_number: ShardNumber,
23 : pub shard_count: ShardCount,
24 : }
25 :
26 : /// Formatting helper, for generating the `shard_id` label in traces.
27 : pub struct ShardSlug<'a>(&'a TenantShardId);
28 :
29 : /// TenantShardId globally identifies a particular shard in a particular tenant.
30 : ///
31 : /// These are written as `<TenantId>-<ShardSlug>`, for example:
32 : /// # The second shard in a two-shard tenant
33 : /// 072f1291a5310026820b2fe4b2968934-0102
34 : ///
35 : /// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
36 : /// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
37 : /// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
38 : ///
39 : /// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
40 : /// is both forward and backward compatible with TenantId: a legacy TenantId can be
41 : /// decoded as a TenantShardId, and when re-encoded it will be parseable
42 : /// as a TenantId.
43 : #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
44 : pub struct TenantShardId {
45 : pub tenant_id: TenantId,
46 : pub shard_number: ShardNumber,
47 : pub shard_count: ShardCount,
48 : }
49 :
50 : impl ShardCount {
51 : pub const MAX: Self = Self(u8::MAX);
52 : pub const MIN: Self = Self(0);
53 :
54 : /// The internal value of a ShardCount may be zero, which means "1 shard, but use
55 : /// legacy format for TenantShardId that excludes the shard suffix", also known
56 : /// as [`TenantShardId::unsharded`].
57 : ///
58 : /// This method returns the actual number of shards, i.e. if our internal value is
59 : /// zero, we return 1 (unsharded tenants have 1 shard).
60 14412666 : pub fn count(&self) -> u8 {
61 14412666 : if self.0 > 0 {
62 11 : self.0
63 : } else {
64 14412655 : 1
65 : }
66 14412666 : }
67 :
68 : /// The literal internal value: this is **not** the number of shards in the
69 : /// tenant, as we have a special zero value for legacy unsharded tenants. Use
70 : /// [`Self::count`] if you want to know the cardinality of shards.
71 2 : pub fn literal(&self) -> u8 {
72 2 : self.0
73 2 : }
74 :
75 : /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
76 : /// uses the legacy format for `TenantShardId`. See also the documentation for
77 : /// [`Self::count`].
78 0 : pub fn is_unsharded(&self) -> bool {
79 0 : self.0 == 0
80 0 : }
81 :
82 : /// `v` may be zero, or the number of shards in the tenant. `v` is what
83 : /// [`Self::literal`] would return.
84 5725 : pub const fn new(val: u8) -> Self {
85 5725 : Self(val)
86 5725 : }
87 : }
88 :
89 : impl ShardNumber {
90 : pub const MAX: Self = Self(u8::MAX);
91 : }
92 :
93 : impl TenantShardId {
94 95 : pub fn unsharded(tenant_id: TenantId) -> Self {
95 95 : Self {
96 95 : tenant_id,
97 95 : shard_number: ShardNumber(0),
98 95 : shard_count: ShardCount(0),
99 95 : }
100 95 : }
101 :
102 : /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
103 : /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
104 0 : pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
105 0 : RangeInclusive::new(
106 0 : Self {
107 0 : tenant_id,
108 0 : shard_number: ShardNumber(0),
109 0 : shard_count: ShardCount(0),
110 0 : },
111 0 : Self {
112 0 : tenant_id,
113 0 : shard_number: ShardNumber::MAX,
114 0 : shard_count: ShardCount::MAX,
115 0 : },
116 0 : )
117 0 : }
118 :
119 33933 : pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
120 33933 : ShardSlug(self)
121 33933 : }
122 :
123 : /// Convenience for code that has special behavior on the 0th shard.
124 18 : pub fn is_shard_zero(&self) -> bool {
125 18 : self.shard_number == ShardNumber(0)
126 18 : }
127 :
128 : /// The "unsharded" value is distinct from simply having a single shard: it represents
129 : /// a tenant which is not shard-aware at all, and whose storage paths will not include
130 : /// a shard suffix.
131 0 : pub fn is_unsharded(&self) -> bool {
132 0 : self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
133 0 : }
134 :
135 : /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
136 : /// is useful when logging from code that is already in a span that includes tenant ID, to
137 : /// keep messages reasonably terse.
138 0 : pub fn to_index(&self) -> ShardIndex {
139 0 : ShardIndex {
140 0 : shard_number: self.shard_number,
141 0 : shard_count: self.shard_count,
142 0 : }
143 0 : }
144 :
145 : /// Calculate the children of this TenantShardId when splitting the overall tenant into
146 : /// the given number of shards.
147 4 : pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
148 4 : let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
149 4 : let mut child_shards = Vec::new();
150 16 : for shard_number in 0..ShardNumber(new_shard_count.0).0 {
151 : // Key mapping is based on a round robin mapping of key hash modulo shard count,
152 : // so our child shards are the ones which the same keys would map to.
153 16 : if shard_number % effective_old_shard_count == self.shard_number.0 {
154 12 : child_shards.push(TenantShardId {
155 12 : tenant_id: self.tenant_id,
156 12 : shard_number: ShardNumber(shard_number),
157 12 : shard_count: new_shard_count,
158 12 : })
159 4 : }
160 : }
161 :
162 4 : child_shards
163 4 : }
164 : }
165 :
166 : impl<'a> std::fmt::Display for ShardSlug<'a> {
167 33933 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
168 33933 : write!(
169 33933 : f,
170 33933 : "{:02x}{:02x}",
171 33933 : self.0.shard_number.0, self.0.shard_count.0
172 33933 : )
173 33933 : }
174 : }
175 :
176 : impl std::fmt::Display for TenantShardId {
177 37001 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
178 37001 : if self.shard_count != ShardCount(0) {
179 355 : write!(f, "{}-{}", self.tenant_id, self.shard_slug())
180 : } else {
181 : // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
182 : // is distinct from the normal single shard case (shard count == 1).
183 36646 : self.tenant_id.fmt(f)
184 : }
185 37001 : }
186 : }
187 :
188 : impl std::fmt::Debug for TenantShardId {
189 9384 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190 9384 : // Debug is the same as Display: the compact hex representation
191 9384 : write!(f, "{}", self)
192 9384 : }
193 : }
194 :
195 : impl std::str::FromStr for TenantShardId {
196 : type Err = hex::FromHexError;
197 :
198 13047 : fn from_str(s: &str) -> Result<Self, Self::Err> {
199 13047 : // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
200 13047 : if s.len() == 32 {
201 : // Legacy case: no shard specified
202 : Ok(Self {
203 12949 : tenant_id: TenantId::from_str(s)?,
204 12949 : shard_number: ShardNumber(0),
205 12949 : shard_count: ShardCount(0),
206 : })
207 98 : } else if s.len() == 37 {
208 98 : let bytes = s.as_bytes();
209 98 : let tenant_id = TenantId::from_hex(&bytes[0..32])?;
210 98 : let mut shard_parts: [u8; 2] = [0u8; 2];
211 98 : hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
212 98 : Ok(Self {
213 98 : tenant_id,
214 98 : shard_number: ShardNumber(shard_parts[0]),
215 98 : shard_count: ShardCount(shard_parts[1]),
216 98 : })
217 : } else {
218 0 : Err(hex::FromHexError::InvalidStringLength)
219 : }
220 13047 : }
221 : }
222 :
223 : impl From<[u8; 18]> for TenantShardId {
224 2 : fn from(b: [u8; 18]) -> Self {
225 2 : let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
226 2 :
227 2 : Self {
228 2 : tenant_id: TenantId::from(tenant_id_bytes),
229 2 : shard_number: ShardNumber(b[16]),
230 2 : shard_count: ShardCount(b[17]),
231 2 : }
232 2 : }
233 : }
234 :
235 : impl ShardIndex {
236 0 : pub fn new(number: ShardNumber, count: ShardCount) -> Self {
237 0 : Self {
238 0 : shard_number: number,
239 0 : shard_count: count,
240 0 : }
241 0 : }
242 306 : pub fn unsharded() -> Self {
243 306 : Self {
244 306 : shard_number: ShardNumber(0),
245 306 : shard_count: ShardCount(0),
246 306 : }
247 306 : }
248 :
249 : /// The "unsharded" value is distinct from simply having a single shard: it represents
250 : /// a tenant which is not shard-aware at all, and whose storage paths will not include
251 : /// a shard suffix.
252 216385 : pub fn is_unsharded(&self) -> bool {
253 216385 : self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
254 216385 : }
255 :
256 : /// For use in constructing remote storage paths: concatenate this with a TenantId
257 : /// to get a fully qualified TenantShardId.
258 : ///
259 : /// Backward compat: this function returns an empty string if Self::is_unsharded, such
260 : /// that the legacy pre-sharding remote key format is preserved.
261 4151 : pub fn get_suffix(&self) -> String {
262 4151 : if self.is_unsharded() {
263 4127 : "".to_string()
264 : } else {
265 24 : format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
266 : }
267 4151 : }
268 : }
269 :
270 : impl std::fmt::Display for ShardIndex {
271 5990 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
272 5990 : write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
273 5990 : }
274 : }
275 :
276 : impl std::fmt::Debug for ShardIndex {
277 4548 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
278 4548 : // Debug is the same as Display: the compact hex representation
279 4548 : write!(f, "{}", self)
280 4548 : }
281 : }
282 :
283 : impl std::str::FromStr for ShardIndex {
284 : type Err = hex::FromHexError;
285 :
286 9385 : fn from_str(s: &str) -> Result<Self, Self::Err> {
287 9385 : // Expect format: 1 byte shard number, 1 byte shard count
288 9385 : if s.len() == 4 {
289 9385 : let bytes = s.as_bytes();
290 9385 : let mut shard_parts: [u8; 2] = [0u8; 2];
291 9385 : hex::decode_to_slice(bytes, &mut shard_parts)?;
292 9385 : Ok(Self {
293 9385 : shard_number: ShardNumber(shard_parts[0]),
294 9385 : shard_count: ShardCount(shard_parts[1]),
295 9385 : })
296 : } else {
297 0 : Err(hex::FromHexError::InvalidStringLength)
298 : }
299 9385 : }
300 : }
301 :
302 : impl From<[u8; 2]> for ShardIndex {
303 1 : fn from(b: [u8; 2]) -> Self {
304 1 : Self {
305 1 : shard_number: ShardNumber(b[0]),
306 1 : shard_count: ShardCount(b[1]),
307 1 : }
308 1 : }
309 : }
310 :
311 : impl Serialize for TenantShardId {
312 126 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
313 126 : where
314 126 : S: serde::Serializer,
315 126 : {
316 126 : if serializer.is_human_readable() {
317 122 : serializer.collect_str(self)
318 : } else {
319 : // Note: while human encoding of [`TenantShardId`] is backward and forward
320 : // compatible, this binary encoding is not.
321 4 : let mut packed: [u8; 18] = [0; 18];
322 4 : packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
323 4 : packed[16] = self.shard_number.0;
324 4 : packed[17] = self.shard_count.0;
325 4 :
326 4 : packed.serialize(serializer)
327 : }
328 126 : }
329 : }
330 :
331 : impl<'de> Deserialize<'de> for TenantShardId {
332 21 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
333 21 : where
334 21 : D: serde::Deserializer<'de>,
335 21 : {
336 21 : struct IdVisitor {
337 21 : is_human_readable_deserializer: bool,
338 21 : }
339 21 :
340 21 : impl<'de> serde::de::Visitor<'de> for IdVisitor {
341 21 : type Value = TenantShardId;
342 21 :
343 21 : fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
344 0 : if self.is_human_readable_deserializer {
345 21 : formatter.write_str("value in form of hex string")
346 21 : } else {
347 21 : formatter.write_str("value in form of integer array([u8; 18])")
348 21 : }
349 21 : }
350 21 :
351 21 : fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
352 2 : where
353 2 : A: serde::de::SeqAccess<'de>,
354 2 : {
355 2 : let s = serde::de::value::SeqAccessDeserializer::new(seq);
356 21 : let id: [u8; 18] = Deserialize::deserialize(s)?;
357 21 : Ok(TenantShardId::from(id))
358 21 : }
359 21 :
360 21 : fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
361 19 : where
362 19 : E: serde::de::Error,
363 19 : {
364 19 : TenantShardId::from_str(v).map_err(E::custom)
365 19 : }
366 21 : }
367 21 :
368 21 : if deserializer.is_human_readable() {
369 19 : deserializer.deserialize_str(IdVisitor {
370 19 : is_human_readable_deserializer: true,
371 19 : })
372 : } else {
373 2 : deserializer.deserialize_tuple(
374 2 : 18,
375 2 : IdVisitor {
376 2 : is_human_readable_deserializer: false,
377 2 : },
378 2 : )
379 : }
380 21 : }
381 : }
382 :
383 : impl Serialize for ShardIndex {
384 50 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
385 50 : where
386 50 : S: serde::Serializer,
387 50 : {
388 50 : if serializer.is_human_readable() {
389 48 : serializer.collect_str(self)
390 : } else {
391 : // Binary encoding is not used in index_part.json, but is included in anticipation of
392 : // switching various structures (e.g. inter-process communication, remote metadata) to more
393 : // compact binary encodings in future.
394 2 : let mut packed: [u8; 2] = [0; 2];
395 2 : packed[0] = self.shard_number.0;
396 2 : packed[1] = self.shard_count.0;
397 2 : packed.serialize(serializer)
398 : }
399 50 : }
400 : }
401 :
402 : impl<'de> Deserialize<'de> for ShardIndex {
403 9385 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
404 9385 : where
405 9385 : D: serde::Deserializer<'de>,
406 9385 : {
407 9385 : struct IdVisitor {
408 9385 : is_human_readable_deserializer: bool,
409 9385 : }
410 9385 :
411 9385 : impl<'de> serde::de::Visitor<'de> for IdVisitor {
412 9385 : type Value = ShardIndex;
413 9385 :
414 9385 : fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
415 0 : if self.is_human_readable_deserializer {
416 9385 : formatter.write_str("value in form of hex string")
417 9385 : } else {
418 9385 : formatter.write_str("value in form of integer array([u8; 2])")
419 9385 : }
420 9385 : }
421 9385 :
422 9385 : fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
423 1 : where
424 1 : A: serde::de::SeqAccess<'de>,
425 1 : {
426 1 : let s = serde::de::value::SeqAccessDeserializer::new(seq);
427 9385 : let id: [u8; 2] = Deserialize::deserialize(s)?;
428 9385 : Ok(ShardIndex::from(id))
429 9385 : }
430 9385 :
431 9385 : fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
432 9384 : where
433 9384 : E: serde::de::Error,
434 9384 : {
435 9384 : ShardIndex::from_str(v).map_err(E::custom)
436 9384 : }
437 9385 : }
438 9385 :
439 9385 : if deserializer.is_human_readable() {
440 9384 : deserializer.deserialize_str(IdVisitor {
441 9384 : is_human_readable_deserializer: true,
442 9384 : })
443 : } else {
444 1 : deserializer.deserialize_tuple(
445 1 : 2,
446 1 : IdVisitor {
447 1 : is_human_readable_deserializer: false,
448 1 : },
449 1 : )
450 : }
451 9385 : }
452 : }
|