Line data Source code
1 : //! See `pageserver_api::shard` for description on sharding.
2 :
3 : use std::{ops::RangeInclusive, str::FromStr};
4 :
5 : use hex::FromHex;
6 : use serde::{Deserialize, Serialize};
7 :
8 : use crate::id::TenantId;
9 :
10 0 : #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
11 : pub struct ShardNumber(pub u8);
12 :
13 0 : #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
14 : pub struct ShardCount(pub u8);
15 :
16 : /// Combination of ShardNumber and ShardCount.
17 : ///
18 : /// For use within the context of a particular tenant, when we need to know which shard we're
19 : /// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
20 : /// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
21 : #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
22 : pub struct ShardIndex {
23 : pub shard_number: ShardNumber,
24 : pub shard_count: ShardCount,
25 : }
26 :
27 : /// Formatting helper, for generating the `shard_id` label in traces.
28 : pub struct ShardSlug<'a>(&'a TenantShardId);
29 :
30 : /// TenantShardId globally identifies a particular shard in a particular tenant.
31 : ///
32 : /// These are written as `<TenantId>-<ShardSlug>`, for example:
33 : /// # The second shard in a two-shard tenant
34 : /// 072f1291a5310026820b2fe4b2968934-0102
35 : ///
36 : /// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
37 : /// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
38 : /// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
39 : ///
40 : /// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
41 : /// is both forward and backward compatible with TenantId: a legacy TenantId can be
42 : /// decoded as a TenantShardId, and when re-encoded it will be parseable
43 : /// as a TenantId.
44 : #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
45 : pub struct TenantShardId {
46 : pub tenant_id: TenantId,
47 : pub shard_number: ShardNumber,
48 : pub shard_count: ShardCount,
49 : }
50 :
51 : impl ShardCount {
52 : pub const MAX: Self = Self(u8::MAX);
53 : pub const MIN: Self = Self(0);
54 :
55 : /// The internal value of a ShardCount may be zero, which means "1 shard, but use
56 : /// legacy format for TenantShardId that excludes the shard suffix", also known
57 : /// as [`TenantShardId::unsharded`].
58 : ///
59 : /// This method returns the actual number of shards, i.e. if our internal value is
60 : /// zero, we return 1 (unsharded tenants have 1 shard).
61 9613426 : pub fn count(&self) -> u8 {
62 9613426 : if self.0 > 0 {
63 5021 : self.0
64 : } else {
65 9608405 : 1
66 : }
67 9613426 : }
68 :
69 : /// The literal internal value: this is **not** the number of shards in the
70 : /// tenant, as we have a special zero value for legacy unsharded tenants. Use
71 : /// [`Self::count`] if you want to know the cardinality of shards.
72 2 : pub fn literal(&self) -> u8 {
73 2 : self.0
74 2 : }
75 :
76 : /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
77 : /// uses the legacy format for `TenantShardId`. See also the documentation for
78 : /// [`Self::count`].
79 0 : pub fn is_unsharded(&self) -> bool {
80 0 : self.0 == 0
81 0 : }
82 :
83 : /// `v` may be zero, or the number of shards in the tenant. `v` is what
84 : /// [`Self::literal`] would return.
85 9998 : pub const fn new(val: u8) -> Self {
86 9998 : Self(val)
87 9998 : }
88 : }
89 :
90 : impl ShardNumber {
91 : pub const MAX: Self = Self(u8::MAX);
92 : }
93 :
94 : impl TenantShardId {
95 53 : pub fn unsharded(tenant_id: TenantId) -> Self {
96 53 : Self {
97 53 : tenant_id,
98 53 : shard_number: ShardNumber(0),
99 53 : shard_count: ShardCount(0),
100 53 : }
101 53 : }
102 :
103 : /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
104 : /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
105 0 : pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
106 0 : RangeInclusive::new(
107 0 : Self {
108 0 : tenant_id,
109 0 : shard_number: ShardNumber(0),
110 0 : shard_count: ShardCount(0),
111 0 : },
112 0 : Self {
113 0 : tenant_id,
114 0 : shard_number: ShardNumber::MAX,
115 0 : shard_count: ShardCount::MAX,
116 0 : },
117 0 : )
118 0 : }
119 :
120 0 : pub fn range(&self) -> RangeInclusive<Self> {
121 0 : RangeInclusive::new(*self, *self)
122 0 : }
123 :
124 38038 : pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
125 38038 : ShardSlug(self)
126 38038 : }
127 :
128 : /// Convenience for code that has special behavior on the 0th shard.
129 1130 : pub fn is_shard_zero(&self) -> bool {
130 1130 : self.shard_number == ShardNumber(0)
131 1130 : }
132 :
133 : /// The "unsharded" value is distinct from simply having a single shard: it represents
134 : /// a tenant which is not shard-aware at all, and whose storage paths will not include
135 : /// a shard suffix.
136 0 : pub fn is_unsharded(&self) -> bool {
137 0 : self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
138 0 : }
139 :
140 : /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
141 : /// is useful when logging from code that is already in a span that includes tenant ID, to
142 : /// keep messages reasonably terse.
143 0 : pub fn to_index(&self) -> ShardIndex {
144 0 : ShardIndex {
145 0 : shard_number: self.shard_number,
146 0 : shard_count: self.shard_count,
147 0 : }
148 0 : }
149 :
150 : /// Calculate the children of this TenantShardId when splitting the overall tenant into
151 : /// the given number of shards.
152 8 : pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
153 8 : let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
154 8 : let mut child_shards = Vec::new();
155 48 : for shard_number in 0..ShardNumber(new_shard_count.0).0 {
156 : // Key mapping is based on a round robin mapping of key hash modulo shard count,
157 : // so our child shards are the ones which the same keys would map to.
158 48 : if shard_number % effective_old_shard_count == self.shard_number.0 {
159 44 : child_shards.push(TenantShardId {
160 44 : tenant_id: self.tenant_id,
161 44 : shard_number: ShardNumber(shard_number),
162 44 : shard_count: new_shard_count,
163 44 : })
164 4 : }
165 : }
166 :
167 8 : child_shards
168 8 : }
169 : }
170 :
171 : impl std::fmt::Display for ShardNumber {
172 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
173 0 : self.0.fmt(f)
174 0 : }
175 : }
176 :
177 : impl std::fmt::Display for ShardSlug<'_> {
178 25143 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
179 25143 : write!(
180 25143 : f,
181 25143 : "{:02x}{:02x}",
182 25143 : self.0.shard_number.0, self.0.shard_count.0
183 25143 : )
184 25143 : }
185 : }
186 :
187 : impl std::fmt::Display for TenantShardId {
188 28428 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
189 28428 : if self.shard_count != ShardCount(0) {
190 321 : write!(f, "{}-{}", self.tenant_id, self.shard_slug())
191 : } else {
192 : // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
193 : // is distinct from the normal single shard case (shard count == 1).
194 28107 : self.tenant_id.fmt(f)
195 : }
196 28428 : }
197 : }
198 :
199 : impl std::fmt::Debug for TenantShardId {
200 6264 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
201 6264 : // Debug is the same as Display: the compact hex representation
202 6264 : write!(f, "{}", self)
203 6264 : }
204 : }
205 :
206 : impl std::str::FromStr for TenantShardId {
207 : type Err = hex::FromHexError;
208 :
209 9244 : fn from_str(s: &str) -> Result<Self, Self::Err> {
210 9244 : // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
211 9244 : if s.len() == 32 {
212 : // Legacy case: no shard specified
213 : Ok(Self {
214 9174 : tenant_id: TenantId::from_str(s)?,
215 9174 : shard_number: ShardNumber(0),
216 9174 : shard_count: ShardCount(0),
217 : })
218 70 : } else if s.len() == 37 {
219 70 : let bytes = s.as_bytes();
220 70 : let tenant_id = TenantId::from_hex(&bytes[0..32])?;
221 70 : let mut shard_parts: [u8; 2] = [0u8; 2];
222 70 : hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
223 70 : Ok(Self {
224 70 : tenant_id,
225 70 : shard_number: ShardNumber(shard_parts[0]),
226 70 : shard_count: ShardCount(shard_parts[1]),
227 70 : })
228 : } else {
229 0 : Err(hex::FromHexError::InvalidStringLength)
230 : }
231 9244 : }
232 : }
233 :
234 : impl From<[u8; 18]> for TenantShardId {
235 94 : fn from(b: [u8; 18]) -> Self {
236 94 : let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
237 94 :
238 94 : Self {
239 94 : tenant_id: TenantId::from(tenant_id_bytes),
240 94 : shard_number: ShardNumber(b[16]),
241 94 : shard_count: ShardCount(b[17]),
242 94 : }
243 94 : }
244 : }
245 :
246 : impl ShardIndex {
247 28 : pub fn new(number: ShardNumber, count: ShardCount) -> Self {
248 28 : Self {
249 28 : shard_number: number,
250 28 : shard_count: count,
251 28 : }
252 28 : }
253 316 : pub fn unsharded() -> Self {
254 316 : Self {
255 316 : shard_number: ShardNumber(0),
256 316 : shard_count: ShardCount(0),
257 316 : }
258 316 : }
259 :
260 : /// The "unsharded" value is distinct from simply having a single shard: it represents
261 : /// a tenant which is not shard-aware at all, and whose storage paths will not include
262 : /// a shard suffix.
263 148677 : pub fn is_unsharded(&self) -> bool {
264 148677 : self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
265 148677 : }
266 :
267 : /// For use in constructing remote storage paths: concatenate this with a TenantId
268 : /// to get a fully qualified TenantShardId.
269 : ///
270 : /// Backward compat: this function returns an empty string if Self::is_unsharded, such
271 : /// that the legacy pre-sharding remote key format is preserved.
272 3973 : pub fn get_suffix(&self) -> String {
273 3973 : if self.is_unsharded() {
274 3957 : "".to_string()
275 : } else {
276 16 : format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
277 : }
278 3973 : }
279 : }
280 :
281 : impl std::fmt::Display for ShardIndex {
282 4504 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
283 4504 : write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
284 4504 : }
285 : }
286 :
287 : impl std::fmt::Debug for ShardIndex {
288 3468 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
289 3468 : // Debug is the same as Display: the compact hex representation
290 3468 : write!(f, "{}", self)
291 3468 : }
292 : }
293 :
294 : impl std::str::FromStr for ShardIndex {
295 : type Err = hex::FromHexError;
296 :
297 6257 : fn from_str(s: &str) -> Result<Self, Self::Err> {
298 6257 : // Expect format: 1 byte shard number, 1 byte shard count
299 6257 : if s.len() == 4 {
300 6257 : let bytes = s.as_bytes();
301 6257 : let mut shard_parts: [u8; 2] = [0u8; 2];
302 6257 : hex::decode_to_slice(bytes, &mut shard_parts)?;
303 6257 : Ok(Self {
304 6257 : shard_number: ShardNumber(shard_parts[0]),
305 6257 : shard_count: ShardCount(shard_parts[1]),
306 6257 : })
307 : } else {
308 0 : Err(hex::FromHexError::InvalidStringLength)
309 : }
310 6257 : }
311 : }
312 :
313 : impl From<[u8; 2]> for ShardIndex {
314 1 : fn from(b: [u8; 2]) -> Self {
315 1 : Self {
316 1 : shard_number: ShardNumber(b[0]),
317 1 : shard_count: ShardCount(b[1]),
318 1 : }
319 1 : }
320 : }
321 :
322 : impl Serialize for TenantShardId {
323 86 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
324 86 : where
325 86 : S: serde::Serializer,
326 86 : {
327 86 : if serializer.is_human_readable() {
328 82 : serializer.collect_str(self)
329 : } else {
330 : // Note: while human encoding of [`TenantShardId`] is backward and forward
331 : // compatible, this binary encoding is not.
332 4 : let mut packed: [u8; 18] = [0; 18];
333 4 : packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
334 4 : packed[16] = self.shard_number.0;
335 4 : packed[17] = self.shard_count.0;
336 4 :
337 4 : packed.serialize(serializer)
338 : }
339 86 : }
340 : }
341 :
342 : impl<'de> Deserialize<'de> for TenantShardId {
343 15 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
344 15 : where
345 15 : D: serde::Deserializer<'de>,
346 15 : {
347 : struct IdVisitor {
348 : is_human_readable_deserializer: bool,
349 : }
350 :
351 : impl<'de> serde::de::Visitor<'de> for IdVisitor {
352 : type Value = TenantShardId;
353 :
354 0 : fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
355 0 : if self.is_human_readable_deserializer {
356 0 : formatter.write_str("value in form of hex string")
357 : } else {
358 0 : formatter.write_str("value in form of integer array([u8; 18])")
359 : }
360 0 : }
361 :
362 2 : fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
363 2 : where
364 2 : A: serde::de::SeqAccess<'de>,
365 2 : {
366 2 : let s = serde::de::value::SeqAccessDeserializer::new(seq);
367 2 : let id: [u8; 18] = Deserialize::deserialize(s)?;
368 2 : Ok(TenantShardId::from(id))
369 2 : }
370 :
371 13 : fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
372 13 : where
373 13 : E: serde::de::Error,
374 13 : {
375 13 : TenantShardId::from_str(v).map_err(E::custom)
376 13 : }
377 : }
378 :
379 15 : if deserializer.is_human_readable() {
380 13 : deserializer.deserialize_str(IdVisitor {
381 13 : is_human_readable_deserializer: true,
382 13 : })
383 : } else {
384 2 : deserializer.deserialize_tuple(
385 2 : 18,
386 2 : IdVisitor {
387 2 : is_human_readable_deserializer: false,
388 2 : },
389 2 : )
390 : }
391 15 : }
392 : }
393 :
394 : impl Serialize for ShardIndex {
395 34 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
396 34 : where
397 34 : S: serde::Serializer,
398 34 : {
399 34 : if serializer.is_human_readable() {
400 32 : serializer.collect_str(self)
401 : } else {
402 : // Binary encoding is not used in index_part.json, but is included in anticipation of
403 : // switching various structures (e.g. inter-process communication, remote metadata) to more
404 : // compact binary encodings in future.
405 2 : let mut packed: [u8; 2] = [0; 2];
406 2 : packed[0] = self.shard_number.0;
407 2 : packed[1] = self.shard_count.0;
408 2 : packed.serialize(serializer)
409 : }
410 34 : }
411 : }
412 :
413 : impl<'de> Deserialize<'de> for ShardIndex {
414 6257 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
415 6257 : where
416 6257 : D: serde::Deserializer<'de>,
417 6257 : {
418 : struct IdVisitor {
419 : is_human_readable_deserializer: bool,
420 : }
421 :
422 : impl<'de> serde::de::Visitor<'de> for IdVisitor {
423 : type Value = ShardIndex;
424 :
425 0 : fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
426 0 : if self.is_human_readable_deserializer {
427 0 : formatter.write_str("value in form of hex string")
428 : } else {
429 0 : formatter.write_str("value in form of integer array([u8; 2])")
430 : }
431 0 : }
432 :
433 1 : fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
434 1 : where
435 1 : A: serde::de::SeqAccess<'de>,
436 1 : {
437 1 : let s = serde::de::value::SeqAccessDeserializer::new(seq);
438 1 : let id: [u8; 2] = Deserialize::deserialize(s)?;
439 1 : Ok(ShardIndex::from(id))
440 1 : }
441 :
442 6256 : fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
443 6256 : where
444 6256 : E: serde::de::Error,
445 6256 : {
446 6256 : ShardIndex::from_str(v).map_err(E::custom)
447 6256 : }
448 : }
449 :
450 6257 : if deserializer.is_human_readable() {
451 6256 : deserializer.deserialize_str(IdVisitor {
452 6256 : is_human_readable_deserializer: true,
453 6256 : })
454 : } else {
455 1 : deserializer.deserialize_tuple(
456 1 : 2,
457 1 : IdVisitor {
458 1 : is_human_readable_deserializer: false,
459 1 : },
460 1 : )
461 : }
462 6257 : }
463 : }
|