Line data Source code
1 : //! See `pageserver_api::shard` for description on sharding.
2 :
3 : use std::ops::RangeInclusive;
4 : use std::str::FromStr;
5 :
6 : use hex::FromHex;
7 : use serde::{Deserialize, Serialize};
8 :
9 : use crate::id::TenantId;
10 :
11 0 : #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
12 : pub struct ShardNumber(pub u8);
13 :
14 0 : #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
15 : pub struct ShardCount(pub u8);
16 :
17 : /// Combination of ShardNumber and ShardCount.
18 : ///
19 : /// For use within the context of a particular tenant, when we need to know which shard we're
20 : /// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
21 : /// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
22 : #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
23 : pub struct ShardIndex {
24 : pub shard_number: ShardNumber,
25 : pub shard_count: ShardCount,
26 : }
27 :
28 : /// Stripe size as number of pages.
29 : ///
30 : /// NB: don't implement Default, so callers don't lazily use it by mistake. See DEFAULT_STRIPE_SIZE.
31 0 : #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
32 : pub struct ShardStripeSize(pub u32);
33 :
34 : /// Formatting helper, for generating the `shard_id` label in traces.
35 : pub struct ShardSlug<'a>(&'a TenantShardId);
36 :
37 : /// TenantShardId globally identifies a particular shard in a particular tenant.
38 : ///
39 : /// These are written as `<TenantId>-<ShardSlug>`, for example:
40 : /// # The second shard in a two-shard tenant
41 : /// 072f1291a5310026820b2fe4b2968934-0102
42 : ///
43 : /// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
44 : /// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
45 : /// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
46 : ///
47 : /// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
48 : /// is both forward and backward compatible with TenantId: a legacy TenantId can be
49 : /// decoded as a TenantShardId, and when re-encoded it will be parseable
50 : /// as a TenantId.
51 : #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
52 : pub struct TenantShardId {
53 : pub tenant_id: TenantId,
54 : pub shard_number: ShardNumber,
55 : pub shard_count: ShardCount,
56 : }
57 :
58 : impl ShardCount {
59 : pub const MAX: Self = Self(u8::MAX);
60 : pub const MIN: Self = Self(0);
61 :
62 : /// The internal value of a ShardCount may be zero, which means "1 shard, but use
63 : /// legacy format for TenantShardId that excludes the shard suffix", also known
64 : /// as [`TenantShardId::unsharded`].
65 : ///
66 : /// This method returns the actual number of shards, i.e. if our internal value is
67 : /// zero, we return 1 (unsharded tenants have 1 shard).
68 2407224 : pub fn count(&self) -> u8 {
69 2407224 : if self.0 > 0 { self.0 } else { 1 }
70 2407224 : }
71 :
72 : /// The literal internal value: this is **not** the number of shards in the
73 : /// tenant, as we have a special zero value for legacy unsharded tenants. Use
74 : /// [`Self::count`] if you want to know the cardinality of shards.
75 2 : pub fn literal(&self) -> u8 {
76 2 : self.0
77 2 : }
78 :
79 : /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
80 : /// uses the legacy format for `TenantShardId`. See also the documentation for
81 : /// [`Self::count`].
82 0 : pub fn is_unsharded(&self) -> bool {
83 0 : self.0 == 0
84 0 : }
85 :
86 : /// `v` may be zero, or the number of shards in the tenant. `v` is what
87 : /// [`Self::literal`] would return.
88 10496 : pub const fn new(val: u8) -> Self {
89 10496 : Self(val)
90 10496 : }
91 : }
92 :
93 : impl ShardNumber {
94 : pub const MAX: Self = Self(u8::MAX);
95 : }
96 :
97 : impl TenantShardId {
98 46 : pub fn unsharded(tenant_id: TenantId) -> Self {
99 46 : Self {
100 46 : tenant_id,
101 46 : shard_number: ShardNumber(0),
102 46 : shard_count: ShardCount(0),
103 46 : }
104 46 : }
105 :
106 : /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
107 : /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
108 0 : pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
109 0 : RangeInclusive::new(
110 0 : Self {
111 0 : tenant_id,
112 0 : shard_number: ShardNumber(0),
113 0 : shard_count: ShardCount(0),
114 0 : },
115 0 : Self {
116 0 : tenant_id,
117 0 : shard_number: ShardNumber::MAX,
118 0 : shard_count: ShardCount::MAX,
119 0 : },
120 : )
121 0 : }
122 :
123 0 : pub fn range(&self) -> RangeInclusive<Self> {
124 0 : RangeInclusive::new(*self, *self)
125 0 : }
126 :
127 19026 : pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
128 19026 : ShardSlug(self)
129 19026 : }
130 :
131 : /// Convenience for code that has special behavior on the 0th shard.
132 310 : pub fn is_shard_zero(&self) -> bool {
133 310 : self.shard_number == ShardNumber(0)
134 310 : }
135 :
136 : /// The "unsharded" value is distinct from simply having a single shard: it represents
137 : /// a tenant which is not shard-aware at all, and whose storage paths will not include
138 : /// a shard suffix.
139 0 : pub fn is_unsharded(&self) -> bool {
140 0 : self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
141 0 : }
142 :
143 : /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
144 : /// is useful when logging from code that is already in a span that includes tenant ID, to
145 : /// keep messages reasonably terse.
146 0 : pub fn to_index(&self) -> ShardIndex {
147 0 : ShardIndex {
148 0 : shard_number: self.shard_number,
149 0 : shard_count: self.shard_count,
150 0 : }
151 0 : }
152 :
153 : /// Calculate the children of this TenantShardId when splitting the overall tenant into
154 : /// the given number of shards.
155 5 : pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
156 5 : let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
157 5 : let mut child_shards = Vec::new();
158 24 : for shard_number in 0..ShardNumber(new_shard_count.0).0 {
159 : // Key mapping is based on a round robin mapping of key hash modulo shard count,
160 : // so our child shards are the ones which the same keys would map to.
161 24 : if shard_number % effective_old_shard_count == self.shard_number.0 {
162 20 : child_shards.push(TenantShardId {
163 20 : tenant_id: self.tenant_id,
164 20 : shard_number: ShardNumber(shard_number),
165 20 : shard_count: new_shard_count,
166 20 : })
167 4 : }
168 : }
169 :
170 5 : child_shards
171 5 : }
172 : }
173 :
174 : impl std::fmt::Display for ShardNumber {
175 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
176 0 : self.0.fmt(f)
177 0 : }
178 : }
179 :
180 : impl std::fmt::Display for ShardCount {
181 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
182 0 : self.0.fmt(f)
183 0 : }
184 : }
185 :
186 : impl std::fmt::Display for ShardStripeSize {
187 0 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
188 0 : self.0.fmt(f)
189 0 : }
190 : }
191 :
192 : impl std::fmt::Display for ShardSlug<'_> {
193 6123 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
194 6123 : write!(
195 6123 : f,
196 6123 : "{:02x}{:02x}",
197 : self.0.shard_number.0, self.0.shard_count.0
198 : )
199 6123 : }
200 : }
201 :
202 : impl std::fmt::Display for TenantShardId {
203 11743 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
204 11743 : if self.shard_count != ShardCount(0) {
205 118 : write!(f, "{}-{}", self.tenant_id, self.shard_slug())
206 : } else {
207 : // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
208 : // is distinct from the normal single shard case (shard count == 1).
209 11625 : self.tenant_id.fmt(f)
210 : }
211 11743 : }
212 : }
213 :
214 : impl std::fmt::Debug for TenantShardId {
215 5681 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
216 : // Debug is the same as Display: the compact hex representation
217 5681 : write!(f, "{self}")
218 5681 : }
219 : }
220 :
221 : impl std::str::FromStr for TenantShardId {
222 : type Err = hex::FromHexError;
223 :
224 16 : fn from_str(s: &str) -> Result<Self, Self::Err> {
225 : // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
226 16 : if s.len() == 32 {
227 : // Legacy case: no shard specified
228 : Ok(Self {
229 13 : tenant_id: TenantId::from_str(s)?,
230 13 : shard_number: ShardNumber(0),
231 13 : shard_count: ShardCount(0),
232 : })
233 3 : } else if s.len() == 37 {
234 3 : let bytes = s.as_bytes();
235 3 : let tenant_id = TenantId::from_hex(&bytes[0..32])?;
236 3 : let mut shard_parts: [u8; 2] = [0u8; 2];
237 3 : hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
238 3 : Ok(Self {
239 3 : tenant_id,
240 3 : shard_number: ShardNumber(shard_parts[0]),
241 3 : shard_count: ShardCount(shard_parts[1]),
242 3 : })
243 : } else {
244 0 : Err(hex::FromHexError::InvalidStringLength)
245 : }
246 16 : }
247 : }
248 :
249 : impl From<[u8; 18]> for TenantShardId {
250 25 : fn from(b: [u8; 18]) -> Self {
251 25 : let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
252 :
253 25 : Self {
254 25 : tenant_id: TenantId::from(tenant_id_bytes),
255 25 : shard_number: ShardNumber(b[16]),
256 25 : shard_count: ShardCount(b[17]),
257 25 : }
258 25 : }
259 : }
260 :
261 : impl ShardIndex {
262 7 : pub fn new(number: ShardNumber, count: ShardCount) -> Self {
263 7 : Self {
264 7 : shard_number: number,
265 7 : shard_count: count,
266 7 : }
267 7 : }
268 83 : pub fn unsharded() -> Self {
269 83 : Self {
270 83 : shard_number: ShardNumber(0),
271 83 : shard_count: ShardCount(0),
272 83 : }
273 83 : }
274 :
275 : /// The "unsharded" value is distinct from simply having a single shard: it represents
276 : /// a tenant which is not shard-aware at all, and whose storage paths will not include
277 : /// a shard suffix.
278 37715 : pub fn is_unsharded(&self) -> bool {
279 37715 : self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
280 37715 : }
281 :
282 : /// For use in constructing remote storage paths: concatenate this with a TenantId
283 : /// to get a fully qualified TenantShardId.
284 : ///
285 : /// Backward compat: this function returns an empty string if Self::is_unsharded, such
286 : /// that the legacy pre-sharding remote key format is preserved.
287 1119 : pub fn get_suffix(&self) -> String {
288 1119 : if self.is_unsharded() {
289 1113 : "".to_string()
290 : } else {
291 6 : format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
292 : }
293 1119 : }
294 : }
295 :
296 : impl std::fmt::Display for ShardIndex {
297 1171 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
298 1171 : write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
299 1171 : }
300 : }
301 :
302 : impl std::fmt::Debug for ShardIndex {
303 895 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
304 : // Debug is the same as Display: the compact hex representation
305 895 : write!(f, "{self}")
306 895 : }
307 : }
308 :
309 : impl std::str::FromStr for ShardIndex {
310 : type Err = hex::FromHexError;
311 :
312 1565 : fn from_str(s: &str) -> Result<Self, Self::Err> {
313 : // Expect format: 1 byte shard number, 1 byte shard count
314 1565 : if s.len() == 4 {
315 1565 : let bytes = s.as_bytes();
316 1565 : let mut shard_parts: [u8; 2] = [0u8; 2];
317 1565 : hex::decode_to_slice(bytes, &mut shard_parts)?;
318 1565 : Ok(Self {
319 1565 : shard_number: ShardNumber(shard_parts[0]),
320 1565 : shard_count: ShardCount(shard_parts[1]),
321 1565 : })
322 : } else {
323 0 : Err(hex::FromHexError::InvalidStringLength)
324 : }
325 1565 : }
326 : }
327 :
328 : impl From<[u8; 2]> for ShardIndex {
329 1 : fn from(b: [u8; 2]) -> Self {
330 1 : Self {
331 1 : shard_number: ShardNumber(b[0]),
332 1 : shard_count: ShardCount(b[1]),
333 1 : }
334 1 : }
335 : }
336 :
337 : impl Serialize for TenantShardId {
338 35 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
339 35 : where
340 35 : S: serde::Serializer,
341 : {
342 35 : if serializer.is_human_readable() {
343 31 : serializer.collect_str(self)
344 : } else {
345 : // Note: while human encoding of [`TenantShardId`] is backward and forward
346 : // compatible, this binary encoding is not.
347 4 : let mut packed: [u8; 18] = [0; 18];
348 4 : packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
349 4 : packed[16] = self.shard_number.0;
350 4 : packed[17] = self.shard_count.0;
351 :
352 4 : packed.serialize(serializer)
353 : }
354 0 : }
355 : }
356 :
357 : impl<'de> Deserialize<'de> for TenantShardId {
358 6 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
359 6 : where
360 6 : D: serde::Deserializer<'de>,
361 : {
362 : struct IdVisitor {
363 : is_human_readable_deserializer: bool,
364 : }
365 :
366 : impl<'de> serde::de::Visitor<'de> for IdVisitor {
367 : type Value = TenantShardId;
368 :
369 0 : fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
370 0 : if self.is_human_readable_deserializer {
371 0 : formatter.write_str("value in form of hex string")
372 : } else {
373 0 : formatter.write_str("value in form of integer array([u8; 18])")
374 : }
375 0 : }
376 :
377 2 : fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
378 2 : where
379 2 : A: serde::de::SeqAccess<'de>,
380 : {
381 2 : let s = serde::de::value::SeqAccessDeserializer::new(seq);
382 2 : let id: [u8; 18] = Deserialize::deserialize(s)?;
383 2 : Ok(TenantShardId::from(id))
384 0 : }
385 :
386 4 : fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
387 4 : where
388 4 : E: serde::de::Error,
389 : {
390 4 : TenantShardId::from_str(v).map_err(E::custom)
391 0 : }
392 : }
393 :
394 6 : if deserializer.is_human_readable() {
395 4 : deserializer.deserialize_str(IdVisitor {
396 4 : is_human_readable_deserializer: true,
397 4 : })
398 : } else {
399 2 : deserializer.deserialize_tuple(
400 : 18,
401 2 : IdVisitor {
402 2 : is_human_readable_deserializer: false,
403 2 : },
404 : )
405 : }
406 0 : }
407 : }
408 :
409 : impl Serialize for ShardIndex {
410 16 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
411 16 : where
412 16 : S: serde::Serializer,
413 : {
414 16 : if serializer.is_human_readable() {
415 14 : serializer.collect_str(self)
416 : } else {
417 : // Binary encoding is not used in index_part.json, but is included in anticipation of
418 : // switching various structures (e.g. inter-process communication, remote metadata) to more
419 : // compact binary encodings in future.
420 2 : let mut packed: [u8; 2] = [0; 2];
421 2 : packed[0] = self.shard_number.0;
422 2 : packed[1] = self.shard_count.0;
423 2 : packed.serialize(serializer)
424 : }
425 0 : }
426 : }
427 :
428 : impl<'de> Deserialize<'de> for ShardIndex {
429 1565 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
430 1565 : where
431 1565 : D: serde::Deserializer<'de>,
432 : {
433 : struct IdVisitor {
434 : is_human_readable_deserializer: bool,
435 : }
436 :
437 : impl<'de> serde::de::Visitor<'de> for IdVisitor {
438 : type Value = ShardIndex;
439 :
440 0 : fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
441 0 : if self.is_human_readable_deserializer {
442 0 : formatter.write_str("value in form of hex string")
443 : } else {
444 0 : formatter.write_str("value in form of integer array([u8; 2])")
445 : }
446 0 : }
447 :
448 1 : fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
449 1 : where
450 1 : A: serde::de::SeqAccess<'de>,
451 : {
452 1 : let s = serde::de::value::SeqAccessDeserializer::new(seq);
453 1 : let id: [u8; 2] = Deserialize::deserialize(s)?;
454 1 : Ok(ShardIndex::from(id))
455 0 : }
456 :
457 1564 : fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
458 1564 : where
459 1564 : E: serde::de::Error,
460 : {
461 1564 : ShardIndex::from_str(v).map_err(E::custom)
462 0 : }
463 : }
464 :
465 1565 : if deserializer.is_human_readable() {
466 1564 : deserializer.deserialize_str(IdVisitor {
467 1564 : is_human_readable_deserializer: true,
468 1564 : })
469 : } else {
470 1 : deserializer.deserialize_tuple(
471 : 2,
472 1 : IdVisitor {
473 1 : is_human_readable_deserializer: false,
474 1 : },
475 : )
476 : }
477 0 : }
478 : }
|