Line data Source code
1 : //! See `pageserver_api::shard` for description on sharding.
2 :
3 : use std::{ops::RangeInclusive, str::FromStr};
4 :
5 : use hex::FromHex;
6 : use serde::{Deserialize, Serialize};
7 :
8 : use crate::id::TenantId;
9 :
10 0 : #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
11 : pub struct ShardNumber(pub u8);
12 :
13 0 : #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
14 : pub struct ShardCount(pub u8);
15 :
16 : /// Combination of ShardNumber and ShardCount.
17 : ///
18 : /// For use within the context of a particular tenant, when we need to know which shard we're
19 : /// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
20 : /// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
21 : #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
22 : pub struct ShardIndex {
23 : pub shard_number: ShardNumber,
24 : pub shard_count: ShardCount,
25 : }
26 :
27 : /// Formatting helper, for generating the `shard_id` label in traces.
28 : pub struct ShardSlug<'a>(&'a TenantShardId);
29 :
30 : /// TenantShardId globally identifies a particular shard in a particular tenant.
31 : ///
32 : /// These are written as `<TenantId>-<ShardSlug>`, for example:
33 : /// # The second shard in a two-shard tenant
34 : /// 072f1291a5310026820b2fe4b2968934-0102
35 : ///
36 : /// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
37 : /// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
38 : /// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
39 : ///
40 : /// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
41 : /// is both forward and backward compatible with TenantId: a legacy TenantId can be
42 : /// decoded as a TenantShardId, and when re-encoded it will be parseable
43 : /// as a TenantId.
44 : #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
45 : pub struct TenantShardId {
46 : pub tenant_id: TenantId,
47 : pub shard_number: ShardNumber,
48 : pub shard_count: ShardCount,
49 : }
50 :
51 : impl ShardCount {
52 : pub const MAX: Self = Self(u8::MAX);
53 : pub const MIN: Self = Self(0);
54 :
55 : /// The internal value of a ShardCount may be zero, which means "1 shard, but use
56 : /// legacy format for TenantShardId that excludes the shard suffix", also known
57 : /// as [`TenantShardId::unsharded`].
58 : ///
59 : /// This method returns the actual number of shards, i.e. if our internal value is
60 : /// zero, we return 1 (unsharded tenants have 1 shard).
61 4809218 : pub fn count(&self) -> u8 {
62 4809218 : if self.0 > 0 {
63 5009 : self.0
64 : } else {
65 4804209 : 1
66 : }
67 4809218 : }
68 :
69 : /// The literal internal value: this is **not** the number of shards in the
70 : /// tenant, as we have a special zero value for legacy unsharded tenants. Use
71 : /// [`Self::count`] if you want to know the cardinality of shards.
72 2 : pub fn literal(&self) -> u8 {
73 2 : self.0
74 2 : }
75 :
76 : /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
77 : /// uses the legacy format for `TenantShardId`. See also the documentation for
78 : /// [`Self::count`].
79 0 : pub fn is_unsharded(&self) -> bool {
80 0 : self.0 == 0
81 0 : }
82 :
83 : /// `v` may be zero, or the number of shards in the tenant. `v` is what
84 : /// [`Self::literal`] would return.
85 9015 : pub const fn new(val: u8) -> Self {
86 9015 : Self(val)
87 9015 : }
88 : }
89 :
90 : impl ShardNumber {
91 : pub const MAX: Self = Self(u8::MAX);
92 : }
93 :
94 : impl TenantShardId {
95 39 : pub fn unsharded(tenant_id: TenantId) -> Self {
96 39 : Self {
97 39 : tenant_id,
98 39 : shard_number: ShardNumber(0),
99 39 : shard_count: ShardCount(0),
100 39 : }
101 39 : }
102 :
103 : /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
104 : /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
105 0 : pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
106 0 : RangeInclusive::new(
107 0 : Self {
108 0 : tenant_id,
109 0 : shard_number: ShardNumber(0),
110 0 : shard_count: ShardCount(0),
111 0 : },
112 0 : Self {
113 0 : tenant_id,
114 0 : shard_number: ShardNumber::MAX,
115 0 : shard_count: ShardCount::MAX,
116 0 : },
117 0 : )
118 0 : }
119 :
120 24278 : pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
121 24278 : ShardSlug(self)
122 24278 : }
123 :
124 : /// Convenience for code that has special behavior on the 0th shard.
125 6 : pub fn is_shard_zero(&self) -> bool {
126 6 : self.shard_number == ShardNumber(0)
127 6 : }
128 :
129 : /// The "unsharded" value is distinct from simply having a single shard: it represents
130 : /// a tenant which is not shard-aware at all, and whose storage paths will not include
131 : /// a shard suffix.
132 0 : pub fn is_unsharded(&self) -> bool {
133 0 : self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
134 0 : }
135 :
136 : /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
137 : /// is useful when logging from code that is already in a span that includes tenant ID, to
138 : /// keep messages reasonably terse.
139 0 : pub fn to_index(&self) -> ShardIndex {
140 0 : ShardIndex {
141 0 : shard_number: self.shard_number,
142 0 : shard_count: self.shard_count,
143 0 : }
144 0 : }
145 :
146 : /// Calculate the children of this TenantShardId when splitting the overall tenant into
147 : /// the given number of shards.
148 6 : pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
149 6 : let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
150 6 : let mut child_shards = Vec::new();
151 32 : for shard_number in 0..ShardNumber(new_shard_count.0).0 {
152 : // Key mapping is based on a round robin mapping of key hash modulo shard count,
153 : // so our child shards are the ones which the same keys would map to.
154 32 : if shard_number % effective_old_shard_count == self.shard_number.0 {
155 28 : child_shards.push(TenantShardId {
156 28 : tenant_id: self.tenant_id,
157 28 : shard_number: ShardNumber(shard_number),
158 28 : shard_count: new_shard_count,
159 28 : })
160 4 : }
161 : }
162 :
163 6 : child_shards
164 6 : }
165 : }
166 :
167 : impl std::fmt::Display for ShardSlug<'_> {
168 11719 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
169 11719 : write!(
170 11719 : f,
171 11719 : "{:02x}{:02x}",
172 11719 : self.0.shard_number.0, self.0.shard_count.0
173 11719 : )
174 11719 : }
175 : }
176 :
177 : impl std::fmt::Display for TenantShardId {
178 13199 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
179 13199 : if self.shard_count != ShardCount(0) {
180 153 : write!(f, "{}-{}", self.tenant_id, self.shard_slug())
181 : } else {
182 : // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
183 : // is distinct from the normal single shard case (shard count == 1).
184 13046 : self.tenant_id.fmt(f)
185 : }
186 13199 : }
187 : }
188 :
189 : impl std::fmt::Debug for TenantShardId {
190 3128 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
191 3128 : // Debug is the same as Display: the compact hex representation
192 3128 : write!(f, "{}", self)
193 3128 : }
194 : }
195 :
196 : impl std::str::FromStr for TenantShardId {
197 : type Err = hex::FromHexError;
198 :
199 4445 : fn from_str(s: &str) -> Result<Self, Self::Err> {
200 4445 : // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
201 4445 : if s.len() == 32 {
202 : // Legacy case: no shard specified
203 : Ok(Self {
204 4409 : tenant_id: TenantId::from_str(s)?,
205 4409 : shard_number: ShardNumber(0),
206 4409 : shard_count: ShardCount(0),
207 : })
208 36 : } else if s.len() == 37 {
209 36 : let bytes = s.as_bytes();
210 36 : let tenant_id = TenantId::from_hex(&bytes[0..32])?;
211 36 : let mut shard_parts: [u8; 2] = [0u8; 2];
212 36 : hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
213 36 : Ok(Self {
214 36 : tenant_id,
215 36 : shard_number: ShardNumber(shard_parts[0]),
216 36 : shard_count: ShardCount(shard_parts[1]),
217 36 : })
218 : } else {
219 0 : Err(hex::FromHexError::InvalidStringLength)
220 : }
221 4445 : }
222 : }
223 :
224 : impl From<[u8; 18]> for TenantShardId {
225 2 : fn from(b: [u8; 18]) -> Self {
226 2 : let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
227 2 :
228 2 : Self {
229 2 : tenant_id: TenantId::from(tenant_id_bytes),
230 2 : shard_number: ShardNumber(b[16]),
231 2 : shard_count: ShardCount(b[17]),
232 2 : }
233 2 : }
234 : }
235 :
236 : impl ShardIndex {
237 0 : pub fn new(number: ShardNumber, count: ShardCount) -> Self {
238 0 : Self {
239 0 : shard_number: number,
240 0 : shard_count: count,
241 0 : }
242 0 : }
243 102 : pub fn unsharded() -> Self {
244 102 : Self {
245 102 : shard_number: ShardNumber(0),
246 102 : shard_count: ShardCount(0),
247 102 : }
248 102 : }
249 :
250 : /// The "unsharded" value is distinct from simply having a single shard: it represents
251 : /// a tenant which is not shard-aware at all, and whose storage paths will not include
252 : /// a shard suffix.
253 72544 : pub fn is_unsharded(&self) -> bool {
254 72544 : self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
255 72544 : }
256 :
257 : /// For use in constructing remote storage paths: concatenate this with a TenantId
258 : /// to get a fully qualified TenantShardId.
259 : ///
260 : /// Backward compat: this function returns an empty string if Self::is_unsharded, such
261 : /// that the legacy pre-sharding remote key format is preserved.
262 1452 : pub fn get_suffix(&self) -> String {
263 1452 : if self.is_unsharded() {
264 1444 : "".to_string()
265 : } else {
266 8 : format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
267 : }
268 1452 : }
269 : }
270 :
271 : impl std::fmt::Display for ShardIndex {
272 2030 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
273 2030 : write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
274 2030 : }
275 : }
276 :
277 : impl std::fmt::Debug for ShardIndex {
278 1536 : fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
279 1536 : // Debug is the same as Display: the compact hex representation
280 1536 : write!(f, "{}", self)
281 1536 : }
282 : }
283 :
284 : impl std::str::FromStr for ShardIndex {
285 : type Err = hex::FromHexError;
286 :
287 3129 : fn from_str(s: &str) -> Result<Self, Self::Err> {
288 3129 : // Expect format: 1 byte shard number, 1 byte shard count
289 3129 : if s.len() == 4 {
290 3129 : let bytes = s.as_bytes();
291 3129 : let mut shard_parts: [u8; 2] = [0u8; 2];
292 3129 : hex::decode_to_slice(bytes, &mut shard_parts)?;
293 3129 : Ok(Self {
294 3129 : shard_number: ShardNumber(shard_parts[0]),
295 3129 : shard_count: ShardCount(shard_parts[1]),
296 3129 : })
297 : } else {
298 0 : Err(hex::FromHexError::InvalidStringLength)
299 : }
300 3129 : }
301 : }
302 :
303 : impl From<[u8; 2]> for ShardIndex {
304 1 : fn from(b: [u8; 2]) -> Self {
305 1 : Self {
306 1 : shard_number: ShardNumber(b[0]),
307 1 : shard_count: ShardCount(b[1]),
308 1 : }
309 1 : }
310 : }
311 :
312 : impl Serialize for TenantShardId {
313 46 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
314 46 : where
315 46 : S: serde::Serializer,
316 46 : {
317 46 : if serializer.is_human_readable() {
318 42 : serializer.collect_str(self)
319 : } else {
320 : // Note: while human encoding of [`TenantShardId`] is backward and forward
321 : // compatible, this binary encoding is not.
322 4 : let mut packed: [u8; 18] = [0; 18];
323 4 : packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
324 4 : packed[16] = self.shard_number.0;
325 4 : packed[17] = self.shard_count.0;
326 4 :
327 4 : packed.serialize(serializer)
328 : }
329 46 : }
330 : }
331 :
332 : impl<'de> Deserialize<'de> for TenantShardId {
333 9 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
334 9 : where
335 9 : D: serde::Deserializer<'de>,
336 9 : {
337 : struct IdVisitor {
338 : is_human_readable_deserializer: bool,
339 : }
340 :
341 : impl<'de> serde::de::Visitor<'de> for IdVisitor {
342 : type Value = TenantShardId;
343 :
344 0 : fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
345 0 : if self.is_human_readable_deserializer {
346 0 : formatter.write_str("value in form of hex string")
347 : } else {
348 0 : formatter.write_str("value in form of integer array([u8; 18])")
349 : }
350 0 : }
351 :
352 2 : fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
353 2 : where
354 2 : A: serde::de::SeqAccess<'de>,
355 2 : {
356 2 : let s = serde::de::value::SeqAccessDeserializer::new(seq);
357 2 : let id: [u8; 18] = Deserialize::deserialize(s)?;
358 2 : Ok(TenantShardId::from(id))
359 2 : }
360 :
361 7 : fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
362 7 : where
363 7 : E: serde::de::Error,
364 7 : {
365 7 : TenantShardId::from_str(v).map_err(E::custom)
366 7 : }
367 : }
368 :
369 9 : if deserializer.is_human_readable() {
370 7 : deserializer.deserialize_str(IdVisitor {
371 7 : is_human_readable_deserializer: true,
372 7 : })
373 : } else {
374 2 : deserializer.deserialize_tuple(
375 2 : 18,
376 2 : IdVisitor {
377 2 : is_human_readable_deserializer: false,
378 2 : },
379 2 : )
380 : }
381 9 : }
382 : }
383 :
384 : impl Serialize for ShardIndex {
385 18 : fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
386 18 : where
387 18 : S: serde::Serializer,
388 18 : {
389 18 : if serializer.is_human_readable() {
390 16 : serializer.collect_str(self)
391 : } else {
392 : // Binary encoding is not used in index_part.json, but is included in anticipation of
393 : // switching various structures (e.g. inter-process communication, remote metadata) to more
394 : // compact binary encodings in future.
395 2 : let mut packed: [u8; 2] = [0; 2];
396 2 : packed[0] = self.shard_number.0;
397 2 : packed[1] = self.shard_count.0;
398 2 : packed.serialize(serializer)
399 : }
400 18 : }
401 : }
402 :
403 : impl<'de> Deserialize<'de> for ShardIndex {
404 3129 : fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
405 3129 : where
406 3129 : D: serde::Deserializer<'de>,
407 3129 : {
408 : struct IdVisitor {
409 : is_human_readable_deserializer: bool,
410 : }
411 :
412 : impl<'de> serde::de::Visitor<'de> for IdVisitor {
413 : type Value = ShardIndex;
414 :
415 0 : fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
416 0 : if self.is_human_readable_deserializer {
417 0 : formatter.write_str("value in form of hex string")
418 : } else {
419 0 : formatter.write_str("value in form of integer array([u8; 2])")
420 : }
421 0 : }
422 :
423 1 : fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
424 1 : where
425 1 : A: serde::de::SeqAccess<'de>,
426 1 : {
427 1 : let s = serde::de::value::SeqAccessDeserializer::new(seq);
428 1 : let id: [u8; 2] = Deserialize::deserialize(s)?;
429 1 : Ok(ShardIndex::from(id))
430 1 : }
431 :
432 3128 : fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
433 3128 : where
434 3128 : E: serde::de::Error,
435 3128 : {
436 3128 : ShardIndex::from_str(v).map_err(E::custom)
437 3128 : }
438 : }
439 :
440 3129 : if deserializer.is_human_readable() {
441 3128 : deserializer.deserialize_str(IdVisitor {
442 3128 : is_human_readable_deserializer: true,
443 3128 : })
444 : } else {
445 1 : deserializer.deserialize_tuple(
446 1 : 2,
447 1 : IdVisitor {
448 1 : is_human_readable_deserializer: false,
449 1 : },
450 1 : )
451 : }
452 3129 : }
453 : }
|