Line data Source code
1 : //! Common traits and structs for layers
2 :
3 : pub mod delta_layer;
4 : mod filename;
5 : pub mod image_layer;
6 : mod inmemory_layer;
7 : pub(crate) mod layer;
8 : mod layer_desc;
9 :
10 : use crate::context::{AccessStatsBehavior, RequestContext};
11 : use crate::task_mgr::TaskKind;
12 : use crate::walrecord::NeonWalRecord;
13 : use bytes::Bytes;
14 : use enum_map::EnumMap;
15 : use enumset::EnumSet;
16 : use once_cell::sync::Lazy;
17 : use pageserver_api::models::{
18 : LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
19 : };
20 : use std::ops::Range;
21 : use std::sync::Mutex;
22 : use std::time::{Duration, SystemTime, UNIX_EPOCH};
23 : use tracing::warn;
24 : use utils::history_buffer::HistoryBufferWithDropCounter;
25 : use utils::rate_limit::RateLimit;
26 :
27 : use utils::{id::TimelineId, lsn::Lsn};
28 :
29 : pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
30 : pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
31 : pub use image_layer::{ImageLayer, ImageLayerWriter};
32 : pub use inmemory_layer::InMemoryLayer;
33 : pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
34 :
35 : pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
36 :
/// Returns `true` when the half-open ranges `a` and `b` overlap.
///
/// The range that starts first is checked for reaching past the start of the
/// other one. The bounds are not validated: an empty range lying strictly
/// inside the other range is still reported as overlapping.
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
    T: PartialOrd<T>,
{
    // When `a` does not start strictly before `b`, `b` is treated as the
    // earlier range.
    let (earlier, later) = if a.start < b.start { (a, b) } else { (b, a) };
    earlier.end > later.start
}
47 :
48 : /// Struct used to communicate across calls to 'get_value_reconstruct_data'.
49 : ///
50 : /// Before first call, you can fill in 'img' if you have an older cached
51 : /// version of the page available. That can save work in
52 : /// 'get_value_reconstruct_data', as it can stop searching for page versions
53 : /// when all the WAL records going back to the cached image have been collected.
54 : ///
55 : /// When get_value_reconstruct_data returns Complete, 'img' is set to an image
56 : /// of the page, or the oldest WAL record in 'records' is a will_init-type
57 : /// record that initializes the page without requiring a previous image.
58 : ///
59 : /// If 'get_value_reconstruct_data' returns Continue, some 'records' may have
60 : /// been collected, but there are more records outside the current layer. Pass
61 : /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
62 : /// call, to collect more records.
63 : ///
64 0 : #[derive(Debug)]
65 : pub struct ValueReconstructState {
/// WAL records collected so far, as `(Lsn, record)` pairs.
66 : pub records: Vec<(Lsn, NeonWalRecord)>,
/// Page image as an `(Lsn, bytes)` pair, if one is available; the caller may
/// pre-fill it with an older cached image before the first call (see above).
67 : pub img: Option<(Lsn, Bytes)>,
68 : }
69 :
70 : /// Return value from [`Layer::get_value_reconstruct_data`]
///
/// NOTE(review): the variant docs below mention "the returned LSN", but every
/// variant of this enum is a unit variant, so no LSN is carried by the enum
/// itself. Confirm how callers obtain that LSN, or whether the wording is stale.
71 1775 : #[derive(Clone, Copy, Debug)]
72 : pub enum ValueReconstructResult {
73 : /// Got all the data needed to reconstruct the requested page
74 : Complete,
75 : /// This layer didn't contain all the required data, the caller should look up
76 : /// the predecessor layer at the returned LSN and collect more data from there.
77 : Continue,
78 :
79 : /// This layer didn't contain data needed to reconstruct the page version at
80 : /// the returned LSN. This is usually considered an error, but might be OK
81 : /// in some circumstances.
82 : Missing,
83 : }
84 :
/// Access and residence statistics, wrapped in a [`Mutex`] so that they can be
/// recorded and read through a shared reference.
///
/// The wrapped [`LayerAccessStatsLocked`] holds two copies of the statistics:
/// one served to the API and one consulted by `latest_activity`.
85 0 : #[derive(Debug)]
86 : pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
87 :
88 : /// This struct holds two instances of [`LayerAccessStatsInner`].
89 : /// Accesses and residence events are recorded to both instances.
90 : /// The `for_scraping_api` instance can be reset from the management API via [`LayerAccessStatsReset`].
91 : /// The `for_eviction_policy` instance is never reset.
92 75356 : #[derive(Debug, Default, Clone)]
93 : struct LayerAccessStatsLocked {
/// Instance converted to the API model by `LayerAccessStats::as_api_model`; the only one affected by a reset.
94 : for_scraping_api: LayerAccessStatsInner,
/// Instance read by `LayerAccessStats::latest_activity`.
95 : for_eviction_policy: LayerAccessStatsInner,
96 : }
97 :
98 : impl LayerAccessStatsLocked {
99 16668974 : fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
100 16668974 : [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
101 16668974 : }
102 : }
103 :
/// One set of access and residence statistics.
///
/// The access-related fields are updated by `LayerAccessStats::record_access`,
/// `last_residence_changes` by `LayerAccessStats::record_residence_event`.
104 150712 : #[derive(Debug, Default, Clone)]
105 : struct LayerAccessStatsInner {
/// Details of the first access recorded into this instance; later accesses do not overwrite it.
106 : first_access: Option<LayerAccessStatFullDetails>,
/// Number of recorded accesses, per access kind.
107 : count_by_access_kind: EnumMap<LayerAccessKind, u64>,
/// Set of task kinds for which an access has been recorded.
108 : task_kind_flag: EnumSet<TaskKind>,
/// History of recorded accesses (presumably bounded to the 16 most recent; confirm against `HistoryBufferWithDropCounter`).
109 : last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
/// History of recorded residence events (presumably bounded to the 16 most recent; confirm against `HistoryBufferWithDropCounter`).
110 : last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
111 : }
112 :
/// Details of a single recorded access, as captured by `LayerAccessStats::record_access`.
113 0 : #[derive(Debug, Clone, Copy)]
114 : pub(crate) struct LayerAccessStatFullDetails {
/// Wall-clock time at which the access was recorded.
115 : pub(crate) when: SystemTime,
/// Task kind taken from the request context of the access.
116 : pub(crate) task_kind: TaskKind,
/// Kind of access that was performed.
117 : pub(crate) access_kind: LayerAccessKind,
118 : }
119 :
/// Selects what `LayerAccessStats::as_api_model` resets after it has produced
/// its result. Only the `for_scraping_api` instance is ever affected.
///
/// Derives `strum_macros::EnumString`, so a value can be parsed from a string.
120 0 : #[derive(Clone, Copy, strum_macros::EnumString)]
121 : pub enum LayerAccessStatsReset {
/// Leave the stats untouched.
122 : NoReset,
/// Clear only the set of task kinds that accessed the layer.
123 : JustTaskKindFlags,
/// Replace the whole instance with its default (empty) value.
124 : AllStats,
125 : }
126 :
/// Converts `ts` into whole milliseconds elapsed since the Unix epoch.
///
/// # Panics
///
/// Panics if `ts` lies before the Unix epoch, or if the millisecond count
/// does not fit into a `u64`.
fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
    let since_epoch = ts
        .duration_since(UNIX_EPOCH)
        .expect("better to die in this unlikely case than report false stats");
    // `as_millis` yields a `u128`; narrow it with a checked conversion.
    u64::try_from(since_epoch.as_millis()).expect("64 bits is enough for few more years")
}
134 :
135 : impl LayerAccessStatFullDetails {
136 10737 : fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
137 10737 : let Self {
138 10737 : when,
139 10737 : task_kind,
140 10737 : access_kind,
141 10737 : } = self;
142 10737 : pageserver_api::models::LayerAccessStatFullDetails {
143 10737 : when_millis_since_epoch: system_time_to_millis_since_epoch(when),
144 10737 : task_kind: task_kind.into(), // into static str, powered by strum_macros
145 10737 : access_kind: *access_kind,
146 10737 : }
147 10737 : }
148 : }
149 :
150 : impl LayerAccessStats {
151 : /// Create an empty stats object.
152 : ///
153 : /// The caller is responsible for recording a residence event
154 : /// using [`record_residence_event`] before calling `latest_activity`.
155 : /// If they don't, [`latest_activity`] will return `None`.
156 : ///
157 : /// [`record_residence_event`]: Self::record_residence_event
158 : /// [`latest_activity`]: Self::latest_activity
159 22304 : pub(crate) fn empty_will_record_residence_event_later() -> Self {
160 22304 : LayerAccessStats(Mutex::default())
161 22304 : }
162 :
163 : /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
164 : ///
165 : /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
166 : ///
167 : /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
168 : /// [`record_residence_event`]: Self::record_residence_event
169 53052 : pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
170 53052 : let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
171 53052 : new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
172 53052 : new
173 53052 : }
174 :
175 : /// Record a change in layer residency.
176 : ///
177 : /// Recording the event must happen while holding the layer map lock to
178 : /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs)
179 : /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`.
180 : ///
181 : /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock,
182 : /// the following race could happen:
183 : ///
184 : /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp.
185 : /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
186 : /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
187 : /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
188 : ///
189 87327 : pub(crate) fn record_residence_event(
190 87327 : &self,
191 87327 : status: LayerResidenceStatus,
192 87327 : reason: LayerResidenceEventReason,
193 87327 : ) {
194 87327 : let mut locked = self.0.lock().unwrap();
195 174654 : locked.iter_mut().for_each(|inner| {
196 174654 : inner
197 174654 : .last_residence_changes
198 174654 : .write(LayerResidenceEvent::new(status, reason))
199 174654 : });
200 87327 : }
201 :
202 16847898 : fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
203 16847898 : if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
204 266251 : return;
205 16581647 : }
206 16581647 :
207 16581647 : let this_access = LayerAccessStatFullDetails {
208 16581647 : when: SystemTime::now(),
209 16581647 : task_kind: ctx.task_kind(),
210 16581647 : access_kind,
211 16581647 : };
212 16581647 :
213 16581647 : let mut locked = self.0.lock().unwrap();
214 33163284 : locked.iter_mut().for_each(|inner| {
215 33163284 : inner.first_access.get_or_insert(this_access);
216 33163284 : inner.count_by_access_kind[access_kind] += 1;
217 33163284 : inner.task_kind_flag |= ctx.task_kind();
218 33163284 : inner.last_accesses.write(this_access);
219 33163284 : })
220 16847898 : }
221 :
222 3050 : fn as_api_model(
223 3050 : &self,
224 3050 : reset: LayerAccessStatsReset,
225 3050 : ) -> pageserver_api::models::LayerAccessStats {
226 3050 : let mut locked = self.0.lock().unwrap();
227 3050 : let inner = &mut locked.for_scraping_api;
228 3050 : let LayerAccessStatsInner {
229 3050 : first_access,
230 3050 : count_by_access_kind,
231 3050 : task_kind_flag,
232 3050 : last_accesses,
233 3050 : last_residence_changes,
234 3050 : } = inner;
235 3050 : let ret = pageserver_api::models::LayerAccessStats {
236 3050 : access_count_by_access_kind: count_by_access_kind
237 3050 : .iter()
238 12200 : .map(|(kind, count)| (kind, *count))
239 3050 : .collect(),
240 3050 : task_kind_access_flag: task_kind_flag
241 3050 : .iter()
242 3050 : .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
243 3050 : .collect(),
244 3050 : first: first_access.as_ref().map(|a| a.as_api_model()),
245 9948 : accesses_history: last_accesses.map(|m| m.as_api_model()),
246 3050 : residence_events_history: last_residence_changes.clone(),
247 3050 : };
248 3050 : match reset {
249 3050 : LayerAccessStatsReset::NoReset => (),
250 0 : LayerAccessStatsReset::JustTaskKindFlags => {
251 0 : inner.task_kind_flag.clear();
252 0 : }
253 0 : LayerAccessStatsReset::AllStats => {
254 0 : *inner = LayerAccessStatsInner::default();
255 0 : }
256 : }
257 3050 : ret
258 3050 : }
259 :
260 : /// Get the latest access timestamp, falling back to latest residence event, further falling
261 : /// back to `SystemTime::now` for a usable timestamp for eviction.
262 4194 : pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
263 4194 : self.latest_activity().unwrap_or_else(SystemTime::now)
264 4194 : }
265 :
266 : /// Get the latest access timestamp, falling back to latest residence event.
267 : ///
268 : /// This function can only return `None` if there has not yet been a call to the
269 : /// [`record_residence_event`] method. That would generally be considered an
270 : /// implementation error. This function logs a rate-limited warning in that case.
271 : ///
272 : /// TODO: use type system to avoid the need for `fallback`.
273 : /// The approach in <https://github.com/neondatabase/neon/pull/3775>
274 : /// could be used to enforce that a residence event is recorded
275 : /// before a layer is added to the layer map. We could also have
276 : /// a layer wrapper type that holds the LayerAccessStats, and ensure
277 : /// that that type can only be produced by inserting into the layer map.
278 : ///
279 : /// [`record_residence_event`]: Self::record_residence_event
280 4194 : fn latest_activity(&self) -> Option<SystemTime> {
281 4194 : let locked = self.0.lock().unwrap();
282 4194 : let inner = &locked.for_eviction_policy;
283 4194 : match inner.last_accesses.recent() {
284 2125 : Some(a) => Some(a.when),
285 2069 : None => match inner.last_residence_changes.recent() {
286 2069 : Some(e) => Some(e.timestamp),
287 : None => {
288 : static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
289 0 : Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
290 0 : let mut guard = WARN_RATE_LIMIT.lock().unwrap();
291 0 : guard.0 += 1;
292 0 : let occurences = guard.0;
293 0 : guard.1.call(move || {
294 0 : warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
295 0 : });
296 0 : None
297 : }
298 : },
299 : }
300 4194 : }
301 : }
302 :
303 : /// Get a layer descriptor from a layer.
///
/// Implementors hand out a borrowed [`PersistentLayerDesc`].
304 : pub trait AsLayerDesc {
305 : /// Get the layer descriptor, borrowed from `self`.
306 : fn layer_desc(&self) -> &PersistentLayerDesc;
307 : }
308 :
309 : pub mod tests {
310 : use pageserver_api::shard::TenantShardId;
311 :
312 : use super::*;
313 :
314 : impl From<DeltaFileName> for PersistentLayerDesc {
315 0 : fn from(value: DeltaFileName) -> Self {
316 0 : PersistentLayerDesc::new_delta(
317 0 : TenantShardId::from([0; 18]),
318 0 : TimelineId::from_array([0; 16]),
319 0 : value.key_range,
320 0 : value.lsn_range,
321 0 : 233,
322 0 : )
323 0 : }
324 : }
325 :
326 : impl From<ImageFileName> for PersistentLayerDesc {
327 0 : fn from(value: ImageFileName) -> Self {
328 0 : PersistentLayerDesc::new_img(
329 0 : TenantShardId::from([0; 18]),
330 0 : TimelineId::from_array([0; 16]),
331 0 : value.key_range,
332 0 : value.lsn,
333 0 : 233,
334 0 : )
335 0 : }
336 : }
337 :
338 : impl From<LayerFileName> for PersistentLayerDesc {
339 0 : fn from(value: LayerFileName) -> Self {
340 0 : match value {
341 0 : LayerFileName::Delta(d) => Self::from(d),
342 0 : LayerFileName::Image(i) => Self::from(i),
343 : }
344 0 : }
345 : }
346 : }
347 :
/// Newtype around a borrowed [`Range`] whose `Debug` output is built from the
/// `Display` form of the two bounds.
///
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);

impl<'a, T> std::fmt::Debug for RangeDisplayDebug<'a, T>
where
    T: std::fmt::Display,
{
    /// Writes the range as `start..end`, formatting both bounds with `Display`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Range { start, end } = self.0;
        write!(f, "{}..{}", start, end)
    }
}
|