Line data Source code
1 : //! Common traits and structs for layers
2 :
3 : pub mod delta_layer;
4 : mod filename;
5 : pub mod image_layer;
6 : mod inmemory_layer;
7 : pub(crate) mod layer;
8 : mod layer_desc;
9 :
10 : use crate::context::{AccessStatsBehavior, RequestContext};
11 : use crate::task_mgr::TaskKind;
12 : use crate::walrecord::NeonWalRecord;
13 : use bytes::Bytes;
14 : use enum_map::EnumMap;
15 : use enumset::EnumSet;
16 : use once_cell::sync::Lazy;
17 : use pageserver_api::models::{
18 : LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
19 : };
20 : use std::ops::Range;
21 : use std::sync::Mutex;
22 : use std::time::{Duration, SystemTime, UNIX_EPOCH};
23 : use tracing::warn;
24 : use utils::history_buffer::HistoryBufferWithDropCounter;
25 : use utils::rate_limit::RateLimit;
26 :
27 : use utils::{id::TimelineId, lsn::Lsn};
28 :
29 : pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
30 : pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
31 : pub use image_layer::{ImageLayer, ImageLayerWriter};
32 : pub use inmemory_layer::InMemoryLayer;
33 : pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
34 :
35 : pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
36 :
/// Reports whether ranges `a` and `b` overlap.
///
/// The range that starts first (`b` when neither start compares as smaller)
/// must end strictly after the start of the other range. Only `PartialOrd`
/// is required of `T`.
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
    T: PartialOrd<T>,
{
    // Pair the end of the range that starts first with the start of the other one.
    let (earlier_end, later_start) = if a.start < b.start {
        (&a.end, &b.start)
    } else {
        (&b.end, &a.start)
    };
    earlier_end > later_start
}
47 :
48 : /// Struct used to communicate across calls to 'get_value_reconstruct_data'.
49 : ///
50 : /// Before the first call, you can fill in 'img' if you have an older cached
51 : /// version of the page available. That can save work in
52 : /// 'get_value_reconstruct_data', as it can stop searching for page versions
53 : /// when all the WAL records going back to the cached image have been collected.
54 : ///
55 : /// When 'get_value_reconstruct_data' returns Complete, 'img' is set to an image
56 : /// of the page, or the oldest WAL record in 'records' is a will_init-type
57 : /// record that initializes the page without requiring a previous image.
58 : ///
59 : /// If 'get_value_reconstruct_data' returns Continue, some 'records' may have
60 : /// been collected, but there are more records outside the current layer. Pass
61 : /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
62 : /// call, to collect more records.
63 : ///
64 0 : #[derive(Debug)]
65 : pub struct ValueReconstructState {
66 : pub records: Vec<(Lsn, NeonWalRecord)>,
67 : pub img: Option<(Lsn, Bytes)>,
68 : }
69 :
70 : /// Return value from [`Layer::get_value_reconstruct_data`].
71 1388 : #[derive(Clone, Copy, Debug)]
72 : pub enum ValueReconstructResult {
73 : /// Got all the data needed to reconstruct the requested page.
74 : Complete,
75 : /// This layer didn't contain all the required data; the caller should look up
76 : /// the predecessor layer and collect more data from there.
77 : Continue,
78 :
79 : /// This layer didn't contain data needed to reconstruct the requested page
80 : /// version. This is usually considered an error, but might be OK
81 : /// in some circumstances.
/// NOTE(review): no variant of this enum carries an LSN; callers presumably
/// track the LSN to continue from by other means — confirm.
82 : Missing,
83 : }
84 :
/// Access and residence statistics, kept behind a [`Mutex`] so that they can be
/// recorded through a shared reference.
85 0 : #[derive(Debug)]
86 : pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
87 :
88 : /// This struct holds two instances of [`LayerAccessStatsInner`].
89 : /// Accesses are recorded to both instances.
90 : /// The `for_scraping_api` instance can be reset from the management API via [`LayerAccessStatsReset`].
91 : /// The `for_eviction_policy` instance is never reset.
92 78780 : #[derive(Debug, Default, Clone)]
93 : struct LayerAccessStatsLocked {
94 : for_scraping_api: LayerAccessStatsInner,
95 : for_eviction_policy: LayerAccessStatsInner,
96 : }
97 :
98 : impl LayerAccessStatsLocked {
99 23735825 : fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
100 23735825 : [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
101 23735825 : }
102 : }
103 :
/// One set of recorded statistics; [`LayerAccessStatsLocked`] keeps two of these.
104 157560 : #[derive(Debug, Default, Clone)]
105 : struct LayerAccessStatsInner {
/// Details of the first access recorded into this instance, if any.
106 : first_access: Option<LayerAccessStatFullDetails>,
/// Number of recorded accesses, per [`LayerAccessKind`].
107 : count_by_access_kind: EnumMap<LayerAccessKind, u64>,
/// Set of task kinds for which an access has been recorded.
108 : task_kind_flag: EnumSet<TaskKind>,
/// History of recorded accesses.
109 : last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
/// History of recorded residence events.
110 : last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
111 : }
112 :
/// A single recorded access: the time it was recorded, the kind of task that
/// performed it, and the kind of access.
113 0 : #[derive(Debug, Clone, Copy)]
114 : pub(crate) struct LayerAccessStatFullDetails {
115 : pub(crate) when: SystemTime,
116 : pub(crate) task_kind: TaskKind,
117 : pub(crate) access_kind: LayerAccessKind,
118 : }
119 :
/// Selects how much of the scraping-API statistics `LayerAccessStats::as_api_model`
/// clears after it has taken its snapshot.
120 0 : #[derive(Clone, Copy, strum_macros::EnumString)]
121 : pub enum LayerAccessStatsReset {
/// Leave the statistics untouched.
122 : NoReset,
/// Clear only the recorded task-kind flags.
123 : JustTaskKindFlags,
/// Replace the statistics with their default value.
124 : AllStats,
125 : }
126 :
/// Converts `ts` into the number of whole milliseconds elapsed since the Unix epoch.
///
/// # Panics
///
/// Panics if `ts` lies before the Unix epoch, or if the millisecond count does
/// not fit into a `u64`.
fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
    let since_epoch = ts
        .duration_since(UNIX_EPOCH)
        .expect("better to die in this unlikely case than report false stats");
    u64::try_from(since_epoch.as_millis()).expect("64 bits is enough for few more years")
}
134 :
135 : impl LayerAccessStatFullDetails {
136 18200 : fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
137 18200 : let Self {
138 18200 : when,
139 18200 : task_kind,
140 18200 : access_kind,
141 18200 : } = self;
142 18200 : pageserver_api::models::LayerAccessStatFullDetails {
143 18200 : when_millis_since_epoch: system_time_to_millis_since_epoch(when),
144 18200 : task_kind: task_kind.into(), // into static str, powered by strum_macros
145 18200 : access_kind: *access_kind,
146 18200 : }
147 18200 : }
148 : }
149 :
150 : impl LayerAccessStats {
151 : /// Create an empty stats object.
152 : ///
153 : /// The caller is responsible for recording a residence event
154 : /// using [`record_residence_event`] before calling `latest_activity`.
155 : /// If they don't, [`latest_activity`] will return `None`.
156 : ///
157 : /// [`record_residence_event`]: Self::record_residence_event
158 : /// [`latest_activity`]: Self::latest_activity
159 21944 : pub(crate) fn empty_will_record_residence_event_later() -> Self {
160 21944 : LayerAccessStats(Mutex::default())
161 21944 : }
162 :
163 : /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
164 : ///
165 : /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
166 : ///
167 : /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
168 : /// [`record_residence_event`]: Self::record_residence_event
169 56836 : pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
170 56836 : let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
171 56836 : new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
172 56836 : new
173 56836 : }
174 :
175 : /// Record a change in layer residency.
176 : ///
177 : /// Recording the event must happen while holding the layer map lock to
178 : /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs)
179 : /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`.
180 : ///
181 : /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock,
182 : /// the following race could happen:
183 : ///
184 : /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp.
185 : /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
186 : /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
187 : /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
188 : ///
189 91109 : pub(crate) fn record_residence_event(
190 91109 : &self,
191 91109 : status: LayerResidenceStatus,
192 91109 : reason: LayerResidenceEventReason,
193 91109 : ) {
194 91109 : let mut locked = self.0.lock().unwrap();
195 182218 : locked.iter_mut().for_each(|inner| {
196 182218 : inner
197 182218 : .last_residence_changes
198 182218 : .write(LayerResidenceEvent::new(status, reason))
199 182218 : });
200 91109 : }
201 :
202 23938768 : fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
203 23938768 : if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
204 294052 : return;
205 23644716 : }
206 23644716 :
207 23644716 : let this_access = LayerAccessStatFullDetails {
208 23644716 : when: SystemTime::now(),
209 23644716 : task_kind: ctx.task_kind(),
210 23644716 : access_kind,
211 23644716 : };
212 23644716 :
213 23644716 : let mut locked = self.0.lock().unwrap();
214 47289436 : locked.iter_mut().for_each(|inner| {
215 47289436 : inner.first_access.get_or_insert(this_access);
216 47289436 : inner.count_by_access_kind[access_kind] += 1;
217 47289436 : inner.task_kind_flag |= ctx.task_kind();
218 47289436 : inner.last_accesses.write(this_access);
219 47289436 : })
220 23938768 : }
221 :
222 3014 : fn as_api_model(
223 3014 : &self,
224 3014 : reset: LayerAccessStatsReset,
225 3014 : ) -> pageserver_api::models::LayerAccessStats {
226 3014 : let mut locked = self.0.lock().unwrap();
227 3014 : let inner = &mut locked.for_scraping_api;
228 3014 : let LayerAccessStatsInner {
229 3014 : first_access,
230 3014 : count_by_access_kind,
231 3014 : task_kind_flag,
232 3014 : last_accesses,
233 3014 : last_residence_changes,
234 3014 : } = inner;
235 3014 : let ret = pageserver_api::models::LayerAccessStats {
236 3014 : access_count_by_access_kind: count_by_access_kind
237 3014 : .iter()
238 12056 : .map(|(kind, count)| (kind, *count))
239 3014 : .collect(),
240 3014 : task_kind_access_flag: task_kind_flag
241 3014 : .iter()
242 3014 : .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
243 3014 : .collect(),
244 3014 : first: first_access.as_ref().map(|a| a.as_api_model()),
245 16959 : accesses_history: last_accesses.map(|m| m.as_api_model()),
246 3014 : residence_events_history: last_residence_changes.clone(),
247 3014 : };
248 3014 : match reset {
249 3014 : LayerAccessStatsReset::NoReset => (),
250 0 : LayerAccessStatsReset::JustTaskKindFlags => {
251 0 : inner.task_kind_flag.clear();
252 0 : }
253 0 : LayerAccessStatsReset::AllStats => {
254 0 : *inner = LayerAccessStatsInner::default();
255 0 : }
256 : }
257 3014 : ret
258 3014 : }
259 :
260 : /// Get the latest access timestamp, falling back to latest residence event.
261 : ///
262 : /// This function can only return `None` if there has not yet been a call to the
263 : /// [`record_residence_event`] method. That would generally be considered an
264 : /// implementation error. This function logs a rate-limited warning in that case.
265 : ///
266 : /// TODO: use type system to avoid the need for `fallback`.
267 : /// The approach in <https://github.com/neondatabase/neon/pull/3775>
268 : /// could be used to enforce that a residence event is recorded
269 : /// before a layer is added to the layer map. We could also have
270 : /// a layer wrapper type that holds the LayerAccessStats, and ensure
271 : /// that that type can only be produced by inserting into the layer map.
272 : ///
273 : /// [`record_residence_event`]: Self::record_residence_event
274 4148 : pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
275 4148 : let locked = self.0.lock().unwrap();
276 4148 : let inner = &locked.for_eviction_policy;
277 4148 : match inner.last_accesses.recent() {
278 2171 : Some(a) => Some(a.when),
279 1977 : None => match inner.last_residence_changes.recent() {
280 1977 : Some(e) => Some(e.timestamp),
281 : None => {
282 : static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
283 0 : Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
284 0 : let mut guard = WARN_RATE_LIMIT.lock().unwrap();
285 0 : guard.0 += 1;
286 0 : let occurences = guard.0;
287 0 : guard.1.call(move || {
288 0 : warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
289 0 : });
290 0 : None
291 : }
292 : },
293 : }
294 4148 : }
295 : }
296 :
297 : /// Implemented by types that can expose a borrowed [`PersistentLayerDesc`].
298 : pub trait AsLayerDesc {
299 : /// Returns a reference to the layer descriptor of `self`.
300 : fn layer_desc(&self) -> &PersistentLayerDesc;
301 : }
302 :
303 : pub mod tests {
304 : use pageserver_api::shard::TenantShardId;
305 :
306 : use super::*;
307 :
308 : impl From<DeltaFileName> for PersistentLayerDesc {
309 0 : fn from(value: DeltaFileName) -> Self {
310 0 : PersistentLayerDesc::new_delta(
311 0 : TenantShardId::from([0; 18]),
312 0 : TimelineId::from_array([0; 16]),
313 0 : value.key_range,
314 0 : value.lsn_range,
315 0 : 233,
316 0 : )
317 0 : }
318 : }
319 :
320 : impl From<ImageFileName> for PersistentLayerDesc {
321 0 : fn from(value: ImageFileName) -> Self {
322 0 : PersistentLayerDesc::new_img(
323 0 : TenantShardId::from([0; 18]),
324 0 : TimelineId::from_array([0; 16]),
325 0 : value.key_range,
326 0 : value.lsn,
327 0 : 233,
328 0 : )
329 0 : }
330 : }
331 :
332 : impl From<LayerFileName> for PersistentLayerDesc {
333 0 : fn from(value: LayerFileName) -> Self {
334 0 : match value {
335 0 : LayerFileName::Delta(d) => Self::from(d),
336 0 : LayerFileName::Image(i) => Self::from(i),
337 : }
338 0 : }
339 : }
340 : }
341 :
/// Newtype around a borrowed range whose `Debug` output is built from the
/// `Display` form of the range's endpoints.
///
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);

impl<'a, T> std::fmt::Debug for RangeDisplayDebug<'a, T>
where
    T: std::fmt::Display,
{
    /// Writes `start..end`, rendering both endpoints with `Display`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Range { start, end } = self.0;
        f.write_fmt(format_args!("{}..{}", start, end))
    }
}
|