Line data Source code
1 : //! This module contains global `(tenant_id, timeline_id)` -> `Arc<Timeline>` mapping.
2 : //! All timelines should always be present in this map, this is done by loading them
3 : //! all from the disk on startup and keeping them in memory.
4 :
5 : use std::collections::HashMap;
6 : use std::str::FromStr;
7 : use std::sync::{Arc, Mutex};
8 : use std::time::{Duration, Instant};
9 :
10 : use anyhow::{Context, Result, bail};
11 : use camino::Utf8PathBuf;
12 : use camino_tempfile::Utf8TempDir;
13 : use safekeeper_api::membership::Configuration;
14 : use safekeeper_api::models::{SafekeeperUtilization, TimelineDeleteResult};
15 : use safekeeper_api::{ServerInfo, membership};
16 : use tokio::fs;
17 : use tracing::*;
18 : use utils::crashsafe::{durable_rename, fsync_async_opt};
19 : use utils::id::{TenantId, TenantTimelineId, TimelineId};
20 : use utils::lsn::Lsn;
21 :
22 : use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
23 : use crate::http::routes::DeleteOrExcludeError;
24 : use crate::rate_limit::RateLimiter;
25 : use crate::state::TimelinePersistentState;
26 : use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir};
27 : use crate::timelines_set::TimelinesSet;
28 : use crate::wal_storage::Storage;
29 : use crate::{SafeKeeperConf, control_file, wal_storage};
30 :
/// Timeline entry in the global map: either a ready timeline, or a marker
/// that it is being created.
#[derive(Clone)]
enum GlobalMapTimeline {
    /// Placeholder inserted by `load_temp_timeline` while the timeline
    /// directory is being moved into place and loaded.
    CreationInProgress,
    /// A timeline that has been loaded into memory.
    Timeline(Arc<Timeline>),
}
38 :
/// Mutable state behind the `GlobalTimelines` mutex.
struct GlobalTimelinesState {
    /// Map entries keyed by timeline id; an entry is either a loaded timeline
    /// or a creation-in-progress marker.
    timelines: HashMap<TenantTimelineId, GlobalMapTimeline>,

    // A tombstone indicates this timeline used to exist and has been deleted. These are used to
    // prevent on-demand timeline creation from recreating deleted timelines. This is only
    // soft-enforced, as this map is dropped on restart. The value is the instant at which the
    // tombstone was recorded.
    tombstones: HashMap<TenantTimelineId, Instant>,

    /// Safekeeper configuration, handed out to timeline constructors.
    conf: Arc<SafeKeeperConf>,
    /// Shared set passed to every timeline on bootstrap.
    broker_active_set: Arc<TimelinesSet>,
    /// Rate limiter passed to every timeline on bootstrap; `new` installs a
    /// placeholder which `init` replaces using the configured concurrency.
    global_rate_limiter: RateLimiter,
}
51 :
52 : impl GlobalTimelinesState {
53 : /// Get dependencies for a timeline constructor.
54 0 : fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
55 0 : (
56 0 : self.conf.clone(),
57 0 : self.broker_active_set.clone(),
58 0 : self.global_rate_limiter.clone(),
59 0 : )
60 0 : }
61 :
62 : /// Get timeline from the map. Returns error if timeline doesn't exist or
63 : /// creation is in progress.
64 0 : fn get(&self, ttid: &TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
65 0 : match self.timelines.get(ttid).cloned() {
66 0 : Some(GlobalMapTimeline::Timeline(tli)) => Ok(tli),
67 : Some(GlobalMapTimeline::CreationInProgress) => {
68 0 : Err(TimelineError::CreationInProgress(*ttid))
69 : }
70 0 : None => Err(TimelineError::NotFound(*ttid)),
71 : }
72 0 : }
73 :
74 0 : fn delete(&mut self, ttid: TenantTimelineId) {
75 0 : self.timelines.remove(&ttid);
76 0 : self.tombstones.insert(ttid, Instant::now());
77 0 : }
78 : }
79 :
/// A struct used to manage access to the global timelines map.
///
/// All state lives behind a single synchronous `std::sync::Mutex`; methods
/// lock it briefly and release it before doing IO.
pub struct GlobalTimelines {
    /// The map, tombstones and shared timeline dependencies.
    state: Mutex<GlobalTimelinesState>,
}
84 :
85 : impl GlobalTimelines {
86 : /// Create a new instance of the global timelines map.
87 0 : pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
88 0 : Self {
89 0 : state: Mutex::new(GlobalTimelinesState {
90 0 : timelines: HashMap::new(),
91 0 : tombstones: HashMap::new(),
92 0 : conf,
93 0 : broker_active_set: Arc::new(TimelinesSet::default()),
94 0 : global_rate_limiter: RateLimiter::new(1, 1),
95 0 : }),
96 0 : }
97 0 : }
98 :
99 : /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
100 0 : pub async fn init(&self) -> Result<()> {
101 0 : // clippy isn't smart enough to understand that drop(state) releases the
102 0 : // lock, so use explicit block
103 0 : let tenants_dir = {
104 0 : let mut state = self.state.lock().unwrap();
105 0 : state.global_rate_limiter = RateLimiter::new(
106 0 : state.conf.partial_backup_concurrency,
107 0 : DEFAULT_EVICTION_CONCURRENCY,
108 0 : );
109 0 :
110 0 : // Iterate through all directories and load tenants for all directories
111 0 : // named as a valid tenant_id.
112 0 : state.conf.workdir.clone()
113 0 : };
114 0 : let mut tenant_count = 0;
115 0 : for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
116 0 : .with_context(|| format!("failed to list tenants dir {}", tenants_dir))?
117 : {
118 0 : match &tenants_dir_entry {
119 0 : Ok(tenants_dir_entry) => {
120 0 : if let Ok(tenant_id) =
121 0 : TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
122 : {
123 0 : tenant_count += 1;
124 0 : self.load_tenant_timelines(tenant_id).await?;
125 0 : }
126 : }
127 0 : Err(e) => error!(
128 0 : "failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
129 : tenants_dir_entry, tenants_dir, e
130 : ),
131 : }
132 : }
133 :
134 0 : info!(
135 0 : "found {} tenants directories, successfully loaded {} timelines",
136 0 : tenant_count,
137 0 : self.state.lock().unwrap().timelines.len()
138 : );
139 0 : Ok(())
140 0 : }
141 :
142 : /// Loads all timelines for the given tenant to memory. Returns fs::read_dir
143 : /// errors if any.
144 : ///
145 : /// It is async, but self.state lock is sync and there is no important
146 : /// reason to make it async (it is always held for a short while), so we
147 : /// just lock and unlock it for each timeline -- this function is called
148 : /// during init when nothing else is running, so this is fine.
149 0 : async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
150 0 : let (conf, broker_active_set, partial_backup_rate_limiter) = {
151 0 : let state = self.state.lock().unwrap();
152 0 : state.get_dependencies()
153 0 : };
154 0 :
155 0 : let timelines_dir = get_tenant_dir(&conf, &tenant_id);
156 0 : for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
157 0 : .with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
158 : {
159 0 : match &timelines_dir_entry {
160 0 : Ok(timeline_dir_entry) => {
161 0 : if let Ok(timeline_id) =
162 0 : TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
163 : {
164 0 : let ttid = TenantTimelineId::new(tenant_id, timeline_id);
165 0 : match Timeline::load_timeline(conf.clone(), ttid) {
166 0 : Ok(tli) => {
167 0 : let mut shared_state = tli.write_shared_state().await;
168 0 : self.state
169 0 : .lock()
170 0 : .unwrap()
171 0 : .timelines
172 0 : .insert(ttid, GlobalMapTimeline::Timeline(tli.clone()));
173 0 : tli.bootstrap(
174 0 : &mut shared_state,
175 0 : &conf,
176 0 : broker_active_set.clone(),
177 0 : partial_backup_rate_limiter.clone(),
178 0 : );
179 : }
180 : // If we can't load a timeline, it's most likely because of a corrupted
181 : // directory. We will log an error and won't allow to delete/recreate
182 : // this timeline. The only way to fix this timeline is to repair manually
183 : // and restart the safekeeper.
184 0 : Err(e) => error!(
185 0 : "failed to load timeline {} for tenant {}, reason: {:?}",
186 : timeline_id, tenant_id, e
187 : ),
188 : }
189 0 : }
190 : }
191 0 : Err(e) => error!(
192 0 : "failed to list timelines dir entry {:?} in directory {}, reason: {:?}",
193 : timelines_dir_entry, timelines_dir, e
194 : ),
195 : }
196 : }
197 :
198 0 : Ok(())
199 0 : }
200 :
201 : /// Get the number of timelines in the map.
202 0 : pub fn timelines_count(&self) -> usize {
203 0 : self.state.lock().unwrap().timelines.len()
204 0 : }
205 :
206 : /// Get the global safekeeper config.
207 0 : pub fn get_global_config(&self) -> Arc<SafeKeeperConf> {
208 0 : self.state.lock().unwrap().conf.clone()
209 0 : }
210 :
211 0 : pub fn get_global_broker_active_set(&self) -> Arc<TimelinesSet> {
212 0 : self.state.lock().unwrap().broker_active_set.clone()
213 0 : }
214 :
215 : /// Create a new timeline with the given id. If the timeline already exists, returns
216 : /// an existing timeline.
217 0 : pub(crate) async fn create(
218 0 : &self,
219 0 : ttid: TenantTimelineId,
220 0 : mconf: Configuration,
221 0 : server_info: ServerInfo,
222 0 : start_lsn: Lsn,
223 0 : commit_lsn: Lsn,
224 0 : ) -> Result<Arc<Timeline>> {
225 0 : let (conf, _, _) = {
226 0 : let state = self.state.lock().unwrap();
227 0 : if let Ok(timeline) = state.get(&ttid) {
228 : // Timeline already exists, return it.
229 0 : return Ok(timeline);
230 0 : }
231 0 :
232 0 : if state.tombstones.contains_key(&ttid) {
233 0 : anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate");
234 0 : }
235 0 :
236 0 : state.get_dependencies()
237 0 : };
238 0 :
239 0 : info!("creating new timeline {}", ttid);
240 :
241 : // Do on disk initialization in tmp dir.
242 0 : let (_tmp_dir, tmp_dir_path) = create_temp_timeline_dir(&conf, ttid).await?;
243 :
244 : // TODO: currently we create only cfile. It would be reasonable to
245 : // immediately initialize first WAL segment as well.
246 0 : let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?;
247 0 : control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
248 0 : let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
249 0 : Ok(timeline)
250 0 : }
251 :
/// Move timeline from a temp directory to the main storage, and load it to
/// the global map. Creating timeline in this way ensures atomicity: rename
/// is atomic, so either move of the whole datadir succeeds or it doesn't,
/// but corrupted data dir shouldn't be possible.
///
/// We'd like to avoid holding map lock while doing IO, so it's a 3 step
/// process:
/// 1) check the global map that timeline doesn't exist and mark that we're
///    creating it;
/// 2) move the directory and load the timeline
/// 3) take lock again and insert the timeline into the global map.
///
/// `check_tombstone` selects how an existing tombstone is treated: when
/// `true` the call is refused, when `false` the tombstone is removed and the
/// timeline is loaded anyway.
///
/// # Errors
/// Returns `TimelineError::CreationInProgress` / `TimelineError::AlreadyExists`
/// if the map already has an entry for `ttid`, an error if a tombstone blocks
/// the call, or whatever `install_temp_timeline` fails with (in which case
/// the creation marker is removed again).
///
/// # Panics
/// Panics if, after the install step, the map entry for `ttid` is no longer
/// the creation marker inserted in step 1.
pub async fn load_temp_timeline(
    &self,
    ttid: TenantTimelineId,
    tmp_path: &Utf8PathBuf,
    check_tombstone: bool,
) -> Result<Arc<Timeline>> {
    // Check for existence and mark that we're creating it.
    let (conf, broker_active_set, partial_backup_rate_limiter) = {
        let mut state = self.state.lock().unwrap();
        match state.timelines.get(&ttid) {
            Some(GlobalMapTimeline::CreationInProgress) => {
                bail!(TimelineError::CreationInProgress(ttid));
            }
            Some(GlobalMapTimeline::Timeline(_)) => {
                bail!(TimelineError::AlreadyExists(ttid));
            }
            _ => {}
        }
        if check_tombstone {
            if state.tombstones.contains_key(&ttid) {
                anyhow::bail!("timeline {ttid} is deleted, refusing to recreate");
            }
        } else {
            // We may have been asked to load a timeline that was previously deleted (e.g. from
            // `pull_timeline.rs`). We trust that the human doing this manual intervention knows
            // what they are doing, and remove its tombstone.
            if state.tombstones.remove(&ttid).is_some() {
                warn!("un-deleted timeline {ttid}");
            }
        }
        // From here on every exit path must either replace or remove this marker.
        state
            .timelines
            .insert(ttid, GlobalMapTimeline::CreationInProgress);
        state.get_dependencies()
    };

    // Do the actual move and reflect the result in the map.
    match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
        Ok(timeline) => {
            // Acquire the timeline's own lock before publishing it in the map and
            // keep it until bootstrap has run.
            let mut timeline_shared_state = timeline.write_shared_state().await;
            let mut state = self.state.lock().unwrap();
            assert!(matches!(
                state.timelines.get(&ttid),
                Some(GlobalMapTimeline::CreationInProgress)
            ));

            state
                .timelines
                .insert(ttid, GlobalMapTimeline::Timeline(timeline.clone()));
            // Release the map lock before bootstrapping.
            drop(state);
            timeline.bootstrap(
                &mut timeline_shared_state,
                &conf,
                broker_active_set,
                partial_backup_rate_limiter,
            );
            drop(timeline_shared_state);
            Ok(timeline)
        }
        Err(e) => {
            // Init failed, remove the marker from the map
            let mut state = self.state.lock().unwrap();
            assert!(matches!(
                state.timelines.get(&ttid),
                Some(GlobalMapTimeline::CreationInProgress)
            ));
            state.timelines.remove(&ttid);
            Err(e)
        }
    }
}
333 :
334 : /// Main part of load_temp_timeline: do the move and load.
335 0 : async fn install_temp_timeline(
336 0 : ttid: TenantTimelineId,
337 0 : tmp_path: &Utf8PathBuf,
338 0 : conf: Arc<SafeKeeperConf>,
339 0 : ) -> Result<Arc<Timeline>> {
340 0 : let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
341 0 : let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
342 0 :
343 0 : // We must have already checked that timeline doesn't exist in the map,
344 0 : // but there might be existing datadir: if timeline is corrupted it is
345 0 : // not loaded. We don't want to overwrite such a dir, so check for its
346 0 : // existence.
347 0 : match fs::metadata(&timeline_path).await {
348 : Ok(_) => {
349 : // Timeline directory exists on disk, we should leave state unchanged
350 : // and return error.
351 0 : bail!(TimelineError::Invalid(ttid));
352 : }
353 0 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
354 0 : Err(e) => {
355 0 : return Err(e.into());
356 : }
357 : }
358 :
359 0 : info!(
360 0 : "moving timeline {} from {} to {}",
361 : ttid, tmp_path, timeline_path
362 : );
363 :
364 : // Now it is safe to move the timeline directory to the correct
365 : // location. First, create tenant directory. Ignore error if it already
366 : // exists.
367 0 : if let Err(e) = tokio::fs::create_dir(&tenant_path).await {
368 0 : if e.kind() != std::io::ErrorKind::AlreadyExists {
369 0 : return Err(e.into());
370 0 : }
371 0 : }
372 : // fsync it
373 0 : fsync_async_opt(&tenant_path, !conf.no_sync).await?;
374 : // and its creation
375 0 : fsync_async_opt(&conf.workdir, !conf.no_sync).await?;
376 :
377 : // Do the move.
378 0 : durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?;
379 :
380 0 : Timeline::load_timeline(conf, ttid)
381 0 : }
382 :
383 : /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
384 : /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
385 : /// i.e. loaded in memory and not cancelled.
386 0 : pub(crate) fn get(&self, ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
387 0 : let tli_res = {
388 0 : let state = self.state.lock().unwrap();
389 0 : state.get(&ttid)
390 0 : };
391 0 : match tli_res {
392 0 : Ok(tli) => {
393 0 : if tli.is_cancelled() {
394 0 : return Err(TimelineError::Cancelled(ttid));
395 0 : }
396 0 : Ok(tli)
397 : }
398 0 : _ => tli_res,
399 : }
400 0 : }
401 :
402 : /// Returns all timelines. This is used for background timeline processes.
403 0 : pub fn get_all(&self) -> Vec<Arc<Timeline>> {
404 0 : let global_lock = self.state.lock().unwrap();
405 0 : global_lock
406 0 : .timelines
407 0 : .values()
408 0 : .filter_map(|t| match t {
409 0 : GlobalMapTimeline::Timeline(t) => {
410 0 : if t.is_cancelled() {
411 0 : None
412 : } else {
413 0 : Some(t.clone())
414 : }
415 : }
416 0 : _ => None,
417 0 : })
418 0 : .collect()
419 0 : }
420 :
421 : /// Returns statistics about timeline counts
422 0 : pub fn get_timeline_counts(&self) -> SafekeeperUtilization {
423 0 : let global_lock = self.state.lock().unwrap();
424 0 : let timeline_count = global_lock
425 0 : .timelines
426 0 : .values()
427 0 : .filter(|t| match t {
428 0 : GlobalMapTimeline::CreationInProgress => false,
429 0 : GlobalMapTimeline::Timeline(t) => !t.is_cancelled(),
430 0 : })
431 0 : .count() as u64;
432 0 : SafekeeperUtilization { timeline_count }
433 0 : }
434 :
435 : /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
436 : /// and that's why it can return cancelled timelines, to retry deleting them.
437 0 : fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
438 0 : let global_lock = self.state.lock().unwrap();
439 0 : global_lock
440 0 : .timelines
441 0 : .values()
442 0 : .filter_map(|t| match t {
443 0 : GlobalMapTimeline::Timeline(t) => Some(t.clone()),
444 0 : _ => None,
445 0 : })
446 0 : .filter(|t| t.ttid.tenant_id == tenant_id)
447 0 : .collect()
448 0 : }
449 :
/// Delete timeline, only locally on this node or globally (also cleaning
/// remote storage WAL), depending on `action` value.
///
/// If a tombstone already exists for `ttid`, returns immediately with
/// `dir_existed: false`. On success a tombstone is recorded and the map entry
/// is dropped. On any error the function returns early, before the tombstone
/// is written.
///
/// # Errors
/// Returns `DeleteOrExcludeError::Conflict` when excluding and the timeline's
/// current membership generation is greater than the requested one, and
/// propagates failures from the on-disk deletion.
pub(crate) async fn delete_or_exclude(
    &self,
    ttid: &TenantTimelineId,
    action: DeleteOrExclude,
) -> Result<TimelineDeleteResult, DeleteOrExcludeError> {
    let tli_res = {
        let state = self.state.lock().unwrap();

        if state.tombstones.contains_key(ttid) {
            // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
            info!("Timeline {ttid} was already deleted");
            return Ok(TimelineDeleteResult { dir_existed: false });
        }

        state.get(ttid)
    };

    let result = match tli_res {
        Ok(timeline) => {
            info!("deleting timeline {}, action={:?}", ttid, action);

            // If node is getting excluded, check the generation first.
            // Then, while holding the lock cancel the timeline; it will be
            // unusable after this point, and if node is added back first
            // deletion must be completed and node seeded anew.
            //
            // We would like to avoid holding the lock while waiting for the
            // gate to finish as this is deadlock prone, so for actual
            // deletion will take it second time.
            if let DeleteOrExclude::Exclude(ref mconf) = action {
                // This read guard stays alive until the end of the branch, i.e.
                // across the cancel() call below.
                let shared_state = timeline.read_shared_state().await;
                if shared_state.sk.state().mconf.generation > mconf.generation {
                    return Err(DeleteOrExcludeError::Conflict {
                        requested: mconf.clone(),
                        current: shared_state.sk.state().mconf.clone(),
                    });
                }
                timeline.cancel().await;
            } else {
                timeline.cancel().await;
            }

            timeline.close().await;

            info!("timeline {ttid} shut down for deletion");

            // Take a lock and finish the deletion holding this mutex.
            let mut shared_state = timeline.write_shared_state().await;

            // Only `Delete` is global; `DeleteLocal` and `Exclude` are local-only.
            let only_local = !matches!(action, DeleteOrExclude::Delete);
            let dir_existed = timeline.delete(&mut shared_state, only_local).await?;

            Ok(TimelineDeleteResult { dir_existed })
        }
        Err(_) => {
            // Timeline is not in memory, but it may still exist on disk in broken state.
            // NOTE(review): this arm is also taken for `CreationInProgress`; the
            // `delete` call below then removes the marker that `load_temp_timeline`
            // asserts on -- confirm whether that race is reachable.
            let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid);
            let dir_existed = delete_dir(&dir_path).await?;

            Ok(TimelineDeleteResult { dir_existed })
        }
    };

    // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones
    // are used to prevent still-running computes from re-creating the same timeline when they send data,
    // and to speed up repeated deletion calls by avoiding re-listing objects.
    self.state.lock().unwrap().delete(*ttid);

    result
}
522 :
523 : /// Deactivates and deletes all timelines for the tenant. Returns map of all timelines which
524 : /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
525 : /// created simultaneously. In that case the function will return error and the caller should
526 : /// retry tenant deletion again later.
527 : ///
528 : /// If only_local, doesn't remove WAL segments in remote storage.
529 0 : pub async fn delete_all_for_tenant(
530 0 : &self,
531 0 : tenant_id: &TenantId,
532 0 : action: DeleteOrExclude,
533 0 : ) -> Result<HashMap<TenantTimelineId, TimelineDeleteResult>> {
534 0 : info!("deleting all timelines for tenant {}", tenant_id);
535 0 : let to_delete = self.get_all_for_tenant(*tenant_id);
536 0 :
537 0 : let mut err = None;
538 0 :
539 0 : let mut deleted = HashMap::new();
540 0 : for tli in &to_delete {
541 0 : match self.delete_or_exclude(&tli.ttid, action.clone()).await {
542 0 : Ok(result) => {
543 0 : deleted.insert(tli.ttid, result);
544 0 : }
545 0 : Err(e) => {
546 0 : error!("failed to delete timeline {}: {}", tli.ttid, e);
547 : // Save error to return later.
548 0 : err = Some(e);
549 : }
550 : }
551 : }
552 :
553 : // If there was an error, return it.
554 0 : if let Some(e) = err {
555 0 : return Err(anyhow::Error::from(e));
556 0 : }
557 0 :
558 0 : // There may be broken timelines on disk, so delete the whole tenant dir as well.
559 0 : // Note that we could concurrently create new timelines while we were deleting them,
560 0 : // so the directory may be not empty. In this case timelines will have bad state
561 0 : // and timeline background jobs can panic.
562 0 : let tenant_dir = get_tenant_dir(self.state.lock().unwrap().conf.as_ref(), tenant_id);
563 0 : delete_dir(&tenant_dir).await?;
564 :
565 0 : Ok(deleted)
566 0 : }
567 :
568 0 : pub fn housekeeping(&self, tombstone_ttl: &Duration) {
569 0 : let mut state = self.state.lock().unwrap();
570 0 :
571 0 : // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted
572 0 : // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they
573 0 : // may recreate a deleted timeline.
574 0 : let now = Instant::now();
575 0 : state
576 0 : .tombstones
577 0 : .retain(|_, v| now.duration_since(*v) < *tombstone_ttl);
578 0 : }
579 : }
580 :
/// Action for delete_or_exclude.
///
/// Only `Delete` is treated as a global deletion; the other two variants are
/// passed to the timeline as local-only deletions.
#[derive(Clone, Debug)]
pub enum DeleteOrExclude {
    /// Delete timeline globally.
    Delete,
    /// Legacy mode until we fully migrate to generations: like exclude deletes
    /// timeline only locally, but ignores generation number.
    DeleteLocal,
    /// This node is getting excluded, delete timeline locally. The request is
    /// rejected with a conflict if the timeline's current generation is
    /// greater than the generation of the carried configuration.
    Exclude(membership::Configuration),
}
592 :
593 : /// Create temp directory for a new timeline. It needs to be located on the same
594 : /// filesystem as the rest of the timelines. It will be automatically deleted when
595 : /// Utf8TempDir goes out of scope.
596 0 : pub async fn create_temp_timeline_dir(
597 0 : conf: &SafeKeeperConf,
598 0 : ttid: TenantTimelineId,
599 0 : ) -> Result<(Utf8TempDir, Utf8PathBuf)> {
600 0 : let temp_base = conf.workdir.join("tmp");
601 0 :
602 0 : tokio::fs::create_dir_all(&temp_base).await?;
603 :
604 0 : let tli_dir = camino_tempfile::Builder::new()
605 0 : .suffix("_temptli")
606 0 : .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
607 0 : .tempdir_in(temp_base)?;
608 :
609 0 : let tli_dir_path = tli_dir.path().to_path_buf();
610 0 :
611 0 : Ok((tli_dir, tli_dir_path))
612 0 : }
613 :
614 : /// Do basic validation of a temp timeline, before moving it to the global map.
615 0 : pub async fn validate_temp_timeline(
616 0 : conf: &SafeKeeperConf,
617 0 : ttid: TenantTimelineId,
618 0 : path: &Utf8PathBuf,
619 0 : ) -> Result<(Lsn, Lsn)> {
620 0 : let control_path = path.join("safekeeper.control");
621 :
622 0 : let control_store = control_file::FileStorage::load_control_file(control_path)?;
623 0 : if control_store.server.wal_seg_size == 0 {
624 0 : bail!("wal_seg_size is not set");
625 0 : }
626 :
627 0 : let wal_store = wal_storage::PhysicalStorage::new(&ttid, path, &control_store, conf.no_sync)?;
628 :
629 0 : let commit_lsn = control_store.commit_lsn;
630 0 : let flush_lsn = wal_store.flush_lsn();
631 0 :
632 0 : Ok((commit_lsn, flush_lsn))
633 0 : }
|