Line data Source code
1 : //! This module contains global `(tenant_id, timeline_id)` -> `Arc<Timeline>` mapping.
2 : //! All timelines should always be present in this map, this is done by loading them
3 : //! all from the disk on startup and keeping them in memory.
4 :
5 : use std::collections::HashMap;
6 : use std::str::FromStr;
7 : use std::sync::{Arc, Mutex};
8 : use std::time::{Duration, Instant};
9 :
10 : use anyhow::{Context, Result, bail};
11 : use camino::Utf8PathBuf;
12 : use camino_tempfile::Utf8TempDir;
13 : use safekeeper_api::membership::Configuration;
14 : use safekeeper_api::models::SafekeeperUtilization;
15 : use safekeeper_api::{ServerInfo, membership};
16 : use serde::Serialize;
17 : use tokio::fs;
18 : use tracing::*;
19 : use utils::crashsafe::{durable_rename, fsync_async_opt};
20 : use utils::id::{TenantId, TenantTimelineId, TimelineId};
21 : use utils::lsn::Lsn;
22 :
23 : use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
24 : use crate::http::routes::DeleteOrExcludeError;
25 : use crate::rate_limit::RateLimiter;
26 : use crate::state::TimelinePersistentState;
27 : use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir};
28 : use crate::timelines_set::TimelinesSet;
29 : use crate::wal_storage::Storage;
30 : use crate::{SafeKeeperConf, control_file, wal_storage};
31 :
/// Timeline entry in the global map: either a ready timeline, or a marker
/// that it is being created.
#[derive(Clone)]
enum GlobalMapTimeline {
    /// Placeholder stored while the timeline is being installed; lookups
    /// report `TimelineError::CreationInProgress` for such entries.
    CreationInProgress,
    /// A timeline that has been loaded into memory.
    Timeline(Arc<Timeline>),
}
39 :
/// Contents of the global map; `GlobalTimelines` keeps it behind a mutex.
struct GlobalTimelinesState {
    // Known timelines (ready or being created), keyed by tenant and timeline id.
    timelines: HashMap<TenantTimelineId, GlobalMapTimeline>,

    // A tombstone indicates that this timeline used to exist and has been deleted.
    // These are used to prevent on-demand timeline creation from recreating deleted
    // timelines. This is only soft-enforced, as this map is dropped on restart.
    // The value is the instant at which the tombstone was recorded.
    tombstones: HashMap<TenantTimelineId, Instant>,

    // Dependencies handed out to timeline constructors via `get_dependencies`.
    conf: Arc<SafeKeeperConf>,
    broker_active_set: Arc<TimelinesSet>,
    global_rate_limiter: RateLimiter,
}
52 :
53 : impl GlobalTimelinesState {
54 : /// Get dependencies for a timeline constructor.
55 0 : fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
56 0 : (
57 0 : self.conf.clone(),
58 0 : self.broker_active_set.clone(),
59 0 : self.global_rate_limiter.clone(),
60 0 : )
61 0 : }
62 :
63 : /// Get timeline from the map. Returns error if timeline doesn't exist or
64 : /// creation is in progress.
65 0 : fn get(&self, ttid: &TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
66 0 : match self.timelines.get(ttid).cloned() {
67 0 : Some(GlobalMapTimeline::Timeline(tli)) => Ok(tli),
68 : Some(GlobalMapTimeline::CreationInProgress) => {
69 0 : Err(TimelineError::CreationInProgress(*ttid))
70 : }
71 0 : None => Err(TimelineError::NotFound(*ttid)),
72 : }
73 0 : }
74 :
75 0 : fn delete(&mut self, ttid: TenantTimelineId) {
76 0 : self.timelines.remove(&ttid);
77 0 : self.tombstones.insert(ttid, Instant::now());
78 0 : }
79 : }
80 :
/// A struct used to manage access to the global timelines map.
pub struct GlobalTimelines {
    // All map state lives behind a single synchronous mutex.
    state: Mutex<GlobalTimelinesState>,
}
85 :
86 : impl GlobalTimelines {
87 : /// Create a new instance of the global timelines map.
88 0 : pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
89 0 : Self {
90 0 : state: Mutex::new(GlobalTimelinesState {
91 0 : timelines: HashMap::new(),
92 0 : tombstones: HashMap::new(),
93 0 : conf,
94 0 : broker_active_set: Arc::new(TimelinesSet::default()),
95 0 : global_rate_limiter: RateLimiter::new(1, 1),
96 0 : }),
97 0 : }
98 0 : }
99 :
100 : /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
101 0 : pub async fn init(&self) -> Result<()> {
102 0 : // clippy isn't smart enough to understand that drop(state) releases the
103 0 : // lock, so use explicit block
104 0 : let tenants_dir = {
105 0 : let mut state = self.state.lock().unwrap();
106 0 : state.global_rate_limiter = RateLimiter::new(
107 0 : state.conf.partial_backup_concurrency,
108 0 : DEFAULT_EVICTION_CONCURRENCY,
109 0 : );
110 0 :
111 0 : // Iterate through all directories and load tenants for all directories
112 0 : // named as a valid tenant_id.
113 0 : state.conf.workdir.clone()
114 0 : };
115 0 : let mut tenant_count = 0;
116 0 : for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
117 0 : .with_context(|| format!("failed to list tenants dir {}", tenants_dir))?
118 : {
119 0 : match &tenants_dir_entry {
120 0 : Ok(tenants_dir_entry) => {
121 0 : if let Ok(tenant_id) =
122 0 : TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
123 : {
124 0 : tenant_count += 1;
125 0 : self.load_tenant_timelines(tenant_id).await?;
126 0 : }
127 : }
128 0 : Err(e) => error!(
129 0 : "failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
130 : tenants_dir_entry, tenants_dir, e
131 : ),
132 : }
133 : }
134 :
135 0 : info!(
136 0 : "found {} tenants directories, successfully loaded {} timelines",
137 0 : tenant_count,
138 0 : self.state.lock().unwrap().timelines.len()
139 : );
140 0 : Ok(())
141 0 : }
142 :
143 : /// Loads all timelines for the given tenant to memory. Returns fs::read_dir
144 : /// errors if any.
145 : ///
146 : /// It is async, but self.state lock is sync and there is no important
147 : /// reason to make it async (it is always held for a short while), so we
148 : /// just lock and unlock it for each timeline -- this function is called
149 : /// during init when nothing else is running, so this is fine.
150 0 : async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
151 0 : let (conf, broker_active_set, partial_backup_rate_limiter) = {
152 0 : let state = self.state.lock().unwrap();
153 0 : state.get_dependencies()
154 0 : };
155 0 :
156 0 : let timelines_dir = get_tenant_dir(&conf, &tenant_id);
157 0 : for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
158 0 : .with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
159 : {
160 0 : match &timelines_dir_entry {
161 0 : Ok(timeline_dir_entry) => {
162 0 : if let Ok(timeline_id) =
163 0 : TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
164 : {
165 0 : let ttid = TenantTimelineId::new(tenant_id, timeline_id);
166 0 : match Timeline::load_timeline(conf.clone(), ttid) {
167 0 : Ok(tli) => {
168 0 : let mut shared_state = tli.write_shared_state().await;
169 0 : self.state
170 0 : .lock()
171 0 : .unwrap()
172 0 : .timelines
173 0 : .insert(ttid, GlobalMapTimeline::Timeline(tli.clone()));
174 0 : tli.bootstrap(
175 0 : &mut shared_state,
176 0 : &conf,
177 0 : broker_active_set.clone(),
178 0 : partial_backup_rate_limiter.clone(),
179 0 : );
180 : }
181 : // If we can't load a timeline, it's most likely because of a corrupted
182 : // directory. We will log an error and won't allow to delete/recreate
183 : // this timeline. The only way to fix this timeline is to repair manually
184 : // and restart the safekeeper.
185 0 : Err(e) => error!(
186 0 : "failed to load timeline {} for tenant {}, reason: {:?}",
187 : timeline_id, tenant_id, e
188 : ),
189 : }
190 0 : }
191 : }
192 0 : Err(e) => error!(
193 0 : "failed to list timelines dir entry {:?} in directory {}, reason: {:?}",
194 : timelines_dir_entry, timelines_dir, e
195 : ),
196 : }
197 : }
198 :
199 0 : Ok(())
200 0 : }
201 :
202 : /// Get the number of timelines in the map.
203 0 : pub fn timelines_count(&self) -> usize {
204 0 : self.state.lock().unwrap().timelines.len()
205 0 : }
206 :
207 : /// Get the global safekeeper config.
208 0 : pub fn get_global_config(&self) -> Arc<SafeKeeperConf> {
209 0 : self.state.lock().unwrap().conf.clone()
210 0 : }
211 :
212 0 : pub fn get_global_broker_active_set(&self) -> Arc<TimelinesSet> {
213 0 : self.state.lock().unwrap().broker_active_set.clone()
214 0 : }
215 :
216 : /// Create a new timeline with the given id. If the timeline already exists, returns
217 : /// an existing timeline.
218 0 : pub(crate) async fn create(
219 0 : &self,
220 0 : ttid: TenantTimelineId,
221 0 : mconf: Configuration,
222 0 : server_info: ServerInfo,
223 0 : start_lsn: Lsn,
224 0 : commit_lsn: Lsn,
225 0 : ) -> Result<Arc<Timeline>> {
226 0 : let (conf, _, _) = {
227 0 : let state = self.state.lock().unwrap();
228 0 : if let Ok(timeline) = state.get(&ttid) {
229 : // Timeline already exists, return it.
230 0 : return Ok(timeline);
231 0 : }
232 0 :
233 0 : if state.tombstones.contains_key(&ttid) {
234 0 : anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate");
235 0 : }
236 0 :
237 0 : state.get_dependencies()
238 0 : };
239 0 :
240 0 : info!("creating new timeline {}", ttid);
241 :
242 : // Do on disk initialization in tmp dir.
243 0 : let (_tmp_dir, tmp_dir_path) = create_temp_timeline_dir(&conf, ttid).await?;
244 :
245 : // TODO: currently we create only cfile. It would be reasonable to
246 : // immediately initialize first WAL segment as well.
247 0 : let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?;
248 0 : control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
249 0 : let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
250 0 : Ok(timeline)
251 0 : }
252 :
253 : /// Move timeline from a temp directory to the main storage, and load it to
254 : /// the global map. Creating timeline in this way ensures atomicity: rename
255 : /// is atomic, so either move of the whole datadir succeeds or it doesn't,
256 : /// but corrupted data dir shouldn't be possible.
257 : ///
258 : /// We'd like to avoid holding map lock while doing IO, so it's a 3 step
259 : /// process:
260 : /// 1) check the global map that timeline doesn't exist and mark that we're
261 : /// creating it;
262 : /// 2) move the directory and load the timeline
263 : /// 3) take lock again and insert the timeline into the global map.
264 0 : pub async fn load_temp_timeline(
265 0 : &self,
266 0 : ttid: TenantTimelineId,
267 0 : tmp_path: &Utf8PathBuf,
268 0 : check_tombstone: bool,
269 0 : ) -> Result<Arc<Timeline>> {
270 : // Check for existence and mark that we're creating it.
271 0 : let (conf, broker_active_set, partial_backup_rate_limiter) = {
272 0 : let mut state = self.state.lock().unwrap();
273 0 : match state.timelines.get(&ttid) {
274 : Some(GlobalMapTimeline::CreationInProgress) => {
275 0 : bail!(TimelineError::CreationInProgress(ttid));
276 : }
277 : Some(GlobalMapTimeline::Timeline(_)) => {
278 0 : bail!(TimelineError::AlreadyExists(ttid));
279 : }
280 0 : _ => {}
281 0 : }
282 0 : if check_tombstone {
283 0 : if state.tombstones.contains_key(&ttid) {
284 0 : anyhow::bail!("timeline {ttid} is deleted, refusing to recreate");
285 0 : }
286 : } else {
287 : // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust
288 : // that the human doing this manual intervention knows what they are doing, and remove its tombstone.
289 0 : if state.tombstones.remove(&ttid).is_some() {
290 0 : warn!("un-deleted timeline {ttid}");
291 0 : }
292 : }
293 0 : state
294 0 : .timelines
295 0 : .insert(ttid, GlobalMapTimeline::CreationInProgress);
296 0 : state.get_dependencies()
297 0 : };
298 0 :
299 0 : // Do the actual move and reflect the result in the map.
300 0 : match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
301 0 : Ok(timeline) => {
302 0 : let mut timeline_shared_state = timeline.write_shared_state().await;
303 0 : let mut state = self.state.lock().unwrap();
304 0 : assert!(matches!(
305 0 : state.timelines.get(&ttid),
306 : Some(GlobalMapTimeline::CreationInProgress)
307 : ));
308 :
309 0 : state
310 0 : .timelines
311 0 : .insert(ttid, GlobalMapTimeline::Timeline(timeline.clone()));
312 0 : drop(state);
313 0 : timeline.bootstrap(
314 0 : &mut timeline_shared_state,
315 0 : &conf,
316 0 : broker_active_set,
317 0 : partial_backup_rate_limiter,
318 0 : );
319 0 : drop(timeline_shared_state);
320 0 : Ok(timeline)
321 : }
322 0 : Err(e) => {
323 0 : // Init failed, remove the marker from the map
324 0 : let mut state = self.state.lock().unwrap();
325 0 : assert!(matches!(
326 0 : state.timelines.get(&ttid),
327 : Some(GlobalMapTimeline::CreationInProgress)
328 : ));
329 0 : state.timelines.remove(&ttid);
330 0 : Err(e)
331 : }
332 : }
333 0 : }
334 :
335 : /// Main part of load_temp_timeline: do the move and load.
336 0 : async fn install_temp_timeline(
337 0 : ttid: TenantTimelineId,
338 0 : tmp_path: &Utf8PathBuf,
339 0 : conf: Arc<SafeKeeperConf>,
340 0 : ) -> Result<Arc<Timeline>> {
341 0 : let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
342 0 : let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
343 0 :
344 0 : // We must have already checked that timeline doesn't exist in the map,
345 0 : // but there might be existing datadir: if timeline is corrupted it is
346 0 : // not loaded. We don't want to overwrite such a dir, so check for its
347 0 : // existence.
348 0 : match fs::metadata(&timeline_path).await {
349 : Ok(_) => {
350 : // Timeline directory exists on disk, we should leave state unchanged
351 : // and return error.
352 0 : bail!(TimelineError::Invalid(ttid));
353 : }
354 0 : Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
355 0 : Err(e) => {
356 0 : return Err(e.into());
357 : }
358 : }
359 :
360 0 : info!(
361 0 : "moving timeline {} from {} to {}",
362 : ttid, tmp_path, timeline_path
363 : );
364 :
365 : // Now it is safe to move the timeline directory to the correct
366 : // location. First, create tenant directory. Ignore error if it already
367 : // exists.
368 0 : if let Err(e) = tokio::fs::create_dir(&tenant_path).await {
369 0 : if e.kind() != std::io::ErrorKind::AlreadyExists {
370 0 : return Err(e.into());
371 0 : }
372 0 : }
373 : // fsync it
374 0 : fsync_async_opt(&tenant_path, !conf.no_sync).await?;
375 : // and its creation
376 0 : fsync_async_opt(&conf.workdir, !conf.no_sync).await?;
377 :
378 : // Do the move.
379 0 : durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?;
380 :
381 0 : Timeline::load_timeline(conf, ttid)
382 0 : }
383 :
384 : /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
385 : /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
386 : /// i.e. loaded in memory and not cancelled.
387 0 : pub(crate) fn get(&self, ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
388 0 : let tli_res = {
389 0 : let state = self.state.lock().unwrap();
390 0 : state.get(&ttid)
391 0 : };
392 0 : match tli_res {
393 0 : Ok(tli) => {
394 0 : if tli.is_cancelled() {
395 0 : return Err(TimelineError::Cancelled(ttid));
396 0 : }
397 0 : Ok(tli)
398 : }
399 0 : _ => tli_res,
400 : }
401 0 : }
402 :
403 : /// Returns all timelines. This is used for background timeline processes.
404 0 : pub fn get_all(&self) -> Vec<Arc<Timeline>> {
405 0 : let global_lock = self.state.lock().unwrap();
406 0 : global_lock
407 0 : .timelines
408 0 : .values()
409 0 : .filter_map(|t| match t {
410 0 : GlobalMapTimeline::Timeline(t) => {
411 0 : if t.is_cancelled() {
412 0 : None
413 : } else {
414 0 : Some(t.clone())
415 : }
416 : }
417 0 : _ => None,
418 0 : })
419 0 : .collect()
420 0 : }
421 :
422 : /// Returns statistics about timeline counts
423 0 : pub fn get_timeline_counts(&self) -> SafekeeperUtilization {
424 0 : let global_lock = self.state.lock().unwrap();
425 0 : let timeline_count = global_lock
426 0 : .timelines
427 0 : .values()
428 0 : .filter(|t| match t {
429 0 : GlobalMapTimeline::CreationInProgress => false,
430 0 : GlobalMapTimeline::Timeline(t) => !t.is_cancelled(),
431 0 : })
432 0 : .count() as u64;
433 0 : SafekeeperUtilization { timeline_count }
434 0 : }
435 :
436 : /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
437 : /// and that's why it can return cancelled timelines, to retry deleting them.
438 0 : fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
439 0 : let global_lock = self.state.lock().unwrap();
440 0 : global_lock
441 0 : .timelines
442 0 : .values()
443 0 : .filter_map(|t| match t {
444 0 : GlobalMapTimeline::Timeline(t) => Some(t.clone()),
445 0 : _ => None,
446 0 : })
447 0 : .filter(|t| t.ttid.tenant_id == tenant_id)
448 0 : .collect()
449 0 : }
450 :
451 : /// Delete timeline, only locally on this node or globally (also cleaning
452 : /// remote storage WAL), depending on `action` value.
453 0 : pub(crate) async fn delete_or_exclude(
454 0 : &self,
455 0 : ttid: &TenantTimelineId,
456 0 : action: DeleteOrExclude,
457 0 : ) -> Result<TimelineDeleteResult, DeleteOrExcludeError> {
458 0 : let tli_res = {
459 0 : let state = self.state.lock().unwrap();
460 0 :
461 0 : if state.tombstones.contains_key(ttid) {
462 : // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
463 0 : info!("Timeline {ttid} was already deleted");
464 0 : return Ok(TimelineDeleteResult { dir_existed: false });
465 0 : }
466 0 :
467 0 : state.get(ttid)
468 : };
469 :
470 0 : let result = match tli_res {
471 0 : Ok(timeline) => {
472 0 : info!("deleting timeline {}, action={:?}", ttid, action);
473 :
474 : // If node is getting excluded, check the generation first.
475 : // Then, while holding the lock cancel the timeline; it will be
476 : // unusable after this point, and if node is added back first
477 : // deletion must be completed and node seeded anew.
478 : //
479 : // We would like to avoid holding the lock while waiting for the
480 : // gate to finish as this is deadlock prone, so for actual
481 : // deletion will take it second time.
482 0 : if let DeleteOrExclude::Exclude(ref mconf) = action {
483 0 : let shared_state = timeline.read_shared_state().await;
484 0 : if shared_state.sk.state().mconf.generation > mconf.generation {
485 0 : return Err(DeleteOrExcludeError::Conflict {
486 0 : requested: mconf.clone(),
487 0 : current: shared_state.sk.state().mconf.clone(),
488 0 : });
489 0 : }
490 0 : timeline.cancel().await;
491 : } else {
492 0 : timeline.cancel().await;
493 : }
494 :
495 0 : timeline.close().await;
496 :
497 0 : info!("timeline {ttid} shut down for deletion");
498 :
499 : // Take a lock and finish the deletion holding this mutex.
500 0 : let mut shared_state = timeline.write_shared_state().await;
501 :
502 0 : let only_local = !matches!(action, DeleteOrExclude::Delete);
503 0 : let dir_existed = timeline.delete(&mut shared_state, only_local).await?;
504 :
505 0 : Ok(TimelineDeleteResult { dir_existed })
506 : }
507 : Err(_) => {
508 : // Timeline is not memory, but it may still exist on disk in broken state.
509 0 : let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid);
510 0 : let dir_existed = delete_dir(&dir_path).await?;
511 :
512 0 : Ok(TimelineDeleteResult { dir_existed })
513 : }
514 : };
515 :
516 : // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones
517 : // are used to prevent still-running computes from re-creating the same timeline when they send data,
518 : // and to speed up repeated deletion calls by avoiding re-listing objects.
519 0 : self.state.lock().unwrap().delete(*ttid);
520 0 :
521 0 : result
522 0 : }
523 :
524 : /// Deactivates and deletes all timelines for the tenant. Returns map of all timelines which
525 : /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
526 : /// created simultaneously. In that case the function will return error and the caller should
527 : /// retry tenant deletion again later.
528 : ///
529 : /// If only_local, doesn't remove WAL segments in remote storage.
530 0 : pub async fn delete_all_for_tenant(
531 0 : &self,
532 0 : tenant_id: &TenantId,
533 0 : action: DeleteOrExclude,
534 0 : ) -> Result<HashMap<TenantTimelineId, TimelineDeleteResult>> {
535 0 : info!("deleting all timelines for tenant {}", tenant_id);
536 0 : let to_delete = self.get_all_for_tenant(*tenant_id);
537 0 :
538 0 : let mut err = None;
539 0 :
540 0 : let mut deleted = HashMap::new();
541 0 : for tli in &to_delete {
542 0 : match self.delete_or_exclude(&tli.ttid, action.clone()).await {
543 0 : Ok(result) => {
544 0 : deleted.insert(tli.ttid, result);
545 0 : }
546 0 : Err(e) => {
547 0 : error!("failed to delete timeline {}: {}", tli.ttid, e);
548 : // Save error to return later.
549 0 : err = Some(e);
550 : }
551 : }
552 : }
553 :
554 : // If there was an error, return it.
555 0 : if let Some(e) = err {
556 0 : return Err(anyhow::Error::from(e));
557 0 : }
558 0 :
559 0 : // There may be broken timelines on disk, so delete the whole tenant dir as well.
560 0 : // Note that we could concurrently create new timelines while we were deleting them,
561 0 : // so the directory may be not empty. In this case timelines will have bad state
562 0 : // and timeline background jobs can panic.
563 0 : let tenant_dir = get_tenant_dir(self.state.lock().unwrap().conf.as_ref(), tenant_id);
564 0 : delete_dir(&tenant_dir).await?;
565 :
566 0 : Ok(deleted)
567 0 : }
568 :
569 0 : pub fn housekeeping(&self, tombstone_ttl: &Duration) {
570 0 : let mut state = self.state.lock().unwrap();
571 0 :
572 0 : // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted
573 0 : // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they
574 0 : // may recreate a deleted timeline.
575 0 : let now = Instant::now();
576 0 : state
577 0 : .tombstones
578 0 : .retain(|_, v| now.duration_since(*v) < *tombstone_ttl);
579 0 : }
580 : }
581 :
/// Outcome of deleting a single timeline.
#[derive(Clone, Copy, Serialize)]
pub struct TimelineDeleteResult {
    /// Value reported by the directory deletion; `false` when the timeline
    /// already had a tombstone and nothing was done.
    pub dir_existed: bool,
}
586 :
/// Action for delete_or_exclude.
#[derive(Clone, Debug)]
pub enum DeleteOrExclude {
    /// Delete timeline globally. This is the only variant for which the
    /// deletion is not restricted to local data.
    Delete,
    /// Legacy mode until we fully migrate to generations: like exclude deletes
    /// timeline only locally, but ignores generation number.
    DeleteLocal,
    /// This node is getting excluded, delete timeline locally. The carried
    /// configuration's generation is compared against the timeline's
    /// current one before deleting.
    Exclude(membership::Configuration),
}
598 :
599 : /// Create temp directory for a new timeline. It needs to be located on the same
600 : /// filesystem as the rest of the timelines. It will be automatically deleted when
601 : /// Utf8TempDir goes out of scope.
602 0 : pub async fn create_temp_timeline_dir(
603 0 : conf: &SafeKeeperConf,
604 0 : ttid: TenantTimelineId,
605 0 : ) -> Result<(Utf8TempDir, Utf8PathBuf)> {
606 0 : let temp_base = conf.workdir.join("tmp");
607 0 :
608 0 : tokio::fs::create_dir_all(&temp_base).await?;
609 :
610 0 : let tli_dir = camino_tempfile::Builder::new()
611 0 : .suffix("_temptli")
612 0 : .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
613 0 : .tempdir_in(temp_base)?;
614 :
615 0 : let tli_dir_path = tli_dir.path().to_path_buf();
616 0 :
617 0 : Ok((tli_dir, tli_dir_path))
618 0 : }
619 :
620 : /// Do basic validation of a temp timeline, before moving it to the global map.
621 0 : pub async fn validate_temp_timeline(
622 0 : conf: &SafeKeeperConf,
623 0 : ttid: TenantTimelineId,
624 0 : path: &Utf8PathBuf,
625 0 : ) -> Result<(Lsn, Lsn)> {
626 0 : let control_path = path.join("safekeeper.control");
627 :
628 0 : let control_store = control_file::FileStorage::load_control_file(control_path)?;
629 0 : if control_store.server.wal_seg_size == 0 {
630 0 : bail!("wal_seg_size is not set");
631 0 : }
632 :
633 0 : let wal_store = wal_storage::PhysicalStorage::new(&ttid, path, &control_store, conf.no_sync)?;
634 :
635 0 : let commit_lsn = control_store.commit_lsn;
636 0 : let flush_lsn = wal_store.flush_lsn();
637 0 :
638 0 : Ok((commit_lsn, flush_lsn))
639 0 : }
|