Line data Source code
1 : //! A helper tool to manage pageserver binary files.
2 : //! Accepts a file as an argument, attempts to parse it in every way possible
3 : //! and prints its interpreted contents.
4 : //!
5 : //! A separate `metadata` subcommand allows printing and updating the pageserver's metadata file.
6 :
7 : mod download_remote_object;
8 : mod draw_timeline_dir;
9 : mod index_part;
10 : mod key;
11 : mod layer_map_analyzer;
12 : mod layers;
13 : mod page_trace;
14 :
15 : use std::str::FromStr;
16 : use std::time::{Duration, SystemTime};
17 :
18 : use camino::{Utf8Path, Utf8PathBuf};
19 : use clap::{Parser, Subcommand};
20 : use download_remote_object::DownloadRemoteObjectCmd;
21 : use index_part::IndexPartCmd;
22 : use layers::LayerCmd;
23 : use page_trace::PageTraceCmd;
24 : use pageserver::context::{DownloadBehavior, RequestContext};
25 : use pageserver::page_cache;
26 : use pageserver::task_mgr::TaskKind;
27 : use pageserver::tenant::dump_layerfile_from_path;
28 : use pageserver::tenant::metadata::TimelineMetadata;
29 : use pageserver::virtual_file::api::IoMode;
30 : use pageserver::virtual_file::{self};
31 : use pageserver_api::shard::TenantShardId;
32 : use postgres_ffi::ControlFileData;
33 : use remote_storage::{RemotePath, RemoteStorageConfig};
34 : use tokio_util::sync::CancellationToken;
35 : use utils::id::TimelineId;
36 : use utils::logging::{self, LogFormat, TracingErrorLayerEnablement};
37 : use utils::lsn::Lsn;
38 : use utils::project_git_version;
39 :
40 : project_git_version!(GIT_VERSION);
41 :
// Top-level command-line options; clap derives the argument parser from this
// struct and `main` obtains an instance via `CliOpts::parse()`.
//
// NOTE(review): explanatory notes on the clap types in this file use plain `//`
// comments on purpose, since clap's derive turns `///` doc comments into help
// text — confirm against the clap version in use before converting them.
#[derive(Parser)]
#[command(
    version = GIT_VERSION,
    about = "Neon Pageserver binutils",
    long_about = "Reads pageserver (and related) binary files management utility"
)]
#[command(propagate_version = true)]
struct CliOpts {
    // The subcommand to run; `main` matches on it to dispatch.
    #[command(subcommand)]
    command: Commands,
}
53 :
// All subcommands understood by the tool. Each variant is dispatched in `main`.
#[derive(Subcommand)]
enum Commands {
    // Handled by `handle_metadata`.
    Metadata(MetadataCmd),
    // Handled by `index_part::main`; has nested subcommands of its own.
    #[command(subcommand)]
    IndexPart(IndexPartCmd),
    // Tries `read_pg_control_file` first and falls back to `print_layerfile`.
    PrintLayerFile(PrintLayerFileCmd),
    // Handled inline in `main` via `time_travel_recover` on the remote storage.
    TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
    // Handled by `draw_timeline_dir::main`; takes no arguments.
    DrawTimeline {},
    // Handled by `layer_map_analyzer::main`.
    AnalyzeLayerMap(AnalyzeLayerMapCmd),
    // Handled by `layers::main`; has nested subcommands of its own.
    #[command(subcommand)]
    Layer(LayerCmd),
    /// Debug print a hex key found from logs
    Key(key::DescribeKeyCommand),
    // Handled by `page_trace::main`.
    PageTrace(PageTraceCmd),
    // Handled by `download_remote_object::main`.
    DownloadRemoteObject(DownloadRemoteObjectCmd),
}
70 :
// Arguments of the `metadata` subcommand, consumed by `handle_metadata`.
// The file is always printed; it is rewritten only when at least one of the
// optional LSNs is supplied.
/// Read and update pageserver metadata file
#[derive(Parser)]
struct MetadataCmd {
    /// Input metadata file path
    metadata_path: Utf8PathBuf,
    /// Replace disk consistent Lsn
    disk_consistent_lsn: Option<Lsn>,
    /// Replace previous record Lsn
    prev_record_lsn: Option<Lsn>,
    /// Replace latest gc cuttoff
    latest_gc_cuttoff: Option<Lsn>,
}
83 :
// Arguments of the `print-layer-file` subcommand. `main` first tries to decode
// the file at `path` as a Postgres control file and, if that fails, dumps it as
// a layer file.
#[derive(Parser)]
struct PrintLayerFileCmd {
    /// Pageserver data path
    path: Utf8PathBuf,
}
89 :
// Arguments of the time-travel recovery subcommand, handled inline in `main`.
/// Roll back the time for the specified prefix using S3 history.
///
/// The command is fairly low level and powerful. Validation is only very light,
/// so it is more powerful, and thus potentially more dangerous.
#[derive(Parser)]
struct TimeTravelRemotePrefixCmd {
    // Parsed with `RemoteStorageConfig::from_toml_str`.
    /// A configuration string for the remote_storage configuration.
    ///
    /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
    config_toml_str: String,
    // Checked by `validate_prefix` before any remote operation is attempted.
    /// remote prefix to time travel recover. For safety reasons, we require it to contain
    /// a timeline or tenant ID in the prefix.
    prefix: String,
    // Parsed with `humantime::parse_rfc3339`, then truncated to whole seconds.
    /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
    travel_to: String,
    // When omitted, `main` samples the current time with a short sleep before
    // and after the sample.
    /// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
    /// You can use a few seconds before invoking the command. Same format as `travel_to`.
    done_if_after: Option<String>,
}
109 :
// Arguments of the `analyze-layer-map` subcommand; passed as-is to
// `layer_map_analyzer::main`.
#[derive(Parser)]
struct AnalyzeLayerMapCmd {
    /// Pageserver data path
    path: Utf8PathBuf,
    /// Max holes
    max_holes: Option<usize>,
}
117 :
118 : #[tokio::main]
119 0 : async fn main() -> anyhow::Result<()> {
120 0 : logging::init(
121 0 : LogFormat::Plain,
122 0 : TracingErrorLayerEnablement::EnableWithRustLogFilter,
123 0 : logging::Output::Stdout,
124 0 : )?;
125 :
126 0 : logging::replace_panic_hook_with_tracing_panic_hook().forget();
127 :
128 0 : let cli = CliOpts::parse();
129 :
130 0 : match cli.command {
131 0 : Commands::Layer(cmd) => {
132 0 : layers::main(&cmd).await?;
133 : }
134 0 : Commands::Metadata(cmd) => {
135 0 : handle_metadata(&cmd)?;
136 : }
137 0 : Commands::IndexPart(cmd) => {
138 0 : index_part::main(&cmd).await?;
139 : }
140 : Commands::DrawTimeline {} => {
141 0 : draw_timeline_dir::main()?;
142 : }
143 0 : Commands::AnalyzeLayerMap(cmd) => {
144 0 : layer_map_analyzer::main(&cmd).await?;
145 : }
146 0 : Commands::PrintLayerFile(cmd) => {
147 0 : if let Err(e) = read_pg_control_file(&cmd.path) {
148 0 : println!(
149 0 : "Failed to read input file as a pg control one: {e:#}\n\
150 0 : Attempting to read it as layer file"
151 : );
152 0 : print_layerfile(&cmd.path).await?;
153 0 : }
154 : }
155 0 : Commands::TimeTravelRemotePrefix(cmd) => {
156 0 : let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
157 0 : .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
158 :
159 0 : let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
160 0 : humantime::parse_rfc3339(done_if_after).map_err(|_e| {
161 0 : anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
162 0 : })?
163 : } else {
164 : const SAFETY_MARGIN: Duration = Duration::from_secs(3);
165 0 : tokio::time::sleep(SAFETY_MARGIN).await;
166 : // Convert to string representation and back to get rid of sub-second values
167 0 : let done_if_after = SystemTime::now();
168 0 : tokio::time::sleep(SAFETY_MARGIN).await;
169 0 : done_if_after
170 : };
171 :
172 0 : let timestamp = strip_subsecond(timestamp);
173 0 : let done_if_after = strip_subsecond(done_if_after);
174 :
175 0 : let Some(prefix) = validate_prefix(&cmd.prefix) else {
176 0 : println!("specified prefix '{}' failed validation", cmd.prefix);
177 0 : return Ok(());
178 : };
179 0 : let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?;
180 0 : let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
181 0 : let cancel = CancellationToken::new();
182 : // Complexity limit: as we are running this command locally, we should have a lot of memory available, and we do not
183 : // need to limit the number of versions we are going to delete.
184 0 : storage
185 0 : .unwrap()
186 0 : .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel, None)
187 0 : .await?;
188 : }
189 0 : Commands::Key(dkc) => dkc.execute(),
190 0 : Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
191 0 : Commands::DownloadRemoteObject(cmd) => {
192 0 : download_remote_object::main(&cmd).await?;
193 : }
194 : };
195 0 : Ok(())
196 0 : }
197 :
198 0 : fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
199 0 : let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
200 0 : println!("{control_file:?}");
201 0 : let control_file_initdb = Lsn(control_file.checkPoint);
202 0 : println!(
203 0 : "pg_initdb_lsn: {}, aligned: {}",
204 : control_file_initdb,
205 0 : control_file_initdb.align()
206 : );
207 0 : Ok(())
208 0 : }
209 :
210 0 : async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
211 : // Basic initialization of things that don't change after startup
212 0 : virtual_file::init(
213 : 10,
214 0 : virtual_file::api::IoEngineKind::StdFs,
215 0 : IoMode::preferred(),
216 0 : virtual_file::SyncMode::Sync,
217 : );
218 0 : page_cache::init(100);
219 0 : let ctx =
220 0 : RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools();
221 0 : dump_layerfile_from_path(path, true, &ctx).await
222 0 : }
223 :
224 0 : fn handle_metadata(
225 0 : MetadataCmd {
226 0 : metadata_path: path,
227 0 : disk_consistent_lsn,
228 0 : prev_record_lsn,
229 0 : latest_gc_cuttoff,
230 0 : }: &MetadataCmd,
231 0 : ) -> Result<(), anyhow::Error> {
232 0 : let metadata_bytes = std::fs::read(path)?;
233 0 : let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
234 0 : println!("Current metadata:\n{meta:?}");
235 0 : let mut update_meta = false;
236 : // TODO: simplify this part
237 0 : if let Some(disk_consistent_lsn) = disk_consistent_lsn {
238 0 : meta = TimelineMetadata::new(
239 0 : *disk_consistent_lsn,
240 0 : meta.prev_record_lsn(),
241 0 : meta.ancestor_timeline(),
242 0 : meta.ancestor_lsn(),
243 0 : meta.latest_gc_cutoff_lsn(),
244 0 : meta.initdb_lsn(),
245 0 : meta.pg_version(),
246 0 : );
247 0 : update_meta = true;
248 0 : }
249 0 : if let Some(prev_record_lsn) = prev_record_lsn {
250 0 : meta = TimelineMetadata::new(
251 0 : meta.disk_consistent_lsn(),
252 0 : Some(*prev_record_lsn),
253 0 : meta.ancestor_timeline(),
254 0 : meta.ancestor_lsn(),
255 0 : meta.latest_gc_cutoff_lsn(),
256 0 : meta.initdb_lsn(),
257 0 : meta.pg_version(),
258 0 : );
259 0 : update_meta = true;
260 0 : }
261 0 : if let Some(latest_gc_cuttoff) = latest_gc_cuttoff {
262 0 : meta = TimelineMetadata::new(
263 0 : meta.disk_consistent_lsn(),
264 0 : meta.prev_record_lsn(),
265 0 : meta.ancestor_timeline(),
266 0 : meta.ancestor_lsn(),
267 0 : *latest_gc_cuttoff,
268 0 : meta.initdb_lsn(),
269 0 : meta.pg_version(),
270 0 : );
271 0 : update_meta = true;
272 0 : }
273 :
274 0 : if update_meta {
275 0 : let metadata_bytes = meta.to_bytes()?;
276 0 : std::fs::write(path, metadata_bytes)?;
277 0 : }
278 :
279 0 : Ok(())
280 0 : }
281 :
282 : /// Ensures that the given S3 prefix is sufficiently constrained.
283 : /// The command is very risky already and we don't want to expose something
284 : /// that allows usually unintentional and quite catastrophic time travel of
285 : /// an entire bucket, which would be a major catastrophy and away
286 : /// by only one character change (similar to "rm -r /home /username/foobar").
287 15 : fn validate_prefix(prefix: &str) -> Option<RemotePath> {
288 15 : if prefix.is_empty() {
289 : // Empty prefix means we want to specify the *whole* bucket
290 1 : return None;
291 14 : }
292 14 : let components = prefix.split('/').collect::<Vec<_>>();
293 14 : let (last, components) = {
294 14 : let last = components.last()?;
295 14 : if last.is_empty() {
296 : (
297 7 : components.iter().nth_back(1)?,
298 7 : &components[..(components.len() - 1)],
299 : )
300 : } else {
301 7 : (last, &components[..])
302 : }
303 : };
304 : 'valid: {
305 14 : if let Ok(_timeline_id) = TimelineId::from_str(last) {
306 : // Ends in either a tenant or timeline ID
307 5 : break 'valid;
308 9 : }
309 9 : if *last == "timelines" {
310 3 : if let Some(before_last) = components.iter().nth_back(1) {
311 3 : if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
312 : // Has a valid tenant id
313 3 : break 'valid;
314 0 : }
315 0 : }
316 6 : }
317 :
318 6 : return None;
319 : }
320 8 : RemotePath::from_string(prefix).ok()
321 15 : }
322 :
323 0 : fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
324 0 : let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
325 0 : humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
326 0 : }
327 :
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises `validate_prefix` with prefixes that must be rejected and
    /// prefixes that must be accepted unchanged.
    #[test]
    fn test_validate_prefix() {
        let rejected = [
            "",
            "/",
            // Path is not relative but absolute
            "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/",
            // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
            "wal/3aa8fcc61f6d357410b7d",
            "wal",
            "/wal/",
            // Partial tenant ID
            "pageserver/v1/tenants/3aa8fcc61f6d357410b",
            "pageserver/v1/tenants/",
        ];
        for prefix in rejected {
            assert_eq!(
                validate_prefix(prefix),
                None,
                "prefix {prefix:?} should have been rejected"
            );
        }

        let accepted = [
            "wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/",
            "wal/3aa8fcc61f6d357410b7de754b1d9001/",
            "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001",
            "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines",
            "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines",
            "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/",
            "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683",
        ];
        for prefix in accepted {
            let expected = RemotePath::from_string(prefix).unwrap();
            assert_eq!(
                validate_prefix(prefix),
                Some(expected),
                "prefix {prefix:?} should have been accepted"
            );
        }
    }
}
|