Skip to content

Commit e61b20a

Browse files
committed
Create db-dump.zip file too
Zip files compress each file individually, which allows users to extract only the data they need instead of needlessly extracting the full tarball just to read the one small table they are interested in.
1 parent f6e877d commit e61b20a

File tree

4 files changed

+204
-36
lines changed

4 files changed

+204
-36
lines changed

Cargo.lock

Lines changed: 75 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] }
120120
typomania = { version = "=0.1.2", default-features = false }
121121
url = "=2.5.0"
122122
unicode-xid = "=0.2.4"
123+
zip = { version = "=2.1.1", default-features = false, features = ["deflate"] }
123124

124125
[dev-dependencies]
125126
bytes = "=1.6.0"

src/tests/dump_db.rs

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use insta::{assert_debug_snapshot, assert_snapshot};
99
use once_cell::sync::Lazy;
1010
use regex::Regex;
1111
use secrecy::ExposeSecret;
12-
use std::io::Read;
12+
use std::io::{Cursor, Read};
1313
use tar::Archive;
1414

1515
static PATH_DATE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{4}-\d{2}-\d{2}-\d{6}").unwrap());
@@ -28,8 +28,9 @@ async fn test_dump_db_job() {
2828
app.run_pending_background_jobs().await;
2929

3030
let stored_files = app.stored_files().await;
31-
assert_eq!(stored_files.len(), 1);
31+
assert_eq!(stored_files.len(), 2);
3232
assert_eq!(stored_files[0], "db-dump.tar.gz");
33+
assert_eq!(stored_files[1], "db-dump.zip");
3334

3435
let path = object_store::path::Path::parse("db-dump.tar.gz").unwrap();
3536
let result = app.as_inner().storage.as_inner().get(&path).await.unwrap();
@@ -65,6 +66,38 @@ async fn test_dump_db_job() {
6566
"YYYY-MM-DD-HHMMSS/data/version_downloads.csv",
6667
]
6768
"###);
69+
70+
let path = object_store::path::Path::parse("db-dump.zip").unwrap();
71+
let result = app.as_inner().storage.as_inner().get(&path).await.unwrap();
72+
let bytes = result.bytes().await.unwrap();
73+
74+
let archive = zip::ZipArchive::new(Cursor::new(bytes)).unwrap();
75+
let zip_paths = archive.file_names().collect::<Vec<_>>();
76+
assert_debug_snapshot!(zip_paths, @r###"
77+
[
78+
"README.md",
79+
"export.sql",
80+
"import.sql",
81+
"metadata.json",
82+
"schema.sql",
83+
"data/",
84+
"data/categories.csv",
85+
"data/crate_downloads.csv",
86+
"data/crates.csv",
87+
"data/keywords.csv",
88+
"data/metadata.csv",
89+
"data/reserved_crate_names.csv",
90+
"data/teams.csv",
91+
"data/users.csv",
92+
"data/crates_categories.csv",
93+
"data/crates_keywords.csv",
94+
"data/crate_owners.csv",
95+
"data/versions.csv",
96+
"data/default_versions.csv",
97+
"data/dependencies.csv",
98+
"data/version_downloads.csv",
99+
]
100+
"###);
68101
}
69102

70103
fn tar_paths<R: Read>(archive: &mut Archive<R>) -> Vec<String> {

src/worker/jobs/dump_db.rs

Lines changed: 93 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use crates_io_worker::BackgroundJob;
66
use std::fs::{self, File};
77
use std::path::{Path, PathBuf};
88
use std::sync::Arc;
9+
use zip::write::SimpleFileOptions;
910

1011
#[derive(Clone, Serialize, Deserialize)]
1112
pub struct DumpDb {
@@ -28,38 +29,56 @@ impl BackgroundJob for DumpDb {
2829
/// Create CSV dumps of the public information in the database, wrap them in a
2930
/// tarball and a zip file, and upload both to S3.
3031
async fn run(&self, env: Self::Context) -> anyhow::Result<()> {
31-
let target_name = "db-dump.tar.gz";
32+
const TAR_PATH: &str = "db-dump.tar.gz";
33+
const ZIP_PATH: &str = "db-dump.zip";
34+
3235
let database_url = self.database_url.clone();
3336

34-
let tarball = spawn_blocking(move || {
37+
let (tarball, zip) = spawn_blocking(move || {
3538
let directory = DumpDirectory::create()?;
3639

37-
info!("Begin exporting database");
40+
info!("Exporting database");
3841
directory.populate(&database_url)?;
3942

4043
let export_dir = directory.path();
41-
info!(path = ?export_dir, "Creating tarball");
42-
let prefix = PathBuf::from(directory.timestamp.format("%Y-%m-%d-%H%M%S").to_string());
43-
create_tarball(export_dir, &prefix)
44+
info!(path = ?export_dir, "Creating tarball…");
45+
let tarball_prefix =
46+
PathBuf::from(directory.timestamp.format("%Y-%m-%d-%H%M%S").to_string());
47+
create_archives(export_dir, &tarball_prefix)
4448
})
4549
.await?;
4650

47-
info!("Uploading tarball");
48-
env.storage
49-
.upload_db_dump(target_name, tarball.path())
50-
.await?;
51+
info!("Uploading tarball…");
52+
env.storage.upload_db_dump(TAR_PATH, tarball.path()).await?;
5153
info!("Database dump tarball uploaded");
5254

53-
info!("Invalidating CDN caches");
55+
info!("Invalidating CDN caches…");
56+
if let Some(cloudfront) = env.cloudfront() {
57+
if let Err(error) = cloudfront.invalidate(TAR_PATH).await {
58+
warn!("Failed to invalidate CloudFront cache: {}", error);
59+
}
60+
}
61+
62+
if let Some(fastly) = env.fastly() {
63+
if let Err(error) = fastly.invalidate(TAR_PATH).await {
64+
warn!("Failed to invalidate Fastly cache: {}", error);
65+
}
66+
}
67+
68+
info!("Uploading zip file…");
69+
env.storage.upload_db_dump(ZIP_PATH, zip.path()).await?;
70+
info!("Database dump zip file uploaded");
71+
72+
info!("Invalidating CDN caches…");
5473
if let Some(cloudfront) = env.cloudfront() {
55-
if let Err(error) = cloudfront.invalidate(target_name).await {
56-
warn!("failed to invalidate CloudFront cache: {}", error);
74+
if let Err(error) = cloudfront.invalidate(ZIP_PATH).await {
75+
warn!("Failed to invalidate CloudFront cache: {}", error);
5776
}
5877
}
5978

6079
if let Some(fastly) = env.fastly() {
61-
if let Err(error) = fastly.invalidate(target_name).await {
62-
warn!("failed to invalidate Fastly cache: {}", error);
80+
if let Err(error) = fastly.invalidate(ZIP_PATH).await {
81+
warn!("Failed to invalidate Fastly cache: {}", error);
6382
}
6483
}
6584

@@ -202,15 +221,22 @@ pub fn run_psql(script: &Path, database_url: &str) -> anyhow::Result<()> {
202221
Ok(())
203222
}
204223

205-
fn create_tarball(export_dir: &Path, prefix: &Path) -> anyhow::Result<tempfile::NamedTempFile> {
206-
debug!("Creating tarball file");
207-
let tempfile = tempfile::NamedTempFile::new()?;
208-
let encoder = flate2::write::GzEncoder::new(tempfile.as_file(), flate2::Compression::default());
224+
fn create_archives(
225+
export_dir: &Path,
226+
tarball_prefix: &Path,
227+
) -> anyhow::Result<(tempfile::NamedTempFile, tempfile::NamedTempFile)> {
228+
debug!("Creating tarball file…");
229+
let tar_tempfile = tempfile::NamedTempFile::new()?;
230+
let encoder =
231+
flate2::write::GzEncoder::new(tar_tempfile.as_file(), flate2::Compression::default());
232+
let mut tar = tar::Builder::new(encoder);
209233

210-
let mut archive = tar::Builder::new(encoder);
234+
debug!("Creating zip file…");
235+
let zip_tempfile = tempfile::NamedTempFile::new()?;
236+
let mut zip = zip::ZipWriter::new(zip_tempfile.as_file());
211237

212-
debug!(path = ?prefix, "Appending directory to tarball");
213-
archive.append_dir(prefix, export_dir)?;
238+
debug!("Appending `{tarball_prefix:?}` directory to tarball");
239+
tar.append_dir(tarball_prefix, export_dir)?;
214240

215241
// Append readme, metadata, schemas.
216242
let mut paths = Vec::new();
@@ -224,9 +250,13 @@ fn create_tarball(export_dir: &Path, prefix: &Path) -> anyhow::Result<tempfile::
224250
// Sort paths to make the tarball deterministic.
225251
paths.sort();
226252
for (path, file_name) in paths {
227-
let name_in_tar = prefix.join(file_name);
228-
debug!(name = ?name_in_tar, "Appending file to tarball");
229-
archive.append_path_with_name(path, name_in_tar)?;
253+
let name = tarball_prefix.join(&file_name);
254+
debug!("Appending `{name:?}` file to tarball…");
255+
tar.append_path_with_name(&path, name)?;
256+
257+
debug!("Appending `{file_name:?}` file to zip file…");
258+
zip.start_file_from_path(&file_name, SimpleFileOptions::default())?;
259+
std::io::copy(&mut File::open(path)?, &mut zip)?;
230260
}
231261

232262
// Append topologically sorted tables to make it possible to pipeline
@@ -236,21 +266,34 @@ fn create_tarball(export_dir: &Path, prefix: &Path) -> anyhow::Result<tempfile::
236266
let visibility_config = VisibilityConfig::get();
237267
let sorted_tables = visibility_config.topological_sort();
238268

239-
let path = prefix.join("data");
240-
debug!(?path, "Appending directory to tarball");
241-
archive.append_dir(path, export_dir.join("data"))?;
269+
let path = tarball_prefix.join("data");
270+
debug!("Appending `data` directory to tarball…");
271+
tar.append_dir(path, export_dir.join("data"))?;
272+
273+
debug!("Appending `data` directory to zip file…");
274+
zip.add_directory("data", SimpleFileOptions::default())?;
275+
242276
for table in sorted_tables {
243277
let csv_path = export_dir.join("data").join(table).with_extension("csv");
244278
if csv_path.exists() {
245-
let name_in_tar = prefix.join("data").join(table).with_extension("csv");
246-
debug!(name = ?name_in_tar, "Appending file to tarball");
247-
archive.append_path_with_name(csv_path, name_in_tar)?;
279+
let name = tarball_prefix
280+
.join("data")
281+
.join(table)
282+
.with_extension("csv");
283+
debug!("Appending `{name:?}` file to tarball…");
284+
tar.append_path_with_name(&csv_path, name)?;
285+
286+
let name = PathBuf::from("data").join(table).with_extension("csv");
287+
debug!("Appending `{name:?}` file to zip file…");
288+
zip.start_file_from_path(&name, SimpleFileOptions::default())?;
289+
std::io::copy(&mut File::open(csv_path)?, &mut zip)?;
248290
}
249291
}
250292

251-
drop(archive);
293+
drop(tar);
294+
zip.finish()?;
252295

253-
Ok(tempfile)
296+
Ok((tar_tempfile, zip_tempfile))
254297
}
255298

256299
mod configuration;
@@ -261,6 +304,7 @@ mod tests {
261304
use super::*;
262305
use flate2::read::GzDecoder;
263306
use insta::assert_debug_snapshot;
307+
use std::io::BufReader;
264308
use tar::Archive;
265309

266310
#[test]
@@ -277,7 +321,7 @@ mod tests {
277321
fs::write(p.join("data").join("crate_owners.csv"), "").unwrap();
278322
fs::write(p.join("data").join("users.csv"), "").unwrap();
279323

280-
let tarball = create_tarball(p, &PathBuf::from("0000-00-00")).unwrap();
324+
let (tarball, zip) = create_archives(p, &PathBuf::from("0000-00-00")).unwrap();
281325
let gz = GzDecoder::new(File::open(tarball.path()).unwrap());
282326
let mut tar = Archive::new(gz);
283327

@@ -296,5 +340,20 @@ mod tests {
296340
"0000-00-00/data/crate_owners.csv",
297341
]
298342
"###);
343+
344+
let file = File::open(zip.path()).unwrap();
345+
let reader = BufReader::new(file);
346+
347+
let archive = zip::ZipArchive::new(reader).unwrap();
348+
let zip_paths = archive.file_names().collect::<Vec<_>>();
349+
assert_debug_snapshot!(zip_paths, @r###"
350+
[
351+
"README.md",
352+
"data/",
353+
"data/crates.csv",
354+
"data/users.csv",
355+
"data/crate_owners.csv",
356+
]
357+
"###);
299358
}
300359
}

0 commit comments

Comments
 (0)