Skip to content

Commit 5861afb

Browse files
committed
Merge branch 'gix-corpus'
2 parents aa16c8c + 8817c24 commit 5861afb

File tree

20 files changed

+872
-629
lines changed

20 files changed

+872
-629
lines changed

.cargo/config.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
rustflags = [
33
# Rustc lints
44
# "-W", "warning_name"
5-
"-A", "clippy::let_unit_value", # in 'small' builds this triggers as the `span!` macro yields `let x = ()`. No way to prevent it in macro apparently.
65

76
# Clippy lints
87
"-W", "clippy::cloned_instead_of_copied",

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

gitoxide-core/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ estimate-hours = ["dep:itertools", "dep:fs-err", "dep:crossbeam-channel", "dep:s
2323
query = ["dep:rusqlite"]
2424
## Run algorithms on a corpus of repositories and store their results for later comparison and intelligence gathering.
2525
## *Note that* `organize` we need for finding git repositories fast.
26-
corpus = [ "dep:rusqlite", "dep:sysinfo", "organize", "dep:crossbeam-channel", "dep:serde_json", "dep:tracing-forest", "dep:tracing-subscriber", "dep:tracing" ]
26+
corpus = [ "dep:rusqlite", "dep:sysinfo", "organize", "dep:crossbeam-channel", "dep:serde_json", "dep:tracing-forest", "dep:tracing-subscriber", "dep:tracing", "dep:parking_lot" ]
2727

2828
#! ### Mutually Exclusive Networking
2929
#! If both are set, _blocking-client_ will take precedence, allowing `--all-features` to be used.
@@ -72,6 +72,7 @@ smallvec = { version = "1.10.0", optional = true }
7272
rusqlite = { version = "0.29.0", optional = true, features = ["bundled"] }
7373

7474
# for 'corpus'
75+
parking_lot = { version = "0.12.1", optional = true }
7576
sysinfo = { version = "0.29.2", optional = true, default-features = false }
7677
serde_json = { version = "1.0.65", optional = true }
7778
tracing-forest = { version = "0.1.5", features = ["serde"], optional = true }

gitoxide-core/src/corpus/db.rs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -173,12 +173,25 @@ impl Engine {
173173
.con
174174
.query_row(
175175
"INSERT INTO gitoxide_version (version) VALUES (?1) ON CONFLICT DO UPDATE SET version = version RETURNING id",
176-
[&self.gitoxide_version],
176+
[&self.state.gitoxide_version],
177177
|r| r.get(0),
178178
)?)
179179
}
180-
pub(crate) fn tasks_or_insert(&self) -> anyhow::Result<Vec<(Id, &'static super::Task)>> {
181-
let mut out: Vec<_> = super::run::ALL.iter().map(|task| (0, task)).collect();
180+
pub(crate) fn tasks_or_insert(
181+
&self,
182+
allowed_short_names: &[String],
183+
) -> anyhow::Result<Vec<(Id, &'static super::Task)>> {
184+
let mut out: Vec<_> = super::run::ALL
185+
.iter()
186+
.filter(|task| {
187+
if allowed_short_names.is_empty() {
188+
true
189+
} else {
190+
allowed_short_names.iter().any(|allowed| task.short_name == allowed)
191+
}
192+
})
193+
.map(|task| (0, task))
194+
.collect();
182195
for (id, task) in &mut out {
183196
*id = self.con.query_row(
184197
"INSERT INTO task (short_name, description) VALUES (?1, ?2) ON CONFLICT DO UPDATE SET short_name = short_name, description = ?2 RETURNING id",

gitoxide-core/src/corpus/engine.rs

Lines changed: 97 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,47 @@ use std::path::{Path, PathBuf};
1010
use std::sync::atomic::Ordering;
1111
use std::time::{Duration, Instant};
1212

13+
pub type ProgressItem = gix::progress::DoOrDiscard<gix::progress::prodash::tree::Item>;
14+
15+
pub struct State {
16+
pub progress: ProgressItem,
17+
pub gitoxide_version: String,
18+
pub trace_to_progress: bool,
19+
pub reverse_trace_lines: bool,
20+
}
21+
1322
impl Engine {
1423
/// Open the corpus DB or create it.
15-
pub fn open_or_create(db: PathBuf, gitoxide_version: String, progress: corpus::Progress) -> anyhow::Result<Engine> {
24+
pub fn open_or_create(db: PathBuf, state: State) -> anyhow::Result<Engine> {
1625
let con = crate::corpus::db::create(db).context("Could not open or create database")?;
17-
Ok(Engine {
18-
progress,
19-
con,
20-
gitoxide_version,
21-
})
26+
Ok(Engine { con, state })
2227
}
2328

2429
/// Run on the existing set of repositories we have already seen or obtain them from `path` if there is none yet.
25-
pub fn run(&mut self, corpus_path: PathBuf, threads: Option<usize>) -> anyhow::Result<()> {
30+
pub fn run(
31+
&mut self,
32+
corpus_path: PathBuf,
33+
threads: Option<usize>,
34+
dry_run: bool,
35+
repo_sql_suffix: Option<String>,
36+
allowed_task_names: Vec<String>,
37+
) -> anyhow::Result<()> {
38+
let tasks = self.tasks_or_insert(&allowed_task_names)?;
39+
if tasks.is_empty() {
40+
bail!("Cannot run without any task to perform on the repositories");
41+
}
2642
let (corpus_path, corpus_id) = self.prepare_corpus_path(corpus_path)?;
2743
let gitoxide_id = self.gitoxide_version_id_or_insert()?;
2844
let runner_id = self.runner_id_or_insert()?;
29-
let repos = self.find_repos_or_insert(&corpus_path, corpus_id)?;
30-
let tasks = self.tasks_or_insert()?;
31-
self.perform_run(&corpus_path, gitoxide_id, runner_id, &tasks, repos, threads)
45+
let repos = self.find_repos_or_insert(&corpus_path, corpus_id, repo_sql_suffix)?;
46+
self.perform_run(&corpus_path, gitoxide_id, runner_id, &tasks, repos, threads, dry_run)
3247
}
3348

3449
pub fn refresh(&mut self, corpus_path: PathBuf) -> anyhow::Result<()> {
3550
let (corpus_path, corpus_id) = self.prepare_corpus_path(corpus_path)?;
3651
let repos = self.refresh_repos(&corpus_path, corpus_id)?;
37-
self.progress.set_name("refresh repos");
38-
self.progress.info(format!(
52+
self.state.progress.set_name("refresh repos");
53+
self.state.progress.info(format!(
3954
"Added or updated {} repositories under {corpus_path:?}",
4055
repos.len()
4156
));
@@ -44,6 +59,7 @@ impl Engine {
4459
}
4560

4661
impl Engine {
62+
#[allow(clippy::too_many_arguments)]
4763
fn perform_run(
4864
&mut self,
4965
corpus_path: &Path,
@@ -52,21 +68,42 @@ impl Engine {
5268
tasks: &[(db::Id, &'static Task)],
5369
mut repos: Vec<db::Repo>,
5470
threads: Option<usize>,
71+
dry_run: bool,
5572
) -> anyhow::Result<()> {
5673
let start = Instant::now();
57-
let task_progress = &mut self.progress;
58-
task_progress.set_name("run");
59-
task_progress.init(Some(tasks.len()), gix::progress::count("tasks"));
74+
let repo_progress = &mut self.state.progress;
6075
let threads = gix::parallel::num_threads(threads);
6176
let db_path = self.con.path().expect("opened from path on disk").to_owned();
62-
for (task_id, task) in tasks {
77+
'tasks_loop: for (task_id, task) in tasks {
6378
let task_start = Instant::now();
64-
let mut repo_progress = task_progress.add_child(format!("run '{}'", task.short_name));
79+
let task_info = format!("run '{}'", task.short_name);
80+
repo_progress.set_name(task_info.clone());
6581
repo_progress.init(Some(repos.len()), gix::progress::count("repos"));
66-
67-
if task.execute_exclusive || threads == 1 {
82+
if task.execute_exclusive || threads == 1 || dry_run {
83+
if dry_run {
84+
repo_progress.set_name("WOULD run");
85+
for repo in &repos {
86+
repo_progress.info(format!(
87+
"{}",
88+
repo.path
89+
.strip_prefix(corpus_path)
90+
.expect("corpus contains repo")
91+
.display()
92+
));
93+
repo_progress.inc();
94+
}
95+
repo_progress.info(format!("with {} tasks", tasks.len()));
96+
for (_, task) in tasks {
97+
repo_progress.info(format!("task '{}' ({})", task.description, task.short_name))
98+
}
99+
break 'tasks_loop;
100+
}
68101
let mut run_progress = repo_progress.add_child("set later");
69-
let (_guard, current_id) = corpus::trace::override_thread_subscriber(db_path.as_str())?;
102+
let (_guard, current_id) = corpus::trace::override_thread_subscriber(
103+
db_path.as_str(),
104+
self.state.trace_to_progress.then(|| repo_progress.add_child("trace")),
105+
self.state.reverse_trace_lines,
106+
)?;
70107

71108
for repo in &repos {
72109
if gix::interrupt::is_triggered() {
@@ -80,7 +117,7 @@ impl Engine {
80117
.display()
81118
));
82119

83-
// TODO: wait for new release to be able to provide run_id via span attributes
120+
// TODO: wait for new release of `tracing-forest` to be able to provide run_id via span attributes
84121
let mut run = Self::insert_run(&self.con, gitoxide_id, runner_id, *task_id, repo.id)?;
85122
current_id.store(run.id, Ordering::SeqCst);
86123
tracing::info_span!("run", run_id = run.id).in_scope(|| {
@@ -98,17 +135,21 @@ impl Engine {
98135
repo_progress.show_throughput(task_start);
99136
} else {
100137
let counter = repo_progress.counter();
101-
let repo_progress = gix::threading::OwnShared::new(gix::threading::Mutable::new(repo_progress));
138+
let repo_progress = gix::threading::OwnShared::new(gix::threading::Mutable::new(
139+
repo_progress.add_child("will be changed"),
140+
));
102141
gix::parallel::in_parallel_with_slice(
103142
&mut repos,
104143
Some(threads),
105144
{
106145
let shared_repo_progress = repo_progress.clone();
107146
let db_path = db_path.clone();
108147
move |tid| {
148+
let mut progress = gix::threading::lock(&shared_repo_progress);
109149
(
110-
corpus::trace::override_thread_subscriber(db_path.as_str()),
111-
gix::threading::lock(&shared_repo_progress).add_child(format!("{tid}")),
150+
// threaded printing is usually spammy, and lines interleave so it's useless.
151+
corpus::trace::override_thread_subscriber(db_path.as_str(), None, false),
152+
progress.add_child(format!("{tid}")),
112153
rusqlite::Connection::open(&db_path),
113154
)
114155
}
@@ -154,9 +195,9 @@ impl Engine {
154195
gix::threading::lock(&repo_progress).show_throughput(task_start);
155196
}
156197

157-
task_progress.inc();
198+
repo_progress.inc();
158199
}
159-
task_progress.show_throughput(start);
200+
repo_progress.show_throughput(start);
160201
Ok(())
161202
}
162203

@@ -166,13 +207,21 @@ impl Engine {
166207
Ok((corpus_path, corpus_id))
167208
}
168209

169-
fn find_repos(&mut self, corpus_path: &Path, corpus_id: db::Id) -> anyhow::Result<Vec<db::Repo>> {
170-
self.progress.set_name("query db-repos");
171-
self.progress.init(None, gix::progress::count("repos"));
210+
fn find_repos(
211+
&mut self,
212+
corpus_path: &Path,
213+
corpus_id: db::Id,
214+
sql_suffix: Option<&str>,
215+
) -> anyhow::Result<Vec<db::Repo>> {
216+
self.state.progress.set_name("query db-repos");
217+
self.state.progress.init(None, gix::progress::count("repos"));
172218

173219
Ok(self
174220
.con
175-
.prepare("SELECT id, rela_path, odb_size, num_objects, num_references FROM repository WHERE corpus = ?1")?
221+
.prepare(&format!(
222+
"SELECT id, rela_path, odb_size, num_objects, num_references FROM repository WHERE corpus = ?1 {}",
223+
sql_suffix.unwrap_or_default()
224+
))?
176225
.query_map([corpus_id], |r| {
177226
Ok(db::Repo {
178227
id: r.get(0)?,
@@ -182,17 +231,17 @@ impl Engine {
182231
num_references: r.get(4)?,
183232
})
184233
})?
185-
.inspect(|_| self.progress.inc())
234+
.inspect(|_| self.state.progress.inc())
186235
.collect::<Result<_, _>>()?)
187236
}
188237

189238
fn refresh_repos(&mut self, corpus_path: &Path, corpus_id: db::Id) -> anyhow::Result<Vec<db::Repo>> {
190239
let start = Instant::now();
191-
self.progress.set_name("refresh");
192-
self.progress.init(None, gix::progress::count("repos"));
240+
self.state.progress.set_name("refresh");
241+
self.state.progress.init(None, gix::progress::count("repos"));
193242

194243
let repos = std::thread::scope({
195-
let progress = &mut self.progress;
244+
let progress = &mut self.state.progress;
196245
let con = &mut self.con;
197246
|scope| -> anyhow::Result<_> {
198247
let threads = std::thread::available_parallelism()
@@ -264,13 +313,23 @@ impl Engine {
264313
Ok(repos)
265314
}
266315

267-
fn find_repos_or_insert(&mut self, corpus_path: &Path, corpus_id: db::Id) -> anyhow::Result<Vec<db::Repo>> {
316+
fn find_repos_or_insert(
317+
&mut self,
318+
corpus_path: &Path,
319+
corpus_id: db::Id,
320+
sql_suffix: Option<String>,
321+
) -> anyhow::Result<Vec<db::Repo>> {
268322
let start = Instant::now();
269-
let repos = self.find_repos(corpus_path, corpus_id)?;
323+
let repos = self.find_repos(corpus_path, corpus_id, sql_suffix.as_deref())?;
270324
if repos.is_empty() {
271-
self.refresh_repos(corpus_path, corpus_id)
325+
let res = self.refresh_repos(corpus_path, corpus_id);
326+
if sql_suffix.is_some() {
327+
self.find_repos(corpus_path, corpus_id, sql_suffix.as_deref())
328+
} else {
329+
res
330+
}
272331
} else {
273-
self.progress.show_throughput(start);
332+
self.state.progress.show_throughput(start);
274333
Ok(repos)
275334
}
276335
}

0 commit comments

Comments
 (0)