Commit f2790a9

unify how lines in blame results are accessed
* provide `Outcome` with interner and a list of tokens.
* unify the way tokens are created.
1 parent 26bfd2d commit f2790a9

File tree

5 files changed, +89 -41 lines changed


gitoxide-core/src/repository/blame.rs

Lines changed: 15 additions & 24 deletions
@@ -1,4 +1,4 @@
-use std::{ffi::OsStr, path::PathBuf, str::Lines};
+use std::{ffi::OsStr, path::PathBuf};
 
 use anyhow::anyhow;
 use gix::bstr::BStr;
@@ -18,40 +18,31 @@ pub fn blame_file(mut repo: gix::Repository, file: &OsStr, out: impl std::io::Wr
         .into();
     let file_path: &BStr = gix::path::os_str_into_bstr(file)?;
 
-    let blame_entries = gix::blame::file(
+    let outcome = gix::blame::file(
         &repo.objects,
         traverse,
         &mut resource_cache,
         work_dir.clone(),
         file_path,
     )?;
-
-    let absolute_path = work_dir.join(file);
-    let file_content = std::fs::read_to_string(absolute_path)?;
-    let lines = file_content.lines();
-
-    write_blame_entries(out, lines, blame_entries)?;
+    write_blame_entries(out, outcome)?;
 
     Ok(())
 }
 
-fn write_blame_entries(
-    mut out: impl std::io::Write,
-    mut lines: Lines<'_>,
-    blame_entries: Vec<gix::blame::BlameEntry>,
-) -> Result<(), std::io::Error> {
-    for blame_entry in blame_entries {
-        for line_number in blame_entry.range_in_blamed_file {
-            let line = lines.next().unwrap();
-
-            writeln!(
+fn write_blame_entries(mut out: impl std::io::Write, outcome: gix::blame::Outcome) -> Result<(), std::io::Error> {
+    for (entry, lines_in_hunk) in outcome.entries_with_lines() {
+        for ((actual_lno, source_lno), line) in entry
+            .range_in_blamed_file
+            .zip(entry.range_in_original_file)
+            .zip(lines_in_hunk)
+        {
+            write!(
                 out,
-                "{} {} {}",
-                blame_entry.commit_id.to_hex_with_len(8),
-                // `line_number` is 0-based, but we want to show 1-based line numbers (as `git`
-                // does).
-                line_number + 1,
-                line
+                "{short_id} {line_no} {src_line_no} {line}",
+                line_no = actual_lno + 1,
+                src_line_no = source_lno + 1,
+                short_id = entry.commit_id.to_hex_with_len(8),
             )?;
         }
    }
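
The nested `zip` in the new `write_blame_entries` walks three sequences in lock step: line numbers in the blamed file, the corresponding line numbers in the original file, and the line contents themselves. A minimal standalone sketch of that pattern using only plain ranges and literal lines (all values are illustrative, nothing here comes from gix):

fn main() {
    // Stand-ins for `range_in_blamed_file`, `range_in_original_file` and `lines_in_hunk`.
    let range_in_blamed_file = 3u32..5;
    let range_in_original_file = 7u32..9;
    let lines_in_hunk = vec!["fn foo() {}\n", "fn bar() {}\n"];

    for ((actual_lno, source_lno), line) in range_in_blamed_file
        .zip(range_in_original_file)
        .zip(lines_in_hunk)
    {
        // Ranges are 0-based internally; print 1-based numbers like `git blame` does.
        // Each line keeps its trailing newline, hence `print!` rather than `println!` —
        // the same reason the diff above switches from `writeln!` to `write!`.
        print!("{} {} {}", actual_lno + 1, source_lno + 1, line);
    }
}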

gix-blame/src/file/function.rs

Lines changed: 31 additions & 13 deletions
@@ -1,10 +1,10 @@
-use std::{ops::Range, path::PathBuf};
-
+use gix_diff::blob::intern::TokenSource;
 use gix_hash::ObjectId;
 use gix_object::{bstr::BStr, FindExt};
+use std::{ops::Range, path::PathBuf};
 
 use super::{process_changes, Change, Offset, UnblamedHunk};
-use crate::BlameEntry;
+use crate::{BlameEntry, Outcome};
 
 // TODO: do not instantiate anything, get everything passed as argument.
 /// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file
@@ -60,28 +60,35 @@ pub fn file<E>(
     // TODO: remove
     worktree_root: PathBuf,
     file_path: &BStr,
-) -> Result<Vec<BlameEntry>, E> {
+) -> Result<Outcome, E> {
     // TODO: `worktree_root` should be removed - read everything from Commit.
     // Worktree changes should be placed into a temporary commit.
     // TODO: remove this and deduplicate the respective code.
     use gix_object::bstr::ByteSlice;
     let absolute_path = worktree_root.join(gix_path::from_bstr(file_path));
 
-    // TODO use `imara-diff` to tokenize this just like it will be tokenized when diffing.
-    let number_of_lines = std::fs::read_to_string(absolute_path).unwrap().lines().count();
-
     let mut traverse = traverse.into_iter().peekable();
     let Some(Ok(suspect)) = traverse.peek().map(|res| res.as_ref().map(|item| item.id)) else {
         todo!("return actual error");
     };
 
+    let original_file_blob = std::fs::read(absolute_path).unwrap();
+    let num_lines_in_original = {
+        let mut interner = gix_diff::blob::intern::Interner::new(original_file_blob.len() / 100);
+        tokens_for_diffing(&original_file_blob)
+            .tokenize()
+            .map(|token| interner.intern(token))
+            .count()
+    };
+
     let mut hunks_to_blame = vec![UnblamedHunk::new(
-        0..number_of_lines.try_into().unwrap(),
+        0..num_lines_in_original.try_into().unwrap(),
         suspect,
         Offset::Added(0),
     )];
 
     let mut out = Vec::new();
+    let mut buf = Vec::with_capacity(512);
     'outer: for item in traverse {
         let item = item?;
         let suspect = item.id;
@@ -103,9 +110,8 @@ pub fn file<E>(
             break;
         }
 
-        let mut buffer = Vec::new();
-        let commit_id = odb.find_commit(&suspect, &mut buffer).unwrap().tree();
-        let tree_iter = odb.find_tree_iter(&commit_id, &mut buffer).unwrap();
+        let commit_id = odb.find_commit(&suspect, &mut buf).unwrap().tree();
+        let tree_iter = odb.find_tree_iter(&commit_id, &mut buf).unwrap();
 
         let mut entry_buffer = Vec::new();
         let Some(entry) = tree_iter
@@ -247,7 +253,10 @@ pub fn file<E>(
     // I don’t know yet whether it would make sense to use a data structure instead that preserves
     // order on insertion.
     out.sort_by(|a, b| a.range_in_blamed_file.start.cmp(&b.range_in_blamed_file.start));
-    Ok(coalesce_blame_entries(out))
+    Ok(Outcome {
+        entries: coalesce_blame_entries(out),
+        blob: original_file_blob,
+    })
 }
 
 /// This function merges adjacent blame entries. It merges entries that are adjacent both in the
@@ -416,9 +425,18 @@ fn blob_changes(
         .unwrap();
 
     let outcome = resource_cache.prepare_diff().unwrap();
-    let input = outcome.interned_input();
+    let input = gix_diff::blob::intern::InternedInput::new(
+        tokens_for_diffing(outcome.old.data.as_slice().unwrap_or_default()),
+        tokens_for_diffing(outcome.new.data.as_slice().unwrap_or_default()),
+    );
     let number_of_lines_in_destination = input.after.len();
     let change_recorder = ChangeRecorder::new(number_of_lines_in_destination.try_into().unwrap());
 
     gix_diff::blob::diff(gix_diff::blob::Algorithm::Histogram, &input, change_recorder)
 }
+
+/// Return an iterator over tokens for use in diffing. These are usually lines, but it's important to unify them
+/// so that later access shows the right thing.
+pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource<Token = &[u8]> {
+    gix_diff::blob::sources::byte_lines_with_terminator(data)
+}
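
The new `tokens_for_diffing` helper is also what the line count at the top of `file()` is now based on: the original file is read as raw bytes and tokenized exactly like the diff inputs, instead of going through `read_to_string(..).lines()`. A condensed sketch of that counting step, written against the `imara-diff` crate directly (an assumption made only to keep the example self-contained; `gix_diff::blob` re-exports it):

use imara_diff::intern::{Interner, TokenSource};
use imara_diff::sources::byte_lines_with_terminator;

/// Count lines the same way the diff will see them: one token per line, terminator included.
fn count_lines(blob: &[u8]) -> usize {
    // Capacity heuristic mirrors the commit: roughly one distinct token per 100 bytes.
    let mut interner = Interner::new(blob.len() / 100);
    byte_lines_with_terminator(blob)
        .tokenize()
        .map(|token| interner.intern(token))
        .count()
}

fn main() {
    // The final line counts whether or not it ends in a newline,
    // and the blob does not need to be valid UTF-8.
    assert_eq!(count_lines(b"a\nb\nc\n"), 3);
    assert_eq!(count_lines(b"a\nb\nc"), 3);
}

Interning is not strictly needed for the count itself, but it keeps this step symmetrical with how `blob_changes` builds its `InternedInput`, which is the point of the unification.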

gix-blame/src/lib.rs

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 #![forbid(unsafe_code)]
 
 mod types;
-pub use types::BlameEntry;
+pub use types::{BlameEntry, Outcome};
 
 mod file;
 pub use file::function::file;

gix-blame/src/types.rs

Lines changed: 38 additions & 1 deletion
@@ -3,7 +3,43 @@ use std::{
     ops::{AddAssign, Range, SubAssign},
 };
 
+use crate::file::function::tokens_for_diffing;
 use gix_hash::ObjectId;
+use gix_object::bstr::BString;
+
+/// The outcome of [`file()`](crate::file()).
+pub struct Outcome {
+    /// One entry in sequential order, to associate a hunk in the original file with the commit (and its lines)
+    /// that introduced it.
+    pub entries: Vec<BlameEntry>,
+    /// A buffer with the file content of the *Original File*, ready for tokenization.
+    pub blob: Vec<u8>,
+}
+
+impl Outcome {
+    /// Return an iterator over each entry in [`Self::entries`], along with its lines, line by line.
+    ///
+    /// Note that [`Self::blob`] must be tokenized in exactly the same way as the tokenizer that was used
+    /// to perform the diffs, which is what this method assures.
+    pub fn entries_with_lines(&self) -> impl Iterator<Item = (BlameEntry, Vec<BString>)> + '_ {
+        use gix_diff::blob::intern::TokenSource;
+        let mut interner = gix_diff::blob::intern::Interner::new(self.blob.len() / 100);
+        let lines_as_tokens: Vec<_> = tokens_for_diffing(&self.blob)
+            .tokenize()
+            .map(|token| interner.intern(token))
+            .collect();
+        self.entries.iter().map(move |e| {
+            let Range { start, end } = e.range_in_blamed_file.clone();
+            (
+                e.clone(),
+                lines_as_tokens[start as usize..end as usize]
+                    .iter()
+                    .map(|token| BString::new(interner[*token].into()))
+                    .collect(),
+            )
+        })
+    }
+}
 
 /// Describes the offset of a particular hunk relative to the *Original File*.
 #[derive(Clone, Copy, Debug, PartialEq)]
@@ -68,11 +104,12 @@ impl SubAssign<u32> for Offset {
 /// Both ranges are of the same size, but may use different [starting points](Range::start). Naturally,
 /// they have the same content, which is the reason they are in what is returned by [`file()`](crate::file()).
 // TODO: see if this can be encoded as `start_in_original_file` and `start_in_blamed_file` and a single `len`.
-#[derive(Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq)]
 pub struct BlameEntry {
     /// The section of tokens in the tokenized version of the *Blamed File* (typically lines).
     pub range_in_blamed_file: Range<u32>,
     /// The section of tokens in the tokenized version of the *Original File* (typically lines).
+    // TODO: figure out why this is basically inverted. Probably that's just it - would make sense with `UnblamedHunk` then.
     pub range_in_original_file: Range<u32>,
     /// The commit that introduced the section into the *Blamed File*.
     pub commit_id: ObjectId,
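
`entries_with_lines()` recovers line contents by interning the blob once and then slicing the token list with each entry's `range_in_blamed_file`; the interner maps every token back to the original bytes. A small sketch of that round-trip, again using `imara-diff` directly (assumed here only to keep the example self-contained) and a hypothetical hunk range:

use imara_diff::intern::{Interner, TokenSource};
use imara_diff::sources::byte_lines_with_terminator;

fn main() {
    let blob = b"first line\nsecond line\nthird line\n";

    // Tokenize once, exactly as the diff would, and remember one token per line.
    let mut interner = Interner::new(blob.len() / 100);
    let lines_as_tokens: Vec<_> = byte_lines_with_terminator(blob)
        .tokenize()
        .map(|token| interner.intern(token))
        .collect();

    // A hypothetical hunk covering lines 1..3 (0-based), standing in for `range_in_blamed_file`.
    let hunk = 1usize..3;
    let lines: Vec<&[u8]> = lines_as_tokens[hunk]
        .iter()
        .map(|token| interner[*token])
        .collect();

    assert_eq!(lines, vec![&b"second line\n"[..], &b"third line\n"[..]]);
}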

gix-blame/tests/blame.rs

Lines changed: 4 additions & 2 deletions
@@ -199,7 +199,8 @@ macro_rules! mktest {
                 worktree_path,
                 format!("{}.txt", $case).as_str().into(),
             )
-            .unwrap();
+            .unwrap()
+            .entries;
 
             assert_eq!(lines_blamed.len(), $number_of_lines);
 
@@ -259,7 +260,8 @@ fn diff_disparity() {
         worktree_path,
         format!("{case}.txt").as_str().into(),
     )
-    .unwrap();
+    .unwrap()
+    .entries;
 
     assert_eq!(lines_blamed.len(), 5);
 