Skip to content

Commit 3801ec3

Browse files
committed
feat: support workspace filters when fetching them from the object database
1 parent 5787434 commit 3801ec3

File tree

13 files changed

+1299
-67
lines changed

13 files changed

+1299
-67
lines changed

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crate-status.md

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -293,16 +293,27 @@ The top-level crate that acts as hub to all functionality provided by the `gix-*
293293
Check out the [performance discussion][gix-diff-performance] as well.
294294

295295
* **tree**
296-
* [x] changes needed to obtain _other tree_
296+
* [x] changes needed to obtain _other tree_
297297
* **patches**
298-
* There are various ways to generate a patch from two blobs.
299-
* [ ] any
298+
* There are various ways to generate a patch from two blobs.
299+
* [ ] text
300+
* [ ] binary
300301
* **lines**
301-
* [x] Simple line-by-line diffs powered by the `imara-diff` crate.
302-
* diffing, merging, working with hunks of data
303-
* find differences between various states, i.e. index, working tree, commit-tree
302+
* [x] Simple line-by-line diffs powered by the `imara-diff` crate.
303+
* **generic rename tracker to find renames and copies**
304+
* [x] find by exact match
305+
* [x] find by similarity check
306+
* [ ] heuristics to find best candidate
307+
* [ ] find by basename to help detecting simple moves
308+
* **blob**
309+
* [ ] worktree conversions
310+
* [ ] `textconv` filters
311+
* [ ] caching of diff-able data
312+
* [ ] special handling of files beyond the big-file threshold.
313+
* [ ] detection of binary files by looking at header (first 8k bytes)
314+
* [ ] working with hunks of data
304315
* [x] API documentation
305-
* [ ] Examples
316+
* [ ] Examples
306317

307318
[gix-diff-performance]: https://github.com/Byron/gitoxide/discussions/74
308319

gix-diff/Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ autotests = false
1313
[features]
1414
default = ["blob"]
1515
## Enable diffing of blobs using imara-diff, which also allows for a generic rewrite tracking implementation.
16-
blob = ["dep:imara-diff"]
16+
blob = ["dep:imara-diff", "dep:gix-filter", "dep:gix-worktree"]
1717
## Data structures implement `serde::Serialize` and `serde::Deserialize`.
1818
serde = ["dep:serde", "gix-hash/serde", "gix-object/serde"]
1919
## Make it possible to compile to the `wasm32-unknown-unknown` target.
@@ -25,6 +25,8 @@ doctest = false
2525
[dependencies]
2626
gix-hash = { version = "^0.13.1", path = "../gix-hash" }
2727
gix-object = { version = "^0.38.0", path = "../gix-object" }
28+
gix-filter = { version = "^0.6.0", path = "../gix-filter", optional = true }
29+
gix-worktree = { version = "^0.27.0", path = "../gix-worktree", default-features = false, features = ["attributes"], optional = true }
2830

2931
thiserror = "1.0.32"
3032
imara-diff = { version = "0.1.3", optional = true }

gix-diff/src/blob.rs

Lines changed: 0 additions & 18 deletions
This file was deleted.

gix-diff/src/blob/mod.rs

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
//! For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff),
2+
//! maintained by [Pascal Kuthe](https://github.com/pascalkuthe).
3+
pub use imara_diff::*;
4+
use std::collections::HashMap;
5+
use std::path::PathBuf;
6+
7+
use bstr::BString;
8+
9+
/// Information about the diff performed to detect similarity.
10+
#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)]
11+
pub struct DiffLineStats {
12+
/// The number of lines to remove from the source to get to the destination.
13+
pub removals: u32,
14+
/// The number of lines to add to the source to get to the destination.
15+
pub insertions: u32,
16+
/// The number of lines of the previous state, in the source.
17+
pub before: u32,
18+
/// The number of lines of the new state, in the destination.
19+
pub after: u32,
20+
/// A range from 0 to 1.0, where 1.0 is a perfect match and 0.5 is a similarity of 50%.
21+
/// Similarity is the ratio between all lines in the previous blob and the current blob,
22+
/// calculated as `(old_lines_count - new_lines_count) as f32 / old_lines_count.max(new_lines_count) as f32`.
23+
pub similarity: f32,
24+
}
25+
26+
/// A way to classify a resource suitable for diffing.
27+
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq, Hash)]
28+
pub enum ResourceKind {
29+
/// The source of a rewrite, rename or copy operation, or generally the old version of a resource.
30+
OldOrSource,
31+
/// The destination of a rewrite, rename or copy operation, or generally the new version of a resource.
32+
NewOrDestination,
33+
}
34+
35+
/// A set of values to define how to diff something that is associated with it using `git-attributes`, relevant for regular files.
36+
///
37+
/// Some values are related to diffing, some are related to conversions.
38+
#[derive(Debug, Clone)]
39+
pub struct Driver {
40+
/// The name of the driver, as referred to by `[diff "name"]` in the git configuration.
41+
pub name: BString,
42+
/// The per-driver algorithm to use.
43+
pub algorithm: Option<Algorithm>,
44+
/// The external filter program to call like `<binary_to_text_command> /path/to/blob` which outputs a textual version of the provided
45+
/// binary file.
46+
/// Note that it's invoked with a shell if arguments are given.
47+
pub binary_to_text_command: Option<BString>,
48+
/// `true` if this driver deals with binary files, which means that a `binary_to_text_command` should be used to convert binary
49+
/// into a textual representation.
50+
pub is_binary: bool,
51+
}
52+
53+
/// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree.
54+
#[derive(Clone, Debug, Default)]
55+
pub struct WorktreeRoots {
56+
/// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located.
57+
pub old_root: Option<PathBuf>,
58+
/// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located.
59+
pub new_root: Option<PathBuf>,
60+
}
61+
62+
/// A conversion pipeline to take an object or path from what's stored in `git` to what can be diffed, while
63+
/// following the guidance of git-attributes at the respective path to learn if diffing should happen or if
64+
/// the content is considered binary.
65+
///
66+
/// There are two different conversion flows, where the target of the flow is a buffer with diffable content:
67+
///
68+
/// * `worktree on disk` -> `text conversion`
69+
/// * `object` -> `worktree-filters` -> `text conversion`
70+
///
71+
/// Based on whether or not [`WorktreeRoots`] has the file in question, we either read directly from disk
72+
/// or transform from the object database.
73+
pub struct Pipeline {
74+
/// A way to read data directly from the worktree.
75+
pub roots: WorktreeRoots,
76+
/// A pipeline to convert objects from what's stored in `git` to its worktree version.
77+
pub worktree_filter: gix_filter::Pipeline,
78+
/// Drivers to help customize the conversion behaviour depending on the location of items.
79+
pub drivers: Vec<Driver>,
80+
/// The number of bytes that an object has to reach before being treated as binary.
81+
/// These objects will not be queried, nor will their data be processed in any way.
82+
pub large_file_threshold_bytes: u64,
83+
84+
/// Pre-configured attributes to obtain additional diff-related information.
85+
attrs: gix_filter::attributes::search::Outcome,
86+
}
87+
88+
/// A utility for performing a diff of two blobs, including flexible conversions, conversion-caching
89+
/// and acquisition of diff information.
90+
/// Note that this instance will not call external filters as their output can't be known programmatically,
91+
/// but it allows preparing their input if the caller wishes to perform this task.
92+
///
93+
/// Optimized for NxM lookups with built-in caching.
94+
pub struct Platform {
95+
/// The old version of a diff-able blob, if set.
96+
old: Option<platform::Diffable>,
97+
/// The new version of a diff-able blob, if set.
98+
new: Option<platform::Diffable>,
99+
100+
/// Options to alter how diffs should be performed.
101+
pub options: platform::Options,
102+
/// A way to convert objects into a diff-able format.
103+
pub filter: Pipeline,
104+
/// A way to access `.gitattributes`.
105+
pub attr_stack: gix_worktree::Stack,
106+
/// A continuously growing cache keeping ready-for-diff blobs by their path in the worktree,
107+
/// as that is what affects their final diff-able state.
108+
///
109+
/// That way, expensive rewrite-checks with NxM matrix checks would be as fast as possible,
110+
/// avoiding duplicate work.
111+
diff_cache: HashMap<platform::CacheKey, platform::CacheValue>,
112+
}
113+
114+
mod impls {
115+
use crate::blob::{ResourceKind, WorktreeRoots};
116+
use std::path::Path;
117+
118+
impl std::fmt::Display for ResourceKind {
119+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
120+
f.write_str(match self {
121+
ResourceKind::OldOrSource => "old",
122+
ResourceKind::NewOrDestination => "new",
123+
})
124+
}
125+
}
126+
127+
impl WorktreeRoots {
128+
/// Return the root path for the given `kind`, or `None` if no such root is set.
129+
pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
130+
match kind {
131+
ResourceKind::OldOrSource => self.old_root.as_deref(),
132+
ResourceKind::NewOrDestination => self.new_root.as_deref(),
133+
}
134+
}
135+
}
136+
}
137+
138+
///
139+
pub mod pipeline;
140+
141+
///
142+
pub mod platform;

gix-diff/src/blob/pipeline.rs

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
use crate::blob::{Pipeline, ResourceKind, WorktreeRoots};
2+
use bstr::BStr;
3+
4+
/// Data as part of an [Outcome].
5+
#[derive(Copy, Clone)]
6+
pub enum Data {
7+
/// The data to use for diffing was written into the buffer that was passed during the call to [`Pipeline::convert_to_diffable()`].
8+
Buffer,
9+
/// The size that the binary blob had at the given revision, without having applied filters, as it's either
10+
/// considered binary or above the big-file threshold.
11+
///
12+
/// In this state, the binary file cannot be diffed.
13+
Binary {
14+
/// The size of the object prior to performing any filtering or as it was found on disk.
15+
size: u64,
16+
},
17+
}
18+
19+
/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
20+
#[derive(Copy, Clone)]
21+
pub struct Outcome {
22+
/// If available, an index into the `drivers` field to access more diff-related information of the driver for items
23+
/// at the given path, as previously determined by git-attributes.
24+
pub driver_index: Option<usize>,
25+
/// The data itself, suitable for diffing.
26+
pub data: Data,
27+
}
28+
29+
///
30+
pub mod convert_to_diffable {
31+
/// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
32+
#[derive(Debug, thiserror::Error)]
33+
#[allow(missing_docs)]
34+
pub enum Error {}
35+
}
36+
37+
/// Lifecycle
38+
impl Pipeline {
39+
/// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allows reading worktree files directly, otherwise
40+
/// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths.
41+
/// `large_file_threshold_bytes` is used to determine which objects or files are too large to process; any size is allowed if this
42+
/// is set to `0`.
43+
pub fn new(
44+
roots: WorktreeRoots,
45+
worktree_filter: gix_filter::Pipeline,
46+
drivers: Vec<super::Driver>,
47+
large_file_threshold_bytes: u64,
48+
) -> Self {
49+
Pipeline {
50+
roots,
51+
worktree_filter,
52+
drivers,
53+
large_file_threshold_bytes,
54+
attrs: {
55+
let mut out = gix_filter::attributes::search::Outcome::default();
56+
out.initialize_with_selection(&Default::default(), Some("diff"));
57+
out
58+
},
59+
}
60+
}
61+
}
62+
63+
/// Conversion
64+
impl Pipeline {
65+
/// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`.
66+
/// The resulting diff-able data is written into `out`.
67+
pub fn convert_to_diffable(
68+
&mut self,
69+
id: &gix_hash::oid,
70+
mode: gix_object::tree::EntryKind,
71+
rela_path: &BStr,
72+
kind: ResourceKind,
73+
attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome),
74+
objects: &dyn gix_object::FindObjectOrHeader,
75+
out: &mut Vec<u8>,
76+
) -> Result<Outcome, convert_to_diffable::Error> {
77+
todo!()
78+
}
79+
}

0 commit comments

Comments
 (0)