Skip to content

Commit ce3450f

Browse files
emmaling27Convex, Inc.
authored and
Convex, Inc.
committed
Merge deletes correctly for text indexes (#27302)
This PR fixes a bug where we were not updating the deleted terms metadata when reprocessing deletes that happened in a race with a compaction. I added a `merge_deletes` method to the search index trait and had to thread through repeatable persistence to look up previous revisions for text search. It's a shame that we have to re-walk the document log and previous revisions in this race, but it also seems complicated to store the deletes and try to remap them to the right segments because we don't know how many compactions occur while we're writing the new segment to disk. GitOrigin-RevId: e7df528990f8d36bd25b0be11a6b841e192e55ce
1 parent 644068f commit ce3450f

File tree

28 files changed

+344
-101
lines changed

28 files changed

+344
-101
lines changed

crates/common/src/bootstrap_model/index/developer_index_config.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ use super::{
1212
SerializedDeveloperDatabaseIndexConfig,
1313
},
1414
text_index::{
15-
DeveloperSearchIndexConfig,
16-
SerializedDeveloperSearchIndexConfig,
15+
DeveloperTextIndexConfig,
16+
SerializedDeveloperTextIndexConfig,
1717
},
1818
vector_index::{
1919
DeveloperVectorIndexConfig,
@@ -30,7 +30,7 @@ pub enum DeveloperIndexConfig {
3030
Database(DeveloperDatabaseIndexConfig),
3131

3232
/// Full text search index.
33-
Search(DeveloperSearchIndexConfig),
33+
Search(DeveloperTextIndexConfig),
3434

3535
Vector(DeveloperVectorIndexConfig),
3636
}
@@ -60,7 +60,7 @@ enum SerializedDeveloperIndexConfig {
6060
},
6161
Search {
6262
#[serde(flatten)]
63-
config: SerializedDeveloperSearchIndexConfig,
63+
config: SerializedDeveloperTextIndexConfig,
6464
},
6565
Vector {
6666
#[serde(flatten)]

crates/common/src/bootstrap_model/index/index_config.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ use super::{
1414
SerializedDeveloperDatabaseIndexConfig,
1515
},
1616
text_index::{
17-
DeveloperSearchIndexConfig,
18-
SerializedDeveloperSearchIndexConfig,
17+
DeveloperTextIndexConfig,
18+
SerializedDeveloperTextIndexConfig,
1919
SerializedTextIndexState,
2020
TextIndexState,
2121
},
@@ -42,7 +42,7 @@ pub enum IndexConfig {
4242

4343
/// Full text search index.
4444
Search {
45-
developer_config: DeveloperSearchIndexConfig,
45+
developer_config: DeveloperTextIndexConfig,
4646

4747
/// Whether the index is fully backfilled or not on disk.
4848
on_disk_state: TextIndexState,
@@ -165,7 +165,7 @@ pub enum SerializedIndexConfig {
165165
#[serde(rename_all = "camelCase")]
166166
Search {
167167
#[serde(flatten)]
168-
developer_config: SerializedDeveloperSearchIndexConfig,
168+
developer_config: SerializedDeveloperTextIndexConfig,
169169
on_disk_state: SerializedTextIndexState,
170170
},
171171
#[serde(rename_all = "camelCase")]

crates/common/src/bootstrap_model/index/index_metadata.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ use super::{
3131
};
3232
use crate::{
3333
bootstrap_model::index::text_index::{
34-
DeveloperSearchIndexConfig,
34+
DeveloperTextIndexConfig,
3535
TextIndexBackfillState,
3636
TextIndexState,
3737
},
@@ -85,7 +85,7 @@ impl<T: IndexTableIdentifier> IndexMetadata<T> {
8585
) -> Self {
8686
Self::new_search_index(
8787
name,
88-
DeveloperSearchIndexConfig {
88+
DeveloperTextIndexConfig {
8989
search_field,
9090
filter_fields,
9191
},
@@ -118,7 +118,7 @@ impl<T: IndexTableIdentifier> IndexMetadata<T> {
118118

119119
pub fn new_search_index(
120120
name: GenericIndexName<T>,
121-
developer_config: DeveloperSearchIndexConfig,
121+
developer_config: DeveloperTextIndexConfig,
122122
on_disk_state: TextIndexState,
123123
) -> Self {
124124
Self {

crates/common/src/bootstrap_model/index/text_index/index_config.rs

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use crate::paths::FieldPath;
1010

1111
#[derive(Debug, Clone, PartialEq, Eq)]
1212
#[cfg_attr(any(test, feature = "testing"), derive(proptest_derive::Arbitrary))]
13-
pub struct DeveloperSearchIndexConfig {
13+
pub struct DeveloperTextIndexConfig {
1414
/// The field to index for full text search.
1515
pub search_field: FieldPath,
1616

@@ -20,26 +20,26 @@ pub struct DeveloperSearchIndexConfig {
2020

2121
#[derive(Serialize, Deserialize)]
2222
#[serde(rename_all = "camelCase")]
23-
pub struct SerializedDeveloperSearchIndexConfig {
23+
pub struct SerializedDeveloperTextIndexConfig {
2424
search_field: String,
2525
filter_fields: Vec<String>,
2626
}
2727

28-
impl TryFrom<DeveloperSearchIndexConfig> for SerializedDeveloperSearchIndexConfig {
28+
impl TryFrom<DeveloperTextIndexConfig> for SerializedDeveloperTextIndexConfig {
2929
type Error = anyhow::Error;
3030

31-
fn try_from(config: DeveloperSearchIndexConfig) -> anyhow::Result<Self> {
31+
fn try_from(config: DeveloperTextIndexConfig) -> anyhow::Result<Self> {
3232
Ok(Self {
3333
search_field: config.search_field.into(),
3434
filter_fields: config.filter_fields.into_iter().map(String::from).collect(),
3535
})
3636
}
3737
}
3838

39-
impl TryFrom<SerializedDeveloperSearchIndexConfig> for DeveloperSearchIndexConfig {
39+
impl TryFrom<SerializedDeveloperTextIndexConfig> for DeveloperTextIndexConfig {
4040
type Error = anyhow::Error;
4141

42-
fn try_from(config: SerializedDeveloperSearchIndexConfig) -> anyhow::Result<Self> {
42+
fn try_from(config: SerializedDeveloperTextIndexConfig) -> anyhow::Result<Self> {
4343
Ok(Self {
4444
search_field: config.search_field.parse()?,
4545
filter_fields: config
@@ -51,16 +51,13 @@ impl TryFrom<SerializedDeveloperSearchIndexConfig> for DeveloperSearchIndexConfi
5151
}
5252
}
5353

54-
codegen_convex_serialization!(
55-
DeveloperSearchIndexConfig,
56-
SerializedDeveloperSearchIndexConfig
57-
);
54+
codegen_convex_serialization!(DeveloperTextIndexConfig, SerializedDeveloperTextIndexConfig);
5855

59-
impl TryFrom<pb::searchlight::SearchIndexConfig> for DeveloperSearchIndexConfig {
56+
impl TryFrom<pb::searchlight::SearchIndexConfig> for DeveloperTextIndexConfig {
6057
type Error = anyhow::Error;
6158

6259
fn try_from(proto: pb::searchlight::SearchIndexConfig) -> anyhow::Result<Self> {
63-
Ok(DeveloperSearchIndexConfig {
60+
Ok(DeveloperTextIndexConfig {
6461
search_field: proto
6562
.search_field_path
6663
.ok_or_else(|| anyhow::format_err!("Missing search_field_path"))?
@@ -76,8 +73,8 @@ impl TryFrom<pb::searchlight::SearchIndexConfig> for DeveloperSearchIndexConfig
7673
}
7774
}
7875

79-
impl From<DeveloperSearchIndexConfig> for pb::searchlight::SearchIndexConfig {
80-
fn from(config: DeveloperSearchIndexConfig) -> Self {
76+
impl From<DeveloperTextIndexConfig> for pb::searchlight::SearchIndexConfig {
77+
fn from(config: DeveloperTextIndexConfig) -> Self {
8178
pb::searchlight::SearchIndexConfig {
8279
search_field_path: Some(config.search_field.into()),
8380
filter_fields: config

crates/common/src/bootstrap_model/index/text_index/mod.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ pub use self::{
99
TextIndexBackfillState,
1010
},
1111
index_config::{
12-
DeveloperSearchIndexConfig,
13-
SerializedDeveloperSearchIndexConfig,
12+
DeveloperTextIndexConfig,
13+
SerializedDeveloperTextIndexConfig,
1414
},
1515
index_snapshot::{
1616
FragmentedTextSegment,
@@ -36,9 +36,9 @@ mod tests {
3636
#![proptest_config(ProptestConfig { cases: 64 * env_config("CONVEX_PROPTEST_MULTIPLIER", 1), failure_persistence: None, .. ProptestConfig::default() })]
3737

3838
#[test]
39-
fn test_developer_search_index_config_roundtrips(v in any::<DeveloperSearchIndexConfig>()) {
39+
fn test_developer_search_index_config_roundtrips(v in any::<DeveloperTextIndexConfig>()) {
4040
assert_roundtrips::<
41-
DeveloperSearchIndexConfig,
41+
DeveloperTextIndexConfig,
4242
pb::searchlight::SearchIndexConfig
4343
>(v);
4444
}

crates/database/src/bootstrap_model/index.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use common::{
1414
},
1515
index_validation_error,
1616
text_index::{
17-
DeveloperSearchIndexConfig,
17+
DeveloperTextIndexConfig,
1818
TextIndexState,
1919
},
2020
vector_index::{
@@ -897,7 +897,7 @@ impl<'a, RT: Runtime> IndexModel<'a, RT> {
897897
} => IndexMetadata::new_backfilling(*self.tx.begin_timestamp(), index_name, fields),
898898
IndexConfig::Search {
899899
developer_config:
900-
DeveloperSearchIndexConfig {
900+
DeveloperTextIndexConfig {
901901
search_field,
902902
filter_fields,
903903
},

crates/database/src/index_workers/index_meta.rs

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ use value::{
4040
use super::search_flusher::MultipartBuildType;
4141
use crate::Snapshot;
4242

43-
pub trait PreviousSegmentsType: Send {
44-
fn maybe_delete_document(&mut self, convex_id: InternalId) -> anyhow::Result<()>;
45-
}
4643
pub trait SegmentType<T: SearchIndex> {
4744
fn id(&self) -> &str;
4845

@@ -57,13 +54,13 @@ pub trait SearchIndex: Clone + Debug {
5754
type Segment: SegmentType<Self> + Clone + Debug + Send + 'static;
5855
type NewSegment: Send;
5956

60-
type PreviousSegments: PreviousSegmentsType;
57+
type PreviousSegments: Send;
6158

6259
type Statistics: SegmentStatistics;
6360

64-
type BuildIndexArgs: Clone + Send;
61+
type BuildIndexArgs: Clone + Send + 'static;
6562

66-
type Schema: Send + Sync;
63+
type Schema: Send + Sync + 'static;
6764

6865
/// Returns the generalized `SearchIndexConfig` if it matches the type of
6966
/// the parser (e.g. Text vs Vector) and `None` otherwise.
@@ -140,6 +137,16 @@ pub trait SearchIndex: Clone + Debug {
140137
config: &Self::DeveloperConfig,
141138
segments: Vec<Self::Segment>,
142139
) -> anyhow::Result<Self::Segment>;
140+
141+
async fn merge_deletes<RT: Runtime>(
142+
runtime: &RT,
143+
previous_segments: &mut Self::PreviousSegments,
144+
document_stream: DocumentStream<'_>,
145+
repeatable_persistence: &RepeatablePersistence,
146+
build_index_args: Self::BuildIndexArgs,
147+
schema: Self::Schema,
148+
document_log_lower_bound: Timestamp,
149+
) -> anyhow::Result<()>;
143150
}
144151

145152
pub trait SegmentStatistics: Default + Debug {

crates/database/src/index_workers/search_compactor.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ impl<RT: Runtime, T: SearchIndex> SearchIndexCompactor<RT, T> {
198198
segments_to_compact.clone(),
199199
new_segment.clone(),
200200
*SEARCH_WORKER_PASSIVE_PAGES_PER_SECOND,
201+
T::new_schema(&job.developer_config),
201202
)
202203
.await?;
203204
let total_compacted_segments = total_compacted_segments as u64;

crates/database/src/index_workers/search_worker.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use common::{
88
knobs::{
99
BUILD_MULTI_SEGMENT_TEXT_INDEXES,
1010
DATABASE_WORKERS_POLL_INTERVAL,
11+
MULTI_SEGMENT_FULL_SCAN_THRESHOLD_KB,
1112
},
1213
persistence::PersistenceReader,
1314
runtime::{
@@ -47,6 +48,7 @@ use crate::{
4748
new_text_flusher,
4849
TextIndexFlusher2,
4950
},
51+
BuildTextIndexArgs,
5052
TextIndexMetadataWriter,
5153
},
5254
vector_index_worker::{
@@ -55,6 +57,7 @@ use crate::{
5557
VectorIndexCompactor,
5658
},
5759
flusher::new_vector_flusher,
60+
BuildVectorIndexArgs,
5861
},
5962
Database,
6063
TextIndexFlusher,
@@ -99,10 +102,22 @@ impl<RT: Runtime> SearchIndexWorkers<RT> {
99102
let vector_index_metadata_writer = SearchIndexMetadataWriter::new(
100103
runtime.clone(),
101104
database.clone(),
105+
reader.clone(),
102106
search_storage.clone(),
107+
BuildVectorIndexArgs {
108+
full_scan_threshold_bytes: *MULTI_SEGMENT_FULL_SCAN_THRESHOLD_KB,
109+
},
110+
);
111+
let text_index_metadata_writer = TextIndexMetadataWriter::new(
112+
runtime.clone(),
113+
database.clone(),
114+
reader.clone(),
115+
search_storage.clone(),
116+
BuildTextIndexArgs {
117+
search_storage: search_storage.clone(),
118+
segment_term_metadata_fetcher: segment_term_metadata_fetcher.clone(),
119+
},
103120
);
104-
let text_index_metadata_writer =
105-
TextIndexMetadataWriter::new(runtime.clone(), database.clone(), search_storage.clone());
106121
let vector_flush = retry_loop_expect_occs_and_overloaded(
107122
"VectorFlusher",
108123
runtime.clone(),

0 commit comments

Comments
 (0)