Skip to content

Commit ee0acb5

Browse files
emmaling27Convex, Inc.
authored and
Convex, Inc.
committed
Remove USE_MULTI_SEGMENT_SEARCH_QUERY flag and associated code (#27077)
This flag is no longer very useful - the `BUILD_MULTI_SEGMENT_TEXT_INDEXES` flag controls whether we build or query multi-segment text indexes. This change uses the `search2` query path for both single-segment and multi-segment formats and removes the `Ranking` code, which is unused on the multi-segment query path. GitOrigin-RevId: 272e7d55bd2413d2d44a9d9b52dd7b9e099144a8
1 parent c7081a6 commit ee0acb5

File tree

4 files changed

+18
-719
lines changed

4 files changed

+18
-719
lines changed

crates/common/src/knobs.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,18 +1039,10 @@ pub static REQUEST_TRACE_SAMPLE_CONFIG: LazyLock<SamplingConfig> =
10391039
/// We always search using whatever index format is currently built. So this
10401040
/// dictates not just what index type will be built, but also what type of query
10411041
/// we'll use.
1042-
///
1043-
/// USE_MULTI_SEGMENT_SEARCH_QUERY does not dictate whether we'll use multi
1044-
/// segment queries with multi segment indexes. Instead, it only enables using
1045-
/// the multi segment query path against the old single segment index type.
10461042
pub static BUILD_MULTI_SEGMENT_TEXT_INDEXES: LazyLock<bool> =
10471043
LazyLock::new(|| env_config("BUILD_MULTI_SEGMENT_TEXT_INDEXES", true));
10481044

10491045
/// If true, the backend will check the rate limiter service for capacity under
10501046
/// the "backend_startup" domain keyed by db cluster name.
10511047
pub static STARTUP_RATE_LIMIT_ENABLED: LazyLock<bool> =
10521048
LazyLock::new(|| env_config("STARTUP_RATE_LIMIT_ENABLED", false));
1053-
1054-
/// Use multi segment search algorithm for search queries.
1055-
pub static USE_MULTI_SEGMENT_SEARCH_QUERY: LazyLock<bool> =
1056-
LazyLock::new(|| env_config("USE_MULTI_SEGMENT_SEARCH_QUERY", false));

crates/isolate/src/tests/search.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ async fn test_search_disk_index_backfill_error(rt: TestRuntime) -> anyhow::Resul
7373
}
7474

7575
fn is_multi_segment() -> bool {
76-
std::env::var("USE_MULTI_SEGMENT_SEARCH_QUERY").is_ok()
77-
|| std::env::var("BUILD_MULTI_SEGMENT_TEXT_INDEXES").is_ok()
76+
std::env::var("BUILD_MULTI_SEGMENT_TEXT_INDEXES").is_ok()
7877
}
7978

8079
fn assert_search_result_order(results: ConvexArray) -> anyhow::Result<()> {

crates/search/src/lib.rs

Lines changed: 17 additions & 153 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ mod levenshtein_dfa;
2222
mod memory_index;
2323
pub mod metrics;
2424
pub mod query;
25-
mod ranking;
2625
pub mod scoring;
2726
mod search_index_manager;
2827
pub mod searcher;
@@ -46,10 +45,6 @@ use common::{
4645
},
4746
document::ResolvedDocument,
4847
index::IndexKeyBytes,
49-
knobs::{
50-
SEARCHLIGHT_CLUSTER_NAME,
51-
USE_MULTI_SEGMENT_SEARCH_QUERY,
52-
},
5348
query::{
5449
search_value_to_bytes,
5550
InternalSearch,
@@ -112,7 +107,6 @@ use tantivy::{
112107
};
113108
pub use tantivy_query::SearchQueryResult;
114109
use value::{
115-
sorting::TotalOrdF64,
116110
values_to_bytes,
117111
ConvexValue,
118112
FieldPath,
@@ -153,7 +147,6 @@ use crate::{
153147
aggregation::TokenMatchAggregator,
154148
constants::MAX_UNIQUE_QUERY_TERMS,
155149
metrics::log_num_segments_searched_total,
156-
ranking::Ranker,
157150
searcher::{
158151
Bm25Stats,
159152
PostingListQuery,
@@ -685,154 +678,25 @@ impl TantivySearchIndexSchema {
685678
disk_index_ts: Timestamp,
686679
searcher: Arc<dyn Searcher>,
687680
) -> anyhow::Result<RevisionWithKeys> {
688-
if *USE_MULTI_SEGMENT_SEARCH_QUERY {
689-
let number_of_segments = searcher
690-
.number_of_segments(search_storage.clone(), disk_index.clone())
691-
.await?;
692-
let segments = (0..number_of_segments)
693-
.map(|i| TextStorageKeys::SingleSegment {
694-
storage_key: disk_index.clone(),
695-
segment_ord: i as u32,
696-
})
697-
.collect();
698-
return self
699-
.search2(
700-
runtime,
701-
compiled_query,
702-
memory_index,
703-
search_storage,
704-
segments,
705-
disk_index_ts,
706-
searcher,
707-
)
708-
.await;
709-
}
710-
// 1. Fetch the memory index matches for each QueryTerm in the query and bound.
711-
let (term_shortlist, term_shortlist_ids) =
712-
memory_index.bound_and_evaluate_query_terms(&compiled_query.text_query);
713-
714-
// 2. For the shortlisted terms, get the BM25 statistics for each term in the
715-
// memory index.
716-
let memory_stats_diff =
717-
memory_index.bm25_statistics_diff(disk_index_ts, &term_shortlist.terms())?;
718-
719-
// 3. Query memory index tombstones to count overfetch_delta
720-
//
721-
// Our goal is to end up with the top MAX_CANDIDATE_REVISIONS.
722-
// Some of the ones in searchlight will be filtered out if they were edited
723-
// since disk_index_ts. Count how many that is and fetch extra!
724-
let tombstoned_matches = {
725-
let term_list_query = memory_index.build_term_list_bitset_query(
726-
&compiled_query,
727-
&term_shortlist,
728-
&term_shortlist_ids,
729-
);
730-
memory_index.tombstoned_matches(disk_index_ts, &term_list_query)?
731-
};
732-
let overfetch_delta = tombstoned_matches.len();
733-
metrics::log_searchlight_overfetch_delta(overfetch_delta);
734-
let limit = MAX_CANDIDATE_REVISIONS + overfetch_delta;
735-
736-
// 4. Do disk query
737-
let search_results = {
738-
let timer = metrics::searchlight_client_execute_timer(&SEARCHLIGHT_CLUSTER_NAME);
739-
let results = searcher
740-
.execute_query(
741-
search_storage,
742-
disk_index,
743-
self,
744-
compiled_query.clone(),
745-
memory_stats_diff,
746-
term_shortlist,
747-
limit,
748-
)
749-
.await?;
750-
metrics::finish_searchlight_client_execute(timer, &results);
751-
results
752-
};
753-
754-
// 5. Do memory index query
755-
let combined_term_shortlist = search_results.combined_shortlisted_terms;
756-
let combined_term_ids =
757-
memory_index.evaluate_shortlisted_query_terms(&combined_term_shortlist);
758-
let memory_revisions = {
759-
let term_list_query = memory_index.build_term_list_bitset_query(
760-
&compiled_query,
761-
&combined_term_shortlist,
762-
&combined_term_ids,
763-
);
764-
let term_weights = build_term_weights(
765-
&combined_term_shortlist,
766-
&combined_term_ids,
767-
&term_list_query,
768-
search_results.combined_statistics,
769-
)?;
770-
memory_index.query(
771-
disk_index_ts,
772-
&term_list_query,
773-
&combined_term_ids,
774-
&term_weights,
775-
)?
776-
};
777-
778-
// 6. Filter out tombstones
779-
let current_disk_revisions = search_results
780-
.results
781-
.into_iter()
782-
.filter(|revision| !tombstoned_matches.contains(&revision.revision.id));
783-
784-
// 7. Use Bm25 to score top retrieval results
785-
let mut revisions_with_keys: Vec<_> = memory_revisions
786-
.into_iter()
787-
.chain(current_disk_revisions)
788-
.map(|candidate| {
789-
(
790-
(
791-
TotalOrdF64::from(-f64::from(candidate.revision.score)),
792-
TotalOrdF64::from(-f64::from(candidate.revision.creation_time)),
793-
Vec::<u8>::from(candidate.revision.id),
794-
),
795-
candidate,
796-
)
681+
let number_of_segments = searcher
682+
.number_of_segments(search_storage.clone(), disk_index.clone())
683+
.await?;
684+
let segments = (0..number_of_segments)
685+
.map(|i| TextStorageKeys::SingleSegment {
686+
storage_key: disk_index.clone(),
687+
segment_ord: i as u32,
797688
})
798689
.collect();
799-
revisions_with_keys.sort_by_key(|(key, _)| key.clone());
800-
let original_len = revisions_with_keys.len();
801-
revisions_with_keys.truncate(MAX_CANDIDATE_REVISIONS);
802-
metrics::log_num_discarded_revisions(original_len - revisions_with_keys.len());
803-
804-
// 8. Rank results
805-
let ranker = Ranker::create(&compiled_query.text_query, &combined_term_shortlist);
806-
let mut ranked_revisions: Vec<_> = revisions_with_keys
807-
.into_iter()
808-
.map(|(_, candidate)| {
809-
// Search results are in decreasing score order and then tie break
810-
// with decreasing creation time (newest first).
811-
//
812-
// This isn't a true index key -- notably, the last value is not the
813-
// document ID, but we're just using the index key bytes for sorting
814-
// and paginating search results within a table.
815-
let ranking_score = ranker.score(&candidate);
816-
817-
let index_fields = vec![
818-
Some(ConvexValue::Float64(-f64::from(ranking_score))),
819-
Some(ConvexValue::Float64(-f64::from(
820-
candidate.revision.creation_time,
821-
))),
822-
Some(ConvexValue::Bytes(
823-
Vec::<u8>::from(candidate.revision.id)
824-
.try_into()
825-
.expect("Could not convert internal ID to value"),
826-
)),
827-
];
828-
let bytes = values_to_bytes(&index_fields);
829-
let index_key_bytes = IndexKeyBytes(bytes);
830-
(CandidateRevision::from(candidate), index_key_bytes)
831-
})
832-
.collect();
833-
ranked_revisions.sort_by_key(|(_, key)| key.clone());
834-
835-
Ok(ranked_revisions)
690+
self.search2(
691+
runtime,
692+
compiled_query,
693+
memory_index,
694+
search_storage,
695+
segments,
696+
disk_index_ts,
697+
searcher,
698+
)
699+
.await
836700
}
837701

838702
fn compile_tokens_with_typo_tolerance(

0 commit comments

Comments
 (0)