Skip to content

Commit ee0acb5

Browse files
emmaling27Convex, Inc.
authored and
Convex, Inc.
committed
Remove USE_MULTI_SEGMENT_SEARCH_QUERY flag and associated code (#27077)
This flag is no longer very useful - the `BUILD_MULTI_SEGMENT_TEXT_INDEXES` flag controls whether we build or query multi-segment text indexes. This change uses the `search2` query path for both single-segment and multi-segment formats and removes the `Ranking` code, which is unused on the multi-segment query path. GitOrigin-RevId: 272e7d55bd2413d2d44a9d9b52dd7b9e099144a8
1 parent c7081a6 commit ee0acb5

File tree

4 files changed

+18
-719
lines changed

4 files changed

+18
-719
lines changed

crates/common/src/knobs.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,18 +1039,10 @@ pub static REQUEST_TRACE_SAMPLE_CONFIG: LazyLock<SamplingConfig> =
10391039
/// We always search using whatever index format is currently built. So this
10401040
/// dictates not just what index type will be built, but also what type of query
10411041
/// we'll use.
1042-
///
1043-
/// USE_MULTI_SEGMENT_SEARCH_QUERY does not dictate whether we'll use multi
1044-
/// segment queries with multi segment indexes. Instead, it only enables using
1045-
/// the multi segment query path against the old single segment index type.
10461042
pub static BUILD_MULTI_SEGMENT_TEXT_INDEXES: LazyLock<bool> =
10471043
LazyLock::new(|| env_config("BUILD_MULTI_SEGMENT_TEXT_INDEXES", true));
10481044

10491045
/// If true, the backend will check the rate limiter service for capacity under
10501046
/// the "backend_startup" domain keyed by db cluster name.
10511047
pub static STARTUP_RATE_LIMIT_ENABLED: LazyLock<bool> =
10521048
LazyLock::new(|| env_config("STARTUP_RATE_LIMIT_ENABLED", false));
1053-
1054-
/// Use multi segment search algorithm for search queries.
1055-
pub static USE_MULTI_SEGMENT_SEARCH_QUERY: LazyLock<bool> =
1056-
LazyLock::new(|| env_config("USE_MULTI_SEGMENT_SEARCH_QUERY", false));

crates/isolate/src/tests/search.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ async fn test_search_disk_index_backfill_error(rt: TestRuntime) -> anyhow::Resul
7373
}
7474

7575
fn is_multi_segment() -> bool {
76-
std::env::var("USE_MULTI_SEGMENT_SEARCH_QUERY").is_ok()
77-
|| std::env::var("BUILD_MULTI_SEGMENT_TEXT_INDEXES").is_ok()
76+
std::env::var("BUILD_MULTI_SEGMENT_TEXT_INDEXES").is_ok()
7877
}
7978

8079
fn assert_search_result_order(results: ConvexArray) -> anyhow::Result<()> {

crates/search/src/lib.rs

Lines changed: 17 additions & 153 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ mod levenshtein_dfa;
2222
mod memory_index;
2323
pub mod metrics;
2424
pub mod query;
25-
mod ranking;
2625
pub mod scoring;
2726
mod search_index_manager;
2827
pub mod searcher;
@@ -46,10 +45,6 @@ use common::{
4645
},
4746
document::ResolvedDocument,
4847
index::IndexKeyBytes,
49-
knobs::{
50-
SEARCHLIGHT_CLUSTER_NAME,
51-
USE_MULTI_SEGMENT_SEARCH_QUERY,
52-
},
5348
query::{
5449
search_value_to_bytes,
5550
InternalSearch,
@@ -112,7 +107,6 @@ use tantivy::{
112107
};
113108
pub use tantivy_query::SearchQueryResult;
114109
use value::{
115-
sorting::TotalOrdF64,
116110
values_to_bytes,
117111
ConvexValue,
118112
FieldPath,
@@ -153,7 +147,6 @@ use crate::{
153147
aggregation::TokenMatchAggregator,
154148
constants::MAX_UNIQUE_QUERY_TERMS,
155149
metrics::log_num_segments_searched_total,
156-
ranking::Ranker,
157150
searcher::{
158151
Bm25Stats,
159152
PostingListQuery,
@@ -685,154 +678,25 @@ impl TantivySearchIndexSchema {
685678
disk_index_ts: Timestamp,
686679
searcher: Arc<dyn Searcher>,
687680
) -> anyhow::Result<RevisionWithKeys> {
688-
if *USE_MULTI_SEGMENT_SEARCH_QUERY {
689-
let number_of_segments = searcher
690-
.number_of_segments(search_storage.clone(), disk_index.clone())
691-
.await?;
692-
let segments = (0..number_of_segments)
693-
.map(|i| TextStorageKeys::SingleSegment {
694-
storage_key: disk_index.clone(),
695-
segment_ord: i as u32,
696-
})
697-
.collect();
698-
return self
699-
.search2(
700-
runtime,
701-
compiled_query,
702-
memory_index,
703-
search_storage,
704-
segments,
705-
disk_index_ts,
706-
searcher,
707-
)
708-
.await;
709-
}
710-
// 1. Fetch the memory index matches for each QueryTerm in the query and bound.
711-
let (term_shortlist, term_shortlist_ids) =
712-
memory_index.bound_and_evaluate_query_terms(&compiled_query.text_query);
713-
714-
// 2. For the shortlisted terms, get the BM25 statistics for each term in the
715-
// memory index.
716-
let memory_stats_diff =
717-
memory_index.bm25_statistics_diff(disk_index_ts, &term_shortlist.terms())?;
718-
719-
// 3. Query memory index tombstones to count overfetch_delta
720-
//
721-
// Our goal is to end up with the top MAX_CANDIDATE_REVISIONS.
722-
// Some of the ones in searchlight will be filtered out if they were edited
723-
// since disk_index_ts. Count how many that is and fetch extra!
724-
let tombstoned_matches = {
725-
let term_list_query = memory_index.build_term_list_bitset_query(
726-
&compiled_query,
727-
&term_shortlist,
728-
&term_shortlist_ids,
729-
);
730-
memory_index.tombstoned_matches(disk_index_ts, &term_list_query)?
731-
};
732-
let overfetch_delta = tombstoned_matches.len();
733-
metrics::log_searchlight_overfetch_delta(overfetch_delta);
734-
let limit = MAX_CANDIDATE_REVISIONS + overfetch_delta;
735-
736-
// 4. Do disk query
737-
let search_results = {
738-
let timer = metrics::searchlight_client_execute_timer(&SEARCHLIGHT_CLUSTER_NAME);
739-
let results = searcher
740-
.execute_query(
741-
search_storage,
742-
disk_index,
743-
self,
744-
compiled_query.clone(),
745-
memory_stats_diff,
746-
term_shortlist,
747-
limit,
748-
)
749-
.await?;
750-
metrics::finish_searchlight_client_execute(timer, &results);
751-
results
752-
};
753-
754-
// 5. Do memory index query
755-
let combined_term_shortlist = search_results.combined_shortlisted_terms;
756-
let combined_term_ids =
757-
memory_index.evaluate_shortlisted_query_terms(&combined_term_shortlist);
758-
let memory_revisions = {
759-
let term_list_query = memory_index.build_term_list_bitset_query(
760-
&compiled_query,
761-
&combined_term_shortlist,
762-
&combined_term_ids,
763-
);
764-
let term_weights = build_term_weights(
765-
&combined_term_shortlist,
766-
&combined_term_ids,
767-
&term_list_query,
768-
search_results.combined_statistics,
769-
)?;
770-
memory_index.query(
771-
disk_index_ts,
772-
&term_list_query,
773-
&combined_term_ids,
774-
&term_weights,
775-
)?
776-
};
777-
778-
// 6. Filter out tombstones
779-
let current_disk_revisions = search_results
780-
.results
781-
.into_iter()
782-
.filter(|revision| !tombstoned_matches.contains(&revision.revision.id));
783-
784-
// 7. Use Bm25 to score top retrieval results
785-
let mut revisions_with_keys: Vec<_> = memory_revisions
786-
.into_iter()
787-
.chain(current_disk_revisions)
788-
.map(|candidate| {
789-
(
790-
(
791-
TotalOrdF64::from(-f64::from(candidate.revision.score)),
792-
TotalOrdF64::from(-f64::from(candidate.revision.creation_time)),
793-
Vec::<u8>::from(candidate.revision.id),
794-
),
795-
candidate,
796-
)
681+
let number_of_segments = searcher
682+
.number_of_segments(search_storage.clone(), disk_index.clone())
683+
.await?;
684+
let segments = (0..number_of_segments)
685+
.map(|i| TextStorageKeys::SingleSegment {
686+
storage_key: disk_index.clone(),
687+
segment_ord: i as u32,
797688
})
798689
.collect();
799-
revisions_with_keys.sort_by_key(|(key, _)| key.clone());
800-
let original_len = revisions_with_keys.len();
801-
revisions_with_keys.truncate(MAX_CANDIDATE_REVISIONS);
802-
metrics::log_num_discarded_revisions(original_len - revisions_with_keys.len());
803-
804-
// 8. Rank results
805-
let ranker = Ranker::create(&compiled_query.text_query, &combined_term_shortlist);
806-
let mut ranked_revisions: Vec<_> = revisions_with_keys
807-
.into_iter()
808-
.map(|(_, candidate)| {
809-
// Search results are in decreasing score order and then tie break
810-
// with decreasing creation time (newest first).
811-
//
812-
// This isn't a true index key -- notably, the last value is not the
813-
// document ID, but we're just using the index key bytes for sorting
814-
// and paginating search results within a table.
815-
let ranking_score = ranker.score(&candidate);
816-
817-
let index_fields = vec![
818-
Some(ConvexValue::Float64(-f64::from(ranking_score))),
819-
Some(ConvexValue::Float64(-f64::from(
820-
candidate.revision.creation_time,
821-
))),
822-
Some(ConvexValue::Bytes(
823-
Vec::<u8>::from(candidate.revision.id)
824-
.try_into()
825-
.expect("Could not convert internal ID to value"),
826-
)),
827-
];
828-
let bytes = values_to_bytes(&index_fields);
829-
let index_key_bytes = IndexKeyBytes(bytes);
830-
(CandidateRevision::from(candidate), index_key_bytes)
831-
})
832-
.collect();
833-
ranked_revisions.sort_by_key(|(_, key)| key.clone());
834-
835-
Ok(ranked_revisions)
690+
self.search2(
691+
runtime,
692+
compiled_query,
693+
memory_index,
694+
search_storage,
695+
segments,
696+
disk_index_ts,
697+
searcher,
698+
)
699+
.await
836700
}
837701

838702
fn compile_tokens_with_typo_tolerance(

0 commit comments

Comments
 (0)