Skip to content

Commit 771258e

Browse files
gerzse authored and vladvildanov committed
Support missing and empty values in search (#3231)
Add support for indexing and searching missing and empty values. Currently there are some limitations on the server side; for example, empty values are supported only for TEXT and TAG fields.
1 parent b9fd4fa commit 771258e

File tree

3 files changed

+205
-44
lines changed

3 files changed

+205
-44
lines changed

redis/commands/search/commands.py

Lines changed: 34 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@
99
from ._util import to_string
1010
from .aggregation import AggregateRequest, AggregateResult, Cursor
1111
from .document import Document
12+
from .field import Field
13+
from .indexDefinition import IndexDefinition
1214
from .query import Query
1315
from .result import Result
1416
from .suggestion import SuggestionParser
@@ -151,44 +153,43 @@ def batch_indexer(self, chunk_size=100):
151153

152154
def create_index(
153155
self,
154-
fields,
155-
no_term_offsets=False,
156-
no_field_flags=False,
157-
stopwords=None,
158-
definition=None,
156+
fields: List[Field],
157+
no_term_offsets: bool = False,
158+
no_field_flags: bool = False,
159+
stopwords: Optional[List[str]] = None,
160+
definition: Optional[IndexDefinition] = None,
159161
max_text_fields=False,
160162
temporary=None,
161-
no_highlight=False,
162-
no_term_frequencies=False,
163-
skip_initial_scan=False,
163+
no_highlight: bool = False,
164+
no_term_frequencies: bool = False,
165+
skip_initial_scan: bool = False,
164166
):
165167
"""
166-
Create the search index. The index must not already exist.
167-
168-
### Parameters:
169-
170-
- **fields**: a list of TextField or NumericField objects
171-
- **no_term_offsets**: If true, we will not save term offsets in
172-
the index
173-
- **no_field_flags**: If true, we will not save field flags that
174-
allow searching in specific fields
175-
- **stopwords**: If not None, we create the index with this custom
176-
stopword list. The list can be empty
177-
- **max_text_fields**: If true, we will encode indexes as if there
178-
were more than 32 text fields which allows you to add additional
179-
fields (beyond 32).
180-
- **temporary**: Create a lightweight temporary index which will
181-
expire after the specified period of inactivity (in seconds). The
182-
internal idle timer is reset whenever the index is searched or added to.
183-
- **no_highlight**: If true, disabling highlighting support.
184-
Also implied by no_term_offsets.
185-
- **no_term_frequencies**: If true, we avoid saving the term frequencies
186-
in the index.
187-
- **skip_initial_scan**: If true, we do not scan and index.
188-
189-
For more information see `FT.CREATE <https://redis.io/commands/ft.create>`_.
190-
""" # noqa
168+
Creates the search index. The index must not already exist.
169+
170+
For more information, see https://redis.io/commands/ft.create/
171+
172+
Args:
173+
fields: A list of Field objects.
174+
no_term_offsets: If `true`, term offsets will not be saved in the index.
175+
no_field_flags: If true, field flags that allow searching in specific fields
176+
will not be saved.
177+
stopwords: If provided, the index will be created with this custom stopword
178+
list. The list can be empty.
179+
definition: If provided, the index will be created with this custom index
180+
definition.
181+
max_text_fields: If true, indexes will be encoded as if there were more than
182+
32 text fields, allowing for additional fields beyond 32.
183+
temporary: Creates a lightweight temporary index which will expire after the
184+
specified period of inactivity (in seconds). The internal idle timer is
185+
reset whenever the index is searched or added to.
186+
no_highlight: If true, disables highlighting support. Also implied by
187+
`no_term_offsets`.
188+
no_term_frequencies: If true, term frequencies will not be saved in the
189+
index.
190+
skip_initial_scan: If true, the initial scan and indexing will be skipped.
191191
192+
"""
192193
args = [CREATE_CMD, self.index_name]
193194
if definition is not None:
194195
args += definition.args

redis/commands/search/field.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,10 @@
44

55

66
class Field:
7+
"""
8+
A class representing a field in a document.
9+
"""
10+
711
NUMERIC = "NUMERIC"
812
TEXT = "TEXT"
913
WEIGHT = "WEIGHT"
@@ -14,15 +18,33 @@ class Field:
1418
NOINDEX = "NOINDEX"
1519
AS = "AS"
1620
GEOSHAPE = "GEOSHAPE"
21+
INDEX_MISSING = "INDEXMISSING"
22+
INDEX_EMPTY = "INDEXEMPTY"
1723

1824
def __init__(
1925
self,
2026
name: str,
2127
args: List[str] = None,
2228
sortable: bool = False,
2329
no_index: bool = False,
30+
index_missing: bool = False,
31+
index_empty: bool = False,
2432
as_name: str = None,
2533
):
34+
"""
35+
Create a new field object.
36+
37+
Args:
38+
name: The name of the field.
39+
args:
40+
sortable: If `True`, the field will be sortable.
41+
no_index: If `True`, the field will not be indexed.
42+
index_missing: If `True`, it will be possible to search for documents that
43+
have this field missing.
44+
index_empty: If `True`, it will be possible to search for documents that
45+
have this field empty.
46+
as_name: If provided, this alias will be used for the field.
47+
"""
2648
if args is None:
2749
args = []
2850
self.name = name
@@ -34,6 +56,10 @@ def __init__(
3456
self.args_suffix.append(Field.SORTABLE)
3557
if no_index:
3658
self.args_suffix.append(Field.NOINDEX)
59+
if index_missing:
60+
self.args_suffix.append(Field.INDEX_MISSING)
61+
if index_empty:
62+
self.args_suffix.append(Field.INDEX_EMPTY)
3763

3864
if no_index and not sortable:
3965
raise ValueError("Non-Sortable non-Indexable fields are ignored")

tests/test_search.py

Lines changed: 145 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2105,7 +2105,7 @@ def test_geo_params(client):
21052105
params_dict = {"lat": "34.95126", "lon": "29.69465", "radius": 1000, "units": "km"}
21062106
q = Query("@g:[$lon $lat $radius $units]").dialect(2)
21072107
res = client.ft().search(q, query_params=params_dict)
2108-
_assert_geosearch_result(client, res, ["doc1", "doc2", "doc3"])
2108+
_assert_search_result(client, res, ["doc1", "doc2", "doc3"])
21092109

21102110

21112111
@pytest.mark.redismod
@@ -2122,13 +2122,13 @@ def test_geoshapes_query_intersects_and_disjoint(client):
21222122
Query("@g:[intersects $shape]").dialect(3),
21232123
query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"},
21242124
)
2125-
_assert_geosearch_result(client, intersection, ["doc_point2", "doc_polygon1"])
2125+
_assert_search_result(client, intersection, ["doc_point2", "doc_polygon1"])
21262126

21272127
disjunction = client.ft().search(
21282128
Query("@g:[disjoint $shape]").dialect(3),
21292129
query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"},
21302130
)
2131-
_assert_geosearch_result(client, disjunction, ["doc_point1", "doc_polygon2"])
2131+
_assert_search_result(client, disjunction, ["doc_point1", "doc_polygon2"])
21322132

21332133

21342134
@pytest.mark.redismod
@@ -2146,19 +2146,19 @@ def test_geoshapes_query_contains_and_within(client):
21462146
Query("@g:[contains $shape]").dialect(3),
21472147
query_params={"shape": "POINT(25 25)"},
21482148
)
2149-
_assert_geosearch_result(client, contains_a, ["doc_polygon1"])
2149+
_assert_search_result(client, contains_a, ["doc_polygon1"])
21502150

21512151
contains_b = client.ft().search(
21522152
Query("@g:[contains $shape]").dialect(3),
21532153
query_params={"shape": "POLYGON((24 24, 24 26, 25 25, 24 24))"},
21542154
)
2155-
_assert_geosearch_result(client, contains_b, ["doc_polygon1"])
2155+
_assert_search_result(client, contains_b, ["doc_polygon1"])
21562156

21572157
within = client.ft().search(
21582158
Query("@g:[within $shape]").dialect(3),
21592159
query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"},
21602160
)
2161-
_assert_geosearch_result(client, within, ["doc_point2", "doc_polygon1"])
2161+
_assert_search_result(client, within, ["doc_point2", "doc_polygon1"])
21622162

21632163

21642164
@pytest.mark.redismod
@@ -2322,19 +2322,153 @@ def test_geoshape(client: redis.Redis):
23222322
q2 = Query("@geom:[CONTAINS $poly]").dialect(3)
23232323
qp2 = {"poly": "POLYGON((2 2, 2 50, 50 50, 50 2, 2 2))"}
23242324
result = client.ft().search(q1, query_params=qp1)
2325-
_assert_geosearch_result(client, result, ["small"])
2325+
_assert_search_result(client, result, ["small"])
23262326
result = client.ft().search(q2, query_params=qp2)
2327-
_assert_geosearch_result(client, result, ["small", "large"])
2327+
_assert_search_result(client, result, ["small", "large"])
23282328

23292329

2330-
def _assert_geosearch_result(client, result, expected_doc_ids):
2330+
@pytest.mark.redismod
2331+
def test_search_missing_fields(client):
2332+
definition = IndexDefinition(prefix=["property:"], index_type=IndexType.HASH)
2333+
2334+
fields = [
2335+
TextField("title", sortable=True),
2336+
TagField("features", index_missing=True),
2337+
TextField("description", index_missing=True),
2338+
]
2339+
2340+
client.ft().create_index(fields, definition=definition)
2341+
2342+
# All fields present
2343+
client.hset(
2344+
"property:1",
2345+
mapping={
2346+
"title": "Luxury Villa in Malibu",
2347+
"features": "pool,sea view,modern",
2348+
"description": "A stunning modern villa overlooking the Pacific Ocean.",
2349+
},
2350+
)
2351+
2352+
# Missing features
2353+
client.hset(
2354+
"property:2",
2355+
mapping={
2356+
"title": "Downtown Flat",
2357+
"description": "Modern flat in central Paris with easy access to metro.",
2358+
},
2359+
)
2360+
2361+
# Missing description
2362+
client.hset(
2363+
"property:3",
2364+
mapping={
2365+
"title": "Beachfront Bungalow",
2366+
"features": "beachfront,sun deck",
2367+
},
2368+
)
2369+
2370+
with pytest.raises(redis.exceptions.ResponseError) as e:
2371+
client.ft().search(
2372+
Query("ismissing(@title)").dialect(5).return_field("id").no_content()
2373+
)
2374+
assert "to be defined with 'INDEXMISSING'" in e.value.args[0]
2375+
2376+
res = client.ft().search(
2377+
Query("ismissing(@features)").dialect(5).return_field("id").no_content()
2378+
)
2379+
_assert_search_result(client, res, ["property:2"])
2380+
2381+
res = client.ft().search(
2382+
Query("-ismissing(@features)").dialect(5).return_field("id").no_content()
2383+
)
2384+
_assert_search_result(client, res, ["property:1", "property:3"])
2385+
2386+
res = client.ft().search(
2387+
Query("ismissing(@description)").dialect(5).return_field("id").no_content()
2388+
)
2389+
_assert_search_result(client, res, ["property:3"])
2390+
2391+
res = client.ft().search(
2392+
Query("-ismissing(@description)").dialect(5).return_field("id").no_content()
2393+
)
2394+
_assert_search_result(client, res, ["property:1", "property:2"])
2395+
2396+
2397+
@pytest.mark.redismod
2398+
def test_search_empty_fields(client):
2399+
definition = IndexDefinition(prefix=["property:"], index_type=IndexType.HASH)
2400+
2401+
fields = [
2402+
TextField("title", sortable=True),
2403+
TagField("features", index_empty=True),
2404+
TextField("description", index_empty=True),
2405+
]
2406+
2407+
client.ft().create_index(fields, definition=definition)
2408+
2409+
# All fields present
2410+
client.hset(
2411+
"property:1",
2412+
mapping={
2413+
"title": "Luxury Villa in Malibu",
2414+
"features": "pool,sea view,modern",
2415+
"description": "A stunning modern villa overlooking the Pacific Ocean.",
2416+
},
2417+
)
2418+
2419+
# Empty features
2420+
client.hset(
2421+
"property:2",
2422+
mapping={
2423+
"title": "Downtown Flat",
2424+
"features": "",
2425+
"description": "Modern flat in central Paris with easy access to metro.",
2426+
},
2427+
)
2428+
2429+
# Empty description
2430+
client.hset(
2431+
"property:3",
2432+
mapping={
2433+
"title": "Beachfront Bungalow",
2434+
"features": "beachfront,sun deck",
2435+
"description": "",
2436+
},
2437+
)
2438+
2439+
with pytest.raises(redis.exceptions.ResponseError) as e:
2440+
client.ft().search(
2441+
Query("@title:''").dialect(5).return_field("id").no_content()
2442+
)
2443+
assert "to be defined with `INDEXEMPTY`" in e.value.args[0]
2444+
2445+
res = client.ft().search(
2446+
Query("@features:{ }").dialect(5).return_field("id").no_content()
2447+
)
2448+
_assert_search_result(client, res, ["property:2"])
2449+
2450+
res = client.ft().search(
2451+
Query("-@features:{ }").dialect(5).return_field("id").no_content()
2452+
)
2453+
_assert_search_result(client, res, ["property:1", "property:3"])
2454+
2455+
res = client.ft().search(
2456+
Query("@description:''").dialect(5).return_field("id").no_content()
2457+
)
2458+
_assert_search_result(client, res, ["property:3"])
2459+
2460+
res = client.ft().search(
2461+
Query("-@description:''").dialect(5).return_field("id").no_content()
2462+
)
2463+
_assert_search_result(client, res, ["property:1", "property:2"])
2464+
2465+
2466+
def _assert_search_result(client, result, expected_doc_ids):
23312467
"""
23322468
Make sure the result of a search is as expected, taking into account the RESP
23332469
version being used.
23342470
"""
23352471
if is_resp2_connection(client):
23362472
assert set([doc.id for doc in result.docs]) == set(expected_doc_ids)
2337-
assert result.total == len(expected_doc_ids)
23382473
else:
23392474
assert set([doc["id"] for doc in result["results"]]) == set(expected_doc_ids)
2340-
assert result["total_results"] == len(expected_doc_ids)

0 commit comments

Comments
 (0)