Skip to content

Commit c0f6419

Browse files
authored
Merge pull request #2 from meilisearch/distinct
Add url_hash distinct
2 parents 2fe53e0 + 74129b0 commit c0f6419

File tree

3 files changed

+65
-53
lines changed

3 files changed

+65
-53
lines changed

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ pytest = "==3.10.0"
14 14
requests-iap = "==0.2.0"
15 15
python-helpscout-v2 = "==1.0.1"
16 16
meilisearch = "*"
17+
pyhash = "*"
17 18

18 19
[dev-packages]
19 20
pylint = "==2.3.1"

Pipfile.lock

Lines changed: 57 additions & 51 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scraper/src/meilisearch_helper.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
4 4
import time
5 5
import meilisearch
6 6
from builtins import range
7+
import pyhash
7 8

8 9
def clean_one_field(value):
9 10
if value is None:
@@ -31,13 +32,16 @@ def parse_record(record):
31 32
for k, v in record['hierarchy_radio'].items():
32 33
key = 'hierarchy_radio_' + k
33 34
new_hierarchy_radio = {**{key: v}, **new_hierarchy_radio}
35+
hasher = pyhash.fnv1_32()
36+
new_record_url_hash = {}
37+
new_record_url_hash['url_hash'] = hasher(record['url'])
34 38
del record['weight']
35 39
del record['hierarchy']
36 40
del record['hierarchy_radio']
37 41
del record['hierarchy_camel']
38 42
del record['hierarchy_radio_camel']
39 43
del record['content_camel']
40-
return {**record, **new_weight, **new_hierarchy, **new_hierarchy_radio}
44+
return {**record, **new_weight, **new_hierarchy, **new_hierarchy_radio, **new_record_url_hash}
41 45

42 46
class MeiliSearchHelper:
4347
"""MeiliSearchHelper"""
@@ -54,7 +58,7 @@ class MeiliSearchHelper:
54 58
"level",
55 59
"position"
56 60
],
57-
"distinctField": "url",
61+
"distinctField": "url_hash",
58 62
"rankingRules": {
59 63
"page_rank": "dsc",
60 64
"level": "dsc",
@@ -96,6 +100,7 @@ def update_schema_based_on(self, record):
96 100
base_schema = {
97 101
'anchor': ['displayed'],
98 102
'url': ['displayed'],
103+
'url_hash': ['displayed', 'ranked'],
99 104
'content': ['indexed', 'displayed'],
100 105
'objectID': ['identifier', 'indexed', 'displayed'],
101 106
'page_rank': ['indexed', 'ranked'],

0 commit comments

Comments
 (0)