Skip to content

New format handling CSV NDJSON #329

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
500 changes: 500 additions & 0 deletions datasets/songs.csv

Large diffs are not rendered by default.

225 changes: 225 additions & 0 deletions datasets/songs.ndjson

Large diffs are not rendered by default.

34 changes: 23 additions & 11 deletions meilisearch/_httprequests.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,33 @@ def __init__(self, config: Config) -> None:
self.config = config
self.headers = {
'X-Meili-Api-Key': self.config.api_key,
'Content-Type': 'application/json'
}

def send_request(
self,
http_method: Callable,
path: str,
body: Optional[Union[Dict[str, Any], List[Dict[str, Any]], List[str]]] = None,
body: Optional[Union[Dict[str, Any], List[Dict[str, Any]], List[str], str]] = None,
content_type: Optional[str] = None,
) -> Any:
if content_type:
self.headers['Content-Type'] = content_type
try:
request_path = self.config.url + '/' + path
request = http_method(
request_path,
timeout=self.config.timeout,
headers=self.headers,
data=json.dumps(body) if body else "null"
)
if isinstance(body, bytes):
request = http_method(
request_path,
timeout=self.config.timeout,
headers=self.headers,
data=body
)
else:
request = http_method(
request_path,
timeout=self.config.timeout,
headers=self.headers,
data=json.dumps(body) if body else "null"
)
return self.__validate(request)

except requests.exceptions.Timeout as err:
Expand All @@ -45,16 +55,18 @@ def get(
# Issue a POST request to `path` by delegating to send_request with requests.post.
# NOTE(review): this span is rendered diff output, so two versions of the `body`
# parameter and of the return statement appear back to back; presumably the
# second line of each pair is the post-change version -- confirm against the file.
def post(
self,
path: str,
body: Optional[Union[Dict[str, Any], List[Dict[str, Any]], List[str]]] = None,
body: Optional[Union[Dict[str, Any], List[Dict[str, Any]], List[str], str]] = None,
# Defaults to JSON; forwarded unchanged to send_request.
content_type: Optional[str] = 'application/json',
) -> Any:
return self.send_request(requests.post, path, body)
return self.send_request(requests.post, path, body, content_type)

# Issue a PUT request to `path` by delegating to send_request with requests.put.
# NOTE(review): this span is rendered diff output, so two versions of the return
# statement appear back to back; presumably the second one (forwarding
# content_type) is the post-change version -- confirm against the file.
def put(
self,
path: str,
# NOTE(review): unlike post, this annotation has no `str` member -- confirm whether that is intended.
body: Optional[Union[Dict[str, Any], List[Dict[str, Any]], List[str]]] = None,
# Defaults to JSON; forwarded unchanged to send_request.
content_type: Optional[str] = 'application/json',
) -> Any:
return self.send_request(requests.put, path, body)
return self.send_request(requests.put, path, body, content_type)

def delete(
self,
Expand Down
134 changes: 123 additions & 11 deletions meilisearch/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,6 @@ def get_documents(self, parameters: Optional[Dict[str, Any]] = None) -> List[Dic
"""
if parameters is None:
parameters = {}

return self.http.get(
f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{urllib.parse.urlencode(parameters)}'
)
Expand Down Expand Up @@ -346,11 +345,7 @@ def add_documents(
MeiliSearchApiError
An error containing details about why MeiliSearch can't process your request. MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
"""
if primary_key is None:
url = f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}'
else:
primary_key = urllib.parse.urlencode({'primaryKey': primary_key})
url = f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{primary_key}'
url = self._build_url(primary_key)
return self.http.post(url, documents)

def add_documents_in_batches(
Expand Down Expand Up @@ -391,6 +386,118 @@ def add_documents_in_batches(

return update_ids

def add_documents_json(
    self,
    str_documents: str,
    primary_key: Optional[str] = None,
) -> Dict[str, int]:
    """Add documents read from a JSON file to the index.

    Thin wrapper that forwards to `add_documents_raw` with the content type
    fixed to 'application/json'.

    Parameters
    ----------
    str_documents:
        Content of a JSON file, forwarded unchanged.
    primary_key (optional):
        The primary-key used in index. Ignored if already set up.

    Returns
    -------
    update:
        Dictionary containing an update id to track the action:
        https://docs.meilisearch.com/reference/api/updates.html#get-an-update-status

    Raises
    ------
    MeiliSearchApiError
        An error containing details about why MeiliSearch can't process your request. MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
    """
    json_content_type = 'application/json'
    update = self.add_documents_raw(str_documents, primary_key, json_content_type)
    return update

def add_documents_csv(
    self,
    str_documents: str,
    primary_key: Optional[str] = None,
) -> Dict[str, int]:
    """Add documents read from a CSV file to the index.

    Thin wrapper that forwards to `add_documents_raw` with the content type
    fixed to 'text/csv'.

    Parameters
    ----------
    str_documents:
        Content of a CSV file, forwarded unchanged.
    primary_key (optional):
        The primary-key used in index. Ignored if already set up.

    Returns
    -------
    update:
        Dictionary containing an update id to track the action:
        https://docs.meilisearch.com/reference/api/updates.html#get-an-update-status

    Raises
    ------
    MeiliSearchApiError
        An error containing details about why MeiliSearch can't process your request. MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
    """
    csv_content_type = 'text/csv'
    update = self.add_documents_raw(str_documents, primary_key, csv_content_type)
    return update

def add_documents_ndjson(
    self,
    str_documents: str,
    primary_key: Optional[str] = None,
) -> Dict[str, int]:
    """Add documents read from an NDJSON file to the index.

    Thin wrapper that forwards to `add_documents_raw` with the content type
    fixed to 'application/x-ndjson'.

    Parameters
    ----------
    str_documents:
        Content of an NDJSON file, forwarded unchanged.
    primary_key (optional):
        The primary-key used in index. Ignored if already set up.

    Returns
    -------
    update:
        Dictionary containing an update id to track the action:
        https://docs.meilisearch.com/reference/api/updates.html#get-an-update-status

    Raises
    ------
    MeiliSearchApiError
        An error containing details about why MeiliSearch can't process your request. MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
    """
    ndjson_content_type = 'application/x-ndjson'
    update = self.add_documents_raw(str_documents, primary_key, ndjson_content_type)
    return update

def add_documents_raw(
    self,
    str_documents: Union[str, bytes],
    primary_key: Optional[str] = None,
    content_type: Optional[str] = None,
) -> Dict[str, int]:
    """Add an already-serialised documents payload to the index.

    Parameters
    ----------
    str_documents:
        The serialised documents, either as text or as bytes. Text is
        encoded to UTF-8 bytes before being posted; bytes are posted as-is.
    primary_key (optional):
        The primary-key used in index. Ignored if already set up.
    content_type (optional):
        The MIME type of the payload, forwarded to the HTTP layer, e.g.
        'application/json', 'text/csv' or 'application/x-ndjson'.

    Returns
    -------
    update:
        Dictionary containing an update id to track the action:
        https://docs.meilisearch.com/reference/api/updates.html#get-an-update-status

    Raises
    ------
    MeiliSearchApiError
        An error containing details about why MeiliSearch can't process your request. MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
    """
    if isinstance(str_documents, str):
        # The HTTP layer only sends `bytes` bodies untouched; any other body is
        # passed through json.dumps, which would wrap CSV/NDJSON text in a JSON
        # string literal instead of sending the raw payload.
        str_documents = str_documents.encode('utf-8')
    url = self._build_url(primary_key)
    return self.http.post(url, str_documents, content_type)

def update_documents(
self,
documents: List[Dict[str, Any]],
Expand All @@ -416,11 +523,7 @@ def update_documents(
MeiliSearchApiError
An error containing details about why MeiliSearch can't process your request. MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
"""
if primary_key is None:
url = f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}'
else:
primary_key = urllib.parse.urlencode({'primaryKey': primary_key})
url = f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{primary_key}'
url = self._build_url(primary_key)
return self.http.put(url, documents)

def update_documents_in_batches(
Expand Down Expand Up @@ -1134,3 +1237,12 @@ def _iso_to_date_time(iso_date: Optional[Union[datetime, str]]) -> Optional[date

def __settings_url_for(self, sub_route: str) -> str:
    """Return the settings route of this index for the given `sub_route`."""
    route_parts = (
        f'{self.config.paths.index}',
        f'{self.uid}',
        f'{self.config.paths.setting}',
        f'{sub_route}',
    )
    return '/'.join(route_parts)

def _build_url(
    self,
    primary_key: Optional[str] = None,
) -> str:
    """Return the documents route of this index.

    When `primary_key` is not None it is URL-encoded and appended as a
    `primaryKey` query parameter.
    """
    if primary_key is None:
        query_suffix = ''
    else:
        query_suffix = '?' + urllib.parse.urlencode({'primaryKey': primary_key})
    return f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}{query_suffix}'
26 changes: 25 additions & 1 deletion meilisearch/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,33 @@ def small_movies():
"""
Runs once per session. Provides the content of small_movies.json.
"""
with open('./datasets/small_movies.json', 'r', encoding="utf8") as movie_file:
with open('./datasets/small_movies.json', 'r', encoding='utf-8') as movie_file:
yield json.loads(movie_file.read())

@fixture(scope='session')
def small_movies_json_file():
    """
    Runs once per session. Provides the content of small_movies.json as UTF-8 encoded bytes.
    """
    with open('./datasets/small_movies.json', 'r', encoding='utf-8') as dataset:
        text = dataset.read()
    return text.encode('utf-8')

@fixture(scope='session')
def songs_csv():
    """
    Runs once per session. Provides the content of songs.csv as UTF-8 encoded bytes.
    """
    with open('./datasets/songs.csv', 'r', encoding='utf-8') as dataset:
        text = dataset.read()
    return text.encode('utf-8')

@fixture(scope='session')
def songs_ndjson():
    """
    Runs once per session. Provides the content of songs.ndjson as UTF-8 encoded bytes.
    """
    with open('./datasets/songs.ndjson', 'r', encoding='utf-8') as dataset:
        text = dataset.read()
    return text.encode('utf-8')

@fixture(scope='function')
def empty_index(client):
def index_maker(index_name=common.INDEX_UID):
Expand Down
33 changes: 33 additions & 0 deletions meilisearch/tests/index/test_index_document_meilisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,36 @@ def test_delete_all_documents(index_with_documents):
response = index.get_documents()
assert isinstance(response, list)
assert response == []

def test_add_documents_csv(empty_index, songs_csv):
    """Tests adding documents from a CSV payload to a clean index."""
    csv_index = empty_index()
    enqueued = csv_index.add_documents_csv(songs_csv)
    assert isinstance(enqueued, dict)
    assert 'updateId' in enqueued
    outcome = csv_index.wait_for_pending_update(enqueued['updateId'])
    assert outcome['status'] == 'processed'
    assert outcome['type']['number'] != 0
    assert csv_index.get_primary_key() == 'id'

def test_add_documents_json(empty_index, small_movies_json_file):
    """Tests adding documents from a JSON payload to a clean index."""
    json_index = empty_index()
    enqueued = json_index.add_documents_json(small_movies_json_file)
    assert isinstance(enqueued, dict)
    assert 'updateId' in enqueued
    outcome = json_index.wait_for_pending_update(enqueued['updateId'])
    assert outcome['status'] == 'processed'
    assert outcome['type']['number'] != 0
    assert json_index.get_primary_key() == 'id'

def test_add_documents_ndjson(empty_index, songs_ndjson):
    """Tests adding documents from an NDJSON payload to a clean index."""
    ndjson_index = empty_index()
    enqueued = ndjson_index.add_documents_ndjson(songs_ndjson)
    assert isinstance(enqueued, dict)
    assert 'updateId' in enqueued
    outcome = ndjson_index.wait_for_pending_update(enqueued['updateId'])
    assert outcome['status'] == 'processed'
    assert outcome['type']['number'] != 0
    assert ndjson_index.get_primary_key() == 'id'