Allow passing a custom serializer for documents #974

Merged
merged 4 commits on Jun 10, 2024

Changes from all commits

29 changes: 28 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -169,6 +169,34 @@ index.update_filterable_attributes([
])
```

#### Custom Serializer for documents <!-- omit in toc -->

If your documents contain fields that the Python JSON serializer does not know how to handle, you
can use your own custom serializer.

```py
from datetime import datetime
from json import JSONEncoder
from uuid import UUID, uuid4


class CustomEncoder(JSONEncoder):
    def default(self, o):
        if isinstance(o, (UUID, datetime)):
            return str(o)

        # Let the base class default method raise the TypeError
        return super().default(o)


documents = [
    {"id": uuid4(), "title": "test 1", "when": datetime.now()},
    {"id": uuid4(), "title": "Test 2", "when": datetime.now()},
]
index = client.index("movies")  # assumes the `client` created earlier in this README
index.add_documents(documents, serializer=CustomEncoder)
```
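
The same `serializer` keyword is accepted by the other document methods updated in this PR, such as `update_documents` and `add_documents_in_batches`. A short sketch reusing `CustomEncoder`, `index`, and `documents` from the example above (the document values are illustrative):

```py
# Update the first document, still serializing UUID and datetime fields.
update = [{"id": documents[0]["id"], "title": "test 1 (updated)", "when": datetime.now()}]
index.update_documents(update, serializer=CustomEncoder)

# The batched helpers forward the serializer to every underlying request.
index.add_documents_in_batches(documents, batch_size=500, serializer=CustomEncoder)
```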

You only need to perform this operation once.

Note that Meilisearch will rebuild your index whenever you update `filterableAttributes`. Depending on the size of your dataset, this might take time. You can track the process using the [task](https://www.meilisearch.com/docs/reference/api/tasks#get-tasks).
@@ -205,7 +233,6 @@ index.search(

This package guarantees compatibility with [version v1.x of Meilisearch](https://github.com/meilisearch/meilisearch/releases/latest), but some features may not be present. Please check the [issues](https://github.com/meilisearch/meilisearch-python/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22+label%3Aenhancement) for more info.


## 💡 Learn more

The following sections in our main documentation website may interest you:
19 changes: 12 additions & 7 deletions meilisearch/_httprequests.py
@@ -2,7 +2,7 @@

import json
from functools import lru_cache
from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union
from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Type, Union

import requests

@@ -39,6 +39,8 @@ def send_request(
]
] = None,
content_type: Optional[str] = None,
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
if content_type:
self.headers["Content-Type"] = content_type
@@ -58,11 +60,10 @@ def send_request(
data=body,
)
else:
data = json.dumps(body, cls=serializer) if body else "" if body == "" else "null"

request = http_method(
request_path,
timeout=self.config.timeout,
headers=self.headers,
data=json.dumps(body) if body else "" if body == "" else "null",
request_path, timeout=self.config.timeout, headers=self.headers, data=data
)
return self.__validate(request)

@@ -81,8 +82,10 @@ def post(
Union[Mapping[str, Any], Sequence[Mapping[str, Any]], List[str], str]
] = None,
content_type: Optional[str] = "application/json",
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
return self.send_request(requests.post, path, body, content_type)
return self.send_request(requests.post, path, body, content_type, serializer=serializer)

def patch(
self,
@@ -108,8 +111,10 @@ def put(
]
] = None,
content_type: Optional[str] = "application/json",
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
return self.send_request(requests.put, path, body, content_type)
return self.send_request(requests.put, path, body, content_type, serializer=serializer)

def delete(
self,
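For context on the `_httprequests.py` change above: `json.dumps` rejects types such as `UUID` and `datetime` unless a custom encoder is supplied via `cls=`, which is what `send_request` now does when a `serializer` is passed through. A minimal, self-contained sketch (the encoder and sample body are illustrative, not part of the diff):

```py
import json
from datetime import datetime
from json import JSONEncoder
from uuid import UUID, uuid4


class CustomEncoder(JSONEncoder):
    def default(self, o):
        # Convert the types the stock encoder cannot handle to strings.
        if isinstance(o, (UUID, datetime)):
            return str(o)
        return super().default(o)


body = {"id": uuid4(), "when": datetime.now()}

try:
    json.dumps(body)  # raises: Object of type UUID is not JSON serializable
except TypeError as err:
    print(err)

# With cls=, unknown types are routed through CustomEncoder.default, which is
# the effect of json.dumps(body, cls=serializer) in send_request.
print(json.dumps(body, cls=CustomEncoder))
```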
80 changes: 70 additions & 10 deletions meilisearch/index.py
@@ -1,7 +1,19 @@
from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, Generator, List, Mapping, MutableMapping, Optional, Sequence, Union
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
Type,
Union,
)
from urllib import parse
from warnings import warn

@@ -26,6 +38,9 @@
from meilisearch.models.task import Task, TaskInfo, TaskResults
from meilisearch.task import TaskHandler

if TYPE_CHECKING:
from json import JSONEncoder


# pylint: disable=too-many-public-methods, too-many-lines
class Index:
@@ -403,6 +418,8 @@ def add_documents(
self,
documents: Sequence[Mapping[str, Any]],
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add documents to the index.

@@ -412,6 +429,9 @@ def add_documents(
List of documents. Each document should be a dictionary.
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.

Returns
-------
@@ -425,14 +445,16 @@ def add_documents(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key)
add_document_task = self.http.post(url, documents)
add_document_task = self.http.post(url, documents, serializer=serializer)
return TaskInfo(**add_document_task)

def add_documents_in_batches(
self,
documents: Sequence[Mapping[str, Any]],
batch_size: int = 1000,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> List[TaskInfo]:
"""Add documents to the index in batches.

@@ -444,6 +466,9 @@ def add_documents_in_batches(
The number of documents that should be included in each batch. Default = 1000
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.

Returns
-------
@@ -461,7 +486,7 @@ def add_documents_in_batches(
tasks: List[TaskInfo] = []

for document_batch in self._batch(documents, batch_size):
task = self.add_documents(document_batch, primary_key)
task = self.add_documents(document_batch, primary_key, serializer=serializer)
tasks.append(task)

return tasks
@@ -470,6 +495,8 @@ def add_documents_json(
self,
str_documents: str,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add string documents from JSON file to the index.

@@ -479,6 +506,9 @@ def add_documents_json(
String of document from a JSON file.
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.

Returns
-------
@@ -491,7 +521,9 @@ def add_documents_json(
MeilisearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
return self.add_documents_raw(str_documents, primary_key, "application/json")
return self.add_documents_raw(
str_documents, primary_key, "application/json", serializer=serializer
)

def add_documents_csv(
self,
@@ -556,6 +588,8 @@ def add_documents_raw(
primary_key: Optional[str] = None,
content_type: Optional[str] = None,
csv_delimiter: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add string documents to the index.

@@ -570,6 +604,9 @@ def add_documents_raw(
csv_delimiter:
One ASCII character used to customize the delimiter for CSV.
Note: The csv delimiter can only be used with the Content-Type text/csv.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.

Returns
-------
@@ -583,11 +620,15 @@ def add_documents_raw(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key=primary_key, csv_delimiter=csv_delimiter)
response = self.http.post(url, str_documents, content_type)
response = self.http.post(url, str_documents, content_type, serializer=serializer)
return TaskInfo(**response)

def update_documents(
self, documents: Sequence[Mapping[str, Any]], primary_key: Optional[str] = None
self,
documents: Sequence[Mapping[str, Any]],
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents in the index.

@@ -597,6 +638,9 @@ def update_documents(
List of documents. Each document should be a dictionary.
primary_key (optional):
The primary-key used in index. Ignored if already set up
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.

Returns
-------
@@ -610,7 +654,7 @@ def update_documents(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key)
response = self.http.put(url, documents)
response = self.http.put(url, documents, serializer=serializer)
return TaskInfo(**response)

def update_documents_ndjson(
@@ -644,6 +688,8 @@ def update_documents_json(
self,
str_documents: str,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents as a json string in the index.

@@ -653,6 +699,9 @@ def update_documents_json(
String of document from a JSON file.
primary_key (optional):
The primary-key used in index. Ignored if already set up
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.

Returns
-------
@@ -665,7 +714,9 @@ def update_documents_json(
MeilisearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
return self.update_documents_raw(str_documents, primary_key, "application/json")
return self.update_documents_raw(
str_documents, primary_key, "application/json", serializer=serializer
)

def update_documents_csv(
self,
Expand Down Expand Up @@ -703,6 +754,8 @@ def update_documents_raw(
primary_key: Optional[str] = None,
content_type: Optional[str] = None,
csv_delimiter: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents as a string in the index.

@@ -717,6 +770,9 @@ def update_documents_raw(
csv_delimiter:
One ASCII character used to customize the delimiter for CSV.
Note: The csv delimiter can only be used with the Content-Type text/csv.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.

Returns
-------
@@ -730,14 +786,15 @@ def update_documents_raw(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key=primary_key, csv_delimiter=csv_delimiter)
response = self.http.put(url, str_documents, content_type)
response = self.http.put(url, str_documents, content_type, serializer=serializer)
return TaskInfo(**response)

def update_documents_in_batches(
self,
documents: Sequence[Mapping[str, Any]],
batch_size: int = 1000,
primary_key: Optional[str] = None,
serializer: Optional[Type[JSONEncoder]] = None,
) -> List[TaskInfo]:
"""Update documents to the index in batches.

@@ -749,6 +806,9 @@ def update_documents_in_batches(
The number of documents that should be included in each batch. Default = 1000
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.

Returns
-------
@@ -766,7 +826,7 @@ def update_documents_in_batches(
tasks = []

for document_batch in self._batch(documents, batch_size):
update_task = self.update_documents(document_batch, primary_key)
update_task = self.update_documents(document_batch, primary_key, serializer=serializer)
tasks.append(update_task)

return tasks