Skip to content

Commit ef8274a

Browse files
authored
Support data source sync functionality (#13)
1 parent 2cfc3e7 commit ef8274a

File tree

7 files changed

+133
-25
lines changed

7 files changed

+133
-25
lines changed

apps/tasks.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ def add_data_entry_task(datasource: DataSource, datasource_entry_items: List[Dat
3030
)
3131
try:
3232
result = datasource_entry_handler.add_entry(datasource_entry_item)
33-
datasource_entry_object.config = result.config
33+
datasource_entry_config = result.config
34+
datasource_entry_config["input"] = datasource_entry_item.dict()
35+
datasource_entry_object.config = datasource_entry_config
3436
datasource_entry_object.size = result.size
3537
datasource_entry_object.status = DataSourceEntryStatus.READY
3638
datasource_entries_size += result.size
@@ -85,6 +87,30 @@ def delete_data_entry_task(datasource: DataSource, entry_data: DataSourceEntry):
8587
datasource.save()
8688
return datasource_entry_items
8789

90+
def resync_data_entry_task(datasource: DataSource, entry_data: DataSourceEntry):
91+
logger.info(f'Resyncing task for data_source_entry: %s' % str(entry_data))
92+
93+
datasource_entry_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
94+
datasource.type,
95+
)
96+
datasource_entry_handler: DataSourceProcessor = datasource_entry_handler_cls(
97+
datasource,
98+
)
99+
entry_data.status = DataSourceEntryStatus.PROCESSING
100+
entry_data.save()
101+
old_size = entry_data.size
102+
103+
result = datasource_entry_handler.resync_entry(entry_data.config)
104+
entry_data.size = result.size
105+
config_entry = result.config
106+
config_entry["input"] = entry_data.config["input"]
107+
entry_data.config = config_entry
108+
entry_data.status = DataSourceEntryStatus.READY
109+
entry_data.save()
110+
111+
datasource.size = datasource.size - old_size + result.size
112+
datasource.save()
113+
88114

89115
def delete_data_source_task(datasource):
90116
datasource_type = datasource.type

client/src/pages/data.jsx

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import { TextareaAutosize } from "@mui/base";
1515

1616
import DeleteOutlineOutlinedIcon from "@mui/icons-material/DeleteOutlineOutlined";
1717
import AddOutlinedIcon from "@mui/icons-material/AddOutlined";
18+
import SyncOutlinedIcon from "@mui/icons-material/SyncOutlined";
1819
import PeopleOutlineOutlinedIcon from "@mui/icons-material/PeopleOutlineOutlined";
1920
import PersonOutlineOutlinedIcon from "@mui/icons-material/PersonOutlineOutlined";
2021

@@ -313,6 +314,8 @@ export default function DataPage() {
313314
title: "Action",
314315
key: "operation",
315316
render: (record) => {
317+
const isAdhocSyncSupported = record?.sync_config;
318+
316319
return (
317320
<Box>
318321
<IconButton
@@ -330,8 +333,22 @@ export default function DataPage() {
330333
setDeleteConfirmationModalOpen(true);
331334
}}
332335
>
333-
<DeleteOutlineOutlinedIcon />
336+
<DeleteOutlineOutlinedIcon className="delete-dataentry-icon" />
334337
</IconButton>
338+
{isAdhocSyncSupported && (
339+
<IconButton
340+
onClick={() => {
341+
axios()
342+
.post(`/api/datasource_entries/${record.uuid}/resync`)
343+
.then((response) => {
344+
reloadDataSourceEntries();
345+
reloadDataSourceEntries();
346+
});
347+
}}
348+
>
349+
<SyncOutlinedIcon className="resync-dataentry-icon" />
350+
</IconButton>
351+
)}
335352
</Box>
336353
);
337354
},
@@ -348,8 +365,10 @@ export default function DataPage() {
348365
onRow={(record, rowIndex) => {
349366
return {
350367
onClick: (event) => {
351-
setDataSourceEntryData(record);
352-
setDataSourceEntryDrawerOpen(true);
368+
if (event.target.tagName === "TD") {
369+
setDataSourceEntryData(record);
370+
setDataSourceEntryDrawerOpen(true);
371+
}
353372
},
354373
};
355374
}}

datasources/apis.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from .serializers import DataSourceEntrySerializer
1414
from .serializers import DataSourceSerializer
1515
from .serializers import DataSourceTypeSerializer
16-
from apps.tasks import add_data_entry_task
16+
from apps.tasks import add_data_entry_task, resync_data_entry_task
1717
from apps.tasks import delete_data_entry_task
1818
from apps.tasks import delete_data_source_task
1919
from common.utils.utils import extract_urls_from_sitemap
@@ -87,7 +87,18 @@ def text_content(self, request, uid):
8787
)
8888
return DRFResponse({'content': content, 'metadata': metadata})
8989

90+
def resync(self, request, uid):
91+
datasource_entry_object = get_object_or_404(
92+
DataSourceEntry, uuid=uuid.UUID(uid),
93+
)
94+
if datasource_entry_object.datasource.owner != request.user:
95+
return DRFResponse(status=404)
9096

97+
resync_data_entry_task(
98+
datasource_entry_object.datasource, datasource_entry_object,
99+
)
100+
101+
return DRFResponse(status=202)
91102
class DataSourceViewSet(viewsets.ModelViewSet):
92103
queryset = DataSource.objects.all()
93104
serializer_class = DataSourceSerializer

datasources/handlers/datasource_type_interface.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from enum import Enum
12
import json
23
import logging
34
from string import Template
@@ -56,13 +57,23 @@ class DataSourceEntryItem(BaseModel):
5657
metadata: dict = {}
5758
data: Optional[dict] = None
5859

59-
60+
class DataSourceSyncType(str, Enum):
61+
FULL = 'full'
62+
INCREMENTAL = 'incremental'
63+
64+
class DataSourceSyncConfiguration(_Schema):
65+
sync_type: DataSourceSyncType = 'full'
66+
6067
class DataSourceProcessor(ProcessorInterface[BaseInputType, None, None]):
6168

6269
@classmethod
6370
def get_content_key(cls) -> str:
6471
datasource_type_interface = cls.__orig_bases__[0]
6572
return datasource_type_interface.__args__[0].get_content_key()
73+
74+
@classmethod
75+
def get_sync_configuration(cls) -> Optional[dict]:
76+
return None
6677

6778
@classmethod
6879
def get_weaviate_schema(cls, class_name: str) -> dict:
@@ -158,6 +169,7 @@ def _get_document_embeddings(self, text: str) -> OpenAIEmbeddingOutput:
158169
return None
159170

160171
def add_entry(self, data: dict) -> Optional[DataSourceEntryItem]:
172+
logger.info(f'Adding data_source_entry: {data}')
161173
documents = self.get_data_documents(data)
162174

163175
documents = map(
@@ -206,7 +218,13 @@ def delete_entry(self, data: dict) -> None:
206218
self.vectorstore._client.data_object.delete(
207219
document_id, self.datasource_class_name,
208220
)
209-
221+
222+
def resync_entry(self, data: dict) -> Optional[DataSourceEntryItem]:
223+
# Delete old data
224+
self.delete_entry(data)
225+
# Add new data
226+
return self.add_entry(DataSourceEntryItem(**data["input"]))
227+
210228
def delete_all_entries(self) -> None:
211229
self.vectorstore._client.schema.delete_class(
212230
self.datasource_class_name,

datasources/handlers/website/url.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import logging
2-
from typing import List
2+
from typing import Any, List
33
from typing import Optional
44

55
from pydantic import Field
@@ -9,7 +9,7 @@
99
from common.utils.text_extract import ExtraParams
1010
from common.utils.splitter import SpacyTextSplitter
1111
from common.utils.utils import extract_urls_from_sitemap
12-
from datasources.handlers.datasource_type_interface import DataSourceEntryItem
12+
from datasources.handlers.datasource_type_interface import DataSourceEntryItem, DataSourceSyncConfiguration, DataSourceSyncType
1313
from datasources.handlers.datasource_type_interface import DataSourceSchema
1414
from datasources.handlers.datasource_type_interface import DataSourceProcessor
1515
from datasources.handlers.datasource_type_interface import WEAVIATE_SCHEMA
@@ -54,6 +54,26 @@ def name() -> str:
5454
@staticmethod
5555
def slug() -> str:
5656
return 'url'
57+
58+
@classmethod
59+
def get_sync_configuration(cls) -> Optional[dict]:
60+
return DataSourceSyncConfiguration(sync_type=DataSourceSyncType.FULL).dict()
61+
62+
def get_url_data(self, url: str) -> Optional[DataSourceEntryItem]:
63+
if not url.startswith('https://') and not url.startswith('http://'):
64+
url = f'https://{url}'
65+
66+
text = extract_text_from_url(
67+
url, extra_params=ExtraParams(openai_key=self.openai_key),
68+
)
69+
docs = [
70+
Document(
71+
page_content_key=self.get_content_key(), page_content=t, metadata={
72+
'source': url,
73+
},
74+
) for t in SpacyTextSplitter(chunk_size=1500, length_func=len).split_text(text)
75+
]
76+
return docs
5777

5878
def validate_and_process(self, data: dict) -> List[DataSourceEntryItem]:
5979
entry = URLSchema(**data)
@@ -83,22 +103,11 @@ def validate_and_process(self, data: dict) -> List[DataSourceEntryItem]:
83103

84104
return list(map(lambda entry: DataSourceEntryItem(name=entry, data={'url': entry}), urls + sitemap_urls))
85105

106+
107+
86108
def get_data_documents(self, data: DataSourceEntryItem) -> Optional[DataSourceEntryItem]:
87109
url = data.data['url']
88-
if not url.startswith('https://') and not url.startswith('http://'):
89-
url = f'https://{url}'
90-
91-
text = extract_text_from_url(
92-
url, extra_params=ExtraParams(openai_key=self.openai_key),
93-
)
94-
docs = [
95-
Document(
96-
page_content_key=self.get_content_key(), page_content=t, metadata={
97-
'source': url,
98-
},
99-
) for t in SpacyTextSplitter(chunk_size=1500, length_func=len).split_text(text)
100-
]
101-
return docs
110+
return self.get_url_data(url)
102111

103112
def similarity_search(self, query: str, *args, **kwargs) -> List[dict]:
104113
return super().similarity_search(query, *args, **kwargs)

datasources/serializers.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
class DataSourceTypeSerializer(serializers.ModelSerializer):
1212
entry_config_schema = serializers.SerializerMethodField()
1313
entry_config_ui_schema = serializers.SerializerMethodField()
14+
sync_config = serializers.SerializerMethodField()
1415

1516
def get_entry_config_schema(self, obj):
1617
datasource_type_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
@@ -27,12 +28,20 @@ def get_entry_config_ui_schema(self, obj):
2728
if datasource_type_handler_cls is None:
2829
return {}
2930
return datasource_type_handler_cls.get_input_ui_schema()
31+
32+
def get_sync_config(self, obj):
33+
datasource_type_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
34+
obj,
35+
)
36+
if datasource_type_handler_cls is None:
37+
return None
38+
return datasource_type_handler_cls.get_sync_configuration()
3039

3140
class Meta:
3241
model = DataSourceType
3342
fields = [
3443
'id', 'name', 'description',
35-
'entry_config_schema', 'entry_config_ui_schema',
44+
'entry_config_schema', 'entry_config_ui_schema', 'sync_config'
3645
]
3746

3847

@@ -53,10 +62,22 @@ class Meta:
5362

5463
class DataSourceEntrySerializer(serializers.ModelSerializer):
5564
datasource = DataSourceSerializer()
65+
sync_config = serializers.SerializerMethodField()
66+
67+
def get_sync_config(self, obj):
68+
datasource_type_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
69+
obj.datasource.type,
70+
)
71+
if datasource_type_handler_cls is None:
72+
return None
73+
if "input" not in obj.config:
74+
return None
75+
76+
return datasource_type_handler_cls.get_sync_configuration()
5677

5778
class Meta:
5879
model = DataSourceEntry
5980
fields = [
6081
'uuid', 'datasource', 'config',
61-
'name', 'size', 'status', 'created_at', 'updated_at',
82+
'name', 'size', 'status', 'created_at', 'updated_at', 'sync_config'
6283
]

datasources/urls.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,8 @@
4848
'api/datasource_entries/<str:uid>/text_content',
4949
apis.DataSourceEntryViewSet.as_view({'get': 'text_content'}),
5050
),
51+
path(
52+
'api/datasource_entries/<str:uid>/resync',
53+
apis.DataSourceEntryViewSet.as_view({'post': 'resync'}),
54+
),
5155
]

0 commit comments

Comments
 (0)