Skip to content

Commit bed7bfc

Browse files
fix: adapt the Sentence Window Retriever tutorial to the new sentence-splitting behaviour (#372)
* adding fix
* adding fix
* adding fix
* install nltk
* add nltk dep to index.toml
* Trigger Build

---------

Co-authored-by: anakin87 <[email protected]>
1 parent 0745bee commit bed7bfc

File tree

2 files changed

+31
-25
lines changed

2 files changed

+31
-25
lines changed

index.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,7 @@ aliases = []
464464
completion_time = "10 min"
465465
created_at = 2024-10-16
466466
haystack_2 = true
467-
dependencies = []
467+
dependencies = ["nltk"]
468468
featured = true
469469

470470
[[tutorial]]

tutorials/42_Sentence_Window_Retriever.ipynb

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
"%%bash\n",
5656
"\n",
5757
"pip install --upgrade pip\n",
58-
"pip install haystack-ai"
58+
"pip install haystack-ai nltk"
5959
]
6060
},
6161
{
@@ -98,17 +98,20 @@
9898
"source": [
9999
"from haystack import Document\n",
100100
"from haystack.components.preprocessors import DocumentSplitter\n",
101-
"splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by=\"sentence\")\n",
102-
"\n",
103-
"text = (\"Paul fell asleep to dream of an Arrakeen cavern, silent people all around him moving in the dim light \"\n",
104-
" \"of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the \"\n",
105-
" \"drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon \"\n",
106-
" \"awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel \"\n",
107-
" \"himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or \"\n",
108-
" \"companions his own age, perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had \"\n",
109-
" \"hinted that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered \"\n",
110-
" \"people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people \"\n",
111-
" \"called Fremen, marked down on no census of the Imperial Regate.\")\n",
101+
"\n",
102+
"splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by=\"period\")\n",
103+
"\n",
104+
"text = (\n",
105+
" \"Paul fell asleep to dream of an Arrakeen cavern, silent people all around him moving in the dim light \"\n",
106+
" \"of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the \"\n",
107+
" \"drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon \"\n",
108+
" \"awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel \"\n",
109+
" \"himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or \"\n",
110+
" \"companions his own age, perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had \"\n",
111+
" \"hinted that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered \"\n",
112+
" \"people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people \"\n",
113+
" \"called Fremen, marked down on no census of the Imperial Regate.\"\n",
114+
")\n",
112115
"\n",
113116
"doc = Document(content=text)\n",
114117
"docs = splitter.run([doc])"
@@ -144,7 +147,7 @@
144147
"from haystack.document_stores.types import DuplicatePolicy\n",
145148
"\n",
146149
"doc_store = InMemoryDocumentStore()\n",
147-
"doc_store.write_documents(docs['documents'], policy=DuplicatePolicy.OVERWRITE)"
150+
"doc_store.write_documents(docs[\"documents\"], policy=DuplicatePolicy.OVERWRITE)"
148151
]
149152
},
150153
{
@@ -167,7 +170,7 @@
167170
"from haystack.components.retrievers import SentenceWindowRetriever\n",
168171
"\n",
169172
"retriever = SentenceWindowRetriever(document_store=doc_store, window_size=2)\n",
170-
"result = retriever.run(retrieved_documents=[docs['documents'][4]])"
173+
"result = retriever.run(retrieved_documents=[docs[\"documents\"][4]])"
171174
]
172175
},
173176
{
@@ -199,7 +202,7 @@
199202
}
200203
],
201204
"source": [
202-
"result['context_windows']"
205+
"result[\"context_windows\"]"
203206
]
204207
},
205208
{
@@ -224,7 +227,7 @@
224227
}
225228
],
226229
"source": [
227-
"result['context_documents']"
230+
"result[\"context_documents\"]"
228231
]
229232
},
230233
{
@@ -259,6 +262,7 @@
259262
"import csv\n",
260263
"from haystack import Document\n",
261264
"\n",
265+
"\n",
262266
"def read_documents(file: str) -> List[Document]:\n",
263267
" with open(file, \"r\") as file:\n",
264268
" reader = csv.reader(file, delimiter=\"\\t\")\n",
@@ -283,11 +287,11 @@
283287
"from pathlib import Path\n",
284288
"import requests\n",
285289
"\n",
286-
"doc = requests.get('https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv')\n",
290+
"doc = requests.get(\"https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv\")\n",
287291
"\n",
288-
"datafolder = Path('data')\n",
292+
"datafolder = Path(\"data\")\n",
289293
"datafolder.mkdir(exist_ok=True)\n",
290-
"with open(datafolder/'bbc-news-data.csv', 'wb') as f:\n",
294+
"with open(datafolder / \"bbc-news-data.csv\", \"wb\") as f:\n",
291295
" for chunk in doc.iter_content(512):\n",
292296
" f.write(chunk)"
293297
]
@@ -356,7 +360,7 @@
356360
"\n",
357361
"indexing_pipeline.connect(\"splitter\", \"writer\")\n",
358362
"\n",
359-
"indexing_pipeline.run({\"documents\":docs})"
363+
"indexing_pipeline.run({\"documents\": docs})"
360364
]
361365
},
362366
{
@@ -421,7 +425,9 @@
421425
"metadata": {},
422426
"outputs": [],
423427
"source": [
424-
"result = sentence_window_pipeline.run(data={'bm25_retriever': {'query': \"phishing attacks\", \"top_k\": 1}}, include_outputs_from={'bm25_retriever'})"
428+
"result = sentence_window_pipeline.run(\n",
429+
" data={\"bm25_retriever\": {\"query\": \"phishing attacks\", \"top_k\": 1}}, include_outputs_from={\"bm25_retriever\"}\n",
430+
")"
425431
]
426432
},
427433
{
@@ -450,7 +456,7 @@
450456
}
451457
],
452458
"source": [
453-
"result['bm25_retriever']['documents']"
459+
"result[\"bm25_retriever\"][\"documents\"]"
454460
]
455461
},
456462
{
@@ -479,7 +485,7 @@
479485
}
480486
],
481487
"source": [
482-
"result['sentence_window__retriever']['context_windows']"
488+
"result[\"sentence_window__retriever\"][\"context_windows\"]"
483489
]
484490
},
485491
{
@@ -512,7 +518,7 @@
512518
}
513519
],
514520
"source": [
515-
"result['sentence_window__retriever']['context_documents']"
521+
"result[\"sentence_window__retriever\"][\"context_documents\"]"
516522
]
517523
},
518524
{

0 commit comments

Comments
 (0)