|
55 | 55 | "%%bash\n",
|
56 | 56 | "\n",
|
57 | 57 | "pip install --upgrade pip\n",
|
58 |
| - "pip install haystack-ai" |
| 58 | + "pip install haystack-ai nltk" |
59 | 59 | ]
|
60 | 60 | },
|
61 | 61 | {
|
|
98 | 98 | "source": [
|
99 | 99 | "from haystack import Document\n",
|
100 | 100 | "from haystack.components.preprocessors import DocumentSplitter\n",
|
101 |
| - "splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by=\"sentence\")\n", |
102 |
| - "\n", |
103 |
| - "text = (\"Paul fell asleep to dream of an Arrakeen cavern, silent people all around him moving in the dim light \"\n", |
104 |
| - " \"of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the \"\n", |
105 |
| - " \"drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon \"\n", |
106 |
| - " \"awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel \"\n", |
107 |
| - " \"himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or \"\n", |
108 |
| - " \"companions his own age, perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had \"\n", |
109 |
| - " \"hinted that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered \"\n", |
110 |
| - " \"people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people \"\n", |
111 |
| - " \"called Fremen, marked down on no census of the Imperial Regate.\")\n", |
| 101 | + "\n", |
| 102 | + "splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by=\"period\")\n", |
| 103 | + "\n", |
| 104 | + "text = (\n", |
| 105 | + " \"Paul fell asleep to dream of an Arrakeen cavern, silent people all around him moving in the dim light \"\n", |
| 106 | + " \"of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the \"\n", |
| 107 | + " \"drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon \"\n", |
| 108 | + " \"awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel \"\n", |
| 109 | + " \"himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or \"\n", |
| 110 | + " \"companions his own age, perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had \"\n", |
| 111 | + " \"hinted that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered \"\n", |
| 112 | + " \"people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people \"\n", |
| 113 | + " \"called Fremen, marked down on no census of the Imperial Regate.\"\n", |
| 114 | + ")\n", |
112 | 115 | "\n",
|
113 | 116 | "doc = Document(content=text)\n",
|
114 | 117 | "docs = splitter.run([doc])"
|
|
144 | 147 | "from haystack.document_stores.types import DuplicatePolicy\n",
|
145 | 148 | "\n",
|
146 | 149 | "doc_store = InMemoryDocumentStore()\n",
|
147 |
| - "doc_store.write_documents(docs['documents'], policy=DuplicatePolicy.OVERWRITE)" |
| 150 | + "doc_store.write_documents(docs[\"documents\"], policy=DuplicatePolicy.OVERWRITE)" |
148 | 151 | ]
|
149 | 152 | },
|
150 | 153 | {
|
|
167 | 170 | "from haystack.components.retrievers import SentenceWindowRetriever\n",
|
168 | 171 | "\n",
|
169 | 172 | "retriever = SentenceWindowRetriever(document_store=doc_store, window_size=2)\n",
|
170 |
| - "result = retriever.run(retrieved_documents=[docs['documents'][4]])" |
| 173 | + "result = retriever.run(retrieved_documents=[docs[\"documents\"][4]])" |
171 | 174 | ]
|
172 | 175 | },
|
173 | 176 | {
|
|
199 | 202 | }
|
200 | 203 | ],
|
201 | 204 | "source": [
|
202 |
| - "result['context_windows']" |
| 205 | + "result[\"context_windows\"]" |
203 | 206 | ]
|
204 | 207 | },
|
205 | 208 | {
|
|
224 | 227 | }
|
225 | 228 | ],
|
226 | 229 | "source": [
|
227 |
| - "result['context_documents']" |
| 230 | + "result[\"context_documents\"]" |
228 | 231 | ]
|
229 | 232 | },
|
230 | 233 | {
|
|
259 | 262 | "import csv\n",
|
260 | 263 | "from haystack import Document\n",
|
261 | 264 | "\n",
|
| 265 | + "\n", |
262 | 266 | "def read_documents(file: str) -> List[Document]:\n",
|
263 | 267 | " with open(file, \"r\") as file:\n",
|
264 | 268 | " reader = csv.reader(file, delimiter=\"\\t\")\n",
|
|
283 | 287 | "from pathlib import Path\n",
|
284 | 288 | "import requests\n",
|
285 | 289 | "\n",
|
286 |
| - "doc = requests.get('https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv')\n", |
| 290 | + "doc = requests.get(\"https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv\")\n", |
287 | 291 | "\n",
|
288 |
| - "datafolder = Path('data')\n", |
| 292 | + "datafolder = Path(\"data\")\n", |
289 | 293 | "datafolder.mkdir(exist_ok=True)\n",
|
290 |
| - "with open(datafolder/'bbc-news-data.csv', 'wb') as f:\n", |
| 294 | + "with open(datafolder / \"bbc-news-data.csv\", \"wb\") as f:\n", |
291 | 295 | " for chunk in doc.iter_content(512):\n",
|
292 | 296 | " f.write(chunk)"
|
293 | 297 | ]
|
|
356 | 360 | "\n",
|
357 | 361 | "indexing_pipeline.connect(\"splitter\", \"writer\")\n",
|
358 | 362 | "\n",
|
359 |
| - "indexing_pipeline.run({\"documents\":docs})" |
| 363 | + "indexing_pipeline.run({\"documents\": docs})" |
360 | 364 | ]
|
361 | 365 | },
|
362 | 366 | {
|
|
421 | 425 | "metadata": {},
|
422 | 426 | "outputs": [],
|
423 | 427 | "source": [
|
424 |
| - "result = sentence_window_pipeline.run(data={'bm25_retriever': {'query': \"phishing attacks\", \"top_k\": 1}}, include_outputs_from={'bm25_retriever'})" |
| 428 | + "result = sentence_window_pipeline.run(\n", |
| 429 | + " data={\"bm25_retriever\": {\"query\": \"phishing attacks\", \"top_k\": 1}}, include_outputs_from={\"bm25_retriever\"}\n", |
| 430 | + ")" |
425 | 431 | ]
|
426 | 432 | },
|
427 | 433 | {
|
|
450 | 456 | }
|
451 | 457 | ],
|
452 | 458 | "source": [
|
453 |
| - "result['bm25_retriever']['documents']" |
| 459 | + "result[\"bm25_retriever\"][\"documents\"]" |
454 | 460 | ]
|
455 | 461 | },
|
456 | 462 | {
|
|
479 | 485 | }
|
480 | 486 | ],
|
481 | 487 | "source": [
|
482 |
| - "result['sentence_window__retriever']['context_windows']" |
| 488 | + "result[\"sentence_window__retriever\"][\"context_windows\"]" |
483 | 489 | ]
|
484 | 490 | },
|
485 | 491 | {
|
|
512 | 518 | }
|
513 | 519 | ],
|
514 | 520 | "source": [
|
515 |
| - "result['sentence_window__retriever']['context_documents']" |
| 521 | + "result[\"sentence_window__retriever\"][\"context_documents\"]" |
516 | 522 | ]
|
517 | 523 | },
|
518 | 524 | {
|
|
0 commit comments