# Install the following Python packages before running this script:
#   pip install haystack-ai
#   pip install "sentence-transformers>=2.2.0"
#   pip install pypdf
#   pip install PyMuPDF
import re
from pathlib import Path
from typing import List
import fitz # pip install PyMuPDF
from haystack import Document
from haystack import Pipeline
from haystack import component
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy
@component
class MulticolumnPDFToDocumentConverter:
    """Extract the text of one column (left or right) from two-column PDFs.

    Each page is split vertically at half the page width and text is read,
    in layout order (``sort=True``), from the requested half only.
    """

    def __init__(self, column: str = "left"):
        # "left" selects the left half of each page; any other value
        # selects the right half.
        self.column = column

    @component.output_types(documents=List[Document])
    def run(self, sources: List[str]):
        """Convert each PDF in *sources* into a single Document.

        :param sources: paths of the PDF files to convert.
        :returns: ``{"documents": [...]}`` — one Document per source, whose
            ``content`` is the per-page column text joined by spaces and whose
            ``meta['file_name']`` records the source path.
        """
        extracted_docs = []
        for source in sources:
            page_texts = []
            # Fix: use the context manager so the PDF handle is always
            # closed (the original never closed it — a file-handle leak
            # when processing many PDFs). Stray debug print also removed.
            with fitz.open(source) as pdf:
                for page in pdf:
                    half_width = page.rect.width / 2
                    clip = +page.rect  # unary + returns a copy of the rect
                    if self.column == "left":
                        clip.x1 = half_width
                    else:
                        clip.x0 = half_width
                    page_texts.append(page.get_text(sort=True, clip=clip))
            extracted_docs.append(
                Document(content=" ".join(page_texts), meta={'file_name': source})
            )
        return {"documents": extracted_docs}
@component
class ExtractQuestion:
    """Split raw column text into one Document per question/answer pair.

    Segments are delimited by "DO <number>" headers; within a segment the
    answer starts at the marker "\nRéponse d" (e.g. "Réponse du ministre").
    """

    @component.output_types(documents=List[Document])
    def run(self, raw_documents: List[Document]):
        """Parse each raw Document into per-question Documents.

        :param raw_documents: documents whose ``content`` holds extracted
            column text and whose ``meta['file_name']`` holds the source path.
        :returns: ``{"documents": [...]}`` — one Document per question found,
            with ``content`` set to the question text and ``meta`` carrying
            DOC_ID, answer, and filename.
        """
        # DOC, Question, Réponse
        new_documents = []
        inside = False
        # Fix: initialize so an answer marker appearing before any header
        # cannot raise UnboundLocalError.
        new_doc = None
        for doc in raw_documents:
            # Bug fix: the original patterns used doubled backslashes
            # (r'DO \\d+\\n', "\\nRéponse d", replace("\\n", ...)), which
            # match a literal backslash instead of digits/newlines, so real
            # extracted text was never split. Corrected to proper escapes.
            parts = re.split(r'(DO \d+\n)', doc.content)
            for part in parts[1:]:
                if re.match("^DO", part) and not inside:
                    new_doc = Document(meta={'DOC_ID': part.strip()})
                    inside = True
                if "\nRéponse d" in part and inside and new_doc is not None:
                    # maxsplit=1 guarantees exactly 3 parts even if the
                    # marker occurs again inside the answer text.
                    question, _, answer = re.split(r'(\nRéponse d)', part, maxsplit=1)
                    new_doc.content = question.replace("\n", " ").strip()
                    new_doc.meta['answer'] = '\nRéponse d' + answer.replace("\n", " ").strip()
                    new_doc.meta['filename'] = doc.meta['file_name']
                    inside = False
                    new_documents.append(new_doc)
        return {'documents': new_documents}
# Extraction pipeline: PDF column text -> per-question Documents.
text_extraction_pipeline = Pipeline()
text_extraction_pipeline.add_component(instance=MulticolumnPDFToDocumentConverter(), name="pdf_extractor")
text_extraction_pipeline.add_component(instance=ExtractQuestion(), name="data_extractor")
text_extraction_pipeline.connect("pdf_extractor", "data_extractor")

# set 'consolidated_path' to the path with the unzipped 'dataset-justice-1.zip' file
consolidated_path = "/my/home/BelgiumHackathon/data/consolidated/"
# Recursively collect the PDFs, skipping the "ic*" files.
files = [
    pdf_path
    for pdf_path in Path(consolidated_path).rglob("*.pdf")
    if not pdf_path.name.startswith("ic")
]
docs = text_extraction_pipeline.run({'sources': files})
all_docs = docs['data_extractor']['documents']
def indexing(documents: List[Document]):
    """Embed *documents* and write them into a fresh in-memory store.

    :param documents: documents to embed and index.
    :returns: the populated InMemoryDocumentStore.
    """
    document_store = InMemoryDocumentStore()

    embedder = SentenceTransformersDocumentEmbedder(
        model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
    )
    # SKIP keeps the first copy of a document if the same id is written twice.
    writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component(instance=embedder, name="embedder")
    indexing_pipeline.add_component(instance=writer, name="writer")
    indexing_pipeline.connect("embedder.documents", "writer.documents")
    indexing_pipeline.run({"documents": documents})
    return document_store
# Build the embedded document store from every extracted question document.
doc_store = indexing(all_docs)