Install the following Python packages:

pip install haystack-ai
pip install "sentence-transformers>=2.2.0"
pip install pypdf
pip install PyMuPDF

Next, define two custom Haystack components: one that extracts text from the two-column PDFs, and one that splits the raw text into question/answer documents:

import re
from pathlib import Path
from typing import List, Union

import fitz  # pip install PyMuPDF
from haystack import Document
from haystack import Pipeline
from haystack import component
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

@component
class MulticolumnPDFToDocumentConverter:
    """Extract text from two-column PDFs, one column per converter instance."""

    def __init__(self, column: str = "left"):
        self.column = column  # which column to extract: "left" or "right"

    @component.output_types(documents=List[Document])
    def run(self, sources: List[Union[str, Path]]):
        extracted_docs = []
        for source in sources:
            print(source)  # simple progress trace
            text = []
            with fitz.open(source) as pdf:
                for page in pdf:
                    page_rect = page.rect
                    half_width = page_rect.width / 2

                    if self.column == "left":
                        left_rect = +page_rect  # unary + copies the Rect
                        left_rect.x1 = half_width  # clip to the left half of the page
                        left_text = page.get_text(sort=True, clip=left_rect)
                        text.append(left_text)
                    else:
                        right_rect = +page_rect  # unary + copies the Rect
                        right_rect.x0 = half_width  # clip to the right half of the page
                        right_text = page.get_text(sort=True, clip=right_rect)
                        text.append(right_text)
            # store the path as a string so the metadata stays serializable
            extracted_docs.append(Document(content=" ".join(text), meta={'file_name': str(source)}))

        return {"documents": extracted_docs}

@component
class ExtractQuestion:
    """Split raw PDF text into one Document per question/answer pair."""

    @component.output_types(documents=List[Document])
    def run(self, raw_documents: List[Document]):
        # Each entry follows the pattern: a "DO <number>" header, the question
        # text, then an answer starting with "Réponse d..." (e.g. "Réponse de la ministre")
        new_documents = []
        inside = False
        for doc in raw_documents:
            # split on the "DO <number>" headers, keeping the headers as separate parts
            parts = re.split(r'(DO \d+\n)', doc.content)
            for question in parts[1:]:
                if re.match("^DO", question) and not inside:
                    # a header starts a new question document
                    new_doc = Document(meta={'DOC_ID': question.strip()})
                    inside = True
                if "\nRéponse d" in question and inside:
                    # split the body into question and answer at the "Réponse d" marker
                    question, _, answer = re.split(r'(\nRéponse d)', question, maxsplit=1)
                    new_doc.content = question.replace("\n", " ").strip()
                    new_doc.meta['answer'] = '\nRéponse d' + answer.replace("\n", " ").strip()
                    new_doc.meta['filename'] = doc.meta['file_name']
                    inside = False
                    new_documents.append(new_doc)

        return {'documents': new_documents}
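
To see what this does, here is a toy run on a synthetic string. The wording is made up, but it mimics the "DO <number>" / "Réponse d..." layout of the real documents:

sample = "DO 12\nWhat is the average delay?\nRéponse de la ministre : 30 jours.\nDO 13\n..."
extractor = ExtractQuestion()
out = extractor.run([Document(content=sample, meta={"file_name": "sample.pdf"})])
for d in out["documents"]:
    print(d.meta["DOC_ID"], "|", d.content, "|", d.meta["answer"])

Note that the trailing "DO 13" entry has no "Réponse d" marker, so it is dropped.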

Let’s build a pipeline with those components and run it:

# Extract text from PDF
text_extraction_pipeline = Pipeline()
text_extraction_pipeline.add_component(instance=MulticolumnPDFToDocumentConverter(), name="pdf_extractor")
text_extraction_pipeline.add_component(instance=ExtractQuestion(), name="data_extractor")
text_extraction_pipeline.connect("pdf_extractor", "data_extractor")

# set 'consolidated_path' to the path with the unzipped 'dataset-justice-1.zip' file
consolidated_path = "/my/home/BelgiumHackathon/data/consolidated/"

all_files = list(Path(consolidated_path).rglob("*.pdf"))
files = [file for file in all_files if not file.name.startswith("ic")]
docs = text_extraction_pipeline.run({'sources': files})
all_docs = docs['data_extractor']['documents']
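
It is worth spot-checking the output before indexing:

print(f"Extracted {len(all_docs)} question documents")
print(all_docs[0].meta["DOC_ID"])
print(all_docs[0].content[:200])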

Let’s index the documents in a DocumentStore:

def indexing(documents: List[Document]):
    # QA-oriented embedding model; the same model must be used at query time
    model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
    document_store = InMemoryDocumentStore()
    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component(instance=SentenceTransformersDocumentEmbedder(model=model), name="embedder")
    indexing_pipeline.add_component(
        instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP),  # skip documents already in the store
        name="writer",
    )
    indexing_pipeline.connect("embedder.documents", "writer.documents")
    indexing_pipeline.run({"documents": documents})
    return document_store

We can pass the documents created in the previous step to get a document store with the indexed documents:

doc_store = indexing(all_docs)
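
The SentenceTransformersTextEmbedder and InMemoryEmbeddingRetriever imports at the top are what you need to query the store. A minimal query-pipeline sketch (the query string is only an illustration; the text embedder must use the same model as the document embedder):

query_pipeline = Pipeline()
query_pipeline.add_component(
    instance=SentenceTransformersTextEmbedder(model="sentence-transformers/multi-qa-mpnet-base-dot-v1"),
    name="text_embedder",
)
query_pipeline.add_component(
    instance=InMemoryEmbeddingRetriever(document_store=doc_store, top_k=5),
    name="retriever",
)
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

result = query_pipeline.run({"text": "Quel est le délai de traitement ?"})  # example query
for doc in result["retriever"]["documents"]:
    print(doc.meta["DOC_ID"], doc.content[:100])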