Wednesday, December 13, 2023

How to Build RAG Pipeline with Mixtral 8x7B to Talk to Your Own Documents

 This video shows the step-by-step process of building a RAG pipeline locally with Mixtral 8x7B to talk to your own documents, such as PDFs.





Commands Used:


#%%capture


!pip install farm-haystack[colab]


from getpass import getpass

# Prompt interactively so the token is never echoed or stored in the notebook.
HF_TOKEN = getpass("Hugging Face Token")


from haystack.nodes import PreProcessor,PromptModel, PromptTemplate, PromptNode


from google.colab import files

# Open the Colab file picker to upload the PDF into the runtime's filesystem.
files.upload()


%%capture

!pip install PyPDF2


import PyPDF2

from haystack import Document


pdf_file_path = "e10897.pdf"  # Replace with the path to your own PDF file


def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        A single string with the text of all pages, in page order.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() can return None for pages without a text layer
        # (e.g. scanned images), so fall back to "" to keep the join safe.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)


pdf_text = extract_text_from_pdf(pdf_file_path)


# Wrap the full PDF text in a single Haystack Document
doc = Document(

    content=pdf_text,

    meta={"pdf_path": pdf_file_path}  # keep the source path for traceability

)


docs = [doc]


# Clean the raw PDF text and split it into ~500-word, sentence-aligned chunks
# so each chunk is a reasonable retrieval unit for BM25 and the LLM prompt.
processor = PreProcessor(

    clean_empty_lines=True,

    clean_whitespace=True,

    clean_header_footer=True,  # strip repeated page headers/footers

    split_by="word",

    split_length=500,

    split_respect_sentence_boundary=True,

    split_overlap=0,  # no overlap between consecutive chunks

    language="it",  # "it" = Italian sentence-boundary detection

)


preprocessed_docs = processor.process(docs)


from haystack.document_stores import InMemoryDocumentStore


# Keep the index in memory; enable BM25 so a sparse retriever can search it.
document_store = InMemoryDocumentStore(use_bm25=True)

document_store.write_documents(preprocessed_docs)


from haystack import Pipeline

from haystack.nodes import BM25Retriever

# Sparse keyword retriever returning the 2 best-matching chunks per query.
retriever = BM25Retriever(document_store, top_k=2)


# Prompt template for grounded QA. NOTE: the original had stray backslash
# escapes (`\I ... \"`) inside the triple-quoted string, which injected a
# literal backslash into the prompt text; plain quotes are the intent.
qa_template = PromptTemplate(prompt=

  """ Using only the information contained in the context,

  answer only the question asked without adding suggestions of possible questions and answer exclusively in Italian.

  If the answer cannot be deduced from the context, reply: "I don't know because it is not relevant to the Context."

  Context: {join(documents)};

  Question: {query}

  """)


# Generator node: runs Mixtral-8x7B-Instruct through the Hugging Face
# Inference API (authenticated with HF_TOKEN), using the QA template above
# as its default prompt.
prompt_node = PromptNode(

    model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1",

    api_key=HF_TOKEN,

    default_prompt_template=qa_template,

    max_length=500,  # cap on the generated answer length

    model_kwargs={"model_max_length": 5000}  # allow a long prompt/context — confirm against model limits

)


# Wire the two-stage RAG pipeline: Query -> BM25 retriever -> LLM prompt node.
rag_pipeline = Pipeline()

rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])

rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])


from pprint import pprint


def print_answer(out):
    """Pretty-print the first generated answer of a pipeline run.

    Args:
        out: Pipeline output dict; ``out["results"][0]`` holds the answer text.
    """
    # A named lambda (PEP 8 E731) replaced with a def for tracebacks/clarity.
    pprint(out["results"][0].strip())


# A question answerable from the indexed PDF.
print_answer(rag_pipeline.run(query="What is Oracle DBA?"))


# An off-topic question, to exercise the "I don't know" fallback in the prompt.
print_answer(rag_pipeline.run(query="Why Lion is king of jungle?"))

No comments: