Monday, February 26, 2024

Tutorial to Implement RAG with Gemma Model Locally

 Unlock the power of Retrieval-Augmented Generation (RAG) locally using the Gemma model with our detailed step-by-step tutorial. Learn how to enhance your projects by integrating RAG for insightful document processing and AI-driven content generation. Perfect for developers, data scientists, and AI enthusiasts eager to leverage advanced NLP techniques on their own documents. No prior RAG experience required!




Commands Used:


pip install -U "transformers==4.38.1"

pip install -q pypdf

pip install -q python-dotenv

pip install llama-index==0.10.12

pip install -q gradio

pip install einops

pip install accelerate

pip install llama-index-llms-huggingface

pip install llama-index-embeddings-fastembed

pip install fastembed


import logging

import sys


# Optional: verbose logging to stdout, useful for watching indexing and query activity
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))


from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

from llama_index.llms.huggingface import HuggingFaceLLM

from llama_index.core import Settings


# Read every file in the folder (your PDFs) into llama-index Document objects
documents = SimpleDirectoryReader("/home/ubuntu/pdfs").load_data()
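
A quick optional sanity check: printing the number of loaded documents confirms the reader actually found your files (for PDFs, load_data typically returns one Document per page).

print(f"Loaded {len(documents)} documents")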


from llama_index.embeddings.fastembed import FastEmbedEmbedding


# Small, CPU-friendly embedding model used to vectorize the document chunks
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")

Settings.embed_model = embed_model

Settings.chunk_size = 512


from llama_index.core import PromptTemplate


system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."


query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
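
Note that <|USER|> and <|ASSISTANT|> are generic placeholder markers rather than Gemma's own special tokens. If answers come back oddly formatted, a wrapper closer to Gemma's native chat template (a sketch, not part of the original setup) would be:

query_wrapper_prompt = PromptTemplate("<start_of_turn>user\n{query_str}<end_of_turn>\n<start_of_turn>model\n")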


from huggingface_hub.hf_api import HfFolder 

# Paste your own Hugging Face access token; Gemma is a gated model, so accept its license on huggingface.co first
HfFolder.save_token('<your huggingface token from huggingface.co>')
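
Since python-dotenv is installed above, an alternative sketch (assuming you keep the token in a local .env file as HF_TOKEN=...) avoids hard-coding it:

from dotenv import load_dotenv
import os

load_dotenv()  # reads HF_TOKEN from .env into the environment
HfFolder.save_token(os.environ["HF_TOKEN"])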


import torch


llm = HuggingFaceLLM(

    context_window=8192,

    max_new_tokens=256,

    generate_kwargs={"do_sample": False},  # greedy decoding; a temperature setting only takes effect when do_sample=True

    system_prompt=system_prompt,

    query_wrapper_prompt=query_wrapper_prompt,

    tokenizer_name="google/gemma-7b-it",

    model_name="google/gemma-7b-it",

    device_map="auto",

    tokenizer_kwargs={"max_length": 4096},

    model_kwargs={"torch_dtype": torch.float16}

)


Settings.llm = llm


# Build the in-memory vector index (embeds every chunk with the FastEmbed model configured above)
index = VectorStoreIndex.from_documents(documents)
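
Re-embedding everything on each run can get slow for larger folders. As a sketch, llama-index can persist the index to a local directory (./storage here is just an example path) and reload it on later runs:

from llama_index.core import StorageContext, load_index_from_storage

index.storage_context.persist(persist_dir="./storage")  # save once

storage_context = StorageContext.from_defaults(persist_dir="./storage")

index = load_index_from_storage(storage_context)  # reload instead of rebuilding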


query_engine = index.as_query_engine()
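
Before wiring up a UI, it is worth querying the engine directly once; the question below is just a placeholder for something your own PDFs can answer.

response = query_engine.query("What is this document about?")

print(response)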


# Chat handler for Gradio: each user message is answered by the RAG query engine
def predict(message, history):

  response = query_engine.query(message)

  return str(response)

  

import gradio as gr


gr.ChatInterface(predict).launch(share=True)
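
share=True asks Gradio to create a temporary public link in addition to the local URL; dropping the argument (gr.ChatInterface(predict).launch()) keeps the app reachable only from your own machine.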


  



