Monday, February 26, 2024

Tutorial to Implement RAG with Gemma Model Locally

 Unlock the power of Retrieval-Augmented Generation (RAG) locally using the Gemma model with our detailed step-by-step tutorial. Learn how to enhance your projects by integrating RAG for insightful document processing and AI-driven content generation. Perfect for developers, data scientists, and AI enthusiasts eager to leverage advanced NLP techniques on their own documents. No prior RAG experience required!

Commands Used:

pip install -U "transformers==4.38.1"

pip install -q pypdf

pip install -q python-dotenv

pip install llama-index==0.10.12

pip install -q gradio

pip install einops

pip install accelerate

pip install llama-index-llms-huggingface

pip install llama-index-embeddings-fastembed

pip install fastembed

import logging

import sys

# Configure logging ahead of the library imports below: records at INFO and
# above are written to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)


from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

from llama_index.llms.huggingface import HuggingFaceLLM

from llama_index.core import Settings

# Load the source documents from a local directory.
# NOTE(review): the path is hard-coded; point it at the folder that holds
# your own files before running.
documents = SimpleDirectoryReader("/home/ubuntu/pdfs").load_data()

from llama_index.embeddings.fastembed import FastEmbedEmbedding

# Embedding model, selected by its model name.
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Store the embedding model on LlamaIndex's global Settings object.
Settings.embed_model = embed_model

# Chunk size stored on the global Settings.
# NOTE(review): presumably measured in tokens -- confirm against the
# LlamaIndex Settings documentation.
Settings.chunk_size = 512

from llama_index.core import PromptTemplate

# System-level instruction text describing the assistant's role.
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."

# Template that places the query text ({query_str}) between <|USER|> and
# <|ASSISTANT|> markers.
# NOTE(review): these markers may not match the chat format the Gemma model
# was trained with -- confirm against the model card.
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

from huggingface_hub.hf_api import HfFolder 

# Save the Hugging Face access token through HfFolder. The string below is a
# placeholder and must be replaced with a real token before running.
# NOTE(review): avoid committing a real token to source control; consider
# reading it from an environment variable instead.
HfFolder.save_token('<your huggingface token from>')

import torch

# Build the Hugging Face LLM wrapper that LlamaIndex uses to generate answers.
#
# NOTE(review): the published snippet lost several keyword arguments and the
# closing parenthesis of this call, leaving the script unparseable. The
# arguments restored below are reconstructed:
#   * system_prompt / query_wrapper_prompt are defined earlier in this script
#     and were otherwise unused, so they are passed here.
#   * model_name / tokenizer_name are an assumption based on the tutorial's
#     title (Gemma) -- confirm the exact checkpoint you want to run.
#   * device_map="auto" is an assumption (the tutorial installs accelerate).
# context_window and max_new_tokens are left at the library defaults.
llm = HuggingFaceLLM(
    # With do_sample=False decoding is greedy, so the temperature value has
    # no effect; both values are kept exactly as published.
    generate_kwargs={"temperature": 0.7, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="google/gemma-7b-it",
    model_name="google/gemma-7b-it",
    device_map="auto",
    tokenizer_kwargs={"max_length": 4096},
    # Load the weights in half precision.
    model_kwargs={"torch_dtype": torch.float16},
)

# Store the LLM on LlamaIndex's global Settings object.
Settings.llm = llm

# Settings.chunk_size is already set to 512 earlier in this script and is not
# changed in between, so the repeated assignment that used to sit here was
# dropped.

# Build a vector index over the loaded documents.
index = VectorStoreIndex.from_documents(documents)

# Query engine over the index; predict() sends each question through it.
query_engine = index.as_query_engine()

def predict(input, history):
    """Answer a single message by querying the document index.

    Args:
        input: The message text; passed unchanged to ``query_engine.query``.
        history: Earlier conversation turns. Accepted but not used.

    Returns:
        The query result converted to ``str``.
    """
    return str(query_engine.query(input))


import gradio as gr

# Serve predict() through a Gradio chat UI. The published script imported
# gradio and defined predict(input, history) but never started an interface,
# so nothing was served.
# NOTE(review): launch() serves locally; pass share=True only if a public
# link is really wanted.
gr.ChatInterface(predict).launch()



No comments: