Friday, March 15, 2024

How to Create Urdu Hindi AI Model and Dataset from New Dataset

This video is a hands-on, step-by-step tutorial on creating a new dataset, fine-tuning an AI model on that dataset, and then pushing the trained model to Hugging Face.
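
The listing below loads the finished dataset from the Hugging Face Hub. If you are building the dataset yourself first, a minimal sketch of that step could look like the following; the CSV file name and repository id here are placeholders, not the exact names used in the video.

# Sketch: assemble the instruction dataset from a local CSV and push it to the Hub.
# "songs.csv" and "your-username/urdu_bollywood_songs_dataset" are placeholder names.
from datasets import load_dataset
from huggingface_hub import login

login(token = "hf_...") # your Hugging Face write token

raw = load_dataset("csv", data_files = "songs.csv", split = "train")
# Expected columns: urdu_instruction, urdu_input, urdu_output
raw.push_to_hub("your-username/urdu_bollywood_songs_dataset")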




Code:

%%capture

import torch

major_version, minor_version = torch.cuda.get_device_capability()

# Must install separately since Colab has torch 2.2.1, which breaks packages

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

if major_version >= 8:

    # Use this for newer GPUs (Ampere, Ada, Hopper: RTX 30xx, RTX 40xx, A100, H100, L40)

    !pip install --no-deps packaging ninja flash-attn xformers trl peft accelerate bitsandbytes

else:

    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)

    !pip install --no-deps xformers trl peft accelerate bitsandbytes

pass


!pip install einops


from unsloth import FastLanguageModel

import torch

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!

dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(

    model_name = "unsloth/gemma-7b-bnb-4bit", # Choose ANY! e.g. teknium/OpenHermes-2.5-Mistral-7B

    max_seq_length = max_seq_length,

    dtype = dtype,

    load_in_4bit = load_in_4bit,

    token = None, # set your Hugging Face token here if using gated models like meta-llama/Llama-2-7b-hf

)


model = FastLanguageModel.get_peft_model(

    model,

    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128

    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",

                      "gate_proj", "up_proj", "down_proj",],

    lora_alpha = 16,

    lora_dropout = 0, # Supports any, but = 0 is optimized

    bias = "none",    # Supports any, but = "none" is optimized

    use_gradient_checkpointing = True,

    random_state = 3407,

    use_rslora = False,  # We support rank stabilized LoRA

    loftq_config = None, # And LoftQ

)


# The Urdu prompt header roughly translates to: "Below is an instruction that describes a movie name,
# paired with an input that provides further context. Take a moment to write song lyrics that match
# the meaning of the movie name."
alpaca_prompt = """ذیل میں ایک ہدایت ہے جو فلم کے نام کی وضاحت کرتی ہے، اس کے ساتھ ایک ان پٹ بھی ہے جو مزید دستاویزات فراہم کرتا ہے۔ گانے کے بول لکھنے کے لیے ایک لمحہ نکالیں جو فلم کے نام کے معنی سے میل کھاتا ہے۔


### Instruction:

{}


### Input:

{}


### Response:

{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):

    instructions = examples["urdu_instruction"]

    inputs       = examples["urdu_input"]

    outputs      = examples["urdu_output"]

    texts = []

    for instruction, input, output in zip(instructions, inputs, outputs):

        # Must add EOS_TOKEN, otherwise your generation will go on forever!

        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN

        texts.append(text)

    return { "text" : texts, }

pass


from datasets import load_dataset

dataset = load_dataset("fahdmirzac/urdu_bollywood_songs_dataset", split = "train")

dataset = dataset.map(formatting_prompts_func, batched = True,)
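
A quick sanity check, not in the original listing, to confirm the mapping produced the expected "text" field:

# Optional: the first formatted example should contain the Urdu prompt and end with the EOS token.
print(dataset[0]["text"])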


from huggingface_hub import login

access_token = "hf_..." # your Hugging Face access token with write permission

login(token=access_token)


from trl import SFTTrainer

from transformers import TrainingArguments


trainer = SFTTrainer(

    model = model,

    tokenizer = tokenizer,

    train_dataset = dataset,

    dataset_text_field = "text",

    max_seq_length = max_seq_length,

    dataset_num_proc = 2,

    packing = False, # Can make training 5x faster for short sequences.

    args = TrainingArguments(

        per_device_train_batch_size = 2,

        gradient_accumulation_steps = 4,

        warmup_steps = 5,

        max_steps = 100,

        learning_rate = 2e-4,

        fp16 = not torch.cuda.is_bf16_supported(),

        bf16 = torch.cuda.is_bf16_supported(),

        logging_steps = 1,

        optim = "adamw_8bit",

        weight_decay = 0.01,

        lr_scheduler_type = "linear",

        seed = 3407,

        output_dir = "outputs",

    ),

)


trainer_stats = trainer.train()
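
trainer.train() returns a TrainOutput object, so the run's statistics can be inspected afterwards (an optional check, not shown in the video):

# Optional: print training runtime and final loss from the returned stats.
print(trainer_stats.metrics)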


FastLanguageModel.for_inference(model) # Enable native 2x faster inference

inputs = tokenizer(

[

    alpaca_prompt.format(

        # Instruction (Urdu): "Write short song lyrics about the given movie name."
        "دیے گئے فلم کے نام کے بارے میں ایک مختصر گیت کے بول لکھیں۔", # instruction

        "کیوں پیار ہو گیا", # input

        "", # output - leave this blank for generation!

    )

], return_tensors = "pt").to("cuda")


outputs = model.generate(**inputs, max_new_tokens = 200, use_cache = True)

tokenizer.batch_decode(outputs)
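
For longer lyric generations it can be nicer to stream tokens as they are produced instead of decoding everything at the end; a small optional variant using transformers' TextStreamer:

# Optional: stream the generated lyrics token by token.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 200, use_cache = True)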


FastLanguageModel.for_inference(model) # Enable native 2x faster inference

inputs = tokenizer(

[

    alpaca_prompt.format(

        # Instruction (Urdu): "Write short song lyrics about the given movie name."
        "دیے گئے فلم کے نام کے بارے میں ایک مختصر گیت کے بول لکھیں۔", # instruction

        "رنگ", # input

        "", # output - leave this blank for generation!

    )

], return_tensors = "pt").to("cuda")


outputs = model.generate(**inputs, max_new_tokens = 200, use_cache = True)

tokenizer.batch_decode(outputs)


model.push_to_hub("fahdmirzac/Gemma_Urdu_Hindi_Bollywood_Songs", token = access_token)
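
It can also be useful to keep a local copy of the LoRA adapters and to push the tokenizer to the same repository; a short optional addition, reusing the access_token defined earlier:

# Optional: save the LoRA adapters locally and push the tokenizer alongside the model.
model.save_pretrained("lora_model")
tokenizer.push_to_hub("fahdmirzac/Gemma_Urdu_Hindi_Bollywood_Songs", token = access_token)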
