Friday, March 15, 2024

How to Create Urdu Hindi AI Model and Dataset from New Dataset

This video is a hands-on, step-by-step tutorial on creating a new dataset, fine-tuning an AI model on that dataset, and then pushing the trained model to Hugging Face.
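
The listing below loads the finished dataset from the Hugging Face Hub. If you are building the dataset yourself first, a minimal sketch of that step could look like the following; the CSV file name and repository id here are placeholders, not the exact names used in the video.

# Sketch: assemble the instruction dataset from a local CSV and push it to the Hub.
# "songs.csv" and "your-username/urdu_bollywood_songs_dataset" are placeholder names.
from datasets import load_dataset
from huggingface_hub import login

login(token = "hf_...") # your Hugging Face write token

raw = load_dataset("csv", data_files = "songs.csv", split = "train")
# Expected columns: urdu_instruction, urdu_input, urdu_output
raw.push_to_hub("your-username/urdu_bollywood_songs_dataset")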




Code:

%%capture

import torch

major_version, minor_version = torch.cuda.get_device_capability()

# Must install separately since Colab has torch 2.2.1, which breaks packages

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

if major_version >= 8:

    # Use this for newer GPUs (Ampere, Ada, Hopper: RTX 30xx, RTX 40xx, A100, H100, L40)

    !pip install --no-deps packaging ninja flash-attn xformers trl peft accelerate bitsandbytes

else:

    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)

    !pip install --no-deps xformers trl peft accelerate bitsandbytes

pass


!pip install einops


from unsloth import FastLanguageModel

import torch

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!

dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(

    model_name = "unsloth/gemma-7b-bnb-4bit", # Choose ANY! e.g. teknium/OpenHermes-2.5-Mistral-7B

    max_seq_length = max_seq_length,

    dtype = dtype,

    load_in_4bit = load_in_4bit,

    token = None, # set your Hugging Face token here if using gated models like meta-llama/Llama-2-7b-hf

)


model = FastLanguageModel.get_peft_model(

    model,

    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128

    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",

                      "gate_proj", "up_proj", "down_proj",],

    lora_alpha = 16,

    lora_dropout = 0, # Supports any, but = 0 is optimized

    bias = "none",    # Supports any, but = "none" is optimized

    use_gradient_checkpointing = True,

    random_state = 3407,

    use_rslora = False,  # We support rank stabilized LoRA

    loftq_config = None, # And LoftQ

)


# The Urdu prompt header roughly translates to: "Below is an instruction that describes a movie name,
# paired with an input that provides further context. Take a moment to write song lyrics that match
# the meaning of the movie name."
alpaca_prompt = """ذیل میں ایک ہدایت ہے جو فلم کے نام کی وضاحت کرتی ہے، اس کے ساتھ ایک ان پٹ بھی ہے جو مزید دستاویزات فراہم کرتا ہے۔ گانے کے بول لکھنے کے لیے ایک لمحہ نکالیں جو فلم کے نام کے معنی سے میل کھاتا ہے۔


### Instruction:

{}


### Input:

{}


### Response:

{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):

    instructions = examples["urdu_instruction"]

    inputs       = examples["urdu_input"]

    outputs      = examples["urdu_output"]

    texts = []

    for instruction, input, output in zip(instructions, inputs, outputs):

        # Must add EOS_TOKEN, otherwise your generation will go on forever!

        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN

        texts.append(text)

    return { "text" : texts, }

pass


from datasets import load_dataset

dataset = load_dataset("fahdmirzac/urdu_bollywood_songs_dataset", split = "train")

dataset = dataset.map(formatting_prompts_func, batched = True,)
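
A quick sanity check, not in the original listing, to confirm the mapping produced the expected "text" field:

# Optional: the first formatted example should contain the Urdu prompt and end with the EOS token.
print(dataset[0]["text"])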


from huggingface_hub import login

access_token = "hf_..." # your Hugging Face access token with write permission

login(token=access_token)


from trl import SFTTrainer

from transformers import TrainingArguments


trainer = SFTTrainer(

    model = model,

    tokenizer = tokenizer,

    train_dataset = dataset,

    dataset_text_field = "text",

    max_seq_length = max_seq_length,

    dataset_num_proc = 2,

    packing = False, # Can make training 5x faster for short sequences.

    args = TrainingArguments(

        per_device_train_batch_size = 2,

        gradient_accumulation_steps = 4,

        warmup_steps = 5,

        max_steps = 100,

        learning_rate = 2e-4,

        fp16 = not torch.cuda.is_bf16_supported(),

        bf16 = torch.cuda.is_bf16_supported(),

        logging_steps = 1,

        optim = "adamw_8bit",

        weight_decay = 0.01,

        lr_scheduler_type = "linear",

        seed = 3407,

        output_dir = "outputs",

    ),

)


trainer_stats = trainer.train()
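
trainer.train() returns a TrainOutput object, so the run's statistics can be inspected afterwards (an optional check, not shown in the video):

# Optional: print training runtime and final loss from the returned stats.
print(trainer_stats.metrics)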


FastLanguageModel.for_inference(model) # Enable native 2x faster inference

inputs = tokenizer(

[

    alpaca_prompt.format(

        # Instruction (Urdu): "Write short song lyrics about the given movie name."
        "دیے گئے فلم کے نام کے بارے میں ایک مختصر گیت کے بول لکھیں۔", # instruction

        "کیوں پیار ہو گیا", # input

        "", # output - leave this blank for generation!

    )

], return_tensors = "pt").to("cuda")


outputs = model.generate(**inputs, max_new_tokens = 200, use_cache = True)

tokenizer.batch_decode(outputs)
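
For longer lyric generations it can be nicer to stream tokens as they are produced instead of decoding everything at the end; a small optional variant using transformers' TextStreamer:

# Optional: stream the generated lyrics token by token.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 200, use_cache = True)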


FastLanguageModel.for_inference(model) # Enable native 2x faster inference

inputs = tokenizer(

[

    alpaca_prompt.format(

        # Instruction (Urdu): "Write short song lyrics about the given movie name."
        "دیے گئے فلم کے نام کے بارے میں ایک مختصر گیت کے بول لکھیں۔", # instruction

        "رنگ", # input

        "", # output - leave this blank for generation!

    )

], return_tensors = "pt").to("cuda")


outputs = model.generate(**inputs, max_new_tokens = 200, use_cache = True)

tokenizer.batch_decode(outputs)


model.push_to_hub("fahdmirzac/Gemma_Urdu_Hindi_Bollywood_Songs", token = access_token)
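
It can also be useful to keep a local copy of the LoRA adapters and to push the tokenizer to the same repository; a short optional addition, reusing the access_token defined earlier:

# Optional: save the LoRA adapters locally and push the tokenizer alongside the model.
model.save_pretrained("lora_model")
tokenizer.push_to_hub("fahdmirzac/Gemma_Urdu_Hindi_Bollywood_Songs", token = access_token)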
