Wednesday, April 24, 2024

Fine-Tune Phi-3 on Local Custom Dataset

This video is an easy, step-by-step, hands-on tutorial for fine-tuning the Phi-3 LLM locally on your own system with your own dataset.
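The walkthrough below is a plain Python script; before running it you will need the libraries it imports. A minimal setup sketch, inferred from the imports in the code (exact package versions are not pinned in the post, so treat them as assumptions):

pip install -qU transformers datasets trl accelerate bitsandbytes huggingface_hub jinja2 pyyaml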





from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

from datasets import load_dataset

from trl import SFTTrainer

from jinja2 import Template

import yaml


MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

NEW_MODEL_NAME = "TinyButMighty"

DATASET_NAME = "macadeliccc/opus_samantha"

SPLIT = "train"

MAX_SEQ_LENGTH = 2048

num_train_epochs = 1

license = "apache-2.0"

username = "fahdmirzac"

learning_rate = 1.41e-5

per_device_train_batch_size = 4

gradient_accumulation_steps = 1


model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
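The import list includes BitsAndBytesConfig, but the model above is loaded in full precision. If your local GPU is short on memory, a 4-bit load is one option; this is a hedged sketch rather than part of the original script, and it assumes bitsandbytes is installed:

import torch

# Optional: quantized 4-bit load to reduce GPU memory use on a local machine.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

Note that a 4-bit base model is normally trained through LoRA adapters (QLoRA) rather than full fine-tuning, so this load would usually be paired with a peft LoraConfig passed to SFTTrainer via its peft_config argument.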

dataset = load_dataset(DATASET_NAME, split=SPLIT)


EOS_TOKEN = tokenizer.eos_token  # end-of-sequence string appended to each training example


def process_dataset(mydata):

    # Flatten each multi-turn conversation into a single training string.
    conversations = mydata["conversations"]

    texts = []

    # Map the dataset's role names to simple plain-text turn headers
    # (this does not use Phi-3's special chat tokens).
    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}

    end_mapper = {"system": "", "human": "", "gpt": ""}

    for c in conversations:

        # Concatenate every turn, then terminate the example with the EOS token.
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in c)

        texts.append(f"{text}{EOS_TOKEN}")

    return {"text": texts}


dataset = dataset.map(process_dataset, batched=True)

print(dataset['text'][2])
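The dataset used here is the curated macadeliccc/opus_samantha set from the Hub. To train on your own local data instead, the same preprocessing works as long as your file follows the same conversations layout; a minimal sketch, where the file name and record layout are illustrative assumptions:

# Hypothetical local file; each line is a JSON record shaped like:
# {"conversations": [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}]}
custom_dataset = load_dataset("json", data_files="my_conversations.jsonl", split="train")
custom_dataset = custom_dataset.map(process_dataset, batched=True)
print(custom_dataset["text"][0])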


args = TrainingArguments(

    per_device_train_batch_size=per_device_train_batch_size,

    gradient_accumulation_steps=gradient_accumulation_steps,

    gradient_checkpointing=True,

    learning_rate=learning_rate,

    lr_scheduler_type="cosine",

    max_steps=-1,

    num_train_epochs=num_train_epochs,

    save_strategy="no",

    logging_steps=1,

    output_dir=NEW_MODEL_NAME,

    optim="paged_adamw_32bit",

    bf16=True,

)


trainer = SFTTrainer(

    model=model,

    tokenizer=tokenizer,

    args=args,

    train_dataset=dataset,

    dataset_text_field="text",

    max_seq_length=MAX_SEQ_LENGTH,

)


trainer.train()
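Once training finishes, a quick local generation pass is a simple way to sanity-check the result before saving or uploading anything. A minimal sketch, using the same plain-text turn markers the training data was formatted with (the prompt itself is just an example):

model.eval()
prompt = "\nuser\n Hello, who are you?\n\nassistant\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))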

from huggingface_hub import ModelCard, ModelCardData, HfApi

from jinja2 import Template


template_text = """

---

license: {{ license }}

---


# {{ NEW_MODEL_NAME }}


{{ NEW_MODEL_NAME }} is an SFT fine-tuned version of {{ MODEL_ID }} using a custom training dataset.

This model was made with [Phinetune]()


## Process

- Learning Rate: {{ learning_rate }}

- Maximum Sequence Length: {{ MAX_SEQ_LENGTH }}

- Dataset: {{ DATASET_NAME }}

- Split: {{ SPLIT }}


## 💻 Usage

```python

!pip install -qU transformers

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


model = "{{ username }}/{{ NEW_MODEL_NAME }}"

tokenizer = AutoTokenizer.from_pretrained(model)


# Example prompt

prompt = "Your example prompt here"


# Generate a response

model = AutoModelForCausalLM.from_pretrained(model)

pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

outputs = pipeline(prompt, max_length=50, num_return_sequences=1)

print(outputs[0]["generated_text"])

```


"""

# Create a Jinja template object

jinja_template = Template(template_text.strip())


# Fill the template

content = jinja_template.render(

    license=license,

    NEW_MODEL_NAME=NEW_MODEL_NAME,

    MODEL_ID=MODEL_ID,

    learning_rate=learning_rate,

    MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,

    DATASET_NAME=DATASET_NAME,

    SPLIT=SPLIT,

    username=username,

)


# Save the fine-tuned model and tokenizer to a local folder named <username>/<NEW_MODEL_NAME>
model.save_pretrained(f"{username}/{NEW_MODEL_NAME}")

tokenizer.save_pretrained(f"{username}/{NEW_MODEL_NAME}")


import os


# Save the model card

card = ModelCard(content)

card.save(f"{username}/{NEW_MODEL_NAME}/README.md")


# Read the Hugging Face token from the environment when running locally
# (in Google Colab you could instead use userdata.get("HF_TOKEN") from google.colab)

api = HfApi(token=os.environ.get("HF_TOKEN"))


# Create the model repo on the Hugging Face Hub (no-op if it already exists)

api.create_repo(

    repo_id=f"{username}/{NEW_MODEL_NAME}",

    repo_type="model",

    exist_ok=True,

)


# Upload the local model folder (weights, tokenizer, README) to the Hub
api.upload_folder(

    repo_id=f"{username}/{NEW_MODEL_NAME}",

    folder_path=f"{username}/{NEW_MODEL_NAME}",

)
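As a quick confirmation that the upload went through, you can list the files now present in the repo; a small optional check:

print(api.list_repo_files(repo_id=f"{username}/{NEW_MODEL_NAME}"))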
