This video is an easy, step-by-step, hands-on tutorial on fine-tuning the Phi-3 LLM locally on your own system with your own dataset.
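Before running anything, install the libraries used below. This is a minimal setup sketch; the exact package versions are an assumption and may need adjusting for your GPU and CUDA stack.

```python
# Minimal environment setup (versions are assumptions; pin them if you need reproducibility)
!pip install -qU transformers datasets trl accelerate bitsandbytes huggingface_hub jinja2
```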
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "TinyButMighty"
DATASET_NAME = "macadeliccc/opus_samantha"
SPLIT = "train"
MAX_SEQ_LENGTH = 2048
num_train_epochs = 1
license = "apache-2.0"
username = "fahdmirzac"
learning_rate = 1.41e-5
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
dataset = load_dataset(DATASET_NAME, split=SPLIT)
EOS_TOKEN = tokenizer.eos_token  # append the EOS string (not the token id) to each example
def process_dataset(mydata):
    # Flatten each multi-turn conversation into a single training string.
    conversations = mydata["conversations"]
    texts = []
    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    end_mapper = {"system": "", "human": "", "gpt": ""}
    for c in conversations:
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in c)
        texts.append(f"{text}{EOS_TOKEN}")
    return {"text": texts}
dataset = dataset.map(process_dataset, batched=True)
print(dataset['text'][2])
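Note that the role tags produced above are generic. Phi-3-mini-4k-instruct ships with its own chat template, so an alternative way to build the `text` column is to let the tokenizer render each conversation. The function below is only a sketch; it assumes the dataset's `from`/`value` keys map cleanly onto roles the model's chat template accepts (system turns may need special handling).

```python
# Alternative formatting sketch using the tokenizer's built-in chat template
# (assumption: the template accepts system/user/assistant turns for this model).
def process_with_chat_template(mydata):
    role_map = {"system": "system", "human": "user", "gpt": "assistant"}
    texts = []
    for c in mydata["conversations"]:
        messages = [{"role": role_map[x["from"]], "content": x["value"]} for x in c]
        texts.append(tokenizer.apply_chat_template(messages, tokenize=False))
    return {"text": texts}

# dataset = dataset.map(process_with_chat_template, batched=True)
```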
args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=num_train_epochs,
    save_strategy="no",
    logging_steps=1,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",
    bf16=True,
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)
trainer.train()
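Before saving and uploading, it is worth a quick sanity check that the fine-tuned weights still generate sensible text. The snippet below is a minimal sketch; the prompt string and generation settings are illustrative, not part of the original tutorial.

```python
# Quick sanity check: generate a short reply with the fine-tuned model.
prompt = "\nuser\n Hello, who are you?\n\nassistant\n"  # mirrors the training format above
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```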
from huggingface_hub import ModelCard, ModelCardData, HfApi
from jinja2 import Template
template_text = """
---
license: {{ license }}
---
# {{ NEW_MODEL_NAME }}
{{ NEW_MODEL_NAME }} is an SFT fine-tuned version of {{ MODEL_ID }} using a custom training dataset.
This model was made with [Phinetune]()
## Process
- Learning Rate: {{ learning_rate }}
- Maximum Sequence Length: {{ MAX_SEQ_LENGTH }}
- Dataset: {{ DATASET_NAME }}
- Split: {{ SPLIT }}
## 💻 Usage
```python
!pip install -qU transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
model = "{{ username }}/{{ NEW_MODEL_NAME }}"
tokenizer = AutoTokenizer.from_pretrained(model)
# Example prompt
prompt = "Your example prompt here"
# Generate a response
model = AutoModelForCausalLM.from_pretrained(model)
pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
outputs = pipeline(prompt, max_length=50, num_return_sequences=1)
print(outputs[0]["generated_text"])
```
"""
# Create a Jinja template object
jinja_template = Template(template_text.strip())
# Fill the template
content = jinja_template.render(
license=license,
NEW_MODEL_NAME=NEW_MODEL_NAME,
MODEL_ID=MODEL_ID,
learning_rate=learning_rate,
MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,
DATASET_NAME=DATASET_NAME,
SPLIT=SPLIT,
username=username,
)
model.save_pretrained(f"{username}/{NEW_MODEL_NAME}")
tokenizer.save_pretrained(f"{username}/{NEW_MODEL_NAME}")
from google.colab import userdata
# Save the model card
card = ModelCard(content)
card.save(f"{username}/{NEW_MODEL_NAME}/README.md")
# Defined in the secrets tab in Google Colab
api = HfApi(token=userdata.get("HF_TOKEN"))
# Upload the saved model folder to the Hugging Face Hub
api.create_repo(
repo_id=f"{username}/{NEW_MODEL_NAME}",
repo_type="model",
exist_ok=True,
)
api.upload_folder(
repo_id=f"{username}/{NEW_MODEL_NAME}",
folder_path=f"{username}/{NEW_MODEL_NAME}",
)
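Optionally, confirm that the repository is now live on the Hub. This is a small sketch using the same `HfApi` client:

```python
# Optional: fetch repo metadata to confirm the upload succeeded.
info = api.model_info(f"{username}/{NEW_MODEL_NAME}")
print(info.id)
```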