Friday, August 4, 2023

Tutorial to Finetune LLM on Local Machine

This video shares a step-by-step tutorial on how to fine-tune an AI model on a local machine easily and quickly.

Commands Used:

# Install dependencies (shell/notebook commands — run in a terminal or a
# notebook cell with the `!` prefix, not as plain Python):
#   pip install transformers
#   pip install datasets
#   pip install accelerate

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import transformers

# Fix all RNG seeds (python, numpy, torch) for a reproducible run.
transformers.set_seed(42)

model_checkpoint = "roneneldan/TinyStories-33M"

ds = load_dataset("MohamedRashad/characters_backstories")
# As this dataset has no validation split, we will create one.
ds = ds["train"].train_test_split(test_size=0.2, seed=42)

# We'll create a tokenizer from the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)
# We'll need padding to have same-length sequences in a batch;
# GPT-style models ship with no pad token, so reuse EOS.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(example):
    """Concatenate prompt text and target, tokenize to a fixed length of 128,
    and copy ``input_ids`` into ``labels`` for causal-LM training."""
    merged = example["text"] + " " + example["target"]
    batch = tokenizer(merged, padding='max_length', truncation=True, max_length=128)
    batch["labels"] = batch["input_ids"].copy()
    return batch


# Apply it on our dataset, and remove the original text columns.
# BUG FIX: the transcript had `tokenized_datasets =, remove_columns=...` —
# the `ds.map(tokenize_function, ...)` call was lost; restored here.
tokenized_datasets = ds.map(tokenize_function, remove_columns=["text", "target"])

# We will train a causal (autoregressive) language model from a pretrained checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Define training arguments.
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-characters-backstories",
    num_train_epochs=1,
    logging_steps=1,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    no_cuda=True,  # force cpu use, will be renamed `use_cpu`
)

# We'll use HF Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Let's train!
# Run the fine-tuning loop defined above (CPU, one epoch).
trainer.train()

# Suppress tokenizer warnings emitted during generation.
transformers.logging.set_verbosity_error()

prefix = "Generate Backstory based on following information Character Name: "

prompts = [
    "Spider Character Race: Monkey Character Class: Paladin Output: ",
]

for prompt in prompts:
    # Encode the full prompt, sample a 50-token continuation with
    # nucleus sampling (top_p=0.3), and decode it back to text.
    input_ids = tokenizer.encode(prefix + prompt, return_tensors="pt")
    output = model.generate(input_ids, do_sample=True, max_new_tokens=50, top_p=0.3)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # BUG FIX: the transcript computed `output_text` but never displayed it,
    # so the tutorial's generation step produced no visible result.
    print(output_text)

No comments: