Monday, May 6, 2024

Easy Way to Create an LLM Dataset for Training and Validation Locally

This video is a hands-on tutorial for creating a classification dataset from a text file, split into training, validation, and test datasets.

import urllib.request

import zipfile

import os

from pathlib import Path

# Source archive for the UCI SMS Spam Collection dataset.
# (Restored: the original post had these as empty strings, which breaks the script.)
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"

# Local filename for the downloaded zip archive.
zip_path = "sms_spam_collection.zip"

# Directory the archive is extracted into.
extracted_path = "sms_spam_collection"

# Final path of the tab-separated data file (renamed with a .tsv extension).
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip(url, zip_path, extracted_path, data_file_path):
    """Download the SMS spam zip archive, extract it, and rename the data file.

    Parameters
    ----------
    url : str
        Source URL of the zip archive.
    zip_path : str
        Local path where the downloaded zip is written.
    extracted_path : str
        Directory the archive is extracted into.
    data_file_path : pathlib.Path
        Final path (with .tsv extension) for the extracted data file.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        # Bug fix: without this return, the function fell through and
        # re-downloaded the file despite printing the skip message.
        return

    # Downloading the file (restored: the write call was missing from the post)
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file (restored: the extractall call was missing from the post)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension so pandas/readers recognize the format
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Fetch and prepare the dataset (no-op if the .tsv already exists locally).
download_and_unzip(url, zip_path, extracted_path, data_file_path)

import pandas as pd

# The file is tab-separated with no header row; assign column names explicitly.
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])



def create_balanced_dataset(df):
    """Undersample the majority "ham" class to match the "spam" count.

    Returns a new DataFrame containing every spam row plus an equally
    sized, seeded random subset of ham rows, so both classes are balanced.
    """
    spam_rows = df[df["Label"] == "spam"]

    # Draw exactly as many ham messages as there are spam messages;
    # the fixed random_state keeps the sample reproducible.
    sampled_ham = df[df["Label"] == "ham"].sample(
        spam_rows.shape[0], random_state=123
    )

    # Ham subset first, then all spam rows (same order as the original).
    return pd.concat([sampled_ham, spam_rows])

# Build the class-balanced DataFrame used for the train/validation/test splits.
balanced_df = create_balanced_dataset(df)


Credit: rasbt/LLMs-from-scratch — Implementing a ChatGPT-like LLM from scratch, step by step.

No comments: