From 0a795ca3e10c88651ac529227a7bbba6a80283e2 Mon Sep 17 00:00:00 2001
From: pks
Date: Sun, 30 Nov 2025 21:36:19 +0100
Subject: WIP

---
Notes:
  * main() logs in to the Hugging Face Hub, builds the dataset with
    make_dataset(), splits it into train/dev/test and pushes the resulting
    DatasetDict to "asdf2k/caption_translation".
  * The first split is taken from `dataset` (the value returned by
    make_dataset()); no `ds` name is defined inside main().
  * test_size=0.1 holds out 10% as "test"; test_size=0.111111 (~1/9) of the
    remaining 90% yields a "dev" split of roughly 10% of the full data,
    i.e. an ~80/10/10 train/dev/test split. seed=42 is passed to both
    splits.
  * push_to_hub() takes `private=True` to request a private repository; it
    has no `repo_visibility` keyword.

 make_dataset.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'make_dataset.py')

diff --git a/make_dataset.py b/make_dataset.py
index 0fb44b2..ea02281 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python
 
-import huggingface
+import huggingface_hub
 import json
 import os
 
-from datasets import Dataset, Image
+from datasets import Dataset, DatasetDict, Image
 from glob import glob
 
 
@@ -30,9 +30,26 @@ def make_dataset(base="./baseline"):
     # TODO: Make actual hf dataset
 
 def main():
-    huggingface.login()
+    huggingface_hub.login()
     dataset = make_dataset()
-    dataset.push_to_repo("asdf2k/caption_translation")
+
+    splits = dataset.train_test_split(
+        test_size=0.1,
+        seed=42,
+    )
+
+    train_valid = splits["train"].train_test_split(
+        test_size=0.111111,
+        seed=42,
+    )
+
+    dataset = DatasetDict({
+        "train": train_valid["train"],
+        "dev": train_valid["test"],
+        "test": splits["test"],
+    })
+
+    dataset.push_to_hub("asdf2k/caption_translation", private=True)
 
 
 if __name__ == "__main__":
-- 
cgit v1.2.3