diff options
| author | pks <pks@pks.rocks> | 2025-11-30 21:36:19 +0100 |
|---|---|---|
| committer | pks <pks@pks.rocks> | 2025-11-30 21:36:19 +0100 |
| commit | 0a795ca3e10c88651ac529227a7bbba6a80283e2 (patch) | |
| tree | a8a4d37c7850259807bdc7ed44f694b8ac6a4163 /make_dataset.py | |
| parent | 698cd9b4fea329203891d6b1525d7530734bea80 (diff) | |
WIP
Diffstat (limited to 'make_dataset.py')
| -rwxr-xr-x | make_dataset.py | 25 |
1 file changed, 21 insertions, 4 deletions
```diff
diff --git a/make_dataset.py b/make_dataset.py
index 0fb44b2..ea02281 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python
 
-import huggingface
+import huggingface_hub
 import json
 import os
 
-from datasets import Dataset, Image
+from datasets import Dataset, DatasetDict, Image
 from glob import glob
 
 
@@ -30,9 +30,26 @@ def make_dataset(base="./baseline"):
     # TODO: Make actual hf dataset
 
 def main():
-    huggingface.login()
+    huggingface_hub.login()
     dataset = make_dataset()
-    dataset.push_to_repo("asdf2k/caption_translation")
+
+    splits = ds.train_test_split(
+        test_size=0.1,
+        seed=42,
+    )
+
+    train_valid = splits["train"].train_test_split(
+        test_size=0.111111,
+        seed=42,
+    )
+
+    dataset = DatasetDict({
+        "train": train_valid["train"],
+        "dev": train_valid["test"],
+        "test": splits["test"],
+    })
+
+    dataset.push_to_hub("asdf2k/caption_translation", repo_visibility="private")
 
 
 if __name__ == "__main__":
```
