diff options
| -rwxr-xr-x | make_dataset.py | 25 |
1 files changed, 21 insertions, 4 deletions
diff --git a/make_dataset.py b/make_dataset.py
index 0fb44b2..ea02281 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python
 
-import huggingface
+import huggingface_hub
 import json
 import os
 
-from datasets import Dataset, Image
+from datasets import Dataset, DatasetDict, Image
 from glob import glob
 
 
@@ -30,9 +30,26 @@ def make_dataset(base="./baseline"):
     # TODO: Make actual hf dataset
 
 def main():
-    huggingface.login()
+    huggingface_hub.login()
     dataset = make_dataset()
-    dataset.push_to_repo("asdf2k/caption_translation")
+
+    splits = dataset.train_test_split(
+        test_size=0.1,
+        seed=42,
+    )
+
+    train_valid = splits["train"].train_test_split(
+        test_size=0.111111,
+        seed=42,
+    )
+
+    dataset = DatasetDict({
+        "train": train_valid["train"],
+        "dev": train_valid["test"],
+        "test": splits["test"],
+    })
+
+    dataset.push_to_hub("asdf2k/caption_translation", private=True)
 
 
 if __name__ == "__main__":
