summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorpks <pks@pks.rocks>2025-11-30 21:36:19 +0100
committerpks <pks@pks.rocks>2025-11-30 21:36:19 +0100
commit0a795ca3e10c88651ac529227a7bbba6a80283e2 (patch)
treea8a4d37c7850259807bdc7ed44f694b8ac6a4163
parent698cd9b4fea329203891d6b1525d7530734bea80 (diff)
WIP
-rwxr-xr-xmake_dataset.py25
1 files changed, 21 insertions, 4 deletions
diff --git a/make_dataset.py b/make_dataset.py
index 0fb44b2..ea02281 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python
-import huggingface
+import huggingface_hub
import json
import os
-from datasets import Dataset, Image
+from datasets import Dataset, DatasetDict, Image
from glob import glob
@@ -30,9 +30,26 @@ def make_dataset(base="./baseline"): # TODO: Make actual hf dataset
def main():
-    huggingface.login()
+    huggingface_hub.login()
    dataset = make_dataset()
-    dataset.push_to_repo("asdf2k/caption_translation")
+
+    splits = dataset.train_test_split(  # hold out 10% of all examples as the test split
+        test_size=0.1,
+        seed=42,
+    )
+
+    train_valid = splits["train"].train_test_split(
+        test_size=0.111111,  # ~1/9 of the remaining 90%, i.e. ~10% of the full set for dev
+        seed=42,
+    )
+
+    dataset = DatasetDict({
+        "train": train_valid["train"],
+        "dev": train_valid["test"],
+        "test": splits["test"],
+    })
+
+    dataset.push_to_hub("asdf2k/caption_translation", private=True)  # private=True keeps the Hub repo non-public
if __name__ == "__main__":