summary | refs | log | tree | commit | diff
path: root/make_dataset.py
diff options
context:
space:
mode:
Diffstat (limited to 'make_dataset.py')
-rwxr-xr-x make_dataset.py 14
1 files changed, 13 insertions, 1 deletions
diff --git a/make_dataset.py b/make_dataset.py
index a3ce0ab..0fb44b2 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python
+import huggingface
import json
+import os
from datasets import Dataset, Image
from glob import glob
@@ -8,9 +10,11 @@ from glob import glob
def make_dataset(base="./baseline"): # TODO: Make actual hf dataset
prompt = "You are a professional English-German translator and also a renowned photography critic.\n\nWrite a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else." + "<start_of_image>"
+
user_prompts = []
images = []
assistant_replies = []
+
for filename in glob(f"{base}/*.jsonl"):
with open(filename, "r") as f:
data = json.loads(f.read())
@@ -24,4 +28,12 @@ def make_dataset(base="./baseline"): # TODO: Make actual hf dataset
return Dataset.from_dict({"image": images, "user": user_prompts, "assistant": assistant_replies}).cast_column("image", Image())
-dataset = make_dataset()
+
+def main():
+ huggingface.login()
+ dataset = make_dataset()
+ dataset.push_to_repo("asdf2k/caption_translation")
+
+
+if __name__ == "__main__":
+ main()