summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorpks <pks@pks.rocks>2025-11-30 21:27:11 +0100
committerpks <pks@pks.rocks>2025-11-30 21:27:11 +0100
commit698cd9b4fea329203891d6b1525d7530734bea80 (patch)
tree717a972e9ddd1c1250c4838405f2e4d3733fa690
parentd1924e295bce5018ded513c23d863a8b2cfa5d61 (diff)
WIP
-rwxr-xr-xmake_dataset.py14
1 files changed, 13 insertions, 1 deletions
diff --git a/make_dataset.py b/make_dataset.py
index a3ce0ab..0fb44b2 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python
+import huggingface
import json
+import os
from datasets import Dataset, Image
from glob import glob
@@ -8,9 +10,11 @@ from glob import glob
def make_dataset(base="./baseline"): # TODO: Make actual hf dataset
prompt = "You are a professional English-German translator and also a renowned photography critic.\n\nWrite a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else." + "<start_of_image>"
+
user_prompts = []
images = []
assistant_replies = []
+
for filename in glob(f"{base}/*.jsonl"):
with open(filename, "r") as f:
data = json.loads(f.read())
@@ -24,4 +28,12 @@ def make_dataset(base="./baseline"): # TODO: Make actual hf dataset
return Dataset.from_dict({"image": images, "user": user_prompts, "assistant": assistant_replies}).cast_column("image", Image())
-dataset = make_dataset()
+
+def main():
+ huggingface.login()
+ dataset = make_dataset()
+ dataset.push_to_repo("asdf2k/caption_translation")
+
+
+if __name__ == "__main__":
+ main()