From 698cd9b4fea329203891d6b1525d7530734bea80 Mon Sep 17 00:00:00 2001
From: pks <pks@pks.rocks>
Date: Sun, 30 Nov 2025 21:27:11 +0100
Subject: WIP

---
 make_dataset.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'make_dataset.py')

diff --git a/make_dataset.py b/make_dataset.py
index a3ce0ab..0fb44b2 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 
+import huggingface
 import json
+import os
 
 from datasets import Dataset, Image
 from glob import glob
@@ -8,9 +10,11 @@ from glob import glob
 
 def make_dataset(base="./baseline"):  # TODO: Make actual hf dataset
     prompt = "You are a professional English-German translator and also a renowned photography critic.\n\nWrite a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else." + "<start_of_image>"
+
     user_prompts = []
     images = []
     assistant_replies = []
+
     for filename in glob(f"{base}/*.jsonl"):
         with open(filename, "r") as f:
             data = json.loads(f.read())
@@ -24,4 +28,12 @@ def make_dataset(base="./baseline"):  # TODO: Make actual hf dataset
 
     return Dataset.from_dict({"image": images, "user": user_prompts, "assistant": assistant_replies}).cast_column("image", Image())
 
-dataset = make_dataset()
+
+def main():
+    huggingface.login()
+    dataset = make_dataset()
+    dataset.push_to_repo("asdf2k/caption_translation")
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3