From 698cd9b4fea329203891d6b1525d7530734bea80 Mon Sep 17 00:00:00 2001
From: pks
Date: Sun, 30 Nov 2025 21:27:11 +0100
Subject: WIP

---
Reviewer notes (text between "---" and the diffstat is ignored by git am):
- login() is provided by the huggingface_hub package, so the added import
  and the call in main() use huggingface_hub instead of huggingface.
- datasets.Dataset uploads with push_to_hub(); push_to_repo() is not part
  of its API, so the call in main() was corrected.
- "import os" is not used by any line shown in this patch.
- Line breaks, blank context lines and indentation were reconstructed from
  a copy whose whitespace had been collapsed; confirm them against the
  tree before applying. The "index" blob ids are those of the original
  patch and do not reflect the two corrections above.

 make_dataset.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'make_dataset.py')

diff --git a/make_dataset.py b/make_dataset.py
index a3ce0ab..0fb44b2 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 
+import huggingface_hub
 import json
+import os
 
 from datasets import Dataset, Image
 from glob import glob
@@ -8,9 +10,11 @@ from glob import glob
 
 def make_dataset(base="./baseline"): # TODO: Make actual hf dataset
     prompt = "You are a professional English-German translator and also a renowned photography critic.\n\nWrite a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else." + ""
+
     user_prompts = []
     images = []
     assistant_replies = []
+
     for filename in glob(f"{base}/*.jsonl"):
         with open(filename, "r") as f:
             data = json.loads(f.read())
@@ -24,4 +28,12 @@ def make_dataset(base="./baseline"): # TODO: Make actual hf dataset
 
     return Dataset.from_dict({"image": images, "user": user_prompts, "assistant": assistant_replies}).cast_column("image", Image())
 
-dataset = make_dataset()
+
+def main():
+    huggingface_hub.login()
+    dataset = make_dataset()
+    dataset.push_to_hub("asdf2k/caption_translation")
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3