#!/usr/bin/env python
import json
import os
from glob import glob

import huggingface_hub
from datasets import Dataset, DatasetDict, Image


def make_dataset(base="./baseline"):
    # TODO: Make actual hf dataset
    prompt = (
        "You are a professional English-German translator and also a renowned "
        "photography critic.\n\nWrite a detailed caption for this image in a single "
        "sentence. Translate the caption into German. The output needs to be JSON, "
        "the keys being 'English' and 'German' for the respective captions. "
        "Only output the JSON, nothing else."
    )

    user_prompts = []
    images = []
    assistant_replies = []
    for filename in glob(f"{base}/*.jsonl"):
        # Each annotation file is expected to hold a single JSON object with
        # "English" and "Translation" keys.
        with open(filename, "r") as f:
            data = json.loads(f.read())
        # The corresponding image shares its basename with the annotation file.
        image_path = f"../d/Images/{os.path.basename(filename).removesuffix('.jsonl')}.jpg"
        user_prompts.append(prompt)
        assistant_replies.append(json.dumps({
            "English": data["English"],
            "German": data["Translation"],
        }, ensure_ascii=False, indent=0))
        images.append(image_path)

    return Dataset.from_dict(
        {"image": images, "user": user_prompts, "assistant": assistant_replies}
    ).cast_column("image", Image())


def main():
    huggingface_hub.login()
    dataset = make_dataset()
    # First carve off 10% for test, then split ~11.1% of the remainder for dev,
    # giving an overall 80/10/10 train/dev/test ratio.
    splits = dataset.train_test_split(
        test_size=0.1,
        seed=42,
    )
    train_valid = splits["train"].train_test_split(
        test_size=0.111111,
        seed=42,
    )
    dataset = DatasetDict({
        "train": train_valid["train"],
        "dev": train_valid["test"],
        "test": splits["test"],
    })
    dataset.push_to_hub("asdf2k/caption_translation", private=True)


if __name__ == "__main__":
    main()