#!/usr/bin/env python
"""Build an English/German image-caption SFT dataset and push it to the Hugging Face Hub."""

import json
import os
from glob import glob

import huggingface_hub
from datasets import Dataset, DatasetDict
from PIL import Image


def make_dataset(base="./baseline"):
    """Assemble a captioning/translation dataset from per-image JSON files.

    Each ``<id>.jsonl`` file under *base* is expected to contain a single JSON
    object with ``"English"`` (caption) and ``"Translation"`` (German) keys;
    the matching image is loaded from ``../d/Images/<id>.jpg``.

    Args:
        base: Directory containing the ``*.jsonl`` annotation files.

    Returns:
        A ``datasets.Dataset`` with columns ``id``, ``image``, ``user``
        (the fixed instruction prompt) and ``assistant`` (the expected
        JSON reply).
    """
    # Fixed instruction prompt, identical for every example.
    prompt = "You are a professional English-German translator and also a renowned photography critic.\n\nWrite a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else."
    user_prompts = []
    images = []
    ids = []
    assistant_replies = []
    for filename in glob(f"{base}/*.jsonl"):
        # NOTE(review): despite the .jsonl extension, each file is parsed as a
        # single JSON object, not line-delimited records — confirm upstream format.
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
        id_ = os.path.basename(filename).removesuffix(".jsonl")
        # id_ already has the suffix stripped; the original re-stripped it via a
        # same-quote nested f-string, which is a SyntaxError before Python 3.12.
        image_path = f"../d/Images/{id_}.jpg"
        user_prompts.append(prompt)
        # The target reply is the gold caption pair, serialized exactly as the
        # model is asked to output it (raw JSON, non-ASCII preserved).
        assistant_replies.append(json.dumps({
            "English": data["English"],
            "German": data["Translation"],
        }, ensure_ascii=False, indent=0))
        ids.append(id_)
        # Force RGB so grayscale/CMYK/palette images get a uniform mode.
        images.append(Image.open(image_path).convert("RGB"))
    return Dataset.from_dict({
        "id": ids,
        "image": images,
        "user": user_prompts,
        "assistant": assistant_replies,
    })


def main():
    """Log in to the Hub, build the dataset, split 80/10/10, and push it privately."""
    huggingface_hub.login()
    dataset = make_dataset()
    # First carve off 10% as the final test split.
    splits = dataset.train_test_split(
        test_size=0.1,
        seed=42,
    )
    # Then take ~11.1% of the remaining 90% as dev, i.e. ~10% of the whole,
    # leaving an ~80/10/10 train/dev/test split overall.
    train_valid = splits["train"].train_test_split(
        test_size=0.111111,
        seed=42,
    )
    dataset = DatasetDict({
        "train": train_valid["train"],
        "dev": train_valid["test"],
        "test": splits["test"],
    })
    dataset.push_to_hub("asdf2k/caption_translation", private=True)


if __name__ == "__main__":
    main()