diff options
| author | pks <pks@pks.rocks> | 2025-12-01 22:54:55 +0100 |
|---|---|---|
| committer | pks <pks@pks.rocks> | 2025-12-01 22:54:55 +0100 |
| commit | 1924181a775b6131842d840bfb9554142ca55a3c (patch) | |
| tree | 36af59e9b5888324b9db9f8938f51c9551034665 /make_dataset.py | |
| parent | e7dd88970fddea62006ba7b6620db6a31c97f5ed (diff) | |
WIP
Diffstat (limited to 'make_dataset.py')
| -rwxr-xr-x | make_dataset.py | 15 |
1 files changed, 10 insertions, 5 deletions
diff --git a/make_dataset.py b/make_dataset.py index 65338d5..b7695b3 100755 --- a/make_dataset.py +++ b/make_dataset.py @@ -4,29 +4,34 @@ import huggingface_hub import json import os -from datasets import Dataset, DatasetDict, Image +from datasets import Dataset, DatasetDict from glob import glob +from PIL.Image import Image -def make_dataset(base="./baseline"): # TODO: Make actual hf dataset +def make_dataset(base="./baseline"): prompt = "You are a professional English-German translator and also a renowned photography critic.\n\nWrite a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else." + "<start_of_image>" user_prompts = [] images = [] + file_ids = [] assistant_replies = [] for filename in glob(f"{base}/*.jsonl"): with open(filename, "r") as f: data = json.loads(f.read()) - image_path = f"../d/Images/{os.path.basename(filename).removesuffix(".jsonl")}.jpg" + id_ = os.path.basename(filename) + image_path = f"../d/Images/{id_.removesuffix(".jsonl")}.jpg" + user_prompts.append(prompt) assistant_replies.append(json.dumps({ "English": data["English"], "German": data["Translation"], }, ensure_ascii=False, indent=0)) - images.append(image_path) + ids.append(id_) + images.append(Image(image_path).convert("RGB")) - return Dataset.from_dict({"image": images, "user": user_prompts, "assistant": assistant_replies}).cast_column("image", Image().convert("RGB")) + return Dataset.from_dict({"id": ids, "image": images, "user": user_prompts, "assistant": assistant_replies}) def main(): |
