diff options
| author | pks <pks@pks.rocks> | 2025-11-30 21:27:11 +0100 |
|---|---|---|
| committer | pks <pks@pks.rocks> | 2025-11-30 21:27:11 +0100 |
| commit | 698cd9b4fea329203891d6b1525d7530734bea80 (patch) | |
| tree | 717a972e9ddd1c1250c4838405f2e4d3733fa690 /make_dataset.py | |
| parent | d1924e295bce5018ded513c23d863a8b2cfa5d61 (diff) | |
WIP
Diffstat (limited to 'make_dataset.py')
| -rwxr-xr-x | make_dataset.py | 14 |
1 files changed, 13 insertions, 1 deletions
diff --git a/make_dataset.py b/make_dataset.py index a3ce0ab..0fb44b2 100755 --- a/make_dataset.py +++ b/make_dataset.py @@ -1,6 +1,8 @@ #!/usr/bin/env python +import huggingface import json +import os from datasets import Dataset, Image from glob import glob @@ -8,9 +10,11 @@ from glob import glob def make_dataset(base="./baseline"): # TODO: Make actual hf dataset prompt = "You are a professional English-German translator and also a renowned photography critic.\n\nWrite a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else." + "<start_of_image>" + user_prompts = [] images = [] assistant_replies = [] + for filename in glob(f"{base}/*.jsonl"): with open(filename, "r") as f: data = json.loads(f.read()) @@ -24,4 +28,12 @@ def make_dataset(base="./baseline"): # TODO: Make actual hf dataset return Dataset.from_dict({"image": images, "user": user_prompts, "assistant": assistant_replies}).cast_column("image", Image()) -dataset = make_dataset() + +def main(): + huggingface.login() + dataset = make_dataset() + dataset.push_to_repo("asdf2k/caption_translation") + + +if __name__ == "__main__": + main() |
