summary | refs | log | tree | commit | diff
path: root/make_dataset.py
diff options
context:
space:
mode:
author: pks <pks@pks.rocks> 2025-12-02 13:20:43 +0100
committer: pks <pks@pks.rocks> 2025-12-02 13:20:43 +0100
commit: c0ed7b3ada7f41faaad9a2a64697d6a0e385ed86 (patch)
tree: 5a5af88d08d1392f6d22fda6614efb15826447ab /make_dataset.py
parent: d19bd3fbf54f08db5e563b17c64991ef9b9706a6 (diff)
WIP
Diffstat (limited to 'make_dataset.py')
-rwxr-xr-x make_dataset.py 8
1 files changed, 4 insertions, 4 deletions
diff --git a/make_dataset.py b/make_dataset.py
index b7695b3..b2f2cee 100755
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -6,7 +6,7 @@ import os
from datasets import Dataset, DatasetDict
from glob import glob
-from PIL.Image import Image
+from PIL import Image
def make_dataset(base="./baseline"):
@@ -14,13 +14,13 @@ def make_dataset(base="./baseline"):
user_prompts = []
images = []
- file_ids = []
+ ids = []
assistant_replies = []
for filename in glob(f"{base}/*.jsonl"):
with open(filename, "r") as f:
data = json.loads(f.read())
- id_ = os.path.basename(filename)
+ id_ = os.path.basename(filename).removesuffix(".jsonl")
image_path = f"../d/Images/{id_.removesuffix(".jsonl")}.jpg"
user_prompts.append(prompt)
@@ -29,7 +29,7 @@ def make_dataset(base="./baseline"):
"German": data["Translation"],
}, ensure_ascii=False, indent=0))
ids.append(id_)
- images.append(Image(image_path).convert("RGB"))
+ images.append(Image.open(image_path).convert("RGB"))
return Dataset.from_dict({"id": ids, "image": images, "user": user_prompts, "assistant": assistant_replies})