From 7b8f054c80b0aecaadd8ae4fc6a6fe37cf1c749f Mon Sep 17 00:00:00 2001 From: pks Date: Thu, 20 Nov 2025 20:47:31 +0100 Subject: init --- format-data.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 format-data.py (limited to 'format-data.py') diff --git a/format-data.py b/format-data.py new file mode 100644 index 0000000..60c13d7 --- /dev/null +++ b/format-data.py @@ -0,0 +1,46 @@ +import json + +from glob import glob + +from PIL import Image + + +def format_example_for_gemma3_preferences(data, target_score, translation_score): + prompt = """user +You are a professional English-German translator and also a renowned photography critic. + + +Write a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else.""" + translation = f"""model +{data['Translation']}""" + target = f"""model +{data['German']}""" + + if target_score == translation_score: + return None, None, None + elif target_score > translation_score: + return prompt, target, translation + else: + return prompt, translation, target + + +def main(): + with open("baseline/target.gemba-gpt4.1.scores", "r") as f: + scores_target = [int(line.strip()) for line in f.readlines()] + with open("baseline/translation.gemba-gpt4.1.scores", "r") as f: + scores_translation = [int(line.strip()) for line in f.readlines()] + + for index, filename in enumerate(sorted(glob("baseline/*.jsonl"))): + with open(filename, "r") as f: + data = json.loads(f.read()) + prompt, c, r = format_example_for_gemma3_preferences(data, scores_target[index], scores_translation[index]) + print(f"{c=} {scores_target[index] > scores_translation[index]}") + + + from transformers import AutoTokenizer + model_id = "google/gemma-3-4b-it" + tokenizer = AutoTokenizer.from_pretrained(model_id) + + +if __name__ == "__main__": + main() -- cgit v1.2.3