import json
from glob import glob
from typing import Any, Iterable, List, Mapping, Optional, Tuple

from PIL import Image  # noqa: F401  (not referenced below; kept so the file's imports stay unchanged)

# Instruction text used as the prompt of every preference example.
# NOTE(review): the literals in this file carry only the bare role names
# ("user" / "model") followed by a space. Gemma chat formatting normally wraps
# turns in control tokens -- confirm these strings are what training expects.
_PROMPT = (
    "user You are a professional English-German translator and also a "
    "renowned photography critic. Write a detailed caption for this image "
    "in a single sentence. Translate the caption into German. The output "
    "needs to be JSON, the keys being 'English' and 'German' for the "
    "respective captions. Only output the JSON, nothing else."
)


def format_example_for_gemma3_preferences(
    data: Mapping[str, Any],
    target_score: float,
    translation_score: float,
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Build a ``(prompt, preferred, other)`` triple from one scored record.

    Args:
        data: Record providing the ``'Translation'`` and ``'German'`` entries.
        target_score: Score assigned to ``data['German']``.
        translation_score: Score assigned to ``data['Translation']``.

    Returns:
        ``(prompt, higher_scored_text, lower_scored_text)``, where both texts
        are prefixed with ``"model "``. When the two scores are equal there is
        no preference to express and ``(None, None, None)`` is returned.

    Raises:
        KeyError: If ``data`` lacks ``'Translation'`` or ``'German'``. Both
            keys are read before the scores are compared, so this also
            applies to ties.
    """
    translation = f"model {data['Translation']}"
    target = f"model {data['German']}"

    if target_score == translation_score:
        return None, None, None
    if target_score > translation_score:
        return _PROMPT, target, translation
    return _PROMPT, translation, target


def parse_scores(lines: Iterable[str]) -> List[int]:
    """Parse one integer score per line, ignoring trailing blank lines.

    Only blank lines at the *end* are dropped. A blank line between scores
    still raises ``ValueError`` (from ``int``), because skipping it silently
    would shift every later score onto the wrong example.

    Args:
        lines: Text lines, e.g. an open text file.

    Returns:
        The scores in input order; an empty list for empty input.

    Raises:
        ValueError: If a remaining line is not a valid integer.
    """
    rows = [line.strip() for line in lines]
    while rows and not rows[-1]:
        rows.pop()
    return [int(row) for row in rows]


def main() -> None:
    """Print the preferred completion for every example under ``baseline/``.

    The two score files are read line by line, and the i-th score is paired
    with the i-th ``baseline/*.jsonl`` file in sorted filename order.

    Raises:
        IndexError: If there are more example files than scores in either
            score file.
    """
    # UTF-8 is given explicitly: the records contain German text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open("baseline/target.gemba-gpt4.1.scores", "r", encoding="utf-8") as f:
        scores_target = parse_scores(f)
    with open("baseline/translation.gemba-gpt4.1.scores", "r", encoding="utf-8") as f:
        scores_translation = parse_scores(f)

    filenames = sorted(glob("baseline/*.jsonl"))

    # Fail before producing any output instead of part-way through the loop.
    available = min(len(scores_target), len(scores_translation))
    if len(filenames) > available:
        raise IndexError(
            f"found {len(filenames)} example files but only {available} scores "
            f"(target: {len(scores_target)}, translation: {len(scores_translation)})"
        )

    for filename, target_score, translation_score in zip(
        filenames, scores_target, scores_translation
    ):
        # Each file is parsed as a single JSON document.
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)

        # The local must stay named ``c``: ``{c=}`` prints the name itself.
        _prompt, c, _other = format_example_for_gemma3_preferences(
            data, target_score, translation_score
        )
        print(f"{c=} {target_score > translation_score}")

    # Imported here rather than at module level, so importing this module
    # does not require ``transformers``.
    from transformers import AutoTokenizer

    model_id = "google/gemma-3-4b-it"
    # NOTE(review): the tokenizer is loaded but not used by the code visible here.
    tokenizer = AutoTokenizer.from_pretrained(model_id)


if __name__ == "__main__":
    main()