From 406c46ce1cfaf56b3b7334152dedd3101d50207e Mon Sep 17 00:00:00 2001
From: pks
Date: Fri, 5 Dec 2025 22:28:02 +0100
Subject: WIP

---
 format-data.py |  46 ---------------
 inference.py   | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 inference2.py  | 179 ---------------------------------------------------------
 test.py        |  75 ------------------------
 4 files changed, 179 insertions(+), 300 deletions(-)
 delete mode 100644 format-data.py
 create mode 100644 inference.py
 delete mode 100644 inference2.py
 delete mode 100644 test.py

diff --git a/format-data.py b/format-data.py
deleted file mode 100644
index 60c13d7..0000000
--- a/format-data.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import json
-
-from glob import glob
-
-from PIL import Image
-
-
-def format_example_for_gemma3_preferences(data, target_score, translation_score):
-    prompt = """<start_of_turn>user
-You are a professional English-German translator and also a renowned photography critic.
-
-
-Write a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else."""
-    translation = f"""<start_of_turn>model
-{data['Translation']}"""
-    target = f"""<start_of_turn>model
-{data['German']}"""
-
-    if target_score == translation_score:
-        return None, None, None
-    elif target_score > translation_score:
-        return prompt, target, translation
-    else:
-        return prompt, translation, target
-
-
-def main():
-    with open("baseline/target.gemba-gpt4.1.scores", "r") as f:
-        scores_target = [int(line.strip()) for line in f.readlines()]
-    with open("baseline/translation.gemba-gpt4.1.scores", "r") as f:
-        scores_translation = [int(line.strip()) for line in f.readlines()]
-
-    for index, filename in enumerate(sorted(glob("baseline/*.jsonl"))):
-        with open(filename, "r") as f:
-            data = json.loads(f.read())
-        prompt, c, r = format_example_for_gemma3_preferences(data, scores_target[index], scores_translation[index])
-        print(f"{c=} {scores_target[index] > scores_translation[index]}")
-
-
-    from transformers import AutoTokenizer
-    model_id = "google/gemma-3-4b-it"
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/inference.py b/inference.py
new file mode 100644
index 0000000..67e633a
--- /dev/null
+++ b/inference.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+
+import argparse
+import datasets
+import json
+import os
+import requests
+import sys
+import torch
+from glob import glob
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForImageTextToText
+
+
+def clean_str(s):
+    return s.removeprefix("```json").removesuffix("```").replace("\n", "").strip()
+
+
+def captioning_prompt(image):
+    return [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": "You are a professional English-German translator and also a renowned photography critic."}]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Write a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else."}
+            ]
+        }
+    ]
+
+
+def captioning_prompt_with_source(image, source):
+    prompt = captioning_prompt(image)
+    prefix = json.dumps({"English": source}).removesuffix("}") + ', "German": "'
+    prompt.append({"role": "assistant", "content": [{"type": "text", "text": prefix}]})
+
+    return prompt
+
+
+def translation_prompt(source):
+    return [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": "You are a professional English-German translator."}]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": f"Translate the following caption into German. The output needs to be JSON, the only key being 'Translation' for the translation. Only output the JSON, nothing else. Caption: {source}"}
+            ]
+        }
+    ]
+
+
+def make_inputs(processor,
+                messages,
+                device):
+    return processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt"
+    ).to(device, dtype=torch.bfloat16)
+
+
+def generate_and_parse(model,
+                       processor,
+                       messages,
+                       args,
+                       example_id=None):
+    sys.stderr.write(f"Processing {example_id=}\n")
+    inputs = make_inputs(processor, messages, model.device)
+    input_len = inputs["input_ids"].shape[-1]
+
+    stop_token_ids = [processor.tokenizer.eos_token_id, processor.tokenizer.convert_tokens_to_ids("<end_of_turn>")]
+
+    with torch.inference_mode():
+        generation = model.generate(
+            **inputs,
+            max_new_tokens=args.max_new_tokens,
+            do_sample=not args.do_not_sample,
+            temperature=args.temperature,
+            top_p=args.top_p,
+            top_k=args.top_k,
+            eos_token_id=stop_token_ids,
+            disable_compile=True,
+        )
+
+    output_tokens = generation[0][input_len:]
+    output_text = clean_str(processor.decode(output_tokens, skip_special_tokens=True))
+
+    try:
+        return json.loads(output_text)
+    except Exception:
+        if example_id is not None:
+            sys.stderr.write(
+                f"Error loading JSON from string '{output_text}' for id={example_id}\n"
+            )
+        else:
+            sys.stderr.write(
+                f"Error loading JSON from string '{output_text}'\n"
+            )
+        return output_text
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="google/gemma-3-4b-it", type=str)
+    parser.add_argument("--attention-implementation", default="eager", type=str)
+    parser.add_argument("--lora-adapter", default=None, type=str)
+    parser.add_argument("--mode", choices=["from_scratch", "with_prefix", "translate"], type=str, required=True)
+    parser.add_argument("--dataset", default="asdf2k/caption_translation", type=str)
+    parser.add_argument("--data-subset", choices=["train", "dev", "test"], default="test", type=str)
+    parser.add_argument("--max-new-tokens", default=300, type=int)
+    parser.add_argument("--top-p", default=1.0, type=float)
+    parser.add_argument("--top-k", default=50, type=int)
+    parser.add_argument("--temperature", default=0.8, type=float)
+    parser.add_argument("--do-not-sample", action="store_true")
+    args = parser.parse_args()
+
+    model = AutoModelForImageTextToText.from_pretrained(
+        args.model,
+        device_map="cuda",
+        dtype=torch.bfloat16,
+        attn_implementation=args.attention_implementation,
+    ).eval()
+    processor = AutoProcessor.from_pretrained(args.model, use_fast=True)
+
+    if args.lora_adapter:
+        from peft import PeftModel
+        model = PeftModel.from_pretrained(model, args.lora_adapter)
+
+    dataset = datasets.load_dataset(args.dataset)[args.data_subset]
+
+    if args.mode == "from_scratch": # Generate caption & translation from scratch
+        for x in dataset:
+            output = generate_and_parse(
+                model,
+                processor,
+                captioning_prompt(x["image"]),
+                args,
+                example_id=x["id"],
+            )
+            print(f"{x['id']}\t{json.dumps(output)}")
+
+    elif args.mode == "translate": # Generate German translation given English source
+        for x in dataset:
+            input_data = json.loads(x["assistant"])
+            output = generate_and_parse(
+                model,
+                processor,
+                translation_prompt(input_data["English"]),
+                args,
+                example_id=x["id"],
+            )
+            output = {"English": input_data["English"], "German": output["Translation"]}
+            print(f"{x['id']}\t{json.dumps(output)}")
+
+    elif args.mode == "with_prefix": # Generate German translation given English caption and image
+        for x in dataset:
+            assistant_output_as_input = json.loads(x["assistant"])
+            output = generate_and_parse(
+                model,
+                processor,
+                captioning_prompt_with_source(x["image"], assistant_output_as_input["English"]),
+                args,
+                example_id=x["id"],
+            )
+            print(f"{x['id']}\t{json.dumps(output)}")
+    else:
+        sys.stderr.write(f"Unknown mode '{args.mode}'\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/inference2.py b/inference2.py
deleted file mode 100644
index 67e633a..0000000
--- a/inference2.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import datasets
-import json
-import os
-import requests
-import sys
-import torch
-from glob import glob
-from PIL import Image
-from transformers import AutoProcessor, AutoModelForImageTextToText
-
-
-def clean_str(s):
-    return s.removeprefix("```json").removesuffix("```").replace("\n", "").strip()
-
-
-def captioning_prompt(image):
-    return [
-        {
-            "role": "system",
-            "content": [{"type": "text", "text": "You are a professional English-German translator and also a renowned photography critic."}]
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": "Write a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else."}
-            ]
-        }
-    ]
-
-
-def captioning_prompt_with_source(image, source):
-    prompt = captioning_prompt(image)
-    prefix = json.dumps({"English": source}).removesuffix("}") + ', "German": "'
-    prompt.append({"role": "assistant", "content": [{"type": "text", "text": prefix}]})
-
-    return prompt
-
-
-def translation_prompt(source):
-    return [
-        {
-            "role": "system",
-            "content": [{"type": "text", "text": "You are a professional English-German translator."}]
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": f"Translate the following caption into German. The output needs to be JSON, the only key being 'Translation' for the translation. Only output the JSON, nothing else. Caption: {source}"}
-            ]
-        }
-    ]
-
-
-def make_inputs(processor,
-                messages,
-                device):
-    return processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(device, dtype=torch.bfloat16)
-
-
-def generate_and_parse(model,
-                       processor,
-                       messages,
-                       args,
-                       example_id=None):
-    sys.stderr.write(f"Processing {example_id=}\n")
-    inputs = make_inputs(processor, messages, model.device)
-    input_len = inputs["input_ids"].shape[-1]
-
-    stop_token_ids = [processor.tokenizer.eos_token_id, processor.tokenizer.convert_tokens_to_ids("<end_of_turn>")]
-
-    with torch.inference_mode():
-        generation = model.generate(
-            **inputs,
-            max_new_tokens=args.max_new_tokens,
-            do_sample=not args.do_not_sample,
-            temperature=args.temperature,
-            top_p=args.top_p,
-            top_k=args.top_k,
-            eos_token_id=stop_token_ids,
-            disable_compile=True,
-        )
-
-    output_tokens = generation[0][input_len:]
-    output_text = clean_str(processor.decode(output_tokens, skip_special_tokens=True))
-
-    try:
-        return json.loads(output_text)
-    except Exception:
-        if example_id is not None:
-            sys.stderr.write(
-                f"Error loading JSON from string '{output_text}' for id={example_id}\n"
-            )
-        else:
-            sys.stderr.write(
-                f"Error loading JSON from string '{output_text}'\n"
-            )
-        return output_text
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="google/gemma-3-4b-it", type=str)
-    parser.add_argument("--attention-implementation", default="eager", type=str)
-    parser.add_argument("--lora-adapter", default=None, type=str)
-    parser.add_argument("--mode", choices=["from_scratch", "with_prefix", "translate"], type=str, required=True)
-    parser.add_argument("--dataset", default="asdf2k/caption_translation", type=str)
-    parser.add_argument("--data-subset", choices=["train", "dev", "test"], default="test", type=str)
-    parser.add_argument("--max-new-tokens", default=300, type=int)
-    parser.add_argument("--top-p", default=1.0, type=float)
-    parser.add_argument("--top-k", default=50, type=int)
-    parser.add_argument("--temperature", default=0.8, type=float)
-    parser.add_argument("--do-not-sample", action="store_true")
-    args = parser.parse_args()
-
-    model = AutoModelForImageTextToText.from_pretrained(
-        args.model,
-        device_map="cuda",
-        dtype=torch.bfloat16,
-        attn_implementation=args.attention_implementation,
-    ).eval()
-    processor = AutoProcessor.from_pretrained(args.model, use_fast=True)
-
-    if args.lora_adapter:
-        from peft import PeftModel
-        model = PeftModel.from_pretrained(model, args.lora_adapter)
-
-    dataset = datasets.load_dataset(args.dataset)[args.data_subset]
-
-    if args.mode == "from_scratch": # Generate caption & translation from scratch
-        for x in dataset:
-            output = generate_and_parse(
-                model,
-                processor,
-                captioning_prompt(x["image"]),
-                args,
-                example_id=x["id"],
-            )
-            print(f"{x['id']}\t{json.dumps(output)}")
-
-    elif args.mode == "translate": # Generate German translation given English source
-        for x in dataset:
-            input_data = json.loads(x["assistant"])
-            output = generate_and_parse(
-                model,
-                processor,
-                translation_prompt(input_data["English"]),
-                args,
-                example_id=x["id"],
-            )
-            output = {"English": input_data["English"], "German": output["Translation"]}
-            print(f"{x['id']}\t{json.dumps(output)}")
-
-    elif args.mode == "with_prefix": # Generate German translation given English caption and image
-        for x in dataset:
-            assistant_output_as_input = json.loads(x["assistant"])
-            output = generate_and_parse(
-                model,
-                processor,
-                captioning_prompt_with_source(x["image"], assistant_output_as_input["English"]),
-                args,
-                example_id=x["id"],
-            )
-            print(f"{x['id']}\t{json.dumps(output)}")
-    else:
-        sys.stderr.write(f"Unknown mode '{args.mode}'\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test.py b/test.py
deleted file mode 100644
index 35ceb5f..0000000
--- a/test.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import torch
-import requests
-from PIL import Image
-from transformers import AutoProcessor, AutoModelForCausalLM
-
-from glob import glob
-import json
-import os
-from datasets import Dataset
-
-
-def make_chat_data(base="./baseline"):
-    dataset = []
-    for filename in sorted(glob(f"{base}/*.jsonl"))[0:1]:
-        with open(filename, "r") as f:
-            data = json.loads(f.read())
-        image_path = f"../Images/{os.path.basename(filename).removesuffix(".jsonl")}.jpg"
-        image = Image.open(image_path).convert("RGB")
-        chat = [{
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "You are a professional English-German translator and also a renowned photography critic.\n\nWrite a detailed caption for this image in a single sentence. Translate the caption into German. The output needs to be JSON, the keys being 'English' and 'German' for the respective captions. Only output the JSON, nothing else."},
-                {"type": "image"}
-            ]
-        },
-        #{ "role": "assistant",
-        #  "content": [{"type": "text", "text": '{"English": ' + json.dumps(data["English"]) + ', "German": ' + json.dumps(data["Translation"]) + '}'}]
-        #}
-        ]
-        item = {"image": image, "chat": chat}
-        dataset.append(item)
-
-    return Dataset.from_list(dataset)
-
-
-model_id = "google/gemma-3-4b-it"
-processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    dtype=torch.bfloat16,
-    device_map="auto",
-    attn_implementation="eager",
-)
-device = model.device
-dataset = make_chat_data()
-chat_prompt = processor.tokenizer.apply_chat_template(
-    [item["chat"] for item in dataset],
-    tokenize=False,
-    add_generation_prompt=True,
-)
-
-print(dataset[0])
-
-inputs = processor(
-    text=chat_prompt,
-    images=[item["image"] for item in dataset],
-    return_tensors="pt"
-).to(device)
-
-print("Keys in the output:", inputs.keys())
-
-input_ids = inputs["input_ids"]
-print("\nShape of input_ids:", input_ids.shape)
-print("input_ids:", input_ids)
-decoded_text = processor.decode(input_ids[0], skip_special_tokens=False)
-print("\nDecoded input_ids (showing special tokens):")
-print(decoded_text)
-pixel_values = inputs["pixel_values"]
-print("\n--- Generating a Response ---")
-output = model.generate(
-    **inputs,
-    max_new_tokens=100
-)
-generated_text = processor.decode(output[0], skip_special_tokens=True)
-print("\nModel's response:\n", generated_text)
--
cgit v1.2.3
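
Note on the output format (not part of the patch): in every mode, inference.py prints one "id<TAB>JSON" line per example to stdout. The snippet below is a minimal post-processing sketch under that assumption; the 'English'/'German' keys match the from_scratch and translate modes, and the script name and file handling are illustrative only.

# extract_german.py -- illustrative helper, not part of the patch.
# Reads the "id<TAB>JSON" lines printed by inference.py and emits
# "id<TAB>German caption" for downstream scoring.
import json
import sys


def read_outputs(path):
    outputs = {}
    with open(path, "r") as f:
        for line in f:
            example_id, _, payload = line.rstrip("\n").partition("\t")
            try:
                parsed = json.loads(payload)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                outputs[example_id] = parsed
            else:
                # generate_and_parse() falls back to the raw model output when
                # JSON parsing fails, so such lines are skipped here.
                sys.stderr.write(f"Skipping unparsable output for id={example_id}\n")
    return outputs


if __name__ == "__main__":
    for example_id, data in sorted(read_outputs(sys.argv[1]).items()):
        print(f"{example_id}\t{data.get('German', '')}")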