merge upstream/master

author: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-11-05 15:29:46 +0100
committer: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-11-05 15:29:46 +0100
commit: 6f29f345dc06c1a1033475eac1d1340781d1d603 (patch)
tree: 6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /python/examples/rampion.py
parent: b510da2e562c695c90d565eb295c749569c59be8 (diff)
parent: c615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff)
1 files changed, 76 insertions, 0 deletions
diff --git a/python/examples/rampion.py b/python/examples/rampion.py
new file mode 100644
index 00000000..30244cf7
--- /dev/null
+++ b/python/examples/rampion.py
@@ -0,0 +1,76 @@
+import argparse
+import logging
+from itertools import izip
+import cdec, cdec.score
+
+def evaluate(hyp, ref):
+    """ Compute BLEU score for a set of hypotheses+references """
+    return sum(cdec.score.BLEU(r).evaluate(h) for h, r in izip(hyp, ref)).score
+
+T1, T2, T3 = 5, 10, 20 # number of iterations (global, CCCP, SSD)
+K = 500 # k-best list size
+C = 1 # regularization coefficient
+eta = 1e-4 # step size
+cost = lambda c: 10 * (1 - c.score) # cost definition
+
+def rampion(decoder, sources, references):
+    # Empty k-best lists
+    candidate_sets = [cdec.score.BLEU(refs).candidate_set() for refs in references]
+    # Weight vector -> sparse
+    w = decoder.weights.tosparse()
+    w0 = w.copy()
+
+    N = len(sources)
+    for t in range(T1):
+        logging.info('Iteration {0}: translating...'.format(t+1))
+        # Get the hypergraphs and extend the k-best lists
+        hgs = []
+        for src, candidates in izip(sources, candidate_sets):
+            hg = decoder.translate(src)
+            hgs.append(hg)
+            candidates.add_kbest(hg, K)
+        # BLEU score for the previous iteration
+        score = evaluate((hg.viterbi() for hg in hgs), references)
+        logging.info('BLEU: {:.2f}'.format(100 * score))
+        logging.info('Optimizing...')
+        for _ in range(T2):
+            # y_i^+, h_i^+; i=1..N
+            plus = [max(candidates, key=lambda c: w.dot(c.fmap) - cost(c)).fmap
+                    for candidates in candidate_sets]
+            for _ in range(T3):
+                for fp, candidates in izip(plus, candidate_sets):
+                    # y^-, h^-
+                    fm = max(candidates, key=lambda c: w.dot(c.fmap) + cost(c)).fmap
+                    # update weights (line 11-12)
+                    w += eta * ((fp - fm) - C/N * (w - w0))
+        logging.info('Updated weight vector: {0}'.format(dict(w)))
+        # Update decoder weights
+        decoder.weights = w
+
+def main():
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', help='cdec config', required=True)
+    parser.add_argument('-w', '--weights', help='initial weights', required=True)
+    parser.add_argument('-r', '--reference', help='reference file', required=True)
+    parser.add_argument('-s', '--source', help='source file', required=True)
+    args = parser.parse_args()
+
+    with open(args.config) as fp:
+        config = fp.read()
+
+    decoder = cdec.Decoder(config)
+    decoder.read_weights(args.weights)
+    with open(args.reference) as fp:
+        references = fp.readlines()
+    with open(args.source) as fp:
+        sources = fp.readlines()
+    assert len(references) == len(sources)
+    rampion(decoder, sources, references)
+
+    for fname, fval in sorted(dict(decoder.weights).iteritems()):
+        print('{0}\t{1}'.format(fname, fval))
+
+if __name__ == '__main__':
+    main()
author	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2012-11-05 15:29:46 +0100
committer	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2012-11-05 15:29:46 +0100
commit	6f29f345dc06c1a1033475eac1d1340781d1d603 (patch)
tree	6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /python/examples/rampion.py
parent	b510da2e562c695c90d565eb295c749569c59be8 (diff)
parent	c615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff)