From f63027cecd0649b4d30e3f288e1311f9f27f1b5b Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 2 Feb 2012 17:19:40 -0500
Subject: remove some dead code to clean things up

---
 decoder/1dev.ur                 |   1 -
 decoder/apply_fsa_models.README |  21 ------
 decoder/cdec-gz.ini             |   7 --
 decoder/cdec-nolm-tuned.ini     |   7 --
 decoder/decode.sh               |  10 ---
 decoder/do.tests.sh             |   1 -
 decoder/fsa-decode.sh           |   3 -
 decoder/fsa-hiero.ini           |   5 --
 decoder/fsa.ini                 |   2 -
 decoder/glue-lda.scfg           |   8 ---
 decoder/grammar.hiero           | 151 ----------------------------------------
 decoder/perro.sh                |   1 -
 decoder/perro.ur                |   1 -
 decoder/short.ur                |   1 -
 decoder/weights-fsa             |  14 ----
 decoder/weights.hiero           |  10 ---
 16 files changed, 243 deletions(-)
 delete mode 100755 decoder/1dev.ur
 delete mode 100755 decoder/apply_fsa_models.README
 delete mode 100755 decoder/cdec-gz.ini
 delete mode 100755 decoder/cdec-nolm-tuned.ini
 delete mode 100755 decoder/decode.sh
 delete mode 100755 decoder/do.tests.sh
 delete mode 100755 decoder/fsa-decode.sh
 delete mode 100755 decoder/fsa-hiero.ini
 delete mode 100755 decoder/fsa.ini
 delete mode 100755 decoder/glue-lda.scfg
 delete mode 100755 decoder/grammar.hiero
 delete mode 100755 decoder/perro.sh
 delete mode 100755 decoder/perro.ur
 delete mode 100755 decoder/short.ur
 delete mode 100644 decoder/weights-fsa
 delete mode 100755 decoder/weights.hiero

(limited to 'decoder')

diff --git a/decoder/1dev.ur b/decoder/1dev.ur
deleted file mode 100755
index adeaa101..00000000
--- a/decoder/1dev.ur
+++ /dev/null
@@ -1 +0,0 @@
-krAcy ( AstRAf rpwrtRr ) krAcy myN pyr kw mxtlf HAdvAt myN xAtwn smyt 4 AfrAd hlAk hw gyY jbkh smndr sY Ayk $xS ky lA$ mly .
diff --git a/decoder/apply_fsa_models.README b/decoder/apply_fsa_models.README
deleted file mode 100755
index 7e116a62..00000000
--- a/decoder/apply_fsa_models.README
+++ /dev/null
@@ -1,21 +0,0 @@
-trie root and trie lhs2[lhs-nodeid] -> trie node
-
-trie node edges (adj) - list of w,dest,p.  dest==0 means it's a completed rule (note: p is redundant with node e.dest->p-p, except in case of dest=0).  we will also use null_wordid (max_int) for dest=0 edges, but that doesn't matter
-
-we intersect by iterating over adj and scoring w/ fsa.  TODO: index for sparse fsa; for now we assume smoothed ngram fsa where all items are scorable.
-
-predicted items: we don't make copies of the pending predictions as we scan toward completion; instead, item backpointers are followed until the prediction (where backpointer=0).  such backpointer=0 items have a queue of prediction-originating items.
-
-reusing completed items using a lookup on pair [NT,a] -> all [NT,a,b] lazy best-first.  b-next (right state) index in lazy index.
-
-perhaps predictors need to register the # of items it has already mated with. (b-next index)
-
-comb-like (cube) t-next (position in trie node edge list), b-next?  or just check chart and don't redup.  depends on whether we want just 1best or kbest deriv - diff. ways of reaching same result are good in kbest.
-
-types of chart items:
-
-A->t.*,a,b (trie node t) with mutable state t-next for generating successor lazily (vs. all at once)
-
-A->t.B,a,b (t-next of A->t.* points to (B,t')): mutable state b-next for choosing which B->b,? to use.  note: such an item can't be queued immediately on its own, but can be added to the pending list of B->b,? ; once any B->b,? is completed then we see if any more b-next are already known; if they're exhausted then we add back to pending list?
-
-A->a,? - list of all known (b,inside prob) such that A[a,b].  we may also choose to represent this as A->.*,a,a.
diff --git a/decoder/cdec-gz.ini b/decoder/cdec-gz.ini
deleted file mode 100755
index f9b15420..00000000
--- a/decoder/cdec-gz.ini
+++ /dev/null
@@ -1,7 +0,0 @@
-cubepruning_pop_limit=200
-feature_function=WordPenalty
-feature_function=ArityPenalty
-add_pass_through_rules=true
-formalism=scfg
-grammar=mt09.grammar.gz
-weights=weights.tune.nolm
diff --git a/decoder/cdec-nolm-tuned.ini b/decoder/cdec-nolm-tuned.ini
deleted file mode 100755
index 5ebab747..00000000
--- a/decoder/cdec-nolm-tuned.ini
+++ /dev/null
@@ -1,7 +0,0 @@
-cubepruning_pop_limit=200
-feature_function=WordPenalty
-feature_function=ArityPenalty
-add_pass_through_rules=true
-formalism=scfg
-grammar=mt09.grammar
-weights=weights.tune.nolm
diff --git a/decoder/decode.sh b/decoder/decode.sh
deleted file mode 100755
index 677e64ad..00000000
--- a/decoder/decode.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-d=$(dirname `readlink -f $0`)/
-decode() {
-if [ "$lm" ] ; then
-    lmargs0=-F
-    lmargs1="LanguageModel lm.gz -n LM"
-fi
-set -x
-$gdb ${cdec:=$d/cdec} -c $d/${cfg:=cdec-fsa}.ini -i $d/${in:=1dev.ur} $lmargs0 "$lmargs1" --show_features --show_config --show_weights "$@"
-set +x
-}
diff --git a/decoder/do.tests.sh b/decoder/do.tests.sh
deleted file mode 100755
index b3ddeb18..00000000
--- a/decoder/do.tests.sh
+++ /dev/null
@@ -1 +0,0 @@
-for f in *_test; do ./$f; done
diff --git a/decoder/fsa-decode.sh b/decoder/fsa-decode.sh
deleted file mode 100755
index 66879523..00000000
--- a/decoder/fsa-decode.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-d=$(dirname `readlink -f $0`)/
-. $d/decode.sh
-in=1dev.ur cfg=cdec-fsa decode
diff --git a/decoder/fsa-hiero.ini b/decoder/fsa-hiero.ini
deleted file mode 100755
index 7c7d0347..00000000
--- a/decoder/fsa-hiero.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-formalism=scfg
-scfg_extra_glue_grammar=glue-lda.scfg
-grammar=grammar.hiero
-show_tree_structure=true
-weights=weights.hiero
diff --git a/decoder/fsa.ini b/decoder/fsa.ini
deleted file mode 100755
index 571a2e34..00000000
--- a/decoder/fsa.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-feature_function=ShorterThanPrev
-feature_function=LongerThanPrev
diff --git a/decoder/glue-lda.scfg b/decoder/glue-lda.scfg
deleted file mode 100755
index 27489817..00000000
--- a/decoder/glue-lda.scfg
+++ /dev/null
@@ -1,8 +0,0 @@
-[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X0,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X1,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X1,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X2,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X2,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X3,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X3,1] ||| [1] ||| GlueTop=1
diff --git a/decoder/grammar.hiero b/decoder/grammar.hiero
deleted file mode 100755
index 79adf33a..00000000
--- a/decoder/grammar.hiero
+++ /dev/null
@@ -1,151 +0,0 @@
-[X] ||| . ||| . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] . ||| [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] anciano ||| [1] old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] anciano . ||| [1] old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] anciano [X,2] ||| [1] old man [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] feo ||| ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] feo . ||| ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] feo [X,2] ||| ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato ||| [1] cat ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato . ||| [1] cat . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] ||| [1] [2] cat ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] ||| [1] cat [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] . ||| [1] [2] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro ||| [1] black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro . ||| [1] black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro [X,2] ||| [1] black cat [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande ||| big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande . ||| big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande [X,2] ||| big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro ||| black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro . ||| black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro [X,2] ||| black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga ||| [1] caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga . ||| [1] caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga [X,2] ||| [1] caterpiller [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito [X,2] ||| [1] [2] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito [X,2] . ||| [1] [2] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo ||| [1] ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo . ||| [1] ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo [X,2] ||| [1] ugly duckling [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces ||| [1] fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces . ||| [1] fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces [X,2] ||| [1] fish [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro ||| [1] dog ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro . ||| [1] dog . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] ||| [1] dog [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] ||| [1] [2] dog ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] . ||| [1] [2] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande ||| [1] big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande . ||| [1] big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande [X,2] ||| [1] big dog [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro [X,2] ||| [1] [2] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro [X,2] . ||| [1] [2] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro ||| [1] black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro . ||| [1] black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro [X,2] ||| [1] black bird [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| anciano ||| old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| anciano . ||| old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| anciano [X,1] ||| old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| el ||| the ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] ||| the [1] ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] . ||| the [1] . ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo ||| the ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo . ||| the ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo [X,2] ||| the ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande ||| the big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande . ||| the big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande [X,2] ||| the big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro ||| the black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro . ||| the black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro [X,2] ||| the black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato ||| the cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato . ||| the cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] ||| the [1] cat ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] ||| the cat [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] . ||| the [1] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro ||| the black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro . ||| the black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro [X,1] ||| the black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito [X,1] ||| the [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito [X,1] . ||| the [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo ||| the ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo . ||| the ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo [X,1] ||| the ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro ||| the dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro . ||| the dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] ||| the [1] dog ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] ||| the dog [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] . ||| the [1] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande ||| the big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande . ||| the big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande [X,1] ||| the big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro [X,1] ||| the [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro [X,1] . ||| the [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro ||| the black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro . ||| the black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro [X,1] ||| the black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| eso ||| that ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso [X,1] ||| that [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso [X,1] . ||| that [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro ||| that dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro . ||| that dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro [X,1] ||| that dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este ||| this ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este [X,1] ||| this [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este [X,1] . ||| this [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este anciano ||| this old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este anciano . ||| this old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este anciano [X,1] ||| this old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este gato ||| this cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este gato . ||| this cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este gato [X,1] ||| this cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| feo ||| ugly ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato ||| cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato . ||| cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] ||| [1] cat ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] ||| cat [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] . ||| [1] cat . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro ||| black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro . ||| black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro [X,1] ||| black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| grande ||| big ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| la ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga ||| the caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga . ||| the caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga [X,1] ||| the caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces ||| the fish ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces . ||| the fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces [X,1] ||| the fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| negro ||| black ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga ||| caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga . ||| caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga [X,1] ||| caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito ||| duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito [X,1] ||| [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito [X,1] . ||| [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo ||| ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo . ||| ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo [X,1] ||| ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces ||| fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces . ||| fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces [X,1] ||| fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro ||| dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro . ||| dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] ||| [1] dog ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] ||| dog [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] . ||| [1] dog . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande ||| big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande . ||| big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande [X,1] ||| big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro ||| bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro [X,1] ||| [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro [X,1] . ||| [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro ||| black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro . ||| black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro [X,1] ||| black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
diff --git a/decoder/perro.sh b/decoder/perro.sh
deleted file mode 100755
index 3e54ac71..00000000
--- a/decoder/perro.sh
+++ /dev/null
@@ -1 +0,0 @@
-$gdb $cdec "$@" -k 30 --show_features -c fsa-hiero.ini -i perro.ur 
diff --git a/decoder/perro.ur b/decoder/perro.ur
deleted file mode 100755
index 6c5da6d7..00000000
--- a/decoder/perro.ur
+++ /dev/null
@@ -1 +0,0 @@
-eso perro feo
diff --git a/decoder/short.ur b/decoder/short.ur
deleted file mode 100755
index 48612801..00000000
--- a/decoder/short.ur
+++ /dev/null
@@ -1 +0,0 @@
-krAcy myN pyr kw mxtlf HAdvAt
diff --git a/decoder/weights-fsa b/decoder/weights-fsa
deleted file mode 100644
index 3cc96c2f..00000000
--- a/decoder/weights-fsa
+++ /dev/null
@@ -1,14 +0,0 @@
-Arity_0 1.70741473606976
-Arity_1 1.12426238048012
-Arity_2 1.14986187839554
-Glue -0.04589037041388
-LanguageModel 1.09051
-LM 1.09051
-PassThrough -3.66226367902928
-PhraseModel_0 -1.94633451863252
-PhraseModel_1 -0.1475347695476
-PhraseModel_2 -1.614818994946
-WordPenalty -3.0
-WordPenaltyFsa -0.56028442964748
-ShorterThanPrev -10
-LongerThanPrev -10
diff --git a/decoder/weights.hiero b/decoder/weights.hiero
deleted file mode 100755
index 6747f059..00000000
--- a/decoder/weights.hiero
+++ /dev/null
@@ -1,10 +0,0 @@
-SameFirstLetter 1
-LongerThanPrev 1
-ShorterThanPrev 1
-GlueTop 0.0
-Glue -1.0
-EgivenF -0.5
-FgivenE -0.5
-LexEgivenF -0.5
-LexFgivenE -0.5
-LM 1
-- 
cgit v1.2.3


From 99f221dfd9b1c086baca8b675920f4aecea8aca9 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <kenlm@kheafield.com>
Date: Fri, 10 Feb 2012 14:13:24 -0500
Subject: Dear windows users, code is not executable

---
 decoder/apply_fsa_models.h | 0
 decoder/cfg.cc             | 0
 decoder/cfg.h              | 0
 decoder/cfg_binarize.h     | 0
 decoder/cfg_format.h       | 0
 decoder/cfg_options.h      | 0
 decoder/cfg_test.cc        | 0
 decoder/ff_register.h      | 0
 decoder/ff_sample_fsa.h    | 0
 decoder/hg_cfg.h           | 0
 decoder/hg_test.h          | 0
 decoder/nt_span.h          | 0
 decoder/oracle_bleu.h      | 0
 decoder/program_options.h  | 0
 decoder/sentences.h        | 0
 15 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 decoder/apply_fsa_models.h
 mode change 100755 => 100644 decoder/cfg.cc
 mode change 100755 => 100644 decoder/cfg.h
 mode change 100755 => 100644 decoder/cfg_binarize.h
 mode change 100755 => 100644 decoder/cfg_format.h
 mode change 100755 => 100644 decoder/cfg_options.h
 mode change 100755 => 100644 decoder/cfg_test.cc
 mode change 100755 => 100644 decoder/ff_register.h
 mode change 100755 => 100644 decoder/ff_sample_fsa.h
 mode change 100755 => 100644 decoder/hg_cfg.h
 mode change 100755 => 100644 decoder/hg_test.h
 mode change 100755 => 100644 decoder/nt_span.h
 mode change 100755 => 100644 decoder/oracle_bleu.h
 mode change 100755 => 100644 decoder/program_options.h
 mode change 100755 => 100644 decoder/sentences.h

(limited to 'decoder')

diff --git a/decoder/apply_fsa_models.h b/decoder/apply_fsa_models.h
old mode 100755
new mode 100644
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
old mode 100755
new mode 100644
diff --git a/decoder/cfg.h b/decoder/cfg.h
old mode 100755
new mode 100644
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
old mode 100755
new mode 100644
diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h
old mode 100755
new mode 100644
diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h
old mode 100755
new mode 100644
diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc
old mode 100755
new mode 100644
diff --git a/decoder/ff_register.h b/decoder/ff_register.h
old mode 100755
new mode 100644
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
old mode 100755
new mode 100644
diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h
old mode 100755
new mode 100644
diff --git a/decoder/hg_test.h b/decoder/hg_test.h
old mode 100755
new mode 100644
diff --git a/decoder/nt_span.h b/decoder/nt_span.h
old mode 100755
new mode 100644
diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h
old mode 100755
new mode 100644
diff --git a/decoder/program_options.h b/decoder/program_options.h
old mode 100755
new mode 100644
diff --git a/decoder/sentences.h b/decoder/sentences.h
old mode 100755
new mode 100644
-- 
cgit v1.2.3


From a7681b121192c2c60591c51bf268ed943a83bc15 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <kenlm@kheafield.com>
Date: Sun, 12 Feb 2012 16:46:46 -0500
Subject: Target-side only output format

---
 decoder/decoder.cc |  3 +++
 decoder/hg_io.cc   | 27 +++++++++++++++++++++++++++
 decoder/hg_io.h    |  3 +++
 3 files changed, 33 insertions(+)

(limited to 'decoder')

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 3b53fd6b..3394e0b8 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -408,6 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
         ("show_partition,z", "Compute and show the partition (inside score)")
         ("show_conditional_prob", "Output the conditional log prob to STDOUT instead of a translation")
         ("show_cfg_search_space", "Show the search space as a CFG")
+        ("show_target_graph", "Output the target hypergraph")
         ("coarse_to_fine_beam_prune", po::value<double>(), "Prune paths from coarse parse forest before fine parse, keeping paths within exp(alpha>=0)")
         ("ctf_beam_widen", po::value<double>()->default_value(2.0), "Expand coarse pass beam by this factor if no fine parse is found")
         ("ctf_num_widenings", po::value<int>()->default_value(2), "Widen coarse beam this many times before backing off to full parse")
@@ -1017,6 +1018,8 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   }
   if (conf.count("show_cfg_search_space"))
     HypergraphIO::WriteAsCFG(forest);
+  if (conf.count("show_target_graph"))
+    HypergraphIO::WriteTarget(forest);
   if (has_ref) {
     if (HG::Intersect(ref, &forest)) {
 //      if (crf_uniform_empirical) {
diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc
index c1c93933..0283ec3c 100644
--- a/decoder/hg_io.cc
+++ b/decoder/hg_io.cc
@@ -624,3 +624,30 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) {
   }
 }
 
+/* Output format:
+ * #vertices
+ * for each vertex in bottom-up topological order:
+ *   #downward_edges
+ *   for each downward edge:
+ *     RHS with [vertex_index] for NTs ||| scores
+ */
+void HypergraphIO::WriteTarget(const Hypergraph& hg) {
+  cout << hg.nodes_.size() << '\n';
+  for (unsigned int i = 0; i < hg.nodes_.size(); ++i) {
+    const Hypergraph::EdgesVector &edges = hg.nodes_[i].in_edges_;
+    cout << edges.size() << '\n';
+    for (unsigned int j = 0; j < edges.size(); ++j) {
+      const Hypergraph::Edge &edge = hg.edges_[edges[j]];
+      const std::vector<WordID> &e = edge.rule_->e();
+      for (std::vector<WordID>::const_iterator word = e.begin(); word != e.end(); ++word) {
+        if (*word <= 0) {
+          cout << '[' << edge.tail_nodes_[-*word] << "] ";
+        } else {
+          cout << TD::Convert(*word) << ' ';
+        }
+      }
+      cout << "||| " << edge.rule_->scores_ << '\n';
+    }
+  }
+}
+
diff --git a/decoder/hg_io.h b/decoder/hg_io.h
index 082489d8..44817157 100644
--- a/decoder/hg_io.h
+++ b/decoder/hg_io.h
@@ -23,6 +23,9 @@ struct HypergraphIO {
 
   static void WriteAsCFG(const Hypergraph& hg);
 
+  // Write only the target-side information in bottom-up order.
+  static void WriteTarget(const Hypergraph& hg);
+
   // serialization utils
   static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0);
   // return PLF string representation (undefined behavior on non-lattices)
-- 
cgit v1.2.3


From dbe064a15aa78b344d13db5bd831cc9e0a9bbac1 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <kenlm@kheafield.com>
Date: Sun, 12 Feb 2012 17:40:03 -0500
Subject: Might as well provide the edge count as well

---
 decoder/hg_io.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'decoder')

diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc
index 0283ec3c..9f0f50fa 100644
--- a/decoder/hg_io.cc
+++ b/decoder/hg_io.cc
@@ -632,7 +632,7 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) {
  *     RHS with [vertex_index] for NTs ||| scores
  */
 void HypergraphIO::WriteTarget(const Hypergraph& hg) {
-  cout << hg.nodes_.size() << '\n';
+  cout << hg.nodes_.size() << ' ' << hg.edges_.size() << '\n';
   for (unsigned int i = 0; i < hg.nodes_.size(); ++i) {
     const Hypergraph::EdgesVector &edges = hg.nodes_[i].in_edges_;
     cout << edges.size() << '\n';
-- 
cgit v1.2.3


From e5c17041c0596994a60fe984c71b0a058a4aec47 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sun, 19 Feb 2012 04:27:55 -0500
Subject: lbl preliminary clean up

---
 decoder/lattice.cc    |  1 +
 training/lbl_model.cc | 84 +++++++++++++++++++++++++++++++++------------------
 2 files changed, 55 insertions(+), 30 deletions(-)

(limited to 'decoder')

diff --git a/decoder/lattice.cc b/decoder/lattice.cc
index e3631e59..89da3cd0 100644
--- a/decoder/lattice.cc
+++ b/decoder/lattice.cc
@@ -46,6 +46,7 @@ void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) {
   Lattice& l = *pl;
   vector<WordID> ids;
   TD::ConvertSentence(text, &ids);
+  l.clear();
   l.resize(ids.size());
   for (int i = 0; i < l.size(); ++i)
     l[i].push_back(LatticeArc(ids[i], 0.0, 1));
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
index 72d80a56..ccd29255 100644
--- a/training/lbl_model.cc
+++ b/training/lbl_model.cc
@@ -6,6 +6,7 @@
 #else
 
 #include <cmath>
+#include <set>
 
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
@@ -20,10 +21,17 @@
 namespace po = boost::program_options;
 using namespace std;
 
+#define kDIMENSIONS 10
+typedef Eigen::Matrix<float, kDIMENSIONS, 1> RVector;
+typedef Eigen::Matrix<float, 1, kDIMENSIONS> RTVector;
+typedef Eigen::Matrix<float, kDIMENSIONS, kDIMENSIONS> TMatrix;
+vector<RVector> r_src, r_trg;
+
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("iterations,i",po::value<unsigned>()->default_value(5),"Number of iterations of training")
+        ("input,i",po::value<string>(),"Input file")
+        ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
         ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)")
         ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model");
   po::options_description clo("Command line options");
@@ -42,7 +50,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::notify(*conf);
 
   if (argc < 2 || conf->count("help")) {
-    cerr << "Usage " << argv[0] << " [OPTIONS] corpus.fr-en\n";
+    cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n";
     cerr << dcmdline_options << endl;
     return false;
   }
@@ -52,33 +60,32 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 int main(int argc, char** argv) {
   po::variables_map conf;
   if (!InitCommandLine(argc, argv, &conf)) return 1;
-  const string fname = argv[argc - 1];
+  const string fname = conf["input"].as<string>();
   const int ITERATIONS = conf["iterations"].as<unsigned>();
   const double diagonal_tension = conf["diagonal_tension"].as<double>();
+  if (diagonal_tension < 0.0) {
+    cerr << "Invalid value for diagonal_tension: must be >= 0\n";
+    return 1;
+  }
   string testset;
   if (conf.count("testset")) testset = conf["testset"].as<string>();
 
-  double tot_len_ratio = 0;
-  double mean_srclen_multiplier = 0;
+  int lc = 0;
   vector<double> unnormed_a_i;
-  for (int iter = 0; iter < ITERATIONS; ++iter) {
-    cerr << "ITERATION " << (iter + 1) << endl;
+  string line;
+  string ssrc, strg;
+  bool flag = false;
+  Lattice src, trg;
+  set<WordID> vocab_e;
+  { // read through corpus, initialize int map, check lines are good
+    cerr << "INITIAL READ OF " << fname << endl;
     ReadFile rf(fname);
     istream& in = *rf.stream();
-    double likelihood = 0;
-    double denom = 0.0;
-    int lc = 0;
-    bool flag = false;
-    string line;
-    string ssrc, strg;
-    while(true) {
-      getline(in, line);
-      if (!in) break;
+    while(getline(in, line)) {
       ++lc;
       if (lc % 1000 == 0) { cerr << '.'; flag = true; }
       if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
       ParseTranslatorInput(line, &ssrc, &strg);
-      Lattice src, trg;
       LatticeTools::ConvertTextToLattice(ssrc, &src);
       LatticeTools::ConvertTextToLattice(strg, &trg);
       if (src.size() == 0 || trg.size() == 0) {
@@ -88,37 +95,54 @@ int main(int argc, char** argv) {
       }
       if (src.size() > unnormed_a_i.size())
         unnormed_a_i.resize(src.size());
-      if (iter == 0)
-        tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size());
+      for (unsigned i = 0; i < trg.size(); ++i) {
+        assert(trg[i].size() == 1);
+        vocab_e.insert(trg[i][0].label);
+      }
+    }
+  }
+  if (flag) cerr << endl;
+
+  // do optimization
+  for (int iter = 0; iter < ITERATIONS; ++iter) {
+    cerr << "ITERATION " << (iter + 1) << endl;
+    ReadFile rf(fname);
+    istream& in = *rf.stream();
+    double likelihood = 0;
+    double denom = 0.0;
+    lc = 0;
+    flag = false;
+    while(true) {
+      getline(in, line);
+      if (!in) break;
+      ++lc;
+      if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+      if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
+      ParseTranslatorInput(line, &ssrc, &strg);
+      LatticeTools::ConvertTextToLattice(ssrc, &src);
+      LatticeTools::ConvertTextToLattice(strg, &trg);
       denom += trg.size();
       vector<double> probs(src.size() + 1);
-      bool first_al = true;  // used for write_alignments
       for (int j = 0; j < trg.size(); ++j) {
         const WordID& f_j = trg[j][0].label;
         double sum = 0;
         const double j_over_ts = double(j) / trg.size();
-        double prob_a_i = 1.0 / src.size();
         double az = 0;
         for (int ta = 0; ta < src.size(); ++ta) {
           unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
           az += unnormed_a_i[ta];
         }
         for (int i = 1; i <= src.size(); ++i) {
-          prob_a_i = unnormed_a_i[i-1] / az;
+          const double prob_a_i = unnormed_a_i[i-1] / az;
+          // TODO
           probs[i] = 1; // tt.prob(src[i-1][0].label, f_j) * prob_a_i;
           sum += probs[i];
         }
       }
     }
-
-    // log(e) = 1.0
-    double base2_likelihood = likelihood / log(2);
-
     if (flag) { cerr << endl; }
-    if (iter == 0) {
-      mean_srclen_multiplier = tot_len_ratio / lc;
-      cerr << "expected target length = source length * " << mean_srclen_multiplier << endl;
-    }
+
+    const double base2_likelihood = likelihood / log(2);
     cerr << "  log_e likelihood: " << likelihood << endl;
     cerr << "  log_2 likelihood: " << base2_likelihood << endl;
     cerr << "   cross entropy: " << (-base2_likelihood / denom) << endl;
-- 
cgit v1.2.3


From dc2b2fc395ad496851f723c4da59181445c07047 Mon Sep 17 00:00:00 2001
From: Chris Dyer <prguest11@taipan.cs>
Date: Mon, 27 Feb 2012 02:19:34 +0000
Subject: generic bayesian cfg learner with a bunch of cfg grammar types

---
 .gitignore         |   1 +
 decoder/trule.cc   |  16 +--
 gi/pf/Makefile.am  |   4 +-
 gi/pf/hierolm.cc   | 309 -----------------------------------------
 gi/pf/learn_cfg.cc | 394 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 398 insertions(+), 326 deletions(-)
 delete mode 100644 gi/pf/hierolm.cc
 create mode 100644 gi/pf/learn_cfg.cc

(limited to 'decoder')

diff --git a/.gitignore b/.gitignore
index 327f7261..28d5a60a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,7 @@ training/mpi_extract_reachable
 klm/lm/build_binary
 extools/extractor_monolingual
 gi/pf/.deps
+gi/pf/learn_cfg
 gi/pf/brat
 gi/pf/cbgi
 gi/pf/dpnaive
diff --git a/decoder/trule.cc b/decoder/trule.cc
index 40235542..141b8faa 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -232,16 +232,6 @@ void TRule::ComputeArity() {
   arity_ = 1 - min;
 }
 
-static string AnonymousStrVar(int i) {
-  string res("[v]");
-  if(!(i <= 0 && i >= -8)) {
-    cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl;
-    abort();
-  }
-  res[1] = '1' - i;
-  return res;
-}
-
 string TRule::AsString(bool verbose) const {
   ostringstream os;
   int idx = 0;
@@ -259,15 +249,11 @@ string TRule::AsString(bool verbose) const {
     }
   }
   os << " ||| ";
-  if (idx > 9) {
-    cerr << "Too many non-terminals!\n partial: " << os.str() << endl;
-    exit(1);
-  }
   for (int i =0; i<e_.size(); ++i) {
     if (i) os << ' ';
     const WordID& w = e_[i];
     if (w < 1)
-      os << AnonymousStrVar(w);
+      os << '[' << (1-w) << ']';
     else
       os << TD::Convert(w);
   }
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index ed5b6fd3..0cf0bc63 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp hierolm
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg
 
 noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
@@ -9,7 +9,7 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 
 itg_SOURCES = itg.cc
 
-hierolm_SOURCES = hierolm.cc
+learn_cfg_SOURCES = learn_cfg.cc
 
 condnaive_SOURCES = condnaive.cc
 
diff --git a/gi/pf/hierolm.cc b/gi/pf/hierolm.cc
deleted file mode 100644
index afb12fef..00000000
--- a/gi/pf/hierolm.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/functional.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "inside_outside.h"
-#include "hg.h"
-#include "bottom_up_parser.h"
-#include "fdict.h"
-#include "grammar.h"
-#include "m.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp.h"
-#include "ccrp_onetable.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-shared_ptr<MT19937> prng;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
-        ("input,i",po::value<string>(),"Read parallel data from")
-        ("random_seed,S",po::value<uint32_t>(), "Random seed");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || (conf->count("input") == 0)) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-void ReadCorpus(const string& filename,
-                vector<vector<WordID> >* e,
-                set<WordID>* vocab_e) {
-  e->clear();
-  vocab_e->clear();
-  istream* in;
-  if (filename == "-")
-    in = &cin;
-  else
-    in = new ifstream(filename.c_str());
-  assert(*in);
-  string line;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
-    e->push_back(vector<int>());
-    vector<int>& le = e->back();
-    TD::ConvertSentence(line, &le);
-    for (unsigned i = 0; i < le.size(); ++i)
-      vocab_e->insert(le[i]);
-  }
-  if (in != &cin) delete in;
-}
-
-struct Grid {
-  // a b c d e
-  // 0 - 0 - -
-  vector<int> grid;
-};
-
-struct BaseRuleModel {
-  explicit BaseRuleModel(unsigned term_size,
-                         unsigned nonterm_size = 1) :
-      unif_term(1.0 / term_size),
-      unif_nonterm(1.0 / nonterm_size) {}
-  prob_t operator()(const TRule& r) const {
-    prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size()));
-    const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2));
-    const prob_t nonterm_prob(1.0 - term_prob.as_float());
-    for (unsigned i = 0; i < r.f_.size(); ++i) {
-      if (r.f_[i] <= 0) {     // nonterminal
-        p *= nonterm_prob;
-        p *= unif_nonterm;
-      } else {                // terminal
-        p *= term_prob;
-        p *= unif_term;
-      }
-    }
-    return p;
-  }
-  const prob_t unif_term, unif_nonterm;
-};
-
-struct HieroLMModel {
-  explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {}
-
-  prob_t Prob(const TRule& r) const {
-    return x.probT<prob_t>(r, p0(r));
-  }
-
-  int Increment(const TRule& r, MT19937* rng) {
-    return x.incrementT<prob_t>(r, p0(r), rng);
-    // return x.increment(r);
-  }
-
-  int Decrement(const TRule& r, MT19937* rng) {
-    return x.decrement(r, rng);
-    //return x.decrement(r);
-  }
-
-  prob_t Likelihood() const {
-    prob_t p;
-    p.logeq(x.log_crp_prob());
-    for (CCRP<TRule>::const_iterator it = x.begin(); it != x.end(); ++it) {
-      prob_t tp = p0(it->first);
-      tp.poweq(it->second.table_counts_.size());
-      p *= tp;
-    }
-    //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it)
-    //    p *= p0(it->first);
-    return p;
-  }
-
-  void ResampleHyperparameters(MT19937* rng) {
-    x.resample_hyperparameters(rng);
-    cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl;
-  }
-
-  const BaseRuleModel p0;
-  CCRP<TRule> x;
-  //CCRP_OneTable<TRule> x;
-};
-
-vector<GrammarIter* > tofreelist;
-
-HieroLMModel* plm;
-
-struct NPGrammarIter : public GrammarIter, public RuleBin {
-  NPGrammarIter() : arity() { tofreelist.push_back(this); }
-  NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) {
-    if (inr) {
-      r.reset(new TRule(*inr));
-    } else {
-      static const int kLHS = -TD::Convert("X");
-      r.reset(new TRule);
-      r->lhs_ = kLHS;
-    }
-    TRule& rr = *r;
-    rr.f_.push_back(symbol);
-    rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol);
-    tofreelist.push_back(this);
-  }
-  virtual int GetNumRules() const {
-    if (r) return 1; else return 0;
-  }
-  virtual TRulePtr GetIthRule(int) const {
-    return r;
-  }
-  virtual int Arity() const {
-    return arity;
-  }
-  virtual const RuleBin* GetRules() const {
-    if (!r) return NULL; else return this;
-  }
-  virtual const GrammarIter* Extend(int symbol) const {
-    return new NPGrammarIter(r, arity, symbol);
-  }
-  const unsigned char arity;
-  TRulePtr r;
-};
-
-struct NPGrammar : public Grammar {
-  virtual const GrammarIter* GetRoot() const {
-    return new NPGrammarIter;
-  }
-};
-
-void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv, HieroLMModel* plm) {
-  HieroLMModel& lm = *plm;
-  vector<prob_t> node_probs;
-  const prob_t total_prob = Inside<prob_t, EdgeProb>(hg, &node_probs);
-  queue<unsigned> q;
-  q.push(hg.nodes_.size() - 3);
-  while(!q.empty()) {
-    unsigned cur_node_id = q.front();
-//    cerr << "NODE=" << cur_node_id << endl;
-    q.pop();
-    const Hypergraph::Node& node = hg.nodes_[cur_node_id];
-    const unsigned num_in_edges = node.in_edges_.size();
-    unsigned sampled_edge = 0;
-    if (num_in_edges == 1) {
-      sampled_edge = node.in_edges_[0];
-    } else {
-      //prob_t z;
-      assert(num_in_edges > 1);
-      SampleSet<prob_t> ss;
-      for (unsigned j = 0; j < num_in_edges; ++j) {
-        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
-        prob_t p = edge.edge_prob_;
-        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
-          p *= node_probs[edge.tail_nodes_[k]];
-        ss.add(p);
-//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
-        //z += p;
-      }
-//      for (unsigned j = 0; j < num_in_edges; ++j) {
-//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
-//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
-//      }
-//      cerr << " --- \n";
-      sampled_edge = node.in_edges_[rng->SelectSample(ss)];
-    }
-    sampled_deriv->push_back(sampled_edge);
-    const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
-    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
-      q.push(edge.tail_nodes_[j]);
-    }
-  }
-  for (unsigned i = 0; i < sampled_deriv->size(); ++i) {
-    cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl;
-  }
-}
-
-void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
-  for (unsigned i = 0; i < d.size(); ++i)
-    plm->Increment(*hg.edges_[d[i]].rule_, rng);
-}
-
-void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
-  for (unsigned i = 0; i < d.size(); ++i)
-    plm->Decrement(*hg.edges_[d[i]].rule_, rng);
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  vector<GrammarPtr> grammars;
-  grammars.push_back(GrammarPtr(new NPGrammar));
-
-  InitCommandLine(argc, argv, &conf);
-  const unsigned samples = conf["samples"].as<unsigned>();
-
-  if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    prng.reset(new MT19937);
-  MT19937& rng = *prng;
-
-  vector<vector<WordID> > corpuse;
-  set<WordID> vocabe;
-  cerr << "Reading corpus...\n";
-  ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe);
-  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
-  HieroLMModel lm(vocabe.size());
-
-  plm = &lm;
-  ExhaustiveBottomUpParser parser("X", grammars);
-
-  Hypergraph hg;
-  const int kX = -TD::Convert("X");
-  const int kLP = FD::Convert("LogProb");
-  SparseVector<double> v; v.set_value(kLP, 1.0);
-  vector<vector<unsigned> > derivs(corpuse.size());
-  for (int SS=0; SS < samples; ++SS) {
-    for (int ci = 0; ci < corpuse.size(); ++ci) {
-      vector<int>& src = corpuse[ci];
-      Lattice lat(src.size());
-      for (unsigned i = 0; i < src.size(); ++i)
-        lat[i].push_back(LatticeArc(src[i], 0.0, 1));
-      cerr << TD::GetString(src) << endl;
-      hg.clear();
-      parser.Parse(lat, &hg);  // exhaustive parse
-      DecrementDerivation(hg, derivs[ci], &lm, &rng);
-      for (unsigned i = 0; i < hg.edges_.size(); ++i) {
-        TRule& r = *hg.edges_[i].rule_;
-        if (r.lhs_ == kX)
-          hg.edges_[i].edge_prob_ = lm.Prob(r);
-      }
-      vector<unsigned> d;
-      SampleDerivation(hg, &rng, &d, &lm);
-      derivs[ci] = d;
-      IncrementDerivation(hg, derivs[ci], &lm, &rng);
-      if (tofreelist.size() > 100000) {
-        cerr << "Freeing ... ";
-        for (unsigned i = 0; i < tofreelist.size(); ++i)
-          delete tofreelist[i];
-        tofreelist.clear();
-        cerr << "Freed.\n";
-      }
-    }
-    cerr << "LLH=" << lm.Likelihood() << endl;
-  }
-  return 0;
-}
-
diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc
new file mode 100644
index 00000000..3d202816
--- /dev/null
+++ b/gi/pf/learn_cfg.cc
@@ -0,0 +1,394 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "inside_outside.h"
+#include "hg.h"
+#include "bottom_up_parser.h"
+#include "fdict.h"
+#include "grammar.h"
+#include "m.h"
+#include "trule.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+vector<int> nt_vocab;
+vector<int> nt_id_to_index;
+static unsigned kMAX_RULE_SIZE = 0;
+static unsigned kMAX_ARITY = 0;
+static bool kALLOW_MIXED = true;  // allow rules with mixed terminals and NTs
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("max_rule_size,m", po::value<unsigned>()->default_value(0), "Maximum rule size (0 for unlimited)")
+        ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)")
+        ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS")
+        ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+unsigned ReadCorpus(const string& filename,
+                    vector<vector<WordID> >* e,
+                    set<WordID>* vocab_e) {
+  e->clear();
+  vocab_e->clear();
+  istream* in;
+  if (filename == "-")
+    in = &cin;
+  else
+    in = new ifstream(filename.c_str());
+  assert(*in);
+  string line;
+  unsigned toks = 0;
+  while(*in) {
+    getline(*in, line);
+    if (line.empty() && !*in) break;
+    e->push_back(vector<int>());
+    vector<int>& le = e->back();
+    TD::ConvertSentence(line, &le);
+    for (unsigned i = 0; i < le.size(); ++i)
+      vocab_e->insert(le[i]);
+    toks += le.size();
+  }
+  if (in != &cin) delete in;
+  return toks;
+}
+
+struct Grid {
+  // a b c d e
+  // 0 - 0 - -
+  vector<int> grid;
+};
+
+struct BaseRuleModel {
+  explicit BaseRuleModel(unsigned term_size,
+                         unsigned nonterm_size = 1) :
+      unif_term(1.0 / term_size),
+      unif_nonterm(1.0 / nonterm_size) {}
+  prob_t operator()(const TRule& r) const {
+    prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size()));
+    const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2));
+    const prob_t nonterm_prob(1.0 - term_prob.as_float());
+    for (unsigned i = 0; i < r.f_.size(); ++i) {
+      if (r.f_[i] <= 0) {     // nonterminal
+        p *= nonterm_prob;
+        p *= unif_nonterm;
+      } else {                // terminal
+        p *= term_prob;
+        p *= unif_term;
+      }
+    }
+    return p;
+  }
+  const prob_t unif_term, unif_nonterm;
+};
+
+struct HieroLMModel {
+  explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP<TRule>(1,1,1,1)) {}
+
+  prob_t Prob(const TRule& r) const {
+    return nts[nt_id_to_index[-r.lhs_]].probT<prob_t>(r, p0(r));
+  }
+
+  int Increment(const TRule& r, MT19937* rng) {
+    return nts[nt_id_to_index[-r.lhs_]].incrementT<prob_t>(r, p0(r), rng);
+    // return x.increment(r);
+  }
+
+  int Decrement(const TRule& r, MT19937* rng) {
+    return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng);
+    //return x.decrement(r);
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = prob_t::One();
+    for (unsigned i = 0; i < nts.size(); ++i) {
+      prob_t q; q.logeq(nts[i].log_crp_prob());
+      p *= q;
+      for (CCRP<TRule>::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) {
+        prob_t tp = p0(it->first);
+        tp.poweq(it->second.table_counts_.size());
+        p *= tp;
+      }
+    }
+    //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it)
+    //    p *= p0(it->first);
+    return p;
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (unsigned i = 0; i < nts.size(); ++i)
+      nts[i].resample_hyperparameters(rng);
+    cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl;
+  }
+
+  const BaseRuleModel p0;
+  vector<CCRP<TRule> > nts;
+  //CCRP_OneTable<TRule> x;
+};
+
+vector<GrammarIter* > tofreelist;
+
+HieroLMModel* plm;
+
+struct NPGrammarIter : public GrammarIter, public RuleBin {
+  NPGrammarIter() : arity() { tofreelist.push_back(this); }
+  NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) {
+    if (inr) {
+      r.reset(new TRule(*inr));
+    } else {
+      r.reset(new TRule);
+    }
+    TRule& rr = *r;
+    rr.lhs_ = nt_vocab[0];
+    rr.f_.push_back(symbol);
+    rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol);
+    tofreelist.push_back(this);
+  }
+  inline static unsigned NextArity(int cur_a, int symbol) {
+    return cur_a + (symbol <= 0 ? 1 : 0);
+  }
+  virtual int GetNumRules() const {
+    if (r) return nt_vocab.size(); else return 0;
+  }
+  virtual TRulePtr GetIthRule(int i) const {
+    if (i == 0) return r;
+    TRulePtr nr(new TRule(*r));
+    nr->lhs_ = nt_vocab[i];
+    return nr;
+  }
+  virtual int Arity() const {
+    return arity;
+  }
+  virtual const RuleBin* GetRules() const {
+    if (!r) return NULL; else return this;
+  }
+  virtual const GrammarIter* Extend(int symbol) const {
+    const int next_arity = NextArity(arity, symbol);
+    if (kMAX_ARITY && next_arity > kMAX_ARITY)
+      return NULL;
+    if (!kALLOW_MIXED && r) {
+      bool t1 = r->f_.front() <= 0;
+      bool t2 = symbol <= 0;
+      if (t1 != t2) return NULL;
+    }
+    if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE))
+      return new NPGrammarIter(r, next_arity, symbol);
+    else
+      return NULL;
+  }
+  const unsigned char arity;
+  TRulePtr r;
+};
+
+struct NPGrammar : public Grammar {
+  virtual const GrammarIter* GetRoot() const {
+    return new NPGrammarIter;
+  }
+};
+
+prob_t TotalProb(const Hypergraph& hg) {
+  return Inside<prob_t, EdgeProb>(hg);
+}
+
+void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) {
+  vector<prob_t> node_probs;
+  Inside<prob_t, EdgeProb>(hg, &node_probs);
+  queue<unsigned> q;
+  q.push(hg.nodes_.size() - 2);
+  while(!q.empty()) {
+    unsigned cur_node_id = q.front();
+//    cerr << "NODE=" << cur_node_id << endl;
+    q.pop();
+    const Hypergraph::Node& node = hg.nodes_[cur_node_id];
+    const unsigned num_in_edges = node.in_edges_.size();
+    unsigned sampled_edge = 0;
+    if (num_in_edges == 1) {
+      sampled_edge = node.in_edges_[0];
+    } else {
+      //prob_t z;
+      assert(num_in_edges > 1);
+      SampleSet<prob_t> ss;
+      for (unsigned j = 0; j < num_in_edges; ++j) {
+        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+        prob_t p = edge.edge_prob_;
+        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
+          p *= node_probs[edge.tail_nodes_[k]];
+        ss.add(p);
+//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl;
+        //z += p;
+      }
+//      for (unsigned j = 0; j < num_in_edges; ++j) {
+//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl;
+//      }
+//      cerr << " --- \n";
+      sampled_edge = node.in_edges_[rng->SelectSample(ss)];
+    }
+    sampled_deriv->push_back(sampled_edge);
+    const Hypergraph::Edge& edge = hg.edges_[sampled_edge];
+    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
+      q.push(edge.tail_nodes_[j]);
+    }
+  }
+  for (unsigned i = 0; i < sampled_deriv->size(); ++i) {
+    cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl;
+  }
+}
+
+void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
+  for (unsigned i = 0; i < d.size(); ++i)
+    plm->Increment(*hg.edges_[d[i]].rule_, rng);
+}
+
+void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) {
+  for (unsigned i = 0; i < d.size(); ++i)
+    plm->Decrement(*hg.edges_[d[i]].rule_, rng);
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+
+  InitCommandLine(argc, argv, &conf);
+  nt_vocab.resize(conf["nonterminals"].as<unsigned>());
+  assert(nt_vocab.size() > 0);
+  assert(nt_vocab.size() < 26);
+  {
+    string nt = "X";
+    for (unsigned i = 0; i < nt_vocab.size(); ++i) {
+      if (nt_vocab.size() > 1) nt[0] = ('A' + i);
+      int pid = TD::Convert(nt);
+      nt_vocab[i] = -pid;
+      if (pid >= nt_id_to_index.size()) {
+        nt_id_to_index.resize(pid + 1, -1);
+      }
+      nt_id_to_index[pid] = i;
+    }
+  }
+  vector<GrammarPtr> grammars;
+  grammars.push_back(GrammarPtr(new NPGrammar));
+
+  const unsigned samples = conf["samples"].as<unsigned>();
+  kMAX_RULE_SIZE = conf["max_rule_size"].as<unsigned>();
+  if (kMAX_RULE_SIZE == 1) {
+    cerr << "Invalid maximum rule size: must be 0 or >1\n";
+    return 1;
+  }
+  kMAX_ARITY = conf["max_arity"].as<unsigned>();
+  if (kMAX_ARITY == 1) {
+    cerr << "Invalid maximum arity: must be 0 or >1\n";
+    return 1;
+  }
+  kALLOW_MIXED = !conf.count("no_mixed_rules");
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+  vector<vector<WordID> > corpuse;
+  set<WordID> vocabe;
+  cerr << "Reading corpus...\n";
+  const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe);
+  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+  HieroLMModel lm(vocabe.size(), nt_vocab.size());
+
+  plm = &lm;
+  ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars);
+
+  Hypergraph hg;
+  const int kGoal = -TD::Convert("Goal");
+  const int kLP = FD::Convert("LogProb");
+  SparseVector<double> v; v.set_value(kLP, 1.0);
+  vector<vector<unsigned> > derivs(corpuse.size());
+  vector<Lattice> cl(corpuse.size());
+  for (int ci = 0; ci < corpuse.size(); ++ci) {
+    vector<int>& src = corpuse[ci];
+    Lattice& lat = cl[ci];
+    lat.resize(src.size());
+    for (unsigned i = 0; i < src.size(); ++i)
+      lat[i].push_back(LatticeArc(src[i], 0.0, 1));
+  }
+  for (int SS=0; SS < samples; ++SS) {
+    const bool is_last = ((samples - 1) == SS);
+    prob_t dlh = prob_t::One();
+    for (int ci = 0; ci < corpuse.size(); ++ci) {
+      const vector<int>& src = corpuse[ci];
+      const Lattice& lat = cl[ci];
+      cerr << TD::GetString(src) << endl;
+      hg.clear();
+      parser.Parse(lat, &hg);  // exhaustive parse
+      vector<unsigned>& d = derivs[ci];
+      if (!is_last) DecrementDerivation(hg, d, &lm, &rng);
+      for (unsigned i = 0; i < hg.edges_.size(); ++i) {
+        TRule& r = *hg.edges_[i].rule_;
+        if (r.lhs_ == kGoal)
+          hg.edges_[i].edge_prob_ = prob_t::One();
+        else
+          hg.edges_[i].edge_prob_ = lm.Prob(r);
+      }
+      if (!is_last) {
+        d.clear();
+        SampleDerivation(hg, &rng, &d);
+        IncrementDerivation(hg, derivs[ci], &lm, &rng);
+      } else {
+        prob_t p = TotalProb(hg);
+        dlh *= p;
+        cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl;
+      }
+      if (tofreelist.size() > 200000) {
+        cerr << "Freeing ... ";
+        for (unsigned i = 0; i < tofreelist.size(); ++i)
+          delete tofreelist[i];
+        tofreelist.clear();
+        cerr << "Freed.\n";
+      }
+    }
+    double llh = log(lm.Likelihood());
+    cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl;
+    if (SS % 10 == 9) lm.ResampleHyperparameters(&rng);
+    if (is_last) {
+      double z = log(dlh);
+      cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl;
+    }
+  }
+  for (unsigned i = 0; i < nt_vocab.size(); ++i)
+    cerr << lm.nts[i] << endl;
+  return 0;
+}
+
-- 
cgit v1.2.3


From 61a1c37fb1357b85286c9aca6790042f8ca38f5f Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Wed, 29 Feb 2012 12:58:53 -0800
Subject: Dump the forest before the language model rescoring

---
 decoder/decoder.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'decoder')

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 3394e0b8..69fbaf85 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -812,6 +812,9 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
     abort();
   }
 
+  if (conf.count("show_target_graph"))
+    HypergraphIO::WriteTarget(forest);
+
   for (int pass = 0; pass < rescoring_passes.size(); ++pass) {
     const RescoringPass& rp = rescoring_passes[pass];
     const vector<weight_t>& cur_weights = *rp.weight_vector;
@@ -1018,8 +1021,6 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   }
   if (conf.count("show_cfg_search_space"))
     HypergraphIO::WriteAsCFG(forest);
-  if (conf.count("show_target_graph"))
-    HypergraphIO::WriteTarget(forest);
   if (has_ref) {
     if (HG::Intersect(ref, &forest)) {
 //      if (crf_uniform_empirical) {
-- 
cgit v1.2.3


From d258228a87bf83b14b2ff58b010cd977a15a8c43 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 3 Mar 2012 01:21:05 -0500
Subject: use assert properly

---
 decoder/apply_models.cc    | 9 ++++++---
 decoder/earley_composer.cc | 5 ++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'decoder')

diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 40fd27e4..9ba59d1b 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -270,7 +270,8 @@ public:
       const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
       const JVector j(edge.tail_nodes_.size(), 0);
       cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal));
-      assert(unique_cands.insert(cand.back()).second);  // these should all be unique!
+      bool is_new = unique_cands.insert(cand.back()).second;
+      assert(is_new);  // these should all be unique!
     }
 //    cerr << "  making heap of " << cand.size() << " candidates\n";
     make_heap(cand.begin(), cand.end(), HeapCandCompare());
@@ -378,7 +379,8 @@ public:
 		  pop_heap(cand.begin(), cand.end(), HeapCandCompare());
 		  Candidate* item = cand.back();
 		  cand.pop_back();
-		  assert(unique_accepted.insert(item).second); // these should all be unique!
+                  bool is_new = unique_accepted.insert(item).second;
+		  assert(is_new); // these should all be unique!
 		  // cerr << "POPPED: " << *item << endl;
 
 		  PushSuccFast2(*item, is_goal, &cand, &unique_accepted);
@@ -419,7 +421,8 @@ public:
           Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal);
           cand.push_back(new_cand);
           push_heap(cand.begin(), cand.end(), HeapCandCompare());
-          assert(cs->insert(new_cand).second);  // insert into uniqueness set, sanity check
+          bool is_new = cs->insert(new_cand).second;
+          assert(is_new);  // insert into uniqueness set, sanity check
         }
       }
     }
diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc
index 48e94a31..b7af801a 100644
--- a/decoder/earley_composer.cc
+++ b/decoder/earley_composer.cc
@@ -329,7 +329,10 @@ class EarleyComposerImpl {
     forest->ReserveNodes(kMAX_NODES);
     assert(sit != g.end());
     Edge* init = new Edge(start_cat_, &sit->second, q_0_);
-    assert(IncorporateNewEdge(init));
+    if (!IncorporateNewEdge(init)) {
+      cerr << "Failed to create initial edge!\n";
+      abort();
+    }
     while (exp_agenda.HasWork() || agenda.HasWork()) {
       while(exp_agenda.HasWork()) {
         const Edge* edge = exp_agenda.Next();
-- 
cgit v1.2.3


From b3c0b5e4a05019045e6a81209741b60e0f20b073 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 3 Mar 2012 03:24:53 -0500
Subject: PYP language model (Teh 2006)

---
 decoder/fst_translator.cc |   5 +-
 gi/pf/Makefile.am         |   4 +-
 gi/pf/pyp_lm.cc           | 150 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 gi/pf/pyp_lm.cc

(limited to 'decoder')

diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc
index 38dbd717..074de4c9 100644
--- a/decoder/fst_translator.cc
+++ b/decoder/fst_translator.cc
@@ -30,7 +30,10 @@ struct FSTTranslatorImpl {
     if (input.find("{\"rules\"") == 0) {
       istringstream is(input);
       Hypergraph src_cfg_hg;
-      assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg));
+      if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) {
+        cerr << "Failed to read HG from JSON.\n";
+        abort();
+      }
       if (add_pass_through_rules) {
         SparseVector<double> feats;
         feats.set_value(FD::Convert("PassThrough"), 1);
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 0cf0bc63..7cf9c14d 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm
 
 noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
@@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 
 itg_SOURCES = itg.cc
 
+pyp_lm_SOURCES = pyp_lm.cc
+
 learn_cfg_SOURCES = learn_cfg.cc
 
 condnaive_SOURCES = condnaive.cc
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
new file mode 100644
index 00000000..2837e33c
--- /dev/null
+++ b/gi/pf/pyp_lm.cc
@@ -0,0 +1,150 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "corpus_tools.h"
+#include "m.h"
+#include "tdict.h"
+#include "sampler.h"
+#include "ccrp.h"
+#include "ccrp_onetable.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+shared_ptr<MT19937> prng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // fills *conf from argv and an optional --config file
+  po::options_description opts("Configuration options");  // options accepted on the command line AND in the config file
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");  // options accepted on the command line only
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  // parse the command line first, then the config file (only if --config was given)
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  // print the option summary and exit(1) when --help is given or --input is missing
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+template <unsigned N> struct PYPLM;
+
+// order-0 specialization that terminates the PYPLM<N> backoff chain: a uniform base distribution
+template<> struct PYPLM<0> {
+  PYPLM(unsigned vs) : p0(1.0 / vs) {}  // vs: divisor of the uniform mass (main passes the vocabulary size)
+  void increment(WordID w, const vector<WordID>& context, MT19937* rng) const {}  // no-op: this level keeps no counts
+  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) const {}  // no-op: this level keeps no counts
+  double prob(WordID w, const vector<WordID>& context) const { return p0; }  // same value for every w and context
+  const double p0;  // 1.0 / vs, fixed at construction
+};
+
+// represents an N-gram LM: one CCRP per (N-1)-word context in p, backing off to a PYPLM<N-1>
+template <unsigned N> struct PYPLM {
+  PYPLM(unsigned vs) : backoff(vs) {}  // vs is only forwarded down the backoff chain
+  void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
+    const double bo = backoff.prob(w, context);  // lower-order probability, handed to the CCRP below
+    static vector<WordID> lookup(N-1);  // function-local static: one scratch key reused across calls
+    for (unsigned i = 0; i < N-1; ++i)
+      lookup[i] = context[context.size() - 1 - i];  // last N-1 context words, most recent first; assumes context.size() >= N-1 (not checked)
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
+    if (it == p.end())
+      it = p.insert(make_pair(lookup, CCRP<WordID>(1,1,1,1))).first;  // first event in this context; NOTE(review): meaning of the four CCRP arguments is defined in ccrp.h -- confirm
+    if (it->second.increment(w, bo, rng))
+      backoff.increment(w, context, rng);  // propagate to the lower order only when the CCRP's increment returns true
+  }
+  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) {
+    static vector<WordID> lookup(N-1);  // scratch key, built the same way as in increment()
+    for (unsigned i = 0; i < N-1; ++i)
+      lookup[i] = context[context.size() - 1 - i];
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
+    assert(it != p.end());  // the context must already have an entry in p
+    if (it->second.decrement(w, rng))
+      backoff.decrement(w, context, rng);  // propagate only when the CCRP's decrement returns true
+  }
+  double prob(WordID w, const vector<WordID>& context) const {
+    const double bo = backoff.prob(w, context);  // lower-order probability
+    static vector<WordID> lookup(N-1);  // scratch key, built the same way as in increment()
+    for (unsigned i = 0; i < N-1; ++i)
+      lookup[i] = context[context.size() - 1 - i];
+    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup);
+    if (it == p.end()) return bo;  // context has no entry: return the lower-order probability unchanged
+    return it->second.prob(w, bo);
+  }
+  PYPLM<N-1> backoff;  // next-lower-order model
+  unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p;  // reversed (N-1)-word context -> its CCRP
+};
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  // Fits a kORDER-gram PYPLM to the first 99% of the input sentences, then scores the remaining sentences.
+  InitCommandLine(argc, argv, &conf);
+  const unsigned samples = conf["samples"].as<unsigned>();  // number of passes over the training sentences
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+  vector<vector<WordID> > corpuse;
+  set<WordID> vocabe;
+  const WordID kEOS = TD::Convert("</s>");
+  cerr << "Reading corpus...\n";
+  CorpusTools::ReadFromFile(conf["input"].as<string>(), &corpuse, &vocabe);
+  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+#define kORDER 5
+  PYPLM<kORDER> lm(vocabe.size());
+  vector<WordID> ctx(kORDER - 1, TD::Convert("<s>"));  // context starts as kORDER-1 copies of "<s>"
+  int mci = corpuse.size() * 99 / 100;  // sentences [0, mci) are training data, [mci, size) are held out
+  for (int SS=0; SS < samples; ++SS) {
+    for (int ci = 0; ci < mci; ++ci) {
+      ctx.resize(kORDER - 1);  // drop the previous sentence's words, keeping only the leading "<s>" padding
+      const vector<WordID>& s = corpuse[ci];
+      for (int i = 0; i <= s.size(); ++i) {
+        WordID w = (i < s.size() ? s[i] : kEOS);  // i == s.size() yields the end-of-sentence event
+        if (SS > 0) lm.decrement(w, ctx, &rng);  // after the first pass, remove the token before re-adding it
+        lm.increment(w, ctx, &rng);
+        ctx.push_back(w);
+      }
+      if (SS > 0) lm.decrement(kEOS, ctx, &rng);  // NOTE(review): the loop above already handled kEOS; this is a second one per sentence -- confirm intended
+      lm.increment(kEOS, ctx, &rng);
+    }
+  }
+  double llh = 0;  // accumulates the NEGATIVE log2-likelihood of the held-out sentences
+  unsigned cnt = 0;  // number of held-out events scored (words plus one kEOS per sentence)
+  for (int ci = mci; ci < corpuse.size(); ++ci) {
+    ctx.resize(kORDER - 1);
+    const vector<WordID>& s = corpuse[ci];
+    for (int i = 0; i <= s.size(); ++i) {
+      WordID w = (i < s.size() ? s[i] : kEOS);
+      double lp = log(lm.prob(w, ctx)) / log(2);  // log2 probability of w given ctx
+      cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl;
+      ctx.push_back(w);
+      llh -= lp;
+      cnt++;
+    }
+  }
+  cerr << "  Log_10 prob: " << (-llh * log(2) / log(10)) << endl;  // negate: llh holds -log2 p, the label promises log10 p
+  cerr << "        Count: " << (cnt) << endl;
+  cerr << "Cross-entropy: " << (llh / cnt) << endl;
+  cerr << "   Perplexity: " << pow(2, llh / cnt) << endl;
+  return 0;
+}
+
-- 
cgit v1.2.3


From f423af0b44a1f977e5e8898363c1d4a2e8cd15e5 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 7 Mar 2012 09:46:54 -0500
Subject: configure order of n-gram features

---
 decoder/ff_ngrams.cc | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

(limited to 'decoder')

diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index 04dd1906..d6d79f5e 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -57,6 +57,39 @@ namespace {
   }
 }
 
+// Parses the NgramFeatures parameter string: "-x" sets *explicit_markers,
+// "-o N" sets *order; both are first reset to their defaults (false, 3).
+// Returns false, after logging to cerr, on an unknown or incomplete option.
+static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) {
+  vector<string> const& argv=SplitOnWhitespace(in);
+  *explicit_markers = false;
+  *order = 3;
+  for (vector<string>::const_iterator i=argv.begin(),e=argv.end();i!=e;++i) {
+    string const& s=*i;
+    if (s[0]=='-') {
+      if (s.size()>2) goto fail;
+      switch (s[1]) {
+      case 'x':
+        *explicit_markers = true;
+        break;
+      case 'o':
+        // advance to the value first; "-o" as the last token has none to read
+        if (++i == e) { cerr << "Missing argument for "<<s<<". "; goto usage; }
+        *order=atoi((*i).c_str());
+        break;
+      default:
+      fail:
+        cerr<<"Unknown option on NgramFeatures "<<s<<" ; ";
+        goto usage;
+      }
+    }
+  }
+  return true;
+usage:
+  cerr << "NgramFeatures is incorrect!\n";
+  return false;
+}
+
 class NgramDetectorImpl {
 
   // returns the number of unscored words at the left edge of a span
@@ -264,10 +297,10 @@ class NgramDetectorImpl {
   }
 
  public:
-  explicit NgramDetectorImpl(bool explicit_markers) :
+  explicit NgramDetectorImpl(bool explicit_markers, unsigned order) :
       kCDEC_UNK(TD::Convert("<unk>")) ,
       add_sos_eos_(!explicit_markers) {
-    order_ = 3;
+    order_ = order;
     state_size_ = (order_ - 1) * sizeof(WordID) + 2 + (order_ - 1) * sizeof(WordID);
     unscored_size_offset_ = (order_ - 1) * sizeof(WordID);
     is_complete_offset_ = unscored_size_offset_ + 1;
@@ -316,8 +349,10 @@ class NgramDetectorImpl {
 
 NgramDetector::NgramDetector(const string& param) {
   string filename, mapfile, featname;
-  bool explicit_markers = (param == "-x");
-  pimpl_ = new NgramDetectorImpl(explicit_markers);
+  bool explicit_markers = false;
+  unsigned order = 3;
+  ParseArgs(param, &explicit_markers, &order);
+  pimpl_ = new NgramDetectorImpl(explicit_markers, order);
   SetStateSize(pimpl_->ReserveStateSize());
 }
 
-- 
cgit v1.2.3


From 9399d6e1f1112d67dd842086a3225387ea55725c Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 8 Mar 2012 01:46:32 -0500
Subject: simple context feature for tagger

---
 decoder/Makefile.am       |   1 +
 decoder/cdec_ff.cc        |   2 +
 decoder/ff_context.cc     |  99 +++++++++++++++++++++++
 decoder/ff_context.h      |  23 ++++++
 gi/pf/align-tl.cc         |   6 +-
 gi/pf/reachability.cc     |   2 +
 gi/pf/reachability.h      |   6 +-
 gi/pf/transliterations.cc | 198 ++++++++++++++--------------------------------
 gi/pf/transliterations.h  |   5 +-
 9 files changed, 194 insertions(+), 148 deletions(-)
 create mode 100644 decoder/ff_context.cc
 create mode 100644 decoder/ff_context.h

(limited to 'decoder')

diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 30eaf04d..a00b18af 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -63,6 +63,7 @@ libcdec_a_SOURCES = \
   ff.cc \
   ff_rules.cc \
   ff_wordset.cc \
+  ff_context.cc \
   ff_charset.cc \
   ff_lm.cc \
   ff_klm.cc \
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 4ce5749e..b516c386 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -1,6 +1,7 @@
 #include <boost/shared_ptr.hpp>
 
 #include "ff.h"
+#include "ff_context.h"
 #include "ff_spans.h"
 #include "ff_lm.h"
 #include "ff_klm.h"
@@ -42,6 +43,7 @@ void register_feature_functions() {
 #endif
   ff_registry.Register("SpanFeatures", new FFFactory<SpanFeatures>());
   ff_registry.Register("NgramFeatures", new FFFactory<NgramDetector>());
+  ff_registry.Register("RuleContextFeatures", new FFFactory<RuleContextFeatures>());
   ff_registry.Register("RuleIdentityFeatures", new FFFactory<RuleIdentityFeatures>());
   ff_registry.Register("SourceSyntaxFeatures", new FFFactory<SourceSyntaxFeatures>);
   ff_registry.Register("SourceSpanSizeFeatures", new FFFactory<SourceSpanSizeFeatures>);
diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc
new file mode 100644
index 00000000..19f9a413
--- /dev/null
+++ b/decoder/ff_context.cc
@@ -0,0 +1,99 @@
+#include "ff_context.h"
+
+#include <sstream>
+#include <cassert>
+#include <cmath>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "sentence_metadata.h"
+#include "lattice.h"
+#include "fdict.h"
+#include "verbose.h"
+
+using namespace std;
+
+namespace {
+  // Returns a copy of x in which every '=' and every ';' is replaced by '_'.
+  string Escape(const string& x) {
+    string out(x);
+    for (string::iterator c = out.begin(); c != out.end(); ++c) {
+      if (*c == '=' || *c == ';') *c = '_';
+    }
+    return out;
+  }
+}
+
+RuleContextFeatures::RuleContextFeatures(const std::string& param) {  // caches the ids of the "<s>" / "</s>" tokens
+  kSOS = TD::Convert("<s>");   // used when there is no input token before the current position
+  kEOS = TD::Convert("</s>");  // used when there is no input token after the current span
+
+  // TODO: param (a string passed in from the cdec.ini file) is not used yet
+}
+
+void RuleContextFeatures::PrepareForInput(const SentenceMetadata& smeta) {  // copies the source sentence into current_input
+  const Lattice& sl = smeta.GetSourceLattice();
+  current_input.resize(sl.size());
+  for (unsigned i = 0; i < sl.size(); ++i) {
+    if (sl[i].size() != 1) {  // more (or fewer) than one arc at this position: not a plain sentence
+      cerr << "Context features not supported with lattice inputs!\nid=" << smeta.GetSentenceId() << endl;
+      abort();
+    }
+    current_input[i] = sl[i][0].label;  // label of the single arc at position i
+  }
+}
+
+void RuleContextFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                                const Hypergraph::Edge& edge,
+                                                const vector<const void*>& ant_contexts,
+                                                SparseVector<double>* features,
+                                                SparseVector<double>* estimated_features,
+                                                void* context) const {  // adds one "C1:prev_cur|label" feature for arity-0, single-token rules
+  const TRule& rule = *edge.rule_;
+
+  if (rule.Arity() != 0 || // arity = 0, no nonterminals
+      rule.e_.size() != 1) return; // size = 1, predicted label is a single token
+
+
+  // you can see the current label "for free"
+  const WordID cur_label = rule.e_[0];
+  // (if you want to see more labels, you have to be very careful, and muck
+  //  about with contexts and ant_contexts)
+
+  // but... you can look at as much of the source as you want!
+  const int from_src_index = edge.i_;   // start of the span in the input being labeled
+  const int to_src_index = edge.j_;     // end of the span in the input
+  // (note: in the case of tagging the size of the spans being labeled will
+  //  always be 1, but in other formalisms, you can have bigger spans.)
+
+  // this is the current token being labeled:
+  const WordID cur_input = current_input[from_src_index];
+
+  // let's get the previous token in the input (may be to the left of the start
+  // of the sentence!)
+  WordID prev_input = kSOS;
+  if (from_src_index > 0) { prev_input = current_input[from_src_index - 1]; }
+  // let's get the next token (may be to the right of the end of the sentence!)
+  WordID next_input = kEOS;
+  if (to_src_index < current_input.size()) { next_input = current_input[to_src_index]; }  // next_input is not used in the feature below
+
+  // now, build a feature string
+  ostringstream os;
+  // TD::Convert converts from the internal integer representation of a token
+  // to the actual token
+  os << "C1:" << TD::Convert(prev_input) << '_' 
+     << TD::Convert(cur_input) << '|' << TD::Convert(cur_label);
+  // C1 is just to prevent a name clash
+
+  // pick a value
+  double fval = 1.0; // can be any real value
+
+  // add it to the feature vector. FD::Convert converts the feature string to a
+  // feature int; Escape makes sure the feature string doesn't have any bad
+  // symbols that could confuse a parser somewhere
+  features->add_value(FD::Convert(Escape(os.str())), fval);
+  // that's it!
+
+  // create more features if you like...
+}
+
diff --git a/decoder/ff_context.h b/decoder/ff_context.h
new file mode 100644
index 00000000..0d22b027
--- /dev/null
+++ b/decoder/ff_context.h
@@ -0,0 +1,23 @@
+#ifndef _FF_CONTEXT_H_
+#define _FF_CONTEXT_H_
+
+#include <vector>
+#include "ff.h"
+
+class RuleContextFeatures : public FeatureFunction {  // feature function over the source tokens around the span a rule labels
+ public:
+  RuleContextFeatures(const std::string& param);  // param is currently ignored by the implementation
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const;  // writes to *features only; reads current_input
+  virtual void PrepareForInput(const SentenceMetadata& smeta);  // refreshes current_input; aborts on lattice input
+ private:
+  std::vector<WordID> current_input;  // source tokens of the current sentence, one per position
+  WordID kSOS, kEOS;  // ids of "<s>" and "</s>", set in the constructor
+};
+
+#endif
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
index 0e0454e5..6bb8c886 100644
--- a/gi/pf/align-tl.cc
+++ b/gi/pf/align-tl.cc
@@ -310,18 +310,16 @@ int main(int argc, char** argv) {
   // TODO CONFIGURE THIS
   int min_trans_src = 4;
 
-  cerr << "Initializing transliteration DPs ...\n";
+  cerr << "Initializing transliteration graph structures ...\n";
   for (int i = 0; i < corpus.size(); ++i) {
     const vector<int>& src = corpus[i].src;
     const vector<int>& trg = corpus[i].trg;
-    cerr << '.' << flush;
-    if (i % 80 == 79) cerr << endl;
     for (int j = 0; j < src.size(); ++j) {
       const vector<int>& src_let = letters[src[j]];
       for (int k = 0; k < trg.size(); ++k) {
         const vector<int>& trg_let = letters[trg[k]];
         if (src_let.size() < min_trans_src)
-          tl.Forbid(src[j], trg[k]);
+          tl.Forbid(src[j], src_let, trg[k], trg_let);
         else
           tl.Initialize(src[j], src_let, trg[k], trg_let);
       }
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index 73dd8d39..70fb76da 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -47,6 +47,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
           r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
           int src_delta = i - prevs[k].prev_src_covered;
           edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
+          valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair<short,short>(src_delta,j - prevs[k].prev_trg_covered));
           short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
           if (src_delta > msd) msd = src_delta;
         }
@@ -56,6 +57,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
     assert(!edges[0][0][0][1]);
     assert(!edges[0][0][0][0]);
     assert(max_src_delta[0][0] > 0);
+    cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n";
     //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n";
     //for (int i = 0; i < b[0][0].size(); ++i) {
     //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n";
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index 98450ec1..fb2f4965 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -12,12 +12,14 @@
 // currently forbids 0 -> n and n -> 0 alignments
 
 struct Reachability {
-  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring?
+  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
   boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+  boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
 
   Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
       edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
-      max_src_delta(boost::extents[srclen][trglen]) {
+      max_src_delta(boost::extents[srclen][trglen]),
+      valid_deltas(boost::extents[srclen][trglen]) {
     ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
   }
 
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index 6e0c2e93..e29334fd 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -2,173 +2,92 @@
 
 #include <iostream>
 #include <vector>
-#include <tr1/unordered_map>
 
-#include "grammar.h"
-#include "bottom_up_parser.h"
-#include "hg.h"
-#include "hg_intersect.h"
+#include "boost/shared_ptr.hpp"
+
 #include "filelib.h"
 #include "ccrp.h"
 #include "m.h"
-#include "lattice.h"
-#include "verbose.h"
+#include "reachability.h"
 
 using namespace std;
 using namespace std::tr1;
 
-static WordID kX;
-static int kMAX_SRC_SIZE = 0;
-static vector<vector<WordID> > cur_trg_chunks;
-
-vector<GrammarIter*> tlttofreelist;
-
-static void InitTargetChunks(int max_size, const vector<WordID>& trg) {
-  cur_trg_chunks.clear();
-  vector<WordID> tmp;
-  unordered_set<vector<WordID>, boost::hash<vector<WordID> > > u;
-  for (int len = 1; len <= max_size; ++len) {
-    int end = trg.size() + 1;
-    end -= len;
-    for (int i = 0; i < end; ++i) {
-      tmp.clear();
-      for (int j = 0; j < len; ++j)
-        tmp.push_back(trg[i + j]);
-      if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp);
-    }
-  }
-}
-
-struct TransliterationGrammarIter : public GrammarIter, public RuleBin {
-  TransliterationGrammarIter() { tlttofreelist.push_back(this); }
-  TransliterationGrammarIter(const TRulePtr& inr, int symbol) {
-    if (inr) {
-      r.reset(new TRule(*inr));
-    } else {
-      r.reset(new TRule);
-    }
-    TRule& rr = *r;
-    rr.lhs_ = kX;
-    rr.f_.push_back(symbol);
-    tlttofreelist.push_back(this);
-  }
-  virtual int GetNumRules() const {
-    if (!r) return 0;
-    return cur_trg_chunks.size();
-  }
-  virtual TRulePtr GetIthRule(int i) const {
-    TRulePtr nr(new TRule(*r));
-    nr->e_ = cur_trg_chunks[i];
-    //cerr << nr->AsString() << endl;
-    return nr;
-  }
-  virtual int Arity() const {
-    return 0;
-  }
-  virtual const RuleBin* GetRules() const {
-    if (!r) return NULL; else return this;
-  }
-  virtual const GrammarIter* Extend(int symbol) const {
-    if (symbol <= 0) return NULL;
-    if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE)
-      return new TransliterationGrammarIter(r, symbol);
-    else
-      return NULL;
-  }
-  TRulePtr r;
-};
-
-struct TransliterationGrammar : public Grammar {
-  virtual const GrammarIter* GetRoot() const {
-    return new TransliterationGrammarIter;
-  }
-  virtual bool HasRuleForSpan(int, int, int distance) const {
-    return (distance < kMAX_SRC_SIZE);
-  }
-};
-
-struct TInfo {
-  TInfo() : initialized(false) {}
+struct GraphStructure {
+  GraphStructure() : initialized(false) {}
+  boost::shared_ptr<Reachability> r;
   bool initialized;
-  Hypergraph lattice;   // may be empty if transliteration is not possible
-  prob_t est_prob;      // will be zero if not possible
 };
 
 struct TransliterationsImpl {
   TransliterationsImpl() {
-    kX = TD::Convert("X")*-1;
-    kMAX_SRC_SIZE = 4;
-    grammars.push_back(GrammarPtr(new TransliterationGrammar));
-    grammars.push_back(GrammarPtr(new GlueGrammar("S", "X")));
-    SetSilent(true);
   }
 
   void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
-    if (src >= graphs.size()) graphs.resize(src + 1);
-    if (graphs[src][trg].initialized) return;
-    int kMAX_TRG_SIZE = 4;
-    InitTargetChunks(kMAX_TRG_SIZE, trg_lets);
-    ExhaustiveBottomUpParser parser("S", grammars);
-    Lattice lat(src_lets.size()), tlat(trg_lets.size());
-    for (unsigned i = 0; i < src_lets.size(); ++i)
-      lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1));
-    for (unsigned i = 0; i < trg_lets.size(); ++i)
-      tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1));
-    //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl;
-    //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl;
-    if (!parser.Parse(lat, &graphs[src][trg].lattice)) {
-      //cerr << "Failed to parse " << TD::GetString(src_lets) << endl;
-      abort();
-    }
-    if (HG::Intersect(tlat, &graphs[src][trg].lattice)) {
-      graphs[src][trg].est_prob = prob_t(1e-4);
+    const size_t src_len = src_lets.size();
+    const size_t trg_len = trg_lets.size();
+    if (src_len >= graphs.size()) graphs.resize(src_len + 1);
+    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
+    if (graphs[src_len][trg_len].initialized) return;
+    graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4));
+
+#if 0
+    if (HG::Intersect(tlat, &hg)) {
+      // TODO
     } else {
-      graphs[src][trg].lattice.clear();
-      //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl;
-      graphs[src][trg].est_prob = prob_t::Zero();
+      cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl;
+      hg.clear();
     }
-    for (unsigned i = 0; i < tlttofreelist.size(); ++i)
-      delete tlttofreelist[i];
-    tlttofreelist.clear();
     //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl;
-    graphs[src][trg].initialized = true;
+#endif
+    graphs[src_len][trg_len].initialized = true;
   }
 
-  const prob_t& EstimateProbability(WordID src, WordID trg) const {
-    assert(src < graphs.size());
-    const unordered_map<WordID, TInfo>& um = graphs[src];
-    const unordered_map<WordID, TInfo>::const_iterator it = um.find(trg);
-    assert(it != um.end());
-    assert(it->second.initialized);
-    return it->second.est_prob;
+  void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+    const size_t src_len = src_lets.size();
+    const size_t trg_len = trg_lets.size();
+    if (src_len >= graphs.size()) graphs.resize(src_len + 1);
+    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
+    graphs[src_len][trg_len].r.reset();
+    graphs[src_len][trg_len].initialized = true;
   }
 
-  void Forbid(WordID src, WordID trg) {
-    if (src >= graphs.size()) graphs.resize(src + 1);
-    graphs[src][trg].est_prob = prob_t::Zero();
-    graphs[src][trg].initialized = true;
+  prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
+    assert(src.size() < graphs.size());
+    const vector<GraphStructure>& tv = graphs[src.size()];
+    assert(trg.size() < tv.size());
+    const GraphStructure& gs = tv[trg.size()];
+    // TODO: do prob
+    return prob_t::Zero();
   }
 
   void GraphSummary() const {
-    double tlp = 0;
-    int tt = 0;
+    double to = 0;
+    double tn = 0;
+    double tt = 0;
     for (int i = 0; i < graphs.size(); ++i) {
-      const unordered_map<WordID, TInfo>& um = graphs[i];
-      unordered_map<WordID, TInfo>::const_iterator it;
-      for (it = um.begin(); it != um.end(); ++it) {
-        if (it->second.lattice.empty()) continue;
-        //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl;
-        tlp += log(it->second.lattice.NumberOfPaths());
+      const vector<GraphStructure>& vt = graphs[i];
+      for (int j = 0; j < vt.size(); ++j) {
+        const GraphStructure& gs = vt[j];
+        if (!gs.r) continue;
         tt++;
+        for (int k = 0; k < i; ++k) {
+          for (int l = 0; l < j; ++l) {
+            size_t c = gs.r->valid_deltas[k][l].size();
+            if (c) {
+              tn += 1;
+              to += c;
+            }
+          }
+        }
       }
     }
-    tlp /= tt;
-    cerr << "E[log paths] = " << tlp << endl;
-    cerr << "exp(E[log paths]) = " << exp(tlp) << endl;
+    cerr << "     Average nodes = " << (tn / tt) << endl;
+    cerr << "Average out-degree = " << (to / tn) << endl;
+    cerr << " Unique structures = " << tt << endl;
   }
 
-  vector<unordered_map<WordID, TInfo> > graphs;
-  vector<GrammarPtr> grammars;
+  vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
 };
 
 Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {}
@@ -178,16 +97,15 @@ void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, Wo
   pimpl_->Initialize(src, src_lets, trg, trg_lets);
 }
 
-prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const {
-  return pimpl_->EstimateProbability(src,trg);
+prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
+  return pimpl_->EstimateProbability(s, src,t, trg);
 }
 
-void Transliterations::Forbid(WordID src, WordID trg) {
-  pimpl_->Forbid(src, trg);
+void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
+  pimpl_->Forbid(src, src_lets, trg, trg_lets);
 }
 
 void Transliterations::GraphSummary() const {
   pimpl_->GraphSummary();
 }
 
-
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
index a548aacf..76eb2a05 100644
--- a/gi/pf/transliterations.h
+++ b/gi/pf/transliterations.h
@@ -10,9 +10,10 @@ struct Transliterations {
   explicit Transliterations();
   ~Transliterations();
   void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
-  void Forbid(WordID src, WordID trg);
+  void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
   void GraphSummary() const;
-  prob_t EstimateProbability(WordID src, WordID trg) const;
+  prob_t EstimateProbability(WordID s, const std::vector<WordID>& src, WordID t, const std::vector<WordID>& trg) const;
+ private:
   TransliterationsImpl* pimpl_;
 };
 
-- 
cgit v1.2.3


From a45af4a3704531a8382cd231f6445b3a33b598a3 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 10 Mar 2012 16:42:12 -0500
Subject: frequency-based binning

---
 decoder/Makefile.am        |  1 -
 decoder/ff_csplit.cc       |  2 +-
 decoder/freqdict.cc        | 29 -----------------------------
 decoder/freqdict.h         | 37 ++++++++++++++++++++++++++++++++-----
 gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++-------
 gi/pf/make-freq-bins.pl    | 26 ++++++++++++++++++++++++++
 gi/pf/pyp_tm.cc            | 24 +++++++++++++++++-------
 gi/pf/pyp_tm.h             |  7 ++++---
 8 files changed, 97 insertions(+), 53 deletions(-)
 delete mode 100644 decoder/freqdict.cc
 create mode 100755 gi/pf/make-freq-bins.pl

(limited to 'decoder')

diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index a00b18af..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -76,7 +76,6 @@ libcdec_a_SOURCES = \
   ff_source_syntax.cc \
   ff_bleu.cc \
   ff_factory.cc \
-  freqdict.cc \
   lexalign.cc \
   lextrans.cc \
   tagger.cc \
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 3991d38f..c9ed996c 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl {
   const int fl1_;
   const int fl2_;
   const int bad_;
-  FreqDict freq_dict_;
+  FreqDict<float> freq_dict_;
   set<WordID> bad_words_;
 };
 
diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
-  cerr << "Reading word frequencies: " << fname << endl;
-  ReadFile rf(fname);
-  istream& ifs = *rf.stream();
-  int cc=0;
-  while (ifs) {
-    std::string word;
-    ifs >> word;
-    if (word.size() == 0) continue;
-    if (word[0] == '#') continue;
-    double count = 0;
-    ifs >> count;
-    assert(count > 0.0);  // use -log(f)
-    counts_[TD::Convert(word)]=count;
-    ++cc;
-    if (cc % 10000 == 0) { std::cerr << "."; }
-  }
-  std::cerr << "\n";
-  std::cerr << "Loaded " << cc << " words\n";
-}
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,62 @@
 #ifndef _FREQDICT_H_
 #define _FREQDICT_H_
 
+#include <iostream>
 #include <map>
 #include <string>
 #include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
 
+// Maps WordIDs to a per-word statistic of type T (e.g. a frequency or a
+// frequency-bin index) read from a file of "word value" pairs.
+template <typename T = float>
 class FreqDict {
  public:
-  void Load(const std::string& fname);
-  float LookUp(const WordID& word) const {
-    std::map<WordID,float>::const_iterator i = counts_.find(word);
-    if (i == counts_.end()) return 0;
+  FreqDict() : max_() {}
+
+  // Largest value read by Load() so far (value-initialized T until a larger
+  // value has been read).
+  T Max() const { return max_; }
+
+  // Reads whitespace-separated "word value" pairs from fname. A later value
+  // for the same word overwrites the earlier one. A token that starts with
+  // '#' is skipped (only that token, not the remainder of its line).
+  void Load(const std::string& fname) {
+    std::cerr << "Reading word statistics from: " << fname << std::endl;
+    ReadFile rf(fname);
+    std::istream& ifs = *rf.stream();
+    int cc=0;
+    std::string word;
+    // Loop on the extraction itself, not on the stream state checked before
+    // reading: a failed read at end of input leaves `word` unchanged, which
+    // would otherwise re-store the final word with a value of 0.
+    while (ifs >> word) {
+      if (word[0] == '#') continue;
+      T count = 0;
+      if (!(ifs >> count)) {
+        std::cerr << "\nMissing or malformed value for '" << word << "'; stopping\n";
+        break;
+      }
+      if (count > max_) max_ = count;
+      counts_[TD::Convert(word)]=count;
+      ++cc;
+      if (cc % 10000 == 0) { std::cerr << "."; }
+    }
+    std::cerr << "\n";
+    std::cerr << "Loaded " << cc << " words\n";
+  }
+
+  // Returns the value stored for word, or a value-initialized T if the word
+  // has no entry.
+  T LookUp(const WordID& word) const {
+    typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+    if (i == counts_.end()) return T();
     return i->second;
   }
  private:
-  std::map<WordID, float> counts_;
+  T max_;  // largest value read by Load()
+  std::map<WordID, T> counts_;  // word -> value
 };
 
 #endif
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 6c054753..942dcf51 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+        ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+        ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?")
         ("input,i",po::value<string>(),"Read parallel data from")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
@@ -59,9 +62,13 @@ struct AlignedSentencePair {
 };
 
 struct Aligner {
-  Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+  Aligner(const vector<vector<WordID> >& lets,
+          int num_letters,
+          const po::variables_map& conf,
+          vector<AlignedSentencePair>* c) :
       corpus(*c),
-      paj_model(4, 0.08),
+      paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+      infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
       model(lets, num_letters),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
@@ -69,12 +76,13 @@ struct Aligner {
 
   vector<AlignedSentencePair>& corpus;
   QuasiModel2 paj_model;
+  const bool infer_paj;
   PYPLexicalTranslation model;
   const WordID kNULL;
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
-    paj_model.ResampleHyperparameters(prng);
+    if (infer_paj) paj_model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
@@ -117,8 +125,6 @@ struct Aligner {
         paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << Likelihood() << "    \t(Amodel=" << paj_model.Likelihood()
-         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   prob_t Likelihood() const {
@@ -211,13 +217,17 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  Aligner aligner(letters, letset.size(), &corpus);
+  Aligner aligner(letters, letset.size(), conf, &corpus);
   aligner.InitializeRandom();
 
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 10 == 9) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) {
+      aligner.ResampleHyperparameters();
+      cerr << "LLH = " << aligner.Likelihood() << "    \t(Amodel=" << aligner.paj_model.Likelihood()
+           << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+    }
     aligner.ResampleCorpus();
     if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Reads whitespace-tokenized text (files named on the command line, or STDIN)
+# and prints one "word bin" line per vocabulary item, most frequent first.
+# bin = max(0, -int(log_BASE(relative frequency) + CUTOFF)).
+my $BASE = 6;
+my $CUTOFF = 3;
+
+my %d;
+my $num = 0;
+while(<>){
+ chomp;
+ # split ' ' ignores leading whitespace, so indented lines add no empty "word"
+ for my $w (split ' ') {$d{$w}++; $num++;}
+}
+
+# most frequent first; ties broken by spelling so the output is reproducible
+my @vocab = sort {$d{$b} <=> $d{$a} || $a cmp $b} keys %d;
+
+for my $w (@vocab) {
+  my $nl = -int(log($d{$w} / $num) / log($BASE) + $CUTOFF);
+  if ($nl < 0) { $nl = 0; }
+  print "$w $nl\n";
+}
+
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 34ef0ba2..e21f0267 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -4,9 +4,6 @@
 #include <iostream>
 #include <queue>
 
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
 #include "tdict.h"
 #include "ccrp.h"
 #include "pyp_word_model.h"
@@ -15,9 +12,19 @@
 using namespace std;
 using namespace std::tr1;
 
-template <typename Base>
+struct FreqBinner {  // assigns each word the frequency bin stored for it in a FreqDict<unsigned>
+  explicit FreqBinner(const std::string& fname) { fd_.Load(fname); }  // explicit: no silent string -> binner conversion
+  unsigned NumberOfBins() const { return fd_.Max() + 1; }  // bin ids run 0..Max(), hence Max()+1 bins
+  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }  // a word with no entry gets LookUp's default, 0
+  FreqDict<unsigned> fd_;  // word -> bin index, filled by Load(fname)
+};
+
+template <typename Base, class Binner = FreqBinner>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
+  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
+      base(*b),
+      binner(bnr),
+      btr(binner ? binner->NumberOfBins() + 1u : 2u) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -46,7 +53,9 @@ struct ConditionalPYPWordModel {
     if (it == r.end()) {
       it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
       static const WordID kNULL = TD::Convert("NULL");
-      btr.Add(src == kNULL ? 0 : 1, &it->second);
+      unsigned bin = (src == kNULL ? 0 : 1);
+      if (binner && bin) { bin = binner->Bin(src) + 1; }
+      btr.Add(bin, &it->second);
     }
     if (it->second.increment(trglets, base(trglets), rng))
       base.Increment(trglets, rng);
@@ -75,6 +84,7 @@ struct ConditionalPYPWordModel {
 
   // TODO tie PYP hyperparameters based on source word frequency bins
   Base& base;
+  const Binner* binner;
   BinTiedResampler<CCRP<vector<WordID> > > btr;
   typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
   RuleModelHash r;
@@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets
                                              const unsigned num_letters) :
     letters(lets),
     up0(new PYPWordModel(num_letters)),
-    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))),
     kX(-TD::Convert("X")) {}
 
 void PYPLexicalTranslation::Summary() const {
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
index fa0fb28f..63e7c96d 100644
--- a/gi/pf/pyp_tm.h
+++ b/gi/pf/pyp_tm.h
@@ -5,10 +5,11 @@
 #include "wordid.h"
 #include "prob.h"
 #include "sampler.h"
+#include "freqdict.h"
 
-struct TRule;
+struct FreqBinner;
 struct PYPWordModel;
-template <typename T> struct ConditionalPYPWordModel;
+template <typename T, class B> struct ConditionalPYPWordModel;
 
 struct PYPLexicalTranslation {
   explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
@@ -26,7 +27,7 @@ struct PYPLexicalTranslation {
  private:
   const std::vector<std::vector<WordID> >& letters;   // spelling dictionary
   PYPWordModel* up0;  // base distribuction (model English word)
-  ConditionalPYPWordModel<PYPWordModel>* tmodel;  // translation distributions
+  ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel;  // translation distributions
                       // (model English word | French word)
   const WordID kX;
 };
-- 
cgit v1.2.3