From e1eae0ac941aa76528d4673dbd35f214cdac23fb Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 17:19:40 -0500 Subject: remove some dead code to clean things up --- decoder/1dev.ur | 1 - decoder/apply_fsa_models.README | 21 ------ decoder/cdec-gz.ini | 7 -- decoder/cdec-nolm-tuned.ini | 7 -- decoder/decode.sh | 10 --- decoder/do.tests.sh | 1 - decoder/fsa-decode.sh | 3 - decoder/fsa-hiero.ini | 5 -- decoder/fsa.ini | 2 - decoder/glue-lda.scfg | 8 --- decoder/grammar.hiero | 151 ---------------------------------------- decoder/perro.sh | 1 - decoder/perro.ur | 1 - decoder/short.ur | 1 - decoder/weights-fsa | 14 ---- decoder/weights.hiero | 10 --- 16 files changed, 243 deletions(-) delete mode 100755 decoder/1dev.ur delete mode 100755 decoder/apply_fsa_models.README delete mode 100755 decoder/cdec-gz.ini delete mode 100755 decoder/cdec-nolm-tuned.ini delete mode 100755 decoder/decode.sh delete mode 100755 decoder/do.tests.sh delete mode 100755 decoder/fsa-decode.sh delete mode 100755 decoder/fsa-hiero.ini delete mode 100755 decoder/fsa.ini delete mode 100755 decoder/glue-lda.scfg delete mode 100755 decoder/grammar.hiero delete mode 100755 decoder/perro.sh delete mode 100755 decoder/perro.ur delete mode 100755 decoder/short.ur delete mode 100644 decoder/weights-fsa delete mode 100755 decoder/weights.hiero (limited to 'decoder') diff --git a/decoder/1dev.ur b/decoder/1dev.ur deleted file mode 100755 index adeaa101..00000000 --- a/decoder/1dev.ur +++ /dev/null @@ -1 +0,0 @@ -krAcy ( AstRAf rpwrtRr ) krAcy myN pyr kw mxtlf HAdvAt myN xAtwn smyt 4 AfrAd hlAk hw gyY jbkh smndr sY Ayk $xS ky lA$ mly . diff --git a/decoder/apply_fsa_models.README b/decoder/apply_fsa_models.README deleted file mode 100755 index 7e116a62..00000000 --- a/decoder/apply_fsa_models.README +++ /dev/null @@ -1,21 +0,0 @@ -trie root and trie lhs2[lhs-nodeid] -> trie node - -trie node edges (adj) - list of w,dest,p. dest==0 means it's a completed rule (note: p is redundant with node e.dest->p-p, except in case of dest=0). we will also use null_wordid (max_int) for dest=0 edges, but that doesn't matter - -we intersect by iterating over adj and scoring w/ fsa. TODO: index for sparse fsa; for now we assume smoothed ngram fsa where all items are scorable. - -predicted items: we don't make copies of the pending predictions as we scan toward completion; instead, item backpointers are followed until the prediction (where backpointer=0). such backpointer=0 items have a queue of prediction-originating items. - -reusing completed items using a lookup on pair [NT,a] -> all [NT,a,b] lazy best-first. b-next (right state) index in lazy index. - -perhaps predictors need to register the # of items it has already mated with. (b-next index) - -comb-like (cube) t-next (position in trie node edge list), b-next? or just check chart and don't redup. depends on whether we want just 1best or kbest deriv - diff. ways of reaching same result are good in kbest. - -types of chart items: - -A->t.*,a,b (trie node t) with mutable state t-next for generating successor lazily (vs. all at once) - -A->t.B,a,b (t-next of A->t.* points to (B,t')): mutable state b-next for choosing which B->b,? to use. note: such an item can't be queued immediately on its own, but can be added to the pending list of B->b,? ; once any B->b,? is completed then we see if any more b-next are already known; if they're exhausted then we add back to pending list? - -A->a,? - list of all known (b,inside prob) such that A[a,b]. we may also choose to represent this as A->.*,a,a. diff --git a/decoder/cdec-gz.ini b/decoder/cdec-gz.ini deleted file mode 100755 index f9b15420..00000000 --- a/decoder/cdec-gz.ini +++ /dev/null @@ -1,7 +0,0 @@ -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=ArityPenalty -add_pass_through_rules=true -formalism=scfg -grammar=mt09.grammar.gz -weights=weights.tune.nolm diff --git a/decoder/cdec-nolm-tuned.ini b/decoder/cdec-nolm-tuned.ini deleted file mode 100755 index 5ebab747..00000000 --- a/decoder/cdec-nolm-tuned.ini +++ /dev/null @@ -1,7 +0,0 @@ -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=ArityPenalty -add_pass_through_rules=true -formalism=scfg -grammar=mt09.grammar -weights=weights.tune.nolm diff --git a/decoder/decode.sh b/decoder/decode.sh deleted file mode 100755 index 677e64ad..00000000 --- a/decoder/decode.sh +++ /dev/null @@ -1,10 +0,0 @@ -d=$(dirname `readlink -f $0`)/ -decode() { -if [ "$lm" ] ; then - lmargs0=-F - lmargs1="LanguageModel lm.gz -n LM" -fi -set -x -$gdb ${cdec:=$d/cdec} -c $d/${cfg:=cdec-fsa}.ini -i $d/${in:=1dev.ur} $lmargs0 "$lmargs1" --show_features --show_config --show_weights "$@" -set +x -} diff --git a/decoder/do.tests.sh b/decoder/do.tests.sh deleted file mode 100755 index b3ddeb18..00000000 --- a/decoder/do.tests.sh +++ /dev/null @@ -1 +0,0 @@ -for f in *_test; do ./$f; done diff --git a/decoder/fsa-decode.sh b/decoder/fsa-decode.sh deleted file mode 100755 index 66879523..00000000 --- a/decoder/fsa-decode.sh +++ /dev/null @@ -1,3 +0,0 @@ -d=$(dirname `readlink -f $0`)/ -. $d/decode.sh -in=1dev.ur cfg=cdec-fsa decode diff --git a/decoder/fsa-hiero.ini b/decoder/fsa-hiero.ini deleted file mode 100755 index 7c7d0347..00000000 --- a/decoder/fsa-hiero.ini +++ /dev/null @@ -1,5 +0,0 @@ -formalism=scfg -scfg_extra_glue_grammar=glue-lda.scfg -grammar=grammar.hiero -show_tree_structure=true -weights=weights.hiero diff --git a/decoder/fsa.ini b/decoder/fsa.ini deleted file mode 100755 index 571a2e34..00000000 --- a/decoder/fsa.ini +++ /dev/null @@ -1,2 +0,0 @@ -feature_function=ShorterThanPrev -feature_function=LongerThanPrev diff --git a/decoder/glue-lda.scfg b/decoder/glue-lda.scfg deleted file mode 100755 index 27489817..00000000 --- a/decoder/glue-lda.scfg +++ /dev/null @@ -1,8 +0,0 @@ -[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X0,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X1,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X1,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X2,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X2,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X3,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X3,1] ||| [1] ||| GlueTop=1 diff --git a/decoder/grammar.hiero b/decoder/grammar.hiero deleted file mode 100755 index 79adf33a..00000000 --- a/decoder/grammar.hiero +++ /dev/null @@ -1,151 +0,0 @@ -[X] ||| . ||| . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] . ||| [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] anciano ||| [1] old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] anciano . ||| [1] old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] anciano [X,2] ||| [1] old man [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] feo ||| ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] feo . ||| ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] feo [X,2] ||| ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato ||| [1] cat ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato . ||| [1] cat . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] ||| [1] [2] cat ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] ||| [1] cat [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] . ||| [1] [2] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro ||| [1] black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro . ||| [1] black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro [X,2] ||| [1] black cat [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande ||| big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande . ||| big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande [X,2] ||| big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro ||| black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro . ||| black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro [X,2] ||| black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga ||| [1] caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga . ||| [1] caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga [X,2] ||| [1] caterpiller [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito [X,2] ||| [1] [2] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito [X,2] . ||| [1] [2] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo ||| [1] ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo . ||| [1] ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo [X,2] ||| [1] ugly duckling [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces ||| [1] fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces . ||| [1] fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces [X,2] ||| [1] fish [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro ||| [1] dog ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro . ||| [1] dog . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] ||| [1] dog [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] ||| [1] [2] dog ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] . ||| [1] [2] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande ||| [1] big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande . ||| [1] big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande [X,2] ||| [1] big dog [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro [X,2] ||| [1] [2] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro [X,2] . ||| [1] [2] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro ||| [1] black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro . ||| [1] black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro [X,2] ||| [1] black bird [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| anciano ||| old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| anciano . ||| old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| anciano [X,1] ||| old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| el ||| the ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] ||| the [1] ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] . ||| the [1] . ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo ||| the ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo . ||| the ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo [X,2] ||| the ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande ||| the big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande . ||| the big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande [X,2] ||| the big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro ||| the black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro . ||| the black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro [X,2] ||| the black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato ||| the cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato . ||| the cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] ||| the [1] cat ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] ||| the cat [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] . ||| the [1] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro ||| the black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro . ||| the black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro [X,1] ||| the black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito [X,1] ||| the [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito [X,1] . ||| the [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo ||| the ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo . ||| the ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo [X,1] ||| the ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro ||| the dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro . ||| the dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] ||| the [1] dog ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] ||| the dog [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] . ||| the [1] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande ||| the big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande . ||| the big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande [X,1] ||| the big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro [X,1] ||| the [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro [X,1] . ||| the [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro ||| the black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro . ||| the black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro [X,1] ||| the black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| eso ||| that ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso [X,1] ||| that [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso [X,1] . ||| that [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro ||| that dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro . ||| that dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro [X,1] ||| that dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este ||| this ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este [X,1] ||| this [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este [X,1] . ||| this [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este anciano ||| this old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este anciano . ||| this old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este anciano [X,1] ||| this old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este gato ||| this cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este gato . ||| this cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este gato [X,1] ||| this cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| feo ||| ugly ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato ||| cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato . ||| cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] ||| [1] cat ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] ||| cat [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] . ||| [1] cat . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro ||| black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro . ||| black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro [X,1] ||| black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| grande ||| big ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| la ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga ||| the caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga . ||| the caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga [X,1] ||| the caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces ||| the fish ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces . ||| the fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces [X,1] ||| the fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| negro ||| black ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga ||| caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga . ||| caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga [X,1] ||| caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito ||| duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito [X,1] ||| [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito [X,1] . ||| [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo ||| ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo . ||| ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo [X,1] ||| ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces ||| fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces . ||| fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces [X,1] ||| fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro ||| dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro . ||| dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] ||| [1] dog ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] ||| dog [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] . ||| [1] dog . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande ||| big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande . ||| big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande [X,1] ||| big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro ||| bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro [X,1] ||| [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro [X,1] . ||| [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro ||| black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro . ||| black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro [X,1] ||| black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 diff --git a/decoder/perro.sh b/decoder/perro.sh deleted file mode 100755 index 3e54ac71..00000000 --- a/decoder/perro.sh +++ /dev/null @@ -1 +0,0 @@ -$gdb $cdec "$@" -k 30 --show_features -c fsa-hiero.ini -i perro.ur diff --git a/decoder/perro.ur b/decoder/perro.ur deleted file mode 100755 index 6c5da6d7..00000000 --- a/decoder/perro.ur +++ /dev/null @@ -1 +0,0 @@ -eso perro feo diff --git a/decoder/short.ur b/decoder/short.ur deleted file mode 100755 index 48612801..00000000 --- a/decoder/short.ur +++ /dev/null @@ -1 +0,0 @@ -krAcy myN pyr kw mxtlf HAdvAt diff --git a/decoder/weights-fsa b/decoder/weights-fsa deleted file mode 100644 index 3cc96c2f..00000000 --- a/decoder/weights-fsa +++ /dev/null @@ -1,14 +0,0 @@ -Arity_0 1.70741473606976 -Arity_1 1.12426238048012 -Arity_2 1.14986187839554 -Glue -0.04589037041388 -LanguageModel 1.09051 -LM 1.09051 -PassThrough -3.66226367902928 -PhraseModel_0 -1.94633451863252 -PhraseModel_1 -0.1475347695476 -PhraseModel_2 -1.614818994946 -WordPenalty -3.0 -WordPenaltyFsa -0.56028442964748 -ShorterThanPrev -10 -LongerThanPrev -10 diff --git a/decoder/weights.hiero b/decoder/weights.hiero deleted file mode 100755 index 6747f059..00000000 --- a/decoder/weights.hiero +++ /dev/null @@ -1,10 +0,0 @@ -SameFirstLetter 1 -LongerThanPrev 1 -ShorterThanPrev 1 -GlueTop 0.0 -Glue -1.0 -EgivenF -0.5 -FgivenE -0.5 -LexEgivenF -0.5 -LexFgivenE -0.5 -LM 1 -- cgit v1.2.3 From a97f481dda269c1474722ce7dc987fb5868951b6 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 10 Feb 2012 14:13:24 -0500 Subject: Dear windows users, code is not executable --- decoder/apply_fsa_models.h | 0 decoder/cfg.cc | 0 decoder/cfg.h | 0 decoder/cfg_binarize.h | 0 decoder/cfg_format.h | 0 decoder/cfg_options.h | 0 decoder/cfg_test.cc | 0 decoder/ff_register.h | 0 decoder/ff_sample_fsa.h | 0 decoder/hg_cfg.h | 0 decoder/hg_test.h | 0 decoder/nt_span.h | 0 decoder/oracle_bleu.h | 0 decoder/program_options.h | 0 decoder/sentences.h | 0 15 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 decoder/apply_fsa_models.h mode change 100755 => 100644 decoder/cfg.cc mode change 100755 => 100644 decoder/cfg.h mode change 100755 => 100644 decoder/cfg_binarize.h mode change 100755 => 100644 decoder/cfg_format.h mode change 100755 => 100644 decoder/cfg_options.h mode change 100755 => 100644 decoder/cfg_test.cc mode change 100755 => 100644 decoder/ff_register.h mode change 100755 => 100644 decoder/ff_sample_fsa.h mode change 100755 => 100644 decoder/hg_cfg.h mode change 100755 => 100644 decoder/hg_test.h mode change 100755 => 100644 decoder/nt_span.h mode change 100755 => 100644 decoder/oracle_bleu.h mode change 100755 => 100644 decoder/program_options.h mode change 100755 => 100644 decoder/sentences.h (limited to 'decoder') diff --git a/decoder/apply_fsa_models.h b/decoder/apply_fsa_models.h old mode 100755 new mode 100644 diff --git a/decoder/cfg.cc b/decoder/cfg.cc old mode 100755 new mode 100644 diff --git a/decoder/cfg.h b/decoder/cfg.h old mode 100755 new mode 100644 diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h old mode 100755 new mode 100644 diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h old mode 100755 new mode 100644 diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h old mode 100755 new mode 100644 diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc old mode 100755 new mode 100644 diff --git a/decoder/ff_register.h b/decoder/ff_register.h old mode 100755 new mode 100644 diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h old mode 100755 new mode 100644 diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h old mode 100755 new mode 100644 diff --git a/decoder/hg_test.h b/decoder/hg_test.h old mode 100755 new mode 100644 diff --git a/decoder/nt_span.h b/decoder/nt_span.h old mode 100755 new mode 100644 diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h old mode 100755 new mode 100644 diff --git a/decoder/program_options.h b/decoder/program_options.h old mode 100755 new mode 100644 diff --git a/decoder/sentences.h b/decoder/sentences.h old mode 100755 new mode 100644 -- cgit v1.2.3 From feab7be095f6454f9ce3021190939ca64bf41e62 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 12 Feb 2012 16:46:46 -0500 Subject: Target-side only output format --- decoder/decoder.cc | 3 +++ decoder/hg_io.cc | 27 +++++++++++++++++++++++++++ decoder/hg_io.h | 3 +++ 3 files changed, 33 insertions(+) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 3b53fd6b..3394e0b8 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -408,6 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("show_partition,z", "Compute and show the partition (inside score)") ("show_conditional_prob", "Output the conditional log prob to STDOUT instead of a translation") ("show_cfg_search_space", "Show the search space as a CFG") + ("show_target_graph", "Output the target hypergraph") ("coarse_to_fine_beam_prune", po::value(), "Prune paths from coarse parse forest before fine parse, keeping paths within exp(alpha>=0)") ("ctf_beam_widen", po::value()->default_value(2.0), "Expand coarse pass beam by this factor if no fine parse is found") ("ctf_num_widenings", po::value()->default_value(2), "Widen coarse beam this many times before backing off to full parse") @@ -1017,6 +1018,8 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { } if (conf.count("show_cfg_search_space")) HypergraphIO::WriteAsCFG(forest); + if (conf.count("show_target_graph")) + HypergraphIO::WriteTarget(forest); if (has_ref) { if (HG::Intersect(ref, &forest)) { // if (crf_uniform_empirical) { diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index c1c93933..0283ec3c 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -624,3 +624,30 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) { } } +/* Output format: + * #vertices + * for each vertex in bottom-up topological order: + * #downward_edges + * for each downward edge: + * RHS with [vertex_index] for NTs ||| scores + */ +void HypergraphIO::WriteTarget(const Hypergraph& hg) { + cout << hg.nodes_.size() << '\n'; + for (unsigned int i = 0; i < hg.nodes_.size(); ++i) { + const Hypergraph::EdgesVector &edges = hg.nodes_[i].in_edges_; + cout << edges.size() << '\n'; + for (unsigned int j = 0; j < edges.size(); ++j) { + const Hypergraph::Edge &edge = hg.edges_[edges[j]]; + const std::vector &e = edge.rule_->e(); + for (std::vector::const_iterator word = e.begin(); word != e.end(); ++word) { + if (*word <= 0) { + cout << '[' << edge.tail_nodes_[-*word] << "] "; + } else { + cout << TD::Convert(*word) << ' '; + } + } + cout << "||| " << edge.rule_->scores_ << '\n'; + } + } +} + diff --git a/decoder/hg_io.h b/decoder/hg_io.h index 082489d8..44817157 100644 --- a/decoder/hg_io.h +++ b/decoder/hg_io.h @@ -23,6 +23,9 @@ struct HypergraphIO { static void WriteAsCFG(const Hypergraph& hg); + // Write only the target size information in bottom-up order. + static void WriteTarget(const Hypergraph& hg); + // serialization utils static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0); // return PLF string representation (undefined behavior on non-lattices) -- cgit v1.2.3 From e8583574e25c8ef09c9cd21cbc7421d9d12cf75f Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 12 Feb 2012 17:40:03 -0500 Subject: Might as well provide the edge count as well --- decoder/hg_io.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'decoder') diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index 0283ec3c..9f0f50fa 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -632,7 +632,7 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) { * RHS with [vertex_index] for NTs ||| scores */ void HypergraphIO::WriteTarget(const Hypergraph& hg) { - cout << hg.nodes_.size() << '\n'; + cout << hg.nodes_.size() << ' ' << hg.edges_.size() << '\n'; for (unsigned int i = 0; i < hg.nodes_.size(); ++i) { const Hypergraph::EdgesVector &edges = hg.nodes_[i].in_edges_; cout << edges.size() << '\n'; -- cgit v1.2.3 From c4ffa6df1fdd89e3db9c6d3829b7b84edac20bcf Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 19 Feb 2012 04:27:55 -0500 Subject: lbl preliminary clean up --- decoder/lattice.cc | 1 + training/lbl_model.cc | 84 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 30 deletions(-) (limited to 'decoder') diff --git a/decoder/lattice.cc b/decoder/lattice.cc index e3631e59..89da3cd0 100644 --- a/decoder/lattice.cc +++ b/decoder/lattice.cc @@ -46,6 +46,7 @@ void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) { Lattice& l = *pl; vector ids; TD::ConvertSentence(text, &ids); + l.clear(); l.resize(ids.size()); for (int i = 0; i < l.size(); ++i) l[i].push_back(LatticeArc(ids[i], 0.0, 1)); diff --git a/training/lbl_model.cc b/training/lbl_model.cc index 72d80a56..ccd29255 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -6,6 +6,7 @@ #else #include +#include #include #include @@ -20,10 +21,17 @@ namespace po = boost::program_options; using namespace std; +#define kDIMENSIONS 10 +typedef Eigen::Matrix RVector; +typedef Eigen::Matrix RTVector; +typedef Eigen::Matrix TMatrix; +vector r_src, r_trg; + bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("iterations,i",po::value()->default_value(5),"Number of iterations of training") + ("input,i",po::value(),"Input file") + ("iterations,I",po::value()->default_value(1000),"Number of iterations of training") ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); po::options_description clo("Command line options"); @@ -42,7 +50,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::notify(*conf); if (argc < 2 || conf->count("help")) { - cerr << "Usage " << argv[0] << " [OPTIONS] corpus.fr-en\n"; + cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; cerr << dcmdline_options << endl; return false; } @@ -52,33 +60,32 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; - const string fname = argv[argc - 1]; + const string fname = conf["input"].as(); const int ITERATIONS = conf["iterations"].as(); const double diagonal_tension = conf["diagonal_tension"].as(); + if (diagonal_tension < 0.0) { + cerr << "Invalid value for diagonal_tension: must be >= 0\n"; + return 1; + } string testset; if (conf.count("testset")) testset = conf["testset"].as(); - double tot_len_ratio = 0; - double mean_srclen_multiplier = 0; + int lc = 0; vector unnormed_a_i; - for (int iter = 0; iter < ITERATIONS; ++iter) { - cerr << "ITERATION " << (iter + 1) << endl; + string line; + string ssrc, strg; + bool flag = false; + Lattice src, trg; + set vocab_e; + { // read through corpus, initialize int map, check lines are good + cerr << "INITIAL READ OF " << fname << endl; ReadFile rf(fname); istream& in = *rf.stream(); - double likelihood = 0; - double denom = 0.0; - int lc = 0; - bool flag = false; - string line; - string ssrc, strg; - while(true) { - getline(in, line); - if (!in) break; + while(getline(in, line)) { ++lc; if (lc % 1000 == 0) { cerr << '.'; flag = true; } if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } ParseTranslatorInput(line, &ssrc, &strg); - Lattice src, trg; LatticeTools::ConvertTextToLattice(ssrc, &src); LatticeTools::ConvertTextToLattice(strg, &trg); if (src.size() == 0 || trg.size() == 0) { @@ -88,37 +95,54 @@ int main(int argc, char** argv) { } if (src.size() > unnormed_a_i.size()) unnormed_a_i.resize(src.size()); - if (iter == 0) - tot_len_ratio += static_cast(trg.size()) / static_cast(src.size()); + for (unsigned i = 0; i < trg.size(); ++i) { + assert(trg[i].size() == 1); + vocab_e.insert(trg[i][0].label); + } + } + } + if (flag) cerr << endl; + + // do optimization + for (int iter = 0; iter < ITERATIONS; ++iter) { + cerr << "ITERATION " << (iter + 1) << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + double likelihood = 0; + double denom = 0.0; + lc = 0; + flag = false; + while(true) { + getline(in, line); + if (!in) break; + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } + ParseTranslatorInput(line, &ssrc, &strg); + LatticeTools::ConvertTextToLattice(ssrc, &src); + LatticeTools::ConvertTextToLattice(strg, &trg); denom += trg.size(); vector probs(src.size() + 1); - bool first_al = true; // used for write_alignments for (int j = 0; j < trg.size(); ++j) { const WordID& f_j = trg[j][0].label; double sum = 0; const double j_over_ts = double(j) / trg.size(); - double prob_a_i = 1.0 / src.size(); double az = 0; for (int ta = 0; ta < src.size(); ++ta) { unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); az += unnormed_a_i[ta]; } for (int i = 1; i <= src.size(); ++i) { - prob_a_i = unnormed_a_i[i-1] / az; + const double prob_a_i = unnormed_a_i[i-1] / az; + // TODO probs[i] = 1; // tt.prob(src[i-1][0].label, f_j) * prob_a_i; sum += probs[i]; } } } - - // log(e) = 1.0 - double base2_likelihood = likelihood / log(2); - if (flag) { cerr << endl; } - if (iter == 0) { - mean_srclen_multiplier = tot_len_ratio / lc; - cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; - } + + const double base2_likelihood = likelihood / log(2); cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; -- cgit v1.2.3 From d87220030b82fed860efee40487502e9ee8f0651 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 27 Feb 2012 02:19:34 +0000 Subject: generic bayesian cfg learner with a bunch of cfg grammar types --- .gitignore | 1 + decoder/trule.cc | 16 +-- gi/pf/Makefile.am | 4 +- gi/pf/hierolm.cc | 309 ----------------------------------------- gi/pf/learn_cfg.cc | 394 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 398 insertions(+), 326 deletions(-) delete mode 100644 gi/pf/hierolm.cc create mode 100644 gi/pf/learn_cfg.cc (limited to 'decoder') diff --git a/.gitignore b/.gitignore index 327f7261..28d5a60a 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ training/mpi_extract_reachable klm/lm/build_binary extools/extractor_monolingual gi/pf/.deps +gi/pf/learn_cfg gi/pf/brat gi/pf/cbgi gi/pf/dpnaive diff --git a/decoder/trule.cc b/decoder/trule.cc index 40235542..141b8faa 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -232,16 +232,6 @@ void TRule::ComputeArity() { arity_ = 1 - min; } -static string AnonymousStrVar(int i) { - string res("[v]"); - if(!(i <= 0 && i >= -8)) { - cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl; - abort(); - } - res[1] = '1' - i; - return res; -} - string TRule::AsString(bool verbose) const { ostringstream os; int idx = 0; @@ -259,15 +249,11 @@ string TRule::AsString(bool verbose) const { } } os << " ||| "; - if (idx > 9) { - cerr << "Too many non-terminals!\n partial: " << os.str() << endl; - exit(1); - } for (int i =0; i -#include -#include - -#include -#include -#include - -#include "inside_outside.h" -#include "hg.h" -#include "bottom_up_parser.h" -#include "fdict.h" -#include "grammar.h" -#include "m.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -shared_ptr prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadCorpus(const string& filename, - vector >* e, - set* vocab_e) { - e->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - vector& le = e->back(); - TD::ConvertSentence(line, &le); - for (unsigned i = 0; i < le.size(); ++i) - vocab_e->insert(le[i]); - } - if (in != &cin) delete in; -} - -struct Grid { - // a b c d e - // 0 - 0 - - - vector grid; -}; - -struct BaseRuleModel { - explicit BaseRuleModel(unsigned term_size, - unsigned nonterm_size = 1) : - unif_term(1.0 / term_size), - unif_nonterm(1.0 / nonterm_size) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); - const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); - const prob_t nonterm_prob(1.0 - term_prob.as_float()); - for (unsigned i = 0; i < r.f_.size(); ++i) { - if (r.f_[i] <= 0) { // nonterminal - p *= nonterm_prob; - p *= unif_nonterm; - } else { // terminal - p *= term_prob; - p *= unif_term; - } - } - return p; - } - const prob_t unif_term, unif_nonterm; -}; - -struct HieroLMModel { - explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {} - - prob_t Prob(const TRule& r) const { - return x.probT(r, p0(r)); - } - - int Increment(const TRule& r, MT19937* rng) { - return x.incrementT(r, p0(r), rng); - // return x.increment(r); - } - - int Decrement(const TRule& r, MT19937* rng) { - return x.decrement(r, rng); - //return x.decrement(r); - } - - prob_t Likelihood() const { - prob_t p; - p.logeq(x.log_crp_prob()); - for (CCRP::const_iterator it = x.begin(); it != x.end(); ++it) { - prob_t tp = p0(it->first); - tp.poweq(it->second.table_counts_.size()); - p *= tp; - } - //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) - // p *= p0(it->first); - return p; - } - - void ResampleHyperparameters(MT19937* rng) { - x.resample_hyperparameters(rng); - cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl; - } - - const BaseRuleModel p0; - CCRP x; - //CCRP_OneTable x; -}; - -vector tofreelist; - -HieroLMModel* plm; - -struct NPGrammarIter : public GrammarIter, public RuleBin { - NPGrammarIter() : arity() { tofreelist.push_back(this); } - NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - static const int kLHS = -TD::Convert("X"); - r.reset(new TRule); - r->lhs_ = kLHS; - } - TRule& rr = *r; - rr.f_.push_back(symbol); - rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); - tofreelist.push_back(this); - } - virtual int GetNumRules() const { - if (r) return 1; else return 0; - } - virtual TRulePtr GetIthRule(int) const { - return r; - } - virtual int Arity() const { - return arity; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - return new NPGrammarIter(r, arity, symbol); - } - const unsigned char arity; - TRulePtr r; -}; - -struct NPGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new NPGrammarIter; - } -}; - -void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv, HieroLMModel* plm) { - HieroLMModel& lm = *plm; - vector node_probs; - const prob_t total_prob = Inside(hg, &node_probs); - queue q; - q.push(hg.nodes_.size() - 3); - while(!q.empty()) { - unsigned cur_node_id = q.front(); -// cerr << "NODE=" << cur_node_id << endl; - q.pop(); - const Hypergraph::Node& node = hg.nodes_[cur_node_id]; - const unsigned num_in_edges = node.in_edges_.size(); - unsigned sampled_edge = 0; - if (num_in_edges == 1) { - sampled_edge = node.in_edges_[0]; - } else { - //prob_t z; - assert(num_in_edges > 1); - SampleSet ss; - for (unsigned j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; - prob_t p = edge.edge_prob_; - for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) - p *= node_probs[edge.tail_nodes_[k]]; - ss.add(p); -// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; - //z += p; - } -// for (unsigned j = 0; j < num_in_edges; ++j) { -// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -// } -// cerr << " --- \n"; - sampled_edge = node.in_edges_[rng->SelectSample(ss)]; - } - sampled_deriv->push_back(sampled_edge); - const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; - for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { - q.push(edge.tail_nodes_[j]); - } - } - for (unsigned i = 0; i < sampled_deriv->size(); ++i) { - cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; - } -} - -void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Increment(*hg.edges_[d[i]].rule_, rng); -} - -void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Decrement(*hg.edges_[d[i]].rule_, rng); -} - -int main(int argc, char** argv) { - po::variables_map conf; - vector grammars; - grammars.push_back(GrammarPtr(new NPGrammar)); - - InitCommandLine(argc, argv, &conf); - const unsigned samples = conf["samples"].as(); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse; - set vocabe; - cerr << "Reading corpus...\n"; - ReadCorpus(conf["input"].as(), &corpuse, &vocabe); - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - HieroLMModel lm(vocabe.size()); - - plm = &lm; - ExhaustiveBottomUpParser parser("X", grammars); - - Hypergraph hg; - const int kX = -TD::Convert("X"); - const int kLP = FD::Convert("LogProb"); - SparseVector v; v.set_value(kLP, 1.0); - vector > derivs(corpuse.size()); - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpuse.size(); ++ci) { - vector& src = corpuse[ci]; - Lattice lat(src.size()); - for (unsigned i = 0; i < src.size(); ++i) - lat[i].push_back(LatticeArc(src[i], 0.0, 1)); - cerr << TD::GetString(src) << endl; - hg.clear(); - parser.Parse(lat, &hg); // exhaustive parse - DecrementDerivation(hg, derivs[ci], &lm, &rng); - for (unsigned i = 0; i < hg.edges_.size(); ++i) { - TRule& r = *hg.edges_[i].rule_; - if (r.lhs_ == kX) - hg.edges_[i].edge_prob_ = lm.Prob(r); - } - vector d; - SampleDerivation(hg, &rng, &d, &lm); - derivs[ci] = d; - IncrementDerivation(hg, derivs[ci], &lm, &rng); - if (tofreelist.size() > 100000) { - cerr << "Freeing ... "; - for (unsigned i = 0; i < tofreelist.size(); ++i) - delete tofreelist[i]; - tofreelist.clear(); - cerr << "Freed.\n"; - } - } - cerr << "LLH=" << lm.Likelihood() << endl; - } - return 0; -} - diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc new file mode 100644 index 00000000..3d202816 --- /dev/null +++ b/gi/pf/learn_cfg.cc @@ -0,0 +1,394 @@ +#include +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; +vector nt_vocab; +vector nt_id_to_index; +static unsigned kMAX_RULE_SIZE = 0; +static unsigned kMAX_ARITY = 0; +static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_rule_size,m", po::value()->default_value(0), "Maximum rule size (0 for unlimited)") + ("max_arity,a", po::value()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") + ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") + ("nonterminals,n", po::value()->default_value(1), "Size of nonterminal vocabulary") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +unsigned ReadCorpus(const string& filename, + vector >* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + unsigned toks = 0; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + vector& le = e->back(); + TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + toks += le.size(); + } + if (in != &cin) delete in; + return toks; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP(1,1,1,1)) {} + + prob_t Prob(const TRule& r) const { + return nts[nt_id_to_index[-r.lhs_]].probT(r, p0(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + return nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (unsigned i = 0; i < nts.size(); ++i) { + prob_t q; q.logeq(nts[i].log_crp_prob()); + p *= q; + for (CCRP::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } + //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= p0(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < nts.size(); ++i) + nts[i].resample_hyperparameters(rng); + cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl; + } + + const BaseRuleModel p0; + vector > nts; + //CCRP_OneTable x; +}; + +vector tofreelist; + +HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = nt_vocab[0]; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + inline static unsigned NextArity(int cur_a, int symbol) { + return cur_a + (symbol <= 0 ? 1 : 0); + } + virtual int GetNumRules() const { + if (r) return nt_vocab.size(); else return 0; + } + virtual TRulePtr GetIthRule(int i) const { + if (i == 0) return r; + TRulePtr nr(new TRule(*r)); + nr->lhs_ = nt_vocab[i]; + return nr; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + const int next_arity = NextArity(arity, symbol); + if (kMAX_ARITY && next_arity > kMAX_ARITY) + return NULL; + if (!kALLOW_MIXED && r) { + bool t1 = r->f_.front() <= 0; + bool t2 = symbol <= 0; + if (t1 != t2) return NULL; + } + if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) + return new NPGrammarIter(r, next_arity, symbol); + else + return NULL; + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +prob_t TotalProb(const Hypergraph& hg) { + return Inside(hg); +} + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv) { + vector node_probs; + Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 2); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + nt_vocab.resize(conf["nonterminals"].as()); + assert(nt_vocab.size() > 0); + assert(nt_vocab.size() < 26); + { + string nt = "X"; + for (unsigned i = 0; i < nt_vocab.size(); ++i) { + if (nt_vocab.size() > 1) nt[0] = ('A' + i); + int pid = TD::Convert(nt); + nt_vocab[i] = -pid; + if (pid >= nt_id_to_index.size()) { + nt_id_to_index.resize(pid + 1, -1); + } + nt_id_to_index[pid] = i; + } + } + vector grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + const unsigned samples = conf["samples"].as(); + kMAX_RULE_SIZE = conf["max_rule_size"].as(); + if (kMAX_RULE_SIZE == 1) { + cerr << "Invalid maximum rule size: must be 0 or >1\n"; + return 1; + } + kMAX_ARITY = conf["max_arity"].as(); + if (kMAX_ARITY == 1) { + cerr << "Invalid maximum arity: must be 0 or >1\n"; + return 1; + } + kALLOW_MIXED = !conf.count("no_mixed_rules"); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + const unsigned toks = ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size(), nt_vocab.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); + + Hypergraph hg; + const int kGoal = -TD::Convert("Goal"); + const int kLP = FD::Convert("LogProb"); + SparseVector v; v.set_value(kLP, 1.0); + vector > derivs(corpuse.size()); + vector cl(corpuse.size()); + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector& src = corpuse[ci]; + Lattice& lat = cl[ci]; + lat.resize(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + } + for (int SS=0; SS < samples; ++SS) { + const bool is_last = ((samples - 1) == SS); + prob_t dlh = prob_t::One(); + for (int ci = 0; ci < corpuse.size(); ++ci) { + const vector& src = corpuse[ci]; + const Lattice& lat = cl[ci]; + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + vector& d = derivs[ci]; + if (!is_last) DecrementDerivation(hg, d, &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kGoal) + hg.edges_[i].edge_prob_ = prob_t::One(); + else + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + if (!is_last) { + d.clear(); + SampleDerivation(hg, &rng, &d); + IncrementDerivation(hg, derivs[ci], &lm, &rng); + } else { + prob_t p = TotalProb(hg); + dlh *= p; + cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; + } + if (tofreelist.size() > 200000) { + cerr << "Freeing ... "; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + double llh = log(lm.Likelihood()); + cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; + if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); + if (is_last) { + double z = log(dlh); + cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; + } + } + for (unsigned i = 0; i < nt_vocab.size(); ++i) + cerr << lm.nts[i] << endl; + return 0; +} + -- cgit v1.2.3 From e1a0c140e9f31461ab45ec7f9533ad98d2b9caa9 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 29 Feb 2012 12:58:53 -0800 Subject: Dump the forest before the language model rescoring --- decoder/decoder.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 3394e0b8..69fbaf85 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -812,6 +812,9 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { abort(); } + if (conf.count("show_target_graph")) + HypergraphIO::WriteTarget(forest); + for (int pass = 0; pass < rescoring_passes.size(); ++pass) { const RescoringPass& rp = rescoring_passes[pass]; const vector& cur_weights = *rp.weight_vector; @@ -1018,8 +1021,6 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { } if (conf.count("show_cfg_search_space")) HypergraphIO::WriteAsCFG(forest); - if (conf.count("show_target_graph")) - HypergraphIO::WriteTarget(forest); if (has_ref) { if (HG::Intersect(ref, &forest)) { // if (crf_uniform_empirical) { -- cgit v1.2.3 From 064e53669428404269c9015af2dee135bf91226d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 01:21:05 -0500 Subject: use assert properly --- decoder/apply_models.cc | 9 ++++++--- decoder/earley_composer.cc | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'decoder') diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc index 40fd27e4..9ba59d1b 100644 --- a/decoder/apply_models.cc +++ b/decoder/apply_models.cc @@ -270,7 +270,8 @@ public: const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; const JVector j(edge.tail_nodes_.size(), 0); cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); - assert(unique_cands.insert(cand.back()).second); // these should all be unique! + bool is_new = unique_cands.insert(cand.back()).second; + assert(is_new); // these should all be unique! } // cerr << " making heap of " << cand.size() << " candidates\n"; make_heap(cand.begin(), cand.end(), HeapCandCompare()); @@ -378,7 +379,8 @@ public: pop_heap(cand.begin(), cand.end(), HeapCandCompare()); Candidate* item = cand.back(); cand.pop_back(); - assert(unique_accepted.insert(item).second); // these should all be unique! + bool is_new = unique_accepted.insert(item).second; + assert(is_new); // these should all be unique! // cerr << "POPPED: " << *item << endl; PushSuccFast2(*item, is_goal, &cand, &unique_accepted); @@ -419,7 +421,8 @@ public: Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); cand.push_back(new_cand); push_heap(cand.begin(), cand.end(), HeapCandCompare()); - assert(cs->insert(new_cand).second); // insert into uniqueness set, sanity check + bool is_new = cs->insert(new_cand).second; + assert(is_new); // insert into uniqueness set, sanity check } } } diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc index 48e94a31..b7af801a 100644 --- a/decoder/earley_composer.cc +++ b/decoder/earley_composer.cc @@ -329,7 +329,10 @@ class EarleyComposerImpl { forest->ReserveNodes(kMAX_NODES); assert(sit != g.end()); Edge* init = new Edge(start_cat_, &sit->second, q_0_); - assert(IncorporateNewEdge(init)); + if (!IncorporateNewEdge(init)) { + cerr << "Failed to create initial edge!\n"; + abort(); + } while (exp_agenda.HasWork() || agenda.HasWork()) { while(exp_agenda.HasWork()) { const Edge* edge = exp_agenda.Next(); -- cgit v1.2.3 From e0507d1aa96c6b1348e6a202beb95f63d8662258 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 03:24:53 -0500 Subject: PYP language model (Teh 2006) --- decoder/fst_translator.cc | 5 +- gi/pf/Makefile.am | 4 +- gi/pf/pyp_lm.cc | 150 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 gi/pf/pyp_lm.cc (limited to 'decoder') diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc index 38dbd717..074de4c9 100644 --- a/decoder/fst_translator.cc +++ b/decoder/fst_translator.cc @@ -30,7 +30,10 @@ struct FSTTranslatorImpl { if (input.find("{\"rules\"") == 0) { istringstream is(input); Hypergraph src_cfg_hg; - assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)); + if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) { + cerr << "Failed to read HG from JSON.\n"; + abort(); + } if (add_pass_through_rules) { SparseVector feats; feats.set_value(FD::Convert("PassThrough"), 1); diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 0cf0bc63..7cf9c14d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc @@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc itg_SOURCES = itg.cc +pyp_lm_SOURCES = pyp_lm.cc + learn_cfg_SOURCES = learn_cfg.cc condnaive_SOURCES = condnaive.cc diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc new file mode 100644 index 00000000..2837e33c --- /dev/null +++ b/gi/pf/pyp_lm.cc @@ -0,0 +1,150 @@ +#include +#include +#include + +#include +#include +#include + +#include "corpus_tools.h" +#include "m.h" +#include "tdict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +template struct PYPLM; + +// uniform base distribution +template<> struct PYPLM<0> { + PYPLM(unsigned vs) : p0(1.0 / vs) {} + void increment(WordID w, const vector& context, MT19937* rng) const {} + void decrement(WordID w, const vector& context, MT19937* rng) const {} + double prob(WordID w, const vector& context) const { return p0; } + const double p0; +}; + +// represents an N-gram LM +template struct PYPLM { + PYPLM(unsigned vs) : backoff(vs) {} + void increment(WordID w, const vector& context, MT19937* rng) { + const double bo = backoff.prob(w, context); + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); + if (it == p.end()) + it = p.insert(make_pair(lookup, CCRP(1,1,1,1))).first; + if (it->second.increment(w, bo, rng)) + backoff.increment(w, context, rng); + } + void decrement(WordID w, const vector& context, MT19937* rng) { + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); + assert(it != p.end()); + if (it->second.decrement(w, rng)) + backoff.decrement(w, context, rng); + } + double prob(WordID w, const vector& context) const { + const double bo = backoff.prob(w, context); + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::const_iterator it = p.find(lookup); + if (it == p.end()) return bo; + return it->second.prob(w, bo); + } + PYPLM backoff; + unordered_map, CCRP, boost::hash > > p; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + const WordID kEOS = TD::Convert(""); + cerr << "Reading corpus...\n"; + CorpusTools::ReadFromFile(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +#define kORDER 5 + PYPLM lm(vocabe.size()); + vector ctx(kORDER - 1, TD::Convert("")); + int mci = corpuse.size() * 99 / 100; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < mci; ++ci) { + ctx.resize(kORDER - 1); + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + if (SS > 0) lm.decrement(w, ctx, &rng); + lm.increment(w, ctx, &rng); + ctx.push_back(w); + } + if (SS > 0) lm.decrement(kEOS, ctx, &rng); + lm.increment(kEOS, ctx, &rng); + } + } + double llh = 0; + unsigned cnt = 0; + for (int ci = mci; ci < corpuse.size(); ++ci) { + ctx.resize(kORDER - 1); + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + double lp = log(lm.prob(w, ctx)) / log(2); + cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl; + ctx.push_back(w); + llh -= lp; + cnt++; + } + } + cerr << " Log_10 prob: " << (llh * log(2) / log(10)) << endl; + cerr << " Count: " << (cnt) << endl; + cerr << "Cross-entropy: " << (llh / cnt) << endl; + cerr << " Perplexity: " << pow(2, llh / cnt) << endl; + return 0; +} + -- cgit v1.2.3 From 4f19cd0c9a729cfd59d186492b3035c168f5e58f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 7 Mar 2012 09:46:54 -0500 Subject: configure order of n-gram features --- decoder/ff_ngrams.cc | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'decoder') diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index 04dd1906..d6d79f5e 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -57,6 +57,39 @@ namespace { } } +static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) { + vector const& argv=SplitOnWhitespace(in); + *explicit_markers = false; + *order = 3; +#define LMSPEC_NEXTARG if (i==argv.end()) { \ + cerr << "Missing argument for "<<*last<<". "; goto usage; \ + } else { ++i; } + + for (vector::const_iterator last,i=argv.begin(),e=argv.end();i!=e;++i) { + string const& s=*i; + if (s[0]=='-') { + if (s.size()>2) goto fail; + switch (s[1]) { + case 'x': + *explicit_markers = true; + break; + case 'o': + LMSPEC_NEXTARG; *order=atoi((*i).c_str()); + break; +#undef LMSPEC_NEXTARG + default: + fail: + cerr<<"Unknown option on NgramFeatures "<")) , add_sos_eos_(!explicit_markers) { - order_ = 3; + order_ = order; state_size_ = (order_ - 1) * sizeof(WordID) + 2 + (order_ - 1) * sizeof(WordID); unscored_size_offset_ = (order_ - 1) * sizeof(WordID); is_complete_offset_ = unscored_size_offset_ + 1; @@ -316,8 +349,10 @@ class NgramDetectorImpl { NgramDetector::NgramDetector(const string& param) { string filename, mapfile, featname; - bool explicit_markers = (param == "-x"); - pimpl_ = new NgramDetectorImpl(explicit_markers); + bool explicit_markers = false; + unsigned order = 3; + ParseArgs(param, &explicit_markers, &order); + pimpl_ = new NgramDetectorImpl(explicit_markers, order); SetStateSize(pimpl_->ReserveStateSize()); } -- cgit v1.2.3 From e2998fc79c9dd549b1c1bad537fdf1052276f82c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 01:46:32 -0500 Subject: simple context feature for tagger --- decoder/Makefile.am | 1 + decoder/cdec_ff.cc | 2 + decoder/ff_context.cc | 99 +++++++++++++++++++++++ decoder/ff_context.h | 23 ++++++ gi/pf/align-tl.cc | 6 +- gi/pf/reachability.cc | 2 + gi/pf/reachability.h | 6 +- gi/pf/transliterations.cc | 198 ++++++++++++++-------------------------------- gi/pf/transliterations.h | 5 +- 9 files changed, 194 insertions(+), 148 deletions(-) create mode 100644 decoder/ff_context.cc create mode 100644 decoder/ff_context.h (limited to 'decoder') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 30eaf04d..a00b18af 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -63,6 +63,7 @@ libcdec_a_SOURCES = \ ff.cc \ ff_rules.cc \ ff_wordset.cc \ + ff_context.cc \ ff_charset.cc \ ff_lm.cc \ ff_klm.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 4ce5749e..b516c386 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -1,6 +1,7 @@ #include #include "ff.h" +#include "ff_context.h" #include "ff_spans.h" #include "ff_lm.h" #include "ff_klm.h" @@ -42,6 +43,7 @@ void register_feature_functions() { #endif ff_registry.Register("SpanFeatures", new FFFactory()); ff_registry.Register("NgramFeatures", new FFFactory()); + ff_registry.Register("RuleContextFeatures", new FFFactory()); ff_registry.Register("RuleIdentityFeatures", new FFFactory()); ff_registry.Register("SourceSyntaxFeatures", new FFFactory); ff_registry.Register("SourceSpanSizeFeatures", new FFFactory); diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc new file mode 100644 index 00000000..19f9a413 --- /dev/null +++ b/decoder/ff_context.cc @@ -0,0 +1,99 @@ +#include "ff_context.h" + +#include +#include +#include + +#include "filelib.h" +#include "stringlib.h" +#include "sentence_metadata.h" +#include "lattice.h" +#include "fdict.h" +#include "verbose.h" + +using namespace std; + +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + +RuleContextFeatures::RuleContextFeatures(const std::string& param) { + kSOS = TD::Convert(""); + kEOS = TD::Convert(""); + + // TODO param lets you pass in a string from the cdec.ini file +} + +void RuleContextFeatures::PrepareForInput(const SentenceMetadata& smeta) { + const Lattice& sl = smeta.GetSourceLattice(); + current_input.resize(sl.size()); + for (unsigned i = 0; i < sl.size(); ++i) { + if (sl[i].size() != 1) { + cerr << "Context features not supported with lattice inputs!\nid=" << smeta.GetSentenceId() << endl; + abort(); + } + current_input[i] = sl[i][0].label; + } +} + +void RuleContextFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + const TRule& rule = *edge.rule_; + + if (rule.Arity() != 0 || // arity = 0, no nonterminals + rule.e_.size() != 1) return; // size = 1, predicted label is a single token + + + // you can see the current label "for free" + const WordID cur_label = rule.e_[0]; + // (if you want to see more labels, you have to be very careful, and muck + // about with contexts and ant_contexts) + + // but... you can look at as much of the source as you want! + const int from_src_index = edge.i_; // start of the span in the input being labeled + const int to_src_index = edge.j_; // end of the span in the input + // (note: in the case of tagging the size of the spans being labeled will + // always be 1, but in other formalisms, you can have bigger spans.) + + // this is the current token being labeled: + const WordID cur_input = current_input[from_src_index]; + + // let's get the previous token in the input (may be to the left of the start + // of the sentence!) + WordID prev_input = kSOS; + if (from_src_index > 0) { prev_input = current_input[from_src_index - 1]; } + // let's get the next token (may be to the left of the start of the sentence!) + WordID next_input = kEOS; + if (to_src_index < current_input.size()) { next_input = current_input[to_src_index]; } + + // now, build a feature string + ostringstream os; + // TD::Convert converts from the internal integer representation of a token + // to the actual token + os << "C1:" << TD::Convert(prev_input) << '_' + << TD::Convert(cur_input) << '|' << TD::Convert(cur_label); + // C1 is just to prevent a name clash + + // pick a value + double fval = 1.0; // can be any real value + + // add it to the feature vector FD::Convert converts the feature string to a + // feature int, Escape makes sure the feature string doesn't have any bad + // symbols that could confuse a parser somewhere + features->add_value(FD::Convert(Escape(os.str())), fval); + // that's it! + + // create more features if you like... +} + diff --git a/decoder/ff_context.h b/decoder/ff_context.h new file mode 100644 index 00000000..0d22b027 --- /dev/null +++ b/decoder/ff_context.h @@ -0,0 +1,23 @@ +#ifndef _FF_CONTEXT_H_ +#define _FF_CONTEXT_H_ + +#include +#include "ff.h" + +class RuleContextFeatures : public FeatureFunction { + public: + RuleContextFeatures(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); + private: + std::vector current_input; + WordID kSOS, kEOS; +}; + +#endif diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index 0e0454e5..6bb8c886 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -310,18 +310,16 @@ int main(int argc, char** argv) { // TODO CONFIGURE THIS int min_trans_src = 4; - cerr << "Initializing transliteration DPs ...\n"; + cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { const vector& src = corpus[i].src; const vector& trg = corpus[i].trg; - cerr << '.' << flush; - if (i % 80 == 79) cerr << endl; for (int j = 0; j < src.size(); ++j) { const vector& src_let = letters[src[j]]; for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; if (src_let.size() < min_trans_src) - tl.Forbid(src[j], trg[k]); + tl.Forbid(src[j], src_let, trg[k], trg_let); else tl.Initialize(src[j], src_let, trg[k], trg_let); } diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 73dd8d39..70fb76da 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -47,6 +47,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; int src_delta = i - prevs[k].prev_src_covered; edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair(src_delta,j - prevs[k].prev_trg_covered)); short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; if (src_delta > msd) msd = src_delta; } @@ -56,6 +57,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras assert(!edges[0][0][0][1]); assert(!edges[0][0][0][0]); assert(max_src_delta[0][0] > 0); + cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n"; //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; //for (int i = 0; i < b[0][0].size(); ++i) { // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 98450ec1..fb2f4965 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,12 +12,14 @@ // currently forbids 0 -> n and n -> 0 alignments struct Reachability { - boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + boost::multi_array edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), - max_src_delta(boost::extents[srclen][trglen]) { + max_src_delta(boost::extents[srclen][trglen]), + valid_deltas(boost::extents[srclen][trglen]) { ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 6e0c2e93..e29334fd 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -2,173 +2,92 @@ #include #include -#include -#include "grammar.h" -#include "bottom_up_parser.h" -#include "hg.h" -#include "hg_intersect.h" +#include "boost/shared_ptr.hpp" + #include "filelib.h" #include "ccrp.h" #include "m.h" -#include "lattice.h" -#include "verbose.h" +#include "reachability.h" using namespace std; using namespace std::tr1; -static WordID kX; -static int kMAX_SRC_SIZE = 0; -static vector > cur_trg_chunks; - -vector tlttofreelist; - -static void InitTargetChunks(int max_size, const vector& trg) { - cur_trg_chunks.clear(); - vector tmp; - unordered_set, boost::hash > > u; - for (int len = 1; len <= max_size; ++len) { - int end = trg.size() + 1; - end -= len; - for (int i = 0; i < end; ++i) { - tmp.clear(); - for (int j = 0; j < len; ++j) - tmp.push_back(trg[i + j]); - if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp); - } - } -} - -struct TransliterationGrammarIter : public GrammarIter, public RuleBin { - TransliterationGrammarIter() { tlttofreelist.push_back(this); } - TransliterationGrammarIter(const TRulePtr& inr, int symbol) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - r.reset(new TRule); - } - TRule& rr = *r; - rr.lhs_ = kX; - rr.f_.push_back(symbol); - tlttofreelist.push_back(this); - } - virtual int GetNumRules() const { - if (!r) return 0; - return cur_trg_chunks.size(); - } - virtual TRulePtr GetIthRule(int i) const { - TRulePtr nr(new TRule(*r)); - nr->e_ = cur_trg_chunks[i]; - //cerr << nr->AsString() << endl; - return nr; - } - virtual int Arity() const { - return 0; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - if (symbol <= 0) return NULL; - if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE) - return new TransliterationGrammarIter(r, symbol); - else - return NULL; - } - TRulePtr r; -}; - -struct TransliterationGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new TransliterationGrammarIter; - } - virtual bool HasRuleForSpan(int, int, int distance) const { - return (distance < kMAX_SRC_SIZE); - } -}; - -struct TInfo { - TInfo() : initialized(false) {} +struct GraphStructure { + GraphStructure() : initialized(false) {} + boost::shared_ptr r; bool initialized; - Hypergraph lattice; // may be empty if transliteration is not possible - prob_t est_prob; // will be zero if not possible }; struct TransliterationsImpl { TransliterationsImpl() { - kX = TD::Convert("X")*-1; - kMAX_SRC_SIZE = 4; - grammars.push_back(GrammarPtr(new TransliterationGrammar)); - grammars.push_back(GrammarPtr(new GlueGrammar("S", "X"))); - SetSilent(true); } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { - if (src >= graphs.size()) graphs.resize(src + 1); - if (graphs[src][trg].initialized) return; - int kMAX_TRG_SIZE = 4; - InitTargetChunks(kMAX_TRG_SIZE, trg_lets); - ExhaustiveBottomUpParser parser("S", grammars); - Lattice lat(src_lets.size()), tlat(trg_lets.size()); - for (unsigned i = 0; i < src_lets.size(); ++i) - lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1)); - for (unsigned i = 0; i < trg_lets.size(); ++i) - tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1)); - //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl; - //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl; - if (!parser.Parse(lat, &graphs[src][trg].lattice)) { - //cerr << "Failed to parse " << TD::GetString(src_lets) << endl; - abort(); - } - if (HG::Intersect(tlat, &graphs[src][trg].lattice)) { - graphs[src][trg].est_prob = prob_t(1e-4); + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + if (graphs[src_len][trg_len].initialized) return; + graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4)); + +#if 0 + if (HG::Intersect(tlat, &hg)) { + // TODO } else { - graphs[src][trg].lattice.clear(); - //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl; - graphs[src][trg].est_prob = prob_t::Zero(); + cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl; + hg.clear(); } - for (unsigned i = 0; i < tlttofreelist.size(); ++i) - delete tlttofreelist[i]; - tlttofreelist.clear(); //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; - graphs[src][trg].initialized = true; +#endif + graphs[src_len][trg_len].initialized = true; } - const prob_t& EstimateProbability(WordID src, WordID trg) const { - assert(src < graphs.size()); - const unordered_map& um = graphs[src]; - const unordered_map::const_iterator it = um.find(trg); - assert(it != um.end()); - assert(it->second.initialized); - return it->second.est_prob; + void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + graphs[src_len][trg_len].r.reset(); + graphs[src_len][trg_len].initialized = true; } - void Forbid(WordID src, WordID trg) { - if (src >= graphs.size()) graphs.resize(src + 1); - graphs[src][trg].est_prob = prob_t::Zero(); - graphs[src][trg].initialized = true; + prob_t EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { + assert(src.size() < graphs.size()); + const vector& tv = graphs[src.size()]; + assert(trg.size() < tv.size()); + const GraphStructure& gs = tv[trg.size()]; + // TODO: do prob + return prob_t::Zero(); } void GraphSummary() const { - double tlp = 0; - int tt = 0; + double to = 0; + double tn = 0; + double tt = 0; for (int i = 0; i < graphs.size(); ++i) { - const unordered_map& um = graphs[i]; - unordered_map::const_iterator it; - for (it = um.begin(); it != um.end(); ++it) { - if (it->second.lattice.empty()) continue; - //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl; - tlp += log(it->second.lattice.NumberOfPaths()); + const vector& vt = graphs[i]; + for (int j = 0; j < vt.size(); ++j) { + const GraphStructure& gs = vt[j]; + if (!gs.r) continue; tt++; + for (int k = 0; k < i; ++k) { + for (int l = 0; l < j; ++l) { + size_t c = gs.r->valid_deltas[k][l].size(); + if (c) { + tn += 1; + to += c; + } + } + } } } - tlp /= tt; - cerr << "E[log paths] = " << tlp << endl; - cerr << "exp(E[log paths]) = " << exp(tlp) << endl; + cerr << " Average nodes = " << (tn / tt) << endl; + cerr << "Average out-degree = " << (to / tn) << endl; + cerr << " Unique structures = " << tt << endl; } - vector > graphs; - vector grammars; + vector > graphs; // graphs[src_len][trg_len] }; Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} @@ -178,16 +97,15 @@ void Transliterations::Initialize(WordID src, const vector& src_lets, Wo pimpl_->Initialize(src, src_lets, trg, trg_lets); } -prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const { - return pimpl_->EstimateProbability(src,trg); +prob_t Transliterations::EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { + return pimpl_->EstimateProbability(s, src,t, trg); } -void Transliterations::Forbid(WordID src, WordID trg) { - pimpl_->Forbid(src, trg); +void Transliterations::Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + pimpl_->Forbid(src, src_lets, trg, trg_lets); } void Transliterations::GraphSummary() const { pimpl_->GraphSummary(); } - diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index a548aacf..76eb2a05 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -10,9 +10,10 @@ struct Transliterations { explicit Transliterations(); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); - void Forbid(WordID src, WordID trg); + void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void GraphSummary() const; - prob_t EstimateProbability(WordID src, WordID trg) const; + prob_t EstimateProbability(WordID s, const std::vector& src, WordID t, const std::vector& trg) const; + private: TransliterationsImpl* pimpl_; }; -- cgit v1.2.3 From dfbc278c1057555fda9312291c8024049e00b7d8 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 16:42:12 -0500 Subject: frequency-based binning --- decoder/Makefile.am | 1 - decoder/ff_csplit.cc | 2 +- decoder/freqdict.cc | 29 ----------------------------- decoder/freqdict.h | 37 ++++++++++++++++++++++++++++++++----- gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++------- gi/pf/make-freq-bins.pl | 26 ++++++++++++++++++++++++++ gi/pf/pyp_tm.cc | 24 +++++++++++++++++------- gi/pf/pyp_tm.h | 7 ++++--- 8 files changed, 97 insertions(+), 53 deletions(-) delete mode 100644 decoder/freqdict.cc create mode 100755 gi/pf/make-freq-bins.pl (limited to 'decoder') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index a00b18af..ec51d643 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -76,7 +76,6 @@ libcdec_a_SOURCES = \ ff_source_syntax.cc \ ff_bleu.cc \ ff_factory.cc \ - freqdict.cc \ lexalign.cc \ lextrans.cc \ tagger.cc \ diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index 3991d38f..c9ed996c 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl { const int fl1_; const int fl2_; const int bad_; - FreqDict freq_dict_; + FreqDict freq_dict_; set bad_words_; }; diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc deleted file mode 100644 index 9e25d346..00000000 --- a/decoder/freqdict.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include -#include "freqdict.h" -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -void FreqDict::Load(const std::string& fname) { - cerr << "Reading word frequencies: " << fname << endl; - ReadFile rf(fname); - istream& ifs = *rf.stream(); - int cc=0; - while (ifs) { - std::string word; - ifs >> word; - if (word.size() == 0) continue; - if (word[0] == '#') continue; - double count = 0; - ifs >> count; - assert(count > 0.0); // use -log(f) - counts_[TD::Convert(word)]=count; - ++cc; - if (cc % 10000 == 0) { std::cerr << "."; } - } - std::cerr << "\n"; - std::cerr << "Loaded " << cc << " words\n"; -} diff --git a/decoder/freqdict.h b/decoder/freqdict.h index 9acf0c33..4e03fadd 100644 --- a/decoder/freqdict.h +++ b/decoder/freqdict.h @@ -1,20 +1,47 @@ #ifndef _FREQDICT_H_ #define _FREQDICT_H_ +#include #include #include #include "wordid.h" +#include "filelib.h" +#include "tdict.h" +template class FreqDict { public: - void Load(const std::string& fname); - float LookUp(const WordID& word) const { - std::map::const_iterator i = counts_.find(word); - if (i == counts_.end()) return 0; + FreqDict() : max_() {} + T Max() const { return max_; } + void Load(const std::string& fname) { + std::cerr << "Reading word statistics from: " << fname << std::endl; + ReadFile rf(fname); + std::istream& ifs = *rf.stream(); + int cc=0; + std::string word; + while (ifs) { + ifs >> word; + if (word.size() == 0) continue; + if (word[0] == '#') continue; + T count = 0; + ifs >> count; + if (count > max_) max_ = count; + counts_[TD::Convert(word)]=count; + ++cc; + if (cc % 10000 == 0) { std::cerr << "."; } + } + std::cerr << "\n"; + std::cerr << "Loaded " << cc << " words\n"; + } + + T LookUp(const WordID& word) const { + typename std::map::const_iterator i = counts_.find(word); + if (i == counts_.end()) return T(); return i->second; } private: - std::map counts_; + T max_; + std::map counts_; }; #endif diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 6c054753..942dcf51 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") + ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") + ("p_null,0", po::value()->default_value(0.08), "probability of aligning to null") + ("align_alpha,a", po::value()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?") ("input,i",po::value(),"Read parallel data from") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); @@ -59,9 +62,13 @@ struct AlignedSentencePair { }; struct Aligner { - Aligner(const vector >& lets, int num_letters, vector* c) : + Aligner(const vector >& lets, + int num_letters, + const po::variables_map& conf, + vector* c) : corpus(*c), - paj_model(4, 0.08), + paj_model(conf["align_alpha"].as(), conf["p_null"].as()), + infer_paj(conf.count("infer_alignment_hyperparameters") > 0), model(lets, num_letters), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); @@ -69,12 +76,13 @@ struct Aligner { vector& corpus; QuasiModel2 paj_model; + const bool infer_paj; PYPLexicalTranslation model; const WordID kNULL; void ResampleHyperparameters() { model.ResampleHyperparameters(prng); - paj_model.ResampleHyperparameters(prng); + if (infer_paj) paj_model.ResampleHyperparameters(prng); } void InitializeRandom() { @@ -117,8 +125,6 @@ struct Aligner { paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() - << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } prob_t Likelihood() const { @@ -211,13 +217,17 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - Aligner aligner(letters, letset.size(), &corpus); + Aligner aligner(letters, letset.size(), conf, &corpus); aligner.InitializeRandom(); const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - if (i % 10 == 9) aligner.ResampleHyperparameters(); + if (i % 10 == 9) { + aligner.ResampleHyperparameters(); + cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood() + << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; + } aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl new file mode 100755 index 00000000..fdcd3555 --- /dev/null +++ b/gi/pf/make-freq-bins.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +my $BASE = 6; +my $CUTOFF = 3; + +my %d; +my $num = 0; +while(<>){ + chomp; + my @words = split /\s+/; + for my $w (@words) {$d{$w}++; $num++;} +} + +my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; + +for (my $i=0; $i #include -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" @@ -15,9 +12,19 @@ using namespace std; using namespace std::tr1; -template +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict fd_; +}; + +template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -46,7 +53,9 @@ struct ConditionalPYPWordModel { if (it == r.end()) { it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; static const WordID kNULL = TD::Convert("NULL"); - btr.Add(src == kNULL ? 0 : 1, &it->second); + unsigned bin = (src == kNULL ? 0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); @@ -75,6 +84,7 @@ struct ConditionalPYPWordModel { // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + const Binner* binner; BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; @@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets const unsigned num_letters) : letters(lets), up0(new PYPWordModel(num_letters)), - tmodel(new ConditionalPYPWordModel(up0)), + tmodel(new ConditionalPYPWordModel(up0, new FreqBinner("10k.freq"))), kX(-TD::Convert("X")) {} void PYPLexicalTranslation::Summary() const { diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h index fa0fb28f..63e7c96d 100644 --- a/gi/pf/pyp_tm.h +++ b/gi/pf/pyp_tm.h @@ -5,10 +5,11 @@ #include "wordid.h" #include "prob.h" #include "sampler.h" +#include "freqdict.h" -struct TRule; +struct FreqBinner; struct PYPWordModel; -template struct ConditionalPYPWordModel; +template struct ConditionalPYPWordModel; struct PYPLexicalTranslation { explicit PYPLexicalTranslation(const std::vector >& lets, @@ -26,7 +27,7 @@ struct PYPLexicalTranslation { private: const std::vector >& letters; // spelling dictionary PYPWordModel* up0; // base distribuction (model English word) - ConditionalPYPWordModel* tmodel; // translation distributions + ConditionalPYPWordModel* tmodel; // translation distributions // (model English word | French word) const WordID kX; }; -- cgit v1.2.3