From 2a69c7255fef37e9b8c2e63ed775d1dd9aaa8686 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Wed, 21 Jul 2010 22:42:52 +0000 Subject: Fixes to PR command line. Added bilingual agreement model processing to pipeline. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@365 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/local-gi-pipeline.pl | 48 ++++++++++++++++++---- .../prjava/src/phrase/Agree2Sides.java | 17 ++++---- .../prjava/src/phrase/Trainer.java | 24 ++++++++--- 3 files changed, 68 insertions(+), 21 deletions(-) (limited to 'gi') diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 8353a242..96df34ea 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -39,8 +39,8 @@ die "Can't find posterior-regularisation: $PRTOOLS" unless -e $PRTOOLS && -d $PR my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce"; my $C2D = "$PYPSCRIPTS/contexts2documents.py"; my $S2L = "$PYPSCRIPTS/spans2labels.py"; +my $SPLIT = "$SCRIPT_DIR/../posterior-regularisation/split-languages.py"; -my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-contexts-train"; my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh"; my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh"; @@ -48,9 +48,8 @@ my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl"; my $EXTRACTOR = "$EXTOOLS/extractor"; my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train"; -assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, #$PYP_TOPICS_TRAIN, - $S2L, $C2D #, $TOPIC_TRAIN - ); +assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, + $S2L, $C2D, $TOPIC_TRAIN, $SPLIT); my $BACKOFF_GRAMMAR; my $DEFAULT_CAT; @@ -78,7 +77,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'pr-flags=s' => \$PR_FLAGS, 'tagged_corpus=s' => \$TAGGED_CORPUS, 'language=s' => \$LANGUAGE, - 'get_name_only' => \$NAME_SHORTCUT, + 'get_name_only' => \$NAME_SHORTCUT ); if ($NAME_SHORTCUT) { $NUM_TOPICS = $NUM_TOPICS_FINE; @@ -132,7 +131,12 @@ if(-e $TOPICS_CONFIG) { setup_data(); -extract_context(); +if (lc($MODEL) eq "blagree") { + extract_bilingual_context(); +} else { + extract_context(); +} + if (lc($MODEL) eq "pyp") { if($HIER_CAT) { $NUM_TOPICS = $NUM_TOPICS_COARSE; @@ -199,6 +203,8 @@ sub cluster_dir { return context_dir() . ".PR.t$NUM_TOPICS.i$NUM_ITERS.sp$PR_SCALE_P.sc$PR_SCALE_C"; } elsif (lc($MODEL) eq "agree") { return context_dir() . ".AGREE.t$NUM_TOPICS.i$NUM_ITERS"; + } elsif (lc($MODEL) eq "blagree") { + return context_dir() . ".BLAGREE.t$NUM_TOPICS.i$NUM_ITERS"; } } @@ -261,6 +267,29 @@ sub extract_context { } } +sub extract_bilingual_context { + print STDERR "\n!!!CONTEXT EXTRACTION\n"; + my $OUT_SRC_CONTEXTS = "$CONTEXT_DIR/context.source"; + my $OUT_TGT_CONTEXTS = "$CONTEXT_DIR/context.target"; + + if (-e $OUT_SRC_CONTEXTS . ".gz" and -e $OUT_TGT_CONTEXTS . ".gz") { + print STDERR "$OUT_SRC_CONTEXTS.gz and $OUT_TGT_CONTEXTS.gz exist, reusing...\n"; + } else { + my $OUT_BI_CONTEXTS = "$CONTEXT_DIR/context.bilingual.txt.gz"; + my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language both --context_language both | $SORT_KEYS | $REDUCER | $GZIP > $OUT_BI_CONTEXTS"; + if ($COMPLETE_CACHE) { + print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; + $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language both --context_language both | $SORT_KEYS | $GZIP > $OUT_BI_CONTEXTS"; + } + safesystem($cmd) or die "Failed to extract contexts."; + + safesystem("$ZCAT $OUT_BI_CONTEXTS | $SPLIT $OUT_SRC_CONTEXTS $OUT_TGT_CONTEXTS") or die "Failed to split contexts.\n"; + safesystem("$GZIP -f $OUT_SRC_CONTEXTS") or die "Failed to zip output contexts.\n"; + safesystem("$GZIP -f $OUT_TGT_CONTEXTS") or die "Failed to zip output contexts.\n"; + } +} + + sub topic_train { print STDERR "\n!!!TRAIN PYP TOPICS\n"; my $IN_CONTEXTS = "$CONTEXT_DIR/context.txt.gz"; @@ -274,18 +303,21 @@ sub topic_train { sub prem_train { print STDERR "\n!!!TRAIN PR/EM model\n"; - my $IN_CONTEXTS = "$CONTEXT_DIR/context.txt.gz"; my $OUT_CLUSTERS = "$CLUSTER_DIR/docs.txt.gz"; if (-e $OUT_CLUSTERS) { print STDERR "$OUT_CLUSTERS exists, reusing...\n"; } else { + my $in = "--in $CONTEXT_DIR/context.txt.gz"; my $opts = ""; if (lc($MODEL) eq "pr") { $opts = "--scale-phrase $PR_SCALE_P --scale-context $PR_SCALE_C"; } elsif (lc($MODEL) eq "agree") { $opts = "--agree-direction"; + } elsif (lc($MODEL) eq "blagree") { + $in = "--in $CONTEXT_DIR/context.source.gz --in1 $CONTEXT_DIR/context.target.gz"; + $opts = "--agree-language"; } - safesystem("$PREM_TRAIN --in $IN_CONTEXTS --topics $NUM_TOPICS --out $OUT_CLUSTERS --iterations $NUM_ITERS $opts $PR_FLAGS") or die "Topic training failed.\n"; + safesystem("$PREM_TRAIN $in --topics $NUM_TOPICS --out $OUT_CLUSTERS --iterations $NUM_ITERS $opts $PR_FLAGS") or die "Topic training failed.\n"; } } diff --git a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java b/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java index 8bf0b93e..031f887f 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java +++ b/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java @@ -157,13 +157,16 @@ public class Agree2Sides { public double[] posterior(int edgeIdx) { - Edge edge1=c1.getEdges().get(edgeIdx); - Edge edge2=c2.getEdges().get(edgeIdx); - double[] prob1=model1.posterior(edge1); - double[] prob2=model2.posterior(edge2); + return posterior(c1.getEdges().get(edgeIdx), c2.getEdges().get(edgeIdx)); + } + + public double[] posterior(Edge e1, Edge e2) + { + double[] prob1=model1.posterior(e1); + double[] prob2=model2.posterior(e2); - llh+=edge1.getCount()*Math.log(arr.F.l1norm(prob1)); - llh+=edge2.getCount()*Math.log(arr.F.l1norm(prob2)); + llh+=e1.getCount()*Math.log(arr.F.l1norm(prob1)); + llh+=e2.getCount()*Math.log(arr.F.l1norm(prob2)); arr.F.l1normalize(prob1); arr.F.l1normalize(prob2); @@ -177,7 +180,6 @@ public class Agree2Sides { public void displayPosterior(PrintStream ps) { - for (int i=0;i