Fixes to PR command line.

Added bilingual agreement model processing to the pipeline.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@365 ec762483-ff6d-05da-a07a-a48fb63a330f
author: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-21 22:42:52 +0000
committer: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-21 22:42:52 +0000
commit: 2a69c7255fef37e9b8c2e63ed775d1dd9aaa8686 (patch)
tree: a26629610b352775196c650cb43d9ac93c1912f9 /gi
parent: 8700e2ee96a71ed267617ce1ebd4ef3a002a1f6c (diff)
3 files changed, 68 insertions, 21 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 8353a242..96df34ea 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -39,8 +39,8 @@ die "Can't find posterior-regularisation: $PRTOOLS" unless -e $PRTOOLS && -d $PR
 my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce";
 my $C2D = "$PYPSCRIPTS/contexts2documents.py";
 my $S2L = "$PYPSCRIPTS/spans2labels.py";
+my $SPLIT = "$SCRIPT_DIR/../posterior-regularisation/split-languages.py";
 
-my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-contexts-train";
 my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh";
 
 my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";
@@ -48,9 +48,8 @@ my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl";
 my $EXTRACTOR = "$EXTOOLS/extractor";
 my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train";
 
-assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, #$PYP_TOPICS_TRAIN, 
-            $S2L, $C2D #, $TOPIC_TRAIN
-           );
+assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR,
+            $S2L, $C2D, $TOPIC_TRAIN, $SPLIT);
 
 my $BACKOFF_GRAMMAR;
 my $DEFAULT_CAT;
@@ -78,7 +77,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
                            'pr-flags=s' => \$PR_FLAGS,
                            'tagged_corpus=s' => \$TAGGED_CORPUS,
                            'language=s' => \$LANGUAGE,
-                           'get_name_only' => \$NAME_SHORTCUT,
+                           'get_name_only' => \$NAME_SHORTCUT
                           );
 if ($NAME_SHORTCUT) {
   $NUM_TOPICS = $NUM_TOPICS_FINE;
@@ -132,7 +131,12 @@ if(-e $TOPICS_CONFIG) {
 
 setup_data();
 
-extract_context();
+if (lc($MODEL) eq "blagree") {
+    extract_bilingual_context();
+} else {
+    extract_context();
+}
+
 if (lc($MODEL) eq "pyp") {
     if($HIER_CAT) {
         $NUM_TOPICS = $NUM_TOPICS_COARSE;
@@ -199,6 +203,8 @@ sub cluster_dir {
         return context_dir() . ".PR.t$NUM_TOPICS.i$NUM_ITERS.sp$PR_SCALE_P.sc$PR_SCALE_C";
     } elsif (lc($MODEL) eq "agree") {
         return context_dir() . ".AGREE.t$NUM_TOPICS.i$NUM_ITERS";
+    } elsif (lc($MODEL) eq "blagree") {
+        return context_dir() . ".BLAGREE.t$NUM_TOPICS.i$NUM_ITERS";
     }
 }
 
@@ -261,6 +267,29 @@ sub extract_context {
   }
 }
 
+sub extract_bilingual_context {  # builds context.source.gz / context.target.gz; called when $MODEL is "blagree"
+ print STDERR "\n!!!CONTEXT EXTRACTION\n"; 
+ my $OUT_SRC_CONTEXTS = "$CONTEXT_DIR/context.source";  # uncompressed name; gzipped at the end
+ my $OUT_TGT_CONTEXTS = "$CONTEXT_DIR/context.target";  # uncompressed name; gzipped at the end
+
+ if (-e $OUT_SRC_CONTEXTS . ".gz" and -e $OUT_TGT_CONTEXTS . ".gz") {  # -e binds looser than ".", so each test sees the full "....gz" name; both must exist to reuse
+   print STDERR "$OUT_SRC_CONTEXTS.gz and $OUT_TGT_CONTEXTS.gz exist, reusing...\n";
+ } else {
+   my $OUT_BI_CONTEXTS = "$CONTEXT_DIR/context.bilingual.txt.gz";  # intermediate combined output of the extractor pipeline
+   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language both --context_language both | $SORT_KEYS | $REDUCER | $GZIP > $OUT_BI_CONTEXTS";
+   if ($COMPLETE_CACHE) {  # replaces $cmd with a variant using "-c 0" and no $REDUCER stage
+     print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
+     $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE  --phrase_language both --context_language both  | $SORT_KEYS | $GZIP > $OUT_BI_CONTEXTS";
+   }
+   safesystem($cmd) or die "Failed to extract contexts.";  # no trailing "\n", so die appends the script name and line number
+
+   safesystem("$ZCAT $OUT_BI_CONTEXTS | $SPLIT $OUT_SRC_CONTEXTS $OUT_TGT_CONTEXTS") or die "Failed to split contexts.\n";  # presumably $SPLIT writes one file per side -- confirm against split-languages.py
+   safesystem("$GZIP -f $OUT_SRC_CONTEXTS") or die "Failed to zip output contexts.\n";  # yields the ".gz" name tested by the reuse check above
+   safesystem("$GZIP -f $OUT_TGT_CONTEXTS") or die "Failed to zip output contexts.\n";
+ }
+}
+
+
 sub topic_train {
   print STDERR "\n!!!TRAIN PYP TOPICS\n";
   my $IN_CONTEXTS = "$CONTEXT_DIR/context.txt.gz";
@@ -274,18 +303,21 @@ sub topic_train {
 
 sub prem_train {
   print STDERR "\n!!!TRAIN PR/EM model\n";
-  my $IN_CONTEXTS = "$CONTEXT_DIR/context.txt.gz";
   my $OUT_CLUSTERS = "$CLUSTER_DIR/docs.txt.gz";
   if (-e $OUT_CLUSTERS) {
     print STDERR "$OUT_CLUSTERS exists, reusing...\n";
   } else {
+    my $in = "--in $CONTEXT_DIR/context.txt.gz";  # default input flag: the single context file; overridden for blagree below
     my $opts = "";
     if (lc($MODEL) eq "pr") {
         $opts = "--scale-phrase $PR_SCALE_P --scale-context $PR_SCALE_C";
     } elsif (lc($MODEL) eq "agree") {
         $opts = "--agree-direction";
+    } elsif (lc($MODEL) eq "blagree") {  # bilingual agreement: two input files instead of one
+        $in = "--in $CONTEXT_DIR/context.source.gz --in1 $CONTEXT_DIR/context.target.gz";  # the two files produced by extract_bilingual_context
+        $opts = "--agree-language";
     }
-    safesystem("$PREM_TRAIN --in $IN_CONTEXTS --topics $NUM_TOPICS --out $OUT_CLUSTERS --iterations $NUM_ITERS $opts $PR_FLAGS") or die "Topic training failed.\n";
+    safesystem("$PREM_TRAIN $in --topics $NUM_TOPICS --out $OUT_CLUSTERS --iterations $NUM_ITERS $opts $PR_FLAGS") or die "Topic training failed.\n";  # $in carries the --in/--in1 flags chosen above
   }
 }
 
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java b/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java
index 8bf0b93e..031f887f 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java
@@ -157,13 +157,16 @@ public class Agree2Sides {
 
 	public double[] posterior(int edgeIdx) 
 	{
-		Edge edge1=c1.getEdges().get(edgeIdx);
-		Edge edge2=c2.getEdges().get(edgeIdx);
-		double[] prob1=model1.posterior(edge1);
-		double[] prob2=model2.posterior(edge2);
+		return posterior(c1.getEdges().get(edgeIdx), c2.getEdges().get(edgeIdx));
+	}
+	
+	public double[] posterior(Edge e1, Edge e2) 
+	{
+		double[] prob1=model1.posterior(e1);
+		double[] prob2=model2.posterior(e2);
 		
-		llh+=edge1.getCount()*Math.log(arr.F.l1norm(prob1));
-		llh+=edge2.getCount()*Math.log(arr.F.l1norm(prob2));
+		llh+=e1.getCount()*Math.log(arr.F.l1norm(prob1));
+		llh+=e2.getCount()*Math.log(arr.F.l1norm(prob2));
 		arr.F.l1normalize(prob1);
 		arr.F.l1normalize(prob2);
 		
@@ -177,7 +180,6 @@ public class Agree2Sides {
 	
 	public void displayPosterior(PrintStream ps)
 	{	
-		
 		for (int i=0;i<c1.getEdges().size();i++)
 		{
 			Edge edge=c1.getEdges().get(i);
@@ -192,5 +194,4 @@ public class Agree2Sides {
 			ps.println(" ||| C=" + t);
 		}
 	}
-	
 }
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java
index 202930f5..c1d4775e 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java
@@ -166,8 +166,10 @@ public class Trainer
 			last = o;
 		}
 		
-		if (cluster == null)
+		if (cluster == null && agree != null)
 			cluster = agree.model1;
+		else if (cluster == null && agree2sides != null)
+			cluster = agree2sides.model1;
 
 		double pl1lmax = cluster.phrase_l1lmax();
 		double cl1lmax = cluster.context_l1lmax();
@@ -183,11 +185,23 @@ public class Trainer
 					test = corpus.getEdges();
 				else
 				{	// if --test supplied, load up the file
-					infile = (File) options.valueOf("test");
-					System.out.println("Reading testing concordance from " + infile);
-					test = corpus.readEdges(FileUtil.reader(infile));
+					if (agree == null && agree2sides == null)
+					{
+						infile = (File) options.valueOf("test");
+						System.out.println("Reading testing concordance from " + infile);
+						test = corpus.readEdges(FileUtil.reader(infile));
+					}
+					else
+						System.err.println("Can't run agreement models on different test data cf training (yet); --test ignored.");
 				}
-				cluster.displayPosterior(ps, test);
+				
+				if (agree != null)
+					agree.displayPosterior(ps);
+				else if (agree2sides != null)
+					agree2sides.displayPosterior(ps);
+				else
+					cluster.displayPosterior(ps, test);
+					
 				ps.close();
 			} catch (IOException e) {
 				System.err.println("Failed to open either testing file or output file");
author	trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-21 22:42:52 +0000
committer	trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-21 22:42:52 +0000
commit	2a69c7255fef37e9b8c2e63ed775d1dd9aaa8686 (patch)
tree	a26629610b352775196c650cb43d9ac93c1912f9 /gi
parent	8700e2ee96a71ed267617ce1ebd4ef3a002a1f6c (diff)