From 58e5949176bfbd66ab1eefa8ba82d729a36a5f70 Mon Sep 17 00:00:00 2001
From: "trevor.cohn" <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Wed, 7 Jul 2010 14:12:08 +0000
Subject: Added EM and PR launcher

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@174 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pipeline/local-gi-pipeline.pl | 48 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 9ae7fd95..c7df2372 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -17,17 +17,25 @@ my $CONTEXT_SIZE = 1;
 my $BIDIR = 0;
 my $TOPICS_CONFIG = "pyp-topics.conf";
 
+my $MODEL = "pyp";  # clustering model: "pyp", "em" or "pr" (compared case-insensitively); set with --model
+my $NUM_EM_PR_ITERS = 20;  # iteration count handed to the PR/EM trainer; set with --em-iterations
+my $PR_SCALE = 10.0;  # scale handed to the PR/EM trainer and embedded in the PR cluster dir name; set with --pr-scale
+my $PR_THREADS = 0;  # thread setting handed to the PR/EM trainer; NOTE(review): what 0 means is decided by prjava.jar -- confirm
+
 my $EXTOOLS = "$SCRIPT_DIR/../../extools";
 die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
 my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src";
-die "Can't find extools: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS;
+die "Can't find pyp-topics: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS;
 my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts";
-die "Can't find extools: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS;
+die "Can't find pyp-topics: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS;
+my $PRTOOLS = "$SCRIPT_DIR/../posterior-regularisation";
+die "Can't find posterior-regularisation: $PRTOOLS" unless -e $PRTOOLS && -d $PRTOOLS;
 my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce";
 my $C2D = "$PYPSCRIPTS/contexts2documents.py";
 my $S2L = "$PYPSCRIPTS/spans2labels.py";
 
 my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-contexts-train";
+my $PREM_TRAIN="java -ea -Xmx4g -jar $PRTOOLS/prjava.jar";  # PR/EM trainer command line: -ea enables Java assertions, -Xmx4g caps the JVM heap at 4 GB
 
 my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";
 my $EXTRACTOR = "$EXTOOLS/extractor";
@@ -35,14 +43,17 @@ my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train";
 
 assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN);
 
-
 my $OUTPUT = './giwork';
 usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
                            'output=s' => \$OUTPUT,
+                           'model=s' => \$MODEL,
                            'topics=i' => \$NUM_TOPICS,
                            'trg_context=i' => \$CONTEXT_SIZE,
                            'samples=i' => \$NUM_SAMPLES,
                            'topics-config=s' => \$TOPICS_CONFIG,
+                           'em-iterations=i' => \$NUM_EM_PR_ITERS,
+                           'pr-scale=f' => \$PR_SCALE,
+                           'pr-threads=i' => \$PR_THREADS,
                           );
 
 usage() unless scalar @ARGV == 1;
@@ -63,7 +74,11 @@ if (-e $TOPICS_CONFIG) {
 }
 
 extract_context();
-topic_train();
+if (lc($MODEL) eq "pyp") {  # the model name is matched case-insensitively
+    topic_train();
+} else {
+    prem_train();  # covers both "em" and "pr"; prem_train() picks the mode from $MODEL itself
+}
 label_spans_with_topics();
 my $res;
 if ($BIDIR) {
@@ -80,7 +95,14 @@ sub context_dir {
 }
 
 sub cluster_dir {
-  return context_dir() . ".t$NUM_TOPICS.s$NUM_SAMPLES";
+    # Returns the cluster directory for the current model and its settings.
+    # Dies, naming the value, when $MODEL is not pyp/em/pr (any letter case).
+    my $model = lc($MODEL);
+    return context_dir() . ".PYP.t$NUM_TOPICS.s$NUM_SAMPLES" if $model eq "pyp";
+    return context_dir() . ".EM.t$NUM_TOPICS.i$NUM_EM_PR_ITERS" if $model eq "em";
+    return context_dir() . ".PR.t$NUM_TOPICS.i$NUM_EM_PR_ITERS.s$PR_SCALE"
+        if $model eq "pr";
+    die "Unsupported model type: $MODEL\n";
 }
 
 sub grammar_dir {
@@ -101,7 +123,7 @@ sub usage {
 
 Usage: $0 [OPTIONS] corpus.fr-en-al
 
-Induces a grammar using Pitman-Yor topic modeling.
+Induces a grammar using Pitman-Yor topic modeling or Posterior Regularisation.
 
 EOT
   exit 1;
@@ -141,6 +163,20 @@ sub topic_train {
   }
 }
 
+sub prem_train {
+  # Runs the PR/EM trainer ($PREM_TRAIN) unless its cluster output is already on disk.
+  print STDERR "\n!!!TRAIN PR/EM model\n";
+  my ($contexts, $clusters) = ("$CONTEXT_DIR/context.txt.gz", "$CLUSTER_DIR/docs.txt.gz");
+  if (-e $clusters) {
+    print STDERR "$clusters exists, reusing...\n";
+  } else {
+    my $kind = lc($MODEL);
+    my $emflag = $kind eq "em" ? "true" : "false";  # final argument on the trainer command line
+    die "Unsupported model type: $MODEL" unless $kind eq "em" || $kind eq "pr";
+    safesystem("$PREM_TRAIN $contexts $NUM_TOPICS $clusters $NUM_EM_PR_ITERS $PR_SCALE $PR_THREADS $emflag") or die "Topic training failed.\n";
+  }
+}
+
 sub label_spans_with_topics {
   my ($file) = (@_);
   print STDERR "\n!!!LABEL SPANS\n";
-- 
cgit v1.2.3