diff options
| author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-23 22:07:34 +0000 | 
|---|---|---|
| committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-23 22:07:34 +0000 | 
| commit | 5ed01d87524dc4471e4fe601e528b2753f0038b6 (patch) | |
| tree | 509ffc2d2691d9e26bfab40590933337e1870f19 /gi/pipeline | |
| parent | cf2f68eca737c60f2490d81ea0fde9ef714123c3 (diff) | |
very simple local grammar induction pipeline
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@16 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline')
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 141 | ||||
| -rwxr-xr-x | gi/pipeline/sort-by-key.sh | 5 | 
2 files changed, 143 insertions, 3 deletions
#!/usr/bin/perl -w
# gi/pipeline/local-gi-pipeline.pl
#
# Very simple local grammar-induction pipeline: extracts phrase contexts from
# a word-aligned corpus, converts them to a document representation, trains a
# PYP topic model over them, labels phrase spans with the induced topics, and
# finally extracts a (optionally bidirectional) labeled SCFG grammar.
#
# Each stage writes its output under $OUTPUT and is skipped if its output file
# already exists, so the pipeline can be resumed cheaply.
use strict;

# Locate this script's directory and make sibling modules loadable.
my $SCRIPT_DIR;
BEGIN {
    use Cwd qw/ abs_path cwd /;
    use File::Basename;
    $SCRIPT_DIR = dirname(abs_path($0));
    push @INC, $SCRIPT_DIR;
}
use Getopt::Long "GetOptions";
use IPC::Run3;
use File::Temp qw ( tempdir );
my $TEMP_DIR = tempdir( CLEANUP => 1 );

# External compression commands used in shell pipelines below.
my $GZIP = 'gzip';
my $ZCAT = 'gunzip -c';

# Tunable pipeline parameters (overridable via command-line options).
my $BASE_PHRASE_MAX_SIZE = 10;       # maximum base-phrase length for the extractor
my $ITEMS_IN_MEMORY = 3000000;       # cache size in extractors
my $NUM_TOPICS = 50;                 # number of PYP topics to induce
my $NUM_SAMPLES = 100;               # sampler iterations for topic training
my $CONTEXT_SIZE = 1;                # words of target context on each side
my $BIDIR = 1;                       # extract a bidirectional grammar by default

# Locations of the external tools this pipeline drives.
my $EXTOOLS = "$SCRIPT_DIR/../../extools";
die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src";
# BUGFIX: these two messages previously said "extools" (copy-paste error),
# which misreported which component was missing.
die "Can't find pyp-topics src: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS;
my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts";
die "Can't find pyp-topics scripts: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS;

my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce";
my $C2D = "$PYPSCRIPTS/contexts2documents.py";
my $S2L = "$PYPSCRIPTS/spans2labels.py";
my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train";
my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh";
my $EXTRACTOR = "$EXTOOLS/extractor";
my $FILTER = "$EXTOOLS/filter_grammar";
my $SCORER = "$EXTOOLS/score_grammar";
# NOTE(review): $TOPIC_TRAIN duplicates $PYP_TOPICS_TRAIN (same path); kept
# because both names are used below — consider consolidating.
my $TOPIC_TRAIN = "$PYPTOOLS/pyp-topics-train";

assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN);

my $OUTPUT = './giwork';

usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
                           'output=s' => \$OUTPUT,
                           'topics=i' => \$NUM_TOPICS,
                           'trg_context=i' => \$CONTEXT_SIZE,
                           'samples=i' => \$NUM_SAMPLES,
                          );

mkdir($OUTPUT);
# BUGFIX: message said "output direction".
die "Couldn't create output directory: $OUTPUT" unless -d $OUTPUT;
print STDERR "OUTPUT DIRECTORY: $OUTPUT\n";

usage() unless scalar @ARGV == 1;
my $CORPUS = $ARGV[0];
# Readability check only. BUGFIX: replaced bareword, 2-arg, interpolated
# `open F, "<$CORPUS"` with a safe 3-arg open on a lexical handle.
open my $corpus_fh, '<', $CORPUS or die "Can't read $CORPUS: $!";
close $corpus_fh;

# Pipeline stages, in order; each is idempotent (reuses existing outputs).
extract_context();
contexts_to_documents();
topic_train();
label_spans_with_topics();
if ($BIDIR) {
  grammar_extract_bidir();
} else {
  grammar_extract();
}
print STDERR "\n!!!COMPLETE!!!\n";
exit 0;

# NOTE(review): the bodies of usage() and assert_exec() fall inside unshown
# diff-hunk context; the versions below are minimal reconstructions — confirm
# against the repository.
sub usage {
  print <<EOT;
Usage: $0 [OPTIONS] corpus.src_trg_al

Options:
  --base_phrase_max_size N   maximum base phrase size (default $BASE_PHRASE_MAX_SIZE)
  --output DIR               output directory (default $OUTPUT)
  --topics N                 number of topics (default $NUM_TOPICS)
  --trg_context N            target context size (default $CONTEXT_SIZE)
  --samples N                sampling iterations (default $NUM_SAMPLES)
EOT
  exit 1;
}

# Die unless every listed path exists and is executable.
sub assert_exec {
  my @executables = @_;
  for my $bin (@executables) {
    die "Can't find $bin" unless -e $bin;
    die "Can't execute $bin" unless -x $bin;
  }
};

# Stage 1: extract phrase contexts from the corpus into context.txt.gz.
sub extract_context {
 print STDERR "\n!!!CONTEXT EXTRACTION\n";
 my $OUT_CONTEXTS = "$OUTPUT/context.txt.gz";
 if (-e $OUT_CONTEXTS) {
   print STDERR "$OUT_CONTEXTS exists, reusing...\n";
 } else {
   safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts.";
 }
}

# Stage 2: convert extracted contexts into the numeric document format the
# topic trainer consumes; also writes contexts.index and phrases.index.
sub contexts_to_documents {
 print STDERR "\n!!!CONTEXT TO DOCUMENTS\n";
 my $IN_CONTEXTS = "$OUTPUT/context.txt.gz";
 my $OUT_DOCS = "$OUTPUT/ctx.num.gz";
 if (-e $OUT_DOCS) {
   print STDERR "$OUT_DOCS exists, reusing...\n";
 } else {
   safesystem("$ZCAT $IN_CONTEXTS | $C2D $OUTPUT/contexts.index $OUTPUT/phrases.index | $GZIP > $OUT_DOCS") or die;
 }
}

# Stage 3: train the PYP topic model over the documents.
sub topic_train {
  print STDERR "\n!!!TRAIN PYP TOPICS\n";
  my $IN_DOCS = "$OUTPUT/ctx.num.gz";
  my $OUT_CLUSTERS = "$OUTPUT/docs.txt.gz";
  if (-e $OUT_CLUSTERS) {
    print STDERR "$OUT_CLUSTERS exists, reusing...\n";
  } else {
    safesystem("$TOPIC_TRAIN -d $IN_DOCS -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n";
  }
}

# Stage 4: label phrase spans in the corpus with their induced topic ids and
# paste the labels next to the original corpus lines (corpus.src_trg_al).
# BUGFIX: removed the unpacked `$file` argument — no call site passes one and
# the body never used it.
sub label_spans_with_topics {
  print STDERR "\n!!!LABEL SPANS\n";
  my $IN_CLUSTERS = "$OUTPUT/docs.txt.gz";
  my $OUT_SPANS = "$OUTPUT/labeled_spans.txt";
  if (-e $OUT_SPANS) {
    print STDERR "$OUT_SPANS exists, reusing...\n";
  } else {
    safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip";
    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans";
    unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt";
    safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste";
  }
}

# Stage 5a: extract a (monodirectional) labeled grammar.
sub grammar_extract {
  my $LABELED = "$OUTPUT/corpus.src_trg_al";
  print STDERR "\n!!!EXTRACTING GRAMMAR\n";
  my $OUTGRAMMAR = "$OUTPUT/grammar.gz";
  if (-e $OUTGRAMMAR) {
    print STDERR "$OUTGRAMMAR exists, reusing...\n";
  } else {
    safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
  }
}

# Stage 5b: extract a bidirectional grammar (two reduce passes).
# Reference pipeline:
#   gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 \
#     | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
sub grammar_extract_bidir {
  my $LABELED = "$OUTPUT/corpus.src_trg_al";
  print STDERR "\n!!!EXTRACTING GRAMMAR\n";
  my $OUTGRAMMAR = "$OUTPUT/grammar.bidir.gz";
  if (-e $OUTGRAMMAR) {
    print STDERR "$OUTGRAMMAR exists, reusing...\n";
  } else {
    safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
  }
}

# Run a shell command, reporting failure mode (exec failure, signal, or
# nonzero exit). Returns true on success so callers can `... or die`.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
      print STDERR "ERROR: Failed to execute: @_\n  $!\n";
      exit(1);
  }
  elsif ($? & 127) {
      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n",
          ($? & 127),  ($? & 128) ? 'with' : 'without';
      exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

# ---------------------------------------------------------------------------
# Companion file added by the same commit, gi/pipeline/sort-by-key.sh
# (mode 0755), reproduced here verbatim for reference:
#
#   #!/bin/bash
#
#   export LANG=C
#   sort -t $'\t' -k 1
# ---------------------------------------------------------------------------
