summaryrefslogtreecommitdiff
path: root/gi/pipeline
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-23 22:07:34 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-23 22:07:34 +0000
commit 5ed01d87524dc4471e4fe601e528b2753f0038b6 (patch)
tree 509ffc2d2691d9e26bfab40590933337e1870f19 /gi/pipeline
parent cf2f68eca737c60f2490d81ea0fde9ef714123c3 (diff)
very simple local grammar induction pipeline
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@16 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline')
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 141
-rwxr-xr-x gi/pipeline/sort-by-key.sh 5
2 files changed, 143 insertions, 3 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 8a0e10c2..e52ad4ec 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -2,29 +2,75 @@
use strict;
my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+use Getopt::Long "GetOptions";
use IPC::Run3;
use File::Temp qw ( tempdir );
my $TEMP_DIR = tempdir( CLEANUP => 1 );
+# Compression / decompression commands spliced into the shell pipelines below.
+my $GZIP = 'gzip';
+my $ZCAT = 'gunzip -c';
+# Pipeline defaults. GetOptions (further down) can override the phrase size,
+# topic count, sample count and context size; $BIDIR has no command-line
+# switch and selects grammar_extract_bidir() over grammar_extract().
+my $BASE_PHRASE_MAX_SIZE = 10;
+my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors
+my $NUM_TOPICS = 50;
+my $NUM_SAMPLES = 100;
+my $CONTEXT_SIZE = 1;
+my $BIDIR = 1;
+
my $EXTOOLS = "$SCRIPT_DIR/../../extools";
die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src";
die "Can't find extools: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS;
+my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts";
+# Name the directory that is actually missing instead of repeating "extools".
+die "Can't find pyp-topics scripts: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS;
my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce";
+my $C2D = "$PYPSCRIPTS/contexts2documents.py";
+my $S2L = "$PYPSCRIPTS/spans2labels.py";
my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train";
+my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh";
my $EXTRACTOR = "$EXTOOLS/extractor";
my $FILTER = "$EXTOOLS/filter_grammar";
my $SCORER = "$EXTOOLS/score_grammar";
+# Same binary as $PYP_TOPICS_TRAIN; topic_train() uses this name.
+my $TOPIC_TRAIN = "$PYPTOOLS/pyp-topics-train";
+
+# Check every external tool up front, before any pipeline stage runs.
+assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN);
+
+# Working directory for all intermediate and final pipeline outputs.
+my $OUTPUT = './giwork';
+
+# Parse command-line overrides; any unrecognised option prints usage.
+usage() unless GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
+                          'output=s' => \$OUTPUT,
+                          'topics=i' => \$NUM_TOPICS,
+                          'trg_context=i' => \$CONTEXT_SIZE,
+                          'samples=i' => \$NUM_SAMPLES,
+                         );
-assert_exec($REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN);
+# mkdir's return value is deliberately ignored: it fails when the directory
+# already exists, which is fine. The -d test below is the real check.
+mkdir($OUTPUT);
+die "Couldn't create output directory: $OUTPUT" unless -d $OUTPUT;
+print STDERR "OUTPUT DIRECTORY: $OUTPUT\n";
usage() unless scalar @ARGV == 1;
-open F, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
-close F;
+my $CORPUS = $ARGV[0];
+# Readability probe only. Three-argument open with a lexical handle keeps
+# characters in the file name from being interpreted as an open mode.
+open my $corpus_fh, '<', $CORPUS or die "Can't read $CORPUS: $!"; close $corpus_fh;
+
+# Run the stages in order. Each stage skips its work when its output file is
+# already present under $OUTPUT.
+extract_context();
+contexts_to_documents();
+topic_train();
+label_spans_with_topics();
+# Pick the grammar extraction variant once, then invoke it.
+my $extract_grammar = $BIDIR ? \&grammar_extract_bidir : \&grammar_extract;
+$extract_grammar->();
+print STDERR "\n!!!COMPLETE!!!\n";
exit 0;
+
+
+
+
+
+
sub usage {
print <<EOT;
@@ -44,4 +90,93 @@ sub assert_exec {
}
};
+# Stage: run the extractor over $CORPUS in context mode, pipe the result
+# through the key sorter and the reducer, and gzip it into
+# $OUTPUT/context.txt.gz. Does nothing if that file already exists.
+# Dies if the pipeline reports failure.
+sub extract_context {
+  print STDERR "\n!!!CONTEXT EXTRACTION\n";
+  my $contexts_gz = "$OUTPUT/context.txt.gz";
+  if (-e $contexts_gz) {
+    print STDERR "$contexts_gz exists, reusing...\n";
+    return;
+  }
+  # Assemble the shell pipeline stage by stage; joined with ' | ' it is run
+  # as a single shell command string.
+  my $pipeline = join ' | ',
+    "$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE",
+    $SORT_KEYS,
+    $REDUCER,
+    "$GZIP > $contexts_gz";
+  safesystem($pipeline) or die "Failed to extract contexts.";
+}
+
+# Stage: decompress $OUTPUT/context.txt.gz, feed it to the
+# contexts2documents script (which is given the paths contexts.index and
+# phrases.index under $OUTPUT), and gzip the script's output into
+# $OUTPUT/ctx.num.gz. Does nothing if that file already exists.
+# Dies with a descriptive message if the pipeline reports failure.
+sub contexts_to_documents {
+  print STDERR "\n!!!CONTEXT TO DOCUMENTS\n";
+  my $IN_CONTEXTS = "$OUTPUT/context.txt.gz";
+  my $OUT_DOCS = "$OUTPUT/ctx.num.gz";
+  if (-e $OUT_DOCS) {
+    print STDERR "$OUT_DOCS exists, reusing...\n";
+  } else {
+    # Every other stage names itself when it dies; do the same here so a
+    # failure is attributable without reading the "Executing:" trace.
+    safesystem("$ZCAT $IN_CONTEXTS | $C2D $OUTPUT/contexts.index $OUTPUT/phrases.index | $GZIP > $OUT_DOCS") or die "Failed to convert contexts to documents";
+  }
+}
+
+# Stage: run the PYP topic trainer on $OUTPUT/ctx.num.gz with the configured
+# number of topics and samples, writing $OUTPUT/docs.txt.gz. Does nothing if
+# that file already exists. Dies if the trainer reports failure.
+sub topic_train {
+  print STDERR "\n!!!TRAIN PYP TOPICS\n";
+  my $docs_gz = "$OUTPUT/ctx.num.gz";
+  my $clusters_gz = "$OUTPUT/docs.txt.gz";
+  if (-e $clusters_gz) {
+    print STDERR "$clusters_gz exists, reusing...\n";
+    return;
+  }
+  # Joined into one string so it is still handed to the shell as a single
+  # command line.
+  my $train_cmd = join ' ',
+    $TOPIC_TRAIN,
+    '-d', $docs_gz,
+    '-t', $NUM_TOPICS,
+    '-s', $NUM_SAMPLES,
+    '-o', $clusters_gz,
+    '-w', '/dev/null';
+  safesystem($train_cmd) or die "Topic training failed.\n";
+}
+
+# Stage: label the base phrase spans of $CORPUS with the trained clusters.
+#   1. Unzip $OUTPUT/docs.txt.gz to a temporary clusters.txt.
+#   2. Run the extractor in --base_phrase_spans mode and pipe it through the
+#      spans2labels script to produce $OUTPUT/labeled_spans.txt.
+#   3. paste the corpus and the labelled spans side by side into
+#      $OUTPUT/corpus.src_trg_al, the input of the grammar extraction stages.
+# Steps 1-2 are skipped when labeled_spans.txt already exists. Step 3 runs
+# whenever the spans were just regenerated or the pasted corpus is missing.
+# Takes no arguments; dies if any command reports failure.
+sub label_spans_with_topics {
+  print STDERR "\n!!!LABEL SPANS\n";
+  my $IN_CLUSTERS = "$OUTPUT/docs.txt.gz";
+  my $OUT_SPANS = "$OUTPUT/labeled_spans.txt";
+  my $OUT_LABELED = "$OUTPUT/corpus.src_trg_al";
+  my $spans_regenerated = 0;
+  if (-e $OUT_SPANS) {
+    print STDERR "$OUT_SPANS exists, reusing...\n";
+  } else {
+    safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip";
+    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans";
+    # The uncompressed clusters file is only scratch input for the labeller.
+    unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt";
+    $spans_regenerated = 1;
+  }
+  # Previously the paste lived inside the else-branch, so a run that had
+  # produced the spans but not the pasted corpus could never recover: the
+  # next run reused the spans and skipped the paste.
+  if ($spans_regenerated || !-e $OUT_LABELED) {
+    safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUT_LABELED") or die "Couldn't paste";
+  }
+}
+
+# Stage (used when $BIDIR is false): run the extractor over the labelled
+# corpus, sort by key, reduce with -p, and gzip the result into
+# $OUTPUT/grammar.gz. Does nothing if that file already exists.
+# Dies if the pipeline reports failure.
+sub grammar_extract {
+  my $labeled_corpus = "$OUTPUT/corpus.src_trg_al";
+  print STDERR "\n!!!EXTRACTING GRAMMAR\n";
+  my $grammar_gz = "$OUTPUT/grammar.gz";
+  if (-e $grammar_gz) {
+    print STDERR "$grammar_gz exists, reusing...\n";
+    return;
+  }
+  my $pipeline = join ' | ',
+    "$EXTRACTOR -i $labeled_corpus -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE",
+    $SORT_KEYS,
+    "$REDUCER -p",
+    "$GZIP > $grammar_gz";
+  safesystem($pipeline) or die "Couldn't extract grammar";
+}
+
+# Stage (used when $BIDIR is true): like grammar_extract, but the extractor
+# is run with -b and the output goes through two sort/reduce passes (the
+# first reducer with -p -b, the second with no flags) before being gzipped
+# into $OUTPUT/grammar.bidir.gz. Does nothing if that file already exists.
+# Dies if the pipeline reports failure.
+# Hand-run equivalent of the reduce passes, kept from the original author:
+#   gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
+sub grammar_extract_bidir {
+  my $labeled_corpus = "$OUTPUT/corpus.src_trg_al";
+  print STDERR "\n!!!EXTRACTING GRAMMAR\n";
+  my $grammar_gz = "$OUTPUT/grammar.bidir.gz";
+  if (-e $grammar_gz) {
+    print STDERR "$grammar_gz exists, reusing...\n";
+    return;
+  }
+  my $pipeline = join ' | ',
+    "$EXTRACTOR -i $labeled_corpus -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b",
+    $SORT_KEYS,
+    "$REDUCER -p -b",
+    $SORT_KEYS,
+    $REDUCER,
+    "$GZIP > $grammar_gz";
+  safesystem($pipeline) or die "Couldn't extract grammar";
+}
+
+# Run a command via system(), echoing it to STDERR first.
+#
+# Arguments: passed straight through to system(), so a single string goes
+#            through the shell and a list is executed directly.
+# Returns:   true when the command exited with status 0, false otherwise
+#            (the non-zero exit code is reported on STDERR).
+# Exits the whole script with status 1 when the command could not be started
+# or was terminated by a signal.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+    print STDERR "ERROR: Failed to execute: @_\n $!\n";
+    exit(1);
+  }
+  elsif ($? & 127) {
+    # The command text is passed as a %s argument rather than interpolated
+    # into the format, so a literal '%' in the command cannot be mistaken
+    # for a conversion specifier.
+    printf STDERR "ERROR: Execution of: %s\n died with signal %d, %s coredump\n",
+      "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
+    exit(1);
+  }
+  else {
+    # High byte of $? is the child's exit status.
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
diff --git a/gi/pipeline/sort-by-key.sh b/gi/pipeline/sort-by-key.sh
new file mode 100755
index 00000000..948dd4df
--- /dev/null
+++ b/gi/pipeline/sort-by-key.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+export LANG=C
+sort -t $'\t' -k 1
+