diff options
| author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-23 22:07:34 +0000 | 
|---|---|---|
| committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-23 22:07:34 +0000 | 
| commit | 5ed01d87524dc4471e4fe601e528b2753f0038b6 (patch) | |
| tree | 509ffc2d2691d9e26bfab40590933337e1870f19 /gi/pipeline | |
| parent | cf2f68eca737c60f2490d81ea0fde9ef714123c3 (diff) | |
very simple local grammar induction pipeline
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@16 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline')
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 141 | ||||
| -rwxr-xr-x | gi/pipeline/sort-by-key.sh | 5 | 
2 files changed, 143 insertions, 3 deletions
#!/usr/bin/perl -w
# gi/pipeline/local-gi-pipeline.pl
#
# Very simple local grammar-induction pipeline: extracts phrase contexts from
# a word-aligned corpus, converts them to a document representation, trains a
# PYP topic model over them, labels phrase spans with the induced topics, and
# finally extracts a (optionally bidirectional) labeled SCFG grammar.
#
# Each stage writes its output under $OUTPUT and is skipped if its output file
# already exists, so the pipeline can be resumed cheaply.
use strict;

# Locate this script's directory and make sibling modules loadable.
my $SCRIPT_DIR;
BEGIN {
    use Cwd qw/ abs_path cwd /;
    use File::Basename;
    $SCRIPT_DIR = dirname(abs_path($0));
    push @INC, $SCRIPT_DIR;
}
use Getopt::Long "GetOptions";
use IPC::Run3;
use File::Temp qw ( tempdir );
my $TEMP_DIR = tempdir( CLEANUP => 1 );

# External compression commands used in shell pipelines below.
my $GZIP = 'gzip';
my $ZCAT = 'gunzip -c';

# Tunable pipeline parameters (overridable via command-line options).
my $BASE_PHRASE_MAX_SIZE = 10;       # maximum base-phrase length for the extractor
my $ITEMS_IN_MEMORY = 3000000;       # cache size in extractors
my $NUM_TOPICS = 50;                 # number of PYP topics to induce
my $NUM_SAMPLES = 100;               # sampler iterations for topic training
my $CONTEXT_SIZE = 1;                # words of target context on each side
my $BIDIR = 1;                       # extract a bidirectional grammar by default

# Locations of the external tools this pipeline drives.
my $EXTOOLS = "$SCRIPT_DIR/../../extools";
die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src";
# BUGFIX: these two messages previously said "extools" (copy-paste error),
# which misreported which component was missing.
die "Can't find pyp-topics src: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS;
my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts";
die "Can't find pyp-topics scripts: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS;

my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce";
my $C2D = "$PYPSCRIPTS/contexts2documents.py";
my $S2L = "$PYPSCRIPTS/spans2labels.py";
my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train";
my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh";
my $EXTRACTOR = "$EXTOOLS/extractor";
my $FILTER = "$EXTOOLS/filter_grammar";
my $SCORER = "$EXTOOLS/score_grammar";
# NOTE(review): $TOPIC_TRAIN duplicates $PYP_TOPICS_TRAIN (same path); kept
# because both names are used below — consider consolidating.
my $TOPIC_TRAIN = "$PYPTOOLS/pyp-topics-train";

assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN);

my $OUTPUT = './giwork';

usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
                           'output=s' => \$OUTPUT,
                           'topics=i' => \$NUM_TOPICS,
                           'trg_context=i' => \$CONTEXT_SIZE,
                           'samples=i' => \$NUM_SAMPLES,
                          );

mkdir($OUTPUT);
# BUGFIX: message said "output direction".
die "Couldn't create output directory: $OUTPUT" unless -d $OUTPUT;
print STDERR "OUTPUT DIRECTORY: $OUTPUT\n";

usage() unless scalar @ARGV == 1;
my $CORPUS = $ARGV[0];
# Readability check only. BUGFIX: replaced bareword, 2-arg, interpolated
# `open F, "<$CORPUS"` with a safe 3-arg open on a lexical handle.
open my $corpus_fh, '<', $CORPUS or die "Can't read $CORPUS: $!";
close $corpus_fh;

# Pipeline stages, in order; each is idempotent (reuses existing outputs).
extract_context();
contexts_to_documents();
topic_train();
label_spans_with_topics();
if ($BIDIR) {
  grammar_extract_bidir();
} else {
  grammar_extract();
}
print STDERR "\n!!!COMPLETE!!!\n";
exit 0;

# NOTE(review): the bodies of usage() and assert_exec() fall inside unshown
# diff-hunk context; the versions below are minimal reconstructions — confirm
# against the repository.
sub usage {
  print <<EOT;
Usage: $0 [OPTIONS] corpus.src_trg_al

Options:
  --base_phrase_max_size N   maximum base phrase size (default $BASE_PHRASE_MAX_SIZE)
  --output DIR               output directory (default $OUTPUT)
  --topics N                 number of topics (default $NUM_TOPICS)
  --trg_context N            target context size (default $CONTEXT_SIZE)
  --samples N                sampling iterations (default $NUM_SAMPLES)
EOT
  exit 1;
}

# Die unless every listed path exists and is executable.
sub assert_exec {
  my @executables = @_;
  for my $bin (@executables) {
    die "Can't find $bin" unless -e $bin;
    die "Can't execute $bin" unless -x $bin;
  }
};

# Stage 1: extract phrase contexts from the corpus into context.txt.gz.
sub extract_context {
 print STDERR "\n!!!CONTEXT EXTRACTION\n";
 my $OUT_CONTEXTS = "$OUTPUT/context.txt.gz";
 if (-e $OUT_CONTEXTS) {
   print STDERR "$OUT_CONTEXTS exists, reusing...\n";
 } else {
   safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts.";
 }
}

# Stage 2: convert extracted contexts into the numeric document format the
# topic trainer consumes; also writes contexts.index and phrases.index.
sub contexts_to_documents {
 print STDERR "\n!!!CONTEXT TO DOCUMENTS\n";
 my $IN_CONTEXTS = "$OUTPUT/context.txt.gz";
 my $OUT_DOCS = "$OUTPUT/ctx.num.gz";
 if (-e $OUT_DOCS) {
   print STDERR "$OUT_DOCS exists, reusing...\n";
 } else {
   safesystem("$ZCAT $IN_CONTEXTS | $C2D $OUTPUT/contexts.index $OUTPUT/phrases.index | $GZIP > $OUT_DOCS") or die;
 }
}

# Stage 3: train the PYP topic model over the documents.
sub topic_train {
  print STDERR "\n!!!TRAIN PYP TOPICS\n";
  my $IN_DOCS = "$OUTPUT/ctx.num.gz";
  my $OUT_CLUSTERS = "$OUTPUT/docs.txt.gz";
  if (-e $OUT_CLUSTERS) {
    print STDERR "$OUT_CLUSTERS exists, reusing...\n";
  } else {
    safesystem("$TOPIC_TRAIN -d $IN_DOCS -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n";
  }
}

# Stage 4: label phrase spans in the corpus with their induced topic ids and
# paste the labels next to the original corpus lines (corpus.src_trg_al).
# BUGFIX: removed the unpacked `$file` argument — no call site passes one and
# the body never used it.
sub label_spans_with_topics {
  print STDERR "\n!!!LABEL SPANS\n";
  my $IN_CLUSTERS = "$OUTPUT/docs.txt.gz";
  my $OUT_SPANS = "$OUTPUT/labeled_spans.txt";
  if (-e $OUT_SPANS) {
    print STDERR "$OUT_SPANS exists, reusing...\n";
  } else {
    safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip";
    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans";
    unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt";
    safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste";
  }
}

# Stage 5a: extract a (monodirectional) labeled grammar.
sub grammar_extract {
  my $LABELED = "$OUTPUT/corpus.src_trg_al";
  print STDERR "\n!!!EXTRACTING GRAMMAR\n";
  my $OUTGRAMMAR = "$OUTPUT/grammar.gz";
  if (-e $OUTGRAMMAR) {
    print STDERR "$OUTGRAMMAR exists, reusing...\n";
  } else {
    safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
  }
}

# Stage 5b: extract a bidirectional grammar (two reduce passes).
# Reference pipeline:
#   gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 \
#     | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
sub grammar_extract_bidir {
  my $LABELED = "$OUTPUT/corpus.src_trg_al";
  print STDERR "\n!!!EXTRACTING GRAMMAR\n";
  my $OUTGRAMMAR = "$OUTPUT/grammar.bidir.gz";
  if (-e $OUTGRAMMAR) {
    print STDERR "$OUTGRAMMAR exists, reusing...\n";
  } else {
    safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
  }
}

# Run a shell command, reporting failure mode (exec failure, signal, or
# nonzero exit). Returns true on success so callers can `... or die`.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
      print STDERR "ERROR: Failed to execute: @_\n  $!\n";
      exit(1);
  }
  elsif ($? & 127) {
      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n",
          ($? & 127),  ($? & 128) ? 'with' : 'without';
      exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

# ---------------------------------------------------------------------------
# Companion file added by the same commit, gi/pipeline/sort-by-key.sh
# (mode 0755), reproduced here verbatim for reference:
#
#   #!/bin/bash
#
#   export LANG=C
#   sort -t $'\t' -k 1
# ---------------------------------------------------------------------------
