From 5ed01d87524dc4471e4fe601e528b2753f0038b6 Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 23 Jun 2010 22:07:34 +0000 Subject: very simple local grammar induction pipeline git-svn-id: https://ws10smt.googlecode.com/svn/trunk@16 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/local-gi-pipeline.pl | 141 ++++++++++++++++++++++++++++++++++++++- gi/pipeline/sort-by-key.sh | 5 ++ 2 files changed, 143 insertions(+), 3 deletions(-) create mode 100755 gi/pipeline/sort-by-key.sh (limited to 'gi/pipeline') diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 8a0e10c2..e52ad4ec 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -2,29 +2,75 @@ use strict; my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +use Getopt::Long "GetOptions"; use IPC::Run3; use File::Temp qw ( tempdir ); my $TEMP_DIR = tempdir( CLEANUP => 1 ); +my $GZIP = 'gzip'; +my $ZCAT = 'gunzip -c'; +my $BASE_PHRASE_MAX_SIZE = 10; +my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors +my $NUM_TOPICS = 50; +my $NUM_SAMPLES = 100; +my $CONTEXT_SIZE = 1; +my $BIDIR = 1; + my $EXTOOLS = "$SCRIPT_DIR/../../extools"; die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src"; die "Can't find extools: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS; +my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts"; +die "Can't find extools: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS; my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce"; +my $C2D = "$PYPSCRIPTS/contexts2documents.py"; +my $S2L = "$PYPSCRIPTS/spans2labels.py"; my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train"; +my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh"; my $EXTRACTOR = "$EXTOOLS/extractor"; my $FILTER = "$EXTOOLS/filter_grammar"; my $SCORER = "$EXTOOLS/score_grammar"; +my $TOPIC_TRAIN = "$PYPTOOLS/pyp-topics-train"; + +assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN); + +my $OUTPUT = './giwork'; + +usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, + 'output=s' => \$OUTPUT, + 'topics=i' => \$NUM_TOPICS, + 'trg_context=i' => \$CONTEXT_SIZE, + 'samples=i' => \$NUM_SAMPLES, + ); -assert_exec($REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN); +mkdir($OUTPUT); +die "Couldn't create output direction: $OUTPUT" unless -d $OUTPUT; +print STDERR "OUTPUT DIRECTORY: $OUTPUT\n"; usage() unless scalar @ARGV == 1; -open F, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!"; -close F; +my $CORPUS = $ARGV[0]; +open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F; + +extract_context(); +contexts_to_documents(); +topic_train(); +label_spans_with_topics(); +if ($BIDIR) { + grammar_extract_bidir(); +} else { + grammar_extract(); +} +print STDERR "\n!!!COMPLETE!!!\n"; exit 0; + + + + + + sub usage { print < $OUT_CONTEXTS") or die "Failed to extract contexts."; + } +} + +sub contexts_to_documents { + print STDERR "\n!!!CONTEXT TO DOCUMENTS\n"; + my $IN_CONTEXTS = "$OUTPUT/context.txt.gz"; + my $OUT_DOCS = "$OUTPUT/ctx.num.gz"; + if (-e $OUT_DOCS) { + print STDERR "$OUT_DOCS exists, reusing...\n"; + } else { + safesystem("$ZCAT $IN_CONTEXTS | $C2D $OUTPUT/contexts.index $OUTPUT/phrases.index | $GZIP > $OUT_DOCS") or die; + } +} + +sub topic_train { + print STDERR "\n!!!TRAIN PYP TOPICS\n"; + my $IN_DOCS = "$OUTPUT/ctx.num.gz"; + my $OUT_CLUSTERS = "$OUTPUT/docs.txt.gz"; + if (-e $OUT_CLUSTERS) { + print STDERR "$OUT_CLUSTERS exists, reusing...\n"; + } else { + safesystem("$TOPIC_TRAIN -d $IN_DOCS -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n"; + } +} + +sub label_spans_with_topics { + my ($file) = (@_); + print STDERR "\n!!!LABEL SPANS\n"; + my $IN_CLUSTERS = "$OUTPUT/docs.txt.gz"; + my $OUT_SPANS = "$OUTPUT/labeled_spans.txt"; + if (-e $OUT_SPANS) { + print STDERR "$OUT_SPANS exists, reusing...\n"; + } else { + safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip"; + safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans"; + unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt"; + safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste"; + } +} + +sub grammar_extract { + my $LABELED = "$OUTPUT/corpus.src_trg_al"; + print STDERR "\n!!!EXTRACTING GRAMMAR\n"; + my $OUTGRAMMAR = "$OUTPUT/grammar.gz"; + if (-e $OUTGRAMMAR) { + print STDERR "$OUTGRAMMAR exists, reusing...\n"; + } else { + safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; + } +} + +sub grammar_extract_bidir { +#gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz + my $LABELED = "$OUTPUT/corpus.src_trg_al"; + print STDERR "\n!!!EXTRACTING GRAMMAR\n"; + my $OUTGRAMMAR = "$OUTPUT/grammar.bidir.gz"; + if (-e $OUTGRAMMAR) { + print STDERR "$OUTGRAMMAR exists, reusing...\n"; + } else { + safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; + } + +} + +sub safesystem { + print STDERR "Executing: @_\n"; + system(@_); + if ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! $exitcode; + } +} diff --git a/gi/pipeline/sort-by-key.sh b/gi/pipeline/sort-by-key.sh new file mode 100755 index 00000000..948dd4df --- /dev/null +++ b/gi/pipeline/sort-by-key.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +export LANG=C +sort -t $'\t' -k 1 + -- cgit v1.2.3