diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-23 22:07:34 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-23 22:07:34 +0000 |
commit | 8cfda7b7677801f30ef15e319f6ac49847a5a6c9 (patch) | |
tree | 78e7339506c965a080bf361ff8bbf9452374c606 /gi | |
parent | 7c26e270a555d524c4e6eebf572e115213ed2695 (diff) |
very simple local grammar induction pipeline
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@16 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
-rw-r--r-- | gi/clda/src/clda.cc | 3 | ||||
-rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 141 | ||||
-rwxr-xr-x | gi/pipeline/sort-by-key.sh | 5 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/contexts2documents.py | 16 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 2 |
5 files changed, 159 insertions, 8 deletions
diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc index 574fa038..05cbb441 100644 --- a/gi/clda/src/clda.cc +++ b/gi/clda/src/clda.cc @@ -7,6 +7,7 @@ #include "sampler.h" #include "tdict.h" const size_t MAX_DOC_LEN_CHARS = 1000000; +#include <boost/math/distributions/beta.hpp> using namespace std; @@ -24,6 +25,8 @@ void ShowTopWordsForTopic(const map<WordID, int>& counts) { } int main(int argc, char** argv) { + boost::math::beta_distribution<double> bd(2.0,5.0); + cerr << pdf(bd, 0.2152132) << endl; if (argc != 3) { cerr << "Usage: " << argv[0] << " num-classes num-samples\n"; return 1; diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 8a0e10c2..e52ad4ec 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -2,29 +2,75 @@ use strict; my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +use Getopt::Long "GetOptions"; use IPC::Run3; use File::Temp qw ( tempdir ); my $TEMP_DIR = tempdir( CLEANUP => 1 ); +my $GZIP = 'gzip'; +my $ZCAT = 'gunzip -c'; +my $BASE_PHRASE_MAX_SIZE = 10; +my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors +my $NUM_TOPICS = 50; +my $NUM_SAMPLES = 100; +my $CONTEXT_SIZE = 1; +my $BIDIR = 1; + my $EXTOOLS = "$SCRIPT_DIR/../../extools"; die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src"; die "Can't find extools: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS; +my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts"; +die "Can't find extools: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS; my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce"; +my $C2D = "$PYPSCRIPTS/contexts2documents.py"; +my $S2L = "$PYPSCRIPTS/spans2labels.py"; my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train"; +my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh"; my $EXTRACTOR = "$EXTOOLS/extractor"; my $FILTER = "$EXTOOLS/filter_grammar"; my $SCORER = "$EXTOOLS/score_grammar"; +my $TOPIC_TRAIN = "$PYPTOOLS/pyp-topics-train"; + +assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN); + +my $OUTPUT = './giwork'; + +usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, + 'output=s' => \$OUTPUT, + 'topics=i' => \$NUM_TOPICS, + 'trg_context=i' => \$CONTEXT_SIZE, + 'samples=i' => \$NUM_SAMPLES, + ); -assert_exec($REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN); +mkdir($OUTPUT); +die "Couldn't create output direction: $OUTPUT" unless -d $OUTPUT; +print STDERR "OUTPUT DIRECTORY: $OUTPUT\n"; usage() unless scalar @ARGV == 1; -open F, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!"; -close F; +my $CORPUS = $ARGV[0]; +open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F; + +extract_context(); +contexts_to_documents(); +topic_train(); +label_spans_with_topics(); +if ($BIDIR) { + grammar_extract_bidir(); +} else { + grammar_extract(); +} +print STDERR "\n!!!COMPLETE!!!\n"; exit 0; + + + + + + sub usage { print <<EOT; @@ -44,4 +90,93 @@ sub assert_exec { } }; +sub extract_context { + print STDERR "\n!!!CONTEXT EXTRACTION\n"; + my $OUT_CONTEXTS = "$OUTPUT/context.txt.gz"; + if (-e $OUT_CONTEXTS) { + print STDERR "$OUT_CONTEXTS exists, reusing...\n"; + } else { + safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts."; + } +} + +sub contexts_to_documents { + print STDERR "\n!!!CONTEXT TO DOCUMENTS\n"; + my $IN_CONTEXTS = "$OUTPUT/context.txt.gz"; + my $OUT_DOCS = "$OUTPUT/ctx.num.gz"; + if (-e $OUT_DOCS) { + print STDERR "$OUT_DOCS exists, reusing...\n"; + } else { + safesystem("$ZCAT $IN_CONTEXTS | $C2D $OUTPUT/contexts.index $OUTPUT/phrases.index | $GZIP > $OUT_DOCS") or die; + } +} + +sub topic_train { + print STDERR "\n!!!TRAIN PYP TOPICS\n"; + my $IN_DOCS = "$OUTPUT/ctx.num.gz"; + my $OUT_CLUSTERS = "$OUTPUT/docs.txt.gz"; + if (-e $OUT_CLUSTERS) { + print STDERR "$OUT_CLUSTERS exists, reusing...\n"; + } else { + safesystem("$TOPIC_TRAIN -d $IN_DOCS -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n"; + } +} + +sub label_spans_with_topics { + my ($file) = (@_); + print STDERR "\n!!!LABEL SPANS\n"; + my $IN_CLUSTERS = "$OUTPUT/docs.txt.gz"; + my $OUT_SPANS = "$OUTPUT/labeled_spans.txt"; + if (-e $OUT_SPANS) { + print STDERR "$OUT_SPANS exists, reusing...\n"; + } else { + safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip"; + safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans"; + unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt"; + safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste"; + } +} + +sub grammar_extract { + my $LABELED = "$OUTPUT/corpus.src_trg_al"; + print STDERR "\n!!!EXTRACTING GRAMMAR\n"; + my $OUTGRAMMAR = "$OUTPUT/grammar.gz"; + if (-e $OUTGRAMMAR) { + print STDERR "$OUTGRAMMAR exists, reusing...\n"; + } else { + safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; + } +} + +sub grammar_extract_bidir { +#gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz + my $LABELED = "$OUTPUT/corpus.src_trg_al"; + print STDERR "\n!!!EXTRACTING GRAMMAR\n"; + my $OUTGRAMMAR = "$OUTPUT/grammar.bidir.gz"; + if (-e $OUTGRAMMAR) { + print STDERR "$OUTGRAMMAR exists, reusing...\n"; + } else { + safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; + } + +} + +sub safesystem { + print STDERR "Executing: @_\n"; + system(@_); + if ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! $exitcode; + } +} diff --git a/gi/pipeline/sort-by-key.sh b/gi/pipeline/sort-by-key.sh new file mode 100755 index 00000000..948dd4df --- /dev/null +++ b/gi/pipeline/sort-by-key.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +export LANG=C +sort -t $'\t' -k 1 + diff --git a/gi/pyp-topics/scripts/contexts2documents.py b/gi/pyp-topics/scripts/contexts2documents.py index c625d17d..9be4ebbb 100755 --- a/gi/pyp-topics/scripts/contexts2documents.py +++ b/gi/pyp-topics/scripts/contexts2documents.py @@ -3,27 +3,35 @@ import sys from operator import itemgetter -if len(sys.argv) > 2: - print "Usage: contexts2documents.py [contexts_index_out]" +if len(sys.argv) > 3: + print "Usage: contexts2documents.py [contexts_index_out] [phrases_index_out]" exit(1) context_index = {} +phrase_index = {} for line in sys.stdin: phrase, line_tail = line.split('\t') raw_contexts = line_tail.split('|||') contexts = [c.strip() for x,c in enumerate(raw_contexts) if x%2 == 0] counts = [int(c.split('=')[1].strip()) for x,c in enumerate(raw_contexts) if x%2 != 0] - + phrase_index.setdefault(phrase, len(phrase_index)) print len(contexts), for context,count in zip(contexts,counts): c = context_index.setdefault(context, len(context_index)) print "%d:%d" % (c,count), print -if len(sys.argv) == 2: +if 1 < len(sys.argv) < 4: contexts_out = open(sys.argv[1],'w') contexts = context_index.items() contexts.sort(key = itemgetter(1)) for context in contexts: print >>contexts_out, context[0] contexts_out.close() +if len(sys.argv) == 3: + phrases_out = open(sys.argv[2],'w') + phrases = phrase_index.items() + phrases.sort(key = itemgetter(1)) + for phrase in phrases: + print >>phrases_out, phrase[0] + phrases_out.close() diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index b523e191..409fda92 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -40,7 +40,7 @@ for line in sys.stdin: pi = phrase_index[phrase] ci = context_index[context] label = phrase_context_index[(pi,ci)] - print "%s-%s:%s" % (t1-1,t2-1,label), + print "%s-%s:X%s" % (t1-1,t2-1,label), # print phrase, pi, context, ci # print phrase_context_index[(pi,ci)] print |