author     redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-06-23 22:07:34 +0000
committer  redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-06-23 22:07:34 +0000
commit     8cfda7b7677801f30ef15e319f6ac49847a5a6c9 (patch)
tree       78e7339506c965a080bf361ff8bbf9452374c606 /gi
parent     7c26e270a555d524c4e6eebf572e115213ed2695 (diff)
very simple local grammar induction pipeline
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@16 ec762483-ff6d-05da-a07a-a48fb63a330f
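
A minimal invocation sketch of the new pipeline (the corpus filename here is hypothetical; the option values shown are the script's own defaults):

    gi/pipeline/local-gi-pipeline.pl --output ./giwork --topics 50 --samples 100 corpus.al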
Diffstat (limited to 'gi')
-rw-r--r--  gi/clda/src/clda.cc                            3
-rwxr-xr-x  gi/pipeline/local-gi-pipeline.pl             141
-rwxr-xr-x  gi/pipeline/sort-by-key.sh                     5
-rwxr-xr-x  gi/pyp-topics/scripts/contexts2documents.py   16
-rwxr-xr-x  gi/pyp-topics/scripts/spans2labels.py          2
5 files changed, 159 insertions, 8 deletions
diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc
index 574fa038..05cbb441 100644
--- a/gi/clda/src/clda.cc
+++ b/gi/clda/src/clda.cc
@@ -7,6 +7,7 @@
#include "sampler.h"
#include "tdict.h"
const size_t MAX_DOC_LEN_CHARS = 1000000;
+#include <boost/math/distributions/beta.hpp>
using namespace std;
@@ -24,6 +25,8 @@ void ShowTopWordsForTopic(const map<WordID, int>& counts) {
}
int main(int argc, char** argv) {
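+  // Sanity check: print the Beta(2,5) density at an arbitrary point to stderr.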
+ boost::math::beta_distribution<double> bd(2.0,5.0);
+ cerr << pdf(bd, 0.2152132) << endl;
if (argc != 3) {
cerr << "Usage: " << argv[0] << " num-classes num-samples\n";
return 1;
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 8a0e10c2..e52ad4ec 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -2,29 +2,75 @@
use strict;
my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+use Getopt::Long "GetOptions";
use IPC::Run3;
use File::Temp qw ( tempdir );
my $TEMP_DIR = tempdir( CLEANUP => 1 );
+my $GZIP = 'gzip';
+my $ZCAT = 'gunzip -c';
+my $BASE_PHRASE_MAX_SIZE = 10;
+my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors
+my $NUM_TOPICS = 50;
+my $NUM_SAMPLES = 100;
+my $CONTEXT_SIZE = 1;
+my $BIDIR = 1;
+
my $EXTOOLS = "$SCRIPT_DIR/../../extools";
die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src";
die "Can't find extools: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS;
+my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts";
+die "Can't find extools: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS;
my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce";
+my $C2D = "$PYPSCRIPTS/contexts2documents.py";
+my $S2L = "$PYPSCRIPTS/spans2labels.py";
my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train";
+my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh";
my $EXTRACTOR = "$EXTOOLS/extractor";
my $FILTER = "$EXTOOLS/filter_grammar";
my $SCORER = "$EXTOOLS/score_grammar";
+my $TOPIC_TRAIN = "$PYPTOOLS/pyp-topics-train";
+
+assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN);
+
+my $OUTPUT = './giwork';
+
+usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
+ 'output=s' => \$OUTPUT,
+ 'topics=i' => \$NUM_TOPICS,
+ 'trg_context=i' => \$CONTEXT_SIZE,
+ 'samples=i' => \$NUM_SAMPLES,
+ );
-assert_exec($REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN);
+mkdir($OUTPUT);
+die "Couldn't create output direction: $OUTPUT" unless -d $OUTPUT;
+print STDERR "OUTPUT DIRECTORY: $OUTPUT\n";
usage() unless scalar @ARGV == 1;
-open F, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
-close F;
+my $CORPUS = $ARGV[0];
+open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F;
+
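+# Run the pipeline stages in order: context extraction, context-to-document
+# conversion, PYP topic training, span labeling, then grammar extraction.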
+extract_context();
+contexts_to_documents();
+topic_train();
+label_spans_with_topics();
+if ($BIDIR) {
+ grammar_extract_bidir();
+} else {
+ grammar_extract();
+}
+print STDERR "\n!!!COMPLETE!!!\n";
exit 0;
+
sub usage {
print <<EOT;
@@ -44,4 +90,93 @@ sub assert_exec {
}
};
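+# Stage 1: extract phrase contexts from the corpus, sort by key, and reduce
+# duplicate stripes into $OUTPUT/context.txt.gz.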
+sub extract_context {
+ print STDERR "\n!!!CONTEXT EXTRACTION\n";
+ my $OUT_CONTEXTS = "$OUTPUT/context.txt.gz";
+ if (-e $OUT_CONTEXTS) {
+ print STDERR "$OUT_CONTEXTS exists, reusing...\n";
+ } else {
+ safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts.";
+ }
+}
+
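+# Stage 2: renumber the contexts as documents for the topic model; writes the
+# context and phrase indices alongside $OUTPUT/ctx.num.gz.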
+sub contexts_to_documents {
+ print STDERR "\n!!!CONTEXT TO DOCUMENTS\n";
+ my $IN_CONTEXTS = "$OUTPUT/context.txt.gz";
+ my $OUT_DOCS = "$OUTPUT/ctx.num.gz";
+ if (-e $OUT_DOCS) {
+ print STDERR "$OUT_DOCS exists, reusing...\n";
+ } else {
+ safesystem("$ZCAT $IN_CONTEXTS | $C2D $OUTPUT/contexts.index $OUTPUT/phrases.index | $GZIP > $OUT_DOCS") or die;
+ }
+}
+
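+# Stage 3: train the PYP topic model; per-document cluster assignments go to
+# $OUTPUT/docs.txt.gz.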
+sub topic_train {
+ print STDERR "\n!!!TRAIN PYP TOPICS\n";
+ my $IN_DOCS = "$OUTPUT/ctx.num.gz";
+ my $OUT_CLUSTERS = "$OUTPUT/docs.txt.gz";
+ if (-e $OUT_CLUSTERS) {
+ print STDERR "$OUT_CLUSTERS exists, reusing...\n";
+ } else {
+ safesystem("$TOPIC_TRAIN -d $IN_DOCS -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n";
+ }
+}
+
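+# Stage 4: re-extract base-phrase spans, label each span with its topic, and
+# paste the labels alongside the original corpus lines.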
+sub label_spans_with_topics {
+ my ($file) = (@_);
+ print STDERR "\n!!!LABEL SPANS\n";
+ my $IN_CLUSTERS = "$OUTPUT/docs.txt.gz";
+ my $OUT_SPANS = "$OUTPUT/labeled_spans.txt";
+ if (-e $OUT_SPANS) {
+ print STDERR "$OUT_SPANS exists, reusing...\n";
+ } else {
+ safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip";
+ safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans";
+ unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt";
+ safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste";
+ }
+}
+
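+# Stage 5a: extract a grammar from the labeled corpus with a single
+# sort-and-reduce pass.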
+sub grammar_extract {
+ my $LABELED = "$OUTPUT/corpus.src_trg_al";
+ print STDERR "\n!!!EXTRACTING GRAMMAR\n";
+ my $OUTGRAMMAR = "$OUTPUT/grammar.gz";
+ if (-e $OUTGRAMMAR) {
+ print STDERR "$OUTGRAMMAR exists, reusing...\n";
+ } else {
+ safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
+ }
+}
+
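+# Stage 5b: bidirectional variant; reduces with -p -b, re-sorts, then reduces
+# again to merge the two directions (see the commented example below).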
+sub grammar_extract_bidir {
+#gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
+ my $LABELED = "$OUTPUT/corpus.src_trg_al";
+ print STDERR "\n!!!EXTRACTING GRAMMAR\n";
+ my $OUTGRAMMAR = "$OUTPUT/grammar.bidir.gz";
+ if (-e $OUTGRAMMAR) {
+ print STDERR "$OUTGRAMMAR exists, reusing...\n";
+ } else {
+ safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
+ }
+
+}
+
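+# Execute a shell command, reporting failures, signals, and nonzero exit codes
+# to stderr; returns true on success.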
+sub safesystem {
+ print STDERR "Executing: @_\n";
+ system(@_);
+ if ($? == -1) {
+ print STDERR "ERROR: Failed to execute: @_\n $!\n";
+ exit(1);
+ }
+ elsif ($? & 127) {
+ printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
+ ($? & 127), ($? & 128) ? 'with' : 'without';
+ exit(1);
+ }
+ else {
+ my $exitcode = $? >> 8;
+ print STDERR "Exit code: $exitcode\n" if $exitcode;
+ return ! $exitcode;
+ }
+}
diff --git a/gi/pipeline/sort-by-key.sh b/gi/pipeline/sort-by-key.sh
new file mode 100755
index 00000000..948dd4df
--- /dev/null
+++ b/gi/pipeline/sort-by-key.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
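+# Byte-wise sort (LANG=C) on the first tab-separated field, so identical keys
+# arrive grouped at the stripe reducer.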
+export LANG=C
+sort -t $'\t' -k 1
+
diff --git a/gi/pyp-topics/scripts/contexts2documents.py b/gi/pyp-topics/scripts/contexts2documents.py
index c625d17d..9be4ebbb 100755
--- a/gi/pyp-topics/scripts/contexts2documents.py
+++ b/gi/pyp-topics/scripts/contexts2documents.py
@@ -3,27 +3,35 @@
import sys
from operator import itemgetter
-if len(sys.argv) > 2:
- print "Usage: contexts2documents.py [contexts_index_out]"
+if len(sys.argv) > 3:
+ print "Usage: contexts2documents.py [contexts_index_out] [phrases_index_out]"
exit(1)
context_index = {}
+phrase_index = {}
for line in sys.stdin:
phrase, line_tail = line.split('\t')
raw_contexts = line_tail.split('|||')
contexts = [c.strip() for x,c in enumerate(raw_contexts) if x%2 == 0]
counts = [int(c.split('=')[1].strip()) for x,c in enumerate(raw_contexts) if x%2 != 0]
-
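+    # Assign each phrase a document id in first-seen order.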
+ phrase_index.setdefault(phrase, len(phrase_index))
print len(contexts),
for context,count in zip(contexts,counts):
c = context_index.setdefault(context, len(context_index))
print "%d:%d" % (c,count),
print
-if len(sys.argv) == 2:
+if 1 < len(sys.argv) < 4:
contexts_out = open(sys.argv[1],'w')
contexts = context_index.items()
contexts.sort(key = itemgetter(1))
for context in contexts:
print >>contexts_out, context[0]
contexts_out.close()
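+# With a second output argument, also dump the phrase index in id order.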
+if len(sys.argv) == 3:
+ phrases_out = open(sys.argv[2],'w')
+ phrases = phrase_index.items()
+ phrases.sort(key = itemgetter(1))
+ for phrase in phrases:
+ print >>phrases_out, phrase[0]
+ phrases_out.close()
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index b523e191..409fda92 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -40,7 +40,7 @@ for line in sys.stdin:
pi = phrase_index[phrase]
ci = context_index[context]
label = phrase_context_index[(pi,ci)]
- print "%s-%s:%s" % (t1-1,t2-1,label),
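+    # Prefix the topic id with 'X' so span labels read as category symbols (e.g. X5).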
+ print "%s-%s:X%s" % (t1-1,t2-1,label),
# print phrase, pi, context, ci
# print phrase_context_index[(pi,ci)]
print