From 767e0c98265ba25a8f5dbdc75b5998e4ed2ff08c Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 30 Jun 2010 02:29:00 +0000 Subject: pipeline cleanup git-svn-id: https://ws10smt.googlecode.com/svn/trunk@74 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/local-gi-pipeline.pl | 71 ++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 32 deletions(-) (limited to 'gi/pipeline/local-gi-pipeline.pl') diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 8ed21c4d..e66ca602 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -34,8 +34,8 @@ my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train"; assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $FILTER, $SCORER, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN); -my $OUTPUT = './giwork'; +my $OUTPUT = './giwork'; usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'output=s' => \$OUTPUT, 'topics=i' => \$NUM_TOPICS, @@ -43,16 +43,21 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'samples=i' => \$NUM_SAMPLES, ); -mkdir($OUTPUT); -die "Couldn't create output direction: $OUTPUT" unless -d $OUTPUT; -print STDERR "OUTPUT DIRECTORY: $OUTPUT\n"; - usage() unless scalar @ARGV == 1; my $CORPUS = $ARGV[0]; open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F; +print STDERR " Output: $OUTPUT\n"; +my $CONTEXT_DIR = $OUTPUT . '/' . context_dir(); +my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir(); +my $GRAMMAR_DIR = $OUTPUT . '/' . grammar_dir(); +print STDERR " Context: $CONTEXT_DIR\n Cluster: $CLUSTER_DIR\n Grammar: $GRAMMAR_DIR\n"; +safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!"; +safemkdir($CONTEXT_DIR) or die "Couldn't create output directory $CONTEXT_DIR: $!"; +safemkdir($CLUSTER_DIR) or die "Couldn't create output directory $CLUSTER_DIR: $!"; +safemkdir($GRAMMAR_DIR) or die "Couldn't create output directory $GRAMMAR_DIR: $!"; + extract_context(); -# contexts_to_documents(); topic_train(); label_spans_with_topics(); my $res; @@ -65,11 +70,26 @@ print STDERR "\n!!!COMPLETE!!!\n"; print STDERR "GRAMMAR: $res\n\nYou should probably run:\n\n $SCRIPT_DIR/filter-for-test-set.pl $CORPUS $res TESTSET.TXT > filtered-grammar.scfg\n\n"; exit 0; +sub context_dir { + return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE"; +} +sub cluster_dir { + return context_dir() . ".t$NUM_TOPICS.s$NUM_SAMPLES"; +} +sub grammar_dir { + # TODO add grammar config options -- adjacent NTs, etc + return cluster_dir() . ".grammar"; +} +sub safemkdir { + my $dir = shift; + if (-d $dir) { return 1; } + return mkdir($dir); +} sub usage { print < $OUT_DOCS") or die; - } -} - sub topic_train { print STDERR "\n!!!TRAIN PYP TOPICS\n"; -# my $IN_DOCS = "$OUTPUT/ctx.num.gz"; - my $IN_CONTEXTS = "$OUTPUT/context.txt.gz"; - my $OUT_CLUSTERS = "$OUTPUT/docs.txt.gz"; + my $IN_CONTEXTS = "$CONTEXT_DIR/context.txt.gz"; + my $OUT_CLUSTERS = "$CLUSTER_DIR/docs.txt.gz"; if (-e $OUT_CLUSTERS) { print STDERR "$OUT_CLUSTERS exists, reusing...\n"; } else { @@ -132,23 +140,22 @@ sub topic_train { sub label_spans_with_topics { my ($file) = (@_); print STDERR "\n!!!LABEL SPANS\n"; - my $IN_CLUSTERS = "$OUTPUT/docs.txt.gz"; - my $OUT_SPANS = "$OUTPUT/labeled_spans.txt"; + my $IN_CLUSTERS = "$CLUSTER_DIR/docs.txt.gz"; + my $OUT_SPANS = "$CLUSTER_DIR/labeled_spans.txt"; if (-e $OUT_SPANS) { print STDERR "$OUT_SPANS exists, reusing...\n"; } else { - safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip"; -# safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans"; - safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/clusters.txt $CONTEXT_SIZE > $OUT_SPANS") or die "Failed to label spans"; - unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt"; - safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste"; + safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip"; + safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE > $OUT_SPANS") or die "Failed to label spans"; + unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt"; + safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $CLUSTER_DIR/corpus.src_trg_al_label") or die "Couldn't paste"; } } sub grammar_extract { - my $LABELED = "$OUTPUT/corpus.src_trg_al"; + my $LABELED = "$CLUSTER_DIR/corpus.src_trg_al_label"; print STDERR "\n!!!EXTRACTING GRAMMAR\n"; - my $OUTGRAMMAR = "$OUTPUT/grammar.gz"; + my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.gz"; if (-e $OUTGRAMMAR) { print STDERR "$OUTGRAMMAR exists, reusing...\n"; } else { @@ -159,9 +166,9 @@ sub grammar_extract { sub grammar_extract_bidir { #gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz - my $LABELED = "$OUTPUT/corpus.src_trg_al"; + my $LABELED = "$CLUSTER_DIR/corpus.src_trg_al_label"; print STDERR "\n!!!EXTRACTING GRAMMAR\n"; - my $OUTGRAMMAR = "$OUTPUT/grammar.bidir.gz"; + my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.bidir.gz"; if (-e $OUTGRAMMAR) { print STDERR "$OUTGRAMMAR exists, reusing...\n"; } else { -- cgit v1.2.3