From 40ee5446b84b5cdbc4e4a613e4c1aa19231c42d3 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Thu, 15 Jul 2010 00:34:58 +0000 Subject: Massacred the pipeline to support source language phrases and contexts. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@255 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/local-gi-pipeline.pl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'gi/pipeline') diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 0d6c553c..cb411c1b 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -19,7 +19,8 @@ my $NUM_SAMPLES = 1000; my $CONTEXT_SIZE = 1; my $BIDIR = 0; my $TOPICS_CONFIG = "pyp-topics.conf"; -my $LABEL_THRESHOLD = 0; +my $LANGUAGE = "target"; + my $MODEL = "pyp"; my $NUM_EM_ITERS = 100; my $NUM_PR_ITERS = 0; @@ -71,6 +72,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'pr-scale-context=f' => \$PR_SCALE_C, 'pr-threads=i' => \$PR_THREADS, 'tagged_corpus=s' => \$TAGGED_CORPUS, + 'language=s' => \$LANGUAGE, ); usage() unless scalar @ARGV == 1; @@ -166,7 +168,7 @@ sub setup_data { } sub context_dir { - return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE"; + return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE.l$LANGUAGE"; } sub cluster_dir { @@ -231,10 +233,10 @@ sub extract_context { if (-e $OUT_CONTEXTS) { print STDERR "$OUT_CONTEXTS exists, reusing...\n"; } else { - my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS"; + my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS"; if ($COMPLETE_CACHE) { print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; - $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS"; + $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS"; } safesystem($cmd) or die "Failed to extract contexts."; } @@ -270,8 +272,14 @@ sub label_spans_with_topics { if (-e $OUT_SPANS) { print STDERR "$OUT_SPANS exists, reusing...\n"; } else { + my $l = "tt"; + if ($LANGUAGE eq "source") { + $l = "ss"; + } elsif ($LANGUAGE eq "both") { + $l = "bb"; + } else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" }; safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip"; - safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD > $OUT_SPANS") or die "Failed to label spans"; + safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans"; unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt"; safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste"; } -- cgit v1.2.3