From a72189be8a87f63a3f86a67287e53b835eca004f Mon Sep 17 00:00:00 2001 From: bothameister Date: Thu, 15 Jul 2010 20:41:31 +0000 Subject: updated pipeline with --use_default_cat to handle unlabelled spans (which default to 'X'); added a shortcut so that pipeline can return the directory name where a labelled corpus ends up - I find it helpful for organizing experiments git-svn-id: https://ws10smt.googlecode.com/svn/trunk@267 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/evaluation-pipeline.pl | 7 +++++-- gi/pipeline/local-gi-pipeline.pl | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'gi') diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 6e786a8a..bf9d037c 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -281,8 +281,8 @@ sub write_cdec_ini { formalism=scfg cubepruning_pop_limit=100 add_pass_through_rules=true -scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz -grammar=/export/ws10smt/data/oov.scfg.gz +scfg_extra_glue_grammar=/export/ws10smt/jan/glue.scfg.gz +grammar=/export/ws10smt/jan/oov.scfg.gz grammar=$grammar_path scfg_default_nt=OOV scfg_no_hiero_glue_grammar=true @@ -292,6 +292,9 @@ EOT close CDECINI; }; +#scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz +#grammar=/export/ws10smt/data/oov.scfg.gz + sub print_help { print STDERR< \$BASE_PHRASE_MAX_SIZE, 'backoff_grammar' => \$BACKOFF_GRAMMAR, @@ -66,6 +69,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'trg_context=i' => \$CONTEXT_SIZE, 'samples=i' => \$NUM_SAMPLES, 'label_threshold=f' => \$LABEL_THRESHOLD, + 'use_default_cat' => \$DEFAULT_CAT, 'topics-config=s' => \$TOPICS_CONFIG, 'em-iterations=i' => \$NUM_EM_ITERS, 'pr-iterations=i' => \$NUM_PR_ITERS, @@ -74,8 +78,13 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'pr-threads=i' => \$PR_THREADS, 'tagged_corpus=s' => \$TAGGED_CORPUS, 'language=s' => \$LANGUAGE, + 'get_name_only' => \$NAME_SHORTCUT, ); - +if ($NAME_SHORTCUT) { + $NUM_TOPICS = $NUM_TOPICS_FINE; + print STDERR labeled_dir(); + exit 0; +} usage() unless scalar @ARGV == 1; my $CORPUS = $ARGV[0]; open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F; @@ -186,7 +195,7 @@ sub cluster_dir { sub labeled_dir { if (lc($MODEL) eq "pyp" && $LABEL_THRESHOLD != 0) { - return cluster_dir() . "-lt$LABEL_THRESHOLD"; + return cluster_dir() . "_lt$LABEL_THRESHOLD"; } else { return cluster_dir(); } @@ -314,7 +323,8 @@ sub grammar_extract { print STDERR "$OUTGRAMMAR exists, reusing...\n"; } else { my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : ""); - safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; + my $DEFAULT_CAT_ARG = ($DEFAULT_CAT ? "-d X" : ""); + safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -t $NUM_TOPICS $BACKOFF_ARG $DEFAULT_CAT_ARG | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; } return $OUTGRAMMAR; } -- cgit v1.2.3