author    bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-15 20:41:31 +0000
committer bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-15 20:41:31 +0000
commit    7ea02958e5ee1588a454a21f69110117c19eed02 (patch)
tree      40799a87c5c0c9e8d21c43eb5b7496b4f585c42f /gi/pipeline
parent    272108e70b3462fdf5011f1d4dc7ec635259e344 (diff)
updated pipeline with --use_default_cat to handle unlabelled spans (which default to 'X'); added a shortcut so that the pipeline can return the directory name where a labelled corpus ends up, which I find helpful for organizing experiments
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@267 ec762483-ff6d-05da-a07a-a48fb63a330f
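
For context, the two options introduced by this commit can be used roughly as follows; this is a sketch, and the corpus filename and threshold value are hypothetical placeholders, while the flag names come straight from the GetOptions block in the diff below:

    # label a corpus, letting unlabelled spans fall back to the default category 'X'
    ./local-gi-pipeline.pl --use_default_cat corpus.de-en

    # only print (to STDERR) the directory name the labelled corpus would end up in, then exit;
    # options that influence the name, e.g. --label_threshold, can be passed as usual
    ./local-gi-pipeline.pl --get_name_only --label_threshold 0.5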
Diffstat (limited to 'gi/pipeline')
-rwxr-xr-x  gi/pipeline/evaluation-pipeline.pl |  7
-rwxr-xr-x  gi/pipeline/local-gi-pipeline.pl   | 16
2 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 6e786a8a..bf9d037c 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -281,8 +281,8 @@ sub write_cdec_ini {
formalism=scfg
cubepruning_pop_limit=100
add_pass_through_rules=true
-scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz
-grammar=/export/ws10smt/data/oov.scfg.gz
+scfg_extra_glue_grammar=/export/ws10smt/jan/glue.scfg.gz
+grammar=/export/ws10smt/jan/oov.scfg.gz
grammar=$grammar_path
scfg_default_nt=OOV
scfg_no_hiero_glue_grammar=true
@@ -292,6 +292,9 @@ EOT
close CDECINI;
};
+#scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz
+#grammar=/export/ws10smt/data/oov.scfg.gz
+
sub print_help {
print STDERR<<EOT;
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index f7e9dd22..a895a03f 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -52,9 +52,12 @@ my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train";
assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN);
my $BACKOFF_GRAMMAR;
+my $DEFAULT_CAT;
my $HIER_CAT;
my $TAGGED_CORPUS;
+my $NAME_SHORTCUT;
+
my $OUTPUT = './giwork';
usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
'backoff_grammar' => \$BACKOFF_GRAMMAR,
@@ -66,6 +69,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
'trg_context=i' => \$CONTEXT_SIZE,
'samples=i' => \$NUM_SAMPLES,
'label_threshold=f' => \$LABEL_THRESHOLD,
+ 'use_default_cat' => \$DEFAULT_CAT,
'topics-config=s' => \$TOPICS_CONFIG,
'em-iterations=i' => \$NUM_EM_ITERS,
'pr-iterations=i' => \$NUM_PR_ITERS,
@@ -74,8 +78,13 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
'pr-threads=i' => \$PR_THREADS,
'tagged_corpus=s' => \$TAGGED_CORPUS,
'language=s' => \$LANGUAGE,
+ 'get_name_only' => \$NAME_SHORTCUT,
);
-
+if ($NAME_SHORTCUT) {
+ $NUM_TOPICS = $NUM_TOPICS_FINE;
+ print STDERR labeled_dir();
+ exit 0;
+}
usage() unless scalar @ARGV == 1;
my $CORPUS = $ARGV[0];
open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F;
@@ -186,7 +195,7 @@ sub cluster_dir {
sub labeled_dir {
if (lc($MODEL) eq "pyp" && $LABEL_THRESHOLD != 0) {
- return cluster_dir() . "-lt$LABEL_THRESHOLD";
+ return cluster_dir() . "_lt$LABEL_THRESHOLD";
} else {
return cluster_dir();
}
@@ -314,7 +323,8 @@ sub grammar_extract {
print STDERR "$OUTGRAMMAR exists, reusing...\n";
} else {
my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : "");
- safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
+ my $DEFAULT_CAT_ARG = ($DEFAULT_CAT ? "-d X" : "");
+ safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -t $NUM_TOPICS $BACKOFF_ARG $DEFAULT_CAT_ARG | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
}
return $OUTGRAMMAR;
}
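
For clarity: when --use_default_cat is given, $DEFAULT_CAT_ARG expands to "-d X", so the extraction command assembled in grammar_extract takes roughly the shape below (variable names as in the script; their values depend on the run's configuration, and $BACKOFF_ARG is omitted here since it is empty unless --backoff_grammar is also set):

    $EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE \
        -t $NUM_TOPICS -d X | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR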