summary refs log tree commit diff
path: root/gi
diff options
context:
space:
mode:
Diffstat (limited to 'gi')
-rwxr-xr-x gi/pipeline/evaluation-pipeline.pl 12
-rw-r--r-- gi/pipeline/valhalla.config 8
2 files changed, 19 insertions, 1 deletions
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 8ee41122..c6dcca05 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -70,6 +70,16 @@ assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST);
my $numtopics = 25;
my $config = "$SCRIPT_DIR/clsp.config";
+if ((scalar @ARGV) >= 2 && ($ARGV[0] eq '-c')) {
+ $config = $ARGV[1];
+ shift @ARGV; shift @ARGV;
+ unless (-f $config) {
+ $config = "$SCRIPT_DIR/$config";
+ unless (-f $config) {
+ $config .= ".config";
+ }
+ }
+}
print STDERR "CORPORA CONFIGURATION: $config\n";
open CONF, "<$config" or die "Can't read $config: $!";
my %paths;
@@ -296,7 +306,7 @@ EOT
sub print_help {
print STDERR<<EOT;
-Usage: $0 [OPTIONS] language-pair grammar.bidir.gz
+Usage: $0 [-c data-config-file] language-pair grammar.bidir.gz [OPTIONS]
Given an induced grammar for an entire corpus (i.e., generated by
local-gi-pipeline.pl), filter and featurize it for a dev and test set,
diff --git a/gi/pipeline/valhalla.config b/gi/pipeline/valhalla.config
new file mode 100644
index 00000000..503cbd4a
--- /dev/null
+++ b/gi/pipeline/valhalla.config
@@ -0,0 +1,8 @@
+# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED
+# name path aligned-corpus LM dev dev-refs test1 test1-eval.sh ...
+btec /home/chris/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
+fbis /home/chris/ws10smt/data/chinese-english.fbis corpus.zh-en.al
+zhen /home/chris/ws10smt/data/chinese-english corpus.zh-en.al
+aren /home/chris/ws10smt/data/arabic-english corpus.ar-en.al
+uren /home/chris/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh
+nlfr /home/chris/ws10smt/data/dutch-french corpus.nl-fr.al