summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2012-11-14 20:33:51 -0500
committerChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2012-11-14 20:33:51 -0500
commit7928695272b000de7142b91e05959a8fab6b1d2a (patch)
tree59fdff666e938512a34f772f04a1a247704a246f
parent41ec6ee5146c92cdb1c279267a5058fe42f8a644 (diff)
major mert clean up, stuff for simple system demo
-rw-r--r--compound-split/README.md (renamed from compound-split/README)0
-rwxr-xr-xcompound-split/make-dict.pl24
-rwxr-xr-xcorpus/cut-corpus.pl16
-rwxr-xr-xcorpus/filter-length.pl14
-rw-r--r--corpus/support/README2
-rwxr-xr-xcorpus/support/fix-contract.pl10
-rwxr-xr-xcorpus/support/quote-norm.pl64
-rw-r--r--corpus/support/token_list212
-rw-r--r--corpus/support/token_patterns3
-rwxr-xr-xcorpus/support/tokenizer.pl717
-rwxr-xr-xcorpus/support/utf8-normalize.sh36
-rwxr-xr-xcorpus/tokenize-anything.sh13
-rwxr-xr-xdpmert/decode-and-evaluate.pl246
-rwxr-xr-xdpmert/dpmert.pl237
-rwxr-xr-xdpmert/parallelize.pl6
15 files changed, 1424 insertions, 176 deletions
diff --git a/compound-split/README b/compound-split/README.md
index b7491007..b7491007 100644
--- a/compound-split/README
+++ b/compound-split/README.md
diff --git a/compound-split/make-dict.pl b/compound-split/make-dict.pl
new file mode 100755
index 00000000..71f2b928
--- /dev/null
+++ b/compound-split/make-dict.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+my %d;
+my $z = 0;
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+while(<STDIN>) {
+ chomp;
+ s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;
+ $_ = lc $_;
+ my @words = split /\s+/;
+ for my $w (@words) {
+ next if length($w) == 0;
+ $d{$w}++;
+ $z++;
+ }
+}
+my $lz = log($z);
+for my $w (sort {$d{$b} <=> $d{$a}} keys %d) {
+ my $c = $lz-log($d{$w});
+ print "$w $c\n";
+}
+
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
new file mode 100755
index 00000000..fc9cce3b
--- /dev/null
+++ b/corpus/cut-corpus.pl
@@ -0,0 +1,16 @@
+#!/usr/bin/perl -w
+use strict;
+die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0;
+
+my $x = shift @ARGV;
+die "N must be numeric" unless $x =~ /^\d+$/;
+$x--;
+
+while(<>) {
+ chomp;
+ my @fields = split / \|\|\| /;
+ my $y = $fields[$x];
+ if (!defined $y) { $y= ''; }
+ print "$y\n";
+}
+
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index d7eacdd7..70032ca7 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -24,6 +24,7 @@ my $rat_max = log(9);
my $lrm = 0;
my $zerof = 0;
my $zeroe = 0;
+my $bad_format = 0;
my $absbadrat = 0;
my $overlene = 0;
my $overlenf = 0;
@@ -34,7 +35,13 @@ while(<F>) {
if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
elsif ($lines % 2500 == 0) { print STDERR "."; }
my ($sf, $se, @d) = split / \|\|\| /;
- die "Bad format: $_" if scalar @d != 0 or !defined $se;
+ if (scalar @d != 0 or !defined $se) {
+ $bad_format++;
+ if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
+ die "Corpus appears to be incorretly formatted, example: $_";
+ }
+ next;
+ }
my @fs = split /\s+/, $sf;
my @es = split /\s+/, $se;
my $flen = scalar @fs;
@@ -78,7 +85,7 @@ for my $lr (@lograts) {
$lsd = sqrt($lsd / scalar @lograts);
@lograts = ();
-my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf;
+my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf + $bad_format;
my $discard_rate = int(10000 * $pass1_discard / $lines) / 100;
print STDERR " Total lines: $lines\n";
print STDERR " Already discared: $pass1_discard\t(discard rate = $discard_rate%)\n";
@@ -96,7 +103,8 @@ while(<F>) {
$lines++;
if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
elsif ($lines % 2500 == 0) { print STDERR "."; }
- my ($sf, $se) = split / \|\|\| /;
+ my ($sf, $se, @d) = split / \|\|\| /;
+ if (scalar @d != 0 or !defined $se) { next; }
my @fs = split /\s+/, $sf;
my @es = split /\s+/, $se;
my $flen = scalar @fs;
diff --git a/corpus/support/README b/corpus/support/README
new file mode 100644
index 00000000..fdbd523e
--- /dev/null
+++ b/corpus/support/README
@@ -0,0 +1,2 @@
+Run ./tokenize.sh to tokenize text
+Edit eng_token_patterns and eng_token_list to add rules for things not to segment
diff --git a/corpus/support/fix-contract.pl b/corpus/support/fix-contract.pl
new file mode 100755
index 00000000..f1e191ab
--- /dev/null
+++ b/corpus/support/fix-contract.pl
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+use strict;
+while(<>) {
+ #s/ (pre|anti|re|pro|inter|intra|multi|e|x|neo) - / $1- /ig;
+ #s/ - (year) - (old)/ -$1-$2/ig;
+ s/ ' (s|m|ll|re|d|ve) / '$1 /ig;
+ s/n ' t / n't /ig;
+ print;
+}
+
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
new file mode 100755
index 00000000..0c5b9c26
--- /dev/null
+++ b/corpus/support/quote-norm.pl
@@ -0,0 +1,64 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+binmode(STDIN,"utf8");
+binmode(STDOUT,"utf8");
+while(<STDIN>) {
+ chomp;
+ $_ = " $_ ";
+ s/&\s*lt\s*;/</gi;
+ s/&\s*gt\s*;/>/gi;
+ s/&\s*squot\s*;/'/gi;
+ s/&\s*quot\s*;/"/gi;
+ s/&\s*amp\s*;/&/gi;
+ s/ (\d\d): (\d\d)/ $1:$2/g;
+ s/[\x{20a0}]\x{20ac}]/ EUR /g;
+ s/[\x{00A3}]/ GBP /g;
+ s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;
+ s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
+ s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
+ s/&\s*#45\s*;/--/g;
+ s/ ,,/ "/g;
+ s/``/"/g;
+ s/''/"/g;
+ s/〃/"/g;
+ s/¨/"/g;
+ s/¡/ ¡ /g;
+ s/¿/ ¿ /g;
+ s/ˇ/'/g;
+ s/´/'/g;
+ s/`/'/g;
+ s/’/'/g;
+ s/ ́/'/g;
+ s/‘/'/g;
+ s/ˉ/'/g;
+ s/β/ß/g; # WMT 2010 error
+ s/“/"/g;
+ s/”/"/g;
+ s/«/"/g;
+ s/»/"/g;
+ tr/!-~/!-~/;
+ s/、/,/g;
+ s/。/./g;
+ s/…/.../g;
+ s/―/--/g;
+ s/–/--/g;
+ s/─/--/g;
+ s/—/--/g;
+ s/•/ * /g;
+ s/\*/ * /g;
+ s/،/,/g;
+ s/؟/?/g;
+ s/ـ/ /g;
+ s/Ã ̄/i/g;
+ s/’/'/g;
+ s/â€"/"/g;
+ s/؛/;/g;
+
+ s/\s+/ /g;
+ s/^\s+//;
+ s/\s+$//;
+ s/[\x{00}-\x{1f}]//g;
+ print "$_\n";
+}
+
diff --git a/corpus/support/token_list b/corpus/support/token_list
new file mode 100644
index 00000000..28eb4396
--- /dev/null
+++ b/corpus/support/token_list
@@ -0,0 +1,212 @@
+##################### hyphenated words added by Fei since 3/7/05
+##X-ray
+
+##################### words made of punct only
+:-
+:-)
+:-(
++=
+-=
+.=
+*=
+>=
+<=
+==
+&&
+||
+=>
+->
+<-
+:)
+:(
+;)
+
+#################### abbr added by Fei
+oz.
+fl.
+tel.
+1.
+2.
+3.
+4.
+5.
+6.
+7.
+8.
+9.
+10.
+
+##################### abbreviation: words that contain period.
+U.A.E
+Ala.
+Ph.D.
+min.
+max.
+z.B.
+d.h.
+ggf.
+ca.
+bzw.
+bzgl.
+Eng.
+i.e.
+a.m.
+am.
+A.M.
+Apr.
+Ariz.
+Ark.
+Aug.
+B.A.T.
+B.A.T
+Calif.
+Co.
+Conn.
+Corp.
+Cos.
+D.C.
+Dec.
+Dept.
+Dr.
+Drs.
+Feb.
+Fla.
+Fri.
+Ga.
+Gen.
+gen.
+GEN.
+Gov.
+Govt.
+Ill.
+Inc.
+Jan.
+Jr.
+Jul.
+Jun.
+Kan.
+L.A.
+Lieut.
+Lt.
+Ltd.
+Ma.
+Mar.
+Mass.
+Md.
+Mfg.
+Mgr.
+Mexican-U.S.
+Mich.
+Minn.
+Mo.
+Mon.
+Mr.
+Mrs.
+Ms.
+Mt.
+N.D.
+Neb.
+Nev.
+No.
+Nos.
+Nov.
+Oct.
+Okla.
+Op.
+Ore.
+Pa.
+p.m
+p.m.
+I.B.C.
+N.T.V
+Pres.
+Prof.
+Prop.
+Rd.
+Rev.
+R.J.
+C.L
+Rte.
+Sat.
+W.T
+Sen.
+Sep.
+Sept.
+Sgt.
+Sr.
+SR.
+St.
+Ste.
+Sun.
+Tenn.
+Tex.
+Thu.
+Tue.
+Univ.
+Va.
+Vt.
+Wed.
+approx.
+dept.
+e.g.
+E.G.
+eg.
+est.
+etc.
+ex.
+ext.
+ft.
+hon.
+hr.
+hrs.
+lab.
+lb.
+lbs.
+mass.
+misc.
+no.
+nos.
+nt.
+para.
+paras.
+pct.
+prod.
+rec.
+ref.
+rel.
+rep.
+sq.
+st.
+stg.
+vol.
+vs.
+U.S.
+J.S.
+U.N.
+u.n.
+A.
+B.
+C.
+D.
+E.
+F.
+G.
+H.
+I.
+J.
+K.
+L.
+M.
+N.
+O.
+P.
+Q.
+R.
+S.
+T.
+U.
+V.
+W.
+X.
+Y.
+Z.
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
new file mode 100644
index 00000000..c0e6fe1a
--- /dev/null
+++ b/corpus/support/token_patterns
@@ -0,0 +1,3 @@
+/^(al|el|ul|e)\-[a-z]+$/
+/^(\d+)\.$/
+
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
new file mode 100755
index 00000000..23be00a5
--- /dev/null
+++ b/corpus/support/tokenizer.pl
@@ -0,0 +1,717 @@
+#!/usr/bin/env perl
+
+my $script_dir;
+BEGIN {$^W = 1; use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
+
+use strict;
+use utf8;
+
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+my $debug = 0;
+
+
+############ options:
+### for all options:
+### 0 means no split on that symbol
+### 1 means split on that symbol in all cases.
+### 2 means do not split in condition 1.
+### n means do not split in any of the conditions in the set {1, 2, ..., n-1}.
+
+
+### prefix
+## for "#": #90
+my $Split_On_SharpSign = 2; # 2: do not split on Num, e.g., "#90"
+
+
+############## "infix"
+my $Split_On_Tilde = 2; # 2: do not split on Num, e.g., "12~13".
+
+my $Split_On_Circ = 2; # 2: do not split on Num, e.g, "2^3"
+
+## for "&"
+my $Split_On_AndSign = 2; # 2: do not split on short Name, e.g., "AT&T".
+
+## for hyphen: 1990-1992
+my $Split_On_Dash = 2; ## 2: do not split on number, e.g., "22-23".
+my $Split_On_Underscore = 0; ## 0: do not split by underline
+
+## for ":": 5:4
+my $Split_On_Semicolon = 2; ## 2: don't split for num, e.g., "5:4"
+
+########### suffix
+## for percent sign: 5%
+my $Split_On_PercentSign = 1; ## 2: don't split num, e.g., 5%
+
+############# others
+## for slash: 1/4
+my $Split_On_Slash = 2; ## 2: don't split on number, e.g., 1/4.
+my $Split_On_BackSlash = 0; ## 0: do not split on "\", e.g., \t
+
+### for "$": US$120
+my $Split_On_DollarSign = 2; ### 2: US$120 => "US$ 120"
+ ### 1: US$120 => "US $ 120"
+## for 's etc.
+my $Split_NAposT = 1; ## n't
+my $Split_AposS = 1; ## 's
+my $Split_AposM = 1; ## 'm
+my $Split_AposRE = 1; ## 're
+my $Split_AposVE = 1; ## 've
+my $Split_AposLL = 1; ## 'll
+my $Split_AposD = 1; ## 'd
+
+
+### some patterns
+my $common_right_punc = '\.|\,|\;|:|\!|\?|\"|\)|\]|\}|\>|\-';
+
+#### step 1: read files
+
+my $workdir = $script_dir;
+my $dict_file = "$workdir/token_list";
+my $word_patt_file = "$workdir/token_patterns";
+
+open(my $dict_fp, "$dict_file") or die;
+
+# read in the list of words that should not be segmented,
+## e.g.,"I.B.M.", co-operation.
+my %dict_hash = ();
+my $dict_entry = 0;
+while(<$dict_fp>){
+ chomp;
+ next if /^\s*$/;
+ s/^\s+//;
+ s/\s+$//;
+ tr/A-Z/a-z/;
+ $dict_hash{$_} = 1;
+ $dict_entry ++;
+}
+
+open(my $patt_fp, "$word_patt_file") or die;
+my @word_patts = ();
+my $word_patt_num = 0;
+while(<$patt_fp>){
+ chomp;
+ next if /^\s*$/;
+ s/^\s+//;
+ s/\s+$//;
+ s/^\/(.+)\/$/$1/; # remove / / around the pattern
+ push(@word_patts, $_);
+ $word_patt_num ++;
+}
+
+
+###### step 2: process the input file
+my $orig_token_total = 0;
+my $deep_proc_token_total = 0;
+my $new_token_total = 0;
+
+my $line_total = 0;
+my $content_line_total = 0;
+
+while(<STDIN>){
+ chomp();
+
+ $line_total ++;
+ if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; }
+ elsif ($line_total % 2500 == 0) { print STDERR "."; }
+
+ if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
+ ## markup
+ print STDOUT "$_\n";
+ next;
+ }
+
+ $content_line_total ++;
+
+ my $orig_num = 0;
+ my $deep_proc_num = 0;
+
+ my $new_line = proc_line($_, \$orig_num, \$deep_proc_num);
+
+ $orig_token_total += $orig_num;
+ $deep_proc_token_total += $deep_proc_num;
+
+ $new_line =~ s/\s+$//;
+ $new_line =~ s/^\s+//;
+ my @parts = split(/\s+/, $new_line);
+ $new_token_total += scalar @parts;
+
+ $new_line =~ s/\s+/ /g;
+# fix sgm-markup tokenization
+ $new_line =~ s/\s*<\s+seg\s+id\s+=\s+(\d+)\s+>/<seg id=$1>/;
+ $new_line =~ s/\s*<\s+(p|hl)\s+>/<$1>/;
+ $new_line =~ s/\s*<\s+\/\s+(p|hl|DOC)\s+>/<\/$1>/;
+ $new_line =~ s/<\s+\/\s+seg\s+>/<\/seg>/;
+ if ($new_line =~ /^\s*<\s+DOC\s+/) {
+ $new_line =~ s/\s+//g;
+ $new_line =~ s/DOC/DOC /;
+ $new_line =~ s/sys/ sys/;
+ }
+ if ($new_line =~ /^\s*<\s+(refset|srcset)\s+/) {
+ $new_line =~ s/\s+//g;
+ $new_line =~ s/(set|src|tgt|trg)/ $1/g;
+ }
+
+ print STDOUT " $new_line\n";
+}
+print STDERR "\n";
+
+########################################################################
+
+### tokenize a line.
+sub proc_line {
+ my @params = @_;
+ my $param_num = scalar @params;
+
+ if(($param_num < 1) || ($param_num > 3)){
+ die "wrong number of params for proc_line: $param_num\n";
+ }
+
+ my $orig_line = $params[0];
+
+ $orig_line =~ s/^\s+//;
+ $orig_line =~ s/\s+$//;
+
+ my @parts = split(/\s+/, $orig_line);
+
+ if($param_num >= 2){
+ my $orig_num_ptr = $params[1];
+ $$orig_num_ptr = scalar @parts;
+ }
+
+ my $new_line = "";
+
+ my $deep_proc_token = 0;
+ foreach my $part (@parts){
+ my $flag = -1;
+ $new_line .= proc_token($part, \$flag) . " ";
+ $deep_proc_token += $flag;
+ }
+
+ if($param_num == 3){
+ my $deep_num_ptr = $params[2];
+ $$deep_num_ptr = $deep_proc_token;
+ }
+
+ return $new_line;
+}
+
+
+
+## Tokenize a str that does not contain " ", return the new string
+## The function handles the cases that the token needs not be segmented.
+## for other cases, it calls deep_proc_token()
+sub proc_token {
+ my @params = @_;
+ my $param_num = scalar @params;
+ if($param_num > 2){
+ die "proc_token: wrong number of params: $param_num\n";
+ }
+
+ my $token = $params[0];
+
+ if(!defined($token)){
+ return "";
+ }
+
+ my $deep_proc_flag;
+
+ if($param_num == 2){
+ $deep_proc_flag = $params[1];
+ $$deep_proc_flag = 0;
+ }
+
+ if($debug){
+ print STDERR "pro_token:+$token+\n";
+ }
+
+ ### step 0: it has only one char
+ if(($token eq "") || ($token=~ /^.$/)){
+ ## print STDERR "see +$token+\n";
+ return $token;
+ }
+
+ ## step 1: check the most common case
+ if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){
+ ### most common cases
+ return $token;
+ }
+
+ ## step 2: check whether it is some NE entity
+ ### 1.2.4.6
+ if($token =~ /^\d+(.\d+)+$/){
+ return $token;
+ }
+
+ ## 1,234,345.34
+ if($token =~ /^\d+(\.\d{3})*,\d+$/){
+ ## number
+ return $token;
+ }
+ if($token =~ /^\d+(,\d{3})*\.\d+$/){
+ ## number
+ return $token;
+ }
+ if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+.*$/){
+ ## twitter hashtag or address
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^[a-z0-9\_\-]+\@[a-z\d\_\-]+(\.[a-z\d\_\-]+)*(.*)$/i){
+ ### email address: xxx@yy.zz
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^(mailto|http|https|ftp|gopher|telnet|file)\:\/{0,2}([^\.]+)(\.(.+))*$/i){
+ ### URL: http://xx.yy.zz
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^(www)(\.(.+))+$/i){
+ ### www.yy.dd/land/
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^(\w+\.)+(com|co|edu|org|gov|ly|cz|ru|eu)(\.[a-z]{2,3})?\:{0,2}(\/\S*)?$/i){
+ ### URL: upenn.edu/~xx
+ return proc_rightpunc($token);
+ }
+
+ if($token =~ /^\(\d{3}\)\d{3}(\-\d{4})($common_right_punc)*$/){
+ ## only handle American phone numbers: e.g., (914)244-4567
+ return proc_rightpunc($token);
+ }
+
+ #my $t1 = '[\x{0600}-\x{06ff}a-z\d\_\.\-]';
+ my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]';
+ if($token =~ /^\/(($t1)+\/)+($t1)+\/?$/i){
+ ### /nls/p/....
+ return $token;
+ }
+
+ if($token =~ /^\\(($t1)+\\)+($t1)+\\?$/i){
+ ### \nls\p\....
+ return $token;
+ }
+
+ ## step 3: check the dictionary
+ my $token_lc = $token;
+ $token_lc =~ tr/A-Z/a-z/;
+
+ if(defined($dict_hash{$token_lc})){
+ return $token;
+ }
+
+ ## step 4: check word_patterns
+ my $i=1;
+ foreach my $patt (@word_patts){
+ if($token_lc =~ /$patt/){
+ if($debug){
+ print STDERR "+$token+ match pattern $i: +$patt+\n";
+ }
+ return $token;
+ }else{
+ $i++;
+ }
+ }
+
+ ## step 5: call deep tokenization
+ if($param_num == 2){
+ $$deep_proc_flag = 1;
+ }
+ return deep_proc_token($token);
+}
+
+
+### remove punct on the right side
+### e.g., xxx@yy.zz, => xxx@yy.zz ,
+sub proc_rightpunc {
+ my ($token) = @_;
+
+ $token =~ s/(($common_right_punc)+)$/ $1 /;
+ if($token =~ /\s/){
+ return proc_line($token);
+ }else{
+ return $token;
+ }
+}
+
+
+
+#######################################
+### return the new token:
+### types of punct:
+## T1 (2): the punct is always a token by itself no matter where it
+### appears: " ;
+## T2 (15): the punct that can be a part of words made of puncts only.
+## ` ! @ + = [ ] ( ) { } | < > ?
+## T3 (15): the punct can be part of a word that contains [a-z\d]
+## T3: ~ ^ & : , # * % - _ \ / . $ '
+## infix: ~ (12~13), ^ (2^3), & (AT&T), : ,
+## prefix: # (#9), * (*3),
+## suffix: % (10%),
+## infix+prefix: - (-5), _ (_foo),
+## more than one position: \ / . $
+## Appos: 'm n't ...
+
+## 1. separate by puncts in T1
+## 2. separate by puncts in T2
+## 3. deal with punct T3 one by one according to options
+## 4. if the token remains unchanged after step 1-3, return the token
+
+## $line contains at least 2 chars, and no space.
+sub deep_proc_token {
+ my ($line) = @_;
+ if($debug){
+ print STDERR "deep_proc_token: +$line+\n";
+ }
+
+ ##### step 0: if it mades up of all puncts, remove one punct at a time.
+ if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}a-zA-Z\d]/){
+ if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){
+ ## ++ @@@@ !!! ....
+ return $line;
+ }
+
+ if($line =~ /^(.)(.+)$/){
+ my $t1 = $1;
+ my $t2 = $2;
+ return $t1 . " " . proc_token($t2);
+ }else{
+ ### one char only
+ print STDERR "deep_proc_token: this should not happen: +$line+\n";
+ return $line;
+ }
+ }
+
+ ##### step 1: separate by punct T2 on the boundary
+ my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;';
+ if($line =~ s/^(($t2)+)/$1 /){
+ return proc_line($line);
+ }
+
+ if($line =~ s/(($t2)+)$/ $1/){
+ return proc_line($line);
+ }
+
+ ## step 2: separate by punct T2 in any position
+ if($line =~ s/(($t2)+)/ $1 /g){
+ return proc_line($line);
+ }
+
+ ##### step 3: deal with special puncts in T3.
+ if($line =~ /^(\,+)(.+)$/){
+ my $t1 = $1;
+ my $t2 = $2;
+ return proc_token($t1) . " " . proc_token($t2);
+ }
+
+ if($line =~ /^(.*[^\,]+)(\,+)$/){
+ ## 19.3,,, => 19.3 ,,,
+ my $t1 = $1;
+ my $t2 = $2;
+ return proc_token($t1) . " " . proc_token($t2);
+ }
+
+ ## remove the ending periods that follow number etc.
+ if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|\-|\_|\/|\\|\$|\'))(\.+)$/){
+ ## 12~13. => 12~13 .
+ my $t1 = $1;
+ my $t3 = $3;
+ return proc_token($t1) . " " . proc_token($t3);
+ }
+
+ ### deal with "$"
+ if(($line =~ /\$/) && ($Split_On_DollarSign > 0)){
+ my $suc = 0;
+ if($Split_On_DollarSign == 1){
+ ## split on all occasation
+ $suc = ($line =~ s/(\$+)/ $1 /g);
+ }else{
+ ## split only between $ and number
+ $suc = ($line =~ s/(\$+)(\d)/$1 $2/g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with "#"
+ if(($line =~ /\#/) && ($Split_On_SharpSign > 0)){
+ my $suc = 0;
+ if($Split_On_SharpSign >= 2){
+ ### keep #50 as a token
+ $suc = ($line =~ s/(\#+)(\D)/ $1 $2/gi);
+ }else{
+ $suc = ($line =~ s/(\#+)/ $1 /gi);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with '
+ if($line =~ /\'/){
+ my $suc = ($line =~ s/([^\'])([\']+)$/$1 $2/g); ## xxx'' => xxx ''
+
+ ### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't
+
+ ## 'there => ' there '98 => the same
+ $suc += ($line =~ s/^(\'+)([a-z]+)/ $1 $2/gi);
+
+ ## note that \' and \. could interact: e.g., U.S.'s; 're.
+ if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){
+ ## doesn't => does n't
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ ## 's, 't, 'm, 'll, 're, 've: they've => they 've
+ ## 1950's => 1950 's Co.'s => Co. 's
+ if($Split_AposS && ($line =~ /^(.+)(\'s)(\W*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($Split_AposM && ($line =~ /^(.*[a-z]+)(\'m)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+
+ if($Split_AposRE && ($line =~ /^(.*[a-z]+)(\'re)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($Split_AposVE && ($line =~ /^(.*[a-z]+)(\'ve)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($Split_AposLL && ($line =~ /^(.*[a-z]+)(\'ll)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($Split_AposD && ($line =~ /^(.*[a-z]+)(\'d)(\.*)$/i)){
+ my $t1 = $1;
+ my $t2 = $2;
+ my $t3 = $3;
+ return proc_token($t1) . " " . $t2 . " " . proc_token($t3);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+
+ ## deal with "~"
+ if(($line =~ /\~/) && ($Split_On_Tilde > 0)){
+ my $suc = 0;
+ if($Split_On_Tilde >= 2){
+ ## keep 12~13 as one token
+ $suc += ($line =~ s/(\D)(\~+)/$1 $2 /g);
+ $suc += ($line =~ s/(\~+)(\D)/ $1 $2/g);
+ $suc += ($line =~ s/^(\~+)(\d)/$1 $2/g);
+ $suc += ($line =~ s/(\d)(\~+)$/$1 $2/g);
+ }else{
+ $suc += ($line =~ s/(\~+)/ $1 /g);
+ }
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with "^"
+ if(($line =~ /\^/) && ($Split_On_Circ > 0)){
+ my $suc = 0;
+ if($Split_On_Circ >= 2){
+ ## keep 12~13 as one token
+ $suc += ($line =~ s/(\D)(\^+)/$1 $2 /g);
+ $suc += ($line =~ s/(\^+)(\D)/ $1 $2/g);
+ }else{
+ $suc = ($line =~ s/(\^+)/ $1 /g);
+ }
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with ":"
+ if(($line =~ /\:/) && ($Split_On_Semicolon > 0)){
+ ## 2: => 2 :
+ my $suc = ($line =~ s/^(\:+)/$1 /);
+ $suc += ($line =~ s/(\:+)$/ $1/);
+ if($Split_On_Semicolon >= 2){
+ ## keep 5:4 as one token
+ $suc += ($line =~ s/(\D)(\:+)/$1 $2 /g);
+ $suc += ($line =~ s/(\:+)(\D)/ $1 $2/g);
+ }else{
+ $suc += ($line =~ s/(\:+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ### deal with hyphen: 1992-1993. 21st-24th
+ if(($line =~ /\-/) && ($Split_On_Dash > 0)){
+ my $suc = ($line =~ s/(\-{2,})/ $1 /g);
+ if($Split_On_Dash >= 2){
+ ## keep 1992-1993 as one token
+ $suc += ($line =~ s/(\D)(\-+)/$1 $2 /g);
+ $suc += ($line =~ s/(\-+)(\D)/ $1 $2/g);
+ }else{
+ ### always split on "-"
+ $suc += ($line =~ s/([\-]+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with "_"
+ if(($line =~ /\_/) && ($Split_On_Underscore > 0)){
+ ### always split on "-"
+ if($line =~ s/([\_]+)/ $1 /g){
+ return proc_line($line);
+ }
+ }
+
+
+
+ ## deal with "%"
+ if(($line =~ /\%/) && ($Split_On_PercentSign > 0)){
+ my $suc = 0;
+ if($Split_On_PercentSign >= 2){
+ $suc += ($line =~ s/(\D)(\%+)/$1 $2/g);
+ }else{
+ $suc += ($line =~ s/(\%+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+
+ ### deal with "/": 4/5
+ if(($line =~ /\//) && ($Split_On_Slash > 0)){
+ my $suc = 0;
+ if($Split_On_Slash >= 2){
+ $suc += ($line =~ s/(\D)(\/+)/$1 $2 /g);
+ $suc += ($line =~ s/(\/+)(\D)/ $1 $2/g);
+ }else{
+ $suc += ($line =~ s/(\/+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+
+ ### deal with comma: 123,456
+ if($line =~ /\,/){
+ my $suc = 0;
+ $suc += ($line =~ s/([^\d]),/$1 , /g); ## xxx, 1923 => xxx , 1923
+ $suc += ($line =~ s/\,\s*([^\d])/ , $1/g); ## 1923, xxx => 1923 , xxx
+
+ $suc += ($line =~ s/,([\d]{1,2}[^\d])/ , $1/g); ## 1,23 => 1 , 23
+ $suc += ($line =~ s/,([\d]{4,}[^\d])/ , $1/g); ## 1,2345 => 1 , 2345
+
+ $suc += ($line =~ s/,([\d]{1,2})$/ , $1/g); ## 1,23 => 1 , 23
+ $suc += ($line =~ s/,([\d]{4,})$/ , $1/g); ## 1,2345 => 1 , 2345
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+
+ ## deal with "&"
+ if(($line =~ /\&/) && ($Split_On_AndSign > 0)){
+ my $suc = 0;
+ if($Split_On_AndSign >= 2){
+ $suc += ($line =~ s/([a-z]{3,})(\&+)/$1 $2 /gi);
+ $suc += ($line =~ s/(\&+)([a-z]{3,})/ $1 $2/gi);
+ }else{
+ $suc += ($line =~ s/(\&+)/ $1 /g);
+ }
+
+ if($suc){
+ return proc_line($line);
+ }
+ }
+
+ ## deal with period
+ if($line =~ /\./){
+ if($line =~ /^(([\+|\-])*(\d+\,)*\d*\.\d+\%*)$/){
+ ### numbers: 3.5
+ return $line;
+ }
+
+ if($line =~ /^(([a-z]\.)+)(\.*)$/i){
+ ## I.B.M.
+ my $t1 = $1;
+ my $t3 = $3;
+ return $t1 . " ". proc_token($t3);
+ }
+
+ ## Feb.. => Feb. .
+ if($line =~ /^(.*[^\.])(\.)(\.*)$/){
+ my $p1 = $1;
+ my $p2 = $2;
+ my $p3 = $3;
+
+ my $p1_lc = $p1;
+ $p1_lc =~ tr/A-Z/a-z/;
+
+ if(defined($dict_hash{$p1_lc . $p2})){
+ ## Dec.. => Dec. .
+ return $p1 . $p2 . " " . proc_token($p3);
+ }elsif(defined($dict_hash{$p1_lc})){
+ return $p1 . " " . proc_token($p2 . $p3);
+ }else{
+ ## this. => this .
+ return proc_token($p1) . " " . proc_token($p2 . $p3);
+ }
+ }
+
+ if($line =~ s/(\.+)(.+)/$1 $2/g){
+ return proc_line($line);
+ }
+ }
+
+
+ ## no pattern applies
+ return $line;
+}
+
+
+
+
+
+
+
+
diff --git a/corpus/support/utf8-normalize.sh b/corpus/support/utf8-normalize.sh
new file mode 100755
index 00000000..2f347854
--- /dev/null
+++ b/corpus/support/utf8-normalize.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# this is the location on malbec, if you want to run on another machine
+# ICU may be installed in /usr or /usr/local
+ICU_DIR=/usr0/tools/icu
+UCONV_BIN=$ICU_DIR/bin/uconv
+UCONV_LIB=$ICU_DIR/lib
+
+if [ -e $UCONV_BIN ] && [ -d $UCONV_LIB ]
+then
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$UCONV_LIB
+ if [ ! -x $UCONV_BIN ]
+ then
+ echo "$0: Cannot execute $UCONV_BIN! Please fix." 1>&2
+ exit
+ fi
+ CMD="$UCONV_BIN -f utf8 -t utf8 -x Any-NFKC --callback skip"
+else
+ if which uconv > /dev/null
+ then
+ CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+ else
+ echo "$0: Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Quality may suffer." 1>&2
+ CMD="iconv -f utf8 -t utf8 -c"
+ fi
+fi
+
+perl -e 'while(<>){s/\r\n*/\n/g; print;}' | $CMD | /usr/bin/perl -w -e '
+ while (<>) {
+ chomp;
+ s/[\x00-\x1F]+/ /g;
+ s/ +/ /g;
+ s/^ //;
+ s/ $//;
+ print "$_\n";
+ }'
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
new file mode 100755
index 00000000..1a24193d
--- /dev/null
+++ b/corpus/tokenize-anything.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+ROOTDIR=`dirname $0`
+SUPPORT=$ROOTDIR/support
+
+$SUPPORT/utf8-normalize.sh |
+ $SUPPORT/quote-norm.pl |
+ $SUPPORT/tokenizer.pl |
+ sed -e 's/ al - / al-/g' |
+ $SUPPORT/fix-contract.pl |
+ sed -e 's/^ //' | sed -e 's/ $//' |
+ perl -e 'while(<>){s/(\d+)(\.+)$/$1 ./;print;}'
+
diff --git a/dpmert/decode-and-evaluate.pl b/dpmert/decode-and-evaluate.pl
new file mode 100755
index 00000000..fe765d00
--- /dev/null
+++ b/dpmert/decode-and-evaluate.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use File::Basename qw(basename);
+my $QSUB_CMD = qsub_args(mert_memory());
+
+require "libcall.pl";
+
+# Default settings
+my $default_jobs = env_default_jobs();
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $parallelize = "$bin_dir/parallelize.pl";
+my $libcall = "$bin_dir/libcall.pl";
+my $sentserver = "$bin_dir/sentserver";
+my $sentclient = "$bin_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+my $cdec = "$bin_dir/../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "9g";
+my $help = 0;
+my $config;
+my $test_set;
+my $weights;
+my $use_make = 1;
+my $useqsub;
+my $cpbin=1;
+# Process command-line options
+if (GetOptions(
+ "jobs=i" => \$jobs,
+ "help" => \$help,
+ "qsub" => \$useqsub,
+ "input=s" => \$test_set,
+ "config=s" => \$config,
+ "weights=s" => \$weights,
+) == 0 || @ARGV!=0 || $help) {
+ print_help();
+ exit;
+}
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+
+if (!defined $test_set) { push @missing_args, "--input"; }
+if (!defined $config) { push @missing_args, "--config"; }
+if (!defined $weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args);
+
+my @tf = localtime(time);
+my $tname = basename($test_set);
+$tname =~ s/\.(sgm|sgml|xml)$//i;
+my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]);
+
+my $time = unchecked_output("date");
+
+check_call("mkdir -p $dir");
+
+split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs");
+my $refs = "-r $dir/test.refs";
+my $newsrc = "$dir/test.input";
+enseg("$dir/test.input.raw", $newsrc);
+my $src_file = $newsrc;
+open F, "<$src_file" or die "Can't read $src_file: $!"; close F;
+
+my $test_trans="$dir/test.trans";
+my $logdir="$dir/logs";
+my $decoderLog="$logdir/decoder.sentserver.log";
+check_call("mkdir -p $logdir");
+
+#decode
+print STDERR "RUNNING DECODER AT ";
+print STDERR unchecked_output("date");
+my $decoder_cmd = "$decoder -c $config --weights $weights";
+my $pcmd;
+if ($use_make) {
+ $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --";
+} else {
+ $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --";
+}
+my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans";
+check_bash_call($cmd);
+print STDERR "DECODER COMPLETED AT ";
+print STDERR unchecked_output("date");
+print STDERR "\nOUTPUT: $test_trans\n\n";
+my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu");
+chomp $bleu;
+print STDERR "BLEU: $bleu\n";
+my $ter = check_output("cat $test_trans | $SCORER $refs -m ter");
+chomp $ter;
+print STDERR " TER: $ter\n";
+open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!";
+print TR <<EOT;
+### SCORE REPORT #############################################################
+ OUTPUT=$test_trans
+ SCRIPT INPUT=$test_set
+ DECODER INPUT=$src_file
+ REFERENCES=$dir/test.refs
+------------------------------------------------------------------------------
+ BLEU=$bleu
+ TER=$ter
+##############################################################################
+EOT
+close TR;
+my $sr = unchecked_output("cat $dir/test.scores");
+print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n";
+exit 0;
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+}
+
+sub print_help {
+ my $executable = basename($0); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options] <ini file>
+
+ $executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] <testset.in-ref>
+
+Options:
+
+ --help
+ Print this message and exit.
+
+ --config <file>
+ A path to the cdec.ini file.
+
+ --weights <file>
+ A file specifying feature weights.
+
+ --dir <dir>
+ Directory for intermediate and output files.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
+sub split_devset {
+ my ($infile, $outsrc, $outref) = @_;
+ open F, "<$infile" or die "Can't read $infile: $!";
+ open S, ">$outsrc" or die "Can't write $outsrc: $!";
+ open R, ">$outref" or die "Can't write $outref: $!";
+ while(<F>) {
+ chomp;
+ my ($src, @refs) = split /\s*\|\|\|\s*/;
+ die "Malformed devset line: $_\n" unless scalar @refs > 0;
+ print S "$src\n";
+ print R join(' ||| ', @refs) . "\n";
+ }
+ close R;
+ close S;
+ close F;
+}
+
diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl
index 2e6a9728..c4f98870 100755
--- a/dpmert/dpmert.pl
+++ b/dpmert/dpmert.pl
@@ -7,15 +7,14 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR
# Skip local config (used for distributing jobs) if we're running in local-only mode
use LocalConfig;
use Getopt::Long;
-use IPC::Open2;
-use POSIX ":sys_wait_h";
-my $QSUB_CMD = qsub_args(mert_memory());
-
+use File::Basename qw(basename);
require "libcall.pl";
+my $QSUB_CMD = qsub_args(mert_memory());
+
# Default settings
-my $srcFile;
-my $refFiles;
+my $srcFile; # deprecated
+my $refFiles; # deprecated
my $default_jobs = env_default_jobs();
my $bin_dir = $SCRIPT_DIR;
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
@@ -37,7 +36,7 @@ die "Can't find decoder in $cdec" unless -x $cdec;
die "Can't find $parallelize" unless -x $parallelize;
die "Can't find $libcall" unless -e $libcall;
my $decoder = $cdec;
-my $lines_per_mapper = 400;
+my $lines_per_mapper = 200;
my $rand_directions = 15;
my $iteration = 1;
my $best_weights;
@@ -47,53 +46,35 @@ my $jobs = $default_jobs; # number of decode nodes
my $pmem = "9g";
my $disable_clean = 0;
my %seen_weights;
-my $normalize;
my $help = 0;
my $epsilon = 0.0001;
-my $interval = 5;
-my $dryrun = 0;
my $last_score = -10000000;
my $metric = "ibm_bleu";
my $dir;
my $iniFile;
my $weights;
my $initialWeights;
-my $decoderOpt;
-my $noprimary;
-my $maxsim=0;
-my $oraclen=0;
-my $oracleb=20;
my $bleu_weight=1;
my $use_make = 1; # use make to parallelize line search
my $useqsub;
my $pass_suffix = '';
-my $devset = '';
-my $cpbin=1;
+my $devset;
# Process command-line options
-Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
- "decoder=s" => \$decoderOpt,
+ "config=s" => \$iniFile,
+ "weights=s" => \$initialWeights,
+ "devset=s" => \$devset,
"jobs=i" => \$jobs,
- "dont-clean" => \$disable_clean,
"pass-suffix=s" => \$pass_suffix,
- "dry-run" => \$dryrun,
- "epsilon=s" => \$epsilon,
"help" => \$help,
- "interval" => \$interval,
"qsub" => \$useqsub,
- "max-iterations=i" => \$max_iterations,
- "normalize=s" => \$normalize,
+ "iterations=i" => \$max_iterations,
"pmem=s" => \$pmem,
- "cpbin!" => \$cpbin,
"random-directions=i" => \$rand_directions,
- "devset=s" => \$devset,
- "ref-files=s" => \$refFiles,
"metric=s" => \$metric,
"source-file=s" => \$srcFile,
- "weights=s" => \$initialWeights,
- "workdir=s" => \$dir,
- "opt-iterations=i" => \$optimization_iters,
-) == 0 || @ARGV!=1 || $help) {
+ "output-dir=s" => \$dir,
+) == 0 || @ARGV!=0 || $help) {
print_help();
exit;
}
@@ -114,22 +95,17 @@ if (defined $srcFile || defined $refFiles) {
EOT
}
+if (!defined $iniFile) { push @missing_args, "--config"; }
if (!defined $devset) { push @missing_args, "--devset"; }
if (!defined $initialWeights) { push @missing_args, "--weights"; }
-die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args);
if ($metric =~ /^(combi|ter)$/i) {
$lines_per_mapper = 40;
} elsif ($metric =~ /^meteor$/i) {
- $lines_per_mapper = 2000; # start up time is really high
+ $lines_per_mapper = 2000; # start up time is really high for METEOR
}
-($iniFile) = @ARGV;
-
-
-sub write_config;
-sub enseg;
-sub print_help;
my $nodelist;
my $host =check_output("hostname"); chomp $host;
@@ -153,8 +129,6 @@ unless ($dir =~ /^\//){ # convert relative path to absolute path
$dir = "$basedir/$dir";
}
-if ($decoderOpt){ $decoder = $decoderOpt; }
-
# Initializations and helper functions
srand;
@@ -169,73 +143,47 @@ sub cleanup {
exit 1;
};
# Always call cleanup, no matter how we exit
-*CORE::GLOBAL::exit =
- sub{ cleanup(); };
+*CORE::GLOBAL::exit = sub{ cleanup(); };
$SIG{INT} = "cleanup";
$SIG{TERM} = "cleanup";
$SIG{HUP} = "cleanup";
-my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $decoderBase = basename($decoder); chomp $decoderBase;
my $newIniFile = "$dir/$decoderBase.ini";
my $inputFileName = "$dir/input";
my $user = $ENV{"USER"};
-
# process ini file
-e $iniFile || die "Error: could not open $iniFile for reading\n";
-open(INI, $iniFile);
-use File::Basename qw(basename);
-#pass bindir, refs to vars holding bin
-sub modbin {
- local $_;
- my $bindir=shift;
- check_call("mkdir -p $bindir");
- -d $bindir || die "couldn't make bindir $bindir";
- for (@_) {
- my $src=$$_;
- $$_="$bindir/".basename($src);
- check_call("cp -p $src $$_");
- }
-}
sub dirsize {
opendir ISEMPTY,$_[0];
return scalar(readdir(ISEMPTY))-1;
}
-if ($dryrun){
- write_config(*STDERR);
- exit 0;
+if (-e $dir) {
+ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs
+ die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n";
} else {
- if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs
- die "ERROR: working dir $dir already exists\n\n";
- } else {
- -e $dir || mkdir $dir;
- mkdir "$dir/hgs";
- modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
- mkdir "$dir/scripts";
- my $cmdfile="$dir/rerun-dpmert.sh";
- open CMD,'>',$cmdfile;
- print CMD "cd ",&getcwd,"\n";
-# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
- my $cline=&cmdline."\n";
- print CMD $cline;
- close CMD;
- print STDERR $cline;
- chmod(0755,$cmdfile);
- unless (-e $initialWeights) {
- print STDERR "Please specify an initial weights file with --initial-weights\n";
- print_help();
- exit;
- }
- check_call("cp $initialWeights $dir/weights.0");
- die "Can't find weights.0" unless (-e "$dir/weights.0");
- }
- write_config(*STDERR);
+ mkdir "$dir" or die "Can't mkdir $dir: $!";
+ mkdir "$dir/hgs" or die;
+ mkdir "$dir/scripts" or die;
+ print STDERR <<EOT;
+ DECODER: $decoder
+ INI FILE: $iniFile
+ WORKING DIR: $dir
+ DEVSET: $devset
+ EVAL METRIC: $metric
+ MAX ITERATIONS: $max_iterations
+ PARALLEL JOBS: $jobs
+ HEAD NODE: $host
+ PMEM (DECODING): $pmem
+ INITIAL WEIGHTS: $initialWeights
+EOT
}
-
# Generate initial files and values
check_call("cp $iniFile $newIniFile");
+check_call("cp $initialWeights $dir/weights.0");
$iniFile = $newIniFile;
split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs");
@@ -280,9 +228,9 @@ while (1){
my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
my $pcmd;
if ($use_make) {
- $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+ $pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --";
} else {
- $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+ $pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --";
}
my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
print STDERR "COMMAND:\n$cmd\n";
@@ -469,29 +417,11 @@ while (1){
print STDERR "\n==========\n";
}
-print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
-
-print STDOUT "$lastWeightsFile\n";
-
+check_call("cp $lastWeightsFile $dir/weights.final");
+print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n";
+print STDOUT "$dir/weights.final\n";
exit 0;
-sub normalize_weights {
- my ($rfn, $rpts, $feat) = @_;
- my @feat_names = @$rfn;
- my @pts = @$rpts;
- my $z = 1.0;
- for (my $i=0; $i < scalar @feat_names; $i++) {
- if ($feat_names[$i] eq $feat) {
- $z = $pts[$i];
- last;
- }
- }
- for (my $i=0; $i < scalar @feat_names; $i++) {
- $pts[$i] /= $z;
- }
- print STDERR " NORM WEIGHTS: @pts\n";
- return @pts;
-}
sub get_lines {
my $fn = shift @_;
@@ -523,27 +453,6 @@ sub read_weights_file {
return join ' ', @r;
}
-# subs
-sub write_config {
- my $fh = shift;
- my $cleanup = "yes";
- if ($disable_clean) {$cleanup = "no";}
-
- print $fh "\n";
- print $fh "DECODER: $decoder\n";
- print $fh "INI FILE: $iniFile\n";
- print $fh "WORKING DIR: $dir\n";
- print $fh "DEVSET: $devset\n";
- print $fh "EVAL METRIC: $metric\n";
- print $fh "START ITERATION: $iteration\n";
- print $fh "MAX ITERATIONS: $max_iterations\n";
- print $fh "PARALLEL JOBS: $jobs\n";
- print $fh "HEAD NODE: $host\n";
- print $fh "PMEM (DECODING): $pmem\n";
- print $fh "CLEANUP: $cleanup\n";
- print $fh "INITIAL WEIGHTS: $initialWeights\n";
-}
-
sub update_weights_file {
my ($neww, $rfn, $rpts) = @_;
my @feats = @$rfn;
@@ -585,22 +494,34 @@ sub enseg {
sub print_help {
- my $executable = check_output("basename $0"); chomp $executable;
- print << "Help";
+ my $executable = basename($0); chomp $executable;
+ print << "Help";
Usage: $executable [options] <ini file>
- $executable [options] <ini file>
- Runs a complete MERT optimization using the decoder configuration
- in <ini file>. Required options are --weights, --source-file, and
- --ref-files.
+ $executable [options]
+ Runs a complete MERT optimization. Required options are --weights,
+ --devset, and --config.
Options:
- --help
- Print this message and exit.
+ --config <file> [-c <file>]
+ The decoder configuration file.
+
+ --devset <file> [-d <file>]
+ The source *and* references for the development set.
+
+ --weights <file> [-w <file>]
+ A file specifying initial feature weights. The format is
+ FeatureName_1 value1
+ FeatureName_2 value2
+ **All and only the weights listed in <file> will be optimized!**
+
+ --metric <name>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
- --max-iterations <M>
+ --iterations <M>
Maximum number of iterations to run. If not specified, defaults
to 10.
@@ -608,39 +529,15 @@ Options:
If the decoder is doing multi-pass decoding, the pass suffix "2",
"3", etc., is used to control what iteration of weights is set.
- --ref-files <files>
- Dev set ref files. This option takes only a single string argument.
- To use multiple files (including file globbing), this argument should
- be quoted.
-
- --metric <method>
- Metric to optimize.
- Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
-
- --normalize <feature-name>
- After each iteration, rescale all feature weights such that feature-
- name has a weight of 1.0.
-
--rand-directions <num>
MERT will attempt to optimize along all of the principle directions,
set this parameter to explore other directions. Defaults to 5.
- --source-file <file>
- Dev set source file.
+ --output-dir <dir>
+ Directory for intermediate and output files.
- --weights <file>
- A file specifying initial feature weights. The format is
- FeatureName_1 value1
- FeatureName_2 value2
- **All and only the weights listed in <file> will be optimized!**
-
- --workdir <dir>
- Directory for intermediate and output files. If not specified, the
- name is derived from the ini filename. Assuming that the ini
- filename begins with the decoder name and ends with ini, the default
- name of the working directory is inferred from the middle part of
- the filename. E.g. an ini file named decoder.foo.ini would have
- a default working directory name foo.
+ --help
+ Print this message and exit.
Job control options:
diff --git a/dpmert/parallelize.pl b/dpmert/parallelize.pl
index 7d0365cc..d2ebaeea 100755
--- a/dpmert/parallelize.pl
+++ b/dpmert/parallelize.pl
@@ -40,7 +40,7 @@ my $stay_alive; # dont let server die when having zero clients
my $joblist = "";
my $errordir="";
my $multiline;
-my @files_to_stage;
+my $workdir = '.';
my $numnodes = 8;
my $user = $ENV{"USER"};
my $pmem = "9g";
@@ -128,7 +128,7 @@ unless (GetOptions(
"recycle-clients" => \$recycle_clients,
"error-dir=s" => \$errordir,
"multi-line" => \$multiline,
- "file=s" => \@files_to_stage,
+ "workdir=s" => \$workdir,
"use-fork" => \$use_fork,
"verbose" => \$verbose,
"jobs=i" => \$numnodes,
@@ -363,7 +363,7 @@ sub launch_job_fork {
}
sub get_temp_script {
- my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh');
+ my ($fh, $filename) = tempfile( "$workdir/workXXXX", SUFFIX => '.sh');
return ($fh, $filename);
}