From df5445c3651fa1cc99ed4bdb682dcf57092dd4e2 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 25 Oct 2012 16:05:56 -0400 Subject: add self translation --- corpus/add-self-translations.pl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100755 corpus/add-self-translations.pl (limited to 'corpus') diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl new file mode 100755 index 00000000..153bc454 --- /dev/null +++ b/corpus/add-self-translations.pl @@ -0,0 +1,29 @@ +#!/usr/bin/perl -w +use strict; + +# ADDS SELF-TRANSLATIONS OF POORLY ATTESTED WORDS TO THE PARALLEL DATA + +my %df; +my %def; +while(<>) { + print; + chomp; + my ($sf, $se) = split / \|\|\| /; + die "Format error: $_\n" unless defined $sf && defined $se; + my @fs = split /\s+/, $sf; + my @es = split /\s+/, $se; + for my $f (@fs) { + $df{$f}++; + for my $e (@es) { + if ($f eq $e) { $def{$f}++; } + } + } +} + +for my $k (sort keys %def) { + next if $df{$k} > 4; + print "$k ||| $k\n"; + print "$k ||| $k\n"; + print "$k ||| $k\n"; +} + -- cgit v1.2.3 From 782fb27af98ed98256cc25c832131c59c8e9ce9c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Nov 2012 21:34:14 -0500 Subject: script to add sos/eos --- corpus/add-sos-eos.pl | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100755 corpus/add-sos-eos.pl (limited to 'corpus') diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl new file mode 100755 index 00000000..5e2d44cb --- /dev/null +++ b/corpus/add-sos-eos.pl @@ -0,0 +1,24 @@ +#!/usr/bin/perl -w +use strict; + +while(<>) { + chomp; + my @fields = split / \|\|\| /; + my ($ff, $ee, $aa) = @fields; + die "Expected: foreign ||| target ||| alignments" unless scalar @fields == 3; + my @fs = split /\s+/, $ff; + my @es = split /\s+/, $ee; + my @as = split /\s+/, $aa; + my @oas = (); + push @oas, '0-0'; + my $flen = scalar @fs; + my $elen = scalar @es; + for my $ap (@as) { + my ($a, $b) = split /-/, $ap; + die "Bad format in: @as" unless defined $a && defined $b; + push @oas, ($a + 1) . '-' . ($b + 1); + } + push @oas, ($flen + 1) . '-' . 
($elen + 1);
+  print "<s> $ff </s> ||| <s> $ee </s> ||| @oas\n";
+}
+
-- cgit v1.2.3

From 552793bbd50f634ea755b84d47ddcc6cd4f158f2 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 6 Nov 2012 00:02:38 -0500
Subject: add lowercase script

---
 corpus/lowercase.pl | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100755 corpus/lowercase.pl

(limited to 'corpus')

diff --git a/corpus/lowercase.pl b/corpus/lowercase.pl
new file mode 100755
index 00000000..688e493b
--- /dev/null
+++ b/corpus/lowercase.pl
@@ -0,0 +1,9 @@
+#!/usr/bin/perl -w
+use strict;
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+while(<>) {
+  $_ = lc $_;
+  print;
+}
+
-- cgit v1.2.3

From 7928695272b000de7142b91e05959a8fab6b1d2a Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Wed, 14 Nov 2012 20:33:51 -0500
Subject: major mert clean up, stuff for simple system demo

---
 compound-split/README            |  51 ---
 compound-split/README.md         |  51 +++
 compound-split/make-dict.pl      |  24 ++
 corpus/cut-corpus.pl             |  16 +
 corpus/filter-length.pl          |  14 +-
 corpus/support/README            |   2 +
 corpus/support/fix-contract.pl   |  10 +
 corpus/support/quote-norm.pl     |  64 ++++
 corpus/support/token_list        | 212 ++++++++++++
 corpus/support/token_patterns    |   3 +
 corpus/support/tokenizer.pl      | 717 +++++++++++++++++++++++++++++++++++++++
 corpus/support/utf8-normalize.sh |  36 ++
 corpus/tokenize-anything.sh      |  13 +
 dpmert/decode-and-evaluate.pl    | 246 ++++++++++++++
 dpmert/dpmert.pl                 | 237 ++++---------
 dpmert/parallelize.pl            |   6 +-
 16 files changed, 1475 insertions(+), 227 deletions(-)
 delete mode 100644 compound-split/README
 create mode 100644 compound-split/README.md
 create mode 100755 compound-split/make-dict.pl
 create mode 100755 corpus/cut-corpus.pl
 create mode 100644 corpus/support/README
 create mode 100755 corpus/support/fix-contract.pl
 create mode 100755 corpus/support/quote-norm.pl
 create mode 100644 corpus/support/token_list
 create mode 100644 corpus/support/token_patterns
 create mode 100755 corpus/support/tokenizer.pl
 create mode 100755 corpus/support/utf8-normalize.sh
 create mode 100755 corpus/tokenize-anything.sh
 create mode 100755 dpmert/decode-and-evaluate.pl

(limited to 'corpus')

diff --git a/compound-split/README b/compound-split/README
deleted file mode 100644
index b7491007..00000000
--- a/compound-split/README
+++ /dev/null
@@ -1,51 +0,0 @@
-Instructions for running the compound splitter, which is a reimplementation
-and extension (more features, larger non-word list) of the model described in
-
-  C. Dyer. (2009) Using a maximum entropy model to build segmentation
-  lattices for MT. In Proceedings of NAACL HLT 2009,
-  Boulder, Colorado, June 2009
-
-If you use this software, please cite this paper.
-
-
-GENERATING 1-BEST SEGMENTATIONS AND LATTICES
-------------------------------------------------------------------------------
-
-Here are some sample invocations:
-
-  ./compound-split.pl --output 1best < infile.txt > out.1best.txt
-    Segment infile.txt according to the 1-best segmentation file.
-
-  ./compound-split.pl --output plf < infile.txt > out.plf
-
-  ./compound-split.pl --output plf --beam 3.5 < infile.txt > out.plf
-    This generates denser lattices than usual (the default beam threshold
-    is 2.2, higher numbers do less pruning)
-
-
-MODEL TRAINING (only for the adventuresome)
-------------------------------------------------------------------------------
-
-I've included some training data for training a German language lattice
-segmentation model, and if you want to explore, you can add to or change the data.
-If you're especially adventuresome, you can add features to cdec (the current
-feature functions are found in ff_csplit.cc). The training/references are
-in the file:
-
-  dev.in-ref
-
-The format is the unsegmented form on the right and the reference lattice on
-the left, separated by a triple pipe ( ||| ). Note that the segmentation
-model inserts a # as the first word, so your segmentation references must
-include this.
-
-To retrain the model (using MAP estimation of a conditional model), do the
-following:
-
-  cd de
-  ./TRAIN
-
-Note, the optimization objective is supposed to be non-convex, but I haven't
-found much of an effect of where I initialize things. But I haven't looked
-very hard - this might be something to explore.
-
diff --git a/compound-split/README.md b/compound-split/README.md
new file mode 100644
index 00000000..b7491007
--- /dev/null
+++ b/compound-split/README.md
@@ -0,0 +1,51 @@
+Instructions for running the compound splitter, which is a reimplementation
+and extension (more features, larger non-word list) of the model described in
+
+  C. Dyer. (2009) Using a maximum entropy model to build segmentation
+  lattices for MT. In Proceedings of NAACL HLT 2009,
+  Boulder, Colorado, June 2009
+
+If you use this software, please cite this paper.
+
+
+GENERATING 1-BEST SEGMENTATIONS AND LATTICES
+------------------------------------------------------------------------------
+
+Here are some sample invocations:
+
+  ./compound-split.pl --output 1best < infile.txt > out.1best.txt
+    Segment infile.txt according to the 1-best segmentation file.
+
+  ./compound-split.pl --output plf < infile.txt > out.plf
+
+  ./compound-split.pl --output plf --beam 3.5 < infile.txt > out.plf
+    This generates denser lattices than usual (the default beam threshold
+    is 2.2, higher numbers do less pruning)
+
+
+MODEL TRAINING (only for the adventuresome)
+------------------------------------------------------------------------------
+
+I've included some training data for training a German language lattice
+segmentation model, and if you want to explore, you can add to or change the data.
+If you're especially adventuresome, you can add features to cdec (the current
+feature functions are found in ff_csplit.cc). The training/references are
+in the file:
+
+  dev.in-ref
+
+The format is the unsegmented form on the right and the reference lattice on
+the left, separated by a triple pipe ( ||| ). Note that the segmentation
+model inserts a # as the first word, so your segmentation references must
+include this.
+
+To retrain the model (using MAP estimation of a conditional model), do the
+following:
+
+  cd de
+  ./TRAIN
+
+Note, the optimization objective is supposed to be non-convex, but I haven't
+found much of an effect of where I initialize things. But I haven't looked
+very hard - this might be something to explore.
+
diff --git a/compound-split/make-dict.pl b/compound-split/make-dict.pl
new file mode 100755
index 00000000..71f2b928
--- /dev/null
+++ b/compound-split/make-dict.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+my %d;
+my $z = 0;
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+while(<STDIN>) {
+  chomp;
+  s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;
+  $_ = lc $_;
+  my @words = split /\s+/;
+  for my $w (@words) {
+    next if length($w) == 0;
+    $d{$w}++;
+    $z++;
+  }
+}
+my $lz = log($z);
+for my $w (sort {$d{$b} <=> $d{$a}} keys %d) {
+  my $c = $lz-log($d{$w});
+  print "$w $c\n";
+}
+
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
new file mode 100755
index 00000000..fc9cce3b
--- /dev/null
+++ b/corpus/cut-corpus.pl
@@ -0,0 +1,16 @@
+#!/usr/bin/perl -w
+use strict;
+die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0;
+
+my $x = shift @ARGV;
+die "N must be numeric" unless $x =~ /^\d+$/;
+$x--;
+
+while(<>) {
+  chomp;
+  my @fields = split / \|\|\| /;
+  my $y = $fields[$x];
+  if (!defined $y) { $y= ''; }
+  print "$y\n";
+}
+
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index d7eacdd7..70032ca7 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -24,6 +24,7 @@ my $rat_max = log(9);
 my $lrm = 0;
 my $zerof = 0;
 my $zeroe = 0;
+my $bad_format = 0;
 my $absbadrat = 0;
 my $overlene = 0;
 my $overlenf = 0;
@@ -34,7 +35,13 @@ while(<F>) {
   if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
   elsif ($lines % 2500 == 0) { print STDERR "."; }
   my ($sf, $se, @d) = split / \|\|\| /;
-  die "Bad format: $_" if scalar @d != 0 or !defined $se;
+  if (scalar @d != 0 or !defined $se) {
+    $bad_format++;
+    if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
+      die "Corpus appears to be incorrectly formatted, example: $_";
+    }
+    next;
+  }
   my @fs = split /\s+/, $sf;
   my @es = split /\s+/, $se;
   my $flen = scalar @fs;
@@ -78,7 +85,7 @@ for my $lr (@lograts) {
 $lsd = sqrt($lsd / scalar @lograts);
 @lograts = ();
 
-my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf;
+my $pass1_discard = $zerof + $zeroe + $absbadrat + $overlene + $overlenf + $bad_format;
 my $discard_rate = int(10000 * $pass1_discard / $lines) / 100;
 print STDERR "  Total lines: $lines\n";
 print STDERR "  Already discarded: $pass1_discard\t(discard rate = $discard_rate%)\n";
@@ -96,7 +103,8 @@ while(<F>) {
   $lines++;
   if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
   elsif ($lines % 2500 == 0) { print STDERR "."; }
-  my ($sf, $se) = split / \|\|\| /;
+  my ($sf, $se, @d) = split / \|\|\| /;
+  if (scalar @d != 0 or !defined $se) { next; }
   my @fs = split /\s+/, $sf;
   my @es = split /\s+/, $se;
   my $flen = scalar @fs;
diff --git a/corpus/support/README b/corpus/support/README
new file mode 100644
index 00000000..fdbd523e
--- /dev/null
+++ b/corpus/support/README
@@ -0,0 +1,2 @@
+Run ./tokenize.sh to tokenize text
+Edit eng_token_patterns and eng_token_list to add rules for things not to segment
diff --git a/corpus/support/fix-contract.pl b/corpus/support/fix-contract.pl
new file mode 100755
index 00000000..f1e191ab
--- /dev/null
+++ b/corpus/support/fix-contract.pl
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+use strict;
+while(<>) {
+  #s/ (pre|anti|re|pro|inter|intra|multi|e|x|neo) - / $1- /ig;
+  #s/ - (year) - (old)/ -$1-$2/ig;
+  s/ ' (s|m|ll|re|d|ve) / '$1 /ig;
+  s/n ' t / n't /ig;
+  print;
+}
+
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
new file mode 100755
index 00000000..0c5b9c26
---
/dev/null +++ b/corpus/support/quote-norm.pl @@ -0,0 +1,64 @@ +#!/usr/bin/perl -w +use strict; +use utf8; +binmode(STDIN,"utf8"); +binmode(STDOUT,"utf8"); +while() { + chomp; + $_ = " $_ "; + s/&\s*lt\s*;//gi; + s/&\s*squot\s*;/'/gi; + s/&\s*quot\s*;/"/gi; + s/&\s*amp\s*;/&/gi; + s/ (\d\d): (\d\d)/ $1:$2/g; + s/[\x{20a0}]\x{20ac}]/ EUR /g; + s/[\x{00A3}]/ GBP /g; + s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g; + s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; + s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; + s/&\s*#45\s*;/--/g; + s/ ,,/ "/g; + s/``/"/g; + s/''/"/g; + s/〃/"/g; + s/¨/"/g; + s/¡/ ¡ /g; + s/¿/ ¿ /g; + s/ˇ/'/g; + s/´/'/g; + s/`/'/g; + s/’/'/g; + s/ ́/'/g; + s/‘/'/g; + s/ˉ/'/g; + s/β/ß/g; # WMT 2010 error + s/“/"/g; + s/”/"/g; + s/«/"/g; + s/»/"/g; + tr/!-~/!-~/; + s/、/,/g; + s/。/./g; + s/…/.../g; + s/―/--/g; + s/–/--/g; + s/─/--/g; + s/—/--/g; + s/•/ * /g; + s/\*/ * /g; + s/،/,/g; + s/؟/?/g; + s/ـ/ /g; + s/à ̄/i/g; + s/’/'/g; + s/â€"/"/g; + s/؛/;/g; + + s/\s+/ /g; + s/^\s+//; + s/\s+$//; + s/[\x{00}-\x{1f}]//g; + print "$_\n"; +} + diff --git a/corpus/support/token_list b/corpus/support/token_list new file mode 100644 index 00000000..28eb4396 --- /dev/null +++ b/corpus/support/token_list @@ -0,0 +1,212 @@ +##################### hyphenated words added by Fei since 3/7/05 +##X-ray + +##################### words made of punct only +:- +:-) +:-( ++= +-= +.= +*= +>= +<= +== +&& +|| +=> +-> +<- +:) +:( +;) + +#################### abbr added by Fei +oz. +fl. +tel. +1. +2. +3. +4. +5. +6. +7. +8. +9. +10. + +##################### abbreviation: words that contain period. +U.A.E +Ala. +Ph.D. +min. +max. +z.B. +d.h. +ggf. +ca. +bzw. +bzgl. +Eng. +i.e. +a.m. +am. +A.M. +Apr. +Ariz. +Ark. +Aug. +B.A.T. +B.A.T +Calif. +Co. +Conn. +Corp. +Cos. +D.C. +Dec. +Dept. +Dr. +Drs. +Feb. +Fla. +Fri. +Ga. +Gen. +gen. +GEN. +Gov. +Govt. +Ill. +Inc. +Jan. +Jr. +Jul. +Jun. +Kan. +L.A. +Lieut. +Lt. +Ltd. +Ma. +Mar. +Mass. +Md. +Mfg. +Mgr. +Mexican-U.S. +Mich. +Minn. +Mo. +Mon. +Mr. +Mrs. +Ms. +Mt. +N.D. +Neb. +Nev. +No. +Nos. +Nov. +Oct. +Okla. +Op. +Ore. +Pa. +p.m +p.m. +I.B.C. +N.T.V +Pres. +Prof. +Prop. +Rd. +Rev. +R.J. +C.L +Rte. +Sat. +W.T +Sen. +Sep. +Sept. +Sgt. +Sr. +SR. +St. +Ste. +Sun. +Tenn. +Tex. +Thu. +Tue. +Univ. +Va. +Vt. +Wed. +approx. +dept. +e.g. +E.G. +eg. +est. +etc. +ex. +ext. +ft. +hon. +hr. +hrs. +lab. +lb. +lbs. +mass. +misc. +no. +nos. +nt. +para. +paras. +pct. +prod. +rec. +ref. +rel. +rep. +sq. +st. +stg. +vol. +vs. +U.S. +J.S. +U.N. +u.n. +A. +B. +C. +D. +E. +F. +G. +H. +I. +J. +K. +L. +M. +N. +O. +P. +Q. +R. +S. +T. +U. +V. +W. +X. +Y. +Z. diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns new file mode 100644 index 00000000..c0e6fe1a --- /dev/null +++ b/corpus/support/token_patterns @@ -0,0 +1,3 @@ +/^(al|el|ul|e)\-[a-z]+$/ +/^(\d+)\.$/ + diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl new file mode 100755 index 00000000..23be00a5 --- /dev/null +++ b/corpus/support/tokenizer.pl @@ -0,0 +1,717 @@ +#!/usr/bin/env perl + +my $script_dir; +BEGIN {$^W = 1; use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } + +use strict; +use utf8; + +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +my $debug = 0; + + +############ options: +### for all options: +### 0 means no split on that symbol +### 1 means split on that symbol in all cases. +### 2 means do not split in condition 1. +### n means do not split in any of the conditions in the set {1, 2, ..., n-1}. 
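+##
+## An illustrative note (editorial sketch, not part of the original
+## script): under this scheme, with $Split_On_Dash = 2 as set below, a
+## numeric pair like "22-23" is kept as a single token while "black-box"
+## is still split into "black - box"; a value of 1 would split on every
+## "-", and 0 would never split on it.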
+
+
+### prefix
+## for "#": #90
+my $Split_On_SharpSign = 2; # 2: do not split on Num, e.g., "#90"
+
+
+############## "infix"
+my $Split_On_Tilde = 2; # 2: do not split on Num, e.g., "12~13".
+
+my $Split_On_Circ = 2; # 2: do not split on Num, e.g, "2^3"
+
+## for "&"
+my $Split_On_AndSign = 2; # 2: do not split on short Name, e.g., "AT&T".
+
+## for hyphen: 1990-1992
+my $Split_On_Dash = 2; ## 2: do not split on number, e.g., "22-23".
+my $Split_On_Underscore = 0; ## 0: do not split by underline
+
+## for ":": 5:4
+my $Split_On_Semicolon = 2; ## 2: don't split for num, e.g., "5:4"
+
+########### suffix
+## for percent sign: 5%
+my $Split_On_PercentSign = 1; ## 2: don't split num, e.g., 5%
+
+############# others
+## for slash: 1/4
+my $Split_On_Slash = 2; ## 2: don't split on number, e.g., 1/4.
+my $Split_On_BackSlash = 0; ## 0: do not split on "\", e.g., \t
+
+### for "$": US$120
+my $Split_On_DollarSign = 2; ### 2: US$120 => "US$ 120"
+                             ### 1: US$120 => "US $ 120"
+## for 's etc.
+my $Split_NAposT = 1; ## n't
+my $Split_AposS = 1;  ## 's
+my $Split_AposM = 1;  ## 'm
+my $Split_AposRE = 1; ## 're
+my $Split_AposVE = 1; ## 've
+my $Split_AposLL = 1; ## 'll
+my $Split_AposD = 1;  ## 'd
+
+
+### some patterns
+my $common_right_punc = '\.|\,|\;|:|\!|\?|\"|\)|\]|\}|\>|\-';
+
+#### step 1: read files
+
+my $workdir = $script_dir;
+my $dict_file = "$workdir/token_list";
+my $word_patt_file = "$workdir/token_patterns";
+
+open(my $dict_fp, "$dict_file") or die;
+
+# read in the list of words that should not be segmented,
+## e.g.,"I.B.M.", co-operation.
+my %dict_hash = ();
+my $dict_entry = 0;
+while(<$dict_fp>){
+  chomp;
+  next if /^\s*$/;
+  s/^\s+//;
+  s/\s+$//;
+  tr/A-Z/a-z/;
+  $dict_hash{$_} = 1;
+  $dict_entry ++;
+}
+
+open(my $patt_fp, "$word_patt_file") or die;
+my @word_patts = ();
+my $word_patt_num = 0;
+while(<$patt_fp>){
+  chomp;
+  next if /^\s*$/;
+  s/^\s+//;
+  s/\s+$//;
+  s/^\/(.+)\/$/$1/; # remove / / around the pattern
+  push(@word_patts, $_);
+  $word_patt_num ++;
+}
+
+
+###### step 2: process the input file
+my $orig_token_total = 0;
+my $deep_proc_token_total = 0;
+my $new_token_total = 0;
+
+my $line_total = 0;
+my $content_line_total = 0;
+
+while(<STDIN>){
+  chomp();
+
+  $line_total ++;
+  if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; }
+  elsif ($line_total % 2500 == 0) { print STDERR "."; }
+
+  if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
+    ## markup
+    print STDOUT "$_\n";
+    next;
+  }
+
+  $content_line_total ++;
+
+  my $orig_num = 0;
+  my $deep_proc_num = 0;
+
+  my $new_line = proc_line($_, \$orig_num, \$deep_proc_num);
+
+  $orig_token_total += $orig_num;
+  $deep_proc_token_total += $deep_proc_num;
+
+  $new_line =~ s/\s+$//;
+  $new_line =~ s/^\s+//;
+  my @parts = split(/\s+/, $new_line);
+  $new_token_total += scalar @parts;
+
+  ## fix sgm-markup tokenization
+  $new_line =~ s/\s*<\s+seg\s+id\s+=\s+(\d+)\s+>/<seg id=$1>/;
+  $new_line =~ s/\s*<\s+(p|hl)\s+>/<$1>/;
+  $new_line =~ s/\s*<\s+\/\s+(p|hl|DOC)\s+>/<\/$1>/;
+  $new_line =~ s/<\s+\/\s+seg\s+>/<\/seg>/;
+  if ($new_line =~ /^\s*<\s+DOC\s+/) {
+    $new_line =~ s/\s+//g;
+    $new_line =~ s/DOC/DOC /;
+    $new_line =~ s/sys/ sys/;
+  }
+  if ($new_line =~ /^\s*<\s+(refset|srcset)\s+/) {
+    $new_line =~ s/\s+//g;
+    $new_line =~ s/(set|src|tgt|trg)/ $1/g;
+  }
+
+  print STDOUT " $new_line\n";
+}
+print STDERR "\n";
+
+########################################################################
+
+### tokenize a line.
+sub proc_line {
+  my @params = @_;
+  my $param_num = scalar @params;
+
+  if(($param_num < 1) || ($param_num > 3)){
+    die "wrong number of params for proc_line: $param_num\n";
+  }
+
+  my $orig_line = $params[0];
+
+  $orig_line =~ s/^\s+//;
+  $orig_line =~ s/\s+$//;
+
+  my @parts = split(/\s+/, $orig_line);
+
+  if($param_num >= 2){
+    my $orig_num_ptr = $params[1];
+    $$orig_num_ptr = scalar @parts;
+  }
+
+  my $new_line = "";
+
+  my $deep_proc_token = 0;
+  foreach my $part (@parts){
+    my $flag = -1;
+    $new_line .= proc_token($part, \$flag) .
" "; + $deep_proc_token += $flag; + } + + if($param_num == 3){ + my $deep_num_ptr = $params[2]; + $$deep_num_ptr = $deep_proc_token; + } + + return $new_line; +} + + + +## Tokenize a str that does not contain " ", return the new string +## The function handles the cases that the token needs not be segmented. +## for other cases, it calls deep_proc_token() +sub proc_token { + my @params = @_; + my $param_num = scalar @params; + if($param_num > 2){ + die "proc_token: wrong number of params: $param_num\n"; + } + + my $token = $params[0]; + + if(!defined($token)){ + return ""; + } + + my $deep_proc_flag; + + if($param_num == 2){ + $deep_proc_flag = $params[1]; + $$deep_proc_flag = 0; + } + + if($debug){ + print STDERR "pro_token:+$token+\n"; + } + + ### step 0: it has only one char + if(($token eq "") || ($token=~ /^.$/)){ + ## print STDERR "see +$token+\n"; + return $token; + } + + ## step 1: check the most common case + if($token =~ /^[a-z0-9\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+$/i){ + ### most common cases + return $token; + } + + ## step 2: check whether it is some NE entity + ### 1.2.4.6 + if($token =~ /^\d+(.\d+)+$/){ + return $token; + } + + ## 1,234,345.34 + if($token =~ /^\d+(\.\d{3})*,\d+$/){ + ## number + return $token; + } + if($token =~ /^\d+(,\d{3})*\.\d+$/){ + ## number + return $token; + } + if($token =~ /^(@|#)[A-Za-z0-9_\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]+.*$/){ + ## twitter hashtag or address + return proc_rightpunc($token); + } + + if($token =~ /^[a-z0-9\_\-]+\@[a-z\d\_\-]+(\.[a-z\d\_\-]+)*(.*)$/i){ + ### email address: xxx@yy.zz + return proc_rightpunc($token); + } + + if($token =~ /^(mailto|http|https|ftp|gopher|telnet|file)\:\/{0,2}([^\.]+)(\.(.+))*$/i){ + ### URL: http://xx.yy.zz + return proc_rightpunc($token); + } + + if($token =~ /^(www)(\.(.+))+$/i){ + ### www.yy.dd/land/ + return proc_rightpunc($token); + } + + if($token =~ /^(\w+\.)+(com|co|edu|org|gov|ly|cz|ru|eu)(\.[a-z]{2,3})?\:{0,2}(\/\S*)?$/i){ + ### URL: upenn.edu/~xx + return proc_rightpunc($token); + } + + if($token =~ /^\(\d{3}\)\d{3}(\-\d{4})($common_right_punc)*$/){ + ## only handle American phone numbers: e.g., (914)244-4567 + return proc_rightpunc($token); + } + + #my $t1 = '[\x{0600}-\x{06ff}a-z\d\_\.\-]'; + my $t1 = '[a-z\d\_\-\.\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}]'; + if($token =~ /^\/(($t1)+\/)+($t1)+\/?$/i){ + ### /nls/p/.... + return $token; + } + + if($token =~ /^\\(($t1)+\\)+($t1)+\\?$/i){ + ### \nls\p\.... + return $token; + } + + ## step 3: check the dictionary + my $token_lc = $token; + $token_lc =~ tr/A-Z/a-z/; + + if(defined($dict_hash{$token_lc})){ + return $token; + } + + ## step 4: check word_patterns + my $i=1; + foreach my $patt (@word_patts){ + if($token_lc =~ /$patt/){ + if($debug){ + print STDERR "+$token+ match pattern $i: +$patt+\n"; + } + return $token; + }else{ + $i++; + } + } + + ## step 5: call deep tokenization + if($param_num == 2){ + $$deep_proc_flag = 1; + } + return deep_proc_token($token); +} + + +### remove punct on the right side +### e.g., xxx@yy.zz, => xxx@yy.zz , +sub proc_rightpunc { + my ($token) = @_; + + $token =~ s/(($common_right_punc)+)$/ $1 /; + if($token =~ /\s/){ + return proc_line($token); + }else{ + return $token; + } +} + + + +####################################### +### return the new token: +### types of punct: +## T1 (2): the punct is always a token by itself no matter where it +### appears: " ; +## T2 (15): the punct that can be a part of words made of puncts only. +## ` ! @ + = [ ] ( ) { } | < > ? 
+## T3 (15): the punct can be part of a word that contains [a-z\d] +## T3: ~ ^ & : , # * % - _ \ / . $ ' +## infix: ~ (12~13), ^ (2^3), & (AT&T), : , +## prefix: # (#9), * (*3), +## suffix: % (10%), +## infix+prefix: - (-5), _ (_foo), +## more than one position: \ / . $ +## Appos: 'm n't ... + +## 1. separate by puncts in T1 +## 2. separate by puncts in T2 +## 3. deal with punct T3 one by one according to options +## 4. if the token remains unchanged after step 1-3, return the token + +## $line contains at least 2 chars, and no space. +sub deep_proc_token { + my ($line) = @_; + if($debug){ + print STDERR "deep_proc_token: +$line+\n"; + } + + ##### step 0: if it mades up of all puncts, remove one punct at a time. + if($line !~ /[\p{Cyrillic}\p{Greek}\p{Hebrew}\p{Han}\p{Arabic}a-zA-Z\d]/){ + if($line =~ /^(\!+|\@+|\++|\=+|\*+|\<+|\>+|\|+|\?+|\.+|\-+|\_+|\&+)$/){ + ## ++ @@@@ !!! .... + return $line; + } + + if($line =~ /^(.)(.+)$/){ + my $t1 = $1; + my $t2 = $2; + return $t1 . " " . proc_token($t2); + }else{ + ### one char only + print STDERR "deep_proc_token: this should not happen: +$line+\n"; + return $line; + } + } + + ##### step 1: separate by punct T2 on the boundary + my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;'; + if($line =~ s/^(($t2)+)/$1 /){ + return proc_line($line); + } + + if($line =~ s/(($t2)+)$/ $1/){ + return proc_line($line); + } + + ## step 2: separate by punct T2 in any position + if($line =~ s/(($t2)+)/ $1 /g){ + return proc_line($line); + } + + ##### step 3: deal with special puncts in T3. + if($line =~ /^(\,+)(.+)$/){ + my $t1 = $1; + my $t2 = $2; + return proc_token($t1) . " " . proc_token($t2); + } + + if($line =~ /^(.*[^\,]+)(\,+)$/){ + ## 19.3,,, => 19.3 ,,, + my $t1 = $1; + my $t2 = $2; + return proc_token($t1) . " " . proc_token($t2); + } + + ## remove the ending periods that follow number etc. + if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|\-|\_|\/|\\|\$|\'))(\.+)$/){ + ## 12~13. => 12~13 . + my $t1 = $1; + my $t3 = $3; + return proc_token($t1) . " " . proc_token($t3); + } + + ### deal with "$" + if(($line =~ /\$/) && ($Split_On_DollarSign > 0)){ + my $suc = 0; + if($Split_On_DollarSign == 1){ + ## split on all occasation + $suc = ($line =~ s/(\$+)/ $1 /g); + }else{ + ## split only between $ and number + $suc = ($line =~ s/(\$+)(\d)/$1 $2/g); + } + + if($suc){ + return proc_line($line); + } + } + + ## deal with "#" + if(($line =~ /\#/) && ($Split_On_SharpSign > 0)){ + my $suc = 0; + if($Split_On_SharpSign >= 2){ + ### keep #50 as a token + $suc = ($line =~ s/(\#+)(\D)/ $1 $2/gi); + }else{ + $suc = ($line =~ s/(\#+)/ $1 /gi); + } + + if($suc){ + return proc_line($line); + } + } + + ## deal with ' + if($line =~ /\'/){ + my $suc = ($line =~ s/([^\'])([\']+)$/$1 $2/g); ## xxx'' => xxx '' + + ### deal with ': e.g., 's, 't, 'm, 'll, 're, 've, n't + + ## 'there => ' there '98 => the same + $suc += ($line =~ s/^(\'+)([a-z]+)/ $1 $2/gi); + + ## note that \' and \. could interact: e.g., U.S.'s; 're. + if($Split_NAposT && ($line =~ /^(.*[a-z]+)(n\'t)([\.]*)$/i)){ + ## doesn't => does n't + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + ## 's, 't, 'm, 'll, 're, 've: they've => they 've + ## 1950's => 1950 's Co.'s => Co. 's + if($Split_AposS && ($line =~ /^(.+)(\'s)(\W*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . 
proc_token($t3); + } + + if($Split_AposM && ($line =~ /^(.*[a-z]+)(\'m)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + + if($Split_AposRE && ($line =~ /^(.*[a-z]+)(\'re)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + if($Split_AposVE && ($line =~ /^(.*[a-z]+)(\'ve)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + if($Split_AposLL && ($line =~ /^(.*[a-z]+)(\'ll)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + if($Split_AposD && ($line =~ /^(.*[a-z]+)(\'d)(\.*)$/i)){ + my $t1 = $1; + my $t2 = $2; + my $t3 = $3; + return proc_token($t1) . " " . $t2 . " " . proc_token($t3); + } + + if($suc){ + return proc_line($line); + } + } + + + ## deal with "~" + if(($line =~ /\~/) && ($Split_On_Tilde > 0)){ + my $suc = 0; + if($Split_On_Tilde >= 2){ + ## keep 12~13 as one token + $suc += ($line =~ s/(\D)(\~+)/$1 $2 /g); + $suc += ($line =~ s/(\~+)(\D)/ $1 $2/g); + $suc += ($line =~ s/^(\~+)(\d)/$1 $2/g); + $suc += ($line =~ s/(\d)(\~+)$/$1 $2/g); + }else{ + $suc += ($line =~ s/(\~+)/ $1 /g); + } + if($suc){ + return proc_line($line); + } + } + + ## deal with "^" + if(($line =~ /\^/) && ($Split_On_Circ > 0)){ + my $suc = 0; + if($Split_On_Circ >= 2){ + ## keep 12~13 as one token + $suc += ($line =~ s/(\D)(\^+)/$1 $2 /g); + $suc += ($line =~ s/(\^+)(\D)/ $1 $2/g); + }else{ + $suc = ($line =~ s/(\^+)/ $1 /g); + } + if($suc){ + return proc_line($line); + } + } + + ## deal with ":" + if(($line =~ /\:/) && ($Split_On_Semicolon > 0)){ + ## 2: => 2 : + my $suc = ($line =~ s/^(\:+)/$1 /); + $suc += ($line =~ s/(\:+)$/ $1/); + if($Split_On_Semicolon >= 2){ + ## keep 5:4 as one token + $suc += ($line =~ s/(\D)(\:+)/$1 $2 /g); + $suc += ($line =~ s/(\:+)(\D)/ $1 $2/g); + }else{ + $suc += ($line =~ s/(\:+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + ### deal with hyphen: 1992-1993. 
21st-24th + if(($line =~ /\-/) && ($Split_On_Dash > 0)){ + my $suc = ($line =~ s/(\-{2,})/ $1 /g); + if($Split_On_Dash >= 2){ + ## keep 1992-1993 as one token + $suc += ($line =~ s/(\D)(\-+)/$1 $2 /g); + $suc += ($line =~ s/(\-+)(\D)/ $1 $2/g); + }else{ + ### always split on "-" + $suc += ($line =~ s/([\-]+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + ## deal with "_" + if(($line =~ /\_/) && ($Split_On_Underscore > 0)){ + ### always split on "-" + if($line =~ s/([\_]+)/ $1 /g){ + return proc_line($line); + } + } + + + + ## deal with "%" + if(($line =~ /\%/) && ($Split_On_PercentSign > 0)){ + my $suc = 0; + if($Split_On_PercentSign >= 2){ + $suc += ($line =~ s/(\D)(\%+)/$1 $2/g); + }else{ + $suc += ($line =~ s/(\%+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + + ### deal with "/": 4/5 + if(($line =~ /\//) && ($Split_On_Slash > 0)){ + my $suc = 0; + if($Split_On_Slash >= 2){ + $suc += ($line =~ s/(\D)(\/+)/$1 $2 /g); + $suc += ($line =~ s/(\/+)(\D)/ $1 $2/g); + }else{ + $suc += ($line =~ s/(\/+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + + ### deal with comma: 123,456 + if($line =~ /\,/){ + my $suc = 0; + $suc += ($line =~ s/([^\d]),/$1 , /g); ## xxx, 1923 => xxx , 1923 + $suc += ($line =~ s/\,\s*([^\d])/ , $1/g); ## 1923, xxx => 1923 , xxx + + $suc += ($line =~ s/,([\d]{1,2}[^\d])/ , $1/g); ## 1,23 => 1 , 23 + $suc += ($line =~ s/,([\d]{4,}[^\d])/ , $1/g); ## 1,2345 => 1 , 2345 + + $suc += ($line =~ s/,([\d]{1,2})$/ , $1/g); ## 1,23 => 1 , 23 + $suc += ($line =~ s/,([\d]{4,})$/ , $1/g); ## 1,2345 => 1 , 2345 + + if($suc){ + return proc_line($line); + } + } + + + ## deal with "&" + if(($line =~ /\&/) && ($Split_On_AndSign > 0)){ + my $suc = 0; + if($Split_On_AndSign >= 2){ + $suc += ($line =~ s/([a-z]{3,})(\&+)/$1 $2 /gi); + $suc += ($line =~ s/(\&+)([a-z]{3,})/ $1 $2/gi); + }else{ + $suc += ($line =~ s/(\&+)/ $1 /g); + } + + if($suc){ + return proc_line($line); + } + } + + ## deal with period + if($line =~ /\./){ + if($line =~ /^(([\+|\-])*(\d+\,)*\d*\.\d+\%*)$/){ + ### numbers: 3.5 + return $line; + } + + if($line =~ /^(([a-z]\.)+)(\.*)$/i){ + ## I.B.M. + my $t1 = $1; + my $t3 = $3; + return $t1 . " ". proc_token($t3); + } + + ## Feb.. => Feb. . + if($line =~ /^(.*[^\.])(\.)(\.*)$/){ + my $p1 = $1; + my $p2 = $2; + my $p3 = $3; + + my $p1_lc = $p1; + $p1_lc =~ tr/A-Z/a-z/; + + if(defined($dict_hash{$p1_lc . $p2})){ + ## Dec.. => Dec. . + return $p1 . $p2 . " " . proc_token($p3); + }elsif(defined($dict_hash{$p1_lc})){ + return $p1 . " " . proc_token($p2 . $p3); + }else{ + ## this. => this . + return proc_token($p1) . " " . proc_token($p2 . $p3); + } + } + + if($line =~ s/(\.+)(.+)/$1 $2/g){ + return proc_line($line); + } + } + + + ## no pattern applies + return $line; +} + + + + + + + + diff --git a/corpus/support/utf8-normalize.sh b/corpus/support/utf8-normalize.sh new file mode 100755 index 00000000..2f347854 --- /dev/null +++ b/corpus/support/utf8-normalize.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# this is the location on malbec, if you want to run on another machine +# ICU may be installed in /usr or /usr/local +ICU_DIR=/usr0/tools/icu +UCONV_BIN=$ICU_DIR/bin/uconv +UCONV_LIB=$ICU_DIR/lib + +if [ -e $UCONV_BIN ] && [ -d $UCONV_LIB ] +then + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$UCONV_LIB + if [ ! -x $UCONV_BIN ] + then + echo "$0: Cannot execute $UCONV_BIN! Please fix." 
1>&2 + exit + fi + CMD="$UCONV_BIN -f utf8 -t utf8 -x Any-NFKC --callback skip" +else + if which uconv > /dev/null + then + CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip" + else + echo "$0: Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Quality may suffer." 1>&2 + CMD="iconv -f utf8 -t utf8 -c" + fi +fi + +perl -e 'while(<>){s/\r\n*/\n/g; print;}' | $CMD | /usr/bin/perl -w -e ' + while (<>) { + chomp; + s/[\x00-\x1F]+/ /g; + s/ +/ /g; + s/^ //; + s/ $//; + print "$_\n"; + }' diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh new file mode 100755 index 00000000..1a24193d --- /dev/null +++ b/corpus/tokenize-anything.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +ROOTDIR=`dirname $0` +SUPPORT=$ROOTDIR/support + +$SUPPORT/utf8-normalize.sh | + $SUPPORT/quote-norm.pl | + $SUPPORT/tokenizer.pl | + sed -e 's/ al - / al-/g' | + $SUPPORT/fix-contract.pl | + sed -e 's/^ //' | sed -e 's/ $//' | + perl -e 'while(<>){s/(\d+)(\.+)$/$1 ./;print;}' + diff --git a/dpmert/decode-and-evaluate.pl b/dpmert/decode-and-evaluate.pl new file mode 100755 index 00000000..fe765d00 --- /dev/null +++ b/dpmert/decode-and-evaluate.pl @@ -0,0 +1,246 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + +# Default settings +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; +my $sentserver = "$bin_dir/sentserver"; +my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $help = 0; +my $config; +my $test_set; +my $weights; +my $use_make = 1; +my $useqsub; +my $cpbin=1; +# Process command-line options +if (GetOptions( + "jobs=i" => \$jobs, + "help" => \$help, + "qsub" => \$useqsub, + "input=s" => \$test_set, + "config=s" => \$config, + "weights=s" => \$weights, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); + +if (!defined $test_set) { push @missing_args, "--input"; } +if (!defined $config) { push @missing_args, "--config"; } +if (!defined $weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); + +my @tf = localtime(time); +my $tname = basename($test_set); +$tname =~ s/\.(sgm|sgml|xml)$//i; +my $dir = "eval.$tname." . 
sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]);
+
+my $time = unchecked_output("date");
+
+check_call("mkdir -p $dir");
+
+split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs");
+my $refs = "-r $dir/test.refs";
+my $newsrc = "$dir/test.input";
+enseg("$dir/test.input.raw", $newsrc);
+my $src_file = $newsrc;
+open F, "<$src_file" or die "Can't read $src_file: $!"; close F;
+
+my $test_trans="$dir/test.trans";
+my $logdir="$dir/logs";
+my $decoderLog="$logdir/decoder.sentserver.log";
+check_call("mkdir -p $logdir");
+
+#decode
+print STDERR "RUNNING DECODER AT ";
+print STDERR unchecked_output("date");
+my $decoder_cmd = "$decoder -c $config --weights $weights";
+my $pcmd;
+if ($use_make) {
+  $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --";
+} else {
+  $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --";
+}
+my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans";
+check_bash_call($cmd);
+print STDERR "DECODER COMPLETED AT ";
+print STDERR unchecked_output("date");
+print STDERR "\nOUTPUT: $test_trans\n\n";
+my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu");
+chomp $bleu;
+print STDERR "BLEU: $bleu\n";
+my $ter = check_output("cat $test_trans | $SCORER $refs -m ter");
+chomp $ter;
+print STDERR " TER: $ter\n";
+open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!";
+print TR <<EOT;
+BLEU: $bleu
+TER: $ter
+EOT
+close TR;
+exit 0;
+
+sub enseg {
+  my $src = shift @_;
+  my $newsrc = shift @_;
+  open(SRC, "$src");
+  open(NEWSRC, ">$newsrc");
+  my $i=0;
+  while (my $line=<SRC>){
+    chomp $line;
+    if ($line =~ /^\s*<seg/i) {
+      if($line =~ /id="[0-9]+"/) {
+        print NEWSRC "$line\n";
+      } else {
+        die "When using <seg> tags, you must include a zero-based id attribute";
+      }
+    } else {
+      print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+    }
+    $i++;
+  }
+  close SRC;
+  close NEWSRC;
+}
+
+sub print_help {
+  my $executable = basename($0); chomp $executable;
+  print << "Help";
+
+Usage: $executable [options]
+
+  $executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub]
+
+Options:
+
+  --help
+    Print this message and exit.
+
+  --config
+    A path to the cdec.ini file.
+
+  --weights
+    A file specifying feature weights.
+
+  --dir
+    Directory for intermediate and output files.
+
+Job control options:
+
+  --jobs
+    Number of decoder processes to run in parallel. [default=$default_jobs]
+
+  --qsub
+    Use qsub to run jobs in parallel (qsub must be configured in
+    environment/LocalEnvironment.pm)
+
+  --pmem
+    Amount of physical memory requested for parallel decoding jobs
+    (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+  my ($str) = @_;
+  my @ps = split /;/, $str;
+  my %dict = ();
+  for my $p (@ps) {
+    my ($k, $v) = split /=/, $p;
+    $dict{$k} = $v;
+  }
+  return %dict;
+}
+
+
+
+sub cmdline {
+  return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
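+# An illustrative sketch (hypothetical input, not from the original
+# source): escape_shell('a b$c') would return "a b\$c" -- arguments that
+# contain shell metacharacters are wrapped in double quotes with
+# \ " $ ` ! backslash-escaped, while plain arguments are returned
+# unchanged.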
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while() { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . "\n"; + } + close R; + close S; + close F; +} + diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl index 2e6a9728..c4f98870 100755 --- a/dpmert/dpmert.pl +++ b/dpmert/dpmert.pl @@ -7,15 +7,14 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; -my $QSUB_CMD = qsub_args(mert_memory()); - +use File::Basename qw(basename); require "libcall.pl"; +my $QSUB_CMD = qsub_args(mert_memory()); + # Default settings -my $srcFile; -my $refFiles; +my $srcFile; # deprecated +my $refFiles; # deprecated my $default_jobs = env_default_jobs(); my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; @@ -37,7 +36,7 @@ die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; my $decoder = $cdec; -my $lines_per_mapper = 400; +my $lines_per_mapper = 200; my $rand_directions = 15; my $iteration = 1; my $best_weights; @@ -47,53 +46,35 @@ my $jobs = $default_jobs; # number of decode nodes my $pmem = "9g"; my $disable_clean = 0; my %seen_weights; -my $normalize; my $help = 0; my $epsilon = 0.0001; -my $interval = 5; -my $dryrun = 0; my $last_score = -10000000; my $metric = "ibm_bleu"; my $dir; my $iniFile; my $weights; my $initialWeights; -my $decoderOpt; -my $noprimary; -my $maxsim=0; -my $oraclen=0; -my $oracleb=20; my $bleu_weight=1; my $use_make = 1; # use make to parallelize line search my $useqsub; my $pass_suffix = ''; -my $devset = ''; -my $cpbin=1; +my $devset; # Process command-line options -Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( - "decoder=s" => \$decoderOpt, + "config=s" => \$iniFile, + "weights=s" => \$initialWeights, + "devset=s" => \$devset, "jobs=i" => \$jobs, - "dont-clean" => \$disable_clean, "pass-suffix=s" => \$pass_suffix, - "dry-run" => \$dryrun, - "epsilon=s" => \$epsilon, "help" => \$help, - "interval" => \$interval, "qsub" => \$useqsub, - "max-iterations=i" => \$max_iterations, - "normalize=s" => \$normalize, + "iterations=i" => \$max_iterations, "pmem=s" => \$pmem, - "cpbin!" 
=> \$cpbin, "random-directions=i" => \$rand_directions, - "devset=s" => \$devset, - "ref-files=s" => \$refFiles, "metric=s" => \$metric, "source-file=s" => \$srcFile, - "weights=s" => \$initialWeights, - "workdir=s" => \$dir, - "opt-iterations=i" => \$optimization_iters, -) == 0 || @ARGV!=1 || $help) { + "output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { print_help(); exit; } @@ -114,22 +95,17 @@ if (defined $srcFile || defined $refFiles) { EOT } +if (!defined $iniFile) { push @missing_args, "--config"; } if (!defined $devset) { push @missing_args, "--devset"; } if (!defined $initialWeights) { push @missing_args, "--weights"; } -die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); +die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); if ($metric =~ /^(combi|ter)$/i) { $lines_per_mapper = 40; } elsif ($metric =~ /^meteor$/i) { - $lines_per_mapper = 2000; # start up time is really high + $lines_per_mapper = 2000; # start up time is really high for METEOR } -($iniFile) = @ARGV; - - -sub write_config; -sub enseg; -sub print_help; my $nodelist; my $host =check_output("hostname"); chomp $host; @@ -153,8 +129,6 @@ unless ($dir =~ /^\//){ # convert relative path to absolute path $dir = "$basedir/$dir"; } -if ($decoderOpt){ $decoder = $decoderOpt; } - # Initializations and helper functions srand; @@ -169,73 +143,47 @@ sub cleanup { exit 1; }; # Always call cleanup, no matter how we exit -*CORE::GLOBAL::exit = - sub{ cleanup(); }; +*CORE::GLOBAL::exit = sub{ cleanup(); }; $SIG{INT} = "cleanup"; $SIG{TERM} = "cleanup"; $SIG{HUP} = "cleanup"; -my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $decoderBase = basename($decoder); chomp $decoderBase; my $newIniFile = "$dir/$decoderBase.ini"; my $inputFileName = "$dir/input"; my $user = $ENV{"USER"}; - # process ini file -e $iniFile || die "Error: could not open $iniFile for reading\n"; -open(INI, $iniFile); -use File::Basename qw(basename); -#pass bindir, refs to vars holding bin -sub modbin { - local $_; - my $bindir=shift; - check_call("mkdir -p $bindir"); - -d $bindir || die "couldn't make bindir $bindir"; - for (@_) { - my $src=$$_; - $$_="$bindir/".basename($src); - check_call("cp -p $src $$_"); - } -} sub dirsize { opendir ISEMPTY,$_[0]; return scalar(readdir(ISEMPTY))-1; } -if ($dryrun){ - write_config(*STDERR); - exit 0; +if (-e $dir) { + # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs + die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n"; } else { - if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs - die "ERROR: working dir $dir already exists\n\n"; - } else { - -e $dir || mkdir $dir; - mkdir "$dir/hgs"; - modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; - mkdir "$dir/scripts"; - my $cmdfile="$dir/rerun-dpmert.sh"; - open CMD,'>',$cmdfile; - print CMD "cd ",&getcwd,"\n"; -# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. 
- my $cline=&cmdline."\n"; - print CMD $cline; - close CMD; - print STDERR $cline; - chmod(0755,$cmdfile); - unless (-e $initialWeights) { - print STDERR "Please specify an initial weights file with --initial-weights\n"; - print_help(); - exit; - } - check_call("cp $initialWeights $dir/weights.0"); - die "Can't find weights.0" unless (-e "$dir/weights.0"); - } - write_config(*STDERR); + mkdir "$dir" or die "Can't mkdir $dir: $!"; + mkdir "$dir/hgs" or die; + mkdir "$dir/scripts" or die; + print STDERR < with the decoder)\n\n"; - -print STDOUT "$lastWeightsFile\n"; - +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; exit 0; -sub normalize_weights { - my ($rfn, $rpts, $feat) = @_; - my @feat_names = @$rfn; - my @pts = @$rpts; - my $z = 1.0; - for (my $i=0; $i < scalar @feat_names; $i++) { - if ($feat_names[$i] eq $feat) { - $z = $pts[$i]; - last; - } - } - for (my $i=0; $i < scalar @feat_names; $i++) { - $pts[$i] /= $z; - } - print STDERR " NORM WEIGHTS: @pts\n"; - return @pts; -} sub get_lines { my $fn = shift @_; @@ -523,27 +453,6 @@ sub read_weights_file { return join ' ', @r; } -# subs -sub write_config { - my $fh = shift; - my $cleanup = "yes"; - if ($disable_clean) {$cleanup = "no";} - - print $fh "\n"; - print $fh "DECODER: $decoder\n"; - print $fh "INI FILE: $iniFile\n"; - print $fh "WORKING DIR: $dir\n"; - print $fh "DEVSET: $devset\n"; - print $fh "EVAL METRIC: $metric\n"; - print $fh "START ITERATION: $iteration\n"; - print $fh "MAX ITERATIONS: $max_iterations\n"; - print $fh "PARALLEL JOBS: $jobs\n"; - print $fh "HEAD NODE: $host\n"; - print $fh "PMEM (DECODING): $pmem\n"; - print $fh "CLEANUP: $cleanup\n"; - print $fh "INITIAL WEIGHTS: $initialWeights\n"; -} - sub update_weights_file { my ($neww, $rfn, $rpts) = @_; my @feats = @$rfn; @@ -585,22 +494,34 @@ sub enseg { sub print_help { - my $executable = check_output("basename $0"); chomp $executable; - print << "Help"; + my $executable = basename($0); chomp $executable; + print << "Help"; Usage: $executable [options] - $executable [options] - Runs a complete MERT optimization using the decoder configuration - in . Required options are --weights, --source-file, and - --ref-files. + $executable [options] + Runs a complete MERT optimization. Required options are --weights, + --devset, and --config. Options: - --help - Print this message and exit. + --config [-c ] + The decoder configuration file. + + --devset [-d ] + The source *and* references for the development set. + + --weights [-w ] + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + **All and only the weights listed in will be optimized!** + + --metric + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - --max-iterations + --iterations Maximum number of iterations to run. If not specified, defaults to 10. @@ -608,39 +529,15 @@ Options: If the decoder is doing multi-pass decoding, the pass suffix "2", "3", etc., is used to control what iteration of weights is set. - --ref-files - Dev set ref files. This option takes only a single string argument. - To use multiple files (including file globbing), this argument should - be quoted. - - --metric - Metric to optimize. - Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - - --normalize - After each iteration, rescale all feature weights such that feature- - name has a weight of 1.0. 
-
   --rand-directions
     MERT will attempt to optimize along all of the principal directions,
     set this parameter to explore other directions. Defaults to 5.
 
-  --source-file
-    Dev set source file.
+  --output-dir
+    Directory for intermediate and output files.
 
-  --weights
-    A file specifying initial feature weights. The format is
-    FeatureName_1 value1
-    FeatureName_2 value2
-    **All and only the weights listed in will be optimized!**
-
-  --workdir
-    Directory for intermediate and output files. If not specified, the
-    name is derived from the ini filename. Assuming that the ini
-    filename begins with the decoder name and ends with ini, the default
-    name of the working directory is inferred from the middle part of
-    the filename. E.g. an ini file named decoder.foo.ini would have
-    a default working directory name foo.
+  --help
+    Print this message and exit.
 
 Job control options:
 
diff --git a/dpmert/parallelize.pl b/dpmert/parallelize.pl
index 7d0365cc..d2ebaeea 100755
--- a/dpmert/parallelize.pl
+++ b/dpmert/parallelize.pl
@@ -40,7 +40,7 @@ my $stay_alive; # dont let server die when having zero clients
 my $joblist = "";
 my $errordir="";
 my $multiline;
-my @files_to_stage;
+my $workdir = '.';
 my $numnodes = 8;
 my $user = $ENV{"USER"};
 my $pmem = "9g";
@@ -128,7 +128,7 @@ unless (GetOptions(
 	"recycle-clients" => \$recycle_clients,
 	"error-dir=s" => \$errordir,
 	"multi-line" => \$multiline,
-	"file=s" => \@files_to_stage,
+	"workdir=s" => \$workdir,
 	"use-fork" => \$use_fork,
 	"verbose" => \$verbose,
 	"jobs=i" => \$numnodes,
@@ -363,7 +363,7 @@ sub launch_job_fork {
 }
 
 sub get_temp_script {
-  my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh');
+  my ($fh, $filename) = tempfile( "$workdir/workXXXX", SUFFIX => '.sh');
   return ($fh, $filename);
 }
-- cgit v1.2.3

From 0fcf21f26c77ccc22f14e66a15ef3c51080d12ef Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Fri, 16 Nov 2012 00:22:05 -0500
Subject: readme

---
 corpus/README.md | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 corpus/README.md

(limited to 'corpus')

diff --git a/corpus/README.md b/corpus/README.md
new file mode 100644
index 00000000..935d9a65
--- /dev/null
+++ b/corpus/README.md
@@ -0,0 +1,32 @@
+This directory contains a number of useful scripts that are helpful for preprocessing parallel and monolingual corpora. They are provided for convenience and may be very useful, but their functionality will often be supplanted by other, more specialized tools.
+
+Many of these scripts assume that the input is [UTF-8 encoded](http://en.wikipedia.org/wiki/UTF-8).
+
+## Paste parallel files together
+
+This script reads one line at a time from a set of files and concatenates them with a triple pipe separator (`|||`) in the output. This is useful for generating parallel corpora files for training or evaluation:
+    ./paste-files.pl file.a file.b file.c [...]
+
+## Punctuation Normalization and Tokenization
+
+This script tokenizes text in any language (well, it does a good job in most languages, and in some it will completely go crazy):
+    ./tokenize-anything.sh < input.txt > output.txt
+
+It also normalizes a lot of unicode symbols and even corrects some common encoding errors. It can be applied to monolingual and parallel corpora directly.
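+
+For example (an illustrative sketch, not from the original README), a line like `He said: "It cost $5!"` should come out roughly as `He said : " It cost $ 5 ! "`; the exact behavior depends on the rules in `support/`.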
+
+## Text lowercasing
+
+This script also does what it says, provided your input is in UTF-8:
+    ./lowercase.pl < input.txt > output.txt
+
+## Length ratio filtering (for parallel corpora)
+
+This script computes statistics about sentence length ratios in a parallel corpus and removes sentences that are statistical outliers. This tends to remove extremely poorly aligned sentence pairs or sentence pairs that would otherwise be difficult to align:
+    ./filter-length.pl input.src-trg > output.src-trg
+
+## Add infrequent self-translations to a parallel corpus
+
+This script identifies rare words (those that occur less than 2 times in the corpus) which have the same orthographic form in both the source and target language. Several copies of these words are then inserted at the end of the corpus that is written, which improves alignment quality.
+    ./add-self-translations.pl input.src-trg > output.src-trg
+
+
-- cgit v1.2.3

From db9897bcafe5f732cee5c1c0fe5c9d9eaecdef0e Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Fri, 16 Nov 2012 00:24:52 -0500
Subject: fix

---
 corpus/README.md | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'corpus')

diff --git a/corpus/README.md b/corpus/README.md
index 935d9a65..adc35b84 100644
--- a/corpus/README.md
+++ b/corpus/README.md
@@ -5,11 +5,13 @@ Many of these scripts assume that the input is [UTF-8 encoded](http://en.wikiped
 ## Paste parallel files together
 
 This script reads one line at a time from a set of files and concatenates them with a triple pipe separator (`|||`) in the output. This is useful for generating parallel corpora files for training or evaluation:
+
     ./paste-files.pl file.a file.b file.c [...]
 
 ## Punctuation Normalization and Tokenization
 
 This script tokenizes text in any language (well, it does a good job in most languages, and in some it will completely go crazy):
+
     ./tokenize-anything.sh < input.txt > output.txt
 
 It also normalizes a lot of unicode symbols and even corrects some common encoding errors. It can be applied to monolingual and parallel corpora directly.
@@ -17,16 +19,19 @@ It also normalizes a lot of unicode symbols and even corrects some common encodi
 ## Text lowercasing
 
 This script also does what it says, provided your input is in UTF-8:
+
     ./lowercase.pl < input.txt > output.txt
 
 ## Length ratio filtering (for parallel corpora)
 
 This script computes statistics about sentence length ratios in a parallel corpus and removes sentences that are statistical outliers. This tends to remove extremely poorly aligned sentence pairs or sentence pairs that would otherwise be difficult to align:
+
     ./filter-length.pl input.src-trg > output.src-trg
 
 ## Add infrequent self-translations to a parallel corpus
 
 This script identifies rare words (those that occur less than 2 times in the corpus) which have the same orthographic form in both the source and target language. Several copies of these words are then inserted at the end of the corpus that is written, which improves alignment quality.
+ ./add-self-translations.pl input.src-trg > output.src-trg -- cgit v1.2.3 From c81b0dc240f2233c3e5ecccd8982218115476f9a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 4 Dec 2012 22:01:32 -0500 Subject: more flexible corpus cutting --- corpus/cut-corpus.pl | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'corpus') diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl index fc9cce3b..7daac0e2 100755 --- a/corpus/cut-corpus.pl +++ b/corpus/cut-corpus.pl @@ -3,14 +3,33 @@ use strict; die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0; my $x = shift @ARGV; -die "N must be numeric" unless $x =~ /^\d+$/; -$x--; +my @ind = split /,/, $x; +my @o = (); +for my $ff (@ind) { + if ($ff =~ /^\d+$/) { + push @o, $ff - 1; + } elsif ($ff =~ /^(\d+)-(\d+)$/) { + my $a = $1; + my $b = $2; + die "$a-$b is a bad range in input: $x\n" unless $b > $a; + for (my $i=$a; $i <= $b; $i++) { + push @o, $i - 1; + } + } else { + die "Bad input: $x\n"; + } +} while(<>) { chomp; my @fields = split / \|\|\| /; - my $y = $fields[$x]; - if (!defined $y) { $y= ''; } - print "$y\n"; + my @sf; + for my $i (@o) { + my $y = $fields[$i]; + if (!defined $y) { $y= ''; } + push @sf, $y; + } + print join(' ||| ', @sf) . "\n"; } + -- cgit v1.2.3 From a86a37cbe2fb6ffdcf4374f180010a118fed1063 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 5 Dec 2012 20:27:30 -0500 Subject: remove logging, you should be using pv --- corpus/support/quote-norm.pl | 7 ++++++- corpus/support/tokenizer.pl | 9 --------- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'corpus') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 0c5b9c26..72b0064d 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -18,13 +18,18 @@ while() { s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; s/&\s*#45\s*;/--/g; + s/�c/--/g; s/ ,,/ "/g; s/``/"/g; s/''/"/g; + s/[「」]/"/g; s/〃/"/g; s/¨/"/g; s/¡/ ¡ /g; s/¿/ ¿ /g; + # â + s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; + s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; s/ˇ/'/g; s/´/'/g; s/`/'/g; @@ -39,7 +44,7 @@ while() { s/»/"/g; tr/!-~/!-~/; s/、/,/g; - s/。/./g; + # s/。/./g; s/…/.../g; s/―/--/g; s/–/--/g; diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 23be00a5..e9c3a37d 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -107,24 +107,15 @@ my $orig_token_total = 0; my $deep_proc_token_total = 0; my $new_token_total = 0; -my $line_total = 0; -my $content_line_total = 0; - while(){ chomp(); - $line_total ++; - if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; } - elsif ($line_total % 2500 == 0) { print STDERR "."; } - if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^ Date: Wed, 5 Dec 2012 20:57:20 -0500 Subject: slight tokenization bug fix --- corpus/support/token_patterns | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'corpus') diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index c0e6fe1a..8e69432b 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,3 +1,3 @@ /^(al|el|ul|e)\-[a-z]+$/ -/^(\d+)\.$/ +/^(\d|\d\d|\d\d\d)\.$/ -- cgit v1.2.3 From bae5fe99037ae7e101953ad0df118127191c711c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 15 Jan 2013 01:20:00 -0500 Subject: corpus files --- Makefile.am | 1 + configure.ac | 2 +- corpus/add-self-translations.pl | 2 +- corpus/filter-length.pl 
| 6 ++++-- corpus/paste-files.pl | 12 +++++++++++- 5 files changed, 18 insertions(+), 5 deletions(-) (limited to 'corpus') diff --git a/Makefile.am b/Makefile.am index dbf604a1..1d898156 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,6 +15,7 @@ SUBDIRS = \ #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava +EXTRA_DIST = python/pkg python/src python/tests python/examples AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 AM_CPPFLAGS = -D_GLIBCXX_PARALLEL diff --git a/configure.ac b/configure.ac index dcd0a0d8..69971dc3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cdec],[1.0]) +AC_INIT([cdec],[2013-01-15]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl index 153bc454..d707ce29 100755 --- a/corpus/add-self-translations.pl +++ b/corpus/add-self-translations.pl @@ -6,7 +6,7 @@ use strict; my %df; my %def; while(<>) { - print; +# print; chomp; my ($sf, $se) = split / \|\|\| /; die "Format error: $_\n" unless defined $sf && defined $se; diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 70032ca7..3cfa40cc 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,8 +3,8 @@ use strict; use utf8; ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 99; # discard a sentence if it is longer than this -my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include +my $MAX_LENGTH = 150; # discard a sentence if it is longer than this +my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? ############################################################################## @@ -128,6 +128,8 @@ while() { next; } print; + } else { + print; } $to++; } diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl index 24c70599..0b788386 100755 --- a/corpus/paste-files.pl +++ b/corpus/paste-files.pl @@ -17,6 +17,7 @@ for my $file (@ARGV) { binmode(STDOUT,":utf8"); binmode(STDERR,":utf8"); +my $bad = 0; my $lc = 0; my $done = 0; my $fl = 0; @@ -34,7 +35,15 @@ while(1) { last; } chomp $r; - die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + if ($r =~ /\|\|\|/) { + $r = ''; + $bad++; + } + warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + $r =~ s/\|\|\|/ /g; + $r =~ s/ +//g; + $r =~ s/^ //; + $r =~ s/ $//; $anum++; push @line, $r; } @@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) { my $r = <$fh>; die "Mismatched number of lines.\n" if defined $r; } +print STDERR "Bad lines containing ||| were $bad\n"; -- cgit v1.2.3 From 8b9aae7cff1efd1be195cdd000b21546bd5fca04 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 19 Jan 2013 19:09:48 -0500 Subject: updated version of boost.m4 and automatically build kenneth's LM builder --- Makefile.am | 2 + configure.ac | 7 +- corpus/cut-corpus.pl | 2 +- klm/lm/builder/Makefile.am | 28 +++ klm/util/Makefile.am | 2 +- klm/util/double-conversion/Makefile.am | 2 +- klm/util/stream/Makefile.am | 20 ++ klm/util/stream/sort.hh | 3 +- m4/boost.m4 | 322 +++++++++++++++++++++++++-------- 9 files changed, 311 insertions(+), 77 deletions(-) create mode 100644 klm/lm/builder/Makefile.am create mode 100644 klm/util/stream/Makefile.am (limited to 'corpus') diff --git a/Makefile.am b/Makefile.am index c2444928..17190d27 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,8 +5,10 @@ SUBDIRS = \ utils 
\ mteval \ klm/util/double-conversion \ + klm/util/stream \ klm/util \ klm/lm \ + klm/lm/builder \ klm/search \ decoder \ training \ diff --git a/configure.ac b/configure.ac index d6030752..a1e5ad84 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cdec],[2013-01-15]) +AC_INIT([cdec],[2013-01-19]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) @@ -15,7 +15,10 @@ BOOST_REQUIRE([1.44]) BOOST_PROGRAM_OPTIONS BOOST_SYSTEM BOOST_SERIALIZATION +BOOST_CHRONO +BOOST_TIMER BOOST_TEST +BOOST_THREADS AM_PATH_PYTHON AC_CHECK_HEADER(dlfcn.h,AC_DEFINE(HAVE_DLFCN_H)) AC_CHECK_LIB(dl, dlopen) @@ -111,8 +114,10 @@ AC_CONFIG_FILES([word-aligner/Makefile]) # KenLM stuff AC_CONFIG_FILES([klm/util/double-conversion/Makefile]) +AC_CONFIG_FILES([klm/util/stream/Makefile]) AC_CONFIG_FILES([klm/util/Makefile]) AC_CONFIG_FILES([klm/lm/Makefile]) +AC_CONFIG_FILES([klm/lm/builder/Makefile]) AC_CONFIG_FILES([klm/search/Makefile]) # training stuff diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl index 7daac0e2..0af3b23c 100755 --- a/corpus/cut-corpus.pl +++ b/corpus/cut-corpus.pl @@ -22,7 +22,7 @@ for my $ff (@ind) { while(<>) { chomp; - my @fields = split / \|\|\| /; + my @fields = split /\s*\|\|\|\s*/; my @sf; for my $i (@o) { my $y = $fields[$i]; diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am new file mode 100644 index 00000000..00444256 --- /dev/null +++ b/klm/lm/builder/Makefile.am @@ -0,0 +1,28 @@ +bin_PROGRAMS = builder + +builder_SOURCES = \ + main.cc \ + adjust_counts.cc \ + adjust_counts.hh \ + corpus_count.cc \ + corpus_count.hh \ + discount.hh \ + header_info.hh \ + initial_probabilities.cc \ + initial_probabilities.hh \ + interpolate.cc \ + interpolate.hh \ + joint_order.hh \ + multi_stream.hh \ + ngram.hh \ + ngram_stream.hh \ + pipeline.cc \ + pipeline.hh \ + print.cc \ + print.hh \ + sort.hh + +builder_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_TIMER_LIBS) $(BOOST_CHRONO_LIBS) $(BOOST_THREAD_LIBS) + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 294ebc0a..248cc844 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -54,4 +54,4 @@ libklm_util_a_SOURCES = \ string_piece.cc \ usage.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/double-conversion/Makefile.am b/klm/util/double-conversion/Makefile.am index eb6616f7..dfcfb009 100644 --- a/klm/util/double-conversion/Makefile.am +++ b/klm/util/double-conversion/Makefile.am @@ -20,4 +20,4 @@ libklm_util_double_a_SOURCES = \ fixed-dtoa.cc \ strtod.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/stream/Makefile.am b/klm/util/stream/Makefile.am new file mode 100644 index 00000000..f18cbedb --- /dev/null +++ b/klm/util/stream/Makefile.am @@ -0,0 +1,20 @@ +noinst_LIBRARIES = libklm_util_stream.a + +libklm_util_stream_a_SOURCES = \ + block.hh \ + chain.cc \ + chain.hh \ + config.hh \ + io.cc \ + io.hh \ + line_input.cc \ + line_input.hh \ + multi_progress.cc \ + multi_progress.hh \ + sort.hh \ + stream.hh \ + timer.hh + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + 
+#-I$(top_srcdir)/klm/util/double-conversion
diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh
index be6c11ea..df57fa41 100644
--- a/klm/util/stream/sort.hh
+++ b/klm/util/stream/sort.hh
@@ -259,7 +259,8 @@ template <class Compare> class MergingReader {
     while (in_offsets_->RemainingBlocks()) {
       // Use bigger buffers if there's less remaining.
-      uint64_t per_buffer = std::max(buffer_size_, total_memory_ / in_offsets_->RemainingBlocks());
+      uint64_t per_buffer = std::max(static_cast<uint64_t>(buffer_size_),
+                                     static_cast<uint64_t>(total_memory_ / in_offsets_->RemainingBlocks()));
       per_buffer -= per_buffer % entry_size;
       assert(per_buffer);
 
diff --git a/m4/boost.m4 b/m4/boost.m4
index 7e0ed075..027e039b 100644
--- a/m4/boost.m4
+++ b/m4/boost.m4
@@ -1,5 +1,5 @@
 # boost.m4: Locate Boost headers and libraries for autoconf-based projects.
-# Copyright (C) 2007, 2008, 2009 Benoit Sigoure
+# Copyright (C) 2007, 2008, 2009, 2010, 2011 Benoit Sigoure
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 m4_define([_BOOST_SERIAL], [m4_translit([
-# serial 12
+# serial 16
 ], [#
 ], [])])
 
@@ -45,15 +45,19 @@ m4_define([_BOOST_SERIAL], [m4_translit([
 # Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL.  If you don't, don't worry,
 # simply read the README, it will show you what to do step by step.
 
-m4_pattern_forbid([^_?BOOST_])
+m4_pattern_forbid([^_?(BOOST|Boost)_])
 
 
 # _BOOST_SED_CPP(SED-PROGRAM, PROGRAM,
 #                [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
 # --------------------------------------------------------
 # Same as AC_EGREP_CPP, but leave the result in conftest.i.
-# PATTERN is *not* overquoted, as in AC_EGREP_CPP.  It could be useful
-# to turn this into a macro which extracts the value of any macro.
+#
+# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP.  It is expanded
+# in double-quotes, so escape your double quotes.
+#
+# It could be useful to turn this into a macro which extracts the
+# value of any macro.
m4_define([_BOOST_SED_CPP], [AC_LANG_PREPROC_REQUIRE()dnl AC_REQUIRE([AC_PROG_SED])dnl @@ -98,6 +102,7 @@ set x $boost_version_req 0 0 0 IFS=$boost_save_IFS shift boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"` +boost_version_req_string=$[1].$[2].$[3] AC_ARG_WITH([boost], [AS_HELP_STRING([--with-boost=DIR], [prefix of Boost $1 @<:@guess@:>@])])dnl @@ -113,9 +118,9 @@ if test x"$BOOST_ROOT" != x; then fi fi AC_SUBST([DISTCHECK_CONFIGURE_FLAGS], - ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"]) + ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl boost_save_CPPFLAGS=$CPPFLAGS - AC_CACHE_CHECK([for Boost headers version >= $boost_version_req], + AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string], [boost_cv_inc_path], [boost_cv_inc_path=no AC_LANG_PUSH([C++])dnl @@ -183,24 +188,25 @@ AC_LANG_POP([C++])dnl ]) case $boost_cv_inc_path in #( no) - boost_errmsg="cannot find Boost headers version >= $boost_version_req" + boost_errmsg="cannot find Boost headers version >= $boost_version_req_string" m4_if([$2], [], [AC_MSG_ERROR([$boost_errmsg])], [AC_MSG_NOTICE([$boost_errmsg])]) $2 ;;#( yes) BOOST_CPPFLAGS= - AC_DEFINE([HAVE_BOOST], [1], - [Defined if the requested minimum BOOST version is satisfied]) ;;#( *) - AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"]) + AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl ;; esac + if test x"$boost_cv_inc_path" != xno; then + AC_DEFINE([HAVE_BOOST], [1], + [Defined if the requested minimum BOOST version is satisfied]) AC_CACHE_CHECK([for Boost's header version], [boost_cv_lib_version], [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl - _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;g;}], + _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}], [#include boost-lib-version = BOOST_LIB_VERSION], [boost_cv_lib_version=`cat conftest.i`])]) @@ -211,6 +217,7 @@ boost-lib-version = BOOST_LIB_VERSION], AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version]) ;; esac +fi CPPFLAGS=$boost_save_CPPFLAGS ])# BOOST_REQUIRE @@ -220,7 +227,7 @@ CPPFLAGS=$boost_save_CPPFLAGS # on the command line, static versions of the libraries will be looked up. AC_DEFUN([BOOST_STATIC], [AC_ARG_ENABLE([static-boost], - [AC_HELP_STRING([--enable-static-boost], + [AS_HELP_STRING([--enable-static-boost], [Prefer the static boost libraries over the shared ones [no]])], [enable_static_boost=yes], [enable_static_boost=no])])# BOOST_STATIC @@ -290,6 +297,7 @@ dnl The else branch is huge and wasn't intended on purpose. AC_LANG_PUSH([C++])dnl AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl +AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl BOOST_FIND_HEADER([$3]) boost_save_CPPFLAGS=$CPPFLAGS @@ -371,8 +379,8 @@ for boost_rtopt_ in $boost_rtopt '' -d; do boost_tmp_lib=$with_boost test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include} for boost_ldpath in "$boost_tmp_lib/lib" '' \ - /opt/local/lib /usr/local/lib /opt/lib /usr/lib \ - "$with_boost" C:/Boost/lib /lib /usr/lib64 /lib64 + /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \ + "$with_boost" C:/Boost/lib /lib* do test -e "$boost_ldpath" || continue boost_save_LDFLAGS=$LDFLAGS @@ -395,7 +403,16 @@ dnl generated only once above (before we start the for loops). 
LDFLAGS=$boost_save_LDFLAGS LIBS=$boost_save_LIBS if test x"$Boost_lib" = xyes; then - Boost_lib_LDFLAGS="-L$boost_ldpath -R$boost_ldpath" + # Because Boost is often installed in non-standard locations we want to + # hardcode the path to the library (with rpath). Here we assume that + # Libtool's macro was already invoked so we can steal its variable + # hardcode_libdir_flag_spec in order to get the right flags for ld. + boost_save_libdir=$libdir + libdir=$boost_ldpath + eval boost_rpath=\"$hardcode_libdir_flag_spec\" + libdir=$boost_save_libdir + Boost_lib_LDFLAGS="-L$boost_ldpath $boost_rpath" + Boost_lib_LDPATH="$boost_ldpath" break 6 else boost_failed_libs="$boost_failed_libs@$boost_lib@" @@ -410,14 +427,17 @@ rm -f conftest.$ac_objext ]) case $Boost_lib in #( no) _AC_MSG_LOG_CONFTEST - AC_MSG_ERROR([cannot not find the flags to link with Boost $1]) + AC_MSG_ERROR([cannot find the flags to link with Boost $1]) ;; esac -AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS]) -AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS]) +AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl +AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl +AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl +AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl CPPFLAGS=$boost_save_CPPFLAGS AS_VAR_POPDEF([Boost_lib])dnl AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl +AS_VAR_POPDEF([Boost_lib_LDPATH])dnl AS_VAR_POPDEF([Boost_lib_LIBS])dnl AC_LANG_POP([C++])dnl fi @@ -432,17 +452,31 @@ fi # The page http://beta.boost.org/doc/libs is useful: it gives the first release # version of each library (among other things). +# BOOST_DEFUN(LIBRARY, CODE) +# -------------------------- +# Define BOOST_ as a macro that runs CODE. +# +# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN. +m4_define([BOOST_DEFUN], +[m4_indir([AC_DEFUN], + m4_toupper([BOOST_$1]), +[m4_pushdef([BOOST_Library], [$1])dnl +$2 +m4_popdef([BOOST_Library])dnl +]) +]) + # BOOST_ARRAY() # ------------- # Look for Boost.Array -AC_DEFUN([BOOST_ARRAY], +BOOST_DEFUN([Array], [BOOST_FIND_HEADER([boost/array.hpp])]) # BOOST_ASIO() # ------------ # Look for Boost.Asio (new in Boost 1.35). -AC_DEFUN([BOOST_ASIO], +BOOST_DEFUN([Asio], [AC_REQUIRE([BOOST_SYSTEM])dnl BOOST_FIND_HEADER([boost/asio.hpp])]) @@ -450,14 +484,41 @@ BOOST_FIND_HEADER([boost/asio.hpp])]) # BOOST_BIND() # ------------ # Look for Boost.Bind -AC_DEFUN([BOOST_BIND], +BOOST_DEFUN([Bind], [BOOST_FIND_HEADER([boost/bind.hpp])]) +# BOOST_CHRONO() +# ------------------ +# Look for Boost.Chrono +BOOST_DEFUN([Chrono], +[# Do we have to check for Boost.System? This link-time dependency was +# added as of 1.35.0. If we have a version <1.35, we must not attempt to +# find Boost.System as it didn't exist by then. +if test $boost_major_version -ge 135; then + BOOST_SYSTEM([$1]) +fi # end of the Boost.System check. 
+boost_system_save_LIBS=$LIBS +boost_system_save_LDFLAGS=$LDFLAGS +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" +BOOST_FIND_LIB([chrono], [$1], + [boost/chrono.hpp], + [boost::chrono::system_clock::time_point d = boost::chrono::system_clock::now();]) +if test $enable_static_boost = yes && test $boost_major_version -ge 135; then + AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +fi +LIBS=$boost_system_save_LIBS +LDFLAGS=$boost_system_save_LDFLAGS + +])# BOOST_CHRONO + + # BOOST_CONVERSION() # ------------------ # Look for Boost.Conversion (cast / lexical_cast) -AC_DEFUN([BOOST_CONVERSION], +BOOST_DEFUN([Conversion], [BOOST_FIND_HEADER([boost/cast.hpp]) BOOST_FIND_HEADER([boost/lexical_cast.hpp]) ])# BOOST_CONVERSION @@ -467,12 +528,31 @@ BOOST_FIND_HEADER([boost/lexical_cast.hpp]) # ----------------------------------- # Look for Boost.Date_Time. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_DATE_TIME], +BOOST_DEFUN([Date_Time], [BOOST_FIND_LIB([date_time], [$1], [boost/date_time/posix_time/posix_time.hpp], [boost::posix_time::ptime t;]) ])# BOOST_DATE_TIME +# BOOST_TIMER([PREFERRED-RT-OPT]) +# ----------------------------------- +# Look for Boost.Timer. For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Timer], +[#check for Boost.System +BOOST_SYSTEM([$1]) +boost_system_save_LIBS=$LIBS +boost_system_save_LDFLAGS=$LDFLAGS +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" +BOOST_FIND_LIB([timer], [$1], + [boost/timer/timer.hpp], + [boost::timer::auto_cpu_timer t;]) +AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +LIBS=$boost_system_save_LIBS +LDFLAGS=$boost_system_save_LDFLAGS +])# BOOST_TIMER # BOOST_FILESYSTEM([PREFERRED-RT-OPT]) # ------------------------------------ @@ -480,7 +560,7 @@ AC_DEFUN([BOOST_DATE_TIME], # the documentation of BOOST_FIND_LIB above. # Do not check for boost/filesystem.hpp because this file was introduced in # 1.34. -AC_DEFUN([BOOST_FILESYSTEM], +BOOST_DEFUN([Filesystem], [# Do we have to check for Boost.System? This link-time dependency was # added as of 1.35.0. If we have a version <1.35, we must not attempt to # find Boost.System as it didn't exist by then. @@ -494,6 +574,9 @@ LIBS="$LIBS $BOOST_SYSTEM_LIBS" LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" BOOST_FIND_LIB([filesystem], [$1], [boost/filesystem/path.hpp], [boost::filesystem::path p;]) +if test $enable_static_boost = yes && test $boost_major_version -ge 135; then + AC_SUBST([BOOST_FILESYSTEM_LIBS], ["$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +fi LIBS=$boost_filesystem_save_LIBS LDFLAGS=$boost_filesystem_save_LDFLAGS ])# BOOST_FILESYSTEM @@ -502,7 +585,7 @@ LDFLAGS=$boost_filesystem_save_LDFLAGS # BOOST_FOREACH() # --------------- # Look for Boost.Foreach -AC_DEFUN([BOOST_FOREACH], +BOOST_DEFUN([Foreach], [BOOST_FIND_HEADER([boost/foreach.hpp])]) @@ -513,14 +596,14 @@ AC_DEFUN([BOOST_FOREACH], # standalone. 
It can't be compiled because it triggers the following error: # boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std' # does not name a type -AC_DEFUN([BOOST_FORMAT], +BOOST_DEFUN([Format], [BOOST_FIND_HEADER([boost/format.hpp])]) # BOOST_FUNCTION() # ---------------- # Look for Boost.Function -AC_DEFUN([BOOST_FUNCTION], +BOOST_DEFUN([Function], [BOOST_FIND_HEADER([boost/function.hpp])]) @@ -528,37 +611,60 @@ AC_DEFUN([BOOST_FUNCTION], # ------------------------------- # Look for Boost.Graphs. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_GRAPH], +BOOST_DEFUN([Graph], [BOOST_FIND_LIB([graph], [$1], [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;]) ])# BOOST_GRAPH # BOOST_IOSTREAMS([PREFERRED-RT-OPT]) -# ------------------------------- +# ----------------------------------- # Look for Boost.IOStreams. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_IOSTREAMS], +BOOST_DEFUN([IOStreams], [BOOST_FIND_LIB([iostreams], [$1], [boost/iostreams/device/file_descriptor.hpp], - [boost::iostreams::file_descriptor fd(0); fd.close();]) + [boost::iostreams::file_descriptor fd; fd.close();]) ])# BOOST_IOSTREAMS # BOOST_HASH() # ------------ # Look for Boost.Functional/Hash -AC_DEFUN([BOOST_HASH], +BOOST_DEFUN([Hash], [BOOST_FIND_HEADER([boost/functional/hash.hpp])]) # BOOST_LAMBDA() # -------------- # Look for Boost.Lambda -AC_DEFUN([BOOST_LAMBDA], +BOOST_DEFUN([Lambda], [BOOST_FIND_HEADER([boost/lambda/lambda.hpp])]) +# BOOST_LOG([PREFERRED-RT-OPT]) +# ----------------------------- +# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Log], +[BOOST_FIND_LIB([log], [$1], + [boost/log/core/core.hpp], + [boost::log::attribute a; a.get_value();]) +])# BOOST_LOG + + +# BOOST_LOG_SETUP([PREFERRED-RT-OPT]) +# ----------------------------------- +# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Log_Setup], +[AC_REQUIRE([BOOST_LOG])dnl +BOOST_FIND_LIB([log_setup], [$1], + [boost/log/utility/init/from_settings.hpp], + [boost::log::basic_settings bs; bs.empty();]) +])# BOOST_LOG_SETUP + + # BOOST_MATH() # ------------ # Look for Boost.Math @@ -567,21 +673,21 @@ AC_DEFUN([BOOST_LAMBDA], # libboost_math_c99f, libboost_math_c99l, libboost_math_tr1, # libboost_math_tr1f, libboost_math_tr1l). This macro must be fixed to do the # right thing anyway. 
-AC_DEFUN([BOOST_MATH], +BOOST_DEFUN([Math], [BOOST_FIND_HEADER([boost/math/special_functions.hpp])]) # BOOST_MULTIARRAY() # ------------------ # Look for Boost.MultiArray -AC_DEFUN([BOOST_MULTIARRAY], +BOOST_DEFUN([MultiArray], [BOOST_FIND_HEADER([boost/multi_array.hpp])]) # BOOST_NUMERIC_CONVERSION() # -------------------------- # Look for Boost.NumericConversion (policy-based numeric conversion) -AC_DEFUN([BOOST_NUMERIC_CONVERSION], +BOOST_DEFUN([Numeric_Conversion], [BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp]) ])# BOOST_NUMERIC_CONVERSION @@ -589,32 +695,76 @@ AC_DEFUN([BOOST_NUMERIC_CONVERSION], # BOOST_OPTIONAL() # ---------------- # Look for Boost.Optional -AC_DEFUN([BOOST_OPTIONAL], +BOOST_DEFUN([Optional], [BOOST_FIND_HEADER([boost/optional.hpp])]) # BOOST_PREPROCESSOR() # -------------------- # Look for Boost.Preprocessor -AC_DEFUN([BOOST_PREPROCESSOR], +BOOST_DEFUN([Preprocessor], [BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])]) +# BOOST_UNORDERED() +# ----------------- +# Look for Boost.Unordered +BOOST_DEFUN([Unordered], +[BOOST_FIND_HEADER([boost/unordered_map.hpp])]) + + +# BOOST_UUID() +# ------------ +# Look for Boost.Uuid +BOOST_DEFUN([Uuid], +[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])]) + + # BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT]) # ----------------------------------------- -# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT, see -# the documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_PROGRAM_OPTIONS], +# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT, +# see the documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Program_Options], [BOOST_FIND_LIB([program_options], [$1], [boost/program_options.hpp], [boost::program_options::options_description d("test");]) ])# BOOST_PROGRAM_OPTIONS + +# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG) +# ------------------------------------ +# Save VARIABLE, and define it via `python-config --FLAG`. +# Substitute BOOST_PYTHON_VARIABLE. +m4_define([_BOOST_PYTHON_CONFIG], +[AC_SUBST([BOOST_PYTHON_$1], + [`python-config --$2 2>/dev/null`])dnl +boost_python_save_$1=$$1 +$1="$$1 $BOOST_PYTHON_$1"]) + + +# BOOST_PYTHON([PREFERRED-RT-OPT]) +# -------------------------------- +# Look for Boost.Python. For the documentation of PREFERRED-RT-OPT, +# see the documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Python], +[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes]) +_BOOST_PYTHON_CONFIG([LDFLAGS], [ldflags]) +_BOOST_PYTHON_CONFIG([LIBS], [libs]) +m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl +BOOST_FIND_LIB([python], [$1], + [boost/python.hpp], + [], [BOOST_PYTHON_MODULE(empty) {}]) +CPPFLAGS=$boost_python_save_CPPFLAGS +LDFLAGS=$boost_python_save_LDFLAGS +LIBS=$boost_python_save_LIBS +])# BOOST_PYTHON + + # BOOST_REF() # ----------- # Look for Boost.Ref -AC_DEFUN([BOOST_REF], +BOOST_DEFUN([Ref], [BOOST_FIND_HEADER([boost/ref.hpp])]) @@ -622,7 +772,7 @@ AC_DEFUN([BOOST_REF], # ------------------------------- # Look for Boost.Regex. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_REGEX], +BOOST_DEFUN([Regex], [BOOST_FIND_LIB([regex], [$1], [boost/regex.hpp], [boost::regex exp("*"); boost::regex_match("foo", exp);]) @@ -633,19 +783,19 @@ AC_DEFUN([BOOST_REGEX], # --------------------------------------- # Look for Boost.Serialization. For the documentation of PREFERRED-RT-OPT, see # the documentation of BOOST_FIND_LIB above. 
-AC_DEFUN([BOOST_SERIALIZATION], +BOOST_DEFUN([Serialization], [BOOST_FIND_LIB([serialization], [$1], [boost/archive/text_oarchive.hpp], [std::ostream* o = 0; // Cheap way to get an ostream... boost::archive::text_oarchive t(*o);]) -])# BOOST_SIGNALS +])# BOOST_SERIALIZATION # BOOST_SIGNALS([PREFERRED-RT-OPT]) # --------------------------------- # Look for Boost.Signals. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_SIGNALS], +BOOST_DEFUN([Signals], [BOOST_FIND_LIB([signals], [$1], [boost/signal.hpp], [boost::signal s;]) @@ -655,7 +805,7 @@ AC_DEFUN([BOOST_SIGNALS], # BOOST_SMART_PTR() # ----------------- # Look for Boost.SmartPtr -AC_DEFUN([BOOST_SMART_PTR], +BOOST_DEFUN([Smart_Ptr], [BOOST_FIND_HEADER([boost/scoped_ptr.hpp]) BOOST_FIND_HEADER([boost/shared_ptr.hpp]) ]) @@ -664,14 +814,14 @@ BOOST_FIND_HEADER([boost/shared_ptr.hpp]) # BOOST_STATICASSERT() # -------------------- # Look for Boost.StaticAssert -AC_DEFUN([BOOST_STATICASSERT], +BOOST_DEFUN([StaticAssert], [BOOST_FIND_HEADER([boost/static_assert.hpp])]) # BOOST_STRING_ALGO() # ------------------- # Look for Boost.StringAlgo -AC_DEFUN([BOOST_STRING_ALGO], +BOOST_DEFUN([String_Algo], [BOOST_FIND_HEADER([boost/algorithm/string.hpp]) ]) @@ -681,7 +831,7 @@ AC_DEFUN([BOOST_STRING_ALGO], # Look for Boost.System. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. This library was introduced in Boost # 1.35.0. -AC_DEFUN([BOOST_SYSTEM], +BOOST_DEFUN([System], [BOOST_FIND_LIB([system], [$1], [boost/system/error_code.hpp], [boost::system::error_code e; e.clear();]) @@ -692,7 +842,7 @@ AC_DEFUN([BOOST_SYSTEM], # ------------------------------ # Look for Boost.Test. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_TEST], +BOOST_DEFUN([Test], [m4_pattern_allow([^BOOST_CHECK$])dnl BOOST_FIND_LIB([unit_test_framework], [$1], [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);], @@ -707,25 +857,49 @@ BOOST_FIND_LIB([unit_test_framework], [$1], # Look for Boost.Thread. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. # FIXME: Provide an alias "BOOST_THREAD". -AC_DEFUN([BOOST_THREADS], +BOOST_DEFUN([Threads], [dnl Having the pthread flag is required at least on GCC3 where dnl boost/thread.hpp would complain if we try to compile without dnl -pthread on GNU/Linux. AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl boost_threads_save_LIBS=$LIBS +boost_threads_save_LDFLAGS=$LDFLAGS boost_threads_save_CPPFLAGS=$CPPFLAGS -LIBS="$LIBS $boost_cv_pthread_flag" +# Link-time dependency from thread to system was added as of 1.49.0. +if test $boost_major_version -ge 149; then +BOOST_SYSTEM([$1]) +fi # end of the Boost.System check. +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" # Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3, # boost/thread.hpp will trigger a #error if -pthread isn't used: # boost/config/requires_threads.hpp:47:5: #error "Compiler threading support # is not turned on. 
Please set the correct command line options for # threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)" CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag" -BOOST_FIND_LIB([thread], [$1], - [boost/thread.hpp], [boost::thread t; boost::mutex m;]) -BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $boost_cv_pthread_flag" + +# When compiling for the Windows platform, the threads library is named +# differently. +case $host_os in + (*mingw*) + BOOST_FIND_LIB([thread_win32], [$1], + [boost/thread.hpp], [boost::thread t; boost::mutex m;]) + BOOST_THREAD_LDFLAGS=$BOOST_THREAD_WIN32_LDFLAGS + BOOST_THREAD_LDPATH=$BOOST_THREAD_WIN32_LDPATH + BOOST_THREAD_LIBS=$BOOST_THREAD_WIN32_LIBS + ;; + (*) + BOOST_FIND_LIB([thread], [$1], + [boost/thread.hpp], [boost::thread t; boost::mutex m;]) + ;; +esac + +BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag" +BOOST_THREAD_LDFLAGS="$BOOST_SYSTEM_LDFLAGS" BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag" LIBS=$boost_threads_save_LIBS +LDFLAGS=$boost_threads_save_LDFLAGS CPPFLAGS=$boost_threads_save_CPPFLAGS ])# BOOST_THREADS @@ -733,14 +907,14 @@ CPPFLAGS=$boost_threads_save_CPPFLAGS # BOOST_TOKENIZER() # ----------------- # Look for Boost.Tokenizer -AC_DEFUN([BOOST_TOKENIZER], +BOOST_DEFUN([Tokenizer], [BOOST_FIND_HEADER([boost/tokenizer.hpp])]) # BOOST_TRIBOOL() # --------------- # Look for Boost.Tribool -AC_DEFUN([BOOST_TRIBOOL], +BOOST_DEFUN([Tribool], [BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp]) BOOST_FIND_HEADER([boost/logic/tribool.hpp]) ]) @@ -749,14 +923,14 @@ BOOST_FIND_HEADER([boost/logic/tribool.hpp]) # BOOST_TUPLE() # ------------- # Look for Boost.Tuple -AC_DEFUN([BOOST_TUPLE], +BOOST_DEFUN([Tuple], [BOOST_FIND_HEADER([boost/tuple/tuple.hpp])]) # BOOST_TYPETRAITS() # -------------------- # Look for Boost.TypeTraits -AC_DEFUN([BOOST_TYPETRAITS], +BOOST_DEFUN([TypeTraits], [BOOST_FIND_HEADER([boost/type_traits.hpp])]) @@ -764,14 +938,14 @@ AC_DEFUN([BOOST_TYPETRAITS], # --------------- # Look for Boost.Utility (noncopyable, result_of, base-from-member idiom, # etc.) -AC_DEFUN([BOOST_UTILITY], +BOOST_DEFUN([Utility], [BOOST_FIND_HEADER([boost/utility.hpp])]) # BOOST_VARIANT() # --------------- # Look for Boost.Variant. -AC_DEFUN([BOOST_VARIANT], +BOOST_DEFUN([Variant], [BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp]) BOOST_FIND_HEADER([boost/variant.hpp])]) @@ -782,15 +956,15 @@ BOOST_FIND_HEADER([boost/variant.hpp])]) # call BOOST_THREADS first. # Look for Boost.Wave. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_WAVE], +BOOST_DEFUN([Wave], [AC_REQUIRE([BOOST_FILESYSTEM])dnl AC_REQUIRE([BOOST_DATE_TIME])dnl boost_wave_save_LIBS=$LIBS boost_wave_save_LDFLAGS=$LDFLAGS m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl -LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS\ +LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS \ $BOOST_THREAD_LIBS" -LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS\ +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS \ $BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS" BOOST_FIND_LIB([wave], [$1], [boost/wave.hpp], @@ -803,7 +977,7 @@ LDFLAGS=$boost_wave_save_LDFLAGS # BOOST_XPRESSIVE() # ----------------- # Look for Boost.Xpressive (new since 1.36.0). 
-AC_DEFUN([BOOST_XPRESSIVE], +BOOST_DEFUN([Xpressive], [BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])]) @@ -893,8 +1067,9 @@ AC_DEFUN([_BOOST_FIND_COMPILER_TAG], [AC_REQUIRE([AC_PROG_CXX])dnl AC_REQUIRE([AC_CANONICAL_HOST])dnl AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag], -[AC_LANG_PUSH([C++])dnl - boost_cv_lib_tag=unknown +[boost_cv_lib_tag=unknown +if test x$boost_cv_inc_path != xno; then + AC_LANG_PUSH([C++])dnl # The following tests are mostly inspired by boost/config/auto_link.hpp # The list is sorted to most recent/common to oldest compiler (in order # to increase the likelihood of finding the right compiler with the @@ -908,8 +1083,12 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag] # como, edg, kcc, bck, mp, sw, tru, xlc # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines # the same defines as GCC's). - # TODO: Move the test on GCC 4.4 up once it's released. for i in \ + _BOOST_gcc_test(4, 8) \ + _BOOST_gcc_test(4, 7) \ + _BOOST_gcc_test(4, 6) \ + _BOOST_gcc_test(4, 5) \ + _BOOST_gcc_test(4, 4) \ _BOOST_gcc_test(4, 3) \ _BOOST_gcc_test(4, 2) \ _BOOST_gcc_test(4, 1) \ @@ -929,7 +1108,6 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag] "defined __ICC && (defined __unix || defined __unix__) @ il" \ "defined __ICL @ iw" \ "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \ - _BOOST_gcc_test(4, 4) \ _BOOST_gcc_test(2, 95) \ "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \ "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \ @@ -969,7 +1147,7 @@ AC_LANG_POP([C++])dnl boost_cv_lib_tag= ;; esac -])dnl end of AC_CACHE_CHECK +fi])dnl end of AC_CACHE_CHECK ])# _BOOST_FIND_COMPILER_TAG -- cgit v1.2.3 From c18c2497707bed72ace95db459e541261213c7e2 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 20 Jan 2013 01:38:42 -0500 Subject: control max len --- corpus/filter-length.pl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'corpus') diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 3cfa40cc..38837f14 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,20 +3,30 @@ use strict; use utf8; ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 150; # discard a sentence if it is longer than this my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? 
 ##############################################################################
 
-die "Usage: $0 corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n  or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1;
+die "Usage: $0 [-NNN] corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than NNN words (where NNN\n  is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n  mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2;
 binmode(STDOUT,":utf8");
 binmode(STDERR,":utf8");
 
+my $MAX_LENGTH = 150; # discard a sentence if it is longer than this
+if (scalar @ARGV == 2) {
+  my $fp = shift @ARGV;
+  die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/;
+  $MAX_LENGTH=$1;
+}
+
 my $corpus = shift @ARGV;
+
+die "Cannot read from STDIN\n" if $corpus eq '-';
 my $ff = "<$corpus";
 $ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/;
+print STDERR "Max line length (monolingual): $MAX_LENGTH\n";
+print STDERR "             Parallel corpus: $corpus\n";
+
 open F,$ff or die "Can't read $corpus: $!";
 binmode(F,":utf8");
-- cgit v1.2.3


From e1f71a6ce868d116f04082b697a8d61afcd625f1 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 21 Jan 2013 16:53:05 -0500
Subject: a little bit of cleanup

---
 corpus/filter-length.pl     | 4 ++--
 corpus/paste-files.pl       | 8 ++++----
 corpus/support/tokenizer.pl | 1 -
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'corpus')

diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 38837f14..2e257cda 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -44,11 +44,11 @@ while(<F>) {
   $lines++;
   if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
   elsif ($lines % 2500 == 0) { print STDERR "."; }
-  my ($sf, $se, @d) = split / \|\|\| /;
+  my ($sf, $se, @d) = split /\s*\|\|\|\s*/;
   if (scalar @d != 0 or !defined $se) {
     $bad_format++;
     if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
-      die "Corpus appears to be incorrectly formatted, example: $_";
+      die "$bad_format / $lines : Corpus appears to be incorrectly formatted, example: $_";
     }
     next;
   }
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
index 0b788386..4cb424ad 100755
--- a/corpus/paste-files.pl
+++ b/corpus/paste-files.pl
@@ -41,9 +41,9 @@ while(1) {
     }
     warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
     $r =~ s/\|\|\|/ /g;
-    $r =~ s/ +//g;
-    $r =~ s/^ //;
-    $r =~ s/ $//;
+    $r =~ s/\s+/ /g;
+    $r =~ s/^ +//;
+    $r =~ s/ +$//;
     $anum++;
     push @line, $r;
   }
@@ -56,5 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) {
   my $r = <$fh>;
   die "Mismatched number of lines.\n" if defined $r;
 }
-print STDERR "Bad lines containing ||| were $bad\n";
+print STDERR "Number of lines containing ||| was: $bad\n" if $bad > 0;
 
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index e9c3a37d..b5190858 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -147,7 +147,6 @@ while(<STDIN>){
   print STDOUT " $new_line\n";
 }
 
-print STDERR "\n";
 
 ########################################################################
-- cgit v1.2.3


From ee681f3d0337dc3d93c25de373a50fbda69252fe Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 21 Jan 2013 17:28:19 -0500
Subject: tokenizer support for utf8 patterns

---
 corpus/support/token_patterns | 1 +
 corpus/support/tokenizer.pl   | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'corpus')

diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
index 8e69432b..b25ac6de 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,3 +1,4 @@ /^(al|el|ul|e)\-[a-z]+$/ +/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ /^(\d|\d\d|\d\d\d)\.$/ diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index b5190858..0350a894 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -73,6 +73,7 @@ my $dict_file = "$workdir/token_list"; my $word_patt_file = "$workdir/token_patterns"; open(my $dict_fp, "$dict_file") or die; +binmode($dict_fp, ":utf8"); # read in the list of words that should not be segmented, ## e.g.,"I.B.M.", co-operation. @@ -89,6 +90,7 @@ while(<$dict_fp>){ } open(my $patt_fp, "$word_patt_file") or die; +binmode($patt_fp, ":utf8"); my @word_patts = (); my $word_patt_num = 0; while(<$patt_fp>){ -- cgit v1.2.3 From a262141a80e5cfefafa958fc083e5a49f8b8dc10 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 22 Jan 2013 20:08:28 -0500 Subject: russian abbrevs --- corpus/support/quote-norm.pl | 14 ++++++++ corpus/support/token_list | 82 +++++++++++++++++++++++++++++++++++++++++++ corpus/support/token_patterns | 1 + 3 files changed, 97 insertions(+) (limited to 'corpus') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 72b0064d..e4e5055e 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -27,6 +27,20 @@ while() { s/¨/"/g; s/¡/ ¡ /g; s/¿/ ¿ /g; + + s/ п. п. / п.п. /g; + s/ ст. л. / ст.л. /g; + s/ т. е. / т.е. /g; + s/ т. к. / т.к. /g; + s/ т. ч. / т.ч. /g; + s/ т. д. / т.д. /g; + s/ т. п. / т.п. /g; + s/ и. о. / и.о. /g; + s/ с. г. / с.г. /g; + s/ г. р. / г.р. /g; + s/ т. н. / т.н. /g; + s/ т. ч. / т.ч. /g; + s/ н. э. / н.э. /g; # â s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; diff --git a/corpus/support/token_list b/corpus/support/token_list index 28eb4396..d470cb22 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -210,3 +210,85 @@ W. X. Y. Z. +А. +Б. +В. +Г. +Д. +Е. +Ё. +Ж. +З. +И. +Й. +К. +Л. +М. +Н. +О. +П. +Р. +С. +Т. +У. +Ф. +Х. +Ц. +Ч. +Ш. +Щ. +Ъ. +Ы. +Ь. +Э. +Ю. +Я. +л. +г. +обл. +гг. +в. +вв. +мин. +ч. +тыс. +млн. +млрд. +трлн. +кв. +куб. +руб. +коп. +долл. +Прим. +прим. +чел. +грн. +мин. +им. +проф. +акад. +ред. +авт. +корр. +соб. +спец. +см. +тж. +др. +пр. +букв. +# Two-letter abbreviations - can be written with space +п.п. +ст.л. +т.е. +т.к. +т.ч. +т.д. +т.п. +и.о. +с.г. +г.р. +т.н. +т.ч. +н.э. 
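The quote-normalization rules in this patch collapse Russian two-letter abbreviations that a tokenizer has split apart ("т. е.") back into their dotted one-token forms ("т.е."), and the token_list entries just above then protect those forms from being re-split; the token_patterns rule that follows matches the same shape generically. A minimal sketch of the substitutions in isolation (the input string is a hypothetical example; note that the rules in the patch leave the dots unescaped, so "." there matches any character, while this sketch escapes them):

  #!/usr/bin/perl -w
  # Sketch: the spaced-abbreviation rules from quote-norm.pl, applied alone.
  use strict;
  use utf8;
  binmode(STDOUT, ":utf8");
  my $s = "около 100 г. р. , т. е. до н. э. ";  # hypothetical tokenizer output
  $s =~ s/ г\. р\. / г.р. /g;   # dots escaped here for precision
  $s =~ s/ т\. е\. / т.е. /g;
  $s =~ s/ н\. э\. / н.э. /g;
  print "$s\n";                 # "около 100 г.р. , т.е. до н.э. "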
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
index b25ac6de..de64fb2a 100644
--- a/corpus/support/token_patterns
+++ b/corpus/support/token_patterns
@@ -1,4 +1,5 @@
 /^(al|el|ul|e)\-[a-z]+$/
 /^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/
+/^\p{Cyrillic}\.\p{Cyrillic}\.$/
 /^(\d|\d\d|\d\d\d)\.$/
 
-- cgit v1.2.3


From 9f0109076f1c95170cbe46e1708597bc6e0f9fd4 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 23 Feb 2013 04:23:48 -0500
Subject: one missing quote type

---
 corpus/support/quote-norm.pl | 1 +
 1 file changed, 1 insertion(+)

(limited to 'corpus')

diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index e4e5055e..d2980092 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -20,6 +20,7 @@ while(<STDIN>) {
   s/&\s*#45\s*;/--/g;
   s/�c/--/g;
   s/ ,,/ "/g;
+  s/„/"/g;
   s/``/"/g;
   s/''/"/g;
-- cgit v1.2.3


From 99538847039c06bdcc288e2c5dfcdb507ff879ca Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Wed, 27 Feb 2013 20:14:18 -0500
Subject: quick fix

---
 corpus/tokenize-anything.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'corpus')

diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index 1a24193d..028992cf 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -9,5 +9,5 @@ $SUPPORT/utf8-normalize.sh |
   sed -e 's/ al - / al-/g' |
   $SUPPORT/fix-contract.pl |
   sed -e 's/^ //' |
   sed -e 's/ $//' |
-  perl -e 'while(<>){s/(\d+)(\.+)$/$1 ./;print;}'
+  perl -e 'while(<>){s/(\d+)(\.+)$/$1 ./; s/(\d+)(\.+) \|\|\|/$1 . |||/; print;}'
-- cgit v1.2.3


From 3a162d28033d1b9d5241e31f32978dba4eba6296 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Fri, 8 Mar 2013 22:44:49 -0500
Subject: few preproc fixes

---
 corpus/paste-files.pl        | 1 +
 corpus/support/quote-norm.pl | 2 ++
 corpus/support/token_list    | 2 ++
 3 files changed, 5 insertions(+)

(limited to 'corpus')

diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
index 4cb424ad..ef2cd937 100755
--- a/corpus/paste-files.pl
+++ b/corpus/paste-files.pl
@@ -34,6 +34,7 @@ while(1) {
       $done = 1;
       last;
     }
+    $r =~ s/\r//g;
     chomp $r;
     if ($r =~ /\|\|\|/) {
       $r = '';
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index d2980092..b104e73c 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -11,6 +11,8 @@ while(<STDIN>) {
   s/&\s*squot\s*;/'/gi;
   s/&\s*quot\s*;/"/gi;
   s/&\s*amp\s*;/&/gi;
+  s/&\s*nbsp\s*;/&/gi;
+  s/&\s*#\s*160\s*;/ /gi;
   s/ (\d\d): (\d\d)/ $1:$2/g;
   s/[\x{20a0}]\x{20ac}]/ EUR /g;
   s/[\x{00A3}]/ GBP /g;
diff --git a/corpus/support/token_list b/corpus/support/token_list
index d470cb22..366cd7ff 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -37,6 +37,8 @@ tel.
 10.
 
 ##################### abbreviation: words that contain period.
+EE.UU.
+ee.uu.
 U.A.E
 Ala.
 Ph.D.
-- cgit v1.2.3


From 4f452c5bf5cd0ed3cb50d31012f93a50366b3aac Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sun, 17 Mar 2013 23:26:24 -0400
Subject: fix possible utf8 bug

---
 corpus/lowercase.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'corpus')

diff --git a/corpus/lowercase.pl b/corpus/lowercase.pl
index 688e493b..9fd91dac 100755
--- a/corpus/lowercase.pl
+++ b/corpus/lowercase.pl
@@ -2,7 +2,7 @@
 use strict;
 binmode(STDIN,":utf8");
 binmode(STDOUT,":utf8");
-while(<>) {
+while(<STDIN>) {
   $_ = lc $_;
   print;
 }
-- cgit v1.2.3


From 96fedabebafe7a38a6d5928be8fff767e411d705 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 26 Mar 2013 10:44:45 -0400
Subject: swahili abbreviations

---
 corpus/support/token_list | 152 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)

(limited to 'corpus')

diff --git a/corpus/support/token_list b/corpus/support/token_list
index 366cd7ff..43dd80d9 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -294,3 +294,155 @@ Z.
 т.н.
 т.ч.
 н.э.
+# Swahili
+A.D.
+Afr.
+A.G.
+agh.
+A.H.
+A.M.
+a.s.
+B.A.
+B.C.
+Bi.
+B.J.
+B.K.
+B.O.M.
+Brig.
+Bro.
+bt.
+bw.
+Bw.
+Cap.
+C.C.
+cCM.
+C.I.A.
+cit.
+C.M.S.
+Co.
+Corp.
+C.S.Sp.
+C.W.
+D.C.
+Dk.
+Dkt.
+Dk.B.
+Dr.
+E.C.
+e.g.
+E.M.
+E.n.
+etc.
+Feb.
+F.F.U.
+F.M.
+Fr.
+F.W.
+I.C.O.
+i.e.
+I.L.C.
+Inc.
+Jan.
+J.F.
+Jr.
+J.S.
+J.V.W.A.
+K.A.R.
+K.A.U.
+K.C.M.C.
+K.k.
+K.K.
+k.m.
+km.
+K.m.
+K.N.C.U.
+K.O.
+K.S.
+Ksh.
+kt.
+kumb.
+k.v.
+kv.
+L.G.
+ltd.
+Ltd.
+M.A.
+M.D.
+mf.
+Mh.
+Mhe.
+mil.
+m.m.
+M.m.
+Mm.
+M.M.
+Mr.
+Mrs.
+M.S.
+Mt.
+Mw.
+M.W.
+Mwl.
+na.
+Na.
+N.F.
+N.J.
+n.k.
+nk.
+n.k.w.
+N.N.
+Nov.
+O.C.D.
+op.
+P.C.
+Phd.
+Ph.D.
+P.J.
+P.o.
+P.O.
+P.O.P.
+P.P.F.
+Prof.
+P.s.
+P.S.
+Q.C.
+Rd.
+s.a.w.
+S.A.W.
+S.D.
+Sept.
+sh.
+Sh.
+SH.
+shs.
+Shs.
+S.J.
+S.L.
+S.L.P.
+S.s.
+S.S.
+St.
+s.w.
+s.w.T.
+taz.
+Taz.
+T.C.
+T.E.C.
+T.L.P.
+T.O.H.S.
+Tsh.
+T.V.
+tz.
+uk.
+Uk.
+U.M.C.A.
+U.N.
+U.S.
+Ush.
+U.W.T.
+Viii.
+Vol.
+V.T.C.
+W.H.
+yamb.
+Y.M.C.A.
-- cgit v1.2.3
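The lowercase.pl change above is subtle: binmode(STDIN, ":utf8") attaches a decoding layer to the STDIN handle only, while the magic <> operator opens any files named in @ARGV itself, without that layer, so lc would operate on raw bytes when the script was given a filename argument. Reading from <STDIN> explicitly guarantees the decoded-text path, which means the script is meant to be used in a pipeline (for example, cat corpus.ru | corpus/lowercase.pl) rather than with a file argument. An annotated restatement of the fixed script, as a sketch of why the change matters:

  #!/usr/bin/perl -w
  # Sketch of corpus/lowercase.pl after the <STDIN> fix.
  use strict;
  binmode(STDIN,  ":utf8");   # decode incoming bytes as UTF-8 text
  binmode(STDOUT, ":utf8");   # encode outgoing text back to UTF-8
  while (<STDIN>) {           # not <>: files opened via @ARGV would skip the :utf8 layer
    print lc $_;              # lc on decoded text lowercases Cyrillic, Greek, etc. correctly
  }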