From 3a5aeb67de3d7156e77ee94625ed3714117d3b43 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 18 Mar 2014 02:05:25 -0400
Subject: chris edits

---
 corpus/support/tokenizer.pl | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'corpus')

# NOTE(review): Leading whitespace inside the hunks of this series was lost in
# this copy; the indentation shown is reconstructed. Apply with whitespace
# tolerance or regenerate the patches from the repository.
#
# This patch adds an early `return $token;` to proc_token for tokens that match
# digits, then one or more groups of "one character + digits", optionally
# followed by one of 亿 / 百万 / 万 / 千.
# NOTE(review): the `.` in `(.\d+)+` is unescaped, so it matches any character
# rather than only a literal dot -- confirm whether `\.` was intended (the
# neighbouring pattern below does escape it).

diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 7771201f..f57bc87a 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -240,6 +240,10 @@ sub proc_token {
         return $token;
     }
 
+    if($token =~ /^\d+(.\d+)+(亿|百万|万|千)?$/){
+        return $token;
+    }
+
     ## 1,234,345.34
     if($token =~ /^\d+(\.\d{3})*,\d+$/){
         ## number
-- 
cgit v1.2.3

From db41a3ca621447da38b5c2a1ba5c3ba1a47292fa Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Wed, 2 Apr 2014 20:05:48 -0400
Subject: moses conversion script

---
 corpus/moses-scfg-to-cdec.pl | 69 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100755 corpus/moses-scfg-to-cdec.pl

(limited to 'corpus')

# NOTE(review): Summary of the new script, as read from the code below.
#  * Each input line is split on " ||| " into source side, target side,
#    feature string and alignment string (an example line is kept in the
#    script's own comments).
#  * The last whitespace-separated token of the target side is popped and used
#    as the rule's left-hand side; the last token of the source side is popped
#    and must equal "[X]" or the script dies.
#  * The alignment pairs "s-t" are stored as %amap{target index} = source index.
#  * Source tokens of the form [X][NT] are emitted as [NT] and numbered in
#    order of appearance (%d maps source position -> running count); any other
#    single bracketed token is a fatal error; everything else is copied.
#  * Target tokens of the form [X][NT] are emitted as [k], where k is the count
#    recorded for the source position aligned to that target position.
#  * The feature string is split on single spaces into seven fields; the sixth
#    is read into $dummy and never used. A rule is skipped when any of the
#    other six equals the string '0'; otherwise each is replaced by its natural
#    log formatted with %.6g and printed with a fixed RuleCount=1.
# NOTE(review): `s/'/'/g` (applied to both $lhs and $nt) replaces an apostrophe
# with an apostrophe, i.e. it is a no-op as shown. This may be an HTML entity
# that was decoded when this copy was produced -- verify against the repository.
# NOTE(review): the "[k]" lookup assumes every target nonterminal has an
# alignment entry pointing at a source nonterminal; otherwise $imap is
# undefined -- confirm the input guarantees this.

diff --git a/corpus/moses-scfg-to-cdec.pl b/corpus/moses-scfg-to-cdec.pl
new file mode 100755
index 00000000..9b8e3617
--- /dev/null
+++ b/corpus/moses-scfg-to-cdec.pl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl -w
+use strict;
+
+while(<>) {
+  my ($src, $trg, $feats, $al) = split / \|\|\| /;
+  # [X][NP] von [X][NP] [X] ||| [X][NP] 's [X][NP] [S] ||| 0.00110169 0.0073223 2.84566e-06 0.0027702 0.0121867 2.718 0.606531 ||| 0-0 1-1 2-2 ||| 635 245838 2
+
+  my @srcs = split /\s+/, $src;
+  my @trgs = split /\s+/, $trg;
+  my $lhs = pop @trgs;
+  $lhs =~ s/&apos;/'/g;
+  $lhs =~ s/'/'/g;
+  $lhs =~ s/,/COMMA/g;
+  my $ntc = 0;
+  my $sc = 0;
+  my @of = ();
+  my $x = pop @srcs;
+  my %d = (); # src index to nonterminal count
+  die "Expected [X]" unless $x eq '[X]';
+  my %amap = ();
+  my @als = split / /, $al;
+  for my $st (@als) {
+    my ($s, $t) = split /-/, $st;
+    $amap{$t} = $s;
+  }
+  for my $f (@srcs) {
+    if ($f =~ /^\[X\]\[([^]]+)\]$/) {
+      $ntc++;
+      my $nt = $1;
+      $nt =~ s/&apos;/'/g;
+      $nt =~ s/'/'/g;
+      $nt =~ s/,/COMMA/g;
+      push @of, "[$nt]";
+      $d{$sc} = $ntc;
+    } elsif ($f =~ /^\[[^]]+\]$/) {
+      die "Unexpected $f";
+    } else {
+      push @of, $f;
+    }
+    $sc++;
+  }
+  my @oe = ();
+  my $ind = 0;
+  for my $e (@trgs) {
+    if ($e =~ /^\[X\]\[([^]]+)\]$/) {
+      my $imap = $d{$amap{$ind}};
+      push @oe, "[$imap]";
+    } else {
+      push @oe, $e;
+    }
+    $ind++;
+  }
+  my ($fe, $ef, $j, $lfe, $lef, $dummy, $of) = split / /, $feats;
+  next if $lef eq '0';
+  next if $lfe eq '0';
+  next if $ef eq '0';
+  next if $fe eq '0';
+  next if $j eq '0';
+  next if $of eq '0';
+  $ef = sprintf('%.6g', log($ef));
+  $fe = sprintf('%.6g',log($fe));
+  $j = sprintf('%.6g',log($j));
+  $lef = sprintf('%.6g',log($lef));
+  $lfe = sprintf('%.6g',log($lfe));
+  $of = sprintf('%.6g',log($of));
+  print "$lhs ||| @of ||| @oe ||| RuleCount=1 FgivenE=$fe EgivenF=$ef Joint=$j LexEgivenF=$lef LexFgivenE=$lfe Other=$of\n";
+}
+
+# [X][ADVP] angestiegen [X] ||| rose [X][ADVP] [VP] ||| 0.0538131 0.0097508 0.00744224 0.0249653 0.000698602 2.718 0.606531 ||| 0-1 1-0 ||| 13 94 2
-- 
cgit v1.2.3

From 0e2f8d3d049f06afb08b4639c6a28aa5461cdc78 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 3 Jun 2014 16:58:29 -0400
Subject: fix for nonjoining chars

---
 corpus/support/quote-norm.pl | 1 +
 training/pro/mr_pro_map.cc | 26 +++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'corpus')

# NOTE(review): quote-norm.pl -- adds removal of U+200C (zero-width non-joiner)
# next to the existing soft-hyphen removal.
#
# NOTE(review): mr_pro_map.cc -- what the hunk does, as read from the code:
#  * New LengthDifferenceStdDev(J_i, n): draws n pairs of random indices into
#    J_i (re-drawing when both indices are equal), accumulates the squared
#    difference of the two candidates' ewords.size(), and returns
#    sqrt(sum / n) with a floor of 2.0.
#  * Sample(): `gamma` changes from unsigned to int so the loop counter can be
#    decremented; an equal-index draw is now retried (--i) instead of consuming
#    an iteration; pairs whose absolute length difference divided by the
#    estimate above exceeds 2 are likewise rejected and retried.
# NOTE(review): both retry loops use `--i; continue;` with no attempt bound.
# Assuming rng->inclusive(lo, hi) draws from the closed range [lo, hi], a
# candidate set with a single entry makes a == b on every draw, so neither loop
# can finish; with an empty set `J_i.size() - 1` wraps around as an unsigned
# value. Confirm callers never pass fewer than two candidates, or add a guard.
# NOTE(review): the z-score rejection also retries without a bound -- confirm
# that an acceptable pair always exists for the inputs seen in practice.
# NOTE(review): the `;` after the closing brace of LengthDifferenceStdDev is a
# stray empty declaration.
# NOTE(review): `while() {`, `vector* pv`, `vector v1, v2` and
# `SparseVector xdiff` look like text in angle brackets was stripped from this
# copy -- verify against the repository before applying.

diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0366fad5..3eee0666 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -40,6 +40,7 @@ while() {
 
   # Regularlize spaces:
   s/\x{ad}//g; # soft hyphen
+  s/\x{200C}//g; # zero-width non-joiner
   s/\x{a0}/ /g; # non-breaking space
   s/\x{2009}/ /g; # thin space
   s/\x{2028}/ /g; # "line separator"
diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc
index a5e6e48f..da58cd24 100644
--- a/training/pro/mr_pro_map.cc
+++ b/training/pro/mr_pro_map.cc
@@ -88,23 +88,43 @@ struct DiffOrder {
   }
 };
 
-void Sample(const unsigned gamma,
+double LengthDifferenceStdDev(const training::CandidateSet& J_i, int n) {
+  double sum = 0;
+  for (int i = 0; i < n; ++i) {
+    const size_t a = rng->inclusive(0, J_i.size() - 1)();
+    const size_t b = rng->inclusive(0, J_i.size() - 1)();
+    if (a == b) { --i; continue; }
+    double p = J_i[a].ewords.size();
+    p -= J_i[b].ewords.size();
+    sum += p * p; // mean is 0 by construction
+  }
+  return max(sqrt(sum / n), 2.0);
+};
+
+void Sample(const int gamma,
             const unsigned xi,
             const training::CandidateSet& J_i,
             const EvaluationMetric* metric,
             vector* pv) {
+  const double len_stddev = LengthDifferenceStdDev(J_i, 5000);
   const bool invert_score = metric->IsErrorMetric();
   vector v1, v2;
   float avg_diff = 0;
-  for (unsigned i = 0; i < gamma; ++i) {
+  const double z_score_threshold=2;
+  for (int i = 0; i < gamma; ++i) {
     const size_t a = rng->inclusive(0, J_i.size() - 1)();
     const size_t b = rng->inclusive(0, J_i.size() - 1)();
-    if (a == b) continue;
+    if (a == b) { --i; continue; }
+    double z_score = fabs(((int)J_i[a].ewords.size() - (int)J_i[b].ewords.size()) / len_stddev);
+    // variation on Nakov et al. (2011)
+    if (z_score > z_score_threshold) { --i; continue; }
     float ga = metric->ComputeScore(J_i[a].eval_feats);
     float gb = metric->ComputeScore(J_i[b].eval_feats);
     bool positive = gb < ga;
     if (invert_score) positive = !positive;
     const float gdiff = fabs(ga - gb);
+    //cerr << ((int)J_i[a].ewords.size() - (int)J_i[b].ewords.size()) << endl;
+    //cerr << (ga - gb) << endl;
     if (!gdiff) continue;
     avg_diff += gdiff;
     SparseVector xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros();
-- 
cgit v1.2.3