diff options
author | Chris Dyer <redpony@gmail.com> | 2009-12-14 20:35:11 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2009-12-14 20:35:11 -0500 |
commit | 851e389dffdd6996ea32d70defb8906de80b9edc (patch) | |
tree | 8c68ee77205badc056b8ab5b332e67e3e98017df /training | |
parent | dc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff) |
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'training')
-rw-r--r-- | training/Makefile.am | 20 | ||||
-rw-r--r-- | training/atools.cc | 96 | ||||
-rwxr-xr-x | training/cluster-ptrain.pl | 52 | ||||
-rwxr-xr-x | training/make-lexcrf-grammar.pl | 73 |
4 files changed, 210 insertions, 31 deletions
diff --git a/training/Makefile.am b/training/Makefile.am index c4c22fa2..944c75f7 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -10,32 +10,32 @@ bin_PROGRAMS = \ optimize_test atools_SOURCES = atools.cc -atools_LDADD = $(top_srcdir)/src/libhg.a -lz +atools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz model1_SOURCES = model1.cc -model1_LDADD = $(top_srcdir)/src/libhg.a -lz +model1_LDADD = $(top_srcdir)/decoder/libcdec.a -lz grammar_convert_SOURCES = grammar_convert.cc -grammar_convert_LDADD = $(top_srcdir)/src/libhg.a -lz +grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a -lz optimize_test_SOURCES = optimize_test.cc optimize.cc -optimize_test_LDADD = $(top_srcdir)/src/libhg.a -lz +optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz collapse_weights_SOURCES = collapse_weights.cc -collapse_weights_LDADD = $(top_srcdir)/src/libhg.a -lz +collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz lbfgs_test_SOURCES = lbfgs_test.cc -lbfgs_test_LDADD = $(top_srcdir)/src/libhg.a -lz +lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc -mr_optimize_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_em_train_SOURCES = mr_em_train.cc -mr_em_train_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_em_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz plftools_SOURCES = plftools.cc -plftools_LDADD = $(top_srcdir)/src/libhg.a -lz +plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) diff --git a/training/atools.cc b/training/atools.cc index bac73859..a18250f7 100644 --- a/training/atools.cc +++ b/training/atools.cc @@ -2,6 +2,7 @@ #include <sstream> #include <vector> +#include <queue> #include <map> #include <boost/program_options.hpp> #include <boost/shared_ptr.hpp> @@ -105,6 +106,99 @@ struct IntersectCommand : public Command { } }; +struct UnionCommand : public Command { + string Name() const { return "union"; } + bool RequiresTwoOperands() const { return true; } + void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) { + EnsureSize(a, b, x); + Array2D<bool>& res = *x; + for (int i = 0; i < res.width(); ++i) + for (int j = 0; j < res.height(); ++j) + res(i, j) = Safe(a, i, j) || Safe(b, i, j); + } +}; + +struct RefineCommand : public Command { + RefineCommand() { + neighbors_.push_back(make_pair(1,0)); + neighbors_.push_back(make_pair(-1,0)); + neighbors_.push_back(make_pair(0,1)); + neighbors_.push_back(make_pair(0,-1)); + } + bool RequiresTwoOperands() const { return true; } + protected: + void InitRefine( + const Array2D<bool>& a, + const Array2D<bool>& b, + Array2D<bool>* x) { + EnsureSize(a, b, x); + in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear(); + EnsureSize(a, b, &in_); + EnsureSize(a, b, &un_); + is_i_aligned_.resize(x->width(), false); + is_j_aligned_.resize(x->height(), false); + for (int i = 0; i < in_.width(); ++i) + for (int j = 0; j < in_.height(); ++j) { + un_(i, j) = Safe(a, i, j) || Safe(b, i, j); + in_(i, j) = Safe(a, i, j) && Safe(b, i, j); + } + } + // "grow" the intersection alignment with neighboring points + // from the union alignment + void Grow(Array2D<bool>* x) { + Array2D<bool>& res = *x; + queue<pair<int, int> > q; + for (int i = 0; i < in_.width(); ++i) + for (int j = 0; j < in_.height(); ++j) + if (in_(i, j)) { + Align(i, j, x); + q.push(make_pair(i, j)); + } + while(!q.empty()) { + const pair<int,int> point = q.front(); + q.pop(); + for (int k = 0; k < neighbors_.size(); ++k) { + const int test_i = neighbors_[k].first + point.first; + const int test_j = neighbors_[k].second + point.second; + if (Safe(un_, test_i, test_j) && !res(test_i, test_j)) { + Align(test_i, test_j, x); + q.push(make_pair(test_i, test_j)); + } + } + } + } + void Final(bool do_and, Array2D<bool>* x) { + } + void Align(int i, int j, Array2D<bool>* x) { + (*x)(i, j) = true; + is_i_aligned_[i] = true; + is_j_aligned_[j] = true; + } + Array2D<bool> in_; // intersection alignment + Array2D<bool> un_; // union alignment + vector<bool> is_i_aligned_; + vector<bool> is_j_aligned_; + vector<pair<int,int> > neighbors_; +}; + +struct DiagCommand : public RefineCommand { + DiagCommand() { + neighbors_.push_back(make_pair(1,1)); + neighbors_.push_back(make_pair(-1,1)); + neighbors_.push_back(make_pair(1,-1)); + neighbors_.push_back(make_pair(-1,-1)); + } +}; + +struct GDFCommand : public DiagCommand { + string Name() const { return "gdf"; } + void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) { + InitRefine(a, b, x); + Grow(x); + Final(false, x); + } +}; + map<string, boost::shared_ptr<Command> > commands; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -163,6 +257,8 @@ int main(int argc, char **argv) { AddCommand<ConvertCommand>(); AddCommand<InvertCommand>(); AddCommand<IntersectCommand>(); + AddCommand<UnionCommand>(); + AddCommand<GDFCommand>(); AddCommand<FMeasureCommand>(); po::variables_map conf; InitCommandLine(argc, argv, &conf); diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl index 9f7c1569..8b06f162 100755 --- a/training/cluster-ptrain.pl +++ b/training/cluster-ptrain.pl @@ -8,7 +8,7 @@ my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluati my $CWD=`pwd`; chomp $CWD; my $BIN_DIR = $SCRIPT_DIR; my $OPTIMIZER = "$BIN_DIR/mr_optimize_reduce"; -my $DECODER = "$BIN_DIR/../src/cdec"; +my $DECODER = "$BIN_DIR/../decoder/cdec"; my $COMBINER_CACHE_SIZE = 150; # This is a hack to run this on a weird cluster, # eventually, I'll provide Hadoop scripts. @@ -19,32 +19,35 @@ my $restart = ''; if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } my $pmem="2500mb"; -my $nodes = 36; +my $nodes = 1; my $max_iteration = 1000; my $PRIOR_FLAG = ""; my $parallel = 1; my $CFLAG = "-C 1"; my $LOCAL; +my $DISTRIBUTED; my $PRIOR; my $OALG = "lbfgs"; my $sigsq = 1; my $means_file; -GetOptions("decoder=s" => \$DECODER, +GetOptions("cdec=s" => \$DECODER, "run_locally" => \$LOCAL, - "gaussian_prior" => \$PRIOR, + "distributed" => \$DISTRIBUTED, "sigma_squared=f" => \$sigsq, "means=s" => \$means_file, "optimizer=s" => \$OALG, + "jobs=i" => \$nodes, "pmem=s" => \$pmem ) or usage(); usage() unless scalar @ARGV==3; my $config_file = shift @ARGV; my $training_corpus = shift @ARGV; my $initial_weights = shift @ARGV; +unless ($DISTRIBUTED) { $LOCAL = 1; } die "Can't find $config_file" unless -f $config_file; die "Can't find $DECODER" unless -f $DECODER; die "Can't execute $DECODER" unless -x $DECODER; -if ($LOCAL) { print STDERR "Will running LOCALLY.\n"; $parallel = 0; } +if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; } if ($PRIOR) { $PRIOR_FLAG="-p --sigma_squared $sigsq"; if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; } @@ -56,20 +59,23 @@ if ($parallel) { } unless ($parallel) { $CFLAG = "-C 500"; } unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; } +my $clines = num_lines($training_corpus); print STDERR <<EOT; PTRAIN CONFIGURATION INFORMATION Config file: $config_file Training corpus: $training_corpus + Corpus size: $clines Initial weights: $initial_weights Decoder memory: $pmem - Nodes requested: $nodes Max iterations: $max_iteration Optimizer: $OALG - PRIOR: $PRIOR_FLAG - restart: $restart + Jobs requested: $nodes + prior?: $PRIOR_FLAG + restart?: $restart EOT + if ($OALG) { $OALG="-m $OALG"; } my $nodelist="1"; @@ -142,5 +148,33 @@ while ($iter < $max_iteration) { print "FINAL WEIGHTS: $dir/weights.$iter\n"; sub usage { - die "Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init\n"; + die <<EOT; + +Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init + + Options: + + --distributed Parallelize function evaluation + --cdec PATH Path to cdec binary + --optimize OPT lbfgs, rprop, sgd + --gaussian_prior add Gaussian prior + --means FILE if you want means other than 0 + --sigma_squared S variance on prior + --pmem MEM Memory required for decoder + +EOT +} + +sub num_lines { + my $file = shift; + my $fh; + if ($file=~ /\.gz$/) { + open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!"; + } else { + open $fh, "<$file" or die "Couldn't read $file: $!"; + } + my $lines = 0; + while(<$fh>) { $lines++; } + close $fh; + return $lines; } diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl index 0e290492..8cdf7718 100755 --- a/training/make-lexcrf-grammar.pl +++ b/training/make-lexcrf-grammar.pl @@ -17,23 +17,27 @@ while(<M1>) { } my $ADD_MODEL1 = 0; # found that model1 hurts performance -my $IS_FRENCH_F = 0; # indicates that the f language is french -my $IS_ARABIC_F = 1; # indicates that the f language is arabic +my $IS_FRENCH_F = 1; # indicates that the f language is french +my $IS_ARABIC_F = 0; # indicates that the f language is arabic +my $IS_URDU_F = 0; # indicates that the f language is arabic my $ADD_PREFIX_ID = 0; my $ADD_LEN = 1; -my $ADD_LD = 0; +my $ADD_SIM = 1; my $ADD_DICE = 1; my $ADD_111 = 1; my $ADD_ID = 1; my $ADD_PUNC = 1; my $ADD_NUM_MM = 1; my $ADD_NULL = 1; +my $ADD_STEM_ID = 1; my $BEAM_RATIO = 50; my %fdict; my %fcounts; my %ecounts; +my %sdict; + while(<EF>) { chomp; my ($f, $e) = split /\s*\|\|\|\s*/; @@ -56,10 +60,11 @@ print STDERR "PuncMiss 0\n" if $ADD_PUNC; print STDERR "IsNull 0\n" if $ADD_NULL; print STDERR "Model1 0\n" if $ADD_MODEL1; print STDERR "DLen 0\n" if $ADD_LEN; -print STDERR "NumMM 0\n" if $ADD_NUM_MM; -print STDERR "Level 0\n" if $ADD_LD; +print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM; +print STDERR "OrthoSim 0\n" if $ADD_SIM; print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID); my $fc = 1000000; +my $sids = 1000000; for my $f (sort keys %fdict) { my $re = $fdict{$f}; my $max; @@ -72,7 +77,6 @@ for my $f (sort keys %fdict) { my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); my $feats = "F$fc=1"; my $oe = $e; - my $len_e = length($oe); my $of = $f; # normalized form if ($IS_FRENCH_F) { # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French @@ -85,7 +89,27 @@ for my $f (sort keys %fdict) { if (length($of) > 1 && !($of =~ /\d/)) { $of =~ s/\$/sh/g; } + } elsif ($IS_URDU_F) { + if (length($of) > 1 && !($of =~ /\d/)) { + $of =~ s/\$/sh/g; + } + $oe =~ s/^-e-//; + $oe =~ s/^al-/al/; + $of =~ s/([a-z])\~/$1$1/g; + $of =~ s/E/'/g; + $of =~ s/^Aw/o/g; + $of =~ s/\|/a/g; + $of =~ s/@/h/g; + $of =~ s/c/ch/g; + $of =~ s/x/kh/g; + $of =~ s/\*/dh/g; + $of =~ s/w/o/g; + $of =~ s/Z/dh/g; + $of =~ s/y/i/g; + $of =~ s/Y/a/g; + $of = lc $of; } + my $len_e = length($oe); my $len_f = length($of); $feats .= " Model1=$m1" if ($ADD_MODEL1); $feats .= " Dice=$dice" if $ADD_DICE; @@ -100,12 +124,35 @@ for my $f (sort keys %fdict) { $feats .= " DLen=$dlen"; } } - my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/); # this matches *two digit* and more numbers - my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/); + my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); + my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); my $both_non_numeric = (!$e_num && !$f_num); if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) { $feats .= " NumMM=1"; } + if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) { + $feats .= " NumMatch=1"; + } + if ($ADD_STEM_ID) { + my $el = 4; + my $fl = 4; + if ($oe =~ /^al|re|co/) { $el++; } + if ($of =~ /^al|re|co/) { $fl++; } + if ($oe =~ /^trans|inter/) { $el+=2; } + if ($of =~ /^trans|inter/) { $fl+=2; } + if ($fl > length($of)) { $fl = length($of); } + if ($el > length($oe)) { $el = length($oe); } + my $sf = substr $of, 0, $fl; + my $se = substr $oe, 0, $el; + my $id = $sdict{$sf}->{$se}; + if (!$id) { + $sids++; + $sdict{$sf}->{$se} = $sids; + $id = $sids; + print STDERR "S$sids 0\n" + } + $feats .= " S$id=1"; + } if ($ADD_PREFIX_ID) { if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { my $pe = substr $oe, 0, 3; @@ -113,12 +160,14 @@ for my $f (sort keys %fdict) { if ($pe eq $pf) { $feats .= " PfxIdentical=1"; } } } - if ($ADD_LD) { + if ($ADD_SIM) { my $ld = 0; - if ($is_null) { $ld = length($e); } else { - $ld = levenshtein($e, $f); + my $eff = $len_e; + if ($eff < $len_f) { $eff = $len_f; } + if (!$is_null) { + $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); } - $feats .= " Leven=$ld"; + $feats .= " OrthoSim=$ld"; } my $ident = ($e eq $f); if ($ident && $ADD_ID) { $feats .= " Identical=1"; } |