summaryrefslogtreecommitdiff
path: root/training
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
committerChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
commit851e389dffdd6996ea32d70defb8906de80b9edc (patch)
tree8c68ee77205badc056b8ab5b332e67e3e98017df /training
parentdc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'training')
-rw-r--r--training/Makefile.am20
-rw-r--r--training/atools.cc96
-rwxr-xr-xtraining/cluster-ptrain.pl52
-rwxr-xr-xtraining/make-lexcrf-grammar.pl73
4 files changed, 210 insertions, 31 deletions
diff --git a/training/Makefile.am b/training/Makefile.am
index c4c22fa2..944c75f7 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -10,32 +10,32 @@ bin_PROGRAMS = \
optimize_test
atools_SOURCES = atools.cc
-atools_LDADD = $(top_srcdir)/src/libhg.a -lz
+atools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
model1_SOURCES = model1.cc
-model1_LDADD = $(top_srcdir)/src/libhg.a -lz
+model1_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
grammar_convert_SOURCES = grammar_convert.cc
-grammar_convert_LDADD = $(top_srcdir)/src/libhg.a -lz
+grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
optimize_test_SOURCES = optimize_test.cc optimize.cc
-optimize_test_LDADD = $(top_srcdir)/src/libhg.a -lz
+optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
collapse_weights_SOURCES = collapse_weights.cc
-collapse_weights_LDADD = $(top_srcdir)/src/libhg.a -lz
+collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
lbfgs_test_SOURCES = lbfgs_test.cc
-lbfgs_test_LDADD = $(top_srcdir)/src/libhg.a -lz
+lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc
-mr_optimize_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz
+mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
mr_em_train_SOURCES = mr_em_train.cc
-mr_em_train_LDADD = $(top_srcdir)/src/libhg.a -lz
+mr_em_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
plftools_SOURCES = plftools.cc
-plftools_LDADD = $(top_srcdir)/src/libhg.a -lz
+plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB)
diff --git a/training/atools.cc b/training/atools.cc
index bac73859..a18250f7 100644
--- a/training/atools.cc
+++ b/training/atools.cc
@@ -2,6 +2,7 @@
#include <sstream>
#include <vector>
+#include <queue>
#include <map>
#include <boost/program_options.hpp>
#include <boost/shared_ptr.hpp>
@@ -105,6 +106,99 @@ struct IntersectCommand : public Command {
}
};
+// Two-operand command registered as "union": after calling
+// EnsureSize(a, b, x), every cell of *x is set to the logical OR of the
+// corresponding Safe() lookups in the two input alignment matrices.
+struct UnionCommand : public Command {
+  string Name() const { return "union"; }
+  bool RequiresTwoOperands() const { return true; }
+  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+    EnsureSize(a, b, x);
+    Array2D<bool>& merged = *x;
+    for (int i = 0; i < merged.width(); ++i) {
+      for (int j = 0; j < merged.height(); ++j) {
+        // b is only consulted when a does not already have the link,
+        // matching the short-circuit of a plain `||`.
+        const bool linked = Safe(a, i, j) ? true : Safe(b, i, j);
+        merged(i, j) = linked;
+      }
+    }
+  }
+};
+
+// Shared machinery for two-operand alignment refinement commands.
+// Subclasses call InitRefine() to build the intersection (in_) and union
+// (un_) of the two input alignments, then Grow() to extend the intersection
+// through neighboring union points. The neighbor offsets live in neighbors_
+// so that subclasses can register additional ones in their constructors.
+struct RefineCommand : public Command {
+ // Registers the four axis-aligned neighbor offsets.
+ RefineCommand() {
+ neighbors_.push_back(make_pair(1,0));
+ neighbors_.push_back(make_pair(-1,0));
+ neighbors_.push_back(make_pair(0,1));
+ neighbors_.push_back(make_pair(0,-1));
+ }
+ bool RequiresTwoOperands() const { return true; }
+ protected:
+ // Resets all per-call state and fills in_/un_ from inputs a and b.
+ // x, in_ and un_ are each passed through EnsureSize(a, b, ...); the
+ // per-index flags are resized to x's width/height and start out false.
+ void InitRefine(
+ const Array2D<bool>& a,
+ const Array2D<bool>& b,
+ Array2D<bool>* x) {
+ EnsureSize(a, b, x);
+ in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear();
+ EnsureSize(a, b, &in_);
+ EnsureSize(a, b, &un_);
+ is_i_aligned_.resize(x->width(), false);
+ is_j_aligned_.resize(x->height(), false);
+ for (int i = 0; i < in_.width(); ++i)
+ for (int j = 0; j < in_.height(); ++j) {
+ un_(i, j) = Safe(a, i, j) || Safe(b, i, j);
+ in_(i, j) = Safe(a, i, j) && Safe(b, i, j);
+ }
+ }
+ // "grow" the intersection alignment with neighboring points
+ // from the union alignment
+ // Every intersection point is aligned in *x and queued; then, breadth
+ // first, each neighbor (per neighbors_) of a dequeued point that is set in
+ // un_ and not yet set in *x is aligned and queued in turn.
+ void Grow(Array2D<bool>* x) {
+ Array2D<bool>& res = *x;
+ queue<pair<int, int> > q;
+ for (int i = 0; i < in_.width(); ++i)
+ for (int j = 0; j < in_.height(); ++j)
+ if (in_(i, j)) {
+ Align(i, j, x);
+ q.push(make_pair(i, j));
+ }
+ while(!q.empty()) {
+ const pair<int,int> point = q.front();
+ q.pop();
+ for (int k = 0; k < neighbors_.size(); ++k) {
+ const int test_i = neighbors_[k].first + point.first;
+ const int test_j = neighbors_[k].second + point.second;
+ // test_i/test_j can fall outside the matrix (e.g. -1); this assumes
+ // Safe() yields false for such indices, guarding the res() access
+ // -- TODO confirm against Safe's definition.
+ if (Safe(un_, test_i, test_j) && !res(test_i, test_j)) {
+ Align(test_i, test_j, x);
+ q.push(make_pair(test_i, test_j));
+ }
+ }
+ }
+ }
+ // NOTE(review): empty body -- calling Final() currently changes nothing,
+ // and both parameters are unused. Presumably a placeholder for a "final"
+ // refinement step; confirm the intended behavior before relying on it.
+ void Final(bool do_and, Array2D<bool>* x) {
+ }
+ // Sets cell (i, j) in *x and flags index i and index j as aligned.
+ void Align(int i, int j, Array2D<bool>* x) {
+ (*x)(i, j) = true;
+ is_i_aligned_[i] = true;
+ is_j_aligned_[j] = true;
+ }
+ Array2D<bool> in_; // intersection alignment
+ Array2D<bool> un_; // union alignment
+ // Written by Align(); not read anywhere in this struct.
+ vector<bool> is_i_aligned_;
+ vector<bool> is_j_aligned_;
+ // (di, dj) offsets examined around each point during Grow().
+ vector<pair<int,int> > neighbors_;
+};
+
+// Refinement base whose neighborhood also contains the four diagonal
+// offsets, appended after the offsets RefineCommand's constructor registers.
+struct DiagCommand : public RefineCommand {
+  DiagCommand() {
+    // Appended in this order: (1,1), (-1,1), (1,-1), (-1,-1).
+    const int diagonals[4][2] = { {1, 1}, {-1, 1}, {1, -1}, {-1, -1} };
+    for (int d = 0; d < 4; ++d) {
+      neighbors_.push_back(make_pair(diagonals[d][0], diagonals[d][1]));
+    }
+  }
+};
+
+// Two-operand command registered under the name "gdf".
+// Apply() builds the intersection/union state with InitRefine(), grows the
+// intersection with Grow() using the neighbor offsets inherited from
+// DiagCommand, and then calls Final(false, x).
+// NOTE(review): "gdf" presumably abbreviates grow-diag-final; Final() has an
+// empty body in RefineCommand, so the last step has no effect -- confirm.
+struct GDFCommand : public DiagCommand {
+ string Name() const { return "gdf"; }
+ void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+ InitRefine(a, b, x);
+ Grow(x);
+ Final(false, x);
+ }
+};
+
map<string, boost::shared_ptr<Command> > commands;
void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -163,6 +257,8 @@ int main(int argc, char **argv) {
AddCommand<ConvertCommand>();
AddCommand<InvertCommand>();
AddCommand<IntersectCommand>();
+ AddCommand<UnionCommand>();
+ AddCommand<GDFCommand>();
AddCommand<FMeasureCommand>();
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl
index 9f7c1569..8b06f162 100755
--- a/training/cluster-ptrain.pl
+++ b/training/cluster-ptrain.pl
@@ -8,7 +8,7 @@ my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluati
my $CWD=`pwd`; chomp $CWD;
my $BIN_DIR = $SCRIPT_DIR;
my $OPTIMIZER = "$BIN_DIR/mr_optimize_reduce";
-my $DECODER = "$BIN_DIR/../src/cdec";
+my $DECODER = "$BIN_DIR/../decoder/cdec";
my $COMBINER_CACHE_SIZE = 150;
# This is a hack to run this on a weird cluster,
# eventually, I'll provide Hadoop scripts.
@@ -19,32 +19,35 @@ my $restart = '';
if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
my $pmem="2500mb";
-my $nodes = 36;
+my $nodes = 1;
my $max_iteration = 1000;
my $PRIOR_FLAG = "";
my $parallel = 1;
my $CFLAG = "-C 1";
my $LOCAL;
+my $DISTRIBUTED;
my $PRIOR;
my $OALG = "lbfgs";
my $sigsq = 1;
my $means_file;
-GetOptions("decoder=s" => \$DECODER,
+# Command-line options. --gaussian_prior stays registered: $PRIOR still
+# gates the construction of $PRIOR_FLAG further down and the usage text
+# still advertises the flag, so dropping it would make the prior unreachable.
+GetOptions("cdec=s" => \$DECODER,
"run_locally" => \$LOCAL,
"gaussian_prior" => \$PRIOR,
+ "distributed" => \$DISTRIBUTED,
"sigma_squared=f" => \$sigsq,
"means=s" => \$means_file,
"optimizer=s" => \$OALG,
+ "jobs=i" => \$nodes,
"pmem=s" => \$pmem
) or usage();
usage() unless scalar @ARGV==3;
my $config_file = shift @ARGV;
my $training_corpus = shift @ARGV;
my $initial_weights = shift @ARGV;
+unless ($DISTRIBUTED) { $LOCAL = 1; }
die "Can't find $config_file" unless -f $config_file;
die "Can't find $DECODER" unless -f $DECODER;
die "Can't execute $DECODER" unless -x $DECODER;
-if ($LOCAL) { print STDERR "Will running LOCALLY.\n"; $parallel = 0; }
+if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; }
if ($PRIOR) {
$PRIOR_FLAG="-p --sigma_squared $sigsq";
if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; }
@@ -56,20 +59,23 @@ if ($parallel) {
}
unless ($parallel) { $CFLAG = "-C 500"; }
unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; }
+my $clines = num_lines($training_corpus);
print STDERR <<EOT;
PTRAIN CONFIGURATION INFORMATION
Config file: $config_file
Training corpus: $training_corpus
+ Corpus size: $clines
Initial weights: $initial_weights
Decoder memory: $pmem
- Nodes requested: $nodes
Max iterations: $max_iteration
Optimizer: $OALG
- PRIOR: $PRIOR_FLAG
- restart: $restart
+ Jobs requested: $nodes
+ prior?: $PRIOR_FLAG
+ restart?: $restart
EOT
+
if ($OALG) { $OALG="-m $OALG"; }
my $nodelist="1";
@@ -142,5 +148,33 @@ while ($iter < $max_iteration) {
print "FINAL WEIGHTS: $dir/weights.$iter\n";
+# Dies with the command-line help text. Option names listed here mirror
+# the names registered with GetOptions (e.g. --optimizer, --jobs).
sub usage {
- die "Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init\n";
+ die <<EOT;
+
+Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init
+
+ Options:
+
+ --distributed Parallelize function evaluation
+ --jobs N Number of jobs requested (default: 1)
+ --cdec PATH Path to cdec binary
+ --optimizer OPT lbfgs, rprop, sgd
+ --gaussian_prior add Gaussian prior
+ --means FILE if you want means other than 0
+ --sigma_squared S variance on prior
+ --pmem MEM Memory required for decoder
+
+EOT
+}
+
+# Returns the number of lines in $file. Names ending in .gz are read
+# through zcat. Dies if the file or the zcat pipe cannot be opened.
+sub num_lines {
+  my $file = shift;
+  my $fh;
+  if ($file =~ /\.gz$/) {
+    # List-form pipe open: the file name reaches zcat as a single argument
+    # and is never interpreted by a shell (spaces, quotes, ';' are safe).
+    open($fh, '-|', 'zcat', $file) or die "Couldn't fork zcat $file: $!";
+  } else {
+    # Three-argument open: the name is taken literally, so leading or
+    # trailing whitespace and mode characters in it cannot change the mode.
+    open($fh, '<', $file) or die "Couldn't read $file: $!";
+  }
+  my $lines = 0;
+  while (<$fh>) { $lines++; }
+  close $fh;
+  return $lines;
}
diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl
index 0e290492..8cdf7718 100755
--- a/training/make-lexcrf-grammar.pl
+++ b/training/make-lexcrf-grammar.pl
@@ -17,23 +17,27 @@ while(<M1>) {
}
my $ADD_MODEL1 = 0; # found that model1 hurts performance
-my $IS_FRENCH_F = 0; # indicates that the f language is french
-my $IS_ARABIC_F = 1; # indicates that the f language is arabic
+my $IS_FRENCH_F = 1; # indicates that the f language is french
+my $IS_ARABIC_F = 0; # indicates that the f language is arabic
+my $IS_URDU_F = 0; # indicates that the f language is urdu
my $ADD_PREFIX_ID = 0;
my $ADD_LEN = 1;
-my $ADD_LD = 0;
+my $ADD_SIM = 1;
my $ADD_DICE = 1;
my $ADD_111 = 1;
my $ADD_ID = 1;
my $ADD_PUNC = 1;
my $ADD_NUM_MM = 1;
my $ADD_NULL = 1;
+my $ADD_STEM_ID = 1;
my $BEAM_RATIO = 50;
my %fdict;
my %fcounts;
my %ecounts;
+my %sdict;
+
while(<EF>) {
chomp;
my ($f, $e) = split /\s*\|\|\|\s*/;
@@ -56,10 +60,11 @@ print STDERR "PuncMiss 0\n" if $ADD_PUNC;
print STDERR "IsNull 0\n" if $ADD_NULL;
print STDERR "Model1 0\n" if $ADD_MODEL1;
print STDERR "DLen 0\n" if $ADD_LEN;
-print STDERR "NumMM 0\n" if $ADD_NUM_MM;
-print STDERR "Level 0\n" if $ADD_LD;
+print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM;
+print STDERR "OrthoSim 0\n" if $ADD_SIM;
print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID);
my $fc = 1000000;
+my $sids = 1000000;
for my $f (sort keys %fdict) {
my $re = $fdict{$f};
my $max;
@@ -72,7 +77,6 @@ for my $f (sort keys %fdict) {
my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
my $feats = "F$fc=1";
my $oe = $e;
- my $len_e = length($oe);
my $of = $f; # normalized form
if ($IS_FRENCH_F) {
# see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French
@@ -85,7 +89,27 @@ for my $f (sort keys %fdict) {
if (length($of) > 1 && !($of =~ /\d/)) {
$of =~ s/\$/sh/g;
}
+ } elsif ($IS_URDU_F) {
+ if (length($of) > 1 && !($of =~ /\d/)) {
+ $of =~ s/\$/sh/g;
+ }
+ $oe =~ s/^-e-//;
+ $oe =~ s/^al-/al/;
+ $of =~ s/([a-z])\~/$1$1/g;
+ $of =~ s/E/'/g;
+ $of =~ s/^Aw/o/g;
+ $of =~ s/\|/a/g;
+ $of =~ s/@/h/g;
+ $of =~ s/c/ch/g;
+ $of =~ s/x/kh/g;
+ $of =~ s/\*/dh/g;
+ $of =~ s/w/o/g;
+ $of =~ s/Z/dh/g;
+ $of =~ s/y/i/g;
+ $of =~ s/Y/a/g;
+ $of = lc $of;
}
+ my $len_e = length($oe);
my $len_f = length($of);
$feats .= " Model1=$m1" if ($ADD_MODEL1);
$feats .= " Dice=$dice" if $ADD_DICE;
@@ -100,12 +124,35 @@ for my $f (sort keys %fdict) {
$feats .= " DLen=$dlen";
}
}
- my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/); # this matches *two digit* and more numbers
- my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/);
+ my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
+ my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
my $both_non_numeric = (!$e_num && !$f_num);
if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) {
$feats .= " NumMM=1";
}
+ if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) {
+ $feats .= " NumMatch=1";
+ }
+ # Stem-pair feature: truncate both normalized words to a short prefix
+ # (4 characters, longer when the word starts with a known prefix) and
+ # emit one indicator feature S<id> per distinct (f-stem, e-stem) pair.
+ # New ids are allocated on first sight and announced on STDERR.
+ if ($ADD_STEM_ID) {
+   my $el = 4;
+   my $fl = 4;
+   # The alternations are grouped so that ^ anchors every alternative;
+   # ungrouped, /^al|re|co/ also matches "re" or "co" anywhere in the word.
+   if ($oe =~ /^(?:al|re|co)/) { $el++; }
+   if ($of =~ /^(?:al|re|co)/) { $fl++; }
+   if ($oe =~ /^(?:trans|inter)/) { $el+=2; }
+   if ($of =~ /^(?:trans|inter)/) { $fl+=2; }
+   # Never ask for more characters than the word has.
+   if ($fl > length($of)) { $fl = length($of); }
+   if ($el > length($oe)) { $el = length($oe); }
+   my $sf = substr $of, 0, $fl;
+   my $se = substr $oe, 0, $el;
+   my $id = $sdict{$sf}->{$se};
+   if (!$id) {
+     $sids++;
+     $sdict{$sf}->{$se} = $sids;
+     $id = $sids;
+     print STDERR "S$sids 0\n";
+   }
+   $feats .= " S$id=1";
+ }
if ($ADD_PREFIX_ID) {
if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
my $pe = substr $oe, 0, 3;
@@ -113,12 +160,14 @@ for my $f (sort keys %fdict) {
if ($pe eq $pf) { $feats .= " PfxIdentical=1"; }
}
}
- if ($ADD_LD) {
+ if ($ADD_SIM) {
my $ld = 0;
- if ($is_null) { $ld = length($e); } else {
- $ld = levenshtein($e, $f);
+ my $eff = $len_e;
+ if ($eff < $len_f) { $eff = $len_f; }
+ if (!$is_null) {
+ $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
}
- $feats .= " Leven=$ld";
+ $feats .= " OrthoSim=$ld";
}
my $ident = ($e eq $f);
if ($ident && $ADD_ID) { $feats .= " Identical=1"; }