From 1afbff874473c79619ce74cdf90f3c312185e4e1 Mon Sep 17 00:00:00 2001
From: Guest_account Guest_account prguest11 <prguest11@taipan.cs>
Date: Sat, 17 Sep 2011 01:39:07 +0100
Subject: add dep

---
 training/cluster-em.pl          | 114 ----------------
 training/cluster-ptrain.pl      | 206 -----------------------------
 training/compute_cllh.cc        | 196 ---------------------------
 training/make-lexcrf-grammar.pl | 285 ----------------------------------------
 training/mpi_compute_cllh.cc    | 196 +++++++++++++++++++++++++++
 5 files changed, 196 insertions(+), 801 deletions(-)
 delete mode 100755 training/cluster-em.pl
 delete mode 100755 training/cluster-ptrain.pl
 delete mode 100644 training/compute_cllh.cc
 delete mode 100755 training/make-lexcrf-grammar.pl
 create mode 100644 training/mpi_compute_cllh.cc

(limited to 'training')

diff --git a/training/cluster-em.pl b/training/cluster-em.pl
deleted file mode 100755
index 267ab642..00000000
--- a/training/cluster-em.pl
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-use Getopt::Long;
-my $parallel = 0;
-
-my $CWD=`pwd`; chomp $CWD;
-my $BIN_DIR = "$CWD/..";
-my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce";
-my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights";
-my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter";
-my $DECODER = "$BIN_DIR/decoder/cdec";
-my $COMBINER_CACHE_SIZE = 10000000;
-my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $REDUCER" unless -f $REDUCER;
-die "Can't execute $REDUCER" unless -x $REDUCER;
-die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS;
-die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS;
-die "Can't find $ADAPTER" unless -f $ADAPTER;
-die "Can't execute $ADAPTER" unless -x $ADAPTER;
-die "Can't find $DECODER" unless -f $DECODER;
-die "Can't execute $DECODER" unless -x $DECODER;
-my $restart = '';
-if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-
-die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2);
-
-my $training_corpus = shift @ARGV;
-my $config = shift @ARGV;
-my $pmem="2500mb";
-my $nodes = 40;
-my $max_iteration = 1000;
-my $CFLAG = "-C 1";
-if ($parallel) {
-  die "Can't find $PARALLEL" unless -f $PARALLEL;
-  die "Can't execute $PARALLEL" unless -x $PARALLEL;
-} else { $CFLAG = "-C 500"; }
-
-my $initial_weights = '';
-
-print STDERR <<EOT;
-EM TRAIN CONFIGURATION INFORMATION
-
-      Config file: $config
-  Training corpus: $training_corpus
-  Initial weights: $initial_weights
-   Decoder memory: $pmem
-  Nodes requested: $nodes
-   Max iterations: $max_iteration
-          restart: $restart
-EOT
-
-my $nodelist="1";
-for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
-my $iter = 1;
-
-my $dir = "$CWD/emtrain";
-if ($restart) {
-  die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
-  my $o = `ls -t $dir/weights.*`;
-  my ($a, @x) = split /\n/, $o;
-  if ($a =~ /weights.(\d+)\.gz$/) {
-    $iter = $1;
-  } else {
-    die "Unexpected file: $a!\n";
-  }
-  print STDERR "Restarting at iteration $iter\n";
-} else {
-  die "$dir already exists!\n" if -e $dir;
-  mkdir $dir or die "Can't create $dir: $!";
-
-  if ($initial_weights) {
-    unless ($initial_weights =~ /\.gz$/) {
-      `cp $initial_weights $dir/weights.1`;
-      `gzip -9 $dir/weights.1`;
-    } else {
-      `cp $initial_weights $dir/weights.1.gz`;
-    }
-  }
-}
-
-while ($iter < $max_iteration) {
-  my $cur_time = `date`; chomp $cur_time;
-  print STDERR "\nStarting iteration $iter...\n";
-  print STDERR "  time: $cur_time\n";
-  my $start = time;
-  my $next_iter = $iter + 1;
-  my $WSTR = "-w $dir/weights.$iter.gz";
-  if ($iter == 1) { $WSTR = ''; }
-  my $dec_cmd="$DECODER --feature_expectations -c $config $WSTR $CFLAG < $training_corpus 2> $dir/deco.log.$iter";
-  my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
-  my $cmd = "";
-  if ($parallel) { $cmd = $pcmd; }
-  $cmd .= "$dec_cmd";
-  $cmd .= "| $ADAPTER | sort -k1 | $REDUCER | $REDUCE2WEIGHTS -o $dir/weights.$next_iter.gz";
-  print STDERR "EXECUTING: $cmd\n";
-  my $result = `$cmd`;
-  if ($? != 0) {
-    die "Error running iteration $iter: $!";
-  }
-  chomp $result;
-  my $end = time;
-  my $diff = ($end - $start);
-  print STDERR "  ITERATION $iter TOOK $diff SECONDS\n";
-  $iter = $next_iter;
-  if ($result =~ /1$/) {
-    print STDERR "Training converged.\n";
-    last;
-  }
-}
-
-print "FINAL WEIGHTS: $dir/weights.$iter\n";
-
diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl
deleted file mode 100755
index 03122df9..00000000
--- a/training/cluster-ptrain.pl
+++ /dev/null
@@ -1,206 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-use Getopt::Long;
-
-my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluation
-my $CWD=getcwd();
-my $OPTIMIZER = "$SCRIPT_DIR/mr_optimize_reduce";
-my $DECODER = "$SCRIPT_DIR/../decoder/cdec";
-my $COMBINER_CACHE_SIZE = 150;
-# This is a hack to run this on a weird cluster,
-# eventually, I'll provide Hadoop scripts.
-my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $OPTIMIZER" unless -f $OPTIMIZER;
-die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER;
-my $restart = '';
-if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-
-my $pmem="2500mb";
-my $nodes = 1;
-my $max_iteration = 1000;
-my $PRIOR_FLAG = "";
-my $parallel = 1;
-my $CFLAG = "-C 1";
-my $LOCAL;
-my $DISTRIBUTED;
-my $PRIOR;
-my $OALG = "lbfgs";
-my $sigsq = 1;
-my $means_file;
-my $mem_buffers = 20;
-my $RESTART_IF_NECESSARY;
-GetOptions("cdec=s" => \$DECODER,
-           "distributed" => \$DISTRIBUTED,
-           "sigma_squared=f" => \$sigsq,
-           "lbfgs_memory_buffers=i" => \$mem_buffers,
-           "max_iteration=i" => \$max_iteration,
-           "means=s" => \$means_file,
-           "optimizer=s" => \$OALG,
-           "gaussian_prior" => \$PRIOR,
-           "restart_if_necessary" => \$RESTART_IF_NECESSARY,
-           "jobs=i" => \$nodes,
-           "pmem=s" => \$pmem
-          ) or usage();
-usage() unless scalar @ARGV==3;
-my $config_file = shift @ARGV;
-my $training_corpus = shift @ARGV;
-my $initial_weights = shift @ARGV;
-unless ($DISTRIBUTED) { $LOCAL = 1; }
-die "Can't find $config_file" unless -f $config_file;
-die "Can't find $DECODER" unless -f $DECODER;
-die "Can't execute $DECODER" unless -x $DECODER;
-if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; }
-if ($PRIOR) {
-  $PRIOR_FLAG="-p --sigma_squared $sigsq";
-  if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; }
-}
-
-if ($parallel) {
-  die "Can't find $PARALLEL" unless -f $PARALLEL;
-  die "Can't execute $PARALLEL" unless -x $PARALLEL;
-}
-unless ($parallel) { $CFLAG = "-C 500"; }
-unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; }
-my $clines = num_lines($training_corpus);
-my $dir = "$CWD/ptrain";
-
-if ($RESTART_IF_NECESSARY && -d $dir) {
-  $restart = 1;
-}
-
-print STDERR <<EOT;
-PTRAIN CONFIGURATION INFORMATION
-
-      Config file: $config_file
-  Training corpus: $training_corpus
-      Corpus size: $clines
-  Initial weights: $initial_weights
-   Decoder memory: $pmem
-   Max iterations: $max_iteration
-        Optimizer: $OALG
-   Jobs requested: $nodes
-           prior?: $PRIOR_FLAG
-         restart?: $restart
-EOT
-
-if ($OALG) { $OALG="-m $OALG"; }
-
-my $nodelist="1";
-for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
-my $iter = 1;
-
-if ($restart) {
-  die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
-  my $o = `ls -t $dir/weights.*`;
-  my ($a, @x) = split /\n/, $o;
-  if ($a =~ /weights.(\d+)\.gz$/) {
-    $iter = $1;
-  } else {
-    die "Unexpected file: $a!\n";
-  }
-  print STDERR "Restarting at iteration $iter\n";
-} else {
-  die "$dir already exists!\n" if -e $dir;
-  mkdir $dir or die "Can't create $dir: $!";
-
-  unless ($initial_weights =~ /\.gz$/) {
-    `cp $initial_weights $dir/weights.1`;
-    `gzip -9 $dir/weights.1`;
-  } else {
-    `cp $initial_weights $dir/weights.1.gz`;
-  }
-  open T, "<$training_corpus" or die "Can't read $training_corpus: $!";
-  open TO, ">$dir/training.in";
-  my $lc = 0;
-  while(<T>) {
-    chomp;
-    s/^\s+//;
-    s/\s+$//;
-    die "Expected A ||| B in input file" unless / \|\|\| /;
-    print TO "<seg id=\"$lc\">$_</seg>\n";
-    $lc++;
-  }
-  close T;
-  close TO;
-}
-$training_corpus = "$dir/training.in";
-
-my $iter_attempts = 1;
-while ($iter < $max_iteration) {
-  my $cur_time = `date`; chomp $cur_time;
-  print STDERR "\nStarting iteration $iter...\n";
-  print STDERR "  time: $cur_time\n";
-  my $start = time;
-  my $next_iter = $iter + 1;
-  my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter";
-  my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz";
-  my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
-  my $cmd = "";
-  if ($parallel) { $cmd = $pcmd; }
-  $cmd .= "$dec_cmd | $opt_cmd";
-
-  print STDERR "EXECUTING: $cmd\n";
-  my $result = `$cmd`;
-  my $exit_code = $? >> 8;
-  if ($exit_code == 99) {
-    $iter_attempts++;
-    if ($iter_attempts > $MAX_ITER_ATTEMPTS) {
-      die "Received restart request $iter_attempts times from optimizer, giving up\n";
-    }
-    print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n";
-    next;
-  }
-  if ($? != 0) {
-    die "Error running iteration $iter: $!";
-  }
-  chomp $result;
-  my $end = time;
-  my $diff = ($end - $start);
-  print STDERR "  ITERATION $iter TOOK $diff SECONDS\n";
-  $iter = $next_iter;
-  if ($result =~ /1$/) {
-    print STDERR "Training converged.\n";
-    last;
-  }
-  $iter_attempts = 1;
-}
-
-print "FINAL WEIGHTS: $dir/weights.$iter\n";
-`mv $dir/weights.$iter.gz $dir/weights.final.gz`;
-
-sub usage {
-  die <<EOT;
-
-Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init
-
-  Options:
-
-    --distributed      Parallelize function evaluation
-    --jobs N           Number of jobs to use
-    --cdec PATH        Path to cdec binary
-    --optimize OPT     lbfgs, rprop, sgd
-    --gaussian_prior   add Gaussian prior
-    --means FILE       if you want means other than 0
-    --sigma_squared S  variance on prior
-    --pmem MEM         Memory required for decoder
-    --lbfgs_memory_buffers Number of buffers to use
-                           with LBFGS optimizer
-
-EOT
-}
-
-sub num_lines {
-  my $file = shift;
-  my $fh;
-  if ($file=~ /\.gz$/) {
-    open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!";
-  } else {
-    open $fh, "<$file" or die "Couldn't read $file: $!";
-  }
-  my $lines = 0;
-  while(<$fh>) { $lines++; }
-  close $fh;
-  return $lines;
-}
diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc
deleted file mode 100644
index b496d196..00000000
--- a/training/compute_cllh.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include "config.h"
-#ifdef HAVE_MPI
-#include <boost/mpi.hpp>
-#endif
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "weights.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("weights,w",po::value<string>(),"Input feature weights file")
-        ("training_data,t",po::value<string>(),"Training data corpus")
-        ("decoder_config,c",po::value<string>(),"Decoder configuration file");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* ids) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int lc = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (lc % size == rank) {
-      c->push_back(line);
-      ids->push_back(lc);
-    }
-    ++lc;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
-  void Reset() {
-    acc_obj = 0;
-  } 
-
-  virtual void NotifyDecodingStart(const SentenceMetadata&) {
-    cur_obj = 0;
-    state = 1;
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
-    assert(state == 1);
-    state = 2;
-    SparseVector<prob_t> cur_model_exp;
-    const prob_t z = InsideOutside<prob_t,
-                                   EdgeProb,
-                                   SparseVector<prob_t>,
-                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
-    cur_obj = log(z);
-  }
-
-  // compute "empirical" expectations, numerator of objective
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 2);
-    state = 3;
-    SparseVector<prob_t> ref_exp;
-    const prob_t ref_z = InsideOutside<prob_t,
-                                       EdgeProb,
-                                       SparseVector<prob_t>,
-                                       EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
-
-    double log_ref_z;
-#if 0
-    if (crf_uniform_empirical) {
-      log_ref_z = ref_exp.dot(feature_weights);
-    } else {
-      log_ref_z = log(ref_z);
-    }
-#else
-    log_ref_z = log(ref_z);
-#endif
-
-    // rounding errors means that <0 is too strict
-    if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
-      cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
-      exit(1);
-    }
-    assert(!isnan(log_ref_z));
-    acc_obj += (cur_obj - log_ref_z);
-  }
-
-  double acc_obj;
-  double cur_obj;
-  int state;
-};
-
-#ifdef HAVE_MPI
-namespace mpi = boost::mpi;
-#endif
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  if (size > 1) SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf))
-    return false;
-
-  // load cdec.ini and set up decoder
-  ReadFile ini_rf(conf["decoder_config"].as<string>());
-  Decoder decoder(ini_rf.stream());
-  if (decoder.GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-    abort();
-  }
-
-  // load weights
-  vector<weight_t>& weights = decoder.CurrentWeightVector();
-  if (conf.count("weights"))
-    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
-
-  // freeze feature set
-  //const bool freeze_feature_set = conf.count("freeze_feature_set");
-  //if (freeze_feature_set) FD::Freeze();
-
-  vector<string> corpus; vector<int> ids;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
-  assert(corpus.size() > 0);
-  assert(corpus.size() == ids.size());
-
-  TrainingObserver observer;
-  double objective = 0;
-
-  observer.Reset();
-  if (rank == 0)
-    cerr << "Each processor is decoding " << corpus.size() << " training examples...\n";
-
-  for (int i = 0; i < corpus.size(); ++i) {
-    decoder.SetId(ids[i]);
-    decoder.Decode(corpus[i], &observer);
-  }
-
-#ifdef HAVE_MPI
-  reduce(world, observer.acc_obj, objective, std::plus<double>(), 0);
-#else
-  objective = observer.acc_obj;
-#endif
-
-  if (rank == 0)
-    cout << "OBJECTIVE: " << objective << endl;
-
-  return 0;
-}
-
diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl
deleted file mode 100755
index 8cdf7718..00000000
--- a/training/make-lexcrf-grammar.pl
+++ /dev/null
@@ -1,285 +0,0 @@
-#!/usr/bin/perl -w
-use utf8;
-use strict;
-my ($effile, $model1) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.model1\n" unless $effile && -f $effile && $model1 && -f $model1;
-
-open EF, "<$effile" or die;
-open M1, "<$model1" or die;
-binmode(EF,":utf8");
-binmode(M1,":utf8");
-binmode(STDOUT,":utf8");
-my %model1;
-while(<M1>) {
-  chomp;
-  my ($f, $e, $lp) = split /\s+/;
-  $model1{$f}->{$e} = $lp;
-}
-
-my $ADD_MODEL1 = 0;      # found that model1 hurts performance
-my $IS_FRENCH_F = 1;     # indicates that the f language is french
-my $IS_ARABIC_F = 0;     # indicates that the f language is arabic
-my $IS_URDU_F = 0;     # indicates that the f language is arabic
-my $ADD_PREFIX_ID = 0;
-my $ADD_LEN = 1;
-my $ADD_SIM = 1;
-my $ADD_DICE = 1;
-my $ADD_111 = 1;
-my $ADD_ID = 1;
-my $ADD_PUNC = 1;
-my $ADD_NUM_MM = 1;
-my $ADD_NULL = 1;
-my $ADD_STEM_ID = 1;
-my $BEAM_RATIO = 50;
-
-my %fdict;
-my %fcounts;
-my %ecounts;
-
-my %sdict;
-
-while(<EF>) {
-  chomp;
-  my ($f, $e) = split /\s*\|\|\|\s*/;
-  my @es = split /\s+/, $e;
-  my @fs = split /\s+/, $f;
-  for my $ew (@es){ $ecounts{$ew}++; }
-  push @fs, '<eps>' if $ADD_NULL;
-  for my $fw (@fs){ $fcounts{$fw}++; }
-  for my $fw (@fs){
-    for my $ew (@es){
-      $fdict{$fw}->{$ew}++;
-    }
-  }
-}
-
-print STDERR "Dice 0\n" if $ADD_DICE;
-print STDERR "OneOneOne 0\nId_OneOneOne 0\n" if $ADD_111;
-print STDERR "Identical 0\n" if $ADD_ID;
-print STDERR "PuncMiss 0\n" if $ADD_PUNC;
-print STDERR "IsNull 0\n" if $ADD_NULL;
-print STDERR "Model1 0\n" if $ADD_MODEL1;
-print STDERR "DLen 0\n" if $ADD_LEN;
-print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM;
-print STDERR "OrthoSim 0\n" if $ADD_SIM;
-print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID);
-my $fc = 1000000;
-my $sids = 1000000;
-for my $f (sort keys %fdict) {
-  my $re = $fdict{$f};
-  my $max;
-  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
-    my $efcount = $re->{$e};
-    unless (defined $max) { $max = $efcount; }
-    my $m1 = $model1{$f}->{$e};
-    unless (defined $m1) { next; }
-    $fc++;
-    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
-    my $feats = "F$fc=1";
-    my $oe = $e;
-    my $of = $f;   # normalized form
-    if ($IS_FRENCH_F) {
-      # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French
-      $of =~ s/â/as/g;
-      $of =~ s/ê/es/g;
-      $of =~ s/î/is/g;
-      $of =~ s/ô/os/g;
-      $of =~ s/û/us/g;
-    } elsif ($IS_ARABIC_F) {
-      if (length($of) > 1 && !($of =~ /\d/)) {
-        $of =~ s/\$/sh/g;
-      }
-    } elsif ($IS_URDU_F) {
-      if (length($of) > 1 && !($of =~ /\d/)) {
-        $of =~ s/\$/sh/g;
-      }
-      $oe =~ s/^-e-//;
-      $oe =~ s/^al-/al/;
-      $of =~ s/([a-z])\~/$1$1/g;
-      $of =~ s/E/'/g;
-      $of =~ s/^Aw/o/g;
-      $of =~ s/\|/a/g;
-      $of =~ s/@/h/g;
-      $of =~ s/c/ch/g;
-      $of =~ s/x/kh/g;
-      $of =~ s/\*/dh/g;
-      $of =~ s/w/o/g;
-      $of =~ s/Z/dh/g;
-      $of =~ s/y/i/g;
-      $of =~ s/Y/a/g;
-      $of = lc $of;
-    }
-    my $len_e = length($oe);
-    my $len_f = length($of);
-    $feats .= " Model1=$m1" if ($ADD_MODEL1);
-    $feats .= " Dice=$dice" if $ADD_DICE;
-    my $is_null = undef;
-    if ($ADD_NULL && $f eq '<eps>') {
-      $feats .= " IsNull=1";
-      $is_null = 1;
-    }
-    if ($ADD_LEN) {
-      if (!$is_null) {
-        my $dlen = abs($len_e - $len_f);
-        $feats .= " DLen=$dlen";
-      }
-    }
-    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
-    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
-    my $both_non_numeric = (!$e_num && !$f_num);
-    if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) {
-      $feats .= " NumMM=1";
-    }
-    if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) {
-      $feats .= " NumMatch=1";
-    }
-    if ($ADD_STEM_ID) {
-      my $el = 4;
-      my $fl = 4;
-      if ($oe =~ /^al|re|co/) { $el++; }
-      if ($of =~ /^al|re|co/) { $fl++; }
-      if ($oe =~ /^trans|inter/) { $el+=2; }
-      if ($of =~ /^trans|inter/) { $fl+=2; }
-      if ($fl > length($of)) { $fl = length($of); }
-      if ($el > length($oe)) { $el = length($oe); }
-      my $sf = substr $of, 0, $fl;
-      my $se = substr $oe, 0, $el;
-      my $id = $sdict{$sf}->{$se};
-      if (!$id) {
-        $sids++;
-	$sdict{$sf}->{$se} = $sids;
-	$id = $sids;
-	print STDERR "S$sids 0\n"
-      }
-      $feats .= " S$id=1";
-    }
-    if ($ADD_PREFIX_ID) {
-      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
-        my $pe = substr $oe, 0, 3;
-        my $pf = substr $of, 0, 3;
-        if ($pe eq $pf) { $feats .= " PfxIdentical=1"; }
-      }
-    }
-    if ($ADD_SIM) {
-      my $ld = 0;
-      my $eff = $len_e;
-      if ($eff < $len_f) { $eff = $len_f; }
-      if (!$is_null) {
-        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
-      }
-      $feats .= " OrthoSim=$ld";
-    }
-    my $ident = ($e eq $f);
-    if ($ident && $ADD_ID) { $feats .= " Identical=1"; }
-    if ($ADD_111 && ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1)) {
-      if ($ident && $ADD_ID) {
-        $feats .= " Id_OneOneOne=1";
-      }
-      $feats .= " OneOneOne=1";
-    }
-    if ($ADD_PUNC) {
-      if (($f =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $e =~ /[a-z]+/) ||
-          ($e =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $f =~ /[a-z]+/)) {
-        $feats .= " PuncMiss=1";
-      }
-    }
-    my $r = (0.5 - rand)/5;
-    print STDERR "F$fc $r\n";
-    print "$f ||| $e ||| $feats\n";
-  }
-}
-
-sub levenshtein
-{
-    # $s1 and $s2 are the two strings
-    # $len1 and $len2 are their respective lengths
-    #
-    my ($s1, $s2) = @_;
-    my ($len1, $len2) = (length $s1, length $s2);
-
-    # If one of the strings is empty, the distance is the length
-    # of the other string
-    #
-    return $len2 if ($len1 == 0);
-    return $len1 if ($len2 == 0);
-
-    my %mat;
-
-    # Init the distance matrix
-    #
-    # The first row to 0..$len1
-    # The first column to 0..$len2
-    # The rest to 0
-    #
-    # The first row and column are initialized so to denote distance
-    # from the empty string
-    #
-    for (my $i = 0; $i <= $len1; ++$i)
-    {
-        for (my $j = 0; $j <= $len2; ++$j)
-        {
-            $mat{$i}{$j} = 0;
-            $mat{0}{$j} = $j;
-        }
-
-        $mat{$i}{0} = $i;
-    }
-
-    # Some char-by-char processing is ahead, so prepare
-    # array of chars from the strings
-    #
-    my @ar1 = split(//, $s1);
-    my @ar2 = split(//, $s2);
-
-    for (my $i = 1; $i <= $len1; ++$i)
-    {
-        for (my $j = 1; $j <= $len2; ++$j)
-        {
-            # Set the cost to 1 iff the ith char of $s1
-            # equals the jth of $s2
-            # 
-            # Denotes a substitution cost. When the char are equal
-            # there is no need to substitute, so the cost is 0
-            #
-            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
-
-            # Cell $mat{$i}{$j} equals the minimum of:
-            #
-            # - The cell immediately above plus 1
-            # - The cell immediately to the left plus 1
-            # - The cell diagonally above and to the left plus the cost
-            #
-            # We can either insert a new char, delete a char or
-            # substitute an existing char (with an associated cost)
-            #
-            $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
-                                $mat{$i}{$j-1} + 1,
-                                $mat{$i-1}{$j-1} + $cost]);
-        }
-    }
-
-    # Finally, the Levenshtein distance equals the rightmost bottom cell
-    # of the matrix
-    #
-    # Note that $mat{$x}{$y} denotes the distance between the substrings
-    # 1..$x and 1..$y
-    #
-    return $mat{$len1}{$len2};
-}
-
-
-# minimal element of a list
-#
-sub min
-{
-    my @list = @{$_[0]};
-    my $min = $list[0];
-
-    foreach my $i (@list)
-    {
-        $min = $i if ($i < $min);
-    }
-
-    return $min;
-}
-
diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc
new file mode 100644
index 00000000..b496d196
--- /dev/null
+++ b/training/mpi_compute_cllh.cc
@@ -0,0 +1,196 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses command-line options (plus the --config file, if given) into *conf.
+// Returns false after printing usage or an error to stderr when --help is set,
+// a required option is missing, or the config file cannot be opened.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("weights,w",po::value<string>(),"Input feature weights file")
+        ("training_data,t",po::value<string>(),"Training data corpus")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    if (!config) { cerr << "Can't read config file: " << (*conf)["config"].as<string>() << endl; return false; }
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  if (!conf->count("help") && conf->count("training_data") && conf->count("decoder_config")) return true;
+  cerr << dcmdline_options << endl;
+  return false;
+}
+
+// Reads fname line by line and keeps only the lines whose zero-based index
+// satisfies index % size == rank. Each kept line is appended to *c and its
+// index to *ids, so the two vectors stay parallel.
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* ids) {
+  ReadFile rf(fname);
+  istream& input = *rf.stream();
+  string text;
+  // getline() leaves the stream in a failed state at end of input, ending the loop.
+  for (int line_no = 0; getline(input, text); ++line_no) {
+    if (line_no % size != rank)
+      continue;  // not one of the lines selected for this rank
+    c->push_back(text);
+    ids->push_back(line_no);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+// Decoder observer that accumulates, over all decoded sentences, the difference
+// between log(z) of the translation forest and log(z) of the alignment forest,
+// where z is the value returned by InsideOutside for that forest.
+struct TrainingObserver : public DecoderObserver {
+  // Start from a defined state so the members are never read uninitialized,
+  // even if a caller forgets to call Reset() first.
+  TrainingObserver() : acc_obj(0), cur_obj(0), state(0) {}
+
+  // Clears the accumulated objective.
+  void Reset() {
+    acc_obj = 0;
+  }
+
+  // Begins a new sentence: clears the per-sentence value and restarts the
+  // callback-order check enforced by the asserts below.
+  virtual void NotifyDecodingStart(const SentenceMetadata&) {
+    cur_obj = 0;
+    state = 1;
+  }
+
+  // compute model expectations, denominator of objective
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+    assert(state == 1);
+    state = 2;
+    SparseVector<prob_t> cur_model_exp;  // output argument of InsideOutside; not read here
+    const prob_t z = InsideOutside<prob_t,
+                                   EdgeProb,
+                                   SparseVector<prob_t>,
+                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
+    cur_obj = log(z);
+  }
+
+  // compute "empirical" expectations, numerator of objective
+  virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph* hg) {
+    assert(state == 2);
+    state = 3;
+    SparseVector<prob_t> ref_exp;  // output argument of InsideOutside; not read here
+    const prob_t ref_z = InsideOutside<prob_t,
+                                       EdgeProb,
+                                       SparseVector<prob_t>,
+                                       EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
+    const double log_ref_z = log(ref_z);
+
+    // rounding errors means that <0 is too strict
+    if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
+      cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
+      exit(1);
+    }
+    assert(!isnan(log_ref_z));
+    acc_obj += (cur_obj - log_ref_z);
+  }
+
+  double acc_obj;  // sum over sentences of (cur_obj - log_ref_z)
+  double cur_obj;  // log(z) of the current sentence's translation forest
+  int state;       // 0 = idle, 1 = decoding started, 2 = translation forest seen, 3 = alignment forest seen
+};
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+// Computes the objective accumulated by TrainingObserver over the training
+// corpus. With HAVE_MPI each rank decodes the lines ReadTrainingCorpus selects
+// for it and the per-rank sums are reduced onto rank 0, which prints the total.
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return conf.count("help") ? 0 : 1;  // --help succeeds; option errors must not exit 0
+
+  // load cdec.ini and set up decoder
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  if (decoder.GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    abort();
+  }
+
+  // load weights
+  vector<weight_t>& weights = decoder.CurrentWeightVector();
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
+
+  // freeze feature set
+  //const bool freeze_feature_set = conf.count("freeze_feature_set");
+  //if (freeze_feature_set) FD::Freeze();
+
+  vector<string> corpus; vector<int> ids;
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
+  assert(corpus.size() > 0);
+  assert(corpus.size() == ids.size());
+
+  TrainingObserver observer;
+  observer.Reset();
+  if (rank == 0)
+    cerr << "Each processor is decoding " << corpus.size() << " training examples...\n";
+
+  for (size_t i = 0; i < corpus.size(); ++i) {  // size_t matches corpus.size()
+    decoder.SetId(ids[i]);
+    decoder.Decode(corpus[i], &observer);
+  }
+
+  double objective = 0;
+#ifdef HAVE_MPI
+  reduce(world, observer.acc_obj, objective, std::plus<double>(), 0);
+#else
+  objective = observer.acc_obj;
+#endif
+  if (rank == 0)
+    cout << "OBJECTIVE: " << objective << endl;
+  return 0;
+}
+
-- 
cgit v1.2.3