diff options
Diffstat (limited to 'training')
| -rwxr-xr-x | training/cluster-em.pl | 114 | ||||
| -rwxr-xr-x | training/cluster-ptrain.pl | 206 | ||||
| -rwxr-xr-x | training/make-lexcrf-grammar.pl | 285 | ||||
| -rw-r--r-- | training/mpi_compute_cllh.cc (renamed from training/compute_cllh.cc) | 0 | 
4 files changed, 0 insertions, 605 deletions
| diff --git a/training/cluster-em.pl b/training/cluster-em.pl deleted file mode 100755 index 267ab642..00000000 --- a/training/cluster-em.pl +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } -use Getopt::Long; -my $parallel = 0; - -my $CWD=`pwd`; chomp $CWD; -my $BIN_DIR = "$CWD/.."; -my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce"; -my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights"; -my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter"; -my $DECODER = "$BIN_DIR/decoder/cdec"; -my $COMBINER_CACHE_SIZE = 10000000; -my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl"; -die "Can't find $REDUCER" unless -f $REDUCER; -die "Can't execute $REDUCER" unless -x $REDUCER; -die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS; -die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS; -die "Can't find $ADAPTER" unless -f $ADAPTER; -die "Can't execute $ADAPTER" unless -x $ADAPTER; -die "Can't find $DECODER" unless -f $DECODER; -die "Can't execute $DECODER" unless -x $DECODER; -my $restart = ''; -if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } - -die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2); - -my $training_corpus = shift @ARGV; -my $config = shift @ARGV; -my $pmem="2500mb"; -my $nodes = 40; -my $max_iteration = 1000; -my $CFLAG = "-C 1"; -if ($parallel) { -  die "Can't find $PARALLEL" unless -f $PARALLEL; -  die "Can't execute $PARALLEL" unless -x $PARALLEL; -} else { $CFLAG = "-C 500"; } - -my $initial_weights = ''; - -print STDERR <<EOT; -EM TRAIN CONFIGURATION INFORMATION - -      Config file: $config -  Training corpus: $training_corpus -  Initial weights: $initial_weights -   Decoder memory: $pmem -  Nodes requested: $nodes -   Max iterations: $max_iteration -          restart: $restart -EOT - -my $nodelist="1"; -for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; } -my $iter = 1; - -my $dir = "$CWD/emtrain"; -if ($restart) { -  die "$dir doesn't exist, but --restart specified!\n" unless -d $dir; -  my $o = `ls -t $dir/weights.*`; -  my ($a, @x) = split /\n/, $o; -  if ($a =~ /weights.(\d+)\.gz$/) { -    $iter = $1; -  } else { -    die "Unexpected file: $a!\n"; -  } -  print STDERR "Restarting at iteration $iter\n"; -} else { -  die "$dir already exists!\n" if -e $dir; -  mkdir $dir or die "Can't create $dir: $!"; - -  if ($initial_weights) { -    unless ($initial_weights =~ /\.gz$/) { -      `cp $initial_weights $dir/weights.1`; -      `gzip -9 $dir/weights.1`; -    } else { -      `cp $initial_weights $dir/weights.1.gz`; -    } -  } -} - -while ($iter < $max_iteration) { -  my $cur_time = `date`; chomp $cur_time; -  print STDERR "\nStarting iteration $iter...\n"; -  print STDERR "  time: $cur_time\n"; -  my $start = time; -  my $next_iter = $iter + 1; -  my $WSTR = "-w $dir/weights.$iter.gz"; -  if ($iter == 1) { $WSTR = ''; } -  my $dec_cmd="$DECODER --feature_expectations -c $config $WSTR $CFLAG < $training_corpus 2> $dir/deco.log.$iter"; -  my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; -  my $cmd = ""; -  if ($parallel) { $cmd = $pcmd; } -  $cmd .= "$dec_cmd"; -  $cmd .= "| $ADAPTER | sort -k1 | $REDUCER | $REDUCE2WEIGHTS -o $dir/weights.$next_iter.gz"; -  print STDERR "EXECUTING: $cmd\n"; -  my $result = `$cmd`; -  if ($? != 0) { -    die "Error running iteration $iter: $!"; -  } -  chomp $result; -  my $end = time; -  my $diff = ($end - $start); -  print STDERR "  ITERATION $iter TOOK $diff SECONDS\n"; -  $iter = $next_iter; -  if ($result =~ /1$/) { -    print STDERR "Training converged.\n"; -    last; -  } -} - -print "FINAL WEIGHTS: $dir/weights.$iter\n"; - diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl deleted file mode 100755 index 03122df9..00000000 --- a/training/cluster-ptrain.pl +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } -use Getopt::Long; - -my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluation -my $CWD=getcwd(); -my $OPTIMIZER = "$SCRIPT_DIR/mr_optimize_reduce"; -my $DECODER = "$SCRIPT_DIR/../decoder/cdec"; -my $COMBINER_CACHE_SIZE = 150; -# This is a hack to run this on a weird cluster, -# eventually, I'll provide Hadoop scripts. -my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl"; -die "Can't find $OPTIMIZER" unless -f $OPTIMIZER; -die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER; -my $restart = ''; -if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } - -my $pmem="2500mb"; -my $nodes = 1; -my $max_iteration = 1000; -my $PRIOR_FLAG = ""; -my $parallel = 1; -my $CFLAG = "-C 1"; -my $LOCAL; -my $DISTRIBUTED; -my $PRIOR; -my $OALG = "lbfgs"; -my $sigsq = 1; -my $means_file; -my $mem_buffers = 20; -my $RESTART_IF_NECESSARY; -GetOptions("cdec=s" => \$DECODER, -           "distributed" => \$DISTRIBUTED, -           "sigma_squared=f" => \$sigsq, -           "lbfgs_memory_buffers=i" => \$mem_buffers, -           "max_iteration=i" => \$max_iteration, -           "means=s" => \$means_file, -           "optimizer=s" => \$OALG, -           "gaussian_prior" => \$PRIOR, -           "restart_if_necessary" => \$RESTART_IF_NECESSARY, -           "jobs=i" => \$nodes, -           "pmem=s" => \$pmem -          ) or usage(); -usage() unless scalar @ARGV==3; -my $config_file = shift @ARGV; -my $training_corpus = shift @ARGV; -my $initial_weights = shift @ARGV; -unless ($DISTRIBUTED) { $LOCAL = 1; } -die "Can't find $config_file" unless -f $config_file; -die "Can't find $DECODER" unless -f $DECODER; -die "Can't execute $DECODER" unless -x $DECODER; -if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; } -if ($PRIOR) { -  $PRIOR_FLAG="-p --sigma_squared $sigsq"; -  if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; } -} - -if ($parallel) { -  die "Can't find $PARALLEL" unless -f $PARALLEL; -  die "Can't execute $PARALLEL" unless -x $PARALLEL; -} -unless ($parallel) { $CFLAG = "-C 500"; } -unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; } -my $clines = num_lines($training_corpus); -my $dir = "$CWD/ptrain"; - -if ($RESTART_IF_NECESSARY && -d $dir) { -  $restart = 1; -} - -print STDERR <<EOT; -PTRAIN CONFIGURATION INFORMATION - -      Config file: $config_file -  Training corpus: $training_corpus -      Corpus size: $clines -  Initial weights: $initial_weights -   Decoder memory: $pmem -   Max iterations: $max_iteration -        Optimizer: $OALG -   Jobs requested: $nodes -           prior?: $PRIOR_FLAG -         restart?: $restart -EOT - -if ($OALG) { $OALG="-m $OALG"; } - -my $nodelist="1"; -for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; } -my $iter = 1; - -if ($restart) { -  die "$dir doesn't exist, but --restart specified!\n" unless -d $dir; -  my $o = `ls -t $dir/weights.*`; -  my ($a, @x) = split /\n/, $o; -  if ($a =~ /weights.(\d+)\.gz$/) { -    $iter = $1; -  } else { -    die "Unexpected file: $a!\n"; -  } -  print STDERR "Restarting at iteration $iter\n"; -} else { -  die "$dir already exists!\n" if -e $dir; -  mkdir $dir or die "Can't create $dir: $!"; - -  unless ($initial_weights =~ /\.gz$/) { -    `cp $initial_weights $dir/weights.1`; -    `gzip -9 $dir/weights.1`; -  } else { -    `cp $initial_weights $dir/weights.1.gz`; -  } -  open T, "<$training_corpus" or die "Can't read $training_corpus: $!"; -  open TO, ">$dir/training.in"; -  my $lc = 0; -  while(<T>) { -    chomp; -    s/^\s+//; -    s/\s+$//; -    die "Expected A ||| B in input file" unless / \|\|\| /; -    print TO "<seg id=\"$lc\">$_</seg>\n"; -    $lc++; -  } -  close T; -  close TO; -} -$training_corpus = "$dir/training.in"; - -my $iter_attempts = 1; -while ($iter < $max_iteration) { -  my $cur_time = `date`; chomp $cur_time; -  print STDERR "\nStarting iteration $iter...\n"; -  print STDERR "  time: $cur_time\n"; -  my $start = time; -  my $next_iter = $iter + 1; -  my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter"; -  my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; -  my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; -  my $cmd = ""; -  if ($parallel) { $cmd = $pcmd; } -  $cmd .= "$dec_cmd | $opt_cmd"; - -  print STDERR "EXECUTING: $cmd\n"; -  my $result = `$cmd`; -  my $exit_code = $? >> 8; -  if ($exit_code == 99) { -    $iter_attempts++; -    if ($iter_attempts > $MAX_ITER_ATTEMPTS) { -      die "Received restart request $iter_attempts times from optimizer, giving up\n"; -    } -    print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n"; -    next; -  } -  if ($? != 0) { -    die "Error running iteration $iter: $!"; -  } -  chomp $result; -  my $end = time; -  my $diff = ($end - $start); -  print STDERR "  ITERATION $iter TOOK $diff SECONDS\n"; -  $iter = $next_iter; -  if ($result =~ /1$/) { -    print STDERR "Training converged.\n"; -    last; -  } -  $iter_attempts = 1; -} - -print "FINAL WEIGHTS: $dir/weights.$iter\n"; -`mv $dir/weights.$iter.gz $dir/weights.final.gz`; - -sub usage { -  die <<EOT; - -Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init - -  Options: - -    --distributed      Parallelize function evaluation -    --jobs N           Number of jobs to use -    --cdec PATH        Path to cdec binary -    --optimize OPT     lbfgs, rprop, sgd -    --gaussian_prior   add Gaussian prior -    --means FILE       if you want means other than 0 -    --sigma_squared S  variance on prior -    --pmem MEM         Memory required for decoder -    --lbfgs_memory_buffers Number of buffers to use -                           with LBFGS optimizer - -EOT -} - -sub num_lines { -  my $file = shift; -  my $fh; -  if ($file=~ /\.gz$/) { -    open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!"; -  } else { -    open $fh, "<$file" or die "Couldn't read $file: $!"; -  } -  my $lines = 0; -  while(<$fh>) { $lines++; } -  close $fh; -  return $lines; -} diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl deleted file mode 100755 index 8cdf7718..00000000 --- a/training/make-lexcrf-grammar.pl +++ /dev/null @@ -1,285 +0,0 @@ -#!/usr/bin/perl -w -use utf8; -use strict; -my ($effile, $model1) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.model1\n" unless $effile && -f $effile && $model1 && -f $model1; - -open EF, "<$effile" or die; -open M1, "<$model1" or die; -binmode(EF,":utf8"); -binmode(M1,":utf8"); -binmode(STDOUT,":utf8"); -my %model1; -while(<M1>) { -  chomp; -  my ($f, $e, $lp) = split /\s+/; -  $model1{$f}->{$e} = $lp; -} - -my $ADD_MODEL1 = 0;      # found that model1 hurts performance -my $IS_FRENCH_F = 1;     # indicates that the f language is french -my $IS_ARABIC_F = 0;     # indicates that the f language is arabic -my $IS_URDU_F = 0;     # indicates that the f language is arabic -my $ADD_PREFIX_ID = 0; -my $ADD_LEN = 1; -my $ADD_SIM = 1; -my $ADD_DICE = 1; -my $ADD_111 = 1; -my $ADD_ID = 1; -my $ADD_PUNC = 1; -my $ADD_NUM_MM = 1; -my $ADD_NULL = 1; -my $ADD_STEM_ID = 1; -my $BEAM_RATIO = 50; - -my %fdict; -my %fcounts; -my %ecounts; - -my %sdict; - -while(<EF>) { -  chomp; -  my ($f, $e) = split /\s*\|\|\|\s*/; -  my @es = split /\s+/, $e; -  my @fs = split /\s+/, $f; -  for my $ew (@es){ $ecounts{$ew}++; } -  push @fs, '<eps>' if $ADD_NULL; -  for my $fw (@fs){ $fcounts{$fw}++; } -  for my $fw (@fs){ -    for my $ew (@es){ -      $fdict{$fw}->{$ew}++; -    } -  } -} - -print STDERR "Dice 0\n" if $ADD_DICE; -print STDERR "OneOneOne 0\nId_OneOneOne 0\n" if $ADD_111; -print STDERR "Identical 0\n" if $ADD_ID; -print STDERR "PuncMiss 0\n" if $ADD_PUNC; -print STDERR "IsNull 0\n" if $ADD_NULL; -print STDERR "Model1 0\n" if $ADD_MODEL1; -print STDERR "DLen 0\n" if $ADD_LEN; -print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM; -print STDERR "OrthoSim 0\n" if $ADD_SIM; -print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID); -my $fc = 1000000; -my $sids = 1000000; -for my $f (sort keys %fdict) { -  my $re = $fdict{$f}; -  my $max; -  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { -    my $efcount = $re->{$e}; -    unless (defined $max) { $max = $efcount; } -    my $m1 = $model1{$f}->{$e}; -    unless (defined $m1) { next; } -    $fc++; -    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); -    my $feats = "F$fc=1"; -    my $oe = $e; -    my $of = $f;   # normalized form -    if ($IS_FRENCH_F) { -      # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French -      $of =~ s/â/as/g; -      $of =~ s/ê/es/g; -      $of =~ s/î/is/g; -      $of =~ s/ô/os/g; -      $of =~ s/û/us/g; -    } elsif ($IS_ARABIC_F) { -      if (length($of) > 1 && !($of =~ /\d/)) { -        $of =~ s/\$/sh/g; -      } -    } elsif ($IS_URDU_F) { -      if (length($of) > 1 && !($of =~ /\d/)) { -        $of =~ s/\$/sh/g; -      } -      $oe =~ s/^-e-//; -      $oe =~ s/^al-/al/; -      $of =~ s/([a-z])\~/$1$1/g; -      $of =~ s/E/'/g; -      $of =~ s/^Aw/o/g; -      $of =~ s/\|/a/g; -      $of =~ s/@/h/g; -      $of =~ s/c/ch/g; -      $of =~ s/x/kh/g; -      $of =~ s/\*/dh/g; -      $of =~ s/w/o/g; -      $of =~ s/Z/dh/g; -      $of =~ s/y/i/g; -      $of =~ s/Y/a/g; -      $of = lc $of; -    } -    my $len_e = length($oe); -    my $len_f = length($of); -    $feats .= " Model1=$m1" if ($ADD_MODEL1); -    $feats .= " Dice=$dice" if $ADD_DICE; -    my $is_null = undef; -    if ($ADD_NULL && $f eq '<eps>') { -      $feats .= " IsNull=1"; -      $is_null = 1; -    } -    if ($ADD_LEN) { -      if (!$is_null) { -        my $dlen = abs($len_e - $len_f); -        $feats .= " DLen=$dlen"; -      } -    } -    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); -    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); -    my $both_non_numeric = (!$e_num && !$f_num); -    if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) { -      $feats .= " NumMM=1"; -    } -    if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) { -      $feats .= " NumMatch=1"; -    } -    if ($ADD_STEM_ID) { -      my $el = 4; -      my $fl = 4; -      if ($oe =~ /^al|re|co/) { $el++; } -      if ($of =~ /^al|re|co/) { $fl++; } -      if ($oe =~ /^trans|inter/) { $el+=2; } -      if ($of =~ /^trans|inter/) { $fl+=2; } -      if ($fl > length($of)) { $fl = length($of); } -      if ($el > length($oe)) { $el = length($oe); } -      my $sf = substr $of, 0, $fl; -      my $se = substr $oe, 0, $el; -      my $id = $sdict{$sf}->{$se}; -      if (!$id) { -        $sids++; -	$sdict{$sf}->{$se} = $sids; -	$id = $sids; -	print STDERR "S$sids 0\n" -      } -      $feats .= " S$id=1"; -    } -    if ($ADD_PREFIX_ID) { -      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {  -        my $pe = substr $oe, 0, 3; -        my $pf = substr $of, 0, 3; -        if ($pe eq $pf) { $feats .= " PfxIdentical=1"; } -      } -    } -    if ($ADD_SIM) { -      my $ld = 0; -      my $eff = $len_e; -      if ($eff < $len_f) { $eff = $len_f; } -      if (!$is_null) { -        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); -      } -      $feats .= " OrthoSim=$ld"; -    } -    my $ident = ($e eq $f); -    if ($ident && $ADD_ID) { $feats .= " Identical=1"; } -    if ($ADD_111 && ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1)) { -      if ($ident && $ADD_ID) { -        $feats .= " Id_OneOneOne=1"; -      } -      $feats .= " OneOneOne=1"; -    } -    if ($ADD_PUNC) { -      if (($f =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $e =~ /[a-z]+/) || -          ($e =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $f =~ /[a-z]+/)) { -        $feats .= " PuncMiss=1"; -      } -    } -    my $r = (0.5 - rand)/5; -    print STDERR "F$fc $r\n"; -    print "$f ||| $e ||| $feats\n"; -  } -} - -sub levenshtein -{ -    # $s1 and $s2 are the two strings -    # $len1 and $len2 are their respective lengths -    # -    my ($s1, $s2) = @_; -    my ($len1, $len2) = (length $s1, length $s2); - -    # If one of the strings is empty, the distance is the length -    # of the other string -    # -    return $len2 if ($len1 == 0); -    return $len1 if ($len2 == 0); - -    my %mat; - -    # Init the distance matrix -    # -    # The first row to 0..$len1 -    # The first column to 0..$len2 -    # The rest to 0 -    # -    # The first row and column are initialized so to denote distance -    # from the empty string -    # -    for (my $i = 0; $i <= $len1; ++$i) -    { -        for (my $j = 0; $j <= $len2; ++$j) -        { -            $mat{$i}{$j} = 0; -            $mat{0}{$j} = $j; -        } - -        $mat{$i}{0} = $i; -    } - -    # Some char-by-char processing is ahead, so prepare -    # array of chars from the strings -    # -    my @ar1 = split(//, $s1); -    my @ar2 = split(//, $s2); - -    for (my $i = 1; $i <= $len1; ++$i) -    { -        for (my $j = 1; $j <= $len2; ++$j) -        { -            # Set the cost to 1 iff the ith char of $s1 -            # equals the jth of $s2 -            #  -            # Denotes a substitution cost. When the char are equal -            # there is no need to substitute, so the cost is 0 -            # -            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; - -            # Cell $mat{$i}{$j} equals the minimum of: -            # -            # - The cell immediately above plus 1 -            # - The cell immediately to the left plus 1 -            # - The cell diagonally above and to the left plus the cost -            # -            # We can either insert a new char, delete a char or -            # substitute an existing char (with an associated cost) -            # -            $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, -                                $mat{$i}{$j-1} + 1, -                                $mat{$i-1}{$j-1} + $cost]); -        } -    } - -    # Finally, the Levenshtein distance equals the rightmost bottom cell -    # of the matrix -    # -    # Note that $mat{$x}{$y} denotes the distance between the substrings -    # 1..$x and 1..$y -    # -    return $mat{$len1}{$len2}; -} - - -# minimal element of a list -# -sub min -{ -    my @list = @{$_[0]}; -    my $min = $list[0]; - -    foreach my $i (@list) -    { -        $min = $i if ($i < $min); -    } - -    return $min; -} - diff --git a/training/compute_cllh.cc b/training/mpi_compute_cllh.cc index b496d196..b496d196 100644 --- a/training/compute_cllh.cc +++ b/training/mpi_compute_cllh.cc | 
