diff options
-rw-r--r-- | configure.ac | 9 | ||||
-rwxr-xr-x | corpus/add-sos-eos.pl | 71 | ||||
-rw-r--r-- | decoder/scfg_translator.cc | 31 | ||||
-rw-r--r-- | training/mira/kbest_cut_mira.cc | 5 | ||||
-rwxr-xr-x | training/mira/mira.py | 4 |
5 files changed, 90 insertions, 30 deletions
diff --git a/configure.ac b/configure.ac index e5d2dadb..6b128768 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ AC_CONFIG_MACRO_DIR([m4]) -AC_INIT([cdec],[2014-01-20]) +AC_INIT([cdec],[2014-01-28]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) @@ -8,8 +8,10 @@ AC_PROG_LEX case $LEX in :) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);; esac +OLD_CXXFLAGS=$CXXFLAGS AC_PROG_CC AC_PROG_CXX +CXXFLAGS=$OLD_CXXFLAGS AX_CXX_COMPILE_STDCXX_11([],[mandatory]) AC_LANG_CPLUSPLUS AC_OPENMP @@ -186,8 +188,9 @@ AC_CHECK_HEADER(google/dense_hash_map, AC_PROG_INSTALL -CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6" -CXXFLAGS="$CXX11_SWITCH $CXXFLAGS" +CPPFLAGS="-DPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6" +CXXFLAGS="$CXX11_SWITCH $CXXFLAGS -fPIC -g -O3" +CFLAGS="$CFLAGS -fPIC -g -O3" if test "x$HAVE_CXX11" = "x0"; then CPPFLAGS="$CPPFLAGS -DHAVE_OLD_CPP" diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl index 5e2d44cb..d7608c5e 100755 --- a/corpus/add-sos-eos.pl +++ b/corpus/add-sos-eos.pl @@ -1,24 +1,63 @@ #!/usr/bin/perl -w use strict; -while(<>) { +die "Usage: $0 corpus.fr[-en1-en2-...] [corpus.al out-corpus.al]\n" unless (scalar @ARGV == 1 || scalar @ARGV == 3); +my $filec = shift @ARGV; +my $filea = shift @ARGV; +my $ofilea = shift @ARGV; +open C, "<$filec" or die "Can't read $filec: $!"; +if ($filea) { + open A, "<$filea" or die "Can't read $filea: $!"; + open OA, ">$ofilea" or die "Can't write $ofilea: $!"; +} +binmode(C, ":utf8"); +binmode(STDOUT, ":utf8"); +print STDERR "Adding <s> and </s> markers to input...\n"; +print STDERR " Reading corpus: $filec\n"; +print STDERR " Writing corpus: STDOUT\n"; +print STDERR "Reading alignments: $filea\n" if $filea; +print STDERR "Writing alignments: $ofilea\n" if $filea; + +my $lines = 0; +while(<C>) { + $lines++; + die "ERROR. Input line $filec:$lines should not contain SGML markup" if /<seg /; + if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; } + elsif ($lines % 2500 == 0) { print STDERR "."; } chomp; my @fields = split / \|\|\| /; - my ($ff, $ee, $aa) = @fields; - die "Expected: foreign ||| target ||| alignments" unless scalar @fields == 3; - my @fs = split /\s+/, $ff; - my @es = split /\s+/, $ee; - my @as = split /\s+/, $aa; - my @oas = (); - push @oas, '0-0'; - my $flen = scalar @fs; - my $elen = scalar @es; - for my $ap (@as) { - my ($a, $b) = split /-/, $ap; - die "Bad format in: @as" unless defined $a && defined $b; - push @oas, ($a + 1) . '-' . ($b + 1); + my $o = ''; + for my $field (@fields) { + $o .= " ||| <s> $field </s>"; } - push @oas, ($flen + 1) . '-' . ($elen + 1); - print "<s> $ff </s> ||| <s> $ee </s> ||| @oas\n"; + $o =~ s/^ \|\|\| //; + if ($filea) { + my $aa = <A>; + die "ERROR. Mismatched number of lines between $filec and $filea\n" unless $aa; + chomp $aa; + my ($ff, $ee) = @fields; + die "ERROR in $filec:$lines: expected 'source ||| target'" unless defined $ee; + my @fs = split /\s+/, $ff; + my @es = split /\s+/, $ee; + my @as = split /\s+/, $aa; + my @oas = (); + push @oas, '0-0'; + my $flen = scalar @fs; + my $elen = scalar @es; + for my $ap (@as) { + my ($a, $b) = split /-/, $ap; + die "ERROR. Bad format in: @as" unless defined $a && defined $b; + push @oas, ($a + 1) . '-' . ($b + 1); + } + push @oas, ($flen + 1) . '-' . ($elen + 1); + print OA "@oas\n"; + } + print "$o\n"; +} +if ($filea) { + close OA; + my $aa = <A>; + die "ERROR. Alignment input file $filea contains more lines than corpus file!\n" if $aa; } +print STDERR "\nSUCCESS. Processed $lines lines.\n"; diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc index 236d7c90..159a1d60 100644 --- a/decoder/scfg_translator.cc +++ b/decoder/scfg_translator.cc @@ -2,6 +2,7 @@ #include <vector> #include <boost/foreach.hpp> #include <boost/functional/hash.hpp> +#include "fast_lexical_cast.hpp" #include "hash.h" #include "translator.h" #include "hg.h" @@ -353,15 +354,31 @@ bool SCFGTranslator::TranslateImpl(const string& input, return pimpl_->Translate(input, smeta, weights, minus_lm_forest); } -/* -Check for grammar pointer in the sentence markup, for use with sentence specific grammars - */ +// +// Check for extra grammars in the sentence markup, for use with sentence specific grammars +// void SCFGTranslator::ProcessMarkupHintsImpl(const map<string, string>& kv) { - map<string,string>::const_iterator it = kv.find("grammar"); - if (it != kv.end()) { - TextGrammar* sentGrammar = new TextGrammar(it->second); + if (kv.find("grammar0") != kv.end()) { + cerr << "SGML tag grammar0 is not expected (order is: grammar, grammar1, grammar2, ...)\n"; + abort(); + } + unsigned gc = 0; + set<string> loaded; + while(true) { + string gkey = "grammar"; + if (gc > 0) gkey += boost::lexical_cast<string>(gc); + ++gc; + map<string,string>::const_iterator it = kv.find(gkey); + if (it == kv.end()) break; + const string& gfile = it->second; + if (loaded.count(gfile) == 1) { + cerr << "Attempting to load " << gfile << " twice!\n"; + abort(); + } + loaded.insert(gfile); + TextGrammar* sentGrammar = new TextGrammar(gfile); sentGrammar->SetMaxSpan(pimpl_->max_span_limit); - sentGrammar->SetGrammarName(it->second); + sentGrammar->SetGrammarName(gfile); pimpl_->AddSupplementalGrammar(GrammarPtr(sentGrammar)); } } diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc index 9de57f5f..62c770df 100644 --- a/training/mira/kbest_cut_mira.cc +++ b/training/mira/kbest_cut_mira.cc @@ -937,7 +937,8 @@ int main(int argc, char** argv) { //reload weights based on update dense_weights.clear(); lambdas.init_vector(&dense_weights); - ShowLargestFeatures(dense_weights); + if (dense_weights.size() < 500) + ShowLargestFeatures(dense_weights); dense_w_local = dense_weights; iter++; @@ -1004,7 +1005,7 @@ int main(int argc, char** argv) { if (!stream) { int node_id = rng->next() * 100000; cerr << " Writing weights to " << node_id << endl; - Weights::ShowLargestFeatures(dense_weights); + //Weights::ShowLargestFeatures(dense_weights); dots = 0; ostringstream os; os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; diff --git a/training/mira/mira.py b/training/mira/mira.py index 1861da1a..0980ef2e 100755 --- a/training/mira/mira.py +++ b/training/mira/mira.py @@ -119,12 +119,12 @@ def main(): parser.add_argument('--metric-scale', type=int, default=1, metavar='N', help='scale MT loss by this amount when computing' ' hope/fear candidates') - parser.add_argument('-k', '--kbest-size', type=int, default=250, metavar='N', + parser.add_argument('-k', '--kbest-size', type=int, default=500, metavar='N', help='size of k-best list to extract from forest') parser.add_argument('--update-size', type=int, metavar='N', help='size of k-best list to use for update. defaults to ' 'equal kbest-size (applies to optimizer 5)') - parser.add_argument('--step-size', type=float, default=0.01, + parser.add_argument('--step-size', type=float, default=0.001, help='controls aggresiveness of update') parser.add_argument('--hope', type=int, default=1, choices=range(1,3), help='how to select hope candidate. options: ' |