From fe9b2b6aa3afe2f7baa9e049693fb37610228b54 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 28 Jan 2014 15:49:05 -0500 Subject: useful debugging --- training/mira/kbest_cut_mira.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc index e0b6eecb..e075bed3 100644 --- a/training/mira/kbest_cut_mira.cc +++ b/training/mira/kbest_cut_mira.cc @@ -933,7 +933,8 @@ int main(int argc, char** argv) { //reload weights based on update dense_weights.clear(); lambdas.init_vector(&dense_weights); - ShowLargestFeatures(dense_weights); + if (dense_weights.size() < 500) + ShowLargestFeatures(dense_weights); dense_w_local = dense_weights; iter++; @@ -1000,7 +1001,7 @@ int main(int argc, char** argv) { if (!stream) { int node_id = rng->next() * 100000; cerr << " Writing weights to " << node_id << endl; - Weights::ShowLargestFeatures(dense_weights); + //Weights::ShowLargestFeatures(dense_weights); dots = 0; ostringstream os; os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; -- cgit v1.2.3 From 31f70988747103825525595a20465dd8cd817eac Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 28 Jan 2014 19:50:03 -0500 Subject: smarter script for adding and markers --- corpus/add-sos-eos.pl | 71 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl index 5e2d44cb..d7608c5e 100755 --- a/corpus/add-sos-eos.pl +++ b/corpus/add-sos-eos.pl @@ -1,24 +1,63 @@ #!/usr/bin/perl -w use strict; -while(<>) { +die "Usage: $0 corpus.fr[-en1-en2-...] [corpus.al out-corpus.al]\n" unless (scalar @ARGV == 1 || scalar @ARGV == 3); +my $filec = shift @ARGV; +my $filea = shift @ARGV; +my $ofilea = shift @ARGV; +open C, "<$filec" or die "Can't read $filec: $!"; +if ($filea) { + open A, "<$filea" or die "Can't read $filea: $!"; + open OA, ">$ofilea" or die "Can't write $ofilea: $!"; +} +binmode(C, ":utf8"); +binmode(STDOUT, ":utf8"); +print STDERR "Adding and markers to input...\n"; +print STDERR " Reading corpus: $filec\n"; +print STDERR " Writing corpus: STDOUT\n"; +print STDERR "Reading alignments: $filea\n" if $filea; +print STDERR "Writing alignments: $ofilea\n" if $filea; + +my $lines = 0; +while() { + $lines++; + die "ERROR. Input line $filec:$lines should not contain SGML markup" if / $ff ||| $ee ||| @oas\n"; + $o =~ s/^ \|\|\| //; + if ($filea) { + my $aa = ; + die "ERROR. Mismatched number of lines between $filec and $filea\n" unless $aa; + chomp $aa; + my ($ff, $ee) = @fields; + die "ERROR in $filec:$lines: expected 'source ||| target'" unless defined $ee; + my @fs = split /\s+/, $ff; + my @es = split /\s+/, $ee; + my @as = split /\s+/, $aa; + my @oas = (); + push @oas, '0-0'; + my $flen = scalar @fs; + my $elen = scalar @es; + for my $ap (@as) { + my ($a, $b) = split /-/, $ap; + die "ERROR. Bad format in: @as" unless defined $a && defined $b; + push @oas, ($a + 1) . '-' . ($b + 1); + } + push @oas, ($flen + 1) . '-' . ($elen + 1); + print OA "@oas\n"; + } + print "$o\n"; +} +if ($filea) { + close OA; + my $aa = ; + die "ERROR. Alignment input file $filea contains more lines than corpus file!\n" if $aa; } +print STDERR "\nSUCCESS. Processed $lines lines.\n"; -- cgit v1.2.3 From 305af6fa4f91838d8c9289345c2b124fa8a10c2e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 28 Jan 2014 22:08:58 -0500 Subject: better mira defaults, new release 2014-01-28 --- configure.ac | 9 ++++++--- training/mira/mira.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/configure.ac b/configure.ac index e5d2dadb..6b128768 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ AC_CONFIG_MACRO_DIR([m4]) -AC_INIT([cdec],[2014-01-20]) +AC_INIT([cdec],[2014-01-28]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) @@ -8,8 +8,10 @@ AC_PROG_LEX case $LEX in :) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);; esac +OLD_CXXFLAGS=$CXXFLAGS AC_PROG_CC AC_PROG_CXX +CXXFLAGS=$OLD_CXXFLAGS AX_CXX_COMPILE_STDCXX_11([],[mandatory]) AC_LANG_CPLUSPLUS AC_OPENMP @@ -186,8 +188,9 @@ AC_CHECK_HEADER(google/dense_hash_map, AC_PROG_INSTALL -CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6" -CXXFLAGS="$CXX11_SWITCH $CXXFLAGS" +CPPFLAGS="-DPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6" +CXXFLAGS="$CXX11_SWITCH $CXXFLAGS -fPIC -g -O3" +CFLAGS="$CFLAGS -fPIC -g -O3" if test "x$HAVE_CXX11" = "x0"; then CPPFLAGS="$CPPFLAGS -DHAVE_OLD_CPP" diff --git a/training/mira/mira.py b/training/mira/mira.py index c84a8cff..77f2f35f 100755 --- a/training/mira/mira.py +++ b/training/mira/mira.py @@ -119,12 +119,12 @@ def main(): parser.add_argument('--metric-scale', type=int, default=1, metavar='N', help='scale MT loss by this amount when computing' ' hope/fear candidates') - parser.add_argument('-k', '--kbest-size', type=int, default=250, metavar='N', + parser.add_argument('-k', '--kbest-size', type=int, default=500, metavar='N', help='size of k-best list to extract from forest') parser.add_argument('--update-size', type=int, metavar='N', help='size of k-best list to use for update. defaults to ' 'equal kbest-size (applies to optimizer 5)') - parser.add_argument('--step-size', type=float, default=0.01, + parser.add_argument('--step-size', type=float, default=0.001, help='controls aggresiveness of update') parser.add_argument('--hope', type=int, default=1, choices=range(1,3), help='how to select hope candidate. options: ' -- cgit v1.2.3 From 3798fb9a43c27c3dfe0db5ee0dd0ef04bf5ee5f5 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 2 Feb 2014 15:16:53 +0100 Subject: load multiple grammars --- decoder/scfg_translator.cc | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc index 236d7c90..159a1d60 100644 --- a/decoder/scfg_translator.cc +++ b/decoder/scfg_translator.cc @@ -2,6 +2,7 @@ #include #include #include +#include "fast_lexical_cast.hpp" #include "hash.h" #include "translator.h" #include "hg.h" @@ -353,15 +354,31 @@ bool SCFGTranslator::TranslateImpl(const string& input, return pimpl_->Translate(input, smeta, weights, minus_lm_forest); } -/* -Check for grammar pointer in the sentence markup, for use with sentence specific grammars - */ +// +// Check for extra grammars in the sentence markup, for use with sentence specific grammars +// void SCFGTranslator::ProcessMarkupHintsImpl(const map& kv) { - map::const_iterator it = kv.find("grammar"); - if (it != kv.end()) { - TextGrammar* sentGrammar = new TextGrammar(it->second); + if (kv.find("grammar0") != kv.end()) { + cerr << "SGML tag grammar0 is not expected (order is: grammar, grammar1, grammar2, ...)\n"; + abort(); + } + unsigned gc = 0; + set loaded; + while(true) { + string gkey = "grammar"; + if (gc > 0) gkey += boost::lexical_cast(gc); + ++gc; + map::const_iterator it = kv.find(gkey); + if (it == kv.end()) break; + const string& gfile = it->second; + if (loaded.count(gfile) == 1) { + cerr << "Attempting to load " << gfile << " twice!\n"; + abort(); + } + loaded.insert(gfile); + TextGrammar* sentGrammar = new TextGrammar(gfile); sentGrammar->SetMaxSpan(pimpl_->max_span_limit); - sentGrammar->SetGrammarName(it->second); + sentGrammar->SetGrammarName(gfile); pimpl_->AddSupplementalGrammar(GrammarPtr(sentGrammar)); } } -- cgit v1.2.3