summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-02-06 10:26:29 +0100
committerPatrick Simianer <p@simianer.de>2014-02-06 10:26:29 +0100
commit7bfe96c2a706d375362c054619f28dd40c7c33e8 (patch)
treefb0698a3bcb4b38d99d7a07b87a573a571dd7be2
parentc83f665cb7efbbfb0fdfa12203b09ba60e365d25 (diff)
parent3798fb9a43c27c3dfe0db5ee0dd0ef04bf5ee5f5 (diff)
Merge remote-tracking branch 'upstream/master'
-rw-r--r--configure.ac9
-rwxr-xr-xcorpus/add-sos-eos.pl71
-rw-r--r--decoder/scfg_translator.cc31
-rw-r--r--training/mira/kbest_cut_mira.cc5
-rwxr-xr-xtraining/mira/mira.py4
5 files changed, 90 insertions, 30 deletions
diff --git a/configure.ac b/configure.ac
index e5d2dadb..6b128768 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
AC_CONFIG_MACRO_DIR([m4])
-AC_INIT([cdec],[2014-01-20])
+AC_INIT([cdec],[2014-01-28])
AC_CONFIG_SRCDIR([decoder/cdec.cc])
AM_INIT_AUTOMAKE
AC_CONFIG_HEADERS(config.h)
@@ -8,8 +8,10 @@ AC_PROG_LEX
case $LEX in
:) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);;
esac
+OLD_CXXFLAGS=$CXXFLAGS
AC_PROG_CC
AC_PROG_CXX
+CXXFLAGS=$OLD_CXXFLAGS
AX_CXX_COMPILE_STDCXX_11([],[mandatory])
AC_LANG_CPLUSPLUS
AC_OPENMP
@@ -186,8 +188,9 @@ AC_CHECK_HEADER(google/dense_hash_map,
AC_PROG_INSTALL
-CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6"
-CXXFLAGS="$CXX11_SWITCH $CXXFLAGS"
+CPPFLAGS="-DPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6"
+CXXFLAGS="$CXX11_SWITCH $CXXFLAGS -fPIC -g -O3"
+CFLAGS="$CFLAGS -fPIC -g -O3"
if test "x$HAVE_CXX11" = "x0"; then
CPPFLAGS="$CPPFLAGS -DHAVE_OLD_CPP"
diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl
index 5e2d44cb..d7608c5e 100755
--- a/corpus/add-sos-eos.pl
+++ b/corpus/add-sos-eos.pl
@@ -1,24 +1,63 @@
#!/usr/bin/perl -w
use strict;
-while(<>) {
+die "Usage: $0 corpus.fr[-en1-en2-...] [corpus.al out-corpus.al]\n" unless (scalar @ARGV == 1 || scalar @ARGV == 3);
+my $filec = shift @ARGV;
+my $filea = shift @ARGV;
+my $ofilea = shift @ARGV;
+open C, "<$filec" or die "Can't read $filec: $!";
+if ($filea) {
+ open A, "<$filea" or die "Can't read $filea: $!";
+ open OA, ">$ofilea" or die "Can't write $ofilea: $!";
+}
+binmode(C, ":utf8");
+binmode(STDOUT, ":utf8");
+print STDERR "Adding <s> and </s> markers to input...\n";
+print STDERR " Reading corpus: $filec\n";
+print STDERR " Writing corpus: STDOUT\n";
+print STDERR "Reading alignments: $filea\n" if $filea;
+print STDERR "Writing alignments: $ofilea\n" if $filea;
+
+my $lines = 0;
+while(<C>) {
+ $lines++;
+ die "ERROR. Input line $filec:$lines should not contain SGML markup" if /<seg /;
+ if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+ elsif ($lines % 2500 == 0) { print STDERR "."; }
chomp;
my @fields = split / \|\|\| /;
- my ($ff, $ee, $aa) = @fields;
- die "Expected: foreign ||| target ||| alignments" unless scalar @fields == 3;
- my @fs = split /\s+/, $ff;
- my @es = split /\s+/, $ee;
- my @as = split /\s+/, $aa;
- my @oas = ();
- push @oas, '0-0';
- my $flen = scalar @fs;
- my $elen = scalar @es;
- for my $ap (@as) {
- my ($a, $b) = split /-/, $ap;
- die "Bad format in: @as" unless defined $a && defined $b;
- push @oas, ($a + 1) . '-' . ($b + 1);
+ my $o = '';
+ for my $field (@fields) {
+ $o .= " ||| <s> $field </s>";
}
- push @oas, ($flen + 1) . '-' . ($elen + 1);
- print "<s> $ff </s> ||| <s> $ee </s> ||| @oas\n";
+ $o =~ s/^ \|\|\| //;
+ if ($filea) {
+ my $aa = <A>;
+ die "ERROR. Mismatched number of lines between $filec and $filea\n" unless $aa;
+ chomp $aa;
+ my ($ff, $ee) = @fields;
+ die "ERROR in $filec:$lines: expected 'source ||| target'" unless defined $ee;
+ my @fs = split /\s+/, $ff;
+ my @es = split /\s+/, $ee;
+ my @as = split /\s+/, $aa;
+ my @oas = ();
+ push @oas, '0-0';
+ my $flen = scalar @fs;
+ my $elen = scalar @es;
+ for my $ap (@as) {
+ my ($a, $b) = split /-/, $ap;
+ die "ERROR. Bad format in: @as" unless defined $a && defined $b;
+ push @oas, ($a + 1) . '-' . ($b + 1);
+ }
+ push @oas, ($flen + 1) . '-' . ($elen + 1);
+ print OA "@oas\n";
+ }
+ print "$o\n";
+}
+if ($filea) {
+ close OA;
+ my $aa = <A>;
+ die "ERROR. Alignment input file $filea contains more lines than corpus file!\n" if $aa;
}
+print STDERR "\nSUCCESS. Processed $lines lines.\n";
diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc
index 236d7c90..159a1d60 100644
--- a/decoder/scfg_translator.cc
+++ b/decoder/scfg_translator.cc
@@ -2,6 +2,7 @@
#include <vector>
#include <boost/foreach.hpp>
#include <boost/functional/hash.hpp>
+#include "fast_lexical_cast.hpp"
#include "hash.h"
#include "translator.h"
#include "hg.h"
@@ -353,15 +354,31 @@ bool SCFGTranslator::TranslateImpl(const string& input,
return pimpl_->Translate(input, smeta, weights, minus_lm_forest);
}
-/*
-Check for grammar pointer in the sentence markup, for use with sentence specific grammars
- */
+//
+// Check for extra grammars in the sentence markup, for use with sentence specific grammars
+//
void SCFGTranslator::ProcessMarkupHintsImpl(const map<string, string>& kv) {
- map<string,string>::const_iterator it = kv.find("grammar");
- if (it != kv.end()) {
- TextGrammar* sentGrammar = new TextGrammar(it->second);
+ if (kv.find("grammar0") != kv.end()) {
+ cerr << "SGML tag grammar0 is not expected (order is: grammar, grammar1, grammar2, ...)\n";
+ abort();
+ }
+ unsigned gc = 0;
+ set<string> loaded;
+ while(true) {
+ string gkey = "grammar";
+ if (gc > 0) gkey += boost::lexical_cast<string>(gc);
+ ++gc;
+ map<string,string>::const_iterator it = kv.find(gkey);
+ if (it == kv.end()) break;
+ const string& gfile = it->second;
+ if (loaded.count(gfile) == 1) {
+ cerr << "Attempting to load " << gfile << " twice!\n";
+ abort();
+ }
+ loaded.insert(gfile);
+ TextGrammar* sentGrammar = new TextGrammar(gfile);
sentGrammar->SetMaxSpan(pimpl_->max_span_limit);
- sentGrammar->SetGrammarName(it->second);
+ sentGrammar->SetGrammarName(gfile);
pimpl_->AddSupplementalGrammar(GrammarPtr(sentGrammar));
}
}
diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc
index 9de57f5f..62c770df 100644
--- a/training/mira/kbest_cut_mira.cc
+++ b/training/mira/kbest_cut_mira.cc
@@ -937,7 +937,8 @@ int main(int argc, char** argv) {
//reload weights based on update
dense_weights.clear();
lambdas.init_vector(&dense_weights);
- ShowLargestFeatures(dense_weights);
+ if (dense_weights.size() < 500)
+ ShowLargestFeatures(dense_weights);
dense_w_local = dense_weights;
iter++;
@@ -1004,7 +1005,7 @@ int main(int argc, char** argv) {
if (!stream) {
int node_id = rng->next() * 100000;
cerr << " Writing weights to " << node_id << endl;
- Weights::ShowLargestFeatures(dense_weights);
+ //Weights::ShowLargestFeatures(dense_weights);
dots = 0;
ostringstream os;
os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz";
diff --git a/training/mira/mira.py b/training/mira/mira.py
index 1861da1a..0980ef2e 100755
--- a/training/mira/mira.py
+++ b/training/mira/mira.py
@@ -119,12 +119,12 @@ def main():
parser.add_argument('--metric-scale', type=int, default=1, metavar='N',
help='scale MT loss by this amount when computing'
' hope/fear candidates')
- parser.add_argument('-k', '--kbest-size', type=int, default=250, metavar='N',
+ parser.add_argument('-k', '--kbest-size', type=int, default=500, metavar='N',
help='size of k-best list to extract from forest')
parser.add_argument('--update-size', type=int, metavar='N',
help='size of k-best list to use for update. defaults to '
'equal kbest-size (applies to optimizer 5)')
- parser.add_argument('--step-size', type=float, default=0.01,
+ parser.add_argument('--step-size', type=float, default=0.001,
help='controls aggresiveness of update')
parser.add_argument('--hope', type=int, default=1, choices=range(1,3),
help='how to select hope candidate. options: '