summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x corpus/tokenize-anything.sh 6
-rw-r--r-- decoder/bottom_up_parser.cc 22
-rw-r--r-- decoder/decoder.cc 3
-rw-r--r-- decoder/grammar.cc 4
-rw-r--r-- decoder/rule_lexer.h 3
-rw-r--r-- decoder/rule_lexer.ll 20
-rw-r--r-- decoder/scfg_translator.cc 2
-rw-r--r-- decoder/trule.cc 2
-rw-r--r-- realtime/README.md 12
-rwxr-xr-x realtime/realtime.py 12
-rw-r--r-- realtime/rt/util.py 2
-rw-r--r-- training/crf/mpi_adagrad_optimize.cc 14
-rwxr-xr-x word-aligner/force_align.py 16
13 files changed, 68 insertions, 50 deletions
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index c7adfa61..5b7933d8 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -5,15 +5,17 @@ SUPPORT=$ROOTDIR/support
if [[ $# == 1 && $1 == '-u' ]] ; then
NORMARGS="--batchline"
+ SEDFLAGS="-u"
else
NORMARGS=""
+ SEDFLAGS=""
fi
$SUPPORT/utf8-normalize.sh $NORMARGS |
$SUPPORT/quote-norm.pl |
$SUPPORT/tokenizer.pl |
- sed -e 's/ al - / al-/g' |
+ sed $SEDFLAGS -e 's/ al - / al-/g' |
$SUPPORT/fix-contract.pl |
- sed -e 's/^ //' | sed -e 's/ $//' |
+ sed $SEDFLAGS -e 's/^ //' | sed $SEDFLAGS -e 's/ $//' |
perl -e '$|++; while(<>){s/(\d+)(\.+)$/$1 ./; s/(\d+)(\.+) \|\|\|/$1 . |||/; print;}'
diff --git a/decoder/bottom_up_parser.cc b/decoder/bottom_up_parser.cc
index ed79aaf0..606b8d7e 100644
--- a/decoder/bottom_up_parser.cc
+++ b/decoder/bottom_up_parser.cc
@@ -14,6 +14,8 @@
using namespace std;
+static WordID kEPS = 0;
+
class ActiveChart;
class PassiveChart {
public:
@@ -74,9 +76,12 @@ class ActiveChart {
gptr_(g), ant_nodes_(), lattice_cost(0.0) {}
void ExtendTerminal(int symbol, float src_cost, vector<ActiveItem>* out_cell) const {
- const GrammarIter* ni = gptr_->Extend(symbol);
- if (ni) {
- out_cell->push_back(ActiveItem(ni, ant_nodes_, lattice_cost + src_cost));
+ if (symbol == kEPS) {
+ out_cell->push_back(ActiveItem(gptr_, ant_nodes_, lattice_cost + src_cost));
+ } else {
+ const GrammarIter* ni = gptr_->Extend(symbol);
+ if (ni)
+ out_cell->push_back(ActiveItem(ni, ant_nodes_, lattice_cost + src_cost));
}
}
void ExtendNonTerminal(const Hypergraph* hg, int node_index, vector<ActiveItem>* out_cell) const {
@@ -127,8 +132,10 @@ class ActiveChart {
const WordID& f = ai->label;
const double& c = ai->cost;
const int& len = ai->dist2next;
- //VLOG(1) << "F: " << TD::Convert(f) << endl;
+ //cerr << "F: " << TD::Convert(f) << " dest=" << i << "," << (j+len-1) << endl;
const vector<ActiveItem>& ec = act_chart_(i, j-1);
+ //cerr << " SRC=" << i << "," << (j-1) << " [ec=" << ec.size() << "]" << endl;
+ //if (ec.size() > 0) { cerr << " LC=" << ec[0].lattice_cost << endl; }
for (vector<ActiveItem>::const_iterator di = ec.begin(); di != ec.end(); ++di)
di->ExtendTerminal(f, c, &act_chart_(i, j + len - 1));
}
@@ -166,6 +173,7 @@ void PassiveChart::ApplyRule(const int i,
const Hypergraph::TailNodeVector& ant_nodes,
const float lattice_cost) {
Hypergraph::Edge* new_edge = forest_->AddEdge(r, ant_nodes);
+ //cerr << i << " " << j << ": APPLYING RULE: " << r->AsString() << endl;
new_edge->prev_i_ = r->prev_i;
new_edge->prev_j_ = r->prev_j;
new_edge->i_ = i;
@@ -198,8 +206,11 @@ void PassiveChart::ApplyRules(const int i,
const Hypergraph::TailNodeVector& tail,
const float lattice_cost) {
const int n = rules->GetNumRules();
- for (int k = 0; k < n; ++k)
+ //cerr << i << " " << j << ": NUM RULES: " << n << endl;
+ for (int k = 0; k < n; ++k) {
+ //cerr << i << " " << j << ": R=" << rules->GetIthRule(k)->AsString() << endl;
ApplyRule(i, j, rules->GetIthRule(k), tail, lattice_cost);
+ }
}
void PassiveChart::ApplyUnaryRules(const int i, const int j) {
@@ -284,6 +295,7 @@ ExhaustiveBottomUpParser::ExhaustiveBottomUpParser(
bool ExhaustiveBottomUpParser::Parse(const Lattice& input,
Hypergraph* forest) const {
+ kEPS = TD::Convert("*EPS*");
PassiveChart chart(goal_sym_, grammars_, input, forest);
const bool result = chart.Parse();
return result;
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 9b41253b..5bb62710 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -387,6 +387,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
("show_partition,z", "Compute and show the partition (inside score)")
("show_conditional_prob", "Output the conditional log prob to STDOUT instead of a translation")
("show_cfg_search_space", "Show the search space as a CFG")
+ ("show_cfg_alignment_space", "Show the alignment hypergraph as a CFG")
("show_target_graph", po::value<string>(), "Directory to write the target hypergraphs to")
("incremental_search", po::value<string>(), "Run lazy search with this language model file")
("coarse_to_fine_beam_prune", po::value<double>(), "Prune paths from coarse parse forest before fine parse, keeping paths within exp(alpha>=0)")
@@ -988,6 +989,8 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
cerr << " Contst. partition log(Z): " << log(z) << endl;
}
o->NotifyAlignmentForest(smeta, &forest);
+ if (conf.count("show_cfg_alignment_space"))
+ HypergraphIO::WriteAsCFG(forest);
if (conf.count("forest_output")) {
ForestWriter writer(str("forest_output",conf), sent_id);
if (FileExists(writer.fname_)) {
diff --git a/decoder/grammar.cc b/decoder/grammar.cc
index 160d00e6..439e448d 100644
--- a/decoder/grammar.cc
+++ b/decoder/grammar.cc
@@ -121,11 +121,11 @@ static void AddRuleHelper(const TRulePtr& new_rule, const unsigned int ctf_level
void TextGrammar::ReadFromFile(const string& filename) {
ReadFile in(filename);
- ReadFromStream(in.stream());
+ RuleLexer::ReadRules(in.stream(), &AddRuleHelper, filename, this);
}
void TextGrammar::ReadFromStream(istream* in) {
- RuleLexer::ReadRules(in, &AddRuleHelper, this);
+ RuleLexer::ReadRules(in, &AddRuleHelper, "UNKNOWN", this);
}
bool TextGrammar::HasRuleForSpan(int /* i */, int /* j */, int distance) const {
diff --git a/decoder/rule_lexer.h b/decoder/rule_lexer.h
index 976ea02b..f844e5b2 100644
--- a/decoder/rule_lexer.h
+++ b/decoder/rule_lexer.h
@@ -2,12 +2,13 @@
#define _RULE_LEXER_H_
#include <iostream>
+#include <string>
#include "trule.h"
struct RuleLexer {
typedef void (*RuleCallback)(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra);
- static void ReadRules(std::istream* in, RuleCallback func, void* extra);
+ static void ReadRules(std::istream* in, RuleCallback func, const std::string& fname, void* extra);
};
#endif
diff --git a/decoder/rule_lexer.ll b/decoder/rule_lexer.ll
index 083a5bb1..c6a85919 100644
--- a/decoder/rule_lexer.ll
+++ b/decoder/rule_lexer.ll
@@ -18,6 +18,7 @@ std::istream* scfglex_stream = NULL;
RuleLexer::RuleCallback rule_callback = NULL;
void* rule_callback_extra = NULL;
std::vector<int> scfglex_phrase_fnames;
+std::string scfglex_fname;
#undef YY_INPUT
#define YY_INPUT(buf, result, max_size) (result = scfglex_stream->read(buf, max_size).gcount())
@@ -38,12 +39,12 @@ WordID scfglex_lhs;
int scfglex_src_arity;
int scfglex_trg_arity;
-#define MAX_FEATS 100
+#define MAX_FEATS 10000
int scfglex_feat_ids[MAX_FEATS];
double scfglex_feat_vals[MAX_FEATS];
int scfglex_num_feats;
-#define MAX_ARITY 200
+#define MAX_ARITY 1000
int scfglex_nt_sanity[MAX_ARITY];
int scfglex_src_nts[MAX_ARITY];
// float scfglex_nt_size_means[MAX_ARITY];
@@ -51,7 +52,7 @@ int scfglex_src_nts[MAX_ARITY];
std::stack<TRulePtr> ctf_rule_stack;
unsigned int ctf_level = 0;
-#define MAX_ALS 200
+#define MAX_ALS 2000
AlignmentPoint scfglex_als[MAX_ALS];
int scfglex_num_als;
@@ -190,7 +191,7 @@ NT [^\t \[\],]+
BEGIN(SRC);
}
<INITIAL,LHS_END>. {
- std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl;
+ std::cerr << "Grammar " << scfglex_fname << " line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl;
abort();
}
@@ -217,7 +218,7 @@ NT [^\t \[\],]+
<TRG,FEATS,ALIGNS>\n {
if (scfglex_src_arity != scfglex_trg_arity) {
- std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n";
+ std::cerr << "Grammar " << scfglex_fname << " line " << lex_line << ": LHS and RHS arity mismatch!\n";
abort();
}
// const bool ignore_grammar_features = false;
@@ -258,7 +259,7 @@ NT [^\t \[\],]+
BEGIN(FEATS);
}
<FEATVAL>. {
- std::cerr << "Line " << lex_line << ": unexpected input in feature value: " << yytext << std::endl;
+ std::cerr << "Grammar " << scfglex_fname << " line " << lex_line << ": unexpected input in feature value: " << yytext << std::endl;
abort();
}
<FEATS>{REAL} {
@@ -267,7 +268,7 @@ NT [^\t \[\],]+
++scfglex_num_feats;
}
<FEATS>. {
- std::cerr << "Line " << lex_line << " unexpected input in features: " << yytext << std::endl;
+ std::cerr << "Grammar " << scfglex_fname << " line " << lex_line << " unexpected input in features: " << yytext << std::endl;
abort();
}
<ALIGNS>[0-9]+-[0-9]+ {
@@ -291,14 +292,14 @@ NT [^\t \[\],]+
}
<ALIGNS>[ \t] ;
<ALIGNS>. {
- std::cerr << "Line " << lex_line << ": unexpected input in alignment: " << yytext << std::endl;
+ std::cerr << "Grammar " << scfglex_fname << " line " << lex_line << ": unexpected input in alignment: " << yytext << std::endl;
abort();
}
%%
#include "filelib.h"
-void RuleLexer::ReadRules(std::istream* in, RuleLexer::RuleCallback func, void* extra) {
+void RuleLexer::ReadRules(std::istream* in, RuleLexer::RuleCallback func, const std::string& fname, void* extra) {
if (scfglex_phrase_fnames.empty()) {
scfglex_phrase_fnames.resize(100);
for (int i = 0; i < scfglex_phrase_fnames.size(); ++i) {
@@ -308,6 +309,7 @@ void RuleLexer::ReadRules(std::istream* in, RuleLexer::RuleCallback func, void*
}
}
lex_line = 1;
+ scfglex_fname = fname;
scfglex_stream = in;
rule_callback_extra = extra,
rule_callback = func;
diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc
index a506c591..236d7c90 100644
--- a/decoder/scfg_translator.cc
+++ b/decoder/scfg_translator.cc
@@ -78,7 +78,7 @@ PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat,
}
bool PassThroughGrammar::HasRuleForSpan(int, int, int distance) const {
- return (distance < 2);
+ return (distance < 4); // TODO this isn't great, but helps with EPS lattices
}
struct SCFGTranslatorImpl {
diff --git a/decoder/trule.cc b/decoder/trule.cc
index 896f9f3d..c22baae3 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -117,7 +117,7 @@ bool TRule::ReadFromString(const string& line, bool strict, bool mono) {
// use lexer
istringstream il(line);
n_assigned=0;
- RuleLexer::ReadRules(&il,assign_trule,this);
+ RuleLexer::ReadRules(&il,assign_trule,"STRING",this);
if (n_assigned>1)
cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "<<line<<".\n";
return n_assigned;
diff --git a/realtime/README.md b/realtime/README.md
index e5290fc5..1d49a9de 100644
--- a/realtime/README.md
+++ b/realtime/README.md
@@ -4,13 +4,11 @@ cdec Realtime
Code by Michael Denkowski (http://www.cs.cmu.edu/~mdenkows/, mdenkows@cs.cmu.edu)
```
-@misc{denkowski-proposal2013,
- author = {Michael Denkowski},
- title = {Machine Translation for Human Translators},
- year = {2013},
- month = {May},
- day = {30},
- howpublished = {{Ph.D.} Thesis Proposal, Carnegie Mellon University}
+@InProceedings{realtime,
+ author = {Michael Denkowski and Chris Dyer and Alon Lavie},
+ title = {Learning from Post-Editing: Online Model Adaptation for Statistical Machine Translation},
+ booktitle = {Proceedings of the 14th Conference of the European Chapter of the Association for Computational Linguistics}
+ year = {2014},
}
```
diff --git a/realtime/realtime.py b/realtime/realtime.py
index ec15b59d..f99478b0 100755
--- a/realtime/realtime.py
+++ b/realtime/realtime.py
@@ -13,13 +13,11 @@ ABOUT = '''Realtime adaptive translation with cdec (See README.md)
Code by Michael Denkowski
Citation:
-@misc{denkowski-proposal2013,
- author = {Michael Denkowski},
- title = {Machine Translation for Human Translators},
- year = {2013},
- month = {May},
- day = {30},
- howpublished = {{Ph.D.} Thesis Proposal, Carnegie Mellon University}
+@InProceedings{realtime,
+ author = {Michael Denkowski and Chris Dyer and Alon Lavie},
+ title = {Learning from Post-Editing: Online Model Adaptation for Statistical Machine Translation},
+ booktitle = {Proceedings of the 14th Conference of the European Chapter of the Association for Computational Linguistics}
+ year = {2014},
}
'''
diff --git a/realtime/rt/util.py b/realtime/rt/util.py
index 52767dac..a7333bbd 100644
--- a/realtime/rt/util.py
+++ b/realtime/rt/util.py
@@ -4,8 +4,6 @@ import subprocess
import sys
import threading
-from cdec.configobj import ConfigObj
-
SA_INI_FILES = set((
'f_sa_file',
'e_file',
diff --git a/training/crf/mpi_adagrad_optimize.cc b/training/crf/mpi_adagrad_optimize.cc
index 39bd763e..bac57324 100644
--- a/training/crf/mpi_adagrad_optimize.cc
+++ b/training/crf/mpi_adagrad_optimize.cc
@@ -157,11 +157,11 @@ struct TrainingObserver : public DecoderObserver {
void GetGradient(SparseVector<double>* g) const {
g->clear();
-#if HAVE_CXX11
+#if HAVE_CXX11 && (__GNUC_MINOR__ > 4 || __GNUC__ > 4)
for (auto& gi : acc_grad) {
#else
for (FastSparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) {
- pair<unsigned, double>& gi = *it;
+ const pair<unsigned, prob_t>& gi = *it;
#endif
g->set_value(gi.first, -gi.second.as_float());
}
@@ -190,7 +190,7 @@ class AdaGradOptimizer {
G() {}
void update(const SparseVector<double>& g, vector<double>* x, SparseVector<double>* sx) {
if (x->size() > G.size()) G.resize(x->size(), 0.0);
-#if HAVE_CXX11
+#if HAVE_CXX11 && (__GNUC_MINOR__ > 4 || __GNUC__ > 4)
for (auto& gi : g) {
#else
for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it) {
@@ -220,7 +220,7 @@ class AdaGradL1Optimizer {
G.resize(x->size(), 0.0);
u.resize(x->size(), 0.0);
}
-#if HAVE_CXX11
+#if HAVE_CXX11 && (__GNUC_MINOR__ > 4 || __GNUC__ > 4)
for (auto& gi : g) {
#else
for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it) {
@@ -236,11 +236,11 @@ class AdaGradL1Optimizer {
// compute updates (avoid invalidating iterators by putting them all
// in the vector vupdate and applying them after this)
vector<pair<unsigned, double>> vupdate;
-#if HAVE_CXX11
+#if HAVE_CXX11 && (__GNUC_MINOR__ > 4 || __GNUC__ > 4)
for (auto& xi : *sx) {
#else
- for (SparseVector<double>::const_iterator it = sx->begin(); it != sx->end(); ++it) {
- const pair<unsigned,double>& gi = *it;
+ for (SparseVector<double>::iterator it = sx->begin(); it != sx->end(); ++it) {
+ const pair<unsigned,double>& xi = *it;
#endif
double z = fabs(u[xi.first] / t) - lambda;
double s = 1;
diff --git a/word-aligner/force_align.py b/word-aligner/force_align.py
index b03d446e..8386e6a5 100755
--- a/word-aligner/force_align.py
+++ b/word-aligner/force_align.py
@@ -3,6 +3,10 @@
import os
import sys
+# Hook into realtime
+sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'realtime', 'rt'))
+from aligner import ForceAligner
+
def main():
if len(sys.argv[1:]) < 4:
@@ -16,14 +20,14 @@ def main():
sys.stderr.write('where heuristic is one of: (intersect union grow-diag grow-diag-final grow-diag-final-and) default=grow-diag-final-and\n')
sys.exit(2)
- # Hook into realtime
- sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'realtime'))
- from rt.aligner import ForceAligner
-
aligner = ForceAligner(*sys.argv[1:])
-
- for line in sys.stdin:
+
+ while True:
+ line = sys.stdin.readline()
+ if not line:
+ break
sys.stdout.write('{}\n'.format(aligner.align_formatted(line.strip())))
+ sys.stdout.flush()
aligner.close()