summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2010-12-22 13:15:42 -0600
committer Chris Dyer <cdyer@cs.cmu.edu> 2010-12-22 13:15:42 -0600
commit 743f63daf739884051ee5760390420023b07ee26 (patch)
tree 613185d8cae2e8f32eec5b72406408889ff18fb6
parent b162a853a013fdd964c8d3c0789988222d044ccf (diff)
fix compound splitter, new features, more training data
-rwxr-xr-x compound-split/de/TRAIN 3
-rw-r--r-- compound-split/de/cdec-train.ini 2
-rw-r--r-- compound-split/de/dev.in-ref 119
-rw-r--r-- compound-split/de/weights.ptinit 2
-rw-r--r-- compound-split/de/weights.trained 37
-rw-r--r-- decoder/apply_models.cc 1
-rw-r--r-- decoder/csplit.cc 140
-rw-r--r-- decoder/decoder.cc 25
-rw-r--r-- decoder/ff_csplit.cc 21
-rw-r--r-- training/Makefile.am 9
10 files changed, 247 insertions, 112 deletions
diff --git a/compound-split/de/TRAIN b/compound-split/de/TRAIN
index d586050e..6f7184ea 100755
--- a/compound-split/de/TRAIN
+++ b/compound-split/de/TRAIN
@@ -1,2 +1 @@
-../../training/cluster-ptrain.pl cdec-train.ini dev.in-ref weights.ptinit --gaussian_prior --sigma_squared 1 &> training.log &
-
+~/cdec/training/mpi_batch_optimize -w weights.cur.gz -t dev.in-ref -d cdec-train.ini -M 200
diff --git a/compound-split/de/cdec-train.ini b/compound-split/de/cdec-train.ini
index 58a99106..44f5934d 100644
--- a/compound-split/de/cdec-train.ini
+++ b/compound-split/de/cdec-train.ini
@@ -1,5 +1,5 @@
formalism=csplit
-crf_uniform_empirical=true
+# crf_uniform_empirical=true
intersection_strategy=full
feature_function=CSplit_BasicFeatures large_dict.de.gz badlist.de.gz
feature_function=CSplit_ReverseCharLM charlm.rev.5gm.de.lm.gz
diff --git a/compound-split/de/dev.in-ref b/compound-split/de/dev.in-ref
index a68f0688..83dae731 100644
--- a/compound-split/de/dev.in-ref
+++ b/compound-split/de/dev.in-ref
@@ -619,3 +619,122 @@ teuersten ||| # teuersten
kirchenneubau ||| # kirche neu bau
ostdeutschlands ||| # ost deutschlands
erfolgen ||| # erfolgen
+rumänien ||| # rumänien
+empört ||| # empört
+berlin ||| # berlin
+rumänische ||| # rumänische
+regierung ||| # regierung
+empört ||| # empört
+ankündigung ||| # ankündigung
+deutschlands ||| # deutschlands
+frankreichs ||| # frankreichs
+beitritt ||| # beitritt
+rumäniens ||| # rumäniens
+bulgariens ||| # bulgariens
+schengen ||| # schengen
+nicht ||| # nicht
+zuzustimmen ||| # zuzustimmen
+bukarest ||| # bukarest
+informationen ||| # informationen
+verletzung ||| # verletzung
+vertrags ||| # vertrags
+lissabon ||| # lissabon
+rumänischer ||| # rumänischer
+zollbeamter ||| ((('#',0,1),),(('zoll',0,1),('zollbeamter',0,2),),(('beamter',0,1),),)
+grenze ||| # grenze
+zwischen ||| # zwischen
+rumänien ||| # rumänien
+republik ||| # republik
+moldau ||| # moldau
+dezember ||| # dezember
+regierung ||| # regierung
+bukarest ||| # bukarest
+empört ||| # empört
+treten ||| # treten
+kontrollen ||| # kontrollen
+grenzen ||| # grenzen
+rumänien ||| # rumänien
+bulgarien ||| # bulgarien
+solange ||| # solange
+beizubehalten ||| # beizubehalten
+länder ||| # länder
+unumkehrbare ||| # unumkehrbare
+fortschritte ||| # fortschritte
+korruption ||| # korruption
+organisierte ||| # organisierte
+kriminalität ||| # kriminalität
+vorweisen ||| # vorweisen
+bukarest ||| # bukarest
+informationen ||| # informationen
+dieser ||| # dieser
+zeitung ||| # zeitung
+überlegt ||| # überlegt
+vertragsverletzung ||| ((('#',0,1),),(('vertrag',0,1),('vertrags',0,1),),(('verletzung',0,1),),)
+einzureichen ||| # einzureichen
+sollten ||| # sollten
+deutschland ||| # deutschland
+frankreich ||| # frankreich
+haltung ||| # haltung
+durchsetzen ||| # durchsetzen
+rumäniens ||| # rumäniens
+außenministerium ||| ((('#',0,1),),(('außen',0,1),),(('ministerium',0,1),),)
+spricht ||| # spricht
+unannehmbaren ||| # unannehmbaren
+präzedenzfall ||| ((('#',0,1),),(('präzedenzfall',0,2),('präzedenz',0,1),),(('fall',0,1),),)
+sondern ||| # sondern
+staatspräsident ||| ((('#',0,1),),(('staatspräsident',0,2),('staats',0,1),('staat',0,1),),(('präsident',0,1),),)
+georgi ||| # georgi
+parwanow ||| # parwanow
+verständnis ||| # verständnis
+bulgarien ||| # bulgarien
+verstehen ||| # verstehen
+auflagen ||| # auflagen
+erfüllen ||| # erfüllen
+eigentliche ||| # eigentliche
+erklärung ||| # erklärung
+verzögerung ||| # verzögerung
+mittwoch ||| # mittwoch
+haltung ||| # haltung
+hintergrund ||| # hintergrund
+streits ||| # streits
+regierung ||| # regierung
+ministerpräsident ||| ((('#',0,1),),(('minister',0,1),),(('präsident',0,1),),)
+grenzkontrollen ||| ((('#',0,1),),(('grenz',0,1),),(('kontrollen',0,1),),)
+entfallen ||| # entfallen
+zweiten ||| # zweiten
+weltkrieg ||| ((('#',0,1),),(('welt',0,1),('weltkrieg',0,2),),(('krieg',0,1),),)
+versteckte ||| # versteckte
+abwehr ||| # abwehr
+admirals ||| # admirals
+canaris ||| # canaris
+sprengsätze ||| # sprengsätze
+apfelsinenkisten ||| ((('#',0,1),),(('apfelsinen',0,1),('apfelsine',0,1),),(('kisten',0,1),),)
+britische ||| # britische
+hafenarbeiter ||| ((('#',0,1),),(('hafen',0,1),),(('arbeiter',0,1),),)
+weigerten ||| # weigerten
+schiffe ||| # schiffe
+entladen ||| # entladen
+zeiten ||| # zeiten
+griechischen ||| # griechischen
+militärdiktatur ||| ((('#',0,1),),(('militär',0,1),),(('diktatur',0,1),),)
+warnte ||| # warnte
+widerstandsgruppe ||| ((('#',0,1),),(('widerstand',0,1),('widerstands',0,1),),(('gruppe',0,1),),)
+pfirsiche ||| # pfirsiche
+aprikosen ||| # aprikosen
+vergiftet ||| # vergiftet
+kuklina ||| # kuklina
+trägerin ||| # trägerin
+alternativen ||| # alternativen
+nobelpreises ||| ((('#',0,1),),(('nobel',0,1),),(('preises',0,1),),)
+kämpft ||| # kämpft
+rechte ||| # rechte
+soldaten ||| # soldaten
+russlands ||| # russlands
+online ||| # online
+sprach ||| # sprach
+menschenrechte ||| ((('#',0,1),),(('menschen',0,1),('mensch',0,1),),(('rechte',0,1),),)
+heimat ||| # heimat
+kaufrausch ||| ((('#',0,1),),(('kauf',0,1),),(('rausch',0,1),),)
+kommerzialisierung ||| # kommerzialisierung
+weihnachten ||| # weihnachten
+funktioniert ||| # funktioniert
diff --git a/compound-split/de/weights.ptinit b/compound-split/de/weights.ptinit
index eaaa3899..eaea77ce 100644
--- a/compound-split/de/weights.ptinit
+++ b/compound-split/de/weights.ptinit
@@ -5,9 +5,11 @@ FugS 0
FugN 0
WordCount 0
InDict 0
+InDictSubWord 0
Short 0
Long 0
OOV 0
+OOVSubWord 0
ShortRange 0
HighFreq 0
MedFreq 0
diff --git a/compound-split/de/weights.trained b/compound-split/de/weights.trained
index 359e5cc7..94c6951f 100644
--- a/compound-split/de/weights.trained
+++ b/compound-split/de/weights.trained
@@ -1,17 +1,20 @@
-LettersSq -0.037643555390228831
-LettersSqrt 0.58198736272513085
-RevCharLM 0.45802141843469085
-FugS 0.26570690067173086
-FugN -0.70672252122442492
-WordCount 0.33774557030334018
-InDict 0.23339787529651213
-Short 0.60862824917301594
-Long -0.58675406875713121
-OOV 0.10434769500682411
-ShortRange -1.0221040223076261
-HighFreq -2.9803896632623825
-MedFreq 0.18811013582723696
-Freq -0.26933190242976746
-Bad -2.3217842031714113
-FreqLen1 -0.28996794292058575
-FreqLen2 -0.21944133928835977
+# Objective = 130.351 (eval count=252)
+LettersSq -0.056135510587750022
+LettersSqrt -2.3295721373391776
+RevCharLM 0.36059050723989938
+FugS -0.65163142842679733
+FugN -1.7541906469311515
+WordCount 19.356942545900733
+InDict -15.19336735406667
+InDictSubWord 0.8665049533783179
+Short 1.0429051684475563
+Long -0.66305657970937237
+OOV 35.550309899439839
+OOVSubWord -2.023997552143789
+ShortRange -1.0433366143574028
+HighFreq -4.9882552529226301
+MedFreq -0.091778951485726443
+Freq 0.4285650067397816
+Bad -62
+FreqLen1 -1.8532115534306581
+FreqLen2 -1.2921088742036031
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 18460950..9390c809 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -177,6 +177,7 @@ public:
void Apply() {
int num_nodes = in.nodes_.size();
+ assert(num_nodes >= 2);
int goal_id = num_nodes - 1;
int pregoal = goal_id - 1;
int every = 1;
diff --git a/decoder/csplit.cc b/decoder/csplit.cc
index 7d50e3af..4a723822 100644
--- a/decoder/csplit.cc
+++ b/decoder/csplit.cc
@@ -13,14 +13,16 @@ using namespace std;
struct CompoundSplitImpl {
CompoundSplitImpl(const boost::program_options::variables_map& conf) :
- fugen_elements_(true), // TODO configure
+ fugen_elements_(true),
min_size_(3),
kXCAT(TD::Convert("X")*-1),
kWORDBREAK_RULE(new TRule("[X] ||| # ||| #")),
kTEMPLATE_RULE(new TRule("[X] ||| [X,1] ? ||| [1] ?")),
kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")),
kFUGEN_S(FD::Convert("FugS")),
- kFUGEN_N(FD::Convert("FugN")) {}
+ kFUGEN_N(FD::Convert("FugN")) {
+ // TODO: use conf to turn fugenelements on and off
+ }
void PasteTogetherStrings(const vector<string>& chars,
const int i,
@@ -40,73 +42,73 @@ struct CompoundSplitImpl {
void BuildTrellis(const vector<string>& chars,
Hypergraph* forest) {
-// vector<int> nodes(chars.size()+1, -1);
-// nodes[0] = forest->AddNode(kXCAT)->id_; // source
-// const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_;
-// forest->ConnectEdgeToHeadNode(left_rule, nodes[0]);
-//
-// const int max_split_ = max(static_cast<int>(chars.size()) - min_size_ + 1, 1);
-// cerr << "max: " << max_split_ << " " << " min: " << min_size_ << endl;
-// for (int i = min_size_; i < max_split_; ++i)
-// nodes[i] = forest->AddNode(kXCAT)->id_;
-// assert(nodes.back() == -1);
-// nodes.back() = forest->AddNode(kXCAT)->id_; // sink
-//
-// for (int i = 0; i < max_split_; ++i) {
-// if (nodes[i] < 0) continue;
-// const int start = min(i + min_size_, static_cast<int>(chars.size()));
-// for (int j = start; j <= chars.size(); ++j) {
-// if (nodes[j] < 0) continue;
-// string yield;
-// PasteTogetherStrings(chars, i, j, &yield);
-// // cerr << "[" << i << "," << j << "] " << yield << endl;
-// TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE));
-// rule->e_[1] = rule->f_[1] = TD::Convert(yield);
-// // cerr << rule->AsString() << endl;
-// int edge = forest->AddEdge(
-// rule,
-// Hypergraph::TailNodeVector(1, nodes[i]))->id_;
-// forest->ConnectEdgeToHeadNode(edge, nodes[j]);
-// forest->edges_[edge].i_ = i;
-// forest->edges_[edge].j_ = j;
-//
-// // handle "fugenelemente" here
-// // don't delete "fugenelemente" at the end of words
-// if (fugen_elements_ && j != chars.size()) {
-// const int len = yield.size();
-// string alt;
-// int fid = 0;
-// if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') {
-// alt = yield.substr(0, len - 2);
-// fid = kFUGEN_S;
-// } else if (len > (min_size_ + 1) && yield[len-1] == 's') {
-// alt = yield.substr(0, len - 1);
-// fid = kFUGEN_S;
-// } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') {
-// alt = yield.substr(0, len - 1);
-// fid = kFUGEN_N;
-// }
-// if (alt.size()) {
-// TRulePtr altrule = TRulePtr(new TRule(*rule));
-// altrule->e_[1] = TD::Convert(alt);
-// // cerr << altrule->AsString() << endl;
-// int edge = forest->AddEdge(
-// altrule,
-// Hypergraph::TailNodeVector(1, nodes[i]))->id_;
-// forest->ConnectEdgeToHeadNode(edge, nodes[j]);
-// forest->edges_[edge].feature_values_.set_value(fid, 1.0);
-// forest->edges_[edge].i_ = i;
-// forest->edges_[edge].j_ = j;
-// }
-// }
-// }
-// }
-//
-// // add goal rule
-// Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
-// Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
-// Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
-// forest->ConnectEdgeToHeadNode(hg_edge, goal);
+ vector<int> nodes(chars.size()+1, -1);
+ nodes[0] = forest->AddNode(kXCAT)->id_; // source
+ const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_;
+ forest->ConnectEdgeToHeadNode(left_rule, nodes[0]);
+
+ const int max_split_ = max(static_cast<int>(chars.size()) - min_size_ + 1, 1);
+ // cerr << "max: " << max_split_ << " " << " min: " << min_size_ << endl;
+ for (int i = min_size_; i < max_split_; ++i)
+ nodes[i] = forest->AddNode(kXCAT)->id_;
+ assert(nodes.back() == -1);
+ nodes.back() = forest->AddNode(kXCAT)->id_; // sink
+
+ for (int i = 0; i < max_split_; ++i) {
+ if (nodes[i] < 0) continue;
+ const int start = min(i + min_size_, static_cast<int>(chars.size()));
+ for (int j = start; j <= chars.size(); ++j) {
+ if (nodes[j] < 0) continue;
+ string yield;
+ PasteTogetherStrings(chars, i, j, &yield);
+ // cerr << "[" << i << "," << j << "] " << yield << endl;
+ TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE));
+ rule->e_[1] = rule->f_[1] = TD::Convert(yield);
+ // cerr << rule->AsString() << endl;
+ int edge = forest->AddEdge(
+ rule,
+ Hypergraph::TailNodeVector(1, nodes[i]))->id_;
+ forest->ConnectEdgeToHeadNode(edge, nodes[j]);
+ forest->edges_[edge].i_ = i;
+ forest->edges_[edge].j_ = j;
+
+ // handle "fugenelemente" here
+ // don't delete "fugenelemente" at the end of words
+ if (fugen_elements_ && j != chars.size()) {
+ const int len = yield.size();
+ string alt;
+ int fid = 0;
+ if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') {
+ alt = yield.substr(0, len - 2);
+ fid = kFUGEN_S;
+ } else if (len > (min_size_ + 1) && yield[len-1] == 's') {
+ alt = yield.substr(0, len - 1);
+ fid = kFUGEN_S;
+ } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') {
+ alt = yield.substr(0, len - 1);
+ fid = kFUGEN_N;
+ }
+ if (alt.size()) {
+ TRulePtr altrule = TRulePtr(new TRule(*rule));
+ altrule->e_[1] = TD::Convert(alt);
+ // cerr << altrule->AsString() << endl;
+ int edge = forest->AddEdge(
+ altrule,
+ Hypergraph::TailNodeVector(1, nodes[i]))->id_;
+ forest->ConnectEdgeToHeadNode(edge, nodes[j]);
+ forest->edges_[edge].feature_values_.set_value(fid, 1.0);
+ forest->edges_[edge].i_ = i;
+ forest->edges_[edge].j_ = j;
+ }
+ }
+ }
+ }
+
+ // add goal rule
+ Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
+ Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
+ Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+ forest->ConnectEdgeToHeadNode(hg_edge, goal);
}
private:
const bool fugen_elements_;
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 3551b584..e28080aa 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -279,7 +279,6 @@ struct DecoderImpl {
bool encode_b64;
bool kbest;
bool unique_kbest;
- bool crf_uniform_empirical;
bool get_oracle_forest;
shared_ptr<WriteFile> extract_file;
int combine_size;
@@ -379,7 +378,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
("max_translation_sample,X", po::value<int>(), "Sample the max translation from the chart")
("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")
("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)")
- ("crf_uniform_empirical", "If there are multple references use (i.e., lattice) a uniform distribution rather than posterior weighting a la EM")
("get_oracle_forest,o", "Calculate rescored hypregraph using approximate BLEU scoring of rules")
("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")
@@ -611,7 +609,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
encode_b64 = str("vector_format",conf) == "b64";
kbest = conf.count("k_best");
unique_kbest = conf.count("unique_k_best");
- crf_uniform_empirical = conf.count("crf_uniform_empirical");
get_oracle_forest = conf.count("get_oracle_forest");
cfg_options.Validate();
@@ -842,14 +839,12 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
if (has_ref) {
if (HG::Intersect(ref, &forest)) {
if (!SILENT) forest_stats(forest," Constr. forest",show_tree_structure,show_features,feature_weights,oracle.show_derivation);
- if (crf_uniform_empirical) {
- if (!SILENT) cerr << " USING UNIFORM WEIGHTS\n";
- for (int i = 0; i < forest.edges_.size(); ++i)
- forest.edges_[i].edge_prob_=prob_t::One();
- } else {
- forest.Reweight(feature_weights);
- if (!SILENT) cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl;
- }
+// if (crf_uniform_empirical) {
+// if (!SILENT) cerr << " USING UNIFORM WEIGHTS\n";
+// for (int i = 0; i < forest.edges_.size(); ++i)
+// forest.edges_[i].edge_prob_=prob_t::One(); }
+ forest.Reweight(feature_weights);
+ if (!SILENT) cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl;
if (conf.count("show_partition")) {
const prob_t z = Inside<prob_t, EdgeProb>(forest);
cerr << " Contst. partition log(Z): " << log(z) << endl;
@@ -878,11 +873,9 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
if (write_gradient) {
const prob_t ref_z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp);
ref_exp /= ref_z;
- if (crf_uniform_empirical) {
- log_ref_z = ref_exp.dot(feature_weights);
- } else {
- log_ref_z = log(ref_z);
- }
+// if (crf_uniform_empirical)
+// log_ref_z = ref_exp.dot(feature_weights);
+ log_ref_z = log(ref_z);
//cerr << " MODEL LOG Z: " << log_z << endl;
//cerr << " EMPIRICAL LOG Z: " << log_ref_z << endl;
if ((log_z - log_ref_z) < kMINUS_EPSILON) {
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 1485009b..204b7ce6 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -22,9 +22,11 @@ struct BasicCSplitFeaturesImpl {
letters_sq_(FD::Convert("LettersSq")),
letters_sqrt_(FD::Convert("LettersSqrt")),
in_dict_(FD::Convert("InDict")),
+ in_dict_sub_word_(FD::Convert("InDictSubWord")),
short_(FD::Convert("Short")),
long_(FD::Convert("Long")),
oov_(FD::Convert("OOV")),
+ oov_sub_word_(FD::Convert("OOVSubWord")),
short_range_(FD::Convert("ShortRange")),
high_freq_(FD::Convert("HighFreq")),
med_freq_(FD::Convert("MedFreq")),
@@ -52,15 +54,18 @@ struct BasicCSplitFeaturesImpl {
}
void TraversalFeaturesImpl(const Hypergraph::Edge& edge,
+ const int src_word_size,
SparseVector<double>* features) const;
const int word_count_;
const int letters_sq_;
const int letters_sqrt_;
const int in_dict_;
+ const int in_dict_sub_word_;
const int short_;
const int long_;
const int oov_;
+ const int oov_sub_word_;
const int short_range_;
const int high_freq_;
const int med_freq_;
@@ -77,7 +82,9 @@ BasicCSplitFeatures::BasicCSplitFeatures(const string& param) :
void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
const Hypergraph::Edge& edge,
+ const int src_word_length,
SparseVector<double>* features) const {
+ const bool subword = (edge.i_ > 0) || (edge.j_ < src_word_length);
features->set_value(word_count_, 1.0);
features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_));
features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_));
@@ -108,8 +115,10 @@ void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
if (freq) {
features->set_value(freq_, freq);
features->set_value(in_dict_, 1.0);
+ if (subword) features->set_value(in_dict_sub_word_, 1.0);
} else {
features->set_value(oov_, 1.0);
+ if (subword) features->set_value(oov_sub_word_, 1.0);
freq = 99.0f;
}
if (bad_words_.count(word) != 0)
@@ -143,7 +152,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl(
(void) estimated_features;
if (edge.Arity() == 0) return;
if (edge.rule_->EWords() != 1) return;
- pimpl_->TraversalFeaturesImpl(edge, features);
+ pimpl_->TraversalFeaturesImpl(edge, smeta.GetSourceLattice().size(), features);
}
struct ReverseCharLMCSplitFeatureImpl {
@@ -208,9 +217,17 @@ void ReverseCharLMCSplitFeature::TraversalFeaturesImpl(
if (edge.rule_->EWords() != 1) return;
const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_);
features->set_value(fid_, lpp);
+#if 0
WordID neighbor_word = 0;
const WordID word = edge.rule_->e_[1];
-#if 0
+ const char* sword = TD::Convert(word);
+ const int len = strlen(sword);
+ int cur = 0;
+ int chars = 0;
+ while(cur < len) {
+ cur += UTF8Len(sword[cur]);
+ ++chars;
+ }
if (chars > 4 && (sword[0] == 's' || sword[0] == 'n')) {
neighbor_word = TD::Convert(string(&sword[1]));
}
diff --git a/training/Makefile.am b/training/Makefile.am
index cb17aeff..8218ff0a 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -9,9 +9,8 @@ bin_PROGRAMS = \
plftools \
collapse_weights \
cllh_filter_grammar \
- mpi_online_optimize
-
-# mpi_batch_optimize
+ mpi_online_optimize \
+ mpi_batch_optimize
noinst_PROGRAMS = \
lbfgs_test \
@@ -22,8 +21,8 @@ TESTS = lbfgs_test optimize_test
mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc
mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-#mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
-#mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
+mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
if MPI
bin_PROGRAMS += compute_cllh