From 743f63daf739884051ee5760390420023b07ee26 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 22 Dec 2010 13:15:42 -0600 Subject: fix compound splitter, new features, more training data --- decoder/csplit.cc | 140 +++++++++++++++++++++++++++--------------------------- 1 file changed, 71 insertions(+), 69 deletions(-) (limited to 'decoder/csplit.cc') diff --git a/decoder/csplit.cc b/decoder/csplit.cc index 7d50e3af..4a723822 100644 --- a/decoder/csplit.cc +++ b/decoder/csplit.cc @@ -13,14 +13,16 @@ using namespace std; struct CompoundSplitImpl { CompoundSplitImpl(const boost::program_options::variables_map& conf) : - fugen_elements_(true), // TODO configure + fugen_elements_(true), min_size_(3), kXCAT(TD::Convert("X")*-1), kWORDBREAK_RULE(new TRule("[X] ||| # ||| #")), kTEMPLATE_RULE(new TRule("[X] ||| [X,1] ? ||| [1] ?")), kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")), kFUGEN_S(FD::Convert("FugS")), - kFUGEN_N(FD::Convert("FugN")) {} + kFUGEN_N(FD::Convert("FugN")) { + // TODO: use conf to turn fugenelements on and off + } void PasteTogetherStrings(const vector& chars, const int i, @@ -40,73 +42,73 @@ struct CompoundSplitImpl { void BuildTrellis(const vector& chars, Hypergraph* forest) { -// vector nodes(chars.size()+1, -1); -// nodes[0] = forest->AddNode(kXCAT)->id_; // source -// const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_; -// forest->ConnectEdgeToHeadNode(left_rule, nodes[0]); -// -// const int max_split_ = max(static_cast(chars.size()) - min_size_ + 1, 1); -// cerr << "max: " << max_split_ << " " << " min: " << min_size_ << endl; -// for (int i = min_size_; i < max_split_; ++i) -// nodes[i] = forest->AddNode(kXCAT)->id_; -// assert(nodes.back() == -1); -// nodes.back() = forest->AddNode(kXCAT)->id_; // sink -// -// for (int i = 0; i < max_split_; ++i) { -// if (nodes[i] < 0) continue; -// const int start = min(i + min_size_, static_cast(chars.size())); -// for (int j = start; j <= chars.size(); ++j) { -// if (nodes[j] < 0) continue; -// string yield; -// PasteTogetherStrings(chars, i, j, &yield); -// // cerr << "[" << i << "," << j << "] " << yield << endl; -// TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE)); -// rule->e_[1] = rule->f_[1] = TD::Convert(yield); -// // cerr << rule->AsString() << endl; -// int edge = forest->AddEdge( -// rule, -// Hypergraph::TailNodeVector(1, nodes[i]))->id_; -// forest->ConnectEdgeToHeadNode(edge, nodes[j]); -// forest->edges_[edge].i_ = i; -// forest->edges_[edge].j_ = j; -// -// // handle "fugenelemente" here -// // don't delete "fugenelemente" at the end of words -// if (fugen_elements_ && j != chars.size()) { -// const int len = yield.size(); -// string alt; -// int fid = 0; -// if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') { -// alt = yield.substr(0, len - 2); -// fid = kFUGEN_S; -// } else if (len > (min_size_ + 1) && yield[len-1] == 's') { -// alt = yield.substr(0, len - 1); -// fid = kFUGEN_S; -// } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') { -// alt = yield.substr(0, len - 1); -// fid = kFUGEN_N; -// } -// if (alt.size()) { -// TRulePtr altrule = TRulePtr(new TRule(*rule)); -// altrule->e_[1] = TD::Convert(alt); -// // cerr << altrule->AsString() << endl; -// int edge = forest->AddEdge( -// altrule, -// Hypergraph::TailNodeVector(1, nodes[i]))->id_; -// forest->ConnectEdgeToHeadNode(edge, nodes[j]); -// forest->edges_[edge].feature_values_.set_value(fid, 1.0); -// forest->edges_[edge].i_ = i; -// forest->edges_[edge].j_ = j; -// } -// } -// } -// } -// -// // add goal rule -// Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); -// Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); -// Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); -// forest->ConnectEdgeToHeadNode(hg_edge, goal); + vector nodes(chars.size()+1, -1); + nodes[0] = forest->AddNode(kXCAT)->id_; // source + const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_; + forest->ConnectEdgeToHeadNode(left_rule, nodes[0]); + + const int max_split_ = max(static_cast(chars.size()) - min_size_ + 1, 1); + // cerr << "max: " << max_split_ << " " << " min: " << min_size_ << endl; + for (int i = min_size_; i < max_split_; ++i) + nodes[i] = forest->AddNode(kXCAT)->id_; + assert(nodes.back() == -1); + nodes.back() = forest->AddNode(kXCAT)->id_; // sink + + for (int i = 0; i < max_split_; ++i) { + if (nodes[i] < 0) continue; + const int start = min(i + min_size_, static_cast(chars.size())); + for (int j = start; j <= chars.size(); ++j) { + if (nodes[j] < 0) continue; + string yield; + PasteTogetherStrings(chars, i, j, &yield); + // cerr << "[" << i << "," << j << "] " << yield << endl; + TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE)); + rule->e_[1] = rule->f_[1] = TD::Convert(yield); + // cerr << rule->AsString() << endl; + int edge = forest->AddEdge( + rule, + Hypergraph::TailNodeVector(1, nodes[i]))->id_; + forest->ConnectEdgeToHeadNode(edge, nodes[j]); + forest->edges_[edge].i_ = i; + forest->edges_[edge].j_ = j; + + // handle "fugenelemente" here + // don't delete "fugenelemente" at the end of words + if (fugen_elements_ && j != chars.size()) { + const int len = yield.size(); + string alt; + int fid = 0; + if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') { + alt = yield.substr(0, len - 2); + fid = kFUGEN_S; + } else if (len > (min_size_ + 1) && yield[len-1] == 's') { + alt = yield.substr(0, len - 1); + fid = kFUGEN_S; + } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') { + alt = yield.substr(0, len - 1); + fid = kFUGEN_N; + } + if (alt.size()) { + TRulePtr altrule = TRulePtr(new TRule(*rule)); + altrule->e_[1] = TD::Convert(alt); + // cerr << altrule->AsString() << endl; + int edge = forest->AddEdge( + altrule, + Hypergraph::TailNodeVector(1, nodes[i]))->id_; + forest->ConnectEdgeToHeadNode(edge, nodes[j]); + forest->edges_[edge].feature_values_.set_value(fid, 1.0); + forest->edges_[edge].i_ = i; + forest->edges_[edge].j_ = j; + } + } + } + } + + // add goal rule + Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); + Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); + Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); + forest->ConnectEdgeToHeadNode(hg_edge, goal); } private: const bool fugen_elements_; -- cgit v1.2.3