summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@Chriss-MacBook-Air.local>2013-03-20 12:24:01 -0400
committerChris Dyer <cdyer@Chriss-MacBook-Air.local>2013-03-20 12:24:01 -0400
commit39e6cc773c7e723ac2a23be51b3c15ee3c9a70d5 (patch)
tree5d406e30b602820aacdfb24eb7975bc45ffac1cd
parent3aeab176d9068b13e3ca3394be4f9089f5952517 (diff)
n-gram word class features
-rw-r--r--decoder/ff_ngrams.cc68
1 files changed, 63 insertions, 5 deletions
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index 9c13fdbb..d337b28b 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -60,7 +60,7 @@ namespace {
}
}
-static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator) {
+static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator, string* cluster_file) {
vector<string> const& argv=SplitOnWhitespace(in);
*explicit_markers = false;
*order = 3;
@@ -103,6 +103,10 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order,
LMSPEC_NEXTARG;
prefixes[5] = *i;
break;
+ case 'c':
+ LMSPEC_NEXTARG;
+ *cluster_file = *i;
+ break;
case 'S':
LMSPEC_NEXTARG;
target_separator = *i;
@@ -124,6 +128,7 @@ usage:
<< "NgramFeatures Usage: \n"
<< " feature_function=NgramFeatures filename.lm [-x] [-o <order>] \n"
+ << " [-c <cluster-file>]\n"
<< " [-U <unigram-prefix>] [-B <bigram-prefix>][-T <trigram-prefix>]\n"
<< " [-4 <4-gram-prefix>] [-5 <5-gram-prefix>] [-S <separator>]\n\n"
@@ -203,6 +208,12 @@ class NgramDetectorImpl {
SetFlag(flag, HAS_FULL_CONTEXT, state);
}
+ WordID MapToClusterIfNecessary(WordID w) const {
+ if (cluster_map.size() == 0) return w;
+ if (w >= cluster_map.size()) return kCDEC_UNK;
+ return cluster_map[w];
+ }
+
void FireFeatures(const State<5>& state, WordID cur, SparseVector<double>* feats) {
FidTree* ft = &fidroot_;
int n = 0;
@@ -285,7 +296,7 @@ class NgramDetectorImpl {
context_complete = true;
}
} else { // handle terminal
- const WordID cur_word = e[j];
+ const WordID cur_word = MapToClusterIfNecessary(e[j]);
SparseVector<double> p;
if (cur_word == kSOS_) {
state = BeginSentenceState();
@@ -348,9 +359,52 @@ class NgramDetectorImpl {
}
}
+ void ReadClusterFile(const string& clusters) {
+ ReadFile rf(clusters);
+ istream& in = *rf.stream();
+ string line;
+ int lc = 0;
+ string cluster;
+ string word;
+ while(getline(in, line)) {
+ ++lc;
+ if (line.size() == 0) continue;
+ if (line[0] == '#') continue;
+ unsigned cend = 1;
+ while((line[cend] != ' ' && line[cend] != '\t') && cend < line.size()) {
+ ++cend;
+ }
+ if (cend == line.size()) {
+ cerr << "Line " << lc << " in " << clusters << " malformed: " << line << endl;
+ abort();
+ }
+ unsigned wbeg = cend + 1;
+ while((line[wbeg] == ' ' || line[wbeg] == '\t') && wbeg < line.size()) {
+ ++wbeg;
+ }
+ if (wbeg == line.size()) {
+ cerr << "Line " << lc << " in " << clusters << " malformed: " << line << endl;
+ abort();
+ }
+ unsigned wend = wbeg + 1;
+ while((line[wend] != ' ' && line[wend] != '\t') && wend < line.size()) {
+ ++wend;
+ }
+ const WordID clusterid = TD::Convert(line.substr(0, cend));
+ const WordID wordid = TD::Convert(line.substr(wbeg, wend - wbeg));
+ if (wordid >= cluster_map.size())
+ cluster_map.resize(wordid + 10, kCDEC_UNK);
+ cluster_map[wordid] = clusterid;
+ }
+ cluster_map[kSOS_] = kSOS_;
+ cluster_map[kEOS_] = kEOS_;
+ }
+
+ vector<WordID> cluster_map;
+
public:
explicit NgramDetectorImpl(bool explicit_markers, unsigned order,
- vector<string>& prefixes, string& target_separator) :
+ vector<string>& prefixes, string& target_separator, const string& clusters) :
kCDEC_UNK(TD::Convert("<unk>")) ,
add_sos_eos_(!explicit_markers) {
order_ = order;
@@ -369,6 +423,9 @@ class NgramDetectorImpl {
dummy_rule_.reset(new TRule("[DUMMY] ||| [BOS] [DUMMY] ||| [1] [2] </s> ||| X=0"));
kSOS_ = TD::Convert("<s>");
kEOS_ = TD::Convert("</s>");
+
+ if (clusters.size())
+ ReadClusterFile(clusters);
}
~NgramDetectorImpl() {
@@ -409,9 +466,10 @@ NgramDetector::NgramDetector(const string& param) {
vector<string> prefixes;
bool explicit_markers = false;
unsigned order = 3;
- ParseArgs(param, &explicit_markers, &order, prefixes, target_separator);
+ string clusters;
+ ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters);
pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes,
- target_separator);
+ target_separator, clusters);
SetStateSize(pimpl_->ReserveStateSize());
}