summary | refs | log | tree | commit | diff
path: root/klm/lm/builder/corpus_count.cc
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2014-01-28 00:18:37 -0500
committer: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2014-01-28 00:18:37 -0500
commit: 0ee2b44c5c0981358ade9ddab1d083bbe1de5daf (patch)
tree: 2dc8d9b9efb24b3a3b5c2bcaf0b55df30743d151 /klm/lm/builder/corpus_count.cc
parent: 2ac0704a463d45f0bfe23184a1ea9950d60fd546 (diff)
parent: 783c57b2d3312738ddcf992ac55ff750afe7cb47 (diff)
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'klm/lm/builder/corpus_count.cc')
-rw-r--r-- klm/lm/builder/corpus_count.cc 9
1 files changed, 7 insertions, 2 deletions
diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc
index aea93ad1..ccc06efc 100644
--- a/klm/lm/builder/corpus_count.cc
+++ b/klm/lm/builder/corpus_count.cc
@@ -238,12 +238,17 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
const WordIndex end_sentence = vocab.Lookup("</s>");
Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
uint64_t count = 0;
- StringPiece delimiters("\0\t\r ", 4);
+ bool delimiters[256];
+ memset(delimiters, 0, sizeof(delimiters));
+ const char kDelimiterSet[] = "\0\t\n\r ";
+ for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) {
+ delimiters[static_cast<unsigned char>(*i)] = true;
+ }
try {
while(true) {
StringPiece line(from_.ReadLine());
writer.StartSentence();
- for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {
+ for (util::TokenIter<util::BoolCharacter, true> w(line, delimiters); w; ++w) {
WordIndex word = vocab.Lookup(*w);
UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing <unk> in the future.");
writer.Append(word);