diff options
author | Patrick Simianer <p@simianer.de> | 2014-01-28 15:35:31 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-01-28 15:35:31 +0100 |
commit | c83f665cb7efbbfb0fdfa12203b09ba60e365d25 (patch) | |
tree | d9132aaf35e696a52c5e09430ae2889b033cdacb /klm/lm/builder/corpus_count.cc | |
parent | 85088dc6e09d4e91038aea46e8d20b5c34053b5f (diff) | |
parent | 3e22fcc3569a2855f691be4e3ee81f644b926c04 (diff) |
resolve conflict in mira
Diffstat (limited to 'klm/lm/builder/corpus_count.cc')
-rw-r--r-- | klm/lm/builder/corpus_count.cc | 9 |
1 file changed, 7 insertions, 2 deletions
diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc index aea93ad1..ccc06efc 100644 --- a/klm/lm/builder/corpus_count.cc +++ b/klm/lm/builder/corpus_count.cc @@ -238,12 +238,17 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) { const WordIndex end_sentence = vocab.Lookup("</s>"); Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); uint64_t count = 0; - StringPiece delimiters("\0\t\r ", 4); + bool delimiters[256]; + memset(delimiters, 0, sizeof(delimiters)); + const char kDelimiterSet[] = "\0\t\n\r "; + for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) { + delimiters[static_cast<unsigned char>(*i)] = true; + } try { while(true) { StringPiece line(from_.ReadLine()); writer.StartSentence(); - for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) { + for (util::TokenIter<util::BoolCharacter, true> w(line, delimiters); w; ++w) { WordIndex word = vocab.Lookup(*w); UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing <unk> in the future."); writer.Append(word); |