diff options
Diffstat (limited to 'klm/lm/builder')
| -rw-r--r-- | klm/lm/builder/corpus_count.cc | 3 | 
1 file changed, 2 insertions, 1 deletion
diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc
index 8c3de57d..abea4ed0 100644
--- a/klm/lm/builder/corpus_count.cc
+++ b/klm/lm/builder/corpus_count.cc
@@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
   const WordIndex end_sentence = vocab.Lookup("</s>");
   Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
   uint64_t count = 0;
+  StringPiece delimiters("\0\t\r ", 4);
   try {
     while(true) {
       StringPiece line(from_.ReadLine());
       writer.StartSentence();
-      for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) {
+      for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {
         WordIndex word = vocab.Lookup(*w);
         UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus.  I plan to support models containing <unk> in the future.");
         writer.Append(word);
