From dc16aa2accc7d9033d9c31c7bbc5e581d43a5101 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 20 Jan 2013 12:31:03 +0000 Subject: Better delimiters, cross-platform fixes --- klm/lm/builder/corpus_count.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'klm/lm/builder') diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc index 8c3de57d..abea4ed0 100644 --- a/klm/lm/builder/corpus_count.cc +++ b/klm/lm/builder/corpus_count.cc @@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) { const WordIndex end_sentence = vocab.Lookup(""); Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); uint64_t count = 0; + StringPiece delimiters("\0\t\r ", 4); try { while(true) { StringPiece line(from_.ReadLine()); writer.StartSentence(); - for (util::TokenIter w(line, " \t"); w; ++w) { + for (util::TokenIter w(line, delimiters); w; ++w) { WordIndex word = vocab.Lookup(*w); UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing in the future."); writer.Append(word); -- cgit v1.2.3