summary refs log tree commit diff
path: root/klm/lm/builder
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2013-01-20 12:31:03 +0000
committer Kenneth Heafield <github@kheafield.com> 2013-01-20 12:31:03 +0000
commit 2753c37d0b59df79be15d88222eb0f2ec6caf903 (patch)
tree 47a4f376b2b96dc6dfc9e526ea31a0970beea64c /klm/lm/builder
parent 816632a5d1e3a5a24c9b3aacb4210ed8b28a9c62 (diff)
Better delimiters, cross-platform fixes
Diffstat (limited to 'klm/lm/builder')
-rw-r--r-- klm/lm/builder/corpus_count.cc 3
1 file changed, 2 insertions, 1 deletion
diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc
index 8c3de57d..abea4ed0 100644
--- a/klm/lm/builder/corpus_count.cc
+++ b/klm/lm/builder/corpus_count.cc
@@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
const WordIndex end_sentence = vocab.Lookup("</s>");
Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
uint64_t count = 0;
+ StringPiece delimiters("\0\t\r ", 4);
try {
while(true) {
StringPiece line(from_.ReadLine());
writer.StartSentence();
- for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) {
+ for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {
WordIndex word = vocab.Lookup(*w);
UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing <unk> in the future.");
writer.Append(word);