summaryrefslogtreecommitdiff
path: root/klm/util/stream/line_input.cc
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2013-01-18 17:12:51 +0000
committerKenneth Heafield <github@kheafield.com>2013-01-18 17:12:51 +0000
commit0b9031042500d45a098762f0a930bd6a66a58fac (patch)
tree38903f3e29225aa8d444ee66b6963c7148050fee /klm/util/stream/line_input.cc
parent9d7167751a3712a79ad356764d803106a71ce5e3 (diff)
KenLM dffafbf with lmplz source (but not built)
Diffstat (limited to 'klm/util/stream/line_input.cc')
-rw-r--r--klm/util/stream/line_input.cc52
1 files changed, 52 insertions, 0 deletions
diff --git a/klm/util/stream/line_input.cc b/klm/util/stream/line_input.cc
new file mode 100644
index 00000000..dafa5020
--- /dev/null
+++ b/klm/util/stream/line_input.cc
@@ -0,0 +1,52 @@
+#include "util/stream/line_input.hh"
+
+#include "util/exception.hh"
+#include "util/file.hh"
+#include "util/read_compressed.hh"
+#include "util/stream/chain.hh"
+
+#include <algorithm>
+#include <vector>
+
+namespace util { namespace stream {
+
+void LineInput::Run(const ChainPosition &position) {
+ ReadCompressed reader(fd_);
+ // Holding area for beginning of line to be placed in next block.
+ std::vector<char> carry;
+
+ for (Link block(position); ; ++block) {
+ char *to = static_cast<char*>(block->Get());
+ char *begin = to;
+ char *end = to + position.GetChain().BlockSize();
+ std::copy(carry.begin(), carry.end(), to);
+ to += carry.size();
+ while (to != end) {
+ std::size_t got = reader.Read(to, end - to);
+ if (!got) {
+ // EOF
+ block->SetValidSize(to - begin);
+ ++block;
+ block.Poison();
+ return;
+ }
+ to += got;
+ }
+
+ // Find the last newline.
+ char *newline;
+ for (newline = to - 1; ; --newline) {
+ UTIL_THROW_IF(newline < begin, Exception, "Did not find a newline in " << position.GetChain().BlockSize() << " bytes of input of " << NameFromFD(fd_) << ". Is this a text file?");
+ if (*newline == '\n') break;
+ }
+
+ // Copy everything after the last newline to the carry.
+ carry.clear();
+ carry.resize(to - (newline + 1));
+ std::copy(newline + 1, to, &*carry.begin());
+
+ block->SetValidSize(newline + 1 - begin);
+ }
+}
+
+}} // namespaces