summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-12-14 12:48:26 -0800
committerKenneth Heafield <github@kheafield.com>2012-12-14 12:48:26 -0800
commitdd0fdabb1db41a4230e487c80b61ace9697f150d (patch)
tree82686a3461645f83343a1926482007946d5a542c
parent5d42134cc676278189b0f77708908542fbb5ccc9 (diff)
Updated kenlm
-rw-r--r--configure.ac2
-rw-r--r--klm/lm/binary_format.cc21
-rw-r--r--klm/lm/config.cc1
-rw-r--r--klm/lm/config.hh59
-rw-r--r--klm/lm/max_order.hh2
-rw-r--r--klm/lm/model.cc30
-rw-r--r--klm/lm/search_trie.cc47
-rw-r--r--klm/util/Makefile.am1
-rw-r--r--klm/util/exception.hh8
-rw-r--r--klm/util/file.cc38
-rw-r--r--klm/util/file.hh8
-rw-r--r--klm/util/file_piece.cc66
-rw-r--r--klm/util/file_piece.hh41
-rw-r--r--klm/util/file_piece_test.cc4
-rw-r--r--klm/util/have.hh12
-rw-r--r--klm/util/joint_sort.hh4
-rw-r--r--klm/util/read_compressed.cc403
-rw-r--r--klm/util/read_compressed.hh74
-rw-r--r--klm/util/read_compressed_test.cc94
-rw-r--r--klm/util/scoped.hh65
-rw-r--r--klm/util/string_piece.hh19
-rw-r--r--klm/util/tokenize_piece.hh14
22 files changed, 781 insertions, 232 deletions
diff --git a/configure.ac b/configure.ac
index f1b9d132..f4650ca4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -87,7 +87,7 @@ AC_CHECK_HEADER(lzma.h,
AC_PROG_INSTALL
-CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H"
+CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6"
# core cdec stuff
AC_CONFIG_FILES([Makefile])
diff --git a/klm/lm/binary_format.cc b/klm/lm/binary_format.cc
index efa67056..39c4a9b6 100644
--- a/klm/lm/binary_format.cc
+++ b/klm/lm/binary_format.cc
@@ -16,11 +16,11 @@ namespace ngram {
namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
-// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
+// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
const long int kMagicVersion = 5;
-// Old binary files built on 32-bit machines have this header.
+// Old binary files built on 32-bit machines have this header.
// TODO: eliminate with next binary release.
struct OldSanity {
char magic[sizeof(kMagicBytes)];
@@ -39,7 +39,7 @@ struct OldSanity {
};
-// Test values aligned to 8 bytes.
+// Test values aligned to 8 bytes.
struct Sanity {
char magic[ALIGN8(sizeof(kMagicBytes))];
float zero_f, one_f, minus_half_f;
@@ -101,7 +101,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
if (config.write_mmap) {
- // Grow the file to accomodate the search, using zeros.
+ // Grow the file to accomodate the search, using zeros.
try {
util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
} catch (util::ErrnoException &e) {
@@ -114,7 +114,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
return reinterpret_cast<uint8_t*>(backing.search.get());
}
// mmap it now.
- // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
+ // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
std::size_t page_size = util::SizePage();
std::size_t alignment_cruft = adjusted_vocab % page_size;
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
@@ -122,7 +122,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
} else {
util::MapAnonymous(memory_size, backing.search);
return reinterpret_cast<uint8_t*>(backing.search.get());
- }
+ }
}
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
@@ -140,7 +140,7 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
util::FSyncOrThrow(backing.file.get());
break;
}
- // header and vocab share the same mmap. The header is written here because we know the counts.
+ // header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params = Parameters();
params.counts = counts;
params.fixed.order = counts.size();
@@ -160,7 +160,7 @@ namespace detail {
bool IsBinaryFormat(int fd) {
const uint64_t size = util::SizeFile(fd);
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
- // Try reading the header.
+ // Try reading the header.
util::scoped_memory memory;
try {
util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
@@ -214,7 +214,7 @@ void SeekPastHeader(int fd, const Parameters &params) {
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing) {
const uint64_t file_size = util::SizeFile(backing.file.get());
- // The header is smaller than a page, so we have to map the whole header as well.
+ // The header is smaller than a page, so we have to map the whole header as well.
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
@@ -233,7 +233,8 @@ void ComplainAboutARPA(const Config &config, ModelType model_type) {
if (config.write_mmap || !config.messages) return;
if (config.arpa_complain == Config::ALL) {
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
- } else if (config.arpa_complain == Config::EXPENSIVE && model_type == TRIE_SORTED) {
+ } else if (config.arpa_complain == Config::EXPENSIVE &&
+ (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
}
}
diff --git a/klm/lm/config.cc b/klm/lm/config.cc
index f9d988ca..9520c41c 100644
--- a/klm/lm/config.cc
+++ b/klm/lm/config.cc
@@ -6,6 +6,7 @@ namespace lm {
namespace ngram {
Config::Config() :
+ show_progress(true),
messages(&std::cerr),
enumerate_vocab(NULL),
unknown_missing(COMPLAIN),
diff --git a/klm/lm/config.hh b/klm/lm/config.hh
index 739cee9c..0de7b7c6 100644
--- a/klm/lm/config.hh
+++ b/klm/lm/config.hh
@@ -11,46 +11,52 @@
/* Configuration for ngram model. Separate header to reduce pollution. */
namespace lm {
-
+
class EnumerateVocab;
namespace ngram {
struct Config {
- // EFFECTIVE FOR BOTH ARPA AND BINARY READS
+ // EFFECTIVE FOR BOTH ARPA AND BINARY READS
+
+ // (default true) print progress bar to messages
+ bool show_progress;
// Where to log messages including the progress bar. Set to NULL for
// silence.
std::ostream *messages;
+ std::ostream *ProgressMessages() const {
+ return show_progress ? messages : 0;
+ }
+
// This will be called with every string in the vocabulary. See
// enumerate_vocab.hh for more detail. Config does not take ownership; you
- // are still responsible for deleting it (or stack allocating).
+ // are still responsible for deleting it (or stack allocating).
EnumerateVocab *enumerate_vocab;
-
// ONLY EFFECTIVE WHEN READING ARPA
- // What to do when <unk> isn't in the provided model.
+ // What to do when <unk> isn't in the provided model.
WarningAction unknown_missing;
- // What to do when <s> or </s> is missing from the model.
- // If THROW_UP, the exception will be of type util::SpecialWordMissingException.
+ // What to do when <s> or </s> is missing from the model.
+ // If THROW_UP, the exception will be of type util::SpecialWordMissingException.
WarningAction sentence_marker_missing;
// What to do with a positive log probability. For COMPLAIN and SILENT, map
- // to 0.
+ // to 0.
WarningAction positive_log_probability;
- // The probability to substitute for <unk> if it's missing from the model.
+ // The probability to substitute for <unk> if it's missing from the model.
// No effect if the model has <unk> or unknown_missing == THROW_UP.
float unknown_missing_logprob;
// Size multiplier for probing hash table. Must be > 1. Space is linear in
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
- // for sorted variant.
+ // for sorted variant.
// If you find yourself setting this to a low number, consider using the
- // TrieModel which has lower memory consumption.
+ // TrieModel which has lower memory consumption.
float probing_multiplier;
// Amount of memory to use for building. The actual memory usage will be
@@ -58,10 +64,10 @@ struct Config {
// models.
std::size_t building_memory;
- // Template for temporary directory appropriate for passing to mkdtemp.
+ // Template for temporary directory appropriate for passing to mkdtemp.
// The characters XXXXXX are appended before passing to mkdtemp. Only
// applies to trie. If NULL, defaults to write_mmap. If that's NULL,
- // defaults to input file name.
+ // defaults to input file name.
const char *temporary_directory_prefix;
// Level of complaining to do when loading from ARPA instead of binary format.
@@ -69,49 +75,46 @@ struct Config {
ARPALoadComplain arpa_complain;
// While loading an ARPA file, also write out this binary format file. Set
- // to NULL to disable.
+ // to NULL to disable.
const char *write_mmap;
enum WriteMethod {
- WRITE_MMAP, // Map the file directly.
- WRITE_AFTER // Write after we're done.
+ WRITE_MMAP, // Map the file directly.
+ WRITE_AFTER // Write after we're done.
};
WriteMethod write_method;
- // Include the vocab in the binary file? Only effective if write_mmap != NULL.
+ // Include the vocab in the binary file? Only effective if write_mmap != NULL.
bool include_vocab;
- // Left rest options. Only used when the model includes rest costs.
+ // Left rest options. Only used when the model includes rest costs.
enum RestFunction {
REST_MAX, // Maximum of any score to the left
- REST_LOWER, // Use lower-order files given below.
+ REST_LOWER, // Use lower-order files given below.
};
RestFunction rest_function;
- // Only used for REST_LOWER.
+ // Only used for REST_LOWER.
std::vector<std::string> rest_lower_files;
-
// Quantization options. Only effective for QuantTrieModel. One value is
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
- // to quantize (and one of the remaining backoffs will be 0).
+ // to quantize (and one of the remaining backoffs will be 0).
uint8_t prob_bits, backoff_bits;
// Bhiksha compression (simple form). Only works with trie.
uint8_t pointer_bhiksha_bits;
-
-
+
// ONLY EFFECTIVE WHEN READING BINARY
-
+
// How to get the giant array into memory: lazy mmap, populate, read etc.
- // See util/mmap.hh for details of MapMethod.
+ // See util/mmap.hh for details of MapMethod.
util::LoadMethod load_method;
-
- // Set defaults.
+ // Set defaults.
Config();
};
diff --git a/klm/lm/max_order.hh b/klm/lm/max_order.hh
index ea0dea46..3eb97ccd 100644
--- a/klm/lm/max_order.hh
+++ b/klm/lm/max_order.hh
@@ -7,5 +7,3 @@
#ifndef KENLM_ORDER_MESSAGE
#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh."
#endif
-
-#define KENLM_MAX_ORDER 5
diff --git a/klm/lm/model.cc b/klm/lm/model.cc
index fc61efee..a40fd2fb 100644
--- a/klm/lm/model.cc
+++ b/klm/lm/model.cc
@@ -37,7 +37,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
LoadLM(file, config, *this);
- // g++ prints warnings unless these are fully initialized.
+ // g++ prints warnings unless these are fully initialized.
State begin_sentence = State();
begin_sentence.length = 1;
begin_sentence.words[0] = vocab_.BeginSentence();
@@ -69,8 +69,8 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
}
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
- // Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
- util::FilePiece f(backing_.file.release(), file, config.messages);
+ // Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
+ util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
try {
std::vector<uint64_t> counts;
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
@@ -80,7 +80,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
- // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
+ // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
if (config.write_mmap) {
@@ -95,7 +95,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
if (!vocab_.SawUnk()) {
assert(config.unknown_missing != THROW_UP);
- // Default probabilities for unknown.
+ // Default probabilities for unknown.
search_.UnknownUnigram().backoff = 0.0;
search_.UnknownUnigram().prob = config.unknown_missing_logprob;
}
@@ -147,7 +147,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
}
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const {
- // Generate a state from context.
+ // Generate a state from context.
context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
if (context_rend == context_rbegin) {
out_state.length = 0;
@@ -191,7 +191,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
ret.rest = ptr.Rest();
ret.prob = ptr.Prob();
ret.extend_left = extend_pointer;
- // If this function is called, then it does depend on left words.
+ // If this function is called, then it does depend on left words.
ret.independent_left = false;
}
float subtract_me = ret.rest;
@@ -199,7 +199,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
next_use = extend_length;
ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret);
next_use -= extend_length;
- // Charge backoffs.
+ // Charge backoffs.
for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
ret.prob -= subtract_me;
ret.rest -= subtract_me;
@@ -209,7 +209,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
namespace {
// Do a paraonoid copy of history, assuming new_word has already been copied
// (hence the -1). out_state.length could be zero so I avoided using
-// std::copy.
+// std::copy.
void CopyRemainingHistory(const WordIndex *from, State &out_state) {
WordIndex *out = out_state.words + 1;
const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1;
@@ -217,10 +217,10 @@ void CopyRemainingHistory(const WordIndex *from, State &out_state) {
}
} // namespace
-/* Ugly optimized function. Produce a score excluding backoff.
- * The search goes in increasing order of ngram length.
+/* Ugly optimized function. Produce a score excluding backoff.
+ * The search goes in increasing order of ngram length.
* Context goes backward, so context_begin is the word immediately preceeding
- * new_word.
+ * new_word.
*/
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff(
const WordIndex *const context_rbegin,
@@ -229,7 +229,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
State &out_state) const {
assert(new_word < vocab_.Bound());
FullScoreReturn ret;
- // ret.ngram_length contains the last known non-blank ngram length.
+ // ret.ngram_length contains the last known non-blank ngram length.
ret.ngram_length = 1;
typename Search::Node node;
@@ -238,9 +238,9 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
ret.prob = uni.Prob();
ret.rest = uni.Rest();
- // This is the length of the context that should be used for continuation to the right.
+ // This is the length of the context that should be used for continuation to the right.
out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
- // We'll write the word anyway since it will probably be used and does no harm being there.
+ // We'll write the word anyway since it will probably be used and does no harm being there.
out_state.words[0] = new_word;
if (context_rbegin == context_rend) return ret;
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index debcfd07..1b0d9b26 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -55,7 +55,7 @@ struct ProbPointer {
uint64_t index;
};
-// Array of n-grams and float indices.
+// Array of n-grams and float indices.
class BackoffMessages {
public:
void Init(std::size_t entry_size) {
@@ -100,7 +100,7 @@ class BackoffMessages {
void Apply(float *const *const base, RecordReader &reader) {
FinishedAdding();
if (current_ == allocated_) return;
- // We'll also use the same buffer to record messages to blanks that they extend.
+ // We'll also use the same buffer to record messages to blanks that they extend.
WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);
for (reader.Rewind(); reader && (current_ != allocated_); ) {
@@ -109,7 +109,7 @@ class BackoffMessages {
++reader;
break;
case 1:
- // Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends.
+ // Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends.
for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;
current_ += entry_size_;
break;
@@ -126,7 +126,7 @@ class BackoffMessages {
break;
}
}
- // Now this is a list of blanks that extend right.
+ // Now this is a list of blanks that extend right.
entry_size_ = sizeof(WordIndex) * order;
Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));
current_ = (uint8_t*)backing_.get();
@@ -153,7 +153,7 @@ class BackoffMessages {
private:
void FinishedAdding() {
Resize(current_ - (uint8_t*)backing_.get());
- // Sort requests in same order as files.
+ // Sort requests in same order as files.
std::sort(
util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
util::SizedIterator(util::SizedProxy(current_, entry_size_)),
@@ -220,7 +220,7 @@ class SRISucks {
}
private:
- // This used to be one array. Then I needed to separate it by order for quantization to work.
+ // This used to be one array. Then I needed to separate it by order for quantization to work.
std::vector<float> values_[KENLM_MAX_ORDER - 1];
BackoffMessages messages_[KENLM_MAX_ORDER - 1];
@@ -253,7 +253,7 @@ class FindBlanks {
++counts_.back();
}
- // Unigrams wrote one past.
+ // Unigrams wrote one past.
void Cleanup() {
--counts_[0];
}
@@ -270,15 +270,15 @@ class FindBlanks {
SRISucks &sri_;
};
-// Phase to actually write n-grams to the trie.
+// Phase to actually write n-grams to the trie.
template <class Quant, class Bhiksha> class WriteEntries {
public:
- WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
+ WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
contexts_(contexts),
quant_(quant),
unigrams_(unigrams),
middle_(middle),
- longest_(longest),
+ longest_(longest),
bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
order_(order),
sri_(sri) {}
@@ -328,7 +328,7 @@ struct Gram {
const WordIndex *begin, *end;
- // For queue, this is the direction we want.
+ // For queue, this is the direction we want.
bool operator<(const Gram &other) const {
return std::lexicographical_compare(other.begin, other.end, begin, end);
}
@@ -353,7 +353,7 @@ template <class Doing> class BlankManager {
been_length_ = length;
return;
}
- // There are blanks to insert starting with order blank.
+ // There are blanks to insert starting with order blank.
unsigned char blank = cur - to + 1;
UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
const float *lower_basis;
@@ -363,7 +363,7 @@ template <class Doing> class BlankManager {
assert(*lower_basis != kBadProb);
doing_.MiddleBlank(blank, to, based_on, *lower_basis);
*pre = *cur;
- // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
+ // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
basis_[blank - 1] = kBadProb;
}
*pre = *cur;
@@ -377,7 +377,7 @@ template <class Doing> class BlankManager {
unsigned char been_length_;
float basis_[KENLM_MAX_ORDER];
-
+
Doing &doing_;
};
@@ -451,7 +451,7 @@ template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, Re
}
void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
- // Fill unigram probabilities.
+ // Fill unigram probabilities.
try {
rewind(file);
for (WordIndex i = 0; i < unigram_count; ++i) {
@@ -486,7 +486,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
util::scoped_memory unigrams;
MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
- RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
+ RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder);
fixed_counts = finder.Counts();
}
unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
@@ -504,7 +504,8 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
inputs[i-2].Rewind();
}
if (Quant::kTrain) {
- util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), config.messages, "Quantizing");
+ util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0),
+ config.ProgressMessages(), "Quantizing");
for (unsigned char i = 2; i < counts.size(); ++i) {
TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);
}
@@ -519,13 +520,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
for (unsigned char i = 2; i <= counts.size(); ++i) {
inputs[i-2].Rewind();
}
- // Fill entries except unigram probabilities.
+ // Fill entries except unigram probabilities.
{
WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
- RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer);
+ RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);
}
- // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
+ // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
for (unsigned char order = 2; order <= counts.size(); ++order) {
const RecordReader &context = contexts[order - 2];
if (context) {
@@ -541,13 +542,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
}
/* Set ending offsets so the last entry will be sized properly */
- // Last entry for unigrams was already set.
+ // Last entry for unigrams was already set.
if (out.middle_begin_ != out.middle_end_) {
for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {
i->FinishedLoading((i+1)->InsertIndex(), config);
}
(out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config);
- }
+ }
}
template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
@@ -595,7 +596,7 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::Initializ
} else {
temporary_prefix = file;
}
- // At least 1MB sorting memory.
+ // At least 1MB sorting memory.
SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
index 5306850f..a676bdb3 100644
--- a/klm/util/Makefile.am
+++ b/klm/util/Makefile.am
@@ -27,6 +27,7 @@ libklm_util_a_SOURCES = \
mmap.cc \
murmur_hash.cc \
pool.cc \
+ read_compressed.cc \
string_piece.cc \
usage.cc
diff --git a/klm/util/exception.hh b/klm/util/exception.hh
index 053a850b..0165a7a3 100644
--- a/klm/util/exception.hh
+++ b/klm/util/exception.hh
@@ -87,8 +87,14 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
throw UTIL_e; \
} while (0)
+#if __GNUC__ >= 3
+#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
+#else
+#define UTIL_UNLIKELY(x) (x)
+#endif
+
#define UTIL_THROW_IF(Condition, Exception, Modify) do { \
- if (Condition) { \
+ if (UTIL_UNLIKELY(Condition)) { \
Exception UTIL_e; \
UTIL_SET_LOCATION(UTIL_e, #Exception, #Condition); \
UTIL_e << Modify; \
diff --git a/klm/util/file.cc b/klm/util/file.cc
index 6bf879ac..b9a77cf9 100644
--- a/klm/util/file.cc
+++ b/klm/util/file.cc
@@ -15,6 +15,8 @@
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#include <io.h>
+#include <algorithm>
+#include <limits.h>
#else
#include <unistd.h>
#endif
@@ -48,7 +50,7 @@ int OpenReadOrThrow(const char *name) {
int CreateOrThrow(const char *name) {
int ret;
#if defined(_WIN32) || defined(_WIN64)
- UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR | _O_BINARY, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
@@ -74,16 +76,22 @@ void ResizeOrThrow(int fd, uint64_t to) {
#endif
}
-#ifdef WIN32
-typedef int ssize_t;
+std::size_t PartialRead(int fd, void *to, std::size_t amount) {
+#if defined(_WIN32) || defined(_WIN64)
+ amount = min(static_cast<std::size_t>(INT_MAX), amount);
+ int ret = _read(fd, to, amount);
+#else
+ ssize_t ret = read(fd, to, amount);
#endif
+ UTIL_THROW_IF(ret < 0, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
+ return static_cast<std::size_t>(ret);
+}
void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) {
- ssize_t ret = read(fd, to, amount);
- UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
- UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
+ std::size_t ret = PartialRead(fd, to, amount);
+ UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");
amount -= ret;
to += ret;
}
@@ -93,8 +101,7 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
std::size_t remaining = amount;
while (remaining) {
- ssize_t ret = read(fd, to, remaining);
- UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed.");
+ std::size_t ret = PartialRead(fd, to, remaining);
if (!ret) return amount - remaining;
remaining -= ret;
to += ret;
@@ -105,7 +112,11 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
+#if defined(_WIN32) || defined(_WIN64)
+ int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size));
+#else
ssize_t ret = write(fd, data, size);
+#endif
if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
data += ret;
size -= ret;
@@ -114,7 +125,7 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
assert(size);
- if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size);
+ UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), util::ErrnoException, "Short write; requested size " << size);
}
void FSyncOrThrow(int fd) {
@@ -149,14 +160,15 @@ void SeekEnd(int fd) {
std::FILE *FDOpenOrThrow(scoped_fd &file) {
std::FILE *ret = fdopen(file.get(), "r+b");
- if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen");
+ if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());
file.release();
return ret;
}
-std::FILE *FOpenOrThrow(const char *path, const char *mode) {
- std::FILE *ret;
- UTIL_THROW_IF(!(ret = fopen(path, mode)), util::ErrnoException, "Could not fopen " << path << " for " << mode);
+std::FILE *FDOpenReadOrThrow(scoped_fd &file) {
+ std::FILE *ret = fdopen(file.get(), "rb");
+ if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());
+ file.release();
return ret;
}
diff --git a/klm/util/file.hh b/klm/util/file.hh
index 185cb1f3..c24580d6 100644
--- a/klm/util/file.hh
+++ b/klm/util/file.hh
@@ -32,8 +32,6 @@ class scoped_fd {
return ret;
}
- operator bool() { return fd_ != -1; }
-
private:
int fd_;
@@ -76,8 +74,9 @@ uint64_t SizeFile(int fd);
void ResizeOrThrow(int fd, uint64_t to);
+std::size_t PartialRead(int fd, void *to, std::size_t size);
void ReadOrThrow(int fd, void *to, std::size_t size);
-std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);
+std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size);
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
void WriteOrThrow(FILE *to, const void *data, std::size_t size);
@@ -90,8 +89,7 @@ void AdvanceOrThrow(int fd, int64_t off);
void SeekEnd(int fd);
std::FILE *FDOpenOrThrow(scoped_fd &file);
-
-std::FILE *FOpenOrThrow(const char *path, const char *mode);
+std::FILE *FDOpenReadOrThrow(scoped_fd &file);
class TempMaker {
public:
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index 280f438c..5a208eff 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -14,7 +14,6 @@
#include <limits>
#include <assert.h>
-#include <ctype.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/types.h>
@@ -26,13 +25,6 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
-#ifdef HAVE_ZLIB
-GZException::GZException(gzFile file) {
- int num;
- *this << gzerror(file, &num) << " from zlib";
-}
-#endif // HAVE_ZLIB
-
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
@@ -48,19 +40,7 @@ FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std:
Initialize(name, show_progress, min_buffer);
}
-FilePiece::~FilePiece() {
-#ifdef HAVE_ZLIB
- if (gz_file_) {
- // zlib took ownership
- file_.release();
- int ret;
- if (Z_OK != (ret = gzclose(gz_file_))) {
- std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl;
- abort();
- }
- }
-#endif
-}
+FilePiece::~FilePiece() {}
StringPiece FilePiece::ReadLine(char delim) {
std::size_t skip = 0;
@@ -95,9 +75,6 @@ unsigned long int FilePiece::ReadULong() {
}
void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
-#ifdef HAVE_ZLIB
- gz_file_ = NULL;
-#endif
file_name_ = name;
default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
@@ -117,10 +94,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s
}
Shift();
// gzip detect.
- if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) {
-#ifndef HAVE_ZLIB
- UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in.");
-#endif
+ if ((position_end_ - position_) >= ReadCompressed::kMagicSize && ReadCompressed::DetectCompressedMagic(position_)) {
if (!fallback_to_read_) {
at_end_ = false;
TransitionToRead();
@@ -197,7 +171,7 @@ void FilePiece::Shift() {
if (fallback_to_read_) ReadShift();
for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
- if (isspace(*last_space_)) break;
+ if (kSpaces[static_cast<unsigned char>(*last_space_)]) break;
}
}
@@ -248,17 +222,14 @@ void FilePiece::TransitionToRead() {
position_ = data_.begin();
position_end_ = position_;
-#ifdef HAVE_ZLIB
- assert(!gz_file_);
- gz_file_ = gzdopen(file_.get(), "r");
- UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_);
-#endif
+ try {
+ fell_back_.Reset(file_.release());
+ } catch (util::Exception &e) {
+ e << " in file " << file_name_;
+ throw;
+ }
}
-#ifdef WIN32
-typedef int ssize_t;
-#endif
-
void FilePiece::ReadShift() {
assert(fallback_to_read_);
// Bytes [data_.begin(), position_) have been consumed.
@@ -283,7 +254,7 @@ void FilePiece::ReadShift() {
position_ = data_.begin();
position_end_ = position_ + valid_length;
} else {
- size_t moving = position_end_ - position_;
+ std::size_t moving = position_end_ - position_;
memmove(data_.get(), position_, moving);
position_ = data_.begin();
position_end_ = position_ + moving;
@@ -291,20 +262,9 @@ void FilePiece::ReadShift() {
}
}
- ssize_t read_return;
-#ifdef HAVE_ZLIB
- read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
- if (read_return == -1) throw GZException(gz_file_);
- if (total_size_ != kBadSize) {
- // Just get the position, don't actually seek. Apparently this is how you do it. . .
- off_t ret = lseek(file_.get(), 0, SEEK_CUR);
- if (ret != -1) progress_.Set(ret);
- }
-#else
- read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
- UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed");
- progress_.Set(mapped_offset_);
-#endif
+ std::size_t read_return = fell_back_.Read(static_cast<uint8_t*>(data_.get()) + already_read, default_map_size_ - already_read);
+ progress_.Set(fell_back_.RawAmount());
+
if (read_return == 0) {
at_end_ = true;
}
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index af93d8aa..39bd1581 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -4,8 +4,8 @@
#include "util/ersatz_progress.hh"
#include "util/exception.hh"
#include "util/file.hh"
-#include "util/have.hh"
#include "util/mmap.hh"
+#include "util/read_compressed.hh"
#include "util/string_piece.hh"
#include <cstddef>
@@ -13,10 +13,6 @@
#include <stdint.h>
-#ifdef HAVE_ZLIB
-#include <zlib.h>
-#endif
-
namespace util {
class ParseNumberException : public Exception {
@@ -25,28 +21,19 @@ class ParseNumberException : public Exception {
~ParseNumberException() throw() {}
};
-class GZException : public Exception {
- public:
-#ifdef HAVE_ZLIB
- explicit GZException(gzFile file);
-#endif
- GZException() throw() {}
- ~GZException() throw() {}
-};
-
extern const bool kSpaces[256];
-// Memory backing the returned StringPiece may vanish on the next call.
+// Memory backing the returned StringPiece may vanish on the next call.
class FilePiece {
public:
- // 32 MB default.
- explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
- // Takes ownership of fd. name is used for messages.
- explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
+ // 1 MB default.
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
+ // Takes ownership of fd. name is used for messages.
+ explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
~FilePiece();
-
- char get() {
+
+ char get() {
if (position_ == position_end_) {
Shift();
if (at_end_) throw EndOfFileException();
@@ -54,14 +41,14 @@ class FilePiece {
return *(position_++);
}
- // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
+ // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
StringPiece ReadDelimited(const bool *delim = kSpaces) {
SkipSpaces(delim);
return Consume(FindDelimiterOrEOF(delim));
}
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
- // It is similar to getline in that way.
+ // It is similar to getline in that way.
StringPiece ReadLine(char delim = '\n');
float ReadFloat();
@@ -69,7 +56,7 @@ class FilePiece {
long int ReadLong();
unsigned long int ReadULong();
- // Skip spaces defined by isspace.
+ // Skip spaces defined by isspace.
void SkipSpaces(const bool *delim = kSpaces) {
for (; ; ++position_) {
if (position_ == position_end_) Shift();
@@ -82,7 +69,7 @@ class FilePiece {
}
const std::string &FileName() const { return file_name_; }
-
+
private:
void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
@@ -122,9 +109,7 @@ class FilePiece {
std::string file_name_;
-#ifdef HAVE_ZLIB
- gzFile gz_file_;
-#endif // HAVE_ZLIB
+ ReadCompressed fell_back_;
};
} // namespace util
diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc
index f912e18a..e79ece7a 100644
--- a/klm/util/file_piece_test.cc
+++ b/klm/util/file_piece_test.cc
@@ -38,7 +38,7 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) {
BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
-#ifndef __APPLE__
+#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
/* Apple isn't happy with the popen, fileno, dup. And I don't want to
* reimplement popen. This is an issue with the test.
*/
@@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) {
BOOST_CHECK_THROW(test.get(), EndOfFileException);
BOOST_REQUIRE(!pclose(catter));
}
-#endif // __APPLE__
+#endif
#ifdef HAVE_ZLIB
diff --git a/klm/util/have.hh b/klm/util/have.hh
index b8181e99..1523c0c5 100644
--- a/klm/util/have.hh
+++ b/klm/util/have.hh
@@ -2,22 +2,12 @@
#ifndef UTIL_HAVE__
#define UTIL_HAVE__
-#ifndef HAVE_ZLIB
-#if !defined(_WIN32) && !defined(_WIN64)
-#define HAVE_ZLIB
-#endif
-#endif
-
#ifndef HAVE_ICU
//#define HAVE_ICU
#endif
#ifndef HAVE_BOOST
-#define HAVE_BOOST
-#endif
-
-#ifndef HAVE_THREADS
-//#define HAVE_THREADS
+//#define HAVE_BOOST
#endif
#endif // UTIL_HAVE__
diff --git a/klm/util/joint_sort.hh b/klm/util/joint_sort.hh
index cf3d8432..1b43ddcf 100644
--- a/klm/util/joint_sort.hh
+++ b/klm/util/joint_sort.hh
@@ -60,7 +60,7 @@ template <class KeyIter, class ValueIter> class JointProxy {
JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}
- operator const value_type() const {
+ operator value_type() const {
value_type ret;
ret.key = *inner_.key_;
ret.value = *inner_.value_;
@@ -121,7 +121,7 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi
template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
public:
- PairedIterator(const KeyIter &key, const ValueIter &value) :
+ PairedIterator(const KeyIter &key, const ValueIter &value) :
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
};
diff --git a/klm/util/read_compressed.cc b/klm/util/read_compressed.cc
new file mode 100644
index 00000000..4ec94c4e
--- /dev/null
+++ b/klm/util/read_compressed.cc
@@ -0,0 +1,403 @@
+#include "util/read_compressed.hh"
+
+#include "util/file.hh"
+#include "util/have.hh"
+#include "util/scoped.hh"
+
+#include <algorithm>
+#include <iostream>
+
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef HAVE_BZLIB
+#include <bzlib.h>
+#endif
+
+#ifdef HAVE_XZLIB
+#include <lzma.h>
+#endif
+
+namespace util {
+
+CompressedException::CompressedException() throw() {}
+CompressedException::~CompressedException() throw() {}
+
+GZException::GZException() throw() {}
+GZException::~GZException() throw() {}
+
+BZException::BZException() throw() {}
+BZException::~BZException() throw() {}
+
+XZException::XZException() throw() {}
+XZException::~XZException() throw() {}
+
+class ReadBase {
+ public:
+ virtual ~ReadBase() {}
+
+ virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;
+
+ protected:
+ static void ReplaceThis(ReadBase *with, ReadCompressed &thunk) {
+ thunk.internal_.reset(with);
+ }
+
+ static uint64_t &ReadCount(ReadCompressed &thunk) {
+ return thunk.raw_amount_;
+ }
+};
+
+namespace {
+
+// Completed file that other classes can thunk to.
+class Complete : public ReadBase {
+ public:
+ std::size_t Read(void *, std::size_t, ReadCompressed &) {
+ return 0;
+ }
+};
+
+class Uncompressed : public ReadBase {
+ public:
+ explicit Uncompressed(int fd) : fd_(fd) {}
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ std::size_t got = PartialRead(fd_.get(), to, amount);
+ ReadCount(thunk) += got;
+ return got;
+ }
+
+ private:
+ scoped_fd fd_;
+};
+
+class UncompressedWithHeader : public ReadBase {
+ public:
+ UncompressedWithHeader(int fd, void *already_data, std::size_t already_size) : fd_(fd) {
+ assert(already_size);
+ buf_.reset(malloc(already_size));
+ if (!buf_.get()) throw std::bad_alloc();
+ memcpy(buf_.get(), already_data, already_size);
+ remain_ = static_cast<uint8_t*>(buf_.get());
+ end_ = remain_ + already_size;
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ assert(buf_.get());
+ std::size_t sending = std::min<std::size_t>(amount, end_ - remain_);
+ memcpy(to, remain_, sending);
+ remain_ += sending;
+ if (remain_ == end_) {
+ ReplaceThis(new Uncompressed(fd_.release()), thunk);
+ }
+ return sending;
+ }
+
+ private:
+ scoped_malloc buf_;
+ uint8_t *remain_;
+ uint8_t *end_;
+
+ scoped_fd fd_;
+};
+
+#ifdef HAVE_ZLIB
+class GZip : public ReadBase {
+ private:
+ static const std::size_t kInputBuffer = 16384;
+ public:
+ GZip(int fd, void *already_data, std::size_t already_size)
+ : file_(fd), in_buffer_(malloc(kInputBuffer)) {
+ if (!in_buffer_.get()) throw std::bad_alloc();
+ assert(already_size < kInputBuffer);
+ if (already_size) {
+ memcpy(in_buffer_.get(), already_data, already_size);
+ stream_.next_in = static_cast<Bytef *>(in_buffer_.get());
+ stream_.avail_in = already_size;
+ stream_.avail_in += ReadOrEOF(file_.get(), static_cast<uint8_t*>(in_buffer_.get()) + already_size, kInputBuffer - already_size);
+ } else {
+ stream_.avail_in = 0;
+ }
+ stream_.zalloc = Z_NULL;
+ stream_.zfree = Z_NULL;
+ stream_.opaque = Z_NULL;
+ stream_.msg = NULL;
+ // 32 for zlib and gzip decoding with automatic header detection.
+ // 15 for maximum window size.
+ UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib.");
+ }
+
+ ~GZip() {
+ if (Z_OK != inflateEnd(&stream_)) {
+ std::cerr << "zlib could not close properly." << std::endl;
+ abort();
+ }
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ if (amount == 0) return 0;
+ stream_.next_out = static_cast<Bytef*>(to);
+ stream_.avail_out = std::min<std::size_t>(std::numeric_limits<uInt>::max(), amount);
+ do {
+ if (!stream_.avail_in) ReadInput(thunk);
+ int result = inflate(&stream_, 0);
+ switch (result) {
+ case Z_OK:
+ break;
+ case Z_STREAM_END:
+ {
+ std::size_t ret = static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to);
+ ReplaceThis(new Complete(), thunk);
+ return ret;
+ }
+ case Z_ERRNO:
+ UTIL_THROW(ErrnoException, "zlib error");
+ default:
+ UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? stream_.msg : "an error ") << " code " << result);
+ }
+ } while (stream_.next_out == to);
+ return static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to);
+ }
+
+ private:
+ void ReadInput(ReadCompressed &thunk) {
+ assert(!stream_.avail_in);
+ stream_.next_in = static_cast<Bytef *>(in_buffer_.get());
+ stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer);
+ ReadCount(thunk) += stream_.avail_in;
+ }
+
+ scoped_fd file_;
+ scoped_malloc in_buffer_;
+ z_stream stream_;
+};
+#endif // HAVE_ZLIB
+
+#ifdef HAVE_BZLIB
+class BZip : public ReadBase {
+ public:
+ explicit BZip(int fd, void *already_data, std::size_t already_size) {
+ scoped_fd hold(fd);
+ closer_.reset(FDOpenReadOrThrow(hold));
+ int bzerror = BZ_OK;
+ file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size);
+ switch (bzerror) {
+ case BZ_OK:
+ return;
+ case BZ_CONFIG_ERROR:
+ UTIL_THROW(BZException, "Looks like bzip2 was miscompiled.");
+ case BZ_PARAM_ERROR:
+ UTIL_THROW(BZException, "Parameter error");
+ case BZ_IO_ERROR:
+ UTIL_THROW(BZException, "IO error reading file");
+ case BZ_MEM_ERROR:
+ throw std::bad_alloc();
+ }
+ }
+
+ ~BZip() {
+ int bzerror = BZ_OK;
+ BZ2_bzReadClose(&bzerror, file_);
+ if (bzerror != BZ_OK) {
+ std::cerr << "bz2 readclose error" << std::endl;
+ abort();
+ }
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ int bzerror = BZ_OK;
+ int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
+ long pos;
+ switch (bzerror) {
+ case BZ_STREAM_END:
+ pos = ftell(closer_.get());
+ if (pos != -1) ReadCount(thunk) = pos;
+ ReplaceThis(new Complete(), thunk);
+ return ret;
+ case BZ_OK:
+ pos = ftell(closer_.get());
+ if (pos != -1) ReadCount(thunk) = pos;
+ return ret;
+ default:
+ UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+ }
+ }
+
+ private:
+ scoped_FILE closer_;
+ BZFILE *file_;
+};
+#endif // HAVE_BZLIB
+
+#ifdef HAVE_XZLIB
+class XZip : public ReadBase {
+ private:
+ static const std::size_t kInputBuffer = 16384;
+ public:
+ XZip(int fd, void *already_data, std::size_t already_size)
+ : file_(fd), in_buffer_(malloc(kInputBuffer)), stream_(), action_(LZMA_RUN) {
+ if (!in_buffer_.get()) throw std::bad_alloc();
+ assert(already_size < kInputBuffer);
+ if (already_size) {
+ memcpy(in_buffer_.get(), already_data, already_size);
+ stream_.next_in = static_cast<const uint8_t*>(in_buffer_.get());
+ stream_.avail_in = already_size;
+ stream_.avail_in += ReadOrEOF(file_.get(), static_cast<uint8_t*>(in_buffer_.get()) + already_size, kInputBuffer - already_size);
+ } else {
+ stream_.avail_in = 0;
+ }
+ stream_.allocator = NULL;
+ lzma_ret ret = lzma_stream_decoder(&stream_, UINT64_MAX, LZMA_CONCATENATED);
+ switch (ret) {
+ case LZMA_OK:
+ break;
+ case LZMA_MEM_ERROR:
+ UTIL_THROW(ErrnoException, "xz open error");
+ default:
+ UTIL_THROW(XZException, "xz error code " << ret);
+ }
+ }
+
+ ~XZip() {
+ lzma_end(&stream_);
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ if (amount == 0) return 0;
+ stream_.next_out = static_cast<uint8_t*>(to);
+ stream_.avail_out = amount;
+ do {
+ if (!stream_.avail_in) ReadInput(thunk);
+ lzma_ret status = lzma_code(&stream_, action_);
+ switch (status) {
+ case LZMA_OK:
+ break;
+ case LZMA_STREAM_END:
+ UTIL_THROW_IF(action_ != LZMA_FINISH, XZException, "Input not finished yet.");
+ {
+ std::size_t ret = static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to);
+ ReplaceThis(new Complete(), thunk);
+ return ret;
+ }
+ case LZMA_MEM_ERROR:
+ throw std::bad_alloc();
+ case LZMA_FORMAT_ERROR:
+ UTIL_THROW(XZException, "xzlib says file format not recognized");
+ case LZMA_OPTIONS_ERROR:
+ UTIL_THROW(XZException, "xzlib says unsupported compression options");
+ case LZMA_DATA_ERROR:
+ UTIL_THROW(XZException, "xzlib says this file is corrupt");
+ case LZMA_BUF_ERROR:
+ UTIL_THROW(XZException, "xzlib says unexpected end of input");
+ default:
+ UTIL_THROW(XZException, "unrecognized xzlib error " << status);
+ }
+ } while (stream_.next_out == to);
+ return static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to);
+ }
+
+ private:
+ void ReadInput(ReadCompressed &thunk) {
+ assert(!stream_.avail_in);
+ stream_.next_in = static_cast<const uint8_t*>(in_buffer_.get());
+ stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer);
+ if (!stream_.avail_in) action_ = LZMA_FINISH;
+ ReadCount(thunk) += stream_.avail_in;
+ }
+
+ scoped_fd file_;
+ scoped_malloc in_buffer_;
+ lzma_stream stream_;
+
+ lzma_action action_;
+};
+#endif // HAVE_XZLIB
+
+enum MagicResult {
+ UNKNOWN, GZIP, BZIP, XZIP
+};
+
+MagicResult DetectMagic(const void *from_void) {
+ const uint8_t *header = static_cast<const uint8_t*>(from_void);
+ if (header[0] == 0x1f && header[1] == 0x8b) {
+ return GZIP;
+ }
+ if (header[0] == 'B' && header[1] == 'Z') {
+ return BZIP;
+ }
+ const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
+ if (!memcmp(header, xzmagic, 6)) {
+ return XZIP;
+ }
+ return UNKNOWN;
+}
+
+ReadBase *ReadFactory(int fd, uint64_t &raw_amount) {
+ scoped_fd hold(fd);
+ unsigned char header[ReadCompressed::kMagicSize];
+ raw_amount = ReadOrEOF(fd, header, ReadCompressed::kMagicSize);
+ if (!raw_amount)
+ return new Uncompressed(hold.release());
+ if (raw_amount != ReadCompressed::kMagicSize)
+ return new UncompressedWithHeader(hold.release(), header, raw_amount);
+ switch (DetectMagic(header)) {
+ case GZIP:
+#ifdef HAVE_ZLIB
+ return new GZip(hold.release(), header, ReadCompressed::kMagicSize);
+#else
+ UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in.");
+#endif
+ case BZIP:
+#ifdef HAVE_BZLIB
+ return new BZip(hold.release(), header, ReadCompressed::kMagicSize);
+#else
+ UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZ), but bzip support was not compiled in.");
+#endif
+ case XZIP:
+#ifdef HAVE_XZLIB
+ return new XZip(hold.release(), header, ReadCompressed::kMagicSize);
+#else
+ UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in.");
+#endif
+ case UNKNOWN:
+ break;
+ }
+ try {
+ AdvanceOrThrow(fd, -ReadCompressed::kMagicSize);
+ } catch (const util::ErrnoException &e) {
+ return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize);
+ }
+ return new Uncompressed(hold.release());
+}
+
+} // namespace
+
+bool ReadCompressed::DetectCompressedMagic(const void *from_void) {
+ return DetectMagic(from_void) != UNKNOWN;
+}
+
+ReadCompressed::ReadCompressed(int fd) {
+ Reset(fd);
+}
+
+ReadCompressed::ReadCompressed() {}
+
+ReadCompressed::~ReadCompressed() {}
+
+void ReadCompressed::Reset(int fd) {
+ internal_.reset();
+ internal_.reset(ReadFactory(fd, raw_amount_));
+}
+
+std::size_t ReadCompressed::Read(void *to, std::size_t amount) {
+ return internal_->Read(to, amount, *this);
+}
+
+} // namespace util
diff --git a/klm/util/read_compressed.hh b/klm/util/read_compressed.hh
new file mode 100644
index 00000000..83ca9fb2
--- /dev/null
+++ b/klm/util/read_compressed.hh
@@ -0,0 +1,74 @@
+#ifndef UTIL_READ_COMPRESSED__
+#define UTIL_READ_COMPRESSED__
+
+#include "util/exception.hh"
+#include "util/scoped.hh"
+
+#include <cstddef>
+
+#include <stdint.h>
+
+namespace util {
+
+class CompressedException : public Exception {
+ public:
+ CompressedException() throw();
+ virtual ~CompressedException() throw();
+};
+
+class GZException : public CompressedException {
+ public:
+ GZException() throw();
+ ~GZException() throw();
+};
+
+class BZException : public CompressedException {
+ public:
+ BZException() throw();
+ ~BZException() throw();
+};
+
+class XZException : public CompressedException {
+ public:
+ XZException() throw();
+ ~XZException() throw();
+};
+
+class ReadBase;
+
+class ReadCompressed {
+ public:
+ static const std::size_t kMagicSize = 6;
+ // Must have at least kMagicSize bytes.
+ static bool DetectCompressedMagic(const void *from);
+
+ // Takes ownership of fd.
+ explicit ReadCompressed(int fd);
+
+ // Must call Reset later.
+ ReadCompressed();
+
+ ~ReadCompressed();
+
+ // Takes ownership of fd.
+ void Reset(int fd);
+
+ std::size_t Read(void *to, std::size_t amount);
+
+ uint64_t RawAmount() const { return raw_amount_; }
+
+ private:
+ friend class ReadBase;
+
+ scoped_ptr<ReadBase> internal_;
+
+ uint64_t raw_amount_;
+
+ // No copying.
+ ReadCompressed(const ReadCompressed &);
+ void operator=(const ReadCompressed &);
+};
+
+} // namespace util
+
+#endif // UTIL_READ_COMPRESSED__
diff --git a/klm/util/read_compressed_test.cc b/klm/util/read_compressed_test.cc
new file mode 100644
index 00000000..6fd97e5e
--- /dev/null
+++ b/klm/util/read_compressed_test.cc
@@ -0,0 +1,94 @@
+#include "util/read_compressed.hh"
+
+#include "util/file.hh"
+#include "util/have.hh"
+
+#define BOOST_TEST_MODULE ReadCompressedTest
+#include <boost/test/unit_test.hpp>
+#include <boost/scoped_ptr.hpp>
+
+#include <fstream>
+#include <string>
+
+#include <stdlib.h>
+
+namespace util {
+namespace {
+
+void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) {
+ uint8_t *to = static_cast<uint8_t*>(to_void);
+ while (amount) {
+ std::size_t ret = reader.Read(to, amount);
+ BOOST_REQUIRE(ret);
+ to += ret;
+ amount -= ret;
+ }
+}
+
+void TestRandom(const char *compressor) {
+ const uint32_t kSize4 = 100000 / 4;
+ char name[] = "tempXXXXXX";
+
+ // Write test file.
+ {
+ scoped_fd original(mkstemp(name));
+ BOOST_REQUIRE(original.get() > 0);
+ for (uint32_t i = 0; i < kSize4; ++i) {
+ WriteOrThrow(original.get(), &i, sizeof(uint32_t));
+ }
+ }
+
+ char gzname[] = "tempXXXXXX";
+ scoped_fd gzipped(mkstemp(gzname));
+
+ std::string command(compressor);
+#ifdef __CYGWIN__
+ command += ".exe";
+#endif
+ command += " <\"";
+ command += name;
+ command += "\" >\"";
+ command += gzname;
+ command += "\"";
+ BOOST_REQUIRE_EQUAL(0, system(command.c_str()));
+
+ BOOST_CHECK_EQUAL(0, unlink(name));
+ BOOST_CHECK_EQUAL(0, unlink(gzname));
+
+ ReadCompressed reader(gzipped.release());
+ for (uint32_t i = 0; i < kSize4; ++i) {
+ uint32_t got;
+ ReadLoop(reader, &got, sizeof(uint32_t));
+ BOOST_CHECK_EQUAL(i, got);
+ }
+
+ char ignored;
+ BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
+ // Test double EOF call.
+ BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
+}
+
+BOOST_AUTO_TEST_CASE(Uncompressed) {
+ TestRandom("cat");
+}
+
+#ifdef HAVE_ZLIB
+BOOST_AUTO_TEST_CASE(ReadGZ) {
+ TestRandom("gzip");
+}
+#endif // HAVE_ZLIB
+
+#ifdef HAVE_BZLIB
+BOOST_AUTO_TEST_CASE(ReadBZ) {
+ TestRandom("bzip2");
+}
+#endif // HAVE_BZLIB
+
+#ifdef HAVE_XZLIB
+BOOST_AUTO_TEST_CASE(ReadXZ) {
+ TestRandom("xz");
+}
+#endif
+
+} // namespace
+} // namespace util
diff --git a/klm/util/scoped.hh b/klm/util/scoped.hh
index 93e2e817..d62c6df1 100644
--- a/klm/util/scoped.hh
+++ b/klm/util/scoped.hh
@@ -1,40 +1,13 @@
#ifndef UTIL_SCOPED__
#define UTIL_SCOPED__
+/* Other scoped objects in the style of scoped_ptr. */
#include "util/exception.hh"
-
-/* Other scoped objects in the style of scoped_ptr. */
#include <cstddef>
#include <cstdlib>
namespace util {
-template <class T, class R, R (*Free)(T*)> class scoped_thing {
- public:
- explicit scoped_thing(T *c = static_cast<T*>(0)) : c_(c) {}
-
- ~scoped_thing() { if (c_) Free(c_); }
-
- void reset(T *c) {
- if (c_) Free(c_);
- c_ = c;
- }
-
- T &operator*() { return *c_; }
- const T&operator*() const { return *c_; }
- T &operator->() { return *c_; }
- const T&operator->() const { return *c_; }
-
- T *get() { return c_; }
- const T *get() const { return c_; }
-
- private:
- T *c_;
-
- scoped_thing(const scoped_thing &);
- scoped_thing &operator=(const scoped_thing &);
-};
-
class scoped_malloc {
public:
scoped_malloc() : p_(NULL) {}
@@ -77,9 +50,6 @@ template <class T> class scoped_array {
T &operator*() { return *c_; }
const T&operator*() const { return *c_; }
- T &operator->() { return *c_; }
- const T&operator->() const { return *c_; }
-
T &operator[](std::size_t idx) { return c_[idx]; }
const T &operator[](std::size_t idx) const { return c_[idx]; }
@@ -90,6 +60,39 @@ template <class T> class scoped_array {
private:
T *c_;
+
+ scoped_array(const scoped_array &);
+ void operator=(const scoped_array &);
+};
+
+template <class T> class scoped_ptr {
+ public:
+ explicit scoped_ptr(T *content = NULL) : c_(content) {}
+
+ ~scoped_ptr() { delete c_; }
+
+ T *get() { return c_; }
+ const T* get() const { return c_; }
+
+ T &operator*() { return *c_; }
+ const T&operator*() const { return *c_; }
+
+ T *operator->() { return c_; }
+ const T*operator->() const { return c_; }
+
+ T &operator[](std::size_t idx) { return c_[idx]; }
+ const T &operator[](std::size_t idx) const { return c_[idx]; }
+
+ void reset(T *to = NULL) {
+ scoped_ptr<T> other(c_);
+ c_ = to;
+ }
+
+ private:
+ T *c_;
+
+ scoped_ptr(const scoped_ptr &);
+ void operator=(const scoped_ptr &);
};
} // namespace util
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index be6a643d..51481646 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -1,6 +1,6 @@
/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
* you don't use ICU, then this will use the Google implementation from Chrome.
- * This has been modified from the original version to let you choose.
+ * This has been modified from the original version to let you choose.
*/
// Copyright 2008, Google Inc.
@@ -62,9 +62,9 @@
#include <unicode/stringpiece.h>
#include <unicode/uversion.h>
-// Old versions of ICU don't define operator== and operator!=.
+// Old versions of ICU don't define operator== and operator!=.
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
-#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
+#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
if (x.size() != y.size())
return false;
@@ -274,15 +274,28 @@ struct StringPieceCompatibleEquals : public std::binary_function<const StringPie
}
};
template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) {
+#if BOOST_VERSION < 104200
+ std::string temp(key.data(), key.size());
+ return t.find(temp);
+#else
return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
+#endif
}
+
template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) {
+#if BOOST_VERSION < 104200
+ std::string temp(key.data(), key.size());
+ return t.find(temp);
+#else
return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
+#endif
}
#endif
#ifdef HAVE_ICU
U_NAMESPACE_END
+using U_NAMESPACE_QUALIFIER StringPiece;
#endif
+
#endif // BASE_STRING_PIECE_H__
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh
index 4a7f5460..a588c3fc 100644
--- a/klm/util/tokenize_piece.hh
+++ b/klm/util/tokenize_piece.hh
@@ -20,6 +20,7 @@ class OutOfTokens : public Exception {
class SingleCharacter {
public:
+ SingleCharacter() {}
explicit SingleCharacter(char delim) : delim_(delim) {}
StringPiece Find(const StringPiece &in) const {
@@ -32,6 +33,8 @@ class SingleCharacter {
class MultiCharacter {
public:
+ MultiCharacter() {}
+
explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}
StringPiece Find(const StringPiece &in) const {
@@ -44,6 +47,7 @@ class MultiCharacter {
class AnyCharacter {
public:
+ AnyCharacter() {}
explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}
StringPiece Find(const StringPiece &in) const {
@@ -56,6 +60,8 @@ class AnyCharacter {
class AnyCharacterLast {
public:
+ AnyCharacterLast() {}
+
explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {}
StringPiece Find(const StringPiece &in) const {
@@ -81,8 +87,8 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it
return current_.data() != 0;
}
- static TokenIter<Find> end() {
- return TokenIter<Find>();
+ static TokenIter<Find, SkipEmpty> end() {
+ return TokenIter<Find, SkipEmpty>();
}
private:
@@ -100,8 +106,8 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it
} while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.
}
- bool equal(const TokenIter<Find> &other) const {
- return after_.data() == other.after_.data();
+ bool equal(const TokenIter<Find, SkipEmpty> &other) const {
+ return current_.data() == other.current_.data();
}
const StringPiece &dereference() const {