diff options
Diffstat (limited to 'klm')
| -rw-r--r-- | klm/lm/binary_format.cc | 21 | ||||
| -rw-r--r-- | klm/lm/config.cc | 1 | ||||
| -rw-r--r-- | klm/lm/config.hh | 59 | ||||
| -rw-r--r-- | klm/lm/max_order.hh | 2 | ||||
| -rw-r--r-- | klm/lm/model.cc | 30 | ||||
| -rw-r--r-- | klm/lm/search_trie.cc | 47 | ||||
| -rw-r--r-- | klm/util/Makefile.am | 1 | ||||
| -rw-r--r-- | klm/util/exception.hh | 8 | ||||
| -rw-r--r-- | klm/util/file.cc | 38 | ||||
| -rw-r--r-- | klm/util/file.hh | 8 | ||||
| -rw-r--r-- | klm/util/file_piece.cc | 66 | ||||
| -rw-r--r-- | klm/util/file_piece.hh | 41 | ||||
| -rw-r--r-- | klm/util/file_piece_test.cc | 4 | ||||
| -rw-r--r-- | klm/util/have.hh | 12 | ||||
| -rw-r--r-- | klm/util/joint_sort.hh | 4 | ||||
| -rw-r--r-- | klm/util/read_compressed.cc | 403 | ||||
| -rw-r--r-- | klm/util/read_compressed.hh | 74 | ||||
| -rw-r--r-- | klm/util/read_compressed_test.cc | 94 | ||||
| -rw-r--r-- | klm/util/scoped.hh | 65 | ||||
| -rw-r--r-- | klm/util/string_piece.hh | 19 | ||||
| -rw-r--r-- | klm/util/tokenize_piece.hh | 14 | 
21 files changed, 780 insertions, 231 deletions
| diff --git a/klm/lm/binary_format.cc b/klm/lm/binary_format.cc index efa67056..39c4a9b6 100644 --- a/klm/lm/binary_format.cc +++ b/klm/lm/binary_format.cc @@ -16,11 +16,11 @@ namespace ngram {  namespace {  const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";  const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0"; -// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).  +// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).  const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";  const long int kMagicVersion = 5; -// Old binary files built on 32-bit machines have this header.   +// Old binary files built on 32-bit machines have this header.  // TODO: eliminate with next binary release.  struct OldSanity {    char magic[sizeof(kMagicBytes)]; @@ -39,7 +39,7 @@ struct OldSanity {  }; -// Test values aligned to 8 bytes.     +// Test values aligned to 8 bytes.  struct Sanity {    char magic[ALIGN8(sizeof(kMagicBytes))];    float zero_f, one_f, minus_half_f; @@ -101,7 +101,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_  uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {    std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;    if (config.write_mmap) { -    // Grow the file to accomodate the search, using zeros.   +    // Grow the file to accomodate the search, using zeros.      try {        util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);      } catch (util::ErrnoException &e) { @@ -114,7 +114,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t        return reinterpret_cast<uint8_t*>(backing.search.get());      }      // mmap it now. -    // We're skipping over the header and vocab for the search space mmap.  mmap likes page aligned offsets, so some arithmetic to round the offset down.   +    // We're skipping over the header and vocab for the search space mmap.  mmap likes page aligned offsets, so some arithmetic to round the offset down.      std::size_t page_size = util::SizePage();      std::size_t alignment_cruft = adjusted_vocab % page_size;      backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED); @@ -122,7 +122,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t    } else {      util::MapAnonymous(memory_size, backing.search);      return reinterpret_cast<uint8_t*>(backing.search.get()); -  }  +  }  }  void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) { @@ -140,7 +140,7 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_        util::FSyncOrThrow(backing.file.get());        break;    } -  // header and vocab share the same mmap.  The header is written here because we know the counts.   +  // header and vocab share the same mmap.  The header is written here because we know the counts.    Parameters params = Parameters();    params.counts = counts;    params.fixed.order = counts.size(); @@ -160,7 +160,7 @@ namespace detail {  bool IsBinaryFormat(int fd) {    const uint64_t size = util::SizeFile(fd);    if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false; -  // Try reading the header.   +  // Try reading the header.    util::scoped_memory memory;    try {      util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory); @@ -214,7 +214,7 @@ void SeekPastHeader(int fd, const Parameters ¶ms) {  uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing) {    const uint64_t file_size = util::SizeFile(backing.file.get()); -  // The header is smaller than a page, so we have to map the whole header as well.   +  // The header is smaller than a page, so we have to map the whole header as well.    std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);    if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)      UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); @@ -233,7 +233,8 @@ void ComplainAboutARPA(const Config &config, ModelType model_type) {    if (config.write_mmap || !config.messages) return;    if (config.arpa_complain == Config::ALL) {      *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; -  } else if (config.arpa_complain == Config::EXPENSIVE && model_type == TRIE_SORTED) { +  } else if (config.arpa_complain == Config::EXPENSIVE && +             (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {      *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive.  Save time by building a binary format." << std::endl;    }  } diff --git a/klm/lm/config.cc b/klm/lm/config.cc index f9d988ca..9520c41c 100644 --- a/klm/lm/config.cc +++ b/klm/lm/config.cc @@ -6,6 +6,7 @@ namespace lm {  namespace ngram {  Config::Config() : +  show_progress(true),    messages(&std::cerr),    enumerate_vocab(NULL),    unknown_missing(COMPLAIN), diff --git a/klm/lm/config.hh b/klm/lm/config.hh index 739cee9c..0de7b7c6 100644 --- a/klm/lm/config.hh +++ b/klm/lm/config.hh @@ -11,46 +11,52 @@  /* Configuration for ngram model.  Separate header to reduce pollution. */  namespace lm { -   +  class EnumerateVocab;  namespace ngram {  struct Config { -  // EFFECTIVE FOR BOTH ARPA AND BINARY READS  +  // EFFECTIVE FOR BOTH ARPA AND BINARY READS + +  // (default true) print progress bar to messages +  bool show_progress;    // Where to log messages including the progress bar.  Set to NULL for    // silence.    std::ostream *messages; +  std::ostream *ProgressMessages() const { +    return show_progress ? messages : 0; +  } +    // This will be called with every string in the vocabulary.  See    // enumerate_vocab.hh for more detail.  Config does not take ownership; you -  // are still responsible for deleting it (or stack allocating).   +  // are still responsible for deleting it (or stack allocating).    EnumerateVocab *enumerate_vocab; -    // ONLY EFFECTIVE WHEN READING ARPA -  // What to do when <unk> isn't in the provided model.  +  // What to do when <unk> isn't in the provided model.    WarningAction unknown_missing; -  // What to do when <s> or </s> is missing from the model.  -  // If THROW_UP, the exception will be of type util::SpecialWordMissingException.   +  // What to do when <s> or </s> is missing from the model. +  // If THROW_UP, the exception will be of type util::SpecialWordMissingException.    WarningAction sentence_marker_missing;    // What to do with a positive log probability.  For COMPLAIN and SILENT, map -  // to 0.   +  // to 0.    WarningAction positive_log_probability; -  // The probability to substitute for <unk> if it's missing from the model.   +  // The probability to substitute for <unk> if it's missing from the model.    // No effect if the model has <unk> or unknown_missing == THROW_UP.    float unknown_missing_logprob;    // Size multiplier for probing hash table.  Must be > 1.  Space is linear in    // this.  Time is probing_multiplier / (probing_multiplier - 1).  No effect -  // for sorted variant.   +  // for sorted variant.    // If you find yourself setting this to a low number, consider using the -  // TrieModel which has lower memory consumption.   +  // TrieModel which has lower memory consumption.    float probing_multiplier;    // Amount of memory to use for building.  The actual memory usage will be @@ -58,10 +64,10 @@ struct Config {    // models.    std::size_t building_memory; -  // Template for temporary directory appropriate for passing to mkdtemp.   +  // Template for temporary directory appropriate for passing to mkdtemp.    // The characters XXXXXX are appended before passing to mkdtemp.  Only    // applies to trie.  If NULL, defaults to write_mmap.  If that's NULL, -  // defaults to input file name.   +  // defaults to input file name.    const char *temporary_directory_prefix;    // Level of complaining to do when loading from ARPA instead of binary format. @@ -69,49 +75,46 @@ struct Config {    ARPALoadComplain arpa_complain;    // While loading an ARPA file, also write out this binary format file.  Set -  // to NULL to disable.   +  // to NULL to disable.    const char *write_mmap;    enum WriteMethod { -    WRITE_MMAP, // Map the file directly.   -    WRITE_AFTER // Write after we're done.   +    WRITE_MMAP, // Map the file directly. +    WRITE_AFTER // Write after we're done.    };    WriteMethod write_method; -  // Include the vocab in the binary file?  Only effective if write_mmap != NULL.   +  // Include the vocab in the binary file?  Only effective if write_mmap != NULL.    bool include_vocab; -  // Left rest options.  Only used when the model includes rest costs.   +  // Left rest options.  Only used when the model includes rest costs.    enum RestFunction {      REST_MAX,   // Maximum of any score to the left -    REST_LOWER, // Use lower-order files given below.   +    REST_LOWER, // Use lower-order files given below.    };    RestFunction rest_function; -  // Only used for REST_LOWER.   +  // Only used for REST_LOWER.    std::vector<std::string> rest_lower_files; -    // Quantization options.  Only effective for QuantTrieModel.  One value is    // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used -  // to quantize (and one of the remaining backoffs will be 0).   +  // to quantize (and one of the remaining backoffs will be 0).    uint8_t prob_bits, backoff_bits;    // Bhiksha compression (simple form).  Only works with trie.    uint8_t pointer_bhiksha_bits; -   -   +    // ONLY EFFECTIVE WHEN READING BINARY -   +    // How to get the giant array into memory: lazy mmap, populate, read etc. -  // See util/mmap.hh for details of MapMethod.   +  // See util/mmap.hh for details of MapMethod.    util::LoadMethod load_method; - -  // Set defaults.  +  // Set defaults.    Config();  }; diff --git a/klm/lm/max_order.hh b/klm/lm/max_order.hh index ea0dea46..3eb97ccd 100644 --- a/klm/lm/max_order.hh +++ b/klm/lm/max_order.hh @@ -7,5 +7,3 @@  #ifndef KENLM_ORDER_MESSAGE  #define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile.  In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'.  Otherwise, edit lm/max_order.hh."  #endif - -#define KENLM_MAX_ORDER 5 diff --git a/klm/lm/model.cc b/klm/lm/model.cc index fc61efee..a40fd2fb 100644 --- a/klm/lm/model.cc +++ b/klm/lm/model.cc @@ -37,7 +37,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT  template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {    LoadLM(file, config, *this); -  // g++ prints warnings unless these are fully initialized.   +  // g++ prints warnings unless these are fully initialized.    State begin_sentence = State();    begin_sentence.length = 1;    begin_sentence.words[0] = vocab_.BeginSentence(); @@ -69,8 +69,8 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT  }  template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) { -  // Backing file is the ARPA.  Steal it so we can make the backing file the mmap output if any.   -  util::FilePiece f(backing_.file.release(), file, config.messages); +  // Backing file is the ARPA.  Steal it so we can make the backing file the mmap output if any. +  util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());    try {      std::vector<uint64_t> counts;      // File counts do not include pruned trigrams that extend to quadgrams etc.   These will be fixed by search_. @@ -80,7 +80,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT      if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");      std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config)); -    // Setup the binary file for writing the vocab lookup table.  The search_ is responsible for growing the binary file to its needs.   +    // Setup the binary file for writing the vocab lookup table.  The search_ is responsible for growing the binary file to its needs.      vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);      if (config.write_mmap) { @@ -95,7 +95,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT      if (!vocab_.SawUnk()) {        assert(config.unknown_missing != THROW_UP); -      // Default probabilities for unknown.   +      // Default probabilities for unknown.        search_.UnknownUnigram().backoff = 0.0;        search_.UnknownUnigram().prob = config.unknown_missing_logprob;      } @@ -147,7 +147,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,  }  template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const { -  // Generate a state from context.   +  // Generate a state from context.    context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);    if (context_rend == context_rbegin) {      out_state.length = 0; @@ -191,7 +191,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,      ret.rest = ptr.Rest();      ret.prob = ptr.Prob();      ret.extend_left = extend_pointer; -    // If this function is called, then it does depend on left words.    +    // If this function is called, then it does depend on left words.      ret.independent_left = false;    }    float subtract_me = ret.rest; @@ -199,7 +199,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,    next_use = extend_length;    ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret);    next_use -= extend_length; -  // Charge backoffs.   +  // Charge backoffs.    for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;    ret.prob -= subtract_me;    ret.rest -= subtract_me; @@ -209,7 +209,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,  namespace {  // Do a paraonoid copy of history, assuming new_word has already been copied  // (hence the -1).  out_state.length could be zero so I avoided using -// std::copy.    +// std::copy.  void CopyRemainingHistory(const WordIndex *from, State &out_state) {    WordIndex *out = out_state.words + 1;    const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1; @@ -217,10 +217,10 @@ void CopyRemainingHistory(const WordIndex *from, State &out_state) {  }  } // namespace -/* Ugly optimized function.  Produce a score excluding backoff.   - * The search goes in increasing order of ngram length.   +/* Ugly optimized function.  Produce a score excluding backoff. + * The search goes in increasing order of ngram length.   * Context goes backward, so context_begin is the word immediately preceeding - * new_word.   + * new_word.   */  template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff(      const WordIndex *const context_rbegin, @@ -229,7 +229,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,      State &out_state) const {    assert(new_word < vocab_.Bound());    FullScoreReturn ret; -  // ret.ngram_length contains the last known non-blank ngram length.   +  // ret.ngram_length contains the last known non-blank ngram length.    ret.ngram_length = 1;    typename Search::Node node; @@ -238,9 +238,9 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,    ret.prob = uni.Prob();    ret.rest = uni.Rest(); -  // This is the length of the context that should be used for continuation to the right.   +  // This is the length of the context that should be used for continuation to the right.    out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0; -  // We'll write the word anyway since it will probably be used and does no harm being there.   +  // We'll write the word anyway since it will probably be used and does no harm being there.    out_state.words[0] = new_word;    if (context_rbegin == context_rend) return ret; diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index debcfd07..1b0d9b26 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -55,7 +55,7 @@ struct ProbPointer {    uint64_t index;  }; -// Array of n-grams and float indices.   +// Array of n-grams and float indices.  class BackoffMessages {    public:      void Init(std::size_t entry_size) { @@ -100,7 +100,7 @@ class BackoffMessages {      void Apply(float *const *const base, RecordReader &reader) {        FinishedAdding();        if (current_ == allocated_) return; -      // We'll also use the same buffer to record messages to blanks that they extend.   +      // We'll also use the same buffer to record messages to blanks that they extend.        WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);        const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);        for (reader.Rewind(); reader && (current_ != allocated_); ) { @@ -109,7 +109,7 @@ class BackoffMessages {              ++reader;              break;            case 1: -            // Message but nobody to receive it.  Write it down at the beginning of the buffer so we can inform this blank that it extends.   +            // Message but nobody to receive it.  Write it down at the beginning of the buffer so we can inform this blank that it extends.              for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;              current_ += entry_size_;              break; @@ -126,7 +126,7 @@ class BackoffMessages {              break;          }        } -      // Now this is a list of blanks that extend right.   +      // Now this is a list of blanks that extend right.        entry_size_ = sizeof(WordIndex) * order;        Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));        current_ = (uint8_t*)backing_.get(); @@ -153,7 +153,7 @@ class BackoffMessages {    private:      void FinishedAdding() {        Resize(current_ - (uint8_t*)backing_.get()); -      // Sort requests in same order as files.   +      // Sort requests in same order as files.        std::sort(            util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),            util::SizedIterator(util::SizedProxy(current_, entry_size_)), @@ -220,7 +220,7 @@ class SRISucks {      }    private: -    // This used to be one array.  Then I needed to separate it by order for quantization to work.   +    // This used to be one array.  Then I needed to separate it by order for quantization to work.      std::vector<float> values_[KENLM_MAX_ORDER - 1];      BackoffMessages messages_[KENLM_MAX_ORDER - 1]; @@ -253,7 +253,7 @@ class FindBlanks {        ++counts_.back();      } -    // Unigrams wrote one past.   +    // Unigrams wrote one past.      void Cleanup() {        --counts_[0];      } @@ -270,15 +270,15 @@ class FindBlanks {      SRISucks &sri_;  }; -// Phase to actually write n-grams to the trie.   +// Phase to actually write n-grams to the trie.  template <class Quant, class Bhiksha> class WriteEntries {    public: -    WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :  +    WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :        contexts_(contexts),        quant_(quant),        unigrams_(unigrams),        middle_(middle), -      longest_(longest),  +      longest_(longest),        bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),        order_(order),        sri_(sri) {} @@ -328,7 +328,7 @@ struct Gram {    const WordIndex *begin, *end; -  // For queue, this is the direction we want.   +  // For queue, this is the direction we want.    bool operator<(const Gram &other) const {      return std::lexicographical_compare(other.begin, other.end, begin, end);    } @@ -353,7 +353,7 @@ template <class Doing> class BlankManager {          been_length_ = length;          return;        } -      // There are blanks to insert starting with order blank.   +      // There are blanks to insert starting with order blank.        unsigned char blank = cur - to + 1;        UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");        const float *lower_basis; @@ -363,7 +363,7 @@ template <class Doing> class BlankManager {          assert(*lower_basis != kBadProb);          doing_.MiddleBlank(blank, to, based_on, *lower_basis);          *pre = *cur; -        // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.   +        // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.          basis_[blank - 1] = kBadProb;        }        *pre = *cur; @@ -377,7 +377,7 @@ template <class Doing> class BlankManager {      unsigned char been_length_;      float basis_[KENLM_MAX_ORDER]; -     +      Doing &doing_;  }; @@ -451,7 +451,7 @@ template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, Re  }  void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) { -  // Fill unigram probabilities.   +  // Fill unigram probabilities.    try {      rewind(file);      for (WordIndex i = 0; i < unigram_count; ++i) { @@ -486,7 +486,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve      util::scoped_memory unigrams;      MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);      FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri); -    RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder); +    RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder);      fixed_counts = finder.Counts();    }    unigram_file.reset(util::FDOpenOrThrow(unigram_fd)); @@ -504,7 +504,8 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve      inputs[i-2].Rewind();    }    if (Quant::kTrain) { -    util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), config.messages, "Quantizing"); +    util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), +                                  config.ProgressMessages(), "Quantizing");      for (unsigned char i = 2; i < counts.size(); ++i) {        TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);      } @@ -519,13 +520,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve    for (unsigned char i = 2; i <= counts.size(); ++i) {      inputs[i-2].Rewind();    } -  // Fill entries except unigram probabilities.   +  // Fill entries except unigram probabilities.    {      WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri); -    RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer); +    RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);    } -  // Do not disable this error message or else too little state will be returned.  Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.    +  // Do not disable this error message or else too little state will be returned.  Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.    for (unsigned char order = 2; order <= counts.size(); ++order) {      const RecordReader &context = contexts[order - 2];      if (context) { @@ -541,13 +542,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve    }    /* Set ending offsets so the last entry will be sized properly */ -  // Last entry for unigrams was already set.   +  // Last entry for unigrams was already set.    if (out.middle_begin_ != out.middle_end_) {      for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {        i->FinishedLoading((i+1)->InsertIndex(), config);      }      (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config); -  }   +  }  }  template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) { @@ -595,7 +596,7 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::Initializ    } else {      temporary_prefix = file;    } -  // At least 1MB sorting memory.   +  // At least 1MB sorting memory.    SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);    BuildTrie(sorted, counts, config, *this, quant_, vocab, backing); diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 5306850f..a676bdb3 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -27,6 +27,7 @@ libklm_util_a_SOURCES = \    mmap.cc \    murmur_hash.cc \    pool.cc \ +	read_compressed.cc \    string_piece.cc \  	usage.cc diff --git a/klm/util/exception.hh b/klm/util/exception.hh index 053a850b..0165a7a3 100644 --- a/klm/util/exception.hh +++ b/klm/util/exception.hh @@ -87,8 +87,14 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep    throw UTIL_e; \  } while (0) +#if __GNUC__ >= 3 +#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0) +#else +#define UTIL_UNLIKELY(x) (x) +#endif +  #define UTIL_THROW_IF(Condition, Exception, Modify) do { \ -  if (Condition) { \ +  if (UTIL_UNLIKELY(Condition)) { \      Exception UTIL_e; \      UTIL_SET_LOCATION(UTIL_e, #Exception, #Condition); \      UTIL_e << Modify; \ diff --git a/klm/util/file.cc b/klm/util/file.cc index 6bf879ac..b9a77cf9 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -15,6 +15,8 @@  #if defined(_WIN32) || defined(_WIN64)  #include <windows.h>  #include <io.h> +#include <algorithm> +#include <limits.h>  #else  #include <unistd.h>  #endif @@ -48,7 +50,7 @@ int OpenReadOrThrow(const char *name) {  int CreateOrThrow(const char *name) {    int ret;  #if defined(_WIN32) || defined(_WIN64) -  UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); +  UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR | _O_BINARY, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);  #else    UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);  #endif @@ -74,16 +76,22 @@ void ResizeOrThrow(int fd, uint64_t to) {  #endif  } -#ifdef WIN32 -typedef int ssize_t; +std::size_t PartialRead(int fd, void *to, std::size_t amount) { +#if defined(_WIN32) || defined(_WIN64) +  amount = min(static_cast<std::size_t>(INT_MAX), amount); +  int ret = _read(fd, to, amount);  +#else +  ssize_t ret = read(fd, to, amount);  #endif +  UTIL_THROW_IF(ret < 0, ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); +  return static_cast<std::size_t>(ret); +}  void ReadOrThrow(int fd, void *to_void, std::size_t amount) {    uint8_t *to = static_cast<uint8_t*>(to_void);    while (amount) { -    ssize_t ret = read(fd, to, amount); -    UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); -    UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); +    std::size_t ret = PartialRead(fd, to, amount); +    UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");      amount -= ret;      to += ret;    } @@ -93,8 +101,7 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {    uint8_t *to = static_cast<uint8_t*>(to_void);    std::size_t remaining = amount;    while (remaining) { -    ssize_t ret = read(fd, to, remaining); -    UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed."); +    std::size_t ret = PartialRead(fd, to, remaining);      if (!ret) return amount - remaining;      remaining -= ret;      to += ret; @@ -105,7 +112,11 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {  void WriteOrThrow(int fd, const void *data_void, std::size_t size) {    const uint8_t *data = static_cast<const uint8_t*>(data_void);    while (size) { +#if defined(_WIN32) || defined(_WIN64) +    int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size)); +#else      ssize_t ret = write(fd, data, size); +#endif      if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");      data += ret;      size -= ret; @@ -114,7 +125,7 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {  void WriteOrThrow(FILE *to, const void *data, std::size_t size) {    assert(size); -  if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size); +  UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), util::ErrnoException, "Short write; requested size " << size);  }  void FSyncOrThrow(int fd) { @@ -149,14 +160,15 @@ void SeekEnd(int fd) {  std::FILE *FDOpenOrThrow(scoped_fd &file) {    std::FILE *ret = fdopen(file.get(), "r+b"); -  if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen"); +  if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());    file.release();    return ret;  } -std::FILE *FOpenOrThrow(const char *path, const char *mode) { -  std::FILE *ret; -  UTIL_THROW_IF(!(ret = fopen(path, mode)), util::ErrnoException, "Could not fopen " << path << " for " << mode); +std::FILE *FDOpenReadOrThrow(scoped_fd &file) { +  std::FILE *ret = fdopen(file.get(), "rb"); +  if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get()); +  file.release();    return ret;  } diff --git a/klm/util/file.hh b/klm/util/file.hh index 185cb1f3..c24580d6 100644 --- a/klm/util/file.hh +++ b/klm/util/file.hh @@ -32,8 +32,6 @@ class scoped_fd {        return ret;      } -    operator bool() { return fd_ != -1; } -    private:      int fd_; @@ -76,8 +74,9 @@ uint64_t SizeFile(int fd);  void ResizeOrThrow(int fd, uint64_t to); +std::size_t PartialRead(int fd, void *to, std::size_t size);  void ReadOrThrow(int fd, void *to, std::size_t size); -std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount); +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size);  void WriteOrThrow(int fd, const void *data_void, std::size_t size);  void WriteOrThrow(FILE *to, const void *data, std::size_t size); @@ -90,8 +89,7 @@ void AdvanceOrThrow(int fd, int64_t off);  void SeekEnd(int fd);  std::FILE *FDOpenOrThrow(scoped_fd &file); - -std::FILE *FOpenOrThrow(const char *path, const char *mode); +std::FILE *FDOpenReadOrThrow(scoped_fd &file);  class TempMaker {    public: diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index 280f438c..5a208eff 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -14,7 +14,6 @@  #include <limits>  #include <assert.h> -#include <ctype.h>  #include <fcntl.h>  #include <stdlib.h>  #include <sys/types.h> @@ -26,13 +25,6 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {    *this << "Could not parse \"" << value << "\" into a number";  } -#ifdef HAVE_ZLIB -GZException::GZException(gzFile file) { -  int num; -  *this << gzerror(file, &num) << " from zlib"; -} -#endif // HAVE_ZLIB -  // Sigh this is the only way I could come up with to do a _const_ bool.  It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).   const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; @@ -48,19 +40,7 @@ FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std:    Initialize(name, show_progress, min_buffer);  } -FilePiece::~FilePiece() { -#ifdef HAVE_ZLIB -  if (gz_file_) { -    // zlib took ownership -    file_.release(); -    int ret; -    if (Z_OK != (ret = gzclose(gz_file_))) { -      std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl; -      abort(); -    } -  } -#endif -} +FilePiece::~FilePiece() {}  StringPiece FilePiece::ReadLine(char delim) {    std::size_t skip = 0; @@ -95,9 +75,6 @@ unsigned long int FilePiece::ReadULong() {  }  void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer)  { -#ifdef HAVE_ZLIB -  gz_file_ = NULL; -#endif    file_name_ = name;    default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2); @@ -117,10 +94,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s    }    Shift();    // gzip detect. -  if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) { -#ifndef HAVE_ZLIB -    UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in."); -#endif +  if ((position_end_ - position_) >= ReadCompressed::kMagicSize && ReadCompressed::DetectCompressedMagic(position_)) {      if (!fallback_to_read_) {        at_end_ = false;        TransitionToRead(); @@ -197,7 +171,7 @@ void FilePiece::Shift() {    if (fallback_to_read_) ReadShift();    for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) { -    if (isspace(*last_space_))  break; +    if (kSpaces[static_cast<unsigned char>(*last_space_)])  break;    }  } @@ -248,17 +222,14 @@ void FilePiece::TransitionToRead() {    position_ = data_.begin();    position_end_ = position_; -#ifdef HAVE_ZLIB -  assert(!gz_file_); -  gz_file_ = gzdopen(file_.get(), "r"); -  UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_); -#endif +  try { +    fell_back_.Reset(file_.release()); +  } catch (util::Exception &e) { +    e << " in file " << file_name_; +    throw; +  }  } -#ifdef WIN32 -typedef int ssize_t; -#endif -  void FilePiece::ReadShift() {    assert(fallback_to_read_);    // Bytes [data_.begin(), position_) have been consumed.   @@ -283,7 +254,7 @@ void FilePiece::ReadShift() {        position_ = data_.begin();        position_end_ = position_ + valid_length;      } else { -      size_t moving = position_end_ - position_; +      std::size_t moving = position_end_ - position_;        memmove(data_.get(), position_, moving);        position_ = data_.begin();        position_end_ = position_ + moving; @@ -291,20 +262,9 @@ void FilePiece::ReadShift() {      }    } -  ssize_t read_return; -#ifdef HAVE_ZLIB -  read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); -  if (read_return == -1) throw GZException(gz_file_); -  if (total_size_ != kBadSize) { -    // Just get the position, don't actually seek.  Apparently this is how you do it. . .  -    off_t ret = lseek(file_.get(), 0, SEEK_CUR); -    if (ret != -1) progress_.Set(ret); -  } -#else -  read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); -  UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed"); -  progress_.Set(mapped_offset_); -#endif +  std::size_t read_return = fell_back_.Read(static_cast<uint8_t*>(data_.get()) + already_read, default_map_size_ - already_read); +  progress_.Set(fell_back_.RawAmount()); +    if (read_return == 0) {      at_end_ = true;    } diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index af93d8aa..39bd1581 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -4,8 +4,8 @@  #include "util/ersatz_progress.hh"  #include "util/exception.hh"  #include "util/file.hh" -#include "util/have.hh"  #include "util/mmap.hh" +#include "util/read_compressed.hh"  #include "util/string_piece.hh"  #include <cstddef> @@ -13,10 +13,6 @@  #include <stdint.h> -#ifdef HAVE_ZLIB -#include <zlib.h> -#endif -  namespace util {  class ParseNumberException : public Exception { @@ -25,28 +21,19 @@ class ParseNumberException : public Exception {      ~ParseNumberException() throw() {}  }; -class GZException : public Exception { -  public: -#ifdef HAVE_ZLIB -    explicit GZException(gzFile file); -#endif -    GZException() throw() {} -    ~GZException() throw() {} -}; -  extern const bool kSpaces[256]; -// Memory backing the returned StringPiece may vanish on the next call.   +// Memory backing the returned StringPiece may vanish on the next call.  class FilePiece {    public: -    // 32 MB default. -    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); -    // Takes ownership of fd.  name is used for messages.   -    explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); +    // 1 MB default. +    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); +    // Takes ownership of fd.  name is used for messages. +    explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);      ~FilePiece(); -      -    char get() {  + +    char get() {        if (position_ == position_end_) {          Shift();          if (at_end_) throw EndOfFileException(); @@ -54,14 +41,14 @@ class FilePiece {        return *(position_++);      } -    // Leaves the delimiter, if any, to be returned by get().  Delimiters defined by isspace().   +    // Leaves the delimiter, if any, to be returned by get().  Delimiters defined by isspace().      StringPiece ReadDelimited(const bool *delim = kSpaces) {        SkipSpaces(delim);        return Consume(FindDelimiterOrEOF(delim));      }      // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. -    // It is similar to getline in that way.   +    // It is similar to getline in that way.      StringPiece ReadLine(char delim = '\n');      float ReadFloat(); @@ -69,7 +56,7 @@ class FilePiece {      long int ReadLong();      unsigned long int ReadULong(); -    // Skip spaces defined by isspace.   +    // Skip spaces defined by isspace.      void SkipSpaces(const bool *delim = kSpaces) {        for (; ; ++position_) {          if (position_ == position_end_) Shift(); @@ -82,7 +69,7 @@ class FilePiece {      }      const std::string &FileName() const { return file_name_; } -     +    private:      void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); @@ -122,9 +109,7 @@ class FilePiece {      std::string file_name_; -#ifdef HAVE_ZLIB -    gzFile gz_file_; -#endif // HAVE_ZLIB +    ReadCompressed fell_back_;  };  } // namespace util diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index f912e18a..e79ece7a 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -38,7 +38,7 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) {    BOOST_CHECK_THROW(test.get(), EndOfFileException);  } -#ifndef __APPLE__ +#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)  /* Apple isn't happy with the popen, fileno, dup.  And I don't want to   * reimplement popen.  This is an issue with the test.     */ @@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) {    BOOST_CHECK_THROW(test.get(), EndOfFileException);    BOOST_REQUIRE(!pclose(catter));  } -#endif // __APPLE__ +#endif  #ifdef HAVE_ZLIB diff --git a/klm/util/have.hh b/klm/util/have.hh index b8181e99..1523c0c5 100644 --- a/klm/util/have.hh +++ b/klm/util/have.hh @@ -2,22 +2,12 @@  #ifndef UTIL_HAVE__  #define UTIL_HAVE__ -#ifndef HAVE_ZLIB -#if !defined(_WIN32) && !defined(_WIN64) -#define HAVE_ZLIB -#endif -#endif -  #ifndef HAVE_ICU  //#define HAVE_ICU  #endif  #ifndef HAVE_BOOST -#define HAVE_BOOST -#endif - -#ifndef HAVE_THREADS -//#define HAVE_THREADS +//#define HAVE_BOOST  #endif  #endif // UTIL_HAVE__ diff --git a/klm/util/joint_sort.hh b/klm/util/joint_sort.hh index cf3d8432..1b43ddcf 100644 --- a/klm/util/joint_sort.hh +++ b/klm/util/joint_sort.hh @@ -60,7 +60,7 @@ template <class KeyIter, class ValueIter> class JointProxy {      JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}      JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {} -    operator const value_type() const { +    operator value_type() const {        value_type ret;        ret.key = *inner_.key_;        ret.value = *inner_.value_; @@ -121,7 +121,7 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi  template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {    public: -    PairedIterator(const KeyIter &key, const ValueIter &value) :  +    PairedIterator(const KeyIter &key, const ValueIter &value) :        ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}  }; diff --git a/klm/util/read_compressed.cc b/klm/util/read_compressed.cc new file mode 100644 index 00000000..4ec94c4e --- /dev/null +++ b/klm/util/read_compressed.cc @@ -0,0 +1,403 @@ +#include "util/read_compressed.hh" + +#include "util/file.hh" +#include "util/have.hh" +#include "util/scoped.hh" + +#include <algorithm> +#include <iostream> + +#include <assert.h> +#include <limits.h> +#include <stdlib.h> +#include <string.h> + +#ifdef HAVE_ZLIB +#include <zlib.h> +#endif + +#ifdef HAVE_BZLIB +#include <bzlib.h> +#endif + +#ifdef HAVE_XZLIB +#include <lzma.h> +#endif + +namespace util { + +CompressedException::CompressedException() throw() {} +CompressedException::~CompressedException() throw() {} + +GZException::GZException() throw() {} +GZException::~GZException() throw() {} + +BZException::BZException() throw() {} +BZException::~BZException() throw() {} + +XZException::XZException() throw() {} +XZException::~XZException() throw() {} + +class ReadBase { +  public: +    virtual ~ReadBase() {} + +    virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0; + +  protected: +    static void ReplaceThis(ReadBase *with, ReadCompressed &thunk) { +      thunk.internal_.reset(with); +    } + +    static uint64_t &ReadCount(ReadCompressed &thunk) { +      return thunk.raw_amount_; +    } +}; + +namespace { + +// Completed file that other classes can thunk to.   +class Complete : public ReadBase { +  public: +    std::size_t Read(void *, std::size_t, ReadCompressed &) { +      return 0; +    } +}; + +class Uncompressed : public ReadBase { +  public: +    explicit Uncompressed(int fd) : fd_(fd) {} + +    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { +      std::size_t got = PartialRead(fd_.get(), to, amount); +      ReadCount(thunk) += got; +      return got; +    } + +  private: +    scoped_fd fd_; +}; + +class UncompressedWithHeader : public ReadBase { +  public: +    UncompressedWithHeader(int fd, void *already_data, std::size_t already_size) : fd_(fd) { +      assert(already_size); +      buf_.reset(malloc(already_size)); +      if (!buf_.get()) throw std::bad_alloc(); +      memcpy(buf_.get(), already_data, already_size); +      remain_ = static_cast<uint8_t*>(buf_.get()); +      end_ = remain_ + already_size; +    } + +    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { +      assert(buf_.get()); +      std::size_t sending = std::min<std::size_t>(amount, end_ - remain_); +      memcpy(to, remain_, sending); +      remain_ += sending; +      if (remain_ == end_) { +        ReplaceThis(new Uncompressed(fd_.release()), thunk); +      } +      return sending; +    } + +  private: +    scoped_malloc buf_; +    uint8_t *remain_; +    uint8_t *end_; + +    scoped_fd fd_; +}; + +#ifdef HAVE_ZLIB +class GZip : public ReadBase { +  private: +    static const std::size_t kInputBuffer = 16384; +  public: +    GZip(int fd, void *already_data, std::size_t already_size)  +      : file_(fd), in_buffer_(malloc(kInputBuffer)) { +      if (!in_buffer_.get()) throw std::bad_alloc(); +      assert(already_size < kInputBuffer); +      if (already_size) { +        memcpy(in_buffer_.get(), already_data, already_size); +        stream_.next_in = static_cast<Bytef *>(in_buffer_.get()); +        stream_.avail_in = already_size; +        stream_.avail_in += ReadOrEOF(file_.get(), static_cast<uint8_t*>(in_buffer_.get()) + already_size, kInputBuffer - already_size); +      } else { +        stream_.avail_in = 0; +      } +      stream_.zalloc = Z_NULL; +      stream_.zfree = Z_NULL; +      stream_.opaque = Z_NULL; +      stream_.msg = NULL; +      // 32 for zlib and gzip decoding with automatic header detection.   +      // 15 for maximum window size.   +      UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib."); +    } + +    ~GZip() { +      if (Z_OK != inflateEnd(&stream_)) { +        std::cerr << "zlib could not close properly." << std::endl; +        abort(); +      } +    } + +    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { +      if (amount == 0) return 0; +      stream_.next_out = static_cast<Bytef*>(to); +      stream_.avail_out = std::min<std::size_t>(std::numeric_limits<uInt>::max(), amount); +      do { +        if (!stream_.avail_in) ReadInput(thunk); +        int result = inflate(&stream_, 0); +        switch (result) { +          case Z_OK: +            break; +          case Z_STREAM_END: +            { +              std::size_t ret = static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to); +              ReplaceThis(new Complete(), thunk); +              return ret; +            } +          case Z_ERRNO: +            UTIL_THROW(ErrnoException, "zlib error"); +          default: +            UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? stream_.msg : "an error ") << " code " << result); +        } +      } while (stream_.next_out == to); +      return static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to); +    } + +  private: +    void ReadInput(ReadCompressed &thunk) { +      assert(!stream_.avail_in); +      stream_.next_in = static_cast<Bytef *>(in_buffer_.get()); +      stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); +      ReadCount(thunk) += stream_.avail_in; +    } + +    scoped_fd file_; +    scoped_malloc in_buffer_; +    z_stream stream_; +}; +#endif // HAVE_ZLIB + +#ifdef HAVE_BZLIB +class BZip : public ReadBase { +  public: +    explicit BZip(int fd, void *already_data, std::size_t already_size) { +      scoped_fd hold(fd); +      closer_.reset(FDOpenReadOrThrow(hold)); +      int bzerror = BZ_OK; +      file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size); +      switch (bzerror) { +        case BZ_OK: +          return; +        case BZ_CONFIG_ERROR: +          UTIL_THROW(BZException, "Looks like bzip2 was miscompiled."); +        case BZ_PARAM_ERROR: +          UTIL_THROW(BZException, "Parameter error"); +        case BZ_IO_ERROR: +          UTIL_THROW(BZException, "IO error reading file"); +        case BZ_MEM_ERROR: +          throw std::bad_alloc(); +      } +    } + +    ~BZip() { +      int bzerror = BZ_OK; +      BZ2_bzReadClose(&bzerror, file_); +      if (bzerror != BZ_OK) { +        std::cerr << "bz2 readclose error" << std::endl; +        abort(); +      } +    } + +    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { +      int bzerror = BZ_OK; +      int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount)); +      long pos; +      switch (bzerror) { +        case BZ_STREAM_END: +          pos = ftell(closer_.get()); +          if (pos != -1) ReadCount(thunk) = pos; +          ReplaceThis(new Complete(), thunk); +          return ret; +        case BZ_OK: +          pos = ftell(closer_.get()); +          if (pos != -1) ReadCount(thunk) = pos; +          return ret; +        default: +          UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror); +      } +    } + +  private: +    scoped_FILE closer_; +    BZFILE *file_; +}; +#endif // HAVE_BZLIB + +#ifdef HAVE_XZLIB +class XZip : public ReadBase { +  private: +    static const std::size_t kInputBuffer = 16384; +  public: +    XZip(int fd, void *already_data, std::size_t already_size)  +      : file_(fd), in_buffer_(malloc(kInputBuffer)), stream_(), action_(LZMA_RUN) { +      if (!in_buffer_.get()) throw std::bad_alloc(); +      assert(already_size < kInputBuffer); +      if (already_size) { +        memcpy(in_buffer_.get(), already_data, already_size); +        stream_.next_in = static_cast<const uint8_t*>(in_buffer_.get()); +        stream_.avail_in = already_size; +        stream_.avail_in += ReadOrEOF(file_.get(), static_cast<uint8_t*>(in_buffer_.get()) + already_size, kInputBuffer - already_size); +      } else { +        stream_.avail_in = 0; +      } +      stream_.allocator = NULL; +      lzma_ret ret = lzma_stream_decoder(&stream_, UINT64_MAX, LZMA_CONCATENATED); +      switch (ret) { +        case LZMA_OK: +          break; +        case LZMA_MEM_ERROR: +          UTIL_THROW(ErrnoException, "xz open error"); +        default: +          UTIL_THROW(XZException, "xz error code " << ret); +      } +    } + +    ~XZip() { +      lzma_end(&stream_); +    } + +    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { +      if (amount == 0) return 0; +      stream_.next_out = static_cast<uint8_t*>(to); +      stream_.avail_out = amount; +      do { +        if (!stream_.avail_in) ReadInput(thunk); +        lzma_ret status = lzma_code(&stream_, action_); +        switch (status) { +          case LZMA_OK: +            break; +          case LZMA_STREAM_END: +            UTIL_THROW_IF(action_ != LZMA_FINISH, XZException, "Input not finished yet."); +            { +              std::size_t ret = static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to); +              ReplaceThis(new Complete(), thunk); +              return ret; +            } +          case LZMA_MEM_ERROR: +            throw std::bad_alloc(); +          case LZMA_FORMAT_ERROR: +            UTIL_THROW(XZException, "xzlib says file format not recognized"); +          case LZMA_OPTIONS_ERROR: +            UTIL_THROW(XZException, "xzlib says unsupported compression options"); +          case LZMA_DATA_ERROR: +            UTIL_THROW(XZException, "xzlib says this file is corrupt"); +          case LZMA_BUF_ERROR: +            UTIL_THROW(XZException, "xzlib says unexpected end of input"); +          default: +            UTIL_THROW(XZException, "unrecognized xzlib error " << status); +        } +      } while (stream_.next_out == to); +      return static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to); +    } + +  private: +    void ReadInput(ReadCompressed &thunk) { +      assert(!stream_.avail_in); +      stream_.next_in = static_cast<const uint8_t*>(in_buffer_.get()); +      stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); +      if (!stream_.avail_in) action_ = LZMA_FINISH; +      ReadCount(thunk) += stream_.avail_in; +    } + +    scoped_fd file_; +    scoped_malloc in_buffer_; +    lzma_stream stream_; + +    lzma_action action_; +}; +#endif // HAVE_XZLIB + +enum MagicResult { +  UNKNOWN, GZIP, BZIP, XZIP +}; + +MagicResult DetectMagic(const void *from_void) { +  const uint8_t *header = static_cast<const uint8_t*>(from_void); +  if (header[0] == 0x1f && header[1] == 0x8b) { +    return GZIP; +  } +  if (header[0] == 'B' && header[1] == 'Z') { +    return BZIP; +  } +  const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; +  if (!memcmp(header, xzmagic, 6)) { +    return XZIP; +  } +  return UNKNOWN; +} + +ReadBase *ReadFactory(int fd, uint64_t &raw_amount) { +  scoped_fd hold(fd); +  unsigned char header[ReadCompressed::kMagicSize]; +  raw_amount = ReadOrEOF(fd, header, ReadCompressed::kMagicSize); +  if (!raw_amount) +    return new Uncompressed(hold.release()); +  if (raw_amount != ReadCompressed::kMagicSize) +    return new UncompressedWithHeader(hold.release(), header, raw_amount); +  switch (DetectMagic(header)) { +    case GZIP: +#ifdef HAVE_ZLIB +      return new GZip(hold.release(), header, ReadCompressed::kMagicSize); +#else +      UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in."); +#endif +    case BZIP: +#ifdef HAVE_BZLIB +      return new BZip(hold.release(), header, ReadCompressed::kMagicSize); +#else +      UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZ), but bzip support was not compiled in."); +#endif +    case XZIP: +#ifdef HAVE_XZLIB +      return new XZip(hold.release(), header, ReadCompressed::kMagicSize); +#else +      UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in."); +#endif +    case UNKNOWN: +      break; +  } +  try { +    AdvanceOrThrow(fd, -ReadCompressed::kMagicSize); +  } catch (const util::ErrnoException &e) { +    return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize); +  } +  return new Uncompressed(hold.release()); +} + +} // namespace + +bool ReadCompressed::DetectCompressedMagic(const void *from_void) { +  return DetectMagic(from_void) != UNKNOWN; +} + +ReadCompressed::ReadCompressed(int fd) { +  Reset(fd); +} + +ReadCompressed::ReadCompressed() {} + +ReadCompressed::~ReadCompressed() {} + +void ReadCompressed::Reset(int fd) { +  internal_.reset(); +  internal_.reset(ReadFactory(fd, raw_amount_)); +} + +std::size_t ReadCompressed::Read(void *to, std::size_t amount) { +  return internal_->Read(to, amount, *this); +} + +} // namespace util diff --git a/klm/util/read_compressed.hh b/klm/util/read_compressed.hh new file mode 100644 index 00000000..83ca9fb2 --- /dev/null +++ b/klm/util/read_compressed.hh @@ -0,0 +1,74 @@ +#ifndef UTIL_READ_COMPRESSED__ +#define UTIL_READ_COMPRESSED__ + +#include "util/exception.hh" +#include "util/scoped.hh" + +#include <cstddef> + +#include <stdint.h> + +namespace util { + +class CompressedException : public Exception { +  public: +    CompressedException() throw(); +    virtual ~CompressedException() throw(); +}; + +class GZException : public CompressedException { +  public: +    GZException() throw(); +    ~GZException() throw(); +}; + +class BZException : public CompressedException { +  public: +    BZException() throw(); +    ~BZException() throw(); +}; + +class XZException : public CompressedException { +  public: +    XZException() throw(); +    ~XZException() throw(); +}; + +class ReadBase; + +class ReadCompressed { +  public: +    static const std::size_t kMagicSize = 6; +    // Must have at least kMagicSize bytes.   +    static bool DetectCompressedMagic(const void *from); + +    // Takes ownership of fd.    +    explicit ReadCompressed(int fd); + +    // Must call Reset later. +    ReadCompressed(); + +    ~ReadCompressed(); + +    // Takes ownership of fd.   +    void Reset(int fd); + +    std::size_t Read(void *to, std::size_t amount); + +    uint64_t RawAmount() const { return raw_amount_; } + +  private: +    friend class ReadBase; + +    scoped_ptr<ReadBase> internal_; + +    uint64_t raw_amount_; + +    // No copying.   +    ReadCompressed(const ReadCompressed &); +    void operator=(const ReadCompressed &); +}; + +} // namespace util + +#endif // UTIL_READ_COMPRESSED__ diff --git a/klm/util/read_compressed_test.cc b/klm/util/read_compressed_test.cc new file mode 100644 index 00000000..6fd97e5e --- /dev/null +++ b/klm/util/read_compressed_test.cc @@ -0,0 +1,94 @@ +#include "util/read_compressed.hh" + +#include "util/file.hh" +#include "util/have.hh" + +#define BOOST_TEST_MODULE ReadCompressedTest +#include <boost/test/unit_test.hpp> +#include <boost/scoped_ptr.hpp> + +#include <fstream> +#include <string> + +#include <stdlib.h> + +namespace util { +namespace { + +void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) { +  uint8_t *to = static_cast<uint8_t*>(to_void); +  while (amount) { +    std::size_t ret = reader.Read(to, amount); +    BOOST_REQUIRE(ret); +    to += ret; +    amount -= ret; +  } +} + +void TestRandom(const char *compressor) { +  const uint32_t kSize4 = 100000 / 4; +  char name[] = "tempXXXXXX"; + +  // Write test file.   +  { +    scoped_fd original(mkstemp(name)); +    BOOST_REQUIRE(original.get() > 0); +    for (uint32_t i = 0; i < kSize4; ++i) { +      WriteOrThrow(original.get(), &i, sizeof(uint32_t)); +    } +  } + +  char gzname[] = "tempXXXXXX"; +  scoped_fd gzipped(mkstemp(gzname)); + +  std::string command(compressor); +#ifdef __CYGWIN__ +  command += ".exe"; +#endif +  command += " <\""; +  command += name; +  command += "\" >\""; +  command += gzname; +  command += "\""; +  BOOST_REQUIRE_EQUAL(0, system(command.c_str())); + +  BOOST_CHECK_EQUAL(0, unlink(name)); +  BOOST_CHECK_EQUAL(0, unlink(gzname)); + +  ReadCompressed reader(gzipped.release()); +  for (uint32_t i = 0; i < kSize4; ++i) { +    uint32_t got; +    ReadLoop(reader, &got, sizeof(uint32_t)); +    BOOST_CHECK_EQUAL(i, got); +  } + +  char ignored; +  BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); +  // Test double EOF call. +  BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); +} + +BOOST_AUTO_TEST_CASE(Uncompressed) { +  TestRandom("cat"); +} + +#ifdef HAVE_ZLIB +BOOST_AUTO_TEST_CASE(ReadGZ) { +  TestRandom("gzip"); +} +#endif // HAVE_ZLIB + +#ifdef HAVE_BZLIB +BOOST_AUTO_TEST_CASE(ReadBZ) { +  TestRandom("bzip2"); +} +#endif // HAVE_BZLIB + +#ifdef HAVE_XZLIB +BOOST_AUTO_TEST_CASE(ReadXZ) { +  TestRandom("xz"); +} +#endif + +} // namespace +} // namespace util diff --git a/klm/util/scoped.hh b/klm/util/scoped.hh index 93e2e817..d62c6df1 100644 --- a/klm/util/scoped.hh +++ b/klm/util/scoped.hh @@ -1,40 +1,13 @@  #ifndef UTIL_SCOPED__  #define UTIL_SCOPED__ +/* Other scoped objects in the style of scoped_ptr. */  #include "util/exception.hh" - -/* Other scoped objects in the style of scoped_ptr. */  #include <cstddef>  #include <cstdlib>  namespace util { -template <class T, class R, R (*Free)(T*)> class scoped_thing { -  public: -    explicit scoped_thing(T *c = static_cast<T*>(0)) : c_(c) {} - -    ~scoped_thing() { if (c_) Free(c_); } - -    void reset(T *c) { -      if (c_) Free(c_); -      c_ = c; -    } - -    T &operator*() { return *c_; } -    const T&operator*() const { return *c_; } -    T &operator->() { return *c_; } -    const T&operator->() const { return *c_; } - -    T *get() { return c_; } -    const T *get() const { return c_; } - -  private: -    T *c_; - -    scoped_thing(const scoped_thing &); -    scoped_thing &operator=(const scoped_thing &); -}; -  class scoped_malloc {    public:      scoped_malloc() : p_(NULL) {} @@ -77,9 +50,6 @@ template <class T> class scoped_array {      T &operator*() { return *c_; }      const T&operator*() const { return *c_; } -    T &operator->() { return *c_; } -    const T&operator->() const { return *c_; } -      T &operator[](std::size_t idx) { return c_[idx]; }      const T &operator[](std::size_t idx) const { return c_[idx]; } @@ -90,6 +60,39 @@ template <class T> class scoped_array {    private:      T *c_; + +    scoped_array(const scoped_array &); +    void operator=(const scoped_array &); +}; + +template <class T> class scoped_ptr { +  public: +    explicit scoped_ptr(T *content = NULL) : c_(content) {} + +    ~scoped_ptr() { delete c_; } + +    T *get() { return c_; } +    const T* get() const { return c_; } + +    T &operator*() { return *c_; } +    const T&operator*() const { return *c_; } + +    T *operator->() { return c_; } +    const T*operator->() const { return c_; } + +    T &operator[](std::size_t idx) { return c_[idx]; } +    const T &operator[](std::size_t idx) const { return c_[idx]; } + +    void reset(T *to = NULL) { +      scoped_ptr<T> other(c_); +      c_ = to; +    } + +  private: +    T *c_; + +    scoped_ptr(const scoped_ptr &); +    void operator=(const scoped_ptr &);  };  } // namespace util diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index be6a643d..51481646 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -1,6 +1,6 @@  /* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n.  If   * you don't use ICU, then this will use the Google implementation from Chrome. - * This has been modified from the original version to let you choose.   + * This has been modified from the original version to let you choose.   */  // Copyright 2008, Google Inc. @@ -62,9 +62,9 @@  #include <unicode/stringpiece.h>  #include <unicode/uversion.h> -// Old versions of ICU don't define operator== and operator!=.   +// Old versions of ICU don't define operator== and operator!=.  #if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) -#warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.   +#warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.  inline bool operator==(const StringPiece& x, const StringPiece& y) {    if (x.size() != y.size())      return false; @@ -274,15 +274,28 @@ struct StringPieceCompatibleEquals : public std::binary_function<const StringPie    }  };  template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 +  std::string temp(key.data(), key.size()); +  return t.find(temp); +#else    return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif  } +  template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 +  std::string temp(key.data(), key.size()); +  return t.find(temp); +#else    return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif  }  #endif  #ifdef HAVE_ICU  U_NAMESPACE_END +using U_NAMESPACE_QUALIFIER StringPiece;  #endif +  #endif  // BASE_STRING_PIECE_H__ diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index 4a7f5460..a588c3fc 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -20,6 +20,7 @@ class OutOfTokens : public Exception {  class SingleCharacter {    public: +    SingleCharacter() {}      explicit SingleCharacter(char delim) : delim_(delim) {}      StringPiece Find(const StringPiece &in) const { @@ -32,6 +33,8 @@ class SingleCharacter {  class MultiCharacter {    public: +    MultiCharacter() {} +      explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}      StringPiece Find(const StringPiece &in) const { @@ -44,6 +47,7 @@ class MultiCharacter {  class AnyCharacter {    public: +    AnyCharacter() {}      explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}      StringPiece Find(const StringPiece &in) const { @@ -56,6 +60,8 @@ class AnyCharacter {  class AnyCharacterLast {    public: +    AnyCharacterLast() {} +      explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {}      StringPiece Find(const StringPiece &in) const { @@ -81,8 +87,8 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it        return current_.data() != 0;      } -    static TokenIter<Find> end() { -      return TokenIter<Find>(); +    static TokenIter<Find, SkipEmpty> end() { +      return TokenIter<Find, SkipEmpty>();      }    private: @@ -100,8 +106,8 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it        } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.        } -    bool equal(const TokenIter<Find> &other) const { -      return after_.data() == other.after_.data(); +    bool equal(const TokenIter<Find, SkipEmpty> &other) const { +      return current_.data() == other.current_.data();      }      const StringPiece &dereference() const { | 
