From d3e2ec203a5cf550320caa8023ac3dd103b0be7d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 13 Oct 2014 00:42:37 -0400 Subject: new kenlm --- klm/lm/builder/sort.hh | 157 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 8 deletions(-) (limited to 'klm/lm/builder/sort.hh') diff --git a/klm/lm/builder/sort.hh b/klm/lm/builder/sort.hh index 9989389b..712bb8e3 100644 --- a/klm/lm/builder/sort.hh +++ b/klm/lm/builder/sort.hh @@ -1,7 +1,7 @@ -#ifndef LM_BUILDER_SORT__ -#define LM_BUILDER_SORT__ +#ifndef LM_BUILDER_SORT_H +#define LM_BUILDER_SORT_H -#include "lm/builder/multi_stream.hh" +#include "lm/builder/ngram_stream.hh" #include "lm/builder/ngram.hh" #include "lm/word_index.hh" #include "util/stream/sort.hh" @@ -14,24 +14,71 @@ namespace lm { namespace builder { +/** + * Abstract parent class for defining custom n-gram comparators. + */ template class Comparator : public std::binary_function { public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ explicit Comparator(std::size_t order) : order_(order) {} + /** + * Applies the comparator using the Compare method that must be defined in any class that inherits from this class. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + * + * @see ContextOrder::Compare + * @see PrefixOrder::Compare + * @see SuffixOrder::Compare + */ inline bool operator()(const void *lhs, const void *rhs) const { return static_cast(this)->Compare(static_cast(lhs), static_cast(rhs)); } + /** Gets the n-gram order defined for this comparator. */ std::size_t Order() const { return order_; } protected: std::size_t order_; }; +/** + * N-gram comparator that compares n-grams according to their reverse (suffix) order. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the last word of each n-gram and ending with the first word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c > a d b + * - a b c > a b b + * - a b c > x a c + * - a b c < x y z + */ class SuffixOrder : public Comparator { public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ explicit SuffixOrder(std::size_t order) : Comparator(order) {} + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the last word of each n-gram and ending with the first word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (std::size_t i = order_ - 1; i != 0; --i) { if (lhs[i] != rhs[i]) @@ -43,10 +90,40 @@ class SuffixOrder : public Comparator { static const unsigned kMatchOffset = 1; }; + +/** + * N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; + * finally, this comparator compares the last word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c < a d b + * - a b c > a b b + * - a b c > x a c + * - a b c < x y z + */ class ContextOrder : public Comparator { public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ explicit ContextOrder(std::size_t order) : Comparator(order) {} + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; + * finally, this comparator compares the last word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (int i = order_ - 2; i >= 0; --i) { if (lhs[i] != rhs[i]) @@ -56,10 +133,37 @@ class ContextOrder : public Comparator { } }; +/** + * N-gram comparator that compares n-grams according to their natural (prefix) order. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the first word of each n-gram and ending with the last word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c < a d b + * - a b c > a b b + * - a b c < x a c + * - a b c < x y z + */ class PrefixOrder : public Comparator { public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ explicit PrefixOrder(std::size_t order) : Comparator(order) {} + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the first word of each n-gram and ending with the last word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (std::size_t i = 0; i < order_; ++i) { if (lhs[i] != rhs[i]) @@ -84,15 +188,52 @@ struct AddCombiner { }; // The combiner is only used on a single chain, so I didn't bother to allow -// that template. -template class Sorts : public FixedArray > { +// that template. +/** + * Represents an @ref util::FixedArray "array" capable of storing @ref util::stream::Sort "Sort" objects. + * + * In the anticipated use case, an instance of this class will maintain one @ref util::stream::Sort "Sort" object + * for each n-gram order (ranging from 1 up to the maximum n-gram order being processed). + * Use in this manner would enable the n-grams each n-gram order to be sorted, in parallel. + * + * @tparam Compare An @ref Comparator "ngram comparator" to use during sorting. + */ +template class Sorts : public util::FixedArray > { private: typedef util::stream::Sort S; - typedef FixedArray P; + typedef util::FixedArray P; public: + + /** + * Constructs, but does not initialize. + * + * @ref util::FixedArray::Init() "Init" must be called before use. + * + * @see util::FixedArray::Init() + */ + Sorts() {} + + /** + * Constructs an @ref util::FixedArray "array" capable of storing a fixed number of @ref util::stream::Sort "Sort" objects. + * + * @param number The maximum number of @ref util::stream::Sort "sorters" that can be held by this @ref util::FixedArray "array" + * @see util::FixedArray::FixedArray() + */ + explicit Sorts(std::size_t number) : util::FixedArray >(number) {} + + /** + * Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array". + * + * The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator"; + * once constructed, a new worker @ref util::stream::Thread "thread" (owned by the @ref util::stream::Chain "chain") will sort the n-gram data stored + * in the @ref util::stream::Block "blocks" of the provided @ref util::stream::Chain "chain". + * + * @see util::stream::Sort::Sort() + * @see util::stream::Chain::operator>>() + */ void push_back(util::stream::Chain &chain, const util::stream::SortConfig &config, const Compare &compare) { - new (P::end()) S(chain, config, compare); + new (P::end()) S(chain, config, compare); // use "placement new" syntax to initalize S in an already-allocated memory location P::Constructed(); } }; @@ -100,4 +241,4 @@ template class Sorts : public FixedArray