diff options
Diffstat (limited to 'utils')
| -rw-r--r-- | utils/ccrp_onetable.h | 2 | ||||
| -rw-r--r-- | utils/corpus_tools.cc | 20 | ||||
| -rw-r--r-- | utils/corpus_tools.h | 4 | ||||
| -rw-r--r-- | utils/dict.h | 3 | ||||
| -rw-r--r-- | utils/fast_sparse_vector.h | 108 | ||||
| -rw-r--r-- | utils/hash.h | 7 | ||||
| -rw-r--r-- | utils/sampler.h | 16 | 
7 files changed, 141 insertions, 19 deletions
| diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h index 1fe01b0e..abe399ea 100644 --- a/utils/ccrp_onetable.h +++ b/utils/ccrp_onetable.h @@ -183,7 +183,7 @@ class CCRP_OneTable {      assert(has_discount_prior() || has_alpha_prior());      DiscountResampler dr(*this);      ConcentrationResampler cr(*this); -    for (int iter = 0; iter < nloop; ++iter) { +    for (unsigned iter = 0; iter < nloop; ++iter) {        if (has_alpha_prior()) {          alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,                                 std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc index d17785af..191153a2 100644 --- a/utils/corpus_tools.cc +++ b/utils/corpus_tools.cc @@ -8,6 +8,26 @@  using namespace std; +void CorpusTools::ReadLine(const string& line, +                           vector<WordID>* src, +                           vector<WordID>* trg) { +  static const WordID kDIV = TD::Convert("|||"); +  static vector<WordID> tmp; +  src->clear(); +  trg->clear(); +  TD::ConvertSentence(line, &tmp); +  unsigned i = 0; +  while(i < tmp.size() && tmp[i] != kDIV) { +    src->push_back(tmp[i]); +    ++i; +  } +  if (i < tmp.size() && tmp[i] == kDIV) { +    ++i; +    for (; i < tmp.size() ; ++i) +      trg->push_back(tmp[i]); +  } +} +  void CorpusTools::ReadFromFile(const string& filename,                             vector<vector<WordID> >* src,                             set<WordID>* src_vocab, diff --git a/utils/corpus_tools.h b/utils/corpus_tools.h index 97bdaa94..f6699d87 100644 --- a/utils/corpus_tools.h +++ b/utils/corpus_tools.h @@ -7,6 +7,10 @@  #include "wordid.h"  struct CorpusTools { +  static void ReadLine(const std::string& line, +                       std::vector<WordID>* src, +                       std::vector<WordID>* trg); +    static void ReadFromFile(const std::string& filename,                             std::vector<std::vector<WordID> >* src,                             std::set<WordID>* src_vocab = NULL, diff --git a/utils/dict.h b/utils/dict.h index 75ea3def..f08d0cf4 100644 --- a/utils/dict.h +++ b/utils/dict.h @@ -12,7 +12,8 @@  class Dict {   typedef - HASH_MAP<std::string, WordID, boost::hash<std::string> > Map; + //HASH_MAP<std::string, WordID, boost::hash<std::string> > Map; + HASH_MAP<std::string, WordID> Map;   public:    Dict() : b0_("<bad0>") {      HASH_MAP_EMPTY(d_,"<bad1>"); diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index e86cbdc1..433a5cc5 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -66,6 +66,60 @@ BOOST_STATIC_ASSERT(sizeof(PairIntT<float>) == sizeof(std::pair<unsigned,float>)  template <typename T, unsigned LOCAL_MAX = (sizeof(T) == sizeof(float) ? 15u : 7u)>  class FastSparseVector {   public: +  struct iterator { +    iterator(FastSparseVector<T>& v, const bool is_end) : local_(!v.is_remote_) { +      if (local_) { +        local_it_ = &v.data_.local[is_end ? v.local_size_ : 0]; +      } else { +        if (is_end) +          remote_it_ = v.data_.rbmap->end(); +        else +          remote_it_ = v.data_.rbmap->begin(); +      } +    } +    iterator(FastSparseVector<T>& v, const bool, const unsigned k) : local_(!v.is_remote_) { +      if (local_) { +        unsigned i = 0; +        while(i < v.local_size_ && v.data_.local[i].first() != k) { ++i; } +        local_it_ = &v.data_.local[i]; +      } else { +        remote_it_ = v.data_.rbmap->find(k); +      } +    } +    const bool local_; +    PairIntT<T>* local_it_; +    typename SPARSE_HASH_MAP<unsigned, T>::iterator remote_it_; +    std::pair<const unsigned, T>& operator*() const { +      if (local_) +        return *reinterpret_cast<std::pair<const unsigned, T>*>(local_it_); +      else +        return *remote_it_; +    } + +    std::pair<const unsigned, T>* operator->() const { +      if (local_) +        return reinterpret_cast<std::pair<const unsigned, T>*>(local_it_); +      else +        return &*remote_it_; +    } + +    iterator& operator++() { +      if (local_) ++local_it_; else ++remote_it_; +      return *this; +    } + +    inline bool operator==(const iterator& o) const { +      if (o.local_ != local_) return false; +      if (local_) { +        return local_it_ == o.local_it_; +      } else { +        return remote_it_ == o.remote_it_; +      } +    } +    inline bool operator!=(const iterator& o) const { +      return !(o == *this); +    } +  };    struct const_iterator {      const_iterator(const FastSparseVector<T>& v, const bool is_end) : local_(!v.is_remote_) {        if (local_) { @@ -77,12 +131,21 @@ class FastSparseVector {            remote_it_ = v.data_.rbmap->begin();        }      } +    const_iterator(const FastSparseVector<T>& v, const bool, const unsigned k) : local_(!v.is_remote_) { +      if (local_) { +        unsigned i = 0; +        while(i < v.local_size_ && v.data_.local[i].first() != k) { ++i; } +        local_it_ = &v.data_.local[i]; +      } else { +        remote_it_ = v.data_.rbmap->find(k); +      } +    }      const bool local_;      const PairIntT<T>* local_it_; -    typename std::map<unsigned, T>::const_iterator remote_it_; +    typename SPARSE_HASH_MAP<unsigned, T>::const_iterator remote_it_;      const std::pair<const unsigned, T>& operator*() const {        if (local_) -        return *reinterpret_cast<const std::pair<const unsigned, float>*>(local_it_); +        return *reinterpret_cast<const std::pair<const unsigned, T>*>(local_it_);        else          return *remote_it_;      } @@ -118,7 +181,7 @@ class FastSparseVector {    }    FastSparseVector(const FastSparseVector& other) {      std::memcpy(this, &other, sizeof(FastSparseVector)); -    if (is_remote_) data_.rbmap = new std::map<unsigned, T>(*data_.rbmap); +    if (is_remote_) data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(*data_.rbmap);    }    FastSparseVector(std::pair<unsigned, T>* first, std::pair<unsigned, T>* last) {      const ptrdiff_t n = last - first; @@ -128,7 +191,7 @@ class FastSparseVector {        std::memcpy(data_.local, first, sizeof(std::pair<unsigned, T>) * n);      } else {        is_remote_ = true; -      data_.rbmap = new std::map<unsigned, T>(first, last); +      data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(first, last);      }    }    void erase(int k) { @@ -150,7 +213,7 @@ class FastSparseVector {      clear();      std::memcpy(this, &other, sizeof(FastSparseVector));      if (is_remote_) -      data_.rbmap = new std::map<unsigned, T>(*data_.rbmap); +      data_.rbmap = new SPARSE_HASH_MAP<unsigned, T>(*data_.rbmap);      return *this;    }    T const& get_singleton() const { @@ -160,6 +223,9 @@ class FastSparseVector {    bool nonzero(unsigned k) const {      return static_cast<bool>(value(k));    } +  inline T& operator[](unsigned k) { +    return get_or_create_bin(k); +  }    inline void set_value(unsigned k, const T& v) {      get_or_create_bin(k) = v;    } @@ -171,7 +237,7 @@ class FastSparseVector {    }    inline T value(unsigned k) const {      if (is_remote_) { -      typename std::map<unsigned, T>::const_iterator it = data_.rbmap->find(k); +      typename SPARSE_HASH_MAP<unsigned, T>::const_iterator it = data_.rbmap->find(k);        if (it != data_.rbmap->end()) return it->second;      } else {        for (unsigned i = 0; i < local_size_; ++i) { @@ -256,8 +322,8 @@ class FastSparseVector {    }    inline FastSparseVector& operator*=(const T& scalar) {      if (is_remote_) { -      const typename std::map<unsigned, T>::iterator end = data_.rbmap->end(); -      for (typename std::map<unsigned, T>::iterator it = data_.rbmap->begin(); it != end; ++it) +      const typename SPARSE_HASH_MAP<unsigned, T>::iterator end = data_.rbmap->end(); +      for (typename SPARSE_HASH_MAP<unsigned, T>::iterator it = data_.rbmap->begin(); it != end; ++it)          it->second *= scalar;      } else {        for (int i = 0; i < local_size_; ++i) @@ -267,8 +333,8 @@ class FastSparseVector {    }    inline FastSparseVector& operator/=(const T& scalar) {      if (is_remote_) { -      const typename std::map<unsigned, T>::iterator end = data_.rbmap->end(); -      for (typename std::map<unsigned, T>::iterator it = data_.rbmap->begin(); it != end; ++it) +      const typename SPARSE_HASH_MAP<unsigned, T>::iterator end = data_.rbmap->end(); +      for (typename SPARSE_HASH_MAP<unsigned, T>::iterator it = data_.rbmap->begin(); it != end; ++it)          it->second /= scalar;      } else {        for (int i = 0; i < local_size_; ++i) @@ -283,6 +349,18 @@ class FastSparseVector {      }      return o;    } +  iterator find(unsigned k) { +    return iterator(*this, false, k); +  } +  iterator begin() { +    return iterator(*this, false); +  } +  iterator end() { +    return iterator(*this, true); +  } +  const_iterator find(unsigned k) const { +    return const_iterator(*this, false, k); +  }    const_iterator begin() const {      return const_iterator(*this, false);    } @@ -353,17 +431,19 @@ class FastSparseVector {    void swap_local_rbmap() {      if (is_remote_) { // data is in rbmap, move to local        assert(data_.rbmap->size() < LOCAL_MAX); -      const std::map<unsigned, T>* m = data_.rbmap; +      const SPARSE_HASH_MAP<unsigned, T>* m = data_.rbmap;        local_size_ = m->size();        int i = 0; -      for (typename std::map<unsigned, T>::const_iterator it = m->begin(); +      for (typename SPARSE_HASH_MAP<unsigned, T>::const_iterator it = m->begin();             it != m->end(); ++it) {          data_.local[i] = *it;          ++i;        }        is_remote_ = false;      } else { // data is local, move to rbmap -      std::map<unsigned, T>* m = new std::map<unsigned, T>(&data_.local[0], &data_.local[local_size_]); +      SPARSE_HASH_MAP<unsigned, T>* m = new SPARSE_HASH_MAP<unsigned, T>( +         reinterpret_cast<std::pair<unsigned, T>*>(&data_.local[0]), +         reinterpret_cast<std::pair<unsigned, T>*>(&data_.local[local_size_]), local_size_ * 1.5 + 1);        data_.rbmap = m;        is_remote_ = true;      } @@ -371,7 +451,7 @@ class FastSparseVector {    union {      PairIntT<T> local[LOCAL_MAX]; -    std::map<unsigned, T>* rbmap; +    SPARSE_HASH_MAP<unsigned, T>* rbmap;    } data_;    unsigned char local_size_;    bool is_remote_; diff --git a/utils/hash.h b/utils/hash.h index 31457430..6d992086 100644 --- a/utils/hash.h +++ b/utils/hash.h @@ -10,8 +10,10 @@  #endif  #ifdef HAVE_SPARSEHASH -# include <google/dense_hash_map> -# include <google/dense_hash_set> +# include <sparsehash/dense_hash_map> +# include <sparsehash/dense_hash_set> +# include <sparsehash/sparse_hash_map> +# define SPARSE_HASH_MAP google::sparse_hash_map  # define HASH_MAP google::dense_hash_map  # define HASH_SET google::dense_hash_set  # define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0) @@ -19,6 +21,7 @@  #else  # include <tr1/unordered_map>  # include <tr1/unordered_set> +# define SPARSE_HASH_MAP std::tr1::unordered_map  # define HASH_MAP std::tr1::unordered_map  # define HASH_SET std::tr1::unordered_set  # define HASH_MAP_RESERVED(h,empty,deleted) diff --git a/utils/sampler.h b/utils/sampler.h index b237c716..3e4a4086 100644 --- a/utils/sampler.h +++ b/utils/sampler.h @@ -12,6 +12,7 @@  #include <boost/random/mersenne_twister.hpp>  #include <boost/random/uniform_real.hpp>  #include <boost/random/variate_generator.hpp> +#include <boost/random/gamma_distribution.hpp>  #include <boost/random/normal_distribution.hpp>  #include <boost/random/poisson_distribution.hpp>  #include <boost/random/uniform_int.hpp> @@ -76,6 +77,18 @@ struct RandomNumberGenerator {      return boost::poisson_distribution<int>(lambda)(m_random);    } +  double NextGamma(double shape, double scale = 1.0) { +    boost::gamma_distribution<> gamma(shape); +    boost::variate_generator<boost::mt19937&,boost::gamma_distribution<> > vg(m_generator, gamma); +    return vg() * scale; +  } + +  double NextBeta(double alpha, double beta) { +    double x = NextGamma(alpha); +    double y = NextGamma(beta); +    return x / (x + y); +  } +    bool AcceptMetropolisHastings(const prob_t& p_cur,                                  const prob_t& p_prev,                                  const prob_t& q_cur, @@ -123,11 +136,12 @@ size_t RandomNumberGenerator<RNG>::SelectSample(const SampleSet<F>& ss, double T    const bool anneal = (T != 1.0);    F sum = F(0);    if (anneal) { -    for (int i = 0; i < ss.m_scores.size(); ++i) +    for (unsigned i = 0; i < ss.m_scores.size(); ++i)        sum += pow(ss.m_scores[i], annealing_factor);  // p^(1/T)    } else {      sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), F(0));    } +  //std::cerr << "SUM: " << sum << std::endl;    //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ",";    //std::cerr << std::endl; | 
