From 80686d4e567bae579ea39e009826a2de92cd4ace Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 11 Aug 2010 02:37:10 +0000 Subject: major refactor, break bad circular deps git-svn-id: https://ws10smt.googlecode.com/svn/trunk@509 ec762483-ff6d-05da-a07a-a48fb63a330f --- utils/Makefile.am | 38 ++++ utils/alignment_pharaoh.cc | 77 +++++++ utils/alignment_pharaoh.h | 14 ++ utils/array2d.h | 172 +++++++++++++++ utils/b64tools.cc | 59 +++++ utils/b64tools.h | 9 + utils/dict.cc | 27 +++ utils/dict.h | 66 ++++++ utils/dict_test.cc | 47 ++++ utils/fdict.cc | 143 ++++++++++++ utils/fdict.h | 34 +++ utils/feature_accum.h | 129 +++++++++++ utils/feature_vector.h | 18 ++ utils/filelib.cc | 22 ++ utils/filelib.h | 106 +++++++++ utils/gzstream.cc | 182 +++++++++++++++ utils/gzstream.h | 127 +++++++++++ utils/hash.h | 54 +++++ utils/have_64_bits.h | 17 ++ utils/int_or_pointer.h | 70 ++++++ utils/intrusive_refcount.hpp | 84 +++++++ utils/logval.h | 174 +++++++++++++++ utils/logval_test.cc | 73 ++++++ utils/murmur_hash.h | 186 ++++++++++++++++ utils/null_deleter.h | 9 + utils/prob.h | 8 + utils/sampler.h | 147 +++++++++++++ utils/small_vector.h | 265 ++++++++++++++++++++++ utils/small_vector_test.cc | 129 +++++++++++ utils/sparse_vector.cc | 98 +++++++++ utils/sparse_vector.h | 512 +++++++++++++++++++++++++++++++++++++++++++ utils/static_utoa.h | 115 ++++++++++ utils/stringlib.cc | 87 ++++++++ utils/stringlib.h | 267 ++++++++++++++++++++++ utils/stringlib_test.cc | 17 ++ utils/tdict.cc | 154 +++++++++++++ utils/tdict.h | 50 +++++ utils/test_data/weights | 8 + utils/threadlocal.h | 71 ++++++ utils/timing_stats.cc | 24 ++ utils/timing_stats.h | 25 +++ utils/weights.cc | 77 +++++++ utils/weights.h | 21 ++ utils/weights_test.cc | 27 +++ utils/wordid.h | 6 + 45 files changed, 4045 insertions(+) create mode 100644 utils/Makefile.am create mode 100644 utils/alignment_pharaoh.cc create mode 100644 utils/alignment_pharaoh.h create mode 100644 utils/array2d.h create mode 100644 utils/b64tools.cc create mode 100644 utils/b64tools.h create mode 100644 utils/dict.cc create mode 100644 utils/dict.h create mode 100644 utils/dict_test.cc create mode 100644 utils/fdict.cc create mode 100644 utils/fdict.h create mode 100755 utils/feature_accum.h create mode 100755 utils/feature_vector.h create mode 100644 utils/filelib.cc create mode 100644 utils/filelib.h create mode 100644 utils/gzstream.cc create mode 100644 utils/gzstream.h create mode 100755 utils/hash.h create mode 100755 utils/have_64_bits.h create mode 100755 utils/int_or_pointer.h create mode 100755 utils/intrusive_refcount.hpp create mode 100644 utils/logval.h create mode 100644 utils/logval_test.cc create mode 100755 utils/murmur_hash.h create mode 100755 utils/null_deleter.h create mode 100644 utils/prob.h create mode 100644 utils/sampler.h create mode 100644 utils/small_vector.h create mode 100644 utils/small_vector_test.cc create mode 100644 utils/sparse_vector.cc create mode 100644 utils/sparse_vector.h create mode 100755 utils/static_utoa.h create mode 100644 utils/stringlib.cc create mode 100644 utils/stringlib.h create mode 100755 utils/stringlib_test.cc create mode 100644 utils/tdict.cc create mode 100644 utils/tdict.h create mode 100644 utils/test_data/weights create mode 100755 utils/threadlocal.h create mode 100644 utils/timing_stats.cc create mode 100644 utils/timing_stats.h create mode 100644 utils/weights.cc create mode 100644 utils/weights.h create mode 100644 utils/weights_test.cc create mode 100644 utils/wordid.h (limited to 'utils') diff --git a/utils/Makefile.am b/utils/Makefile.am new file mode 100644 index 00000000..e513febd --- /dev/null +++ b/utils/Makefile.am @@ -0,0 +1,38 @@ +if HAVE_GTEST +noinst_PROGRAMS = \ + dict_test \ + weights_test \ + logval_test \ + small_vector_test +endif + +noinst_LIBRARIES = libutils.a + +libutils_a_SOURCES = \ + alignment_pharaoh.cc \ + b64tools.cc \ + dict.cc \ + tdict.cc \ + fdict.cc \ + gzstream.cc \ + filelib.cc \ + stringlib.cc \ + sparse_vector.cc \ + timing_stats.cc \ + weights.cc + +dict_test_SOURCES = dict_test.cc +dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +weights_test_SOURCES = weights_test.cc +weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +logval_test_SOURCES = logval_test.cc +logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +small_vector_test_SOURCES = small_vector_test.cc +small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) + +AM_LDFLAGS = libutils.a -lz + +################################################################ +# do NOT NOT NOT add any other -I includes NO NO NO NO NO ###### +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I. +################################################################ diff --git a/utils/alignment_pharaoh.cc b/utils/alignment_pharaoh.cc new file mode 100644 index 00000000..890ff565 --- /dev/null +++ b/utils/alignment_pharaoh.cc @@ -0,0 +1,77 @@ +#include "utils/alignment_pharaoh.h" + +#include + +using namespace std; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr > AlignmentPharaoh::ReadPharaohAlignmentGrid(const string& al) { + int max_x = 0; + int max_y = 0; + int i = 0; + size_t pos = al.rfind(" ||| "); + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + if (x > max_x) max_x = x; + assert(i < al.size()); + if(al[i] != '-') { + cerr << "BAD ALIGNMENT: " << al << endl; + abort(); + } + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + if (y > max_y) max_y = y; + while(i < al.size() && al[i] == ' ') { ++i; } + } + + boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); + i = 0; + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + (*grid)(x, y) = true; + while(i < al.size() && al[i] == ' ') { ++i; } + } + // cerr << *grid << endl; + return grid; +} + +void AlignmentPharaoh::SerializePharaohFormat(const Array2D& alignment, ostream* out) { + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) + if (alignment(i,j)) { + if (need_space) (*out) << ' '; else need_space = true; + (*out) << i << '-' << j; + } + (*out) << endl; +} + diff --git a/utils/alignment_pharaoh.h b/utils/alignment_pharaoh.h new file mode 100644 index 00000000..d111c8bf --- /dev/null +++ b/utils/alignment_pharaoh.h @@ -0,0 +1,14 @@ +#ifndef _PHARAOH_ALIGNMENT_H_ +#define _PHARAOH_ALIGNMENT_H_ + +#include +#include +#include +#include "array2d.h" + +struct AlignmentPharaoh { + static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); + static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); +}; + +#endif diff --git a/utils/array2d.h b/utils/array2d.h new file mode 100644 index 00000000..e63eda0d --- /dev/null +++ b/utils/array2d.h @@ -0,0 +1,172 @@ +#ifndef ARRAY2D_H_ +#define ARRAY2D_H_ + +#include +#include +#include +#include +#include + +template +class Array2D { + public: + typedef typename std::vector::reference reference; + typedef typename std::vector::const_reference const_reference; + typedef typename std::vector::iterator iterator; + typedef typename std::vector::const_iterator const_iterator; + Array2D() : width_(0), height_(0) {} + Array2D(int w, int h, const T& d = T()) : + width_(w), height_(h), data_(w*h, d) {} + Array2D(const Array2D& rhs) : + width_(rhs.width_), height_(rhs.height_), data_(rhs.data_) {} + bool empty() const { return data_.empty(); } + void resize(int w, int h, const T& d = T()) { + data_.resize(w * h, d); + width_ = w; + height_ = h; + } + const Array2D& operator=(const Array2D& rhs) { + data_ = rhs.data_; + width_ = rhs.width_; + height_ = rhs.height_; + return *this; + } + void fill(const T& v) { data_.assign(data_.size(), v); } + int width() const { return width_; } + int height() const { return height_; } + reference operator()(int i, int j) { + return data_[offset(i, j)]; + } + void clear() { data_.clear(); width_=0; height_=0; } + const_reference operator()(int i, int j) const { + return data_[offset(i, j)]; + } + iterator begin_col(int j) { + return data_.begin() + offset(0,j); + } + const_iterator begin_col(int j) const { + return data_.begin() + offset(0,j); + } + iterator end_col(int j) { + return data_.begin() + offset(0,j) + width_; + } + const_iterator end_col(int j) const { + return data_.begin() + offset(0,j) + width_; + } + iterator end() { return data_.end(); } + const_iterator end() const { return data_.end(); } + const Array2D& operator*=(const T& x) { + std::transform(data_.begin(), data_.end(), data_.begin(), + std::bind2nd(std::multiplies(), x)); + } + const Array2D& operator/=(const T& x) { + std::transform(data_.begin(), data_.end(), data_.begin(), + std::bind2nd(std::divides(), x)); + } + const Array2D& operator+=(const Array2D& m) { + std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::plus()); + } + const Array2D& operator-=(const Array2D& m) { + std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::minus()); + } + + private: + inline int offset(int i, int j) const { + assert(i data_; +}; + +template +Array2D operator*(const Array2D& l, const T& scalar) { + Array2D res(l); + res *= scalar; + return res; +} + +template +Array2D operator*(const T& scalar, const Array2D& l) { + Array2D res(l); + res *= scalar; + return res; +} + +template +Array2D operator/(const Array2D& l, const T& scalar) { + Array2D res(l); + res /= scalar; + return res; +} + +template +Array2D operator+(const Array2D& l, const Array2D& r) { + Array2D res(l); + res += r; + return res; +} + +template +Array2D operator-(const Array2D& l, const Array2D& r) { + Array2D res(l); + res -= r; + return res; +} + +template +inline std::ostream& operator<<(std::ostream& os, const Array2D& m) { + for (int i=0; i& m) { + os << ' '; + for (int j=0; j >& m) { + os << ' '; + for (int j=0; j& ar = m(i,j); + for (int k=0; k +#include + +using namespace std; + +namespace B64 { + +static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq"; + +static void encodeblock(const unsigned char* in, ostream* os, int len) { + char out[4]; + out[0] = cb64[ in[0] >> 2 ]; + out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; + out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); + out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); + os->write(out, 4); +} + +void b64encode(const char* data, const size_t size, ostream* out) { + size_t cur = 0; + while(cur < size) { + int len = min(static_cast(3), size - cur); + encodeblock(reinterpret_cast(&data[cur]), out, len); + cur += len; + } +} + +static void decodeblock(const unsigned char* in, unsigned char* out) { + out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4); + out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2); + out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]); +} + +bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) { + size_t cur = 0; + size_t ocur = 0; + unsigned char in[4]; + while(cur < insize) { + assert(ocur < outsize); + for (int i = 0; i < 4; ++i) { + unsigned char v = data[cur]; + v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]); + if (!v) { + cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl; + return false; + } + v = (unsigned char) ((v == '$') ? '\0' : v - 61); + if (v) in[i] = v - 1; else in[i] = 0; + ++cur; + } + decodeblock(in, reinterpret_cast(&out[ocur])); + ocur += 3; + } + return true; +} + +} + diff --git a/utils/b64tools.h b/utils/b64tools.h new file mode 100644 index 00000000..c821fc8f --- /dev/null +++ b/utils/b64tools.h @@ -0,0 +1,9 @@ +#ifndef _B64_TOOLS_H_ +#define _B64_TOOLS_H_ + +namespace B64 { + bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize); + void b64encode(const char* data, const size_t size, std::ostream* out); +} + +#endif diff --git a/utils/dict.cc b/utils/dict.cc new file mode 100644 index 00000000..2d6986c8 --- /dev/null +++ b/utils/dict.cc @@ -0,0 +1,27 @@ +#include "dict.h" + +#include +#include + +void TokenizeStringSeparator( + const std::string& str, + const std::string& separator, + std::vector* tokens) { + + size_t pos = 0; + std::string::size_type nextPos = str.find(separator, pos); + + while (nextPos != std::string::npos) { + tokens->push_back(str.substr(pos, nextPos - pos)); + pos = nextPos + separator.size(); + nextPos = str.find(separator, pos); + } + tokens->push_back(str.substr(pos, nextPos - pos)); +} + + +void Dict::AsVector(const WordID& id, std::vector* results) const { + results->clear(); + TokenizeStringSeparator(Convert(id), " ||| ", results); +} + diff --git a/utils/dict.h b/utils/dict.h new file mode 100644 index 00000000..348a97e3 --- /dev/null +++ b/utils/dict.h @@ -0,0 +1,66 @@ +#ifndef DICT_H_ +#define DICT_H_ + + +#include +#include + +#include +#include +#include "hash.h" +#include "wordid.h" + +class Dict { + typedef + HASH_MAP > Map; + public: + Dict() : b0_("") { + HASH_MAP_EMPTY(d_,""); + words_.reserve(1000); + } + + inline int max() const { return words_.size(); } + + inline WordID Convert(const std::string& word, bool frozen = false) { + Map::iterator i = d_.find(word); + if (i == d_.end()) { + if (frozen) + return 0; + words_.push_back(word); + d_[word] = words_.size(); + return words_.size(); + } else { + return i->second; + } + } + + inline WordID Convert(const std::vector& words, bool frozen = false) + { return Convert(toString(words), frozen); } + + static inline std::string toString(const std::vector& words) { + std::string word= ""; + for (std::vector::const_iterator it=words.begin(); + it != words.end(); ++it) { + if (it != words.begin()) word += " ||| "; + word += *it; + } + return word; + } + + inline const std::string& Convert(const WordID& id) const { + if (id == 0) return b0_; + assert(id <= (int)words_.size()); + return words_[id-1]; + } + + void AsVector(const WordID& id, std::vector* results) const; + + void clear() { words_.clear(); d_.clear(); } + + private: + const std::string b0_; + std::vector words_; + Map d_; +}; + +#endif diff --git a/utils/dict_test.cc b/utils/dict_test.cc new file mode 100644 index 00000000..2049ec27 --- /dev/null +++ b/utils/dict_test.cc @@ -0,0 +1,47 @@ +#include "dict.h" + +#include "fdict.h" + +#include +#include +#include + +using namespace std; + +class DTest : public testing::Test { + public: + DTest() {} + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(DTest, Convert) { + Dict d; + WordID a = d.Convert("foo"); + WordID b = d.Convert("bar"); + std::string x = "foo"; + WordID c = d.Convert(x); + EXPECT_NE(a, b); + EXPECT_EQ(a, c); + EXPECT_EQ(d.Convert(a), "foo"); + EXPECT_EQ(d.Convert(b), "bar"); +} + +TEST_F(DTest, FDictTest) { + int fid = FD::Convert("First"); + EXPECT_GT(fid, 0); + EXPECT_EQ(FD::Convert(fid), "First"); + string x = FD::Escape("="); + cerr << x << endl; + EXPECT_NE(x, "="); + x = FD::Escape(";"); + cerr << x << endl; + EXPECT_NE(x, ";"); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/utils/fdict.cc b/utils/fdict.cc new file mode 100644 index 00000000..baa0b552 --- /dev/null +++ b/utils/fdict.cc @@ -0,0 +1,143 @@ +#include "fdict.h" +#include "stdlib.h" +//for malloc (need on cygwin); todo and std::malloc +#include +#include + +using namespace std; + +Dict FD::dict_; +bool FD::frozen_ = false; + +std::string FD::Convert(std::vector const& v) { + return Convert(&*v.begin(),&*v.end()); +} + +std::string FD::Convert(WordID const *b,WordID const* e) { + ostringstream o; + for (WordID const* i=b;ib) o << ' '; + o << FD::Convert(*i); + } + return o.str(); +} + +static int HexPairValue(const char * code) { + int value = 0; + const char * pch = code; + for (;;) { + int digit = *pch++; + if (digit >= '0' && digit <= '9') { + value += digit - '0'; + } + else if (digit >= 'A' && digit <= 'F') { + value += digit - 'A' + 10; + } + else if (digit >= 'a' && digit <= 'f') { + value += digit - 'a' + 10; + } + else { + return -1; + } + if (pch == code + 2) + return value; + value <<= 4; + } +} + +int UrlDecode(const char *source, char *dest) +{ + char * start = dest; + + while (*source) { + switch (*source) { + case '+': + *(dest++) = ' '; + break; + case '%': + if (source[1] && source[2]) { + int value = HexPairValue(source + 1); + if (value >= 0) { + *(dest++) = value; + source += 2; + } + else { + *dest++ = '?'; + } + } + else { + *dest++ = '?'; + } + break; + default: + *dest++ = *source; + } + source++; + } + + *dest = 0; + return dest - start; +} + +int UrlEncode(const char *source, char *dest, unsigned max) { + static const char *digits = "0123456789ABCDEF"; + unsigned char ch; + unsigned len = 0; + char *start = dest; + + while (len < max - 4 && *source) + { + ch = (unsigned char)*source; + if (*source == ' ') { + *dest++ = '+'; + } + else if (strchr("=:;,_| %", ch)) { + *dest++ = '%'; + *dest++ = digits[(ch >> 4) & 0x0F]; + *dest++ = digits[ ch & 0x0F]; + } + else { + *dest++ = *source; + } + source++; + } + *dest = 0; + return start - dest; +} + +std::string UrlDecodeString(const std::string & encoded) { + const char * sz_encoded = encoded.c_str(); + size_t needed_length = encoded.length(); + for (const char * pch = sz_encoded; *pch; pch++) { + if (*pch == '%') + needed_length += 2; + } + needed_length += 10; + char stackalloc[64]; + char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ? + (char *)malloc(needed_length) : stackalloc; + UrlDecode(encoded.c_str(), buf); + std::string result(buf); + if (buf != stackalloc) { + free(buf); + } + return result; +} + +std::string UrlEncodeString(const std::string & decoded) { + size_t needed_length = decoded.length() * 3 + 3; + char stackalloc[64]; + char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ? + (char *)malloc(needed_length) : stackalloc; + UrlEncode(decoded.c_str(), buf, needed_length); + std::string result(buf); + if (buf != stackalloc) { + free(buf); + } + return result; +} + +string FD::Escape(const string& s) { + return UrlEncodeString(s); +} + diff --git a/utils/fdict.h b/utils/fdict.h new file mode 100644 index 00000000..f9673023 --- /dev/null +++ b/utils/fdict.h @@ -0,0 +1,34 @@ +#ifndef _FDICT_H_ +#define _FDICT_H_ + +#include +#include +#include "dict.h" + +struct FD { + // once the FD is frozen, new features not already in the + // dictionary will return 0 + static void Freeze() { + frozen_ = true; + } + static inline int NumFeats() { + return dict_.max() + 1; + } + static inline WordID Convert(const std::string& s) { + return dict_.Convert(s, frozen_); + } + static inline const std::string& Convert(const WordID& w) { + return dict_.Convert(w); + } + static std::string Convert(WordID const *i,WordID const* e); + static std::string Convert(std::vector const& v); + + // Escape any string to a form that can be used as the name + // of a weight in a weights file + static std::string Escape(const std::string& s); + static Dict dict_; + private: + static bool frozen_; +}; + +#endif diff --git a/utils/feature_accum.h b/utils/feature_accum.h new file mode 100755 index 00000000..851b29db --- /dev/null +++ b/utils/feature_accum.h @@ -0,0 +1,129 @@ +#ifndef FEATURE_ACCUM_H +#define FEATURE_ACCUM_H + +#include "ff.h" +#include "sparse_vector.h" +#include "value_array.h" + +struct SparseFeatureAccumulator : public FeatureVector { + typedef FeatureVector State; + SparseFeatureAccumulator() { } + template + FeatureVector const& describe(FF const& ) { return *this; } + void Store(FeatureVector *fv) const { + fv->set_from(*this); + } + template + void Store(FF const& /* ff */,FeatureVector *fv) const { + fv->set_from(*this); + } + template + void Add(FF const& /* ff */,FeatureVector const& fv) { + (*this)+=fv; + } + void Add(FeatureVector const& fv) { + (*this)+=fv; + } + /* + SparseFeatureAccumulator(FeatureVector const& fv) : State(fv) {} + FeatureAccumulator(Features const& fids) {} + FeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fv) {} + void Add(Features const& fids,FeatureVector const& fv) { + *this += fv; + } + */ + void Add(int i,Featval v) { + (*this)[i]+=v; + } + void Add(Features const& fids,int i,Featval v) { + (*this)[i]+=v; + } +}; + +struct SingleFeatureAccumulator { + typedef Featval State; + typedef SingleFeatureAccumulator Self; + State v; + /* + void operator +=(State const& o) { + v+=o; + } + */ + void operator +=(Self const& s) { + v+=s.v; + } + SingleFeatureAccumulator() : v() {} + template + State const& describe(FF const& ) const { return v; } + + template + void Store(FF const& ff,FeatureVector *fv) const { + fv->set_value(ff.fid_,v); + } + void Store(Features const& fids,FeatureVector *fv) const { + assert(fids.size()==1); + fv->set_value(fids[0],v); + } + /* + SingleFeatureAccumulator(Features const& fids) { assert(fids.size()==1); } + SingleFeatureAccumulator(Features const& fids,FeatureVector const& fv) + { + assert(fids.size()==1); + v=fv.get_singleton(); + } + */ + + template + void Add(FF const& ff,FeatureVector const& fv) { + v+=fv.get(ff.fid_); + } + void Add(FeatureVector const& fv) { + v+=fv.get_singleton(); + } + + void Add(Features const& fids,FeatureVector const& fv) { + v += fv.get(fids[0]); + } + void Add(Featval dv) { + v+=dv; + } + void Add(int,Featval dv) { + v+=dv; + } + void Add(FeatureVector const& fids,int i,Featval dv) { + assert(fids.size()==1 && i==0); + v+=dv; + } +}; + + +#if 0 +// omitting this so we can default construct an accum. might be worth resurrecting in the future +struct ArrayFeatureAccumulator : public ValueArray { + typedef ValueArray State; + template + ArrayFeatureAccumulator(Fsa const& fsa) : State(fsa.features_.size()) { } + ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } + ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } + ArrayFeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fids.size()) { + for (int i=0,e=iset_value(fids[i],(*this)[i]); + } + void Add(Features const& fids,FeatureVector const& fv) { + for (int i=0,e=i +#include "sparse_vector.h" +#include "fdict.h" + +typedef double Featval; +typedef SparseVectorList FeatureVectorList; +typedef SparseVector FeatureVector; +typedef SparseVector WeightVector; +typedef std::vector DenseWeightVector; + +inline void sparse_to_dense(WeightVector const& wv,DenseWeightVector *dv) { + wv.init_vector(dv); +} + +#endif diff --git a/utils/filelib.cc b/utils/filelib.cc new file mode 100644 index 00000000..79ad2847 --- /dev/null +++ b/utils/filelib.cc @@ -0,0 +1,22 @@ +#include "filelib.h" + +#include +#include + +using namespace std; + +bool FileExists(const std::string& fn) { + struct stat info; + int s = stat(fn.c_str(), &info); + return (s==0); +} + +bool DirectoryExists(const string& dir) { + if (access(dir.c_str(),0) == 0) { + struct stat status; + stat(dir.c_str(), &status); + if (status.st_mode & S_IFDIR) return true; + } + return false; +} + diff --git a/utils/filelib.h b/utils/filelib.h new file mode 100644 index 00000000..b9fef9a7 --- /dev/null +++ b/utils/filelib.h @@ -0,0 +1,106 @@ +#ifndef _FILELIB_H_ +#define _FILELIB_H_ + +#include +#include +#include +#include +#include +#include +#include "gzstream.h" +#include "null_deleter.h" + +bool FileExists(const std::string& file_name); +bool DirectoryExists(const std::string& dir_name); + +// reads from standard in if filename is - +// uncompresses if file ends with .gz +// otherwise, reads from a normal file + +template +struct BaseFile { + typedef Stream S; + typedef boost::shared_ptr PS; + void Reset() { + ps_.reset(); + } + bool is_null() const { return !ps_; } + operator bool() const { + return ps_; + } + S* stream() { return ps_.get(); } + S* operator->() { return ps_.get(); } // compat with old ReadFile * -> new Readfile. remove? + S &operator *() const { return get(); } + S &get() const { return *ps_; } + bool is_std() { + return filename_=="-"; + } + std::string filename_; +protected: + void error(std::string const& reason,std::string const& filename) { + throw std::runtime_error("File "+filename+" - "+reason); + } + + PS ps_; + static bool EndsWith(const std::string& f, const std::string& suf) { + return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size()); + } +}; + +class ReadFile : public BaseFile { + public: + ReadFile() { } + explicit ReadFile(const std::string& filename) { + Init(filename); + } + void Init(const std::string& filename) { + filename_=filename; + if (is_std()) { + ps_=PS(&std::cin,null_deleter()); + } else { + if (!FileExists(filename)) { + std::cerr << "File does not exist: " << filename << std::endl; + error(filename," couldn't read nonexistant file."); + abort(); + } + char const* file=filename_.c_str(); // just in case the gzstream keeps using the filename for longer than the constructor, e.g. inflateReset2. warning in valgrind that I'm hoping will disappear - it makes no sense. + ps_=PS(EndsWith(filename, ".gz") ? + static_cast(new igzstream(file)) : + static_cast(new std::ifstream(file))); + if (!*ps_) { + std::cerr << "Failed to open " << filename << std::endl; + error(filename," open for reading failed."); + abort(); + } + } + } + +}; + +class WriteFile : public BaseFile { + public: + WriteFile() {} + explicit WriteFile(std::string const& filename) { Init(filename); } + void Init(const std::string& filename) { + filename_=filename; + if (is_std()) { + ps_=PS(&std::cout,null_deleter()); + } else { + char const* file=filename_.c_str(); // just in case the gzstream keeps using the filename for longer than the constructor, e.g. inflateReset2. warning in valgrind that I'm hoping will disappear - it makes no sense. + ps_=PS(EndsWith(filename, ".gz") ? + static_cast(new ogzstream(file)) : + static_cast(new std::ofstream(file))); + if (!*ps_) { + std::cerr << "Failed to open " << filename << std::endl; + error(filename," open for writing failed."); + abort(); + } + } + } + ~WriteFile() { + if (ps_) + get() << std::flush; + } +}; + +#endif diff --git a/utils/gzstream.cc b/utils/gzstream.cc new file mode 100644 index 00000000..88cd1bd2 --- /dev/null +++ b/utils/gzstream.cc @@ -0,0 +1,182 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.C +// Revision : $Revision: 1.7 $ +// Revision_date : $Date: 2003/01/08 14:41:27 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#include +#include +#include // for memcpy +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { + if ( is_open()) + return (gzstreambuf*)0; + mode = open_mode; + // no append nor read/write mode + if ((mode & std::ios::ate) || (mode & std::ios::app) + || ((mode & std::ios::in) && (mode & std::ios::out))) + return (gzstreambuf*)0; + const int Nmode=10; + char fmode[Nmode]; + char* fmodeptr = fmode; + if ( mode & std::ios::in) + *fmodeptr++ = 'r'; + else if ( mode & std::ios::out) + *fmodeptr++ = 'w'; + *fmodeptr++ = 'b'; + while (fmodeptr( gptr()); + + if ( ! (mode & std::ios::in) || ! opened) + return EOF; + // Josuttis' implementation of inbuf + int n_putback = gptr() - eback(); + if ( n_putback > 4) + n_putback = 4; + std::memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + + int num = gzread( file, buffer+4, bufferSize-4); + if (num <= 0) // ERROR or EOF + { + if (gzeof(file)) + return EOF; + handle_gzerror(); + } + + // reset buffer pointers + setg( buffer + (4 - n_putback), // beginning of putback area + buffer + 4, // read position + buffer + 4 + num); // end of buffer + + // return next character + return * reinterpret_cast( gptr()); +} + +int gzstreambuf::flush_buffer() { + // Separate the writing of the buffer from overflow() and + // sync() operation. + int w = pptr() - pbase(); + if ( gzwrite( file, pbase(), w) != w) + handle_gzerror(); + pbump( -w); + return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only + if ( ! ( mode & std::ios::out) || ! opened) + return EOF; + if (c != EOF) { + *pptr() = c; + pbump(1); + } + if ( flush_buffer() == EOF) + return EOF; + return c; +} + +int gzstreambuf::sync() { + // Changed to use flush_buffer() instead of overflow( EOF) + // which caused improper behavior with std::endl and flush(), + // bug reported by Vincent Ricard. + if ( pptr() && pptr() > pbase()) { + if ( flush_buffer() == EOF) + return -1; + } + return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { + init( &buf); + open( name, mode); +} + +gzstreambase::~gzstreambase() { + buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { + if ( ! buf.open( name, open_mode)) + clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { + if ( buf.is_open()) + if ( ! buf.close()) + clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff --git a/utils/gzstream.h b/utils/gzstream.h new file mode 100644 index 00000000..a7effd90 --- /dev/null +++ b/utils/gzstream.h @@ -0,0 +1,127 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+(1024*256); // size of data buff + // totals 512 bytes under g++ for igzstream at the end. + + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); + void handle_gzerror(); // throws exception +public: +#if defined(_WIN32) && !defined(CYGWIN) && !defined(EOF) + enum { + EOF = -1 + }; +#endif + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, std::ios::in | open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/utils/hash.h b/utils/hash.h new file mode 100755 index 00000000..3a60a429 --- /dev/null +++ b/utils/hash.h @@ -0,0 +1,54 @@ +#ifndef CDEC_HASH_H +#define CDEC_HASH_H + +#include "murmur_hash.h" + +#include "config.h" +#ifdef HAVE_SPARSEHASH +# include +# define HASH_MAP google::dense_hash_map +# define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0) +# define HASH_MAP_EMPTY(h,empty) do { h.set_empty_key(empty); } while(0) +#else +# include +# define HASH_MAP std::tr1::unordered_map +# define HASH_MAP_RESERVED(h,empty,deleted) +# define HASH_MAP_EMPTY(h,empty) +#endif + +#include + +// assumes C is POD +template +struct murmur_hash +{ + typedef MurmurInt return_type; + typedef C /*const&*/ argument_type; + return_type operator()(argument_type const& c) const { + return MurmurHash((void*)&c,sizeof(c)); + } +}; + +// murmur_hash_array isn't std guaranteed safe (you need to use string::data()) +template <> +struct murmur_hash +{ + typedef MurmurInt return_type; + typedef std::string /*const&*/ argument_type; + return_type operator()(argument_type const& c) const { + return MurmurHash(c.data(),c.size()); + } +}; + +// uses begin(),size() assuming contiguous layout and POD +template +struct murmur_hash_array +{ + typedef MurmurInt return_type; + typedef C /*const&*/ argument_type; + return_type operator()(argument_type const& c) const { + return MurmurHash(&*c.begin(),c.size()*sizeof(*c.begin())); + } +}; + +#endif diff --git a/utils/have_64_bits.h b/utils/have_64_bits.h new file mode 100755 index 00000000..d1e6064f --- /dev/null +++ b/utils/have_64_bits.h @@ -0,0 +1,17 @@ +#ifndef HAVE_64_BITS_H +#define HAVE_64_BITS_H + +#include + +#undef HAVE_64_BITS + +#if INTPTR_MAX == INT32_MAX +# define HAVE_64_BITS 0 +#elif INTPTR_MAX >= INT64_MAX +# define HAVE_64_BITS 1 +#else +# error "couldn't tell if HAVE_64_BITS from INTPTR_MAX INT32_MAX INT64_MAX" +#endif + + +#endif diff --git a/utils/int_or_pointer.h b/utils/int_or_pointer.h new file mode 100755 index 00000000..4b6a9e4a --- /dev/null +++ b/utils/int_or_pointer.h @@ -0,0 +1,70 @@ +#ifndef INT_OR_POINTER_H +#define INT_OR_POINTER_H + +// if you ever wanted to store a discriminated union of pointer/integer without an extra boolean flag, this will do it, assuming your pointers are never odd. + +// check lsb for expected tag? +#ifndef IOP_CHECK_LSB +# define IOP_CHECK_LSB 1 +#endif +#if IOP_CHECK_LSB +# define iop_assert(x) assert(x) +#else +# define iop_assert(x) +#endif + +#include +#include + +template +struct IntOrPointer { + typedef Pointed pointed_type; + typedef Int integer_type; + typedef Pointed *value_type; + typedef IntOrPointer self_type; + IntOrPointer(int j) { *this=j; } + IntOrPointer(size_t j) { *this=j; } + IntOrPointer(value_type v) { *this=v; } + bool is_integer() const { return i&1; } + bool is_pointer() const { return !(i&1); } + value_type & pointer() { return p; } + const value_type & pointer() const { iop_assert(is_pointer()); return p; } + integer_type integer() const { iop_assert(is_integer()); return i >> 1; } + void set_integer(Int j) { i=2*j+1; } + void set_pointer(value_type p_) { p=p_;iop_assert(is_pointer()); } + void operator=(unsigned j) { i = 2*(integer_type)j+1; } + void operator=(int j) { i = 2*(integer_type)j+1; } + template + void operator=(C j) { i = 2*(integer_type)j+1; } + void operator=(value_type v) { p=v; } + IntOrPointer() {} + IntOrPointer(const self_type &s) : p(s.p) {} + void operator=(const self_type &s) { p=s.p; } + template + bool operator ==(C* v) const { return p==v; } + template + bool operator ==(const C* v) const { return p==v; } + template + bool operator ==(C j) const { return integer() == j; } + bool operator ==(self_type s) const { return p==s.p; } + bool operator !=(self_type s) const { return p!=s.p; } + template void print(O&o) const + { + if (is_integer()) + o << integer(); + else { + o << "0x" << std::hex << (size_t)pointer() << std::dec; + } + } + friend inline std::ostream& operator<<(std::ostream &o,self_type const& s) { + s.print(o); return o; + } +protected: + union { + value_type p; // must be even (guaranteed unless you're pointing at packed chars) + integer_type i; // stored as 2*data+1, so only has half the range (one less bit) of a normal integer_type + }; +}; + + +#endif diff --git a/utils/intrusive_refcount.hpp b/utils/intrusive_refcount.hpp new file mode 100755 index 00000000..4a4b0187 --- /dev/null +++ b/utils/intrusive_refcount.hpp @@ -0,0 +1,84 @@ +#ifndef GRAEHL__SHARED__INTRUSIVE_REFCOUNT_HPP +#define GRAEHL__SHARED__INTRUSIVE_REFCOUNT_HPP + +#include +#include +#include +#include + +/** usage: + struct mine : public boost::instrusive_refcount {}; + + boost::intrusive_ptr p(new mine()); +*/ + +namespace boost { +// note: the free functions need to be in boost namespace, OR namespace of involved type. this is the only way to do it. + +template +class intrusive_refcount; + +template +class atomic_intrusive_refcount; + +template +void intrusive_ptr_add_ref(intrusive_refcount* ptr) +{ + ++(ptr->refs); +} + +template +void intrusive_ptr_release(intrusive_refcount* ptr) +{ + if (!--(ptr->refs)) delete static_cast(ptr); +} + + +//WARNING: only 2^32 (unsigned) refs allowed. hope that's ok :) +template +class intrusive_refcount : boost::noncopyable +{ + protected: +// typedef intrusive_refcount pointed_type; + friend void intrusive_ptr_add_ref(intrusive_refcount* ptr); + friend void intrusive_ptr_release(intrusive_refcount* ptr); +// friend class intrusive_ptr; + + intrusive_refcount(): refs(0) {} + ~intrusive_refcount() { assert(refs==0); } + +private: + unsigned refs; +}; + + +template +void intrusive_ptr_add_ref(atomic_intrusive_refcount* ptr) +{ + ++(ptr->refs); +} + +template +void intrusive_ptr_release(atomic_intrusive_refcount* ptr) +{ + if(!--(ptr->refs)) delete static_cast(ptr); +} + +template +class atomic_intrusive_refcount : boost::noncopyable +{ + protected: + friend void intrusive_ptr_add_ref(atomic_intrusive_refcount* ptr); + friend void intrusive_ptr_release(atomic_intrusive_refcount* ptr); + + atomic_intrusive_refcount(): refs(0) {} + ~atomic_intrusive_refcount() { assert(refs==0); } + +private: + boost::detail::atomic_count refs; +}; + +} + + +#endif diff --git a/utils/logval.h b/utils/logval.h new file mode 100644 index 00000000..37f14ae5 --- /dev/null +++ b/utils/logval.h @@ -0,0 +1,174 @@ +#ifndef LOGVAL_H_ +#define LOGVAL_H_ + +#define LOGVAL_CHECK_NEG false + +#include +#include +#include +#include + +template +class LogVal { + public: + LogVal() : s_(), v_(-std::numeric_limits::infinity()) {} + explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(unsigned x) : s_(0), v_(std::log(x)) { } + LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {} + static LogVal exp(T lnx) { return LogVal(lnx,false); } + + static LogVal One() { return LogVal(1); } + static LogVal Zero() { return LogVal(); } + static LogVal e() { return LogVal(1,false); } + void logeq(const T& v) { s_ = false; v_ = v; } + + LogVal& operator+=(const LogVal& a) { + if (a.v_ == -std::numeric_limits::infinity()) return *this; + if (a.s_ == s_) { + if (a.v_ < v_) { + v_ = v_ + log1p(std::exp(a.v_ - v_)); + } else { + v_ = a.v_ + log1p(std::exp(v_ - a.v_)); + } + } else { + if (a.v_ < v_) { + v_ = v_ + log1p(-std::exp(a.v_ - v_)); + } else { + v_ = a.v_ + log1p(-std::exp(v_ - a.v_)); + s_ = !s_; + } + } + return *this; + } + + LogVal& operator*=(const LogVal& a) { + s_ = (s_ != a.s_); + v_ += a.v_; + return *this; + } + + LogVal& operator/=(const LogVal& a) { + s_ = (s_ != a.s_); + v_ -= a.v_; + return *this; + } + + LogVal& operator-=(const LogVal& a) { + LogVal b = a; + b.invert(); + return *this += b; + } + + // LogVal(fabs(log(x)),x.s_) + friend LogVal abslog(LogVal x) { + if (x.v_<0) x.v_=-x.v_; + return x; + } + + LogVal& poweq(const T& power) { +#if LOGVAL_CHECK_NEG + if (s_) { + std::cerr << "poweq(T) not implemented when s_ is true\n"; + std::abort(); + } else +#endif + v_ *= power; + return *this; + } + + void invert() { s_ = !s_; } + + LogVal pow(const T& power) const { + LogVal res = *this; + res.poweq(power); + return res; + } + + LogVal root(const T& root) const { + return pow(1/root); + } + + operator T() const { + if (s_) return -std::exp(v_); else return std::exp(v_); + } + + bool s_; + T v_; +}; + +// copy elision - as opposed to explicit copy of LogVal const& o1, we should be able to construct Logval r=a+(b+c) as a single result in place in r. todo: return std::move(o1) - C++0x +template +LogVal operator+(LogVal o1, const LogVal& o2) { + o1 += o2; + return o1; +} + +template +LogVal operator*(LogVal o1, const LogVal& o2) { + o1 *= o2; + return o1; +} + +template +LogVal operator/(LogVal o1, const LogVal& o2) { + o1 /= o2; + return o1; +} + +template +LogVal operator-(LogVal o1, const LogVal& o2) { + o1 -= o2; + return o1; +} + +template +T log(const LogVal& o) { +#ifdef LOGVAL_CHECK_NEG + if (o.s_) return log(-1.0); +#endif + return o.v_; +} + +template +LogVal pow(const LogVal& b, const T& e) { + return b.pow(e); +} + +template +bool operator<(const LogVal& lhs, const LogVal& rhs) { + if (lhs.s_ == rhs.s_) { + return (lhs.v_ < rhs.v_); + } else { + return lhs.s_ > rhs.s_; + } +} + +#if 0 +template +bool operator<=(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ <= rhs.v_); +} + +template +bool operator>(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ > rhs.v_); +} + +template +bool operator>=(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ >= rhs.v_); +} +#endif + +template +bool operator==(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ == rhs.v_) && (lhs.s_ == rhs.s_); +} + +template +bool operator!=(const LogVal& lhs, const LogVal& rhs) { + return !(lhs == rhs); +} + +#endif diff --git a/utils/logval_test.cc b/utils/logval_test.cc new file mode 100644 index 00000000..1a23177d --- /dev/null +++ b/utils/logval_test.cc @@ -0,0 +1,73 @@ +#include "logval.h" + +#include +#include + +class LogValTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +using namespace std; + +TEST_F(LogValTest,Order) { + LogVal a(-0.3); + LogVal b(0.3); + LogVal c(2.4); + EXPECT_LT(a,b); + EXPECT_LT(b,c); + EXPECT_LT(a,c); + EXPECT_FALSE(b < a); + EXPECT_FALSE(c < a); + EXPECT_FALSE(c < b); + EXPECT_FALSE(c < c); + EXPECT_FALSE(b < b); + EXPECT_FALSE(a < a); +} + +TEST_F(LogValTest,Invert) { + LogVal x(-2.4); + LogVal y(2.4); + y.invert(); + EXPECT_FLOAT_EQ(x,y); +} + +TEST_F(LogValTest,Minus) { + LogVal x(12); + LogVal y(2); + LogVal z1 = x - y; + LogVal z2 = x; + z2 -= y; + EXPECT_FLOAT_EQ(z1, z2); + EXPECT_FLOAT_EQ(z1, 10.0); + EXPECT_FLOAT_EQ(y - x, -10.0); +} + +TEST_F(LogValTest,TestOps) { + LogVal x(-12.12); + LogVal y(x); + cerr << x << endl; + cerr << (x*y) << endl; + cerr << (x*y + x) << endl; + cerr << (x + x*y) << endl; + cerr << log1p(-0.5) << endl; + LogVal aa(0.2); + LogVal bb(-0.3); + cerr << (aa + bb) << endl; + cerr << (bb + aa) << endl; + EXPECT_FLOAT_EQ((aa + bb), (bb + aa)); + EXPECT_FLOAT_EQ((aa + bb), -0.1); +} + +TEST_F(LogValTest,TestSizes) { + cerr << sizeof(LogVal) << endl; + cerr << sizeof(LogVal) << endl; + cerr << sizeof(void*) << endl; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/utils/murmur_hash.h b/utils/murmur_hash.h new file mode 100755 index 00000000..8dbd7807 --- /dev/null +++ b/utils/murmur_hash.h @@ -0,0 +1,186 @@ +#ifndef _MURMUR_HASH_H_ +#define _MURMUR_HASH_H_ + +//NOTE: quite fast, nice collision properties, but endian dependent hash values + +#include "have_64_bits.h" +typedef uintptr_t MurmurInt; + +// MurmurHash2, by Austin Appleby + +static const uint32_t DEFAULT_SEED=2654435769U; + +#if HAVE_64_BITS +//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED); + +inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return (uint32_t) MurmurHash64(key,len,seed); +} + +inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return MurmurHash64(key,len,seed); +} + +#else +// 32-bit + +// Note - This code makes a few assumptions about how your machine behaves - +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const uint32_t m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + uint32_t h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + uint32_t k = *(uint32_t *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) { + return MurmurHash32(key,len,seed); +} + +// 64-bit hash for 32-bit platforms + +inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + const uint32_t m = 0x5bd1e995; + const int r = 24; + + uint32_t h1 = seed ^ len; + uint32_t h2 = 0; + + const uint32_t * data = (const uint32_t *)key; + + while(len >= 8) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; k2 ^= k2 >> r; k2 *= m; + h2 *= m; h2 ^= k2; + len -= 4; + } + + if(len >= 4) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + } + + switch(len) + { + case 3: h2 ^= ((unsigned char*)data)[2] << 16; + case 2: h2 ^= ((unsigned char*)data)[1] << 8; + case 1: h2 ^= ((unsigned char*)data)[0]; + h2 *= m; + }; + + h1 ^= h2 >> 18; h1 *= m; + h2 ^= h1 >> 22; h2 *= m; + h1 ^= h2 >> 17; h1 *= m; + h2 ^= h1 >> 19; h2 *= m; + + uint64_t h = h1; + + h = (h << 32) | h2; + + return h; +} + +#endif +//32bit + +#endif diff --git a/utils/null_deleter.h b/utils/null_deleter.h new file mode 100755 index 00000000..082ab453 --- /dev/null +++ b/utils/null_deleter.h @@ -0,0 +1,9 @@ +#ifndef NULL_DELETER_H +#define NULL_DELETER_H + +struct null_deleter { + void operator()(void*) const {} + void operator()(void const*) const {} +}; + +#endif diff --git a/utils/prob.h b/utils/prob.h new file mode 100644 index 00000000..bc297870 --- /dev/null +++ b/utils/prob.h @@ -0,0 +1,8 @@ +#ifndef _PROB_H_ +#define _PROB_H_ + +#include "logval.h" + +typedef LogVal prob_t; + +#endif diff --git a/utils/sampler.h b/utils/sampler.h new file mode 100644 index 00000000..5fef45d0 --- /dev/null +++ b/utils/sampler.h @@ -0,0 +1,147 @@ +#ifndef SAMPLER_H_ +#define SAMPLER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "prob.h" + +struct SampleSet; + +template +struct RandomNumberGenerator { + static uint32_t GetTrulyRandomSeed() { + uint32_t seed; + std::ifstream r("/dev/urandom"); + if (r) { + r.read((char*)&seed,sizeof(uint32_t)); + } + if (r.fail() || !r) { + std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl; + seed = std::time(NULL); + } + std::cerr << "Seeding random number sequence to " << seed << std::endl; + return seed; + } + + RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { + uint32_t seed = GetTrulyRandomSeed(); + m_generator.seed(seed); + } + explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { + if (!seed) seed = GetTrulyRandomSeed(); + m_generator.seed(seed); + } + + size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) { + if (T == 1.0) { + if (this->next() > (a / (a + b))) return 1; else return 0; + } else { + assert(!"not implemented"); + } + } + + // T is the annealing temperature, if desired + size_t SelectSample(const SampleSet& ss, double T = 1.0); + + // draw a value from U(0,1) + double next() {return m_random();} + + // draw a value from N(mean,var) + double NextNormal(double mean, double var) { + return boost::normal_distribution(mean, var)(m_random); + } + + // draw a value from a Poisson distribution + // lambda must be greater than 0 + int NextPoisson(int lambda) { + return boost::poisson_distribution(lambda)(m_random); + } + + bool AcceptMetropolisHastings(const prob_t& p_cur, + const prob_t& p_prev, + const prob_t& q_cur, + const prob_t& q_prev) { + const prob_t a = (p_cur / p_prev) * (q_prev / q_cur); + if (log(a) >= 0.0) return true; + return (prob_t(this->next()) < a); + } + + RNG &gen() { return m_generator; } + typedef boost::variate_generator > IntRNG; + IntRNG inclusive(int low,int high_incl) { + assert(high_incl>=low); + return IntRNG(m_generator,boost::uniform_int<>(low,high_incl)); + } + + private: + boost::uniform_real<> m_dist; + RNG m_generator; + boost::variate_generator > m_random; +}; + +typedef RandomNumberGenerator MT19937; + +class SampleSet { + public: + const prob_t& operator[](int i) const { return m_scores[i]; } + prob_t& operator[](int i) { return m_scores[i]; } + bool empty() const { return m_scores.empty(); } + void add(const prob_t& s) { m_scores.push_back(s); } + void clear() { m_scores.clear(); } + size_t size() const { return m_scores.size(); } + void resize(int size) { m_scores.resize(size); } + std::vector m_scores; +}; + +template +size_t RandomNumberGenerator::SelectSample(const SampleSet& ss, double T) { + assert(T > 0.0); + assert(ss.m_scores.size() > 0); + if (ss.m_scores.size() == 1) return 0; + const prob_t annealing_factor(1.0 / T); + const bool anneal = (annealing_factor != prob_t::One()); + prob_t sum = prob_t::Zero(); + if (anneal) { + for (int i = 0; i < ss.m_scores.size(); ++i) + sum += ss.m_scores[i].pow(annealing_factor); // p^(1/T) + } else { + sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), prob_t::Zero()); + } + //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ","; + //std::cerr << std::endl; + + prob_t random(this->next()); // random number between 0 and 1 + random *= sum; // scale with normalization factor + //std::cerr << "Random number " << random << std::endl; + + //now figure out which sample + size_t position = 1; + sum = ss.m_scores[0]; + if (anneal) { + sum.poweq(annealing_factor); + for (; position < ss.m_scores.size() && sum < random; ++position) + sum += ss.m_scores[position].pow(annealing_factor); + } else { + for (; position < ss.m_scores.size() && sum < random; ++position) + sum += ss.m_scores[position]; + } + //std::cout << "random: " << random << " sample: " << position << std::endl; + //std::cerr << "Sample: " << position-1 << std::endl; + //exit(1); + return position-1; +} + +#endif diff --git a/utils/small_vector.h b/utils/small_vector.h new file mode 100644 index 00000000..25c52359 --- /dev/null +++ b/utils/small_vector.h @@ -0,0 +1,265 @@ +#ifndef _SMALL_VECTOR_H_ +#define _SMALL_VECTOR_H_ + +/* REQUIRES that T is POD (can be memcpy). won't work (yet) due to union with SMALL_VECTOR_POD==0 - may be possible to handle movable types that have ctor/dtor, by using explicit allocation, ctor/dtor calls. but for now JUST USE THIS FOR no-meaningful ctor/dtor POD types. + + stores small element (<=SV_MAX items) vectors inline. recommend SV_MAX=sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1. may not work if SV_MAX==0. + */ + +#define SMALL_VECTOR_POD 1 + +#include // std::max - where to get this? +#include +#include +#include +#include +#include +//sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1 + +template +class SmallVector { +// typedef unsigned short uint16_t; + public: + typedef SmallVector Self; + SmallVector() : size_(0) {} + + typedef T const* const_iterator; + typedef T* iterator; + typedef T value_type; + typedef T &reference; + typedef T const& const_reference; + + T *begin() { return size_>SV_MAX?data_.ptr:data_.vals; } + T const* begin() const { return const_cast(this)->begin(); } + T *end() { return begin()+size_; } + T const* end() const { return begin()+size_; } + + explicit SmallVector(size_t s) : size_(s) { + assert(s < 0xA000); + if (s <= SV_MAX) { + for (int i = 0; i < s; ++i) new(&data_.vals[i]) T(); + } else { + capacity_ = s; + size_ = s; + data_.ptr = new T[s]; // TODO: replace this with allocator or ::operator new(sizeof(T)*s) everywhere + for (int i = 0; i < size_; ++i) new(&data_.ptr[i]) T(); + } + } + + SmallVector(size_t s, T const& v) : size_(s) { + assert(s < 0xA000); + if (s <= SV_MAX) { + for (int i = 0; i < s; ++i) data_.vals[i] = v; + } else { + capacity_ = s; + size_ = s; + data_.ptr = new T[s]; + for (int i = 0; i < size_; ++i) data_.ptr[i] = v; + } + } + + SmallVector(const Self& o) : size_(o.size_) { + if (size_ <= SV_MAX) { + std::memcpy(data_.vals,o.data_.vals,size_*sizeof(T)); +// for (int i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i]; + } else { + capacity_ = size_ = o.size_; + data_.ptr = new T[capacity_]; + std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(T)); + } + } + + const Self& operator=(const Self& o) { + if (size_ <= SV_MAX) { + if (o.size_ <= SV_MAX) { + size_ = o.size_; + for (int i = 0; i < SV_MAX; ++i) data_.vals[i] = o.data_.vals[i]; + } else { + capacity_ = size_ = o.size_; + data_.ptr = new T[capacity_]; + std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(T)); + } + } else { + if (o.size_ <= SV_MAX) { + delete[] data_.ptr; + size_ = o.size_; + for (int i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i]; + } else { + if (capacity_ < o.size_) { + delete[] data_.ptr; + capacity_ = o.size_; + data_.ptr = new T[capacity_]; + } + size_ = o.size_; + for (int i = 0; i < size_; ++i) + data_.ptr[i] = o.data_.ptr[i]; + } + } + return *this; + } + + ~SmallVector() { + if (size_ <= SV_MAX) { + // skip if pod? yes, we required pod anyway. no need to destruct +#if !SMALL_VECTOR_POD + for (int i=0;i SV_MAX) { + delete[] data_.ptr; + } + size_ = 0; + } + + bool empty() const { return size_ == 0; } + size_t size() const { return size_; } + + inline void ensure_capacity(uint16_t min_size) { + assert(min_size > SV_MAX); + if (min_size < capacity_) return; + uint16_t new_cap = std::max(static_cast(capacity_ << 1), min_size); + T* tmp = new T[new_cap]; + std::memcpy(tmp, data_.ptr, capacity_ * sizeof(T)); + delete[] data_.ptr; + data_.ptr = tmp; + capacity_ = new_cap; + } + +private: + inline void copy_vals_to_ptr() { + capacity_ = SV_MAX * 2; + T* tmp = new T[capacity_]; + for (int i = 0; i < SV_MAX; ++i) tmp[i] = data_.vals[i]; + data_.ptr = tmp; + } + inline void ptr_to_small() { + assert(size_<=SV_MAX); + int *tmp=data_.ptr; + for (int i=0;ioperator[](size_ - 1); } + const T& back() const { return this->operator[](size_ - 1); } + T& front() { return this->operator[](0); } + const T& front() const { return this->operator[](0); } + + void pop_back() { + assert(size_>0); + --size_; + if (size_==SV_MAX) + ptr_to_small(); + } + + void compact() { + compact(size_); + } + + // size must be <= size_ - TODO: test + void compact(uint16_t size) { + assert(size<=size_); + if (size_>SV_MAX) { + size_=size; + if (size<=SV_MAX) + ptr_to_small(); + } else + size_=size; + } + + void resize(size_t s, int v = 0) { + if (s <= SV_MAX) { + if (size_ > SV_MAX) { + T *tmp=data_.ptr; + for (int i = 0; i < s; ++i) data_.vals[i] = tmp[i]; + delete[] tmp; + size_ = s; + return; + } + if (s <= size_) { + size_ = s; + return; + } else { + for (int i = size_; i < s; ++i) + data_.vals[i] = v; + size_ = s; + return; + } + } else { + if (size_ <= SV_MAX) + copy_vals_to_ptr(); + if (s > capacity_) + ensure_capacity(s); + if (s > size_) { + for (int i = size_; i < s; ++i) + data_.ptr[i] = v; + } + size_ = s; + } + } + + T& operator[](size_t i) { + if (size_ <= SV_MAX) return data_.vals[i]; + return data_.ptr[i]; + } + + const T& operator[](size_t i) const { + if (size_ <= SV_MAX) return data_.vals[i]; + return data_.ptr[i]; + } + + bool operator==(const Self& o) const { + if (size_ != o.size_) return false; + if (size_ <= SV_MAX) { + for (size_t i = 0; i < size_; ++i) + if (data_.vals[i] != o.data_.vals[i]) return false; + return true; + } else { + for (size_t i = 0; i < size_; ++i) + if (data_.ptr[i] != o.data_.ptr[i]) return false; + return true; + } + } + + friend bool operator!=(const Self& a, const Self& b) { + return !(a==b); + } + + private: + union StorageType { + T vals[SV_MAX]; + T* ptr; + }; + StorageType data_; + uint16_t size_; + uint16_t capacity_; // only defined when size_ > __SV_MAX_STATIC +}; + +typedef SmallVector SmallVectorInt; + +template +void memcpy(void *out,SmallVector const& v) { + std::memcpy(out,v.begin(),v.size()*sizeof(T)); +} + +#endif diff --git a/utils/small_vector_test.cc b/utils/small_vector_test.cc new file mode 100644 index 00000000..d1d8dcab --- /dev/null +++ b/utils/small_vector_test.cc @@ -0,0 +1,129 @@ +#include "small_vector.h" + +#include +#include +#include +#include + +using namespace std; + +class SVTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(SVTest, LargerThan2) { + SmallVectorInt v; + SmallVectorInt v2; + v.push_back(0); + v.push_back(1); + v.push_back(2); + assert(v.size() == 3); + assert(v[2] == 2); + assert(v[1] == 1); + assert(v[0] == 0); + v2 = v; + SmallVectorInt copy(v); + assert(copy.size() == 3); + assert(copy[0] == 0); + assert(copy[1] == 1); + assert(copy[2] == 2); + assert(copy == v2); + copy[1] = 99; + assert(copy != v2); + assert(v2.size() == 3); + assert(v2[2] == 2); + assert(v2[1] == 1); + assert(v2[0] == 0); + v2[0] = -2; + v2[1] = -1; + v2[2] = 0; + assert(v2[2] == 0); + assert(v2[1] == -1); + assert(v2[0] == -2); + SmallVectorInt v3(1,1); + assert(v3[0] == 1); + v2 = v3; + assert(v2.size() == 1); + assert(v2[0] == 1); + SmallVectorInt v4(10, 1); + assert(v4.size() == 10); + assert(v4[5] == 1); + assert(v4[9] == 1); + v4 = v; + assert(v4.size() == 3); + assert(v4[2] == 2); + assert(v4[1] == 1); + assert(v4[0] == 0); + SmallVectorInt v5(10, 2); + assert(v5.size() == 10); + assert(v5[7] == 2); + assert(v5[0] == 2); + assert(v.size() == 3); + v = v5; + assert(v.size() == 10); + assert(v[2] == 2); + assert(v[9] == 2); + SmallVectorInt cc; + for (int i = 0; i < 33; ++i) + cc.push_back(i); + for (int i = 0; i < 33; ++i) + assert(cc[i] == i); + cc.resize(20); + assert(cc.size() == 20); + for (int i = 0; i < 20; ++i) + assert(cc[i] == i); + cc[0]=-1; + cc.resize(1, 999); + assert(cc.size() == 1); + assert(cc[0] == -1); + cc.resize(99, 99); + for (int i = 1; i < 99; ++i) { + cerr << i << " " << cc[i] << endl; + assert(cc[i] == 99); + } + cc.clear(); + assert(cc.size() == 0); +} + +TEST_F(SVTest, Small) { + SmallVectorInt v; + SmallVectorInt v1(1,0); + SmallVectorInt v2(2,10); + SmallVectorInt v1a(2,0); + EXPECT_TRUE(v1 != v1a); + EXPECT_TRUE(v1 == v1); + EXPECT_EQ(v1[0], 0); + EXPECT_EQ(v2[1], 10); + EXPECT_EQ(v2[0], 10); + ++v2[1]; + --v2[0]; + EXPECT_EQ(v2[0], 9); + EXPECT_EQ(v2[1], 11); + SmallVectorInt v3(v2); + assert(v3[0] == 9); + assert(v3[1] == 11); + assert(!v3.empty()); + assert(v3.size() == 2); + v3.clear(); + assert(v3.empty()); + assert(v3.size() == 0); + assert(v3 != v2); + assert(v2 != v3); + v3 = v2; + assert(v3 == v2); + assert(v2 == v3); + assert(v3[0] == 9); + assert(v3[1] == 11); + assert(!v3.empty()); + assert(v3.size() == 2); + cerr << sizeof(SmallVectorInt) << endl; + cerr << sizeof(vector) << endl; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/utils/sparse_vector.cc b/utils/sparse_vector.cc new file mode 100644 index 00000000..6e42a216 --- /dev/null +++ b/utils/sparse_vector.cc @@ -0,0 +1,98 @@ +#include "sparse_vector.h" + +#include +#include + +#include "b64tools.h" + +using namespace std; + +namespace B64 { + +void Encode(double objective, const SparseVector& v, ostream* out) { + const int num_feats = v.num_active(); + size_t tot_size = 0; + const size_t off_objective = tot_size; + tot_size += sizeof(double); // objective + const size_t off_num_feats = tot_size; + tot_size += sizeof(int); // num_feats + const size_t off_data = tot_size; + tot_size += sizeof(unsigned char) * num_feats; // lengths of feature names; + typedef SparseVector::const_iterator const_iterator; + for (const_iterator it = v.begin(); it != v.end(); ++it) + tot_size += FD::Convert(it->first).size(); // feature names; + tot_size += sizeof(double) * num_feats; // gradient + const size_t off_magic = tot_size; + tot_size += 4; // magic + + // size_t b64_size = tot_size * 4 / 3; + // cerr << "Sparse vector binary size: " << tot_size << " (b64 size=" << b64_size << ")\n"; + char* data = new char[tot_size]; + *reinterpret_cast(&data[off_objective]) = objective; + *reinterpret_cast(&data[off_num_feats]) = num_feats; + char* cur = &data[off_data]; + assert(cur - data == off_data); + for (const_iterator it = v.begin(); it != v.end(); ++it) { + const string& fname = FD::Convert(it->first); + *cur++ = static_cast(fname.size()); // name len + memcpy(cur, &fname[0], fname.size()); + cur += fname.size(); + *reinterpret_cast(cur) = it->second; + cur += sizeof(double); + } + assert(cur - data == off_magic); + *reinterpret_cast(cur) = 0xBAABABBAu; + cur += sizeof(unsigned int); + assert(cur - data == tot_size); + b64encode(data, tot_size, out); + delete[] data; +} + +bool Decode(double* objective, SparseVector* v, const char* in, size_t size) { + v->clear(); + if (size % 4 != 0) { + cerr << "B64 error - line % 4 != 0\n"; + return false; + } + const size_t decoded_size = size * 3 / 4 - sizeof(unsigned int); + const size_t buf_size = decoded_size + sizeof(unsigned int); + if (decoded_size < 6) { cerr << "SparseVector decoding error: too short!\n"; return false; } + char* data = new char[buf_size]; + if (!b64decode(reinterpret_cast(in), size, data, buf_size)) { + delete[] data; + return false; + } + size_t cur = 0; + *objective = *reinterpret_cast(data); + cur += sizeof(double); + const int num_feats = *reinterpret_cast(&data[cur]); + cur += sizeof(int); + int fc = 0; + while(fc < num_feats && cur < decoded_size) { + ++fc; + const int fname_len = data[cur++]; + assert(fname_len > 0); + assert(fname_len < 256); + string fname(fname_len, '\0'); + memcpy(&fname[0], &data[cur], fname_len); + cur += fname_len; + const double val = *reinterpret_cast(&data[cur]); + cur += sizeof(double); + int fid = FD::Convert(fname); + v->set_value(fid, val); + } + if(num_feats != fc) { + cerr << "Expected " << num_feats << " but only decoded " << fc << "!\n"; + delete[] data; + return false; + } + if (*reinterpret_cast(&data[cur]) != 0xBAABABBAu) { + cerr << "SparseVector decodeding error : magic does not match!\n"; + delete[] data; + return false; + } + delete[] data; + return true; +} + +} diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h new file mode 100644 index 00000000..207489c5 --- /dev/null +++ b/utils/sparse_vector.h @@ -0,0 +1,512 @@ +#ifndef _SPARSE_VECTOR_H_ +#define _SPARSE_VECTOR_H_ + +//#define SPARSE_VECTOR_HASH + +#ifdef SPARSE_VECTOR_HASH +#include "hash.h" +# define SPARSE_VECTOR_MAP HASH_MAP +# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) HASH_MAP_RESERVED(h,empty,deleted) +#else +# define SPARSE_VECTOR_MAP std::map +# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) +#endif +/* + use SparseVectorList (pair smallvector) for feat funcs / hypergraphs (you rarely need random access; just append a feature to the list) +*/ +/* hack: index 0 never gets printed because cdyer is creative and efficient. features which have no weight got feature dict id 0, see, and the models all clobered that value. nobody wants to see it. except that vlad is also creative and efficient and stored the oracle bleu there. */ +/* NOTE: zero vals may or may not be dropped from map (sparse, but not guaranteed to be so). + + I rely on !v the same as !((bool)v) the same as v==0 and v() same as v(0). + + one exception: + + a local: + T sum = 0; + is used instead of + T sum; + + because T may be a primitive type, and + + T sum(); + + is parsed as a function decl :( + + the alternative T sum=T() is also be reasonable. i've switched to that. +*/ + +// this is a modified version of code originally written +// by Phil Blunsom + +#include +#include +#include +#include +#include + +#include "fdict.h" +#include "small_vector.h" + +template +inline T & extend_vector(std::vector &v,int i) { + if (i>=v.size()) + v.resize(i+1); + return v[i]; +} + +template +class SparseVector { + void init_reserved() { + SPARSE_VECTOR_MAP_RESERVED(values_,-1,-2); + } +public: + T const& get_singleton() const { + assert(values_.size()==1); + return values_.begin()->second; + } + + typedef SparseVector Self; + typedef SPARSE_VECTOR_MAP MapType; + typedef typename MapType::const_iterator const_iterator; + SparseVector() { + init_reserved(); + } + explicit SparseVector(std::vector const& v) { + init_reserved(); + typename MapType::iterator p=values_.begin(); + const T z=0; + for (unsigned i=0;i *vp) const { + init_vector(*vp); + } + + void init_vector(std::vector &v) const { + v.clear(); + for (const_iterator i=values_.begin(),e=values_.end();i!=e;++i) + extend_vector(v,i->first)=i->second; + } + + void set_new_value(int index, T const& val) { + assert(values_.find(index)==values_.end()); + values_[index]=val; + } + + + // warning: exploits the fact that 0 values are always removed from map. change this if you change that. + bool nonzero(int index) const { + typename MapType::const_iterator found = values_.find(index); + return found==values_.end() || !found->second; + } + + + T get(int index) const { + typename MapType::const_iterator found = values_.find(index); + return found==values_.end()?T():found->second; + } + + T value(int i) const { return get(i); } + + // same as above but may add a 0 entry. TODO: check that people relying on no entry use get + T & operator[](int index){ + return values_[index]; + } + + inline void set_value(int index, const T &value) { + values_[index] = value; + } + + inline void maybe_add(int index, const T& value) { + if (value) add_value(index,value); + } + + T& add_value(int index, const T &value) { +#if 1 + return values_[index]+=value; +#else + // this is not really going to be any faster, and we already rely on default init = 0 init + std::pair art=values_.insert(std::make_pair(index,value)); + T &val=art.first->second; + if (!art.second) val += value; // already existed + return val; +#endif + } + + + void store(std::valarray* target) const { + (*target) *= 0; + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) { + if (it->first >= target->size()) break; + (*target)[it->first] = it->second; + } + } + + int max_index() const { + if (empty()) return 0; + typename MapType::const_iterator found =values_.end(); + --found; + return found->first; + } + + // dot product with a unit vector of the same length + // as the sparse vector + T dot() const { + T sum = T(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + sum += it->second; + return sum; + } + + template + S cosine_sim(const SparseVector &vec) const { + return dot(vec)/(l2norm()*vec.l2norm()); + } + + // if values are binary, gives |A intersect B|/|A union B| + template + S tanimoto_coef(const SparseVector &vec) const { + S dp=dot(vec); + return dp/(l2norm_sq()+vec.l2norm_sq()-dp); + } + + template + S dot(const SparseVector &vec) const { + S sum = S(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + { + typename MapType::const_iterator + found = vec.values_.find(it->first); + if (found != vec.values_.end()) + sum += it->second * found->second; + } + return sum; + } + + template + S dot(const std::vector &vec) const { + S sum = S(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + { + if (it->first < static_cast(vec.size())) + sum += it->second * vec[it->first]; + } + return sum; + } + + template + S dot(const S *vec) const { + // this is not range checked! + S sum = S(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + sum += it->second * vec[it->first]; + std::cout << "dot(*vec) " << sum << std::endl; + return sum; + } + + T l1norm() const { + T sum = T(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + sum += fabs(it->second); + return sum; + } + + T l2norm_sq() const { + T sum = T(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + sum += it->second * it->second; + return sum; + } + + T l2norm() const { + return sqrt(l2norm_sq()); + } + + void erase(int key) { + values_.erase(key); +/* typename MapType::iterator found = values_.find(key); + if (found!=values_end()) + values_.erase(found);*/ + } + + template + void set_from(SparseVector const& other) { + for (typename MapType::const_iterator + it = other.values_.begin(); it != other.values_.end(); ++it) + { + values_[it->first]=it->second; + } + } + + SparseVector &operator+=(const SparseVector &other) { + for (typename MapType::const_iterator + it = other.values_.begin(); it != other.values_.end(); ++it) + { +// T v = + (values_[it->first] += it->second); +// if (!v) values_.erase(it->first); + } + return *this; + } + + SparseVector &operator-=(const SparseVector &other) { + for (typename MapType::const_iterator + it = other.values_.begin(); it != other.values_.end(); ++it) + { +// T v = + (values_[it->first] -= it->second); +// if (!v) values_.erase(it->first); + } + return *this; + } + + friend SparseVector operator -(SparseVector x,SparseVector const& y) { + x-=y; + return x; + } + friend SparseVector operator +(SparseVector x,SparseVector const& y) { + x+=y; + return x; + } + +private: + // DEPRECATED: becuase 0 values are dropped from the map, this doesn't even make sense if you have a fully populated (not really sparse re: what you'll ever use) vector + SparseVector &operator-=(T const& x) { + for (typename MapType::iterator + it = values_.begin(); it != values_.end(); ++it) + it->second -= x; + return *this; + } + + SparseVector &operator+=(T const& x) { + for (typename MapType::iterator + it = values_.begin(); it != values_.end(); ++it) + it->second += x; + return *this; + } +public: + SparseVector &operator/=(const T &x) { + for (typename MapType::iterator + it = values_.begin(); it != values_.end(); ++it) + it->second /= x; + return *this; + } + + SparseVector &operator*=(const T& x) { + for (typename MapType::iterator + it = values_.begin(); it != values_.end(); ++it) + it->second *= x; + return *this; + } + + SparseVector operator+(T const& x) const { + SparseVector result = *this; + return result += x; + } + + SparseVector operator-(T const& x) const { + SparseVector result = *this; + return result -= x; + } + + SparseVector operator/(T const& x) const { + SparseVector result = *this; + return result /= x; + } + + std::ostream &operator<<(std::ostream& out) const { + Write(true, &out); + return out; + } + + void Write(const bool with_semi, std::ostream* os) const { + bool first = true; + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) { + // by definition feature id 0 is a dummy value + if (!it->first) continue; + if (with_semi) { + (*os) << (first ? "" : ";") + << FD::Convert(it->first) << '=' << it->second; + } else { + (*os) << (first ? "" : " ") + << FD::Convert(it->first) << '=' << it->second; + } + first = false; + } + } + + bool operator==(Self const & other) const { + return size()==other.size() && contains_keys_of(other) && other.contains_i(*this); + } + + bool contains(Self const &o) const { + return size()>o.size() && contains(o); + } + + bool at_equals(int i,T const& val) const { + const_iterator it=values_.find(i); + if (it==values_.end()) return !val; + return it->second==val; + } + + bool contains_i(Self const& o) const { + for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) + if (!at_equals(i->first,i->second)) + return false; + return true; + } + + bool contains_keys_of(Self const& o) const { + for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) + if (values_.find(i)==values_.end()) + return false; + return true; + } + +#ifndef SPARSE_VECTOR_HASH + bool operator<(const SparseVector &other) const { + typename MapType::const_iterator it = values_.begin(); + typename MapType::const_iterator other_it = other.values_.begin(); + + for (; it != values_.end() && other_it != other.values_.end(); ++it, ++other_it) + { + if (it->first < other_it->first) return true; + if (it->first > other_it->first) return false; + if (it->second < other_it->second) return true; + if (it->second > other_it->second) return false; + } + return values_.size() < other.values_.size(); + } +#endif + + int size() const { return values_.size(); } + + int num_active() const { return values_.size(); } + bool empty() const { return values_.empty(); } + + const_iterator begin() const { return values_.begin(); } + const_iterator end() const { return values_.end(); } + + void clear() { + values_.clear(); + } + + void swap(SparseVector& other) { + values_.swap(other.values_); + } + +private: + MapType values_; +}; + +//like a pair but can live in a union, because it lacks default+copy ctors, dtor. +template +struct feature_val { + int fid; + T val; +}; + +template +inline feature_val featval(int fid,T const &val) { + feature_val f; + f.fid=fid; + f.val=val; + return f; +} + + +// doesn't support fast indexing directly +template +class SparseVectorList { + typedef feature_val Pair; + typedef SmallVector List; + typedef typename List::const_iterator const_iterator; + SparseVectorList() { } + template + SparseVectorList(I i,I const& end) { + int c=0; + for (;i const& v) { + for (unsigned i=0;i *to) const { + for (int i=0;iset_value(p[i].fid,p[i].val); + } + void copy_to(SparseVector *to) const { + to->clear(); + overlay(to); + } + SparseVector sparse() const { + SparseVector r; + copy_to(r); + return r; + } +private: + List p; +}; + +template +SparseVector operator+(const SparseVector& a, const SparseVector& b) { + SparseVector result = a; + return result += b; +} + +template +SparseVector operator*(const SparseVector& a, const double& b) { + SparseVector result = a; + return result *= b; +} + +template +SparseVector operator*(const SparseVector& a, const T& b) { + SparseVector result = a; + return result *= b; +} + +template +SparseVector operator*(const double& a, const SparseVector& b) { + SparseVector result = b; + return result *= a; +} + +template +std::ostream &operator<<(std::ostream &out, const SparseVector &vec) +{ + return vec.operator<<(out); +} + +namespace B64 { + void Encode(double objective, const SparseVector& v, std::ostream* out); + // returns false if failed to decode + bool Decode(double* objective, SparseVector* v, const char* data, size_t size); +} + +#endif diff --git a/utils/static_utoa.h b/utils/static_utoa.h new file mode 100755 index 00000000..fe5f6d92 --- /dev/null +++ b/utils/static_utoa.h @@ -0,0 +1,115 @@ +#ifndef STATIC_UTOA_H +#define STATIC_UTOA_H + +#include "threadlocal.h" + + +#include +#include + +#define DIGIT_LOOKUP_TABLE 0 + +namespace { +THREADLOCAL char utoa_buf[] = "01234567890123456789"; // to put end of string character at buf[20] +const unsigned utoa_bufsize=sizeof(utoa_buf); +const unsigned utoa_bufsizem1=utoa_bufsize-1; +#ifdef DIGIT_LOOKUP_TABLE +char digits[] = "0123456789"; +#endif +} + +inline char digit_to_char(int d) { + return +#ifdef DIGIT_LOOKUP_TABLE + digits[d]; +#else + '0'+d; +#endif +} + +// returns n in string [return,num); *num=0 yourself before calling if you want a c_str +inline char *utoa(char *num,unsigned n) { + if ( !n ) { + *--num='0'; + } else { + unsigned rem; + // 3digit lookup table, divide by 1000 faster? + while ( n ) { +#if 1 + rem = n; + n /= 10; + rem -= 10*n; // maybe this is faster than mod because we are already dividing +#else + rem = n%10; // would optimizer combine these together? + n = n/10; +#endif + *--num = digit_to_char(rem); + } + } + return num; +} + +inline char *static_utoa(unsigned n) { + return utoa(utoa_buf+utoa_bufsizem1,n); +} + +//returns position of '\0' terminating number written starting at to +inline char* append_utoa(char *to,unsigned n) { + char *s=static_utoa(n); + int ns=(utoa_buf+utoa_bufsize)-s; + std::memcpy(to,s,ns); + return to+ns; +} + +// so named to avoid gcc segfault when named itoa +inline char *itoa(char *p,int n) { + if (n<0) { + p=utoa(p,-n); // TODO: check that (unsigned)(-INT_MIN) == 0x1000000 in 2s complement and not == 0 + *--p='-'; + return p; + } else + return utoa(p,n); +} + +inline char *static_itoa(int n) { + return itoa(utoa_buf+utoa_bufsizem1,n); +} + + +inline std::string utos(unsigned n) { + const int bufsz=20; + char buf[bufsz]; + char *end=buf+bufsz; + char *p=utoa(end,n); + return std::string(p,end); +} + +inline std::string itos(int n) { + const int bufsz=20; + char buf[bufsz]; + char *end=buf+bufsz; + char *p=itoa(end,n); + return std::string(p,end); +} + +#ifdef ITOA_SAMPLE +# include +# include +# include +using namespace std; + +int main(int argc,char *argv[]) { + printf("d U d U d U\n"); + for (int i=1;i +#include +#include +#include +#include + +using namespace std; + +void ParseTranslatorInput(const string& line, string* input, string* ref) { + size_t hint = 0; + if (line.find("{\"rules\":") == 0) { + hint = line.find("}}"); + if (hint == string::npos) { + cerr << "Syntax error: " << line << endl; + abort(); + } + hint += 2; + } + size_t pos = line.find("|||", hint); + if (pos == string::npos) { *input = line; return; } + ref->clear(); + *input = line.substr(0, pos - 1); + string rline = line.substr(pos + 4); + if (rline.size() > 0) { + assert(ref); + *ref = rline; + } +} + +void ProcessAndStripSGML(string* pline, map* out) { + map& meta = *out; + string& line = *pline; + string lline = LowercaseString(line); + if (lline.find(""); + if (close == string::npos) return; // error + size_t end = lline.find(""); + string seg = Trim(lline.substr(4, close-4)); + string text = line.substr(close+1, end - close - 1); + for (size_t i = 1; i < seg.size(); i++) { + if (seg[i] == '=' && seg[i-1] == ' ') { + string less = seg.substr(0, i-1) + seg.substr(i); + seg = less; i = 0; continue; + } + if (seg[i] == '=' && seg[i+1] == ' ') { + string less = seg.substr(0, i+1); + if (i+2 < seg.size()) less += seg.substr(i+2); + seg = less; i = 0; continue; + } + } + line = Trim(text); + if (seg == "") return; + for (size_t i = 1; i < seg.size(); i++) { + if (seg[i] == '=') { + string label = seg.substr(0, i); + string val = seg.substr(i+1); + if (val[0] == '"') { + val = val.substr(1); + size_t close = val.find('"'); + if (close == string::npos) { + cerr << "SGML parse error: missing \"\n"; + seg = ""; + i = 0; + } else { + seg = val.substr(close+1); + val = val.substr(0, close); + i = 0; + } + } else { + size_t close = val.find(' '); + if (close == string::npos) { + seg = ""; + i = 0; + } else { + seg = val.substr(close+1); + val = val.substr(0, close); + } + } + label = Trim(label); + seg = Trim(seg); + meta[label] = val; + } + } +} + diff --git a/utils/stringlib.h b/utils/stringlib.h new file mode 100644 index 00000000..84e95d44 --- /dev/null +++ b/utils/stringlib.h @@ -0,0 +1,267 @@ +#ifndef CDEC_STRINGLIB_H_ +#define CDEC_STRINGLIB_H_ + +//usage: string s=MAKESTRE(1<<" "<(ostringstream()< +#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "< +#include +#include +#include +#include +#include +#include + +inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") { + return s.find_first_not_of(ws,starting); +} + +// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends +inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") { + std::size_t n=s.find_last_not_of(ws,ending); + if (n==std::string::npos) return n; + else return n+1; +} + +//TEST: if string is all whitespace, make sure that string(a+npos,a+npos) can't segfault (i.e. won't access any memory because begin==end) +inline std::string strip_ws(std::string const& s) { + return std::string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)); +} + + +inline bool is_single_line(std::string const& line) { + return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks +} + +// is_single_line(strip_ws(line)) +inline bool is_single_line_stripped(std::string const& line) { + std::size_t b=skip_ws(line),e=trailing_ws(line); + std::size_t n=line.find('\n',b); + return n==std::string::npos || n>=e; +} + +struct toupperc { + inline char operator()(char c) const { + return std::toupper(c); + } +}; + +inline std::string toupper(std::string s) { + std::transform(s.begin(),s.end(),s.begin(),toupperc()); + return s; +} + +template inline +bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub) +{ + while (bsub != esub) { + if (bstr == estr) + return false; + if (*bsub++ != *bstr++) + return false; + } + return true; +} + +template inline +bool match_begin(Istr bstr,Istr estr,Prefix prefix) +{ + return match_begin(bstr,estr,prefix.begin(),prefix.end()); +} + +template inline +bool match_begin(Str const& str,Prefix const& prefix) +{ + return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end()); +} + + +// read line in the form of either: +// source +// source ||| target +// source will be returned as a string, target must be a sentence or +// a lattice (in PLF format) and will be returned as a Lattice object +void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref); +struct Lattice; +void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref); + +inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") { + std::string res = str; + res.erase(str.find_last_not_of(dropChars)+1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +inline void Tokenize(const std::string& str, char delimiter, std::vector* res) { + std::string s = str; + int last = 0; + res->clear(); + for (int i=0; i < s.size(); ++i) + if (s[i] == delimiter) { + s[i]=0; + if (last != i) { + res->push_back(&s[last]); + } + last = i + 1; + } + if (last != s.size()) + res->push_back(&s[last]); +} + +inline unsigned NTokens(const std::string& str, char delimiter) +{ + std::vector r; + Tokenize(str,delimiter,&r); + return r.size(); +} + +inline std::string LowercaseString(const std::string& in) { + std::string res(in.size(),' '); + for (int i = 0; i < in.size(); ++i) + res[i] = tolower(in[i]); + return res; +} + +inline int CountSubstrings(const std::string& str, const std::string& sub) { + size_t p = 0; + int res = 0; + while (p < str.size()) { + p = str.find(sub, p); + if (p == std::string::npos) break; + ++res; + p += sub.size(); + } + return res; +} + +inline int SplitOnWhitespace(const std::string& in, std::vector* out) { + out->clear(); + int i = 0; + int start = 0; + std::string cur; + while(i < in.size()) { + if (in[i] == ' ' || in[i] == '\t') { + if (i - start > 0) + out->push_back(in.substr(start, i - start)); + start = i + 1; + } + ++i; + } + if (i > start) + out->push_back(in.substr(start, i - start)); + return out->size(); +} + +inline std::vector SplitOnWhitespace(std::string const& in) +{ + std::vector r; + SplitOnWhitespace(in,&r); + return r; +} + + +struct mutable_c_str { + // because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading) + char *p; + mutable_c_str(std::string const& s) : p((char *)::operator new(s.size()+1)) { + std::memcpy(p,s.data(),s.size()); + p[s.size()]=0; + } + ~mutable_c_str() { ::operator delete(p); } +private: + mutable_c_str(mutable_c_str const&); +}; + +// ' ' '\t' tokens hardcoded +//NOTE: you should have stripped endline chars out first. +inline bool IsWordSep(char c) { + return c==' '||c=='\t'; +} + + +template +// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens +void VisitTokens(char *p,char *const end,F f) { + SLIBDBG("VisitTokens. p="<* out); + +// given the first character of a UTF8 block, find out how wide it is +// see http://en.wikipedia.org/wiki/UTF-8 for more info +inline unsigned int UTF8Len(unsigned char x) { + if (x < 0x80) return 1; + else if ((x >> 5) == 0x06) return 2; + else if ((x >> 4) == 0x0e) return 3; + else if ((x >> 3) == 0x1e) return 4; + else return 0; +} + +#endif diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc new file mode 100755 index 00000000..f66cdbeb --- /dev/null +++ b/utils/stringlib_test.cc @@ -0,0 +1,17 @@ +#define STRINGLIB_DEBUG +#include "stringlib.h" + +using namespace std; +struct print { + template + void operator()(S const& s) const { + cout<= end() will give a numeric token name (single per-thread shared buffer), which of course won't be Convert-able back to the id, because it's not added to the dict. This is a convenience for logging fake token indices. Any tokens actually added to the dict may cause end() to overlap the range of fake ids you were using - that's up to you to prevent. + +#include +#include +#include +#include "Ngram.h" +#include "dict.h" +#include "tdict.h" +#include "Vocab.h" +#include "stringlib.h" +#include "threadlocal.h" + +using namespace std; + +Vocab TD::dict_(0,TD::max_wordid); +WordID TD::ss=dict_.ssIndex(); +WordID TD::se=dict_.seIndex(); +WordID TD::unk=dict_.unkIndex(); +char const*const TD::ss_str=Vocab_SentStart; +char const*const TD::se_str=Vocab_SentEnd; +char const*const TD::unk_str=Vocab_Unknown; + +// pre+(i-base)+">" for i in [base,e) +inline void pad(std::string const& pre,int base,int e) { + assert(base<=e); + ostringstream o; + for (int i=base;i'; + WordID id=TD::Convert(o.str()); + assert(id==i); // this fails. why? + } +} + + +namespace { +struct TD_init { + TD_init() { + /* + // disabled for now since it's breaking trunk + assert(TD::Convert(TD::ss_str)==TD::ss); + assert(TD::Convert(TD::se_str)==TD::se); + assert(TD::Convert(TD::unk_str)==TD::unk); + assert(TD::none==Vocab_None); + pad("=dict_.highIndex()) return undef_token(w); +#endif + return dict_.getWord((VocabIndex)w); +} + + +void TD::GetWordIDs(const std::vector& strings, std::vector* ids) { + ids->clear(); + for (vector::const_iterator i = strings.begin(); i != strings.end(); ++i) + ids->push_back(TD::Convert(*i)); +} + +std::string TD::GetString(const std::vector& str) { + ostringstream o; + for (int i=0;i Ws; + Ws *ids; + explicit add_wordids(Ws *i) : ids(i) { } + add_wordids(const add_wordids& o) : ids(o.ids) { } + void operator()(char const* s) { + ids->push_back(TD::Convert(s)); + } + void operator()(std::string const& s) { + ids->push_back(TD::Convert(s)); + } +}; + +} + +void TD::ConvertSentence(std::string const& s, std::vector* ids) { + ids->clear(); + VisitTokens(s,add_wordids(ids)); +} diff --git a/utils/tdict.h b/utils/tdict.h new file mode 100644 index 00000000..a7b3ee1c --- /dev/null +++ b/utils/tdict.h @@ -0,0 +1,50 @@ +#ifndef _TDICT_H_ +#define _TDICT_H_ + +#include +#include +#include "wordid.h" +#include + +class Vocab; + +struct TD { + /* // disabled for now + static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause. tokens until this get "" + static const int n_reserved=10; // 0...n_reserved-1 get token '' + static inline WordID reserved(int i) { + assert(i>=0 && i"; + static char const* const se_str; //=""; + static char const* const unk_str; //=""; + static WordID ss,se,unk; // x=Convert(x_str) + static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far + static Vocab dict_; + static void ConvertSentence(std::string const& sent, std::vector* ids); + static void GetWordIDs(const std::vector& strings, std::vector* ids); + static std::string GetString(const std::vector& str); + static std::string GetString(WordID const* i,WordID const* e); + static int AppendString(const WordID& w, int pos, int bufsize, char* buffer); + static unsigned int NumWords(); + static WordID Convert(const std::string& s); + static WordID Convert(char const* s); + static const char* Convert(WordID w); +}; + +struct ToTD { + typedef WordID result_type; + result_type operator()(std::string const& t) const { + return TD::Convert(t); + } +}; + + +#endif diff --git a/utils/test_data/weights b/utils/test_data/weights new file mode 100644 index 00000000..ea70229c --- /dev/null +++ b/utils/test_data/weights @@ -0,0 +1,8 @@ +# hiero +WordPenalty -0.387029 +LanguageModel 0.253195 +PhraseModel_0 0.142926 +PhraseModel_1 0.465119 +PhraseModel_2 0.079503 +CNPosteriorProbability 0.09259 +Inf -inf diff --git a/utils/threadlocal.h b/utils/threadlocal.h new file mode 100755 index 00000000..d79f5d9d --- /dev/null +++ b/utils/threadlocal.h @@ -0,0 +1,71 @@ +#ifndef THREADLOCAL_H +#define THREADLOCAL_H + +#ifndef SETLOCAL_SWAP +# define SETLOCAL_SWAP 0 +#endif + +#ifdef BOOST_NO_MT + +# define THREADLOCAL + +#else + +#ifdef _MSC_VER + +//FIXME: doesn't work with DLLs ... use TLS apis instead (http://www.boost.org/libs/thread/doc/tss.html) +# define THREADLOCAL __declspec(thread) + +#else + +# define THREADLOCAL __thread + +#endif + +#endif + +#include //swap + +// naturally, the below are only thread-safe if value is THREADLOCAL +template +struct SaveLocal { + D &value; + D old_value; + SaveLocal(D& val) : value(val), old_value(val) {} + ~SaveLocal() { +#if SETLOCAL_SWAP + swap(value,old_value); +#else + value=old_value; +#endif + } +}; + +template +struct SetLocal { + D &value; + D old_value; + SetLocal(D& val,const D &new_value) : value(val), old_value( +#if SETLOCAL_SWAP + new_value +#else + val +#endif + ) { +#if SETLOCAL_SWAP + swap(value,old_value); +#else + value=new_value; +#endif + } + ~SetLocal() { +#if SETLOCAL_SWAP + swap(value,old_value); +#else + value=old_value; +#endif + } +}; + + +#endif diff --git a/utils/timing_stats.cc b/utils/timing_stats.cc new file mode 100644 index 00000000..fc8e9df1 --- /dev/null +++ b/utils/timing_stats.cc @@ -0,0 +1,24 @@ +#include "timing_stats.h" + +#include +#include "time.h" //cygwin needs +using namespace std; + +map Timer::stats; + +Timer::Timer(const string& timername) : start_t(clock()), cur(stats[timername]) {} + +Timer::~Timer() { + ++cur.calls; + const clock_t end_t = clock(); + const double elapsed = (end_t - start_t) / 1000000.0; + cur.total_time += elapsed; +} + +void Timer::Summarize() { + for (map::iterator it = stats.begin(); it != stats.end(); ++it) { + cerr << it->first << ": " << it->second.total_time << " secs (" << it->second.calls << " calls)\n"; + } + stats.clear(); +} + diff --git a/utils/timing_stats.h b/utils/timing_stats.h new file mode 100644 index 00000000..0a9f7656 --- /dev/null +++ b/utils/timing_stats.h @@ -0,0 +1,25 @@ +#ifndef _TIMING_STATS_H_ +#define _TIMING_STATS_H_ + +#include +#include + +struct TimerInfo { + int calls; + double total_time; + TimerInfo() : calls(), total_time() {} +}; + +struct Timer { + Timer(const std::string& info); + ~Timer(); + static void Summarize(); + private: + static std::map stats; + clock_t start_t; + TimerInfo& cur; + Timer(const Timer& other); + const Timer& operator=(const Timer& other); +}; + +#endif diff --git a/utils/weights.cc b/utils/weights.cc new file mode 100644 index 00000000..84647585 --- /dev/null +++ b/utils/weights.cc @@ -0,0 +1,77 @@ +#include "weights.h" + +#include + +#include "fdict.h" +#include "filelib.h" + +using namespace std; + +void Weights::InitFromFile(const std::string& filename, vector* feature_list) { + cerr << "Reading weights from " << filename << endl; + ReadFile in_file(filename); + istream& in = *in_file.stream(); + assert(in); + int weight_count = 0; + bool fl = false; + while (in) { + double val = 0; + string buf; + getline(in, buf); + if (buf.size() == 0) continue; + if (buf[0] == '#') continue; + for (int i = 0; i < buf.size(); ++i) + if (buf[i] == '=') buf[i] = ' '; + int start = 0; + while(start < buf.size() && buf[start] == ' ') ++start; + int end = 0; + while(end < buf.size() && buf[end] != ' ') ++end; + int fid = FD::Convert(buf.substr(start, end - start)); + while(end < buf.size() && buf[end] == ' ') ++end; + val = strtod(&buf.c_str()[end], NULL); + if (isnan(val)) { + cerr << FD::Convert(fid) << " has weight NaN!\n"; + abort(); + } + if (wv_.size() <= fid) + wv_.resize(fid + 1); + wv_[fid] = val; + if (feature_list) { feature_list->push_back(FD::Convert(fid)); } + ++weight_count; + if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; } + if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; } + } + if (fl) { cerr << endl; } + cerr << "Loaded " << weight_count << " feature weights\n"; +} + +void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features) const { + WriteFile out(fname); + ostream& o = *out.stream(); + assert(o); + o.precision(17); + const int num_feats = FD::NumFeats(); + for (int i = 1; i < num_feats; ++i) { + const double val = (i < wv_.size() ? wv_[i] : 0.0); + if (hide_zero_value_features && val == 0.0) continue; + o << FD::Convert(i) << ' ' << val << endl; + } +} + +void Weights::InitVector(std::vector* w) const { + *w = wv_; +} + +void Weights::InitSparseVector(SparseVector* w) const { + for (int i = 1; i < wv_.size(); ++i) { + const double& weight = wv_[i]; + if (weight) w->set_value(i, weight); + } +} + +void Weights::InitFromVector(const std::vector& w) { + wv_ = w; + if (wv_.size() > FD::NumFeats()) + cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n"; + wv_.resize(FD::NumFeats(), 0); +} diff --git a/utils/weights.h b/utils/weights.h new file mode 100644 index 00000000..f19aa3ce --- /dev/null +++ b/utils/weights.h @@ -0,0 +1,21 @@ +#ifndef _WEIGHTS_H_ +#define _WEIGHTS_H_ + +#include +#include +#include +#include "sparse_vector.h" + +class Weights { + public: + Weights() {} + void InitFromFile(const std::string& fname, std::vector* feature_list = NULL); + void WriteToFile(const std::string& fname, bool hide_zero_value_features = true) const; + void InitVector(std::vector* w) const; + void InitSparseVector(SparseVector* w) const; + void InitFromVector(const std::vector& w); + private: + std::vector wv_; +}; + +#endif diff --git a/utils/weights_test.cc b/utils/weights_test.cc new file mode 100644 index 00000000..8a4c26ef --- /dev/null +++ b/utils/weights_test.cc @@ -0,0 +1,27 @@ +#include +#include +#include +#include +#include +#include "weights.h" +#include "tdict.h" + +using namespace std; + +class WeightsTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + + +TEST_F(WeightsTest,Load) { + Weights w; + w.InitFromFile("test_data/weights"); + w.WriteToFile("-"); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utils/wordid.h b/utils/wordid.h new file mode 100644 index 00000000..fb50bcc1 --- /dev/null +++ b/utils/wordid.h @@ -0,0 +1,6 @@ +#ifndef _WORD_ID_H_ +#define _WORD_ID_H_ + +typedef int WordID; + +#endif -- cgit v1.2.3