diff options
Diffstat (limited to 'utils')
-rw-r--r-- | utils/Makefile.am | 3 | ||||
-rw-r--r-- | utils/alias_sampler.h | 4 | ||||
-rw-r--r-- | utils/alignment_io.h | 4 | ||||
-rw-r--r-- | utils/b64featvector.cc | 55 | ||||
-rw-r--r-- | utils/b64featvector.h | 12 | ||||
-rw-r--r-- | utils/b64tools.h | 4 | ||||
-rw-r--r-- | utils/corpus_tools.h | 4 | ||||
-rw-r--r-- | utils/exp_semiring.h | 4 | ||||
-rw-r--r-- | utils/fast_sparse_vector.h | 4 | ||||
-rw-r--r-- | utils/fdict.h | 4 | ||||
-rw-r--r-- | utils/feature_vector.h | 4 | ||||
-rw-r--r-- | utils/filelib.h | 4 | ||||
-rw-r--r-- | utils/kernel_string_subseq.h | 4 | ||||
-rw-r--r-- | utils/m.h | 4 | ||||
-rw-r--r-- | utils/murmur_hash3.h | 4 | ||||
-rw-r--r-- | utils/perfect_hash.h | 4 | ||||
-rw-r--r-- | utils/prob.h | 4 | ||||
-rw-r--r-- | utils/small_vector.h | 20 | ||||
-rw-r--r-- | utils/small_vector_test.cc | 30 | ||||
-rw-r--r-- | utils/sparse_vector.h | 4 | ||||
-rw-r--r-- | utils/star.h | 4 | ||||
-rw-r--r-- | utils/sv_test.cc | 31 | ||||
-rw-r--r-- | utils/tdict.h | 4 | ||||
-rw-r--r-- | utils/timing_stats.h | 4 | ||||
-rw-r--r-- | utils/verbose.h | 4 | ||||
-rw-r--r-- | utils/weights.h | 4 | ||||
-rw-r--r-- | utils/wordid.h | 4 |
27 files changed, 190 insertions, 45 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am index 727fa8a5..64f6d433 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -22,6 +22,7 @@ libutils_a_SOURCES = \ alias_sampler.h \ alignment_io.h \ array2d.h \ + b64featvector.h \ b64tools.h \ batched_append.h \ city.h \ @@ -70,6 +71,7 @@ libutils_a_SOURCES = \ fast_lexical_cast.hpp \ intrusive_refcount.hpp \ alignment_io.cc \ + b64featvector.cc \ b64tools.cc \ corpus_tools.cc \ dict.cc \ @@ -117,4 +119,3 @@ stringlib_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_U # do NOT NOT NOT add any other -I includes NO NO NO NO NO ###### AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -I. -I$(top_srcdir) -DTEST_DATA=\"$(top_srcdir)/utils/test_data\" ################################################################ - diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h index 81541f7a..0f9d3f6d 100644 --- a/utils/alias_sampler.h +++ b/utils/alias_sampler.h @@ -1,5 +1,5 @@ -#ifndef _ALIAS_SAMPLER_H_ -#define _ALIAS_SAMPLER_H_ +#ifndef ALIAS_SAMPLER_H_ +#define ALIAS_SAMPLER_H_ #include <vector> #include <limits> diff --git a/utils/alignment_io.h b/utils/alignment_io.h index 63fb916b..ec70688e 100644 --- a/utils/alignment_io.h +++ b/utils/alignment_io.h @@ -1,5 +1,5 @@ -#ifndef _ALIGNMENT_IO_H_ -#define _ALIGNMENT_IO_H_ +#ifndef ALIGNMENT_IO_H_ +#define ALIGNMENT_IO_H_ #include <string> #include <iostream> diff --git a/utils/b64featvector.cc b/utils/b64featvector.cc new file mode 100644 index 00000000..c7d08b29 --- /dev/null +++ b/utils/b64featvector.cc @@ -0,0 +1,55 @@ +#include "b64featvector.h" + +#include <sstream> +#include <boost/scoped_array.hpp> +#include "b64tools.h" +#include "fdict.h" + +using namespace std; + +static inline void EncodeFeatureWeight(const string &featname, weight_t weight, + ostream *output) { + output->write(featname.data(), featname.size() + 1); + output->write(reinterpret_cast<char *>(&weight), sizeof(weight_t)); +} + +string EncodeFeatureVector(const SparseVector<weight_t> &vec) { + string b64; + { + ostringstream base64_strm; + { + ostringstream strm; + for (SparseVector<weight_t>::const_iterator it = vec.begin(); + it != vec.end(); ++it) + if (it->second != 0) + EncodeFeatureWeight(FD::Convert(it->first), it->second, &strm); + string data(strm.str()); + B64::b64encode(data.data(), data.size(), &base64_strm); + } + b64 = base64_strm.str(); + } + return b64; +} + +void DecodeFeatureVector(const string &data, SparseVector<weight_t> *vec) { + vec->clear(); + if (data.empty()) return; + // Decode data + size_t b64_len = data.size(), len = b64_len / 4 * 3; + boost::scoped_array<char> buf(new char[len]); + bool res = + B64::b64decode(reinterpret_cast<const unsigned char *>(data.data()), + b64_len, buf.get(), len); + assert(res); + // Apply updates + size_t cur = 0; + while (cur < len) { + string feat_name(buf.get() + cur); + if (feat_name.empty()) break; // Encountered trailing \0 + int feat_id = FD::Convert(feat_name); + weight_t feat_delta = + *reinterpret_cast<weight_t *>(buf.get() + cur + feat_name.size() + 1); + (*vec)[feat_id] = feat_delta; + cur += feat_name.size() + 1 + sizeof(weight_t); + } +} diff --git a/utils/b64featvector.h b/utils/b64featvector.h new file mode 100644 index 00000000..6ac04d44 --- /dev/null +++ b/utils/b64featvector.h @@ -0,0 +1,12 @@ +#ifndef _B64FEATVECTOR_H_ +#define _B64FEATVECTOR_H_ + +#include <string> + +#include "sparse_vector.h" +#include "weights.h" + +std::string EncodeFeatureVector(const SparseVector<weight_t> &); +void DecodeFeatureVector(const std::string &, SparseVector<weight_t> *); + +#endif // _B64FEATVECTOR_H_ diff --git a/utils/b64tools.h b/utils/b64tools.h index c821fc8f..130a9102 100644 --- a/utils/b64tools.h +++ b/utils/b64tools.h @@ -1,5 +1,5 @@ -#ifndef _B64_TOOLS_H_ -#define _B64_TOOLS_H_ +#ifndef B64_TOOLS_H_ +#define B64_TOOLS_H_ namespace B64 { bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize); diff --git a/utils/corpus_tools.h b/utils/corpus_tools.h index f6699d87..3ccaf6ef 100644 --- a/utils/corpus_tools.h +++ b/utils/corpus_tools.h @@ -1,5 +1,5 @@ -#ifndef _CORPUS_TOOLS_H_ -#define _CORPUS_TOOLS_H_ +#ifndef CORPUS_TOOLS_H_ +#define CORPUS_TOOLS_H_ #include <string> #include <set> diff --git a/utils/exp_semiring.h b/utils/exp_semiring.h index 26a22071..164286e3 100644 --- a/utils/exp_semiring.h +++ b/utils/exp_semiring.h @@ -1,5 +1,5 @@ -#ifndef _EXP_SEMIRING_H_ -#define _EXP_SEMIRING_H_ +#ifndef EXP_SEMIRING_H_ +#define EXP_SEMIRING_H_ #include <iostream> #include "star.h" diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 6e2a77cd..1e0ab428 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -1,5 +1,5 @@ -#ifndef _FAST_SPARSE_VECTOR_H_ -#define _FAST_SPARSE_VECTOR_H_ +#ifndef FAST_SPARSE_VECTOR_H_ +#define FAST_SPARSE_VECTOR_H_ // FastSparseVector<T> is a integer indexed unordered map that supports very fast // (mathematical) vector operations when the sizes are very small, and reasonably diff --git a/utils/fdict.h b/utils/fdict.h index eb853fb2..94763890 100644 --- a/utils/fdict.h +++ b/utils/fdict.h @@ -1,5 +1,5 @@ -#ifndef _FDICT_H_ -#define _FDICT_H_ +#ifndef FDICT_H_ +#define FDICT_H_ #ifdef HAVE_CONFIG_H #include "config.h" diff --git a/utils/feature_vector.h b/utils/feature_vector.h index a7b61a66..bf77b5ac 100644 --- a/utils/feature_vector.h +++ b/utils/feature_vector.h @@ -1,5 +1,5 @@ -#ifndef _FEATURE_VECTOR_H_ -#define _FEATURE_VECTOR_H_ +#ifndef FEATURE_VECTOR_H_ +#define FEATURE_VECTOR_H_ #include <vector> #include "sparse_vector.h" diff --git a/utils/filelib.h b/utils/filelib.h index 4fa69760..90620d05 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -1,5 +1,5 @@ -#ifndef _FILELIB_H_ -#define _FILELIB_H_ +#ifndef FILELIB_H_ +#define FILELIB_H_ #include <cassert> #include <string> diff --git a/utils/kernel_string_subseq.h b/utils/kernel_string_subseq.h index 516e8b89..00ee7da7 100644 --- a/utils/kernel_string_subseq.h +++ b/utils/kernel_string_subseq.h @@ -1,5 +1,5 @@ -#ifndef _KERNEL_STRING_SUBSEQ_H_ -#define _KERNEL_STRING_SUBSEQ_H_ +#ifndef KERNEL_STRING_SUBSEQ_H_ +#define KERNEL_STRING_SUBSEQ_H_ #include <vector> #include <cmath> @@ -1,5 +1,5 @@ -#ifndef _M_H_ -#define _M_H_ +#ifndef M_H_HEADER_ +#define M_H_HEADER_ #include <cassert> #include <cmath> diff --git a/utils/murmur_hash3.h b/utils/murmur_hash3.h index a125d775..e8a8b10b 100644 --- a/utils/murmur_hash3.h +++ b/utils/murmur_hash3.h @@ -2,8 +2,8 @@ // MurmurHash3 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ +#ifndef MURMURHASH3_H_ +#define MURMURHASH3_H_ //----------------------------------------------------------------------------- // Platform-specific functions and macros diff --git a/utils/perfect_hash.h b/utils/perfect_hash.h index 29ea48a9..8c12c9f0 100644 --- a/utils/perfect_hash.h +++ b/utils/perfect_hash.h @@ -1,5 +1,5 @@ -#ifndef _PERFECT_HASH_MAP_H_ -#define _PERFECT_HASH_MAP_H_ +#ifndef PERFECT_HASH_MAP_H_ +#define PERFECT_HASH_MAP_H_ #include <vector> #include <boost/utility.hpp> diff --git a/utils/prob.h b/utils/prob.h index bc297870..32ba9a86 100644 --- a/utils/prob.h +++ b/utils/prob.h @@ -1,5 +1,5 @@ -#ifndef _PROB_H_ -#define _PROB_H_ +#ifndef PROB_H_ +#define PROB_H_ #include "logval.h" diff --git a/utils/small_vector.h b/utils/small_vector.h index 280ab72c..f16bc898 100644 --- a/utils/small_vector.h +++ b/utils/small_vector.h @@ -1,5 +1,5 @@ -#ifndef _SMALL_VECTOR_H_ -#define _SMALL_VECTOR_H_ +#ifndef SMALL_VECTOR_H_ +#define SMALL_VECTOR_H_ /* REQUIRES that T is POD (can be memcpy). won't work (yet) due to union with SMALL_VECTOR_POD==0 - may be possible to handle movable types that have ctor/dtor, by using explicit allocation, ctor/dtor calls. but for now JUST USE THIS FOR no-meaningful ctor/dtor POD types. @@ -15,6 +15,7 @@ #include <new> #include <stdint.h> #include <boost/functional/hash.hpp> +#include <boost/serialization/map.hpp> //sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1 @@ -297,6 +298,21 @@ public: return hash_range(data_.ptr,data_.ptr+size_); } + template<class Archive> + void save(Archive & ar, const unsigned int) const { + ar & size_; + for (unsigned i = 0; i < size_; ++i) + ar & (*this)[i]; + } + template<class Archive> + void load(Archive & ar, const unsigned int) { + uint16_t s; + ar & s; + this->resize(s); + for (unsigned i = 0; i < size_; ++i) + ar & (*this)[i]; + } + BOOST_SERIALIZATION_SPLIT_MEMBER() private: union StorageType { T vals[SV_MAX]; diff --git a/utils/small_vector_test.cc b/utils/small_vector_test.cc index a4eb89ae..9e1a148d 100644 --- a/utils/small_vector_test.cc +++ b/utils/small_vector_test.cc @@ -3,6 +3,10 @@ #define BOOST_TEST_MODULE svTest #include <boost/test/unit_test.hpp> #include <boost/test/floating_point_comparison.hpp> +#include <boost/archive/text_oarchive.hpp> +#include <boost/archive/text_iarchive.hpp> +#include <string> +#include <sstream> #include <iostream> #include <vector> @@ -128,3 +132,29 @@ BOOST_AUTO_TEST_CASE(Small) { cerr << sizeof(SmallVectorInt) << endl; cerr << sizeof(vector<int>) << endl; } + +BOOST_AUTO_TEST_CASE(Serialize) { + std::string in; + { + SmallVectorInt v; + v.push_back(0); + v.push_back(1); + v.push_back(-2); + ostringstream os; + boost::archive::text_oarchive oa(os); + oa << v; + in = os.str(); + cerr << in; + } + { + istringstream is(in); + boost::archive::text_iarchive ia(is); + SmallVectorInt v; + ia >> v; + BOOST_CHECK_EQUAL(v.size(), 3); + BOOST_CHECK_EQUAL(v[0], 0); + BOOST_CHECK_EQUAL(v[1], 1); + BOOST_CHECK_EQUAL(v[2], -2); + } +} + diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index 049151f7..13601376 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -1,5 +1,5 @@ -#ifndef _SPARSE_VECTOR_H_ -#define _SPARSE_VECTOR_H_ +#ifndef SPARSE_VECTOR_H_ +#define SPARSE_VECTOR_H_ #include "fast_sparse_vector.h" #define SparseVector FastSparseVector diff --git a/utils/star.h b/utils/star.h index 21977dc9..01433d12 100644 --- a/utils/star.h +++ b/utils/star.h @@ -1,5 +1,5 @@ -#ifndef _STAR_H_ -#define _STAR_H_ +#ifndef STAR_H_ +#define STAR_H_ // star(x) computes the infinite sum x^0 + x^1 + x^2 + ... diff --git a/utils/sv_test.cc b/utils/sv_test.cc index 67df8c57..b006e66d 100644 --- a/utils/sv_test.cc +++ b/utils/sv_test.cc @@ -1,7 +1,12 @@ #define BOOST_TEST_MODULE WeightsTest #include <boost/test/unit_test.hpp> #include <boost/test/floating_point_comparison.hpp> +#include <boost/archive/text_oarchive.hpp> +#include <boost/archive/text_iarchive.hpp> +#include <sstream> +#include <string> #include "sparse_vector.h" +#include "fdict.h" using namespace std; @@ -33,3 +38,29 @@ BOOST_AUTO_TEST_CASE(Division) { x /= -1; BOOST_CHECK(x == y); } + +BOOST_AUTO_TEST_CASE(Serialization) { + string arc; + FD::dict_.clear(); + { + SparseVector<double> x; + x.set_value(FD::Convert("Feature1"), 1.0); + x.set_value(FD::Convert("Pi"), 3.14); + ostringstream os; + boost::archive::text_oarchive oa(os); + oa << x; + arc = os.str(); + } + FD::dict_.clear(); + FD::Convert("SomeNewString"); + { + SparseVector<double> x; + istringstream is(arc); + boost::archive::text_iarchive ia(is); + ia >> x; + cerr << x << endl; + BOOST_CHECK_CLOSE(x.get(FD::Convert("Pi")), 3.14, 1e-9); + BOOST_CHECK_CLOSE(x.get(FD::Convert("Feature1")), 1.0, 1e-9); + } +} + diff --git a/utils/tdict.h b/utils/tdict.h index bb19ecd5..eed33c3a 100644 --- a/utils/tdict.h +++ b/utils/tdict.h @@ -1,5 +1,5 @@ -#ifndef _TDICT_H_ -#define _TDICT_H_ +#ifndef TDICT_H_ +#define TDICT_H_ #include <string> #include <vector> diff --git a/utils/timing_stats.h b/utils/timing_stats.h index 0a9f7656..69a1cf4b 100644 --- a/utils/timing_stats.h +++ b/utils/timing_stats.h @@ -1,5 +1,5 @@ -#ifndef _TIMING_STATS_H_ -#define _TIMING_STATS_H_ +#ifndef TIMING_STATS_H_ +#define TIMING_STATS_H_ #include <string> #include <map> diff --git a/utils/verbose.h b/utils/verbose.h index 73476383..e39e23cb 100644 --- a/utils/verbose.h +++ b/utils/verbose.h @@ -1,5 +1,5 @@ -#ifndef _VERBOSE_H_ -#define _VERBOSE_H_ +#ifndef VERBOSE_H_ +#define VERBOSE_H_ extern bool SILENT; diff --git a/utils/weights.h b/utils/weights.h index 920fdd75..0bd4c2d9 100644 --- a/utils/weights.h +++ b/utils/weights.h @@ -1,5 +1,5 @@ -#ifndef _WEIGHTS_H_ -#define _WEIGHTS_H_ +#ifndef WEIGHTS_H_ +#define WEIGHTS_H_ #include <string> #include <vector> diff --git a/utils/wordid.h b/utils/wordid.h index 714dcd0b..3aa6cc23 100644 --- a/utils/wordid.h +++ b/utils/wordid.h @@ -1,5 +1,5 @@ -#ifndef _WORD_ID_H_ -#define _WORD_ID_H_ +#ifndef WORD_ID_H_ +#define WORD_ID_H_ #include <limits> |