From 8c0e4a5c1f168a419b3a236a94815c97164bddbc Mon Sep 17 00:00:00 2001 From: "Wu, Ke" Date: Sun, 12 Oct 2014 16:30:02 -0400 Subject: Cherry picked Mr.MIRA compatibility mode code --- utils/Makefile.am | 3 ++- utils/b64featvector.cc | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ utils/b64featvector.h | 12 +++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 utils/b64featvector.cc create mode 100644 utils/b64featvector.h (limited to 'utils') diff --git a/utils/Makefile.am b/utils/Makefile.am index 727fa8a5..64f6d433 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -22,6 +22,7 @@ libutils_a_SOURCES = \ alias_sampler.h \ alignment_io.h \ array2d.h \ + b64featvector.h \ b64tools.h \ batched_append.h \ city.h \ @@ -70,6 +71,7 @@ libutils_a_SOURCES = \ fast_lexical_cast.hpp \ intrusive_refcount.hpp \ alignment_io.cc \ + b64featvector.cc \ b64tools.cc \ corpus_tools.cc \ dict.cc \ @@ -117,4 +119,3 @@ stringlib_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_U # do NOT NOT NOT add any other -I includes NO NO NO NO NO ###### AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -I. 
-I$(top_srcdir) -DTEST_DATA=\"$(top_srcdir)/utils/test_data\"
 ################################################################
-
diff --git a/utils/b64featvector.cc b/utils/b64featvector.cc
new file mode 100644
index 00000000..c7d08b29
--- /dev/null
+++ b/utils/b64featvector.cc
@@ -0,0 +1,75 @@
+#include "b64featvector.h"
+
+#include <cassert>
+#include <cstring>
+#include <sstream>
+#include <boost/scoped_array.hpp>
+#include "b64tools.h"
+#include "fdict.h"
+
+using namespace std;
+
+// Writes one record to *output: the feature name followed by its terminating
+// \0 byte, then the sizeof(weight_t) raw bytes of the weight.
+static inline void EncodeFeatureWeight(const string &featname, weight_t weight,
+                                       ostream *output) {
+  // c_str(), not data(): the byte at index size() is written as the record
+  // separator, and before C++11 only c_str() guarantees it is a \0.
+  output->write(featname.c_str(), featname.size() + 1);
+  output->write(reinterpret_cast<const char *>(&weight), sizeof(weight_t));
+}
+
+// Returns the base64 encoding of the concatenated records of every non-zero
+// entry of vec (see EncodeFeatureWeight for the record layout).
+string EncodeFeatureVector(const SparseVector<weight_t> &vec) {
+  ostringstream raw;
+  for (SparseVector<weight_t>::const_iterator it = vec.begin();
+       it != vec.end(); ++it) {
+    if (it->second != 0)
+      EncodeFeatureWeight(FD::Convert(it->first), it->second, &raw);
+  }
+  const string bytes(raw.str());
+  ostringstream encoded;
+  B64::b64encode(bytes.data(), bytes.size(), &encoded);
+  return encoded.str();
+}
+
+// Clears *vec, then assigns (*vec)[id] for each record found in the base64
+// string data (the layout written by EncodeFeatureVector). Parsing stops at
+// an empty feature name or at the first record that does not fit inside the
+// decoded buffer. If B64::b64decode reports failure the assert fires; when
+// asserts are compiled out, *vec is left empty instead.
+void DecodeFeatureVector(const string &data, SparseVector<weight_t> *vec) {
+  vec->clear();
+  if (data.empty()) return;
+  // Decode data
+  const size_t b64_len = data.size(), len = b64_len / 4 * 3;
+  // Value-initialized, so any byte the decoder does not write reads as \0.
+  boost::scoped_array<char> buf(new char[len]());
+  // NOTE(review): cast target type assumed from B64::b64decode's declaration
+  // in b64tools.h -- confirm.
+  const bool res =
+      B64::b64decode(reinterpret_cast<const unsigned char *>(data.data()),
+                     b64_len, buf.get(), len);
+  assert(res);
+  if (!res) return;  // assert() is compiled out under NDEBUG
+  // Apply updates
+  const char *const bytes = buf.get();
+  size_t cur = 0;
+  while (cur < len) {
+    // Search for the name terminator inside the decoded buffer only.
+    const void *nul = memchr(bytes + cur, '\0', len - cur);
+    if (!nul) break;  // Unterminated name: malformed input
+    const size_t name_len = static_cast<const char *>(nul) - (bytes + cur);
+    if (name_len == 0) break;  // Encountered trailing \0
+    const size_t weight_pos = cur + name_len + 1;
+    if (len - weight_pos < sizeof(weight_t)) break;  // Truncated weight
+    string feat_name(bytes + cur, name_len);
+    int feat_id = FD::Convert(feat_name);
+    // memcpy instead of a pointer cast: weight_pos need not be aligned for
+    // weight_t.
+    weight_t feat_delta;
+    memcpy(&feat_delta, bytes + weight_pos, sizeof(weight_t));
+    (*vec)[feat_id] = feat_delta;
+    cur = weight_pos + sizeof(weight_t);
+  }
+}
diff --git a/utils/b64featvector.h b/utils/b64featvector.h
new file mode 100644
index 00000000..6ac04d44
--- /dev/null
+++ b/utils/b64featvector.h
@@ -0,0 +1,12 @@
+#ifndef _B64FEATVECTOR_H_
+#define _B64FEATVECTOR_H_
+
+#include <string>
+
+#include "sparse_vector.h"
+#include "weights.h"
+
+std::string EncodeFeatureVector(const SparseVector<weight_t> &);
+void DecodeFeatureVector(const std::string &, SparseVector<weight_t> *);
+
+#endif // _B64FEATVECTOR_H_
--
cgit v1.2.3