diff options
-rwxr-xr-x | decoder/apply_fsa_models.cc | 20 | ||||
-rwxr-xr-x | decoder/apply_fsa_models.h | 27 | ||||
-rw-r--r-- | decoder/cdec.cc | 2 | ||||
-rwxr-xr-x | decoder/cfg.cc | 22 | ||||
-rwxr-xr-x | decoder/cfg.h | 16 | ||||
-rwxr-xr-x | decoder/cfg_options.h | 1 | ||||
-rwxr-xr-x | decoder/program_options.h | 25 | ||||
-rwxr-xr-x | utils/static_utoa.h | 24 | ||||
-rwxr-xr-x | utils/utoa.h | 199 |
9 files changed, 245 insertions, 91 deletions
diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc index 1c30eb90..2854b28b 100755 --- a/decoder/apply_fsa_models.cc +++ b/decoder/apply_fsa_models.cc @@ -13,6 +13,8 @@ using namespace std; +DEFINE_NAMED_ENUM(FSA_BY) + struct ApplyFsa { ApplyFsa(HgCFG &i, const SentenceMetadata& smeta, @@ -74,6 +76,7 @@ void ApplyFsaModels(HgCFG &i, a.Compute(); } +/* namespace { char const* anames[]={ "BU_CUBE", @@ -82,14 +85,18 @@ char const* anames[]={ 0 }; } +*/ //TODO: named enum type in boost? std::string ApplyFsaBy::name() const { - return anames[algorithm]; +// return anames[algorithm]; + return GetName(algorithm); } std::string ApplyFsaBy::all_names() { + return FsaByNames(" "); + /* std::ostringstream o; for (int i=0;i<N_ALGORITHMS;++i) { assert(anames[i]); @@ -97,19 +104,24 @@ std::string ApplyFsaBy::all_names() { o<<anames[i]; } return o.str(); + */ } ApplyFsaBy::ApplyFsaBy(std::string const& n, int pop_limit) : pop_limit(pop_limit) { - algorithm=0; std::string uname=toupper(n); + algorithm=GetFsaBy(uname); +/*anames=0; while(anames[algorithm] && anames[algorithm] != uname) ++algorithm; if (!anames[algorithm]) throw std::runtime_error("Unknown ApplyFsaBy type: "+n+" - legal types: "+all_names()); +*/ } -ApplyFsaBy::ApplyFsaBy(int i, int pop_limit) : pop_limit(pop_limit) { - if (i<0 || i>=N_ALGORITHMS) +ApplyFsaBy::ApplyFsaBy(FsaBy i, int pop_limit) : pop_limit(pop_limit) { +/* if (i<0 || i>=N_ALGORITHMS) throw std::runtime_error("Unknown ApplyFsaBy type id: "+itos(i)+" - legal types: "+all_names()); +*/ + GetName(i); // checks validity algorithm=i; } diff --git a/decoder/apply_fsa_models.h b/decoder/apply_fsa_models.h index 5120fb4e..6561c70c 100755 --- a/decoder/apply_fsa_models.h +++ b/decoder/apply_fsa_models.h @@ -4,25 +4,36 @@ #include <string> #include <iostream> #include "feature_vector.h" +#include "named_enum.h" struct FsaFeatureFunction; struct Hypergraph; struct SentenceMetadata; struct HgCFG; + +#define FSA_BY(X,t) \ + X(t,BU_CUBE,) \ + X(t,BU_FULL,) \ + X(t,EARLEY,) \ + +#define FSA_BY_TYPE FsaBy + +DECLARE_NAMED_ENUM(FSA_BY) + struct ApplyFsaBy { - enum { - BU_CUBE, - BU_FULL, - EARLEY, - N_ALGORITHMS - }; +/*enum { + BU_CUBE, + BU_FULL, + EARLEY, + N_ALGORITHMS + };*/ int pop_limit; // only applies to BU_FULL so far bool IsBottomUp() const { return algorithm==BU_FULL || algorithm==BU_CUBE; } int BottomUpAlgorithm() const; - int algorithm; + FsaBy algorithm; std::string name() const; friend inline std::ostream &operator << (std::ostream &o,ApplyFsaBy const& c) { o << c.name(); @@ -30,7 +41,7 @@ struct ApplyFsaBy { o << "("<<c.pop_limit<<")"; return o; } - explicit ApplyFsaBy(int alg, int poplimit=200); + explicit ApplyFsaBy(FsaBy alg, int poplimit=200); ApplyFsaBy(std::string const& name, int poplimit=200); ApplyFsaBy(const ApplyFsaBy &o) : algorithm(o.algorithm) { } static std::string all_names(); // space separated diff --git a/decoder/cdec.cc b/decoder/cdec.cc index 3633febd..5898b245 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -193,7 +193,7 @@ void InitCommandLine(int argc, char** argv, OracleBleu &ob, po::variables_map* c dconfig_options.add(opts).add(cfgo); //add(opts).add(cfgo) dcmdline_options.add(dconfig_options).add(clo); - + argv_minus_to_underscore(argc,argv); po::store(parse_command_line(argc, argv, dcmdline_options), conf); if (conf.count("compgen")) { print_options(cout,dcmdline_options); diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 81a17355..aa9e5f30 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -11,13 +11,21 @@ using namespace std; namespace { CFG::BinRhs nullrhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min()); -} - -WordID CFG::BinName(BinRhs const& b) +// index i >= N.size()? then it's in M[i-N.size()] +WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) { + int nn=N.size(); ostringstream o; -#define BinNameOWORD(w) do { int n=w; if (n>0) o << TD::Convert(n); else { o << 'V' << -n; } } while(0) +#define BinNameOWORD(w) \ + do { \ + int n=w; if (n>0) o << TD::Convert(n); \ + else { \ + int i=-n; \ + CFG::NT const&nt = i<nn?N[i]:M[i-nn]; \ + o << nt.from << i; } \ + } while(0) + BinNameOWORD(b.first); o<<'+'; BinNameOWORD(b.second); @@ -25,6 +33,10 @@ WordID CFG::BinName(BinRhs const& b) return TD::Convert(o.str()); } +} + + + void CFG::Binarize(CFGBinarize const& b) { if (!b.Binarizing()) return; if (!b.bin_l2r) { @@ -57,7 +69,7 @@ void CFG::Binarize(CFGBinarize const& b) { new_nts.back().ruleids.push_back(newruleid); new_rules.push_back(Rule(newnt,bin)); if (b.bin_name_nts) - new_nts.back().from.nt=BinName(bin); + new_nts.back().from.nt=BinName(bin,nts,new_nts); ++newnt;++newruleid; } } diff --git a/decoder/cfg.h b/decoder/cfg.h index 64924f14..e1f818e8 100755 --- a/decoder/cfg.h +++ b/decoder/cfg.h @@ -40,11 +40,10 @@ struct CFG { typedef std::vector<RuleHandle> Ruleids; void print_nt_name(std::ostream &o,NTHandle n) const { - o << nts[n].from; + o << nts[n].from << n; } typedef std::pair<int,int> BinRhs; - WordID BinName(BinRhs const& b); struct Rule { // for binarizing - no costs/probs @@ -106,16 +105,17 @@ struct CFG { swap(goal_nt,o.goal_nt); } void Binarize(CFGBinarize const& binarize_options); + + typedef std::vector<NT> NTs; + NTs nts; + typedef std::vector<Rule> Rules; + Rules rules; + int goal_nt; + prob_t goal_inside,pushed_inside; // when we push viterbi weights to goal, we store the removed probability in pushed_inside protected: bool uninit; Hypergraph const* hg_; // shouldn't be used for anything, esp. after binarization - prob_t goal_inside,pushed_inside; // when we push viterbi weights to goal, we store the removed probability in pushed_inside // rules/nts will have same index as hg edges/nodes - typedef std::vector<Rule> Rules; - Rules rules; - typedef std::vector<NT> NTs; - NTs nts; - int goal_nt; }; inline void swap(CFG &a,CFG &b) { diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h index 956586f0..acd8d05b 100755 --- a/decoder/cfg_options.h +++ b/decoder/cfg_options.h @@ -28,6 +28,7 @@ struct CFGOptions { void Validate() { format.Validate(); binarize.Validate(); +// if (cfg_output.empty()) binarize.bin_name_nts=false; } char const* description() const { return "CFG output options"; diff --git a/decoder/program_options.h b/decoder/program_options.h index 251f5680..87afb320 100755 --- a/decoder/program_options.h +++ b/decoder/program_options.h @@ -13,6 +13,31 @@ #include <iosfwd> +// change --opt-name=x --opt_name=x for all strings x. danger: probably the argv from int main isn't supposed to be modified? +inline int arg_minusto_underscore(char *s) { + if (!*s || *s++ != '-') return 0; + if (!*s || *s++ != '-') return 0; + int chars_replaced=0; + for(;*s;++s) { + if (*s=='=') + break; + if (*s=='-') { + *s='_'; + ++chars_replaced; + } + } + return chars_replaced; +} + +inline +int argv_minus_to_underscore(int argc, char **argv) { + int chars_replaced=0; + for (int i=1;i<argc;++i) { + chars_replaced+=arg_minusto_underscore(argv[i]); + } + return chars_replaced; +} + template <class T> boost::program_options::typed_value<T>* defaulted_value(T *v) diff --git a/utils/static_utoa.h b/utils/static_utoa.h index 3af9fbb6..d15ed35b 100755 --- a/utils/static_utoa.h +++ b/utils/static_utoa.h @@ -5,35 +5,19 @@ #include "utoa.h" namespace { +static const int utoa_bufsize=40; // 64bit safe. +static const int utoa_bufsizem1=utoa_bufsize-1; // 64bit safe. THREADLOCAL char utoa_buf[utoa_bufsize]; // to put end of string character at buf[20] } inline char *static_utoa(unsigned n) { + assert(utoa_buf[utoa_bufsizem1]==0); return utoa(utoa_buf+utoa_bufsizem1,n); } inline char *static_itoa(int n) { + assert(utoa_buf[utoa_bufsizem1]==0); return itoa(utoa_buf+utoa_bufsizem1,n); } -#ifdef ITOA_SAMPLE -# include <cstdio> -# include <sstream> -# include <iostream> -using namespace std; - -int main(int argc,char *argv[]) { - printf("d U d U d U\n"); - for (int i=1;i<argc;++i) { - int n; - unsigned un; - sscanf(argv[i],"%d",&n); - sscanf(argv[i],"%u",&un); - printf("%d %u %s",n,un,static_itoa(n)); - printf(" %s %s %s\n",static_utoa(un),itos(n).c_str(),utos(un).c_str()); - } - return 0; -} -#endif - #endif diff --git a/utils/utoa.h b/utils/utoa.h index 8d304f97..341965cc 100755 --- a/utils/utoa.h +++ b/utils/utoa.h @@ -1,19 +1,61 @@ #ifndef UTOA_H #define UTOA_H + #include <stdint.h> #include <string> #include <cstring> +#include <limits> +// define this if you're paranoid about converting 0-9 (int) to 0-9 (char) by adding to '0', which is safe for ascii, utf8, etc. #ifndef DIGIT_LOOKUP_TABLE # define DIGIT_LOOKUP_TABLE 0 #endif +//TODO: 3 decimal digit lookup table, divide by 1000 faster? +//TODO: benchmark these two (also, some assembly that does effectively divmod?) +#if 1 +// maybe this is faster than mod because we are already dividing +#define NDIV10MOD(rem,n) rem = n; n /= 10; rem -= 10*n; +#else +// or maybe optimizer does it just as well or better with this: +#define NDIV10MOD(rem,n) rem = n%10; n = n/10; +#endif + +template <class T> +struct signed_for_int { +}; + + +#define DEFINE_SIGNED_FOR_3(t,it,ut) \ +template <> \ +struct signed_for_int<t> { \ + typedef ut unsigned_t; \ + typedef it signed_t; \ + typedef t original_t; \ + enum { toa_bufsize = 3 + std::numeric_limits<t>::digits10, toa_bufsize_minus_1=toa_bufsize-1 }; \ +}; + +// toa_bufsize will hold enough chars for a c string converting to sign,digits (for both signed and unsigned types), because normally an unsigned would only need 2 extra chars. we reserve 3 explicitly for the case that itoa(buf,UINT_MAX,true) is called, with output +4...... + +#define DEFINE_SIGNED_FOR(it) DEFINE_SIGNED_FOR_3(it,it,u ## it) \ + DEFINE_SIGNED_FOR_3(u ## it,it,u ## it) + +DEFINE_SIGNED_FOR(int8_t) +DEFINE_SIGNED_FOR(int16_t) +DEFINE_SIGNED_FOR(int32_t) +DEFINE_SIGNED_FOR(int64_t) +//DEFINE_SIGNED_FOR_3(int,int,unsigned) +//DEFINE_SIGNED_FOR_3(unsigned,int,unsigned) + +/* // The largest 32-bit integer is 4294967295, that is 10 chars // 1 more for sign, and 1 for 0-termination of string -// generally: 2 + std::numeric_limits<T>::is_signed + std::numeric_limits<T>::digits10 const unsigned utoa_bufsize=12; const unsigned utoa_bufsizem1=utoa_bufsize-1; +const unsigned ultoa_bufsize=22; +const unsigned ultoa_bufsizem1=utoa_bufsize-1; +*/ #ifdef DIGIT_LOOKUP_TABLE namespace { @@ -30,72 +72,139 @@ inline char digit_to_char(int d) { #endif } -// returns n in string [return,num); *num=0 yourself before calling if you want a c_str -inline char *utoa(char *num,uint32_t n) { - if ( !n ) { - *--num='0'; + +// returns n in string [return,num); *num=0 yourself before calling if you want a c_str. in other words, the sequence [ret,buf) contains the written digits +template <class Int> +char *utoa(char *buf,Int n_) { + typedef typename signed_for_int<Int>::unsigned_t Uint; + Uint n=n_; + if (!n) { + *--buf='0'; } else { - uint32_t rem; + Uint rem; // 3digit lookup table, divide by 1000 faster? - while ( n ) { -#if 1 - rem = n; - n /= 10; - rem -= 10*n; // maybe this is faster than mod because we are already dividing -#else - rem = n%10; // would optimizer combine these together? - n = n/10; -#endif - *--num = digit_to_char(rem); + while (n) { + NDIV10MOD(rem,n); + *--buf = digit_to_char(rem); } } - return num; + return buf; } -inline char *itoa(char *p,int32_t n) { - if (n<0) { - p=utoa(p,-n); // (unsigned)(-INT_MIN) == 0x1000000 in 2s complement and not == 0. - *--p='-'; - return p; - } else - return utoa(p,n); +// left_pad_0(buf,utoa(buf+bufsz,n)) means that [buf,buf+bufsz) is a left-0 padded seq. of digits. no 0s are added if utoa is already past buf (you must have ensured that this is valid memory, naturally) +inline void left_pad(char *left,char *buf,char pad='0') { + while (buf>left) + *--buf=pad; + // return buf; } -inline std::string utos(uint32_t n) { - char buf[utoa_bufsize]; - char *end=buf+utoa_bufsize; - char *p=utoa(end,n); +template <class Int> +char *utoa_left_pad(char *buf,char *bufend,Int n, char pad='0') { + char *r=utoa(bufend,n); + assert(buf<=r); + left_pad(buf,r,pad); + return buf; +} + +// note: 0 -> 0, but otherwise x000000 -> x (x has no trailing 0s). same conditions as utoa; [ret,buf) gives the sequence of digits +// useful for floating point fraction output +template <class Uint_> +char *utoa_drop_trailing_0(char *buf,Uint_ n_, unsigned &n_skipped) { + typedef typename signed_for_int<Uint_>::unsigned_t Uint; + Uint n=n_; + n_skipped=0; + if (!n) { + *--buf='0'; + return buf; + } else { + Uint rem; + while (n) { + NDIV10MOD(rem,n); + if (rem) { + *--buf = digit_to_char(rem); + // some non-0 trailing digits; now output all digits. + while (n) { + NDIV10MOD(rem,n); + *--buf = digit_to_char(rem); + } + return buf; + } + ++n_skipped; + } + assert(0); + return 0; + } +} + +// desired feature: itoa(unsigned) = utoa(unsigned) +// positive sign: 0 -> +0, 1-> +1. obviously -n -> -n +template <class Int> +//typename signed_for_int<Int>::original_t instead of Int to give more informative wrong-type message? +char *itoa(char *buf,Int i,bool positive_sign=false) { + typename signed_for_int<Int>::unsigned_t n=i; + if (i<0) + n=-n; //sidesteps 2s complement issue doing this rather than just u=-n. + char * ret=utoa(buf,n); + if (i<0) { + *--ret='-'; + } else if (positive_sign) + *--ret='+'; + return ret; +} + +template <class Int> +char * itoa_left_pad(char *buf,char *bufend,Int i,bool positive_sign=false,char pad='0') { + typename signed_for_int<Int>::unsigned_t n=i; + if (i<0) { + n=-n; //sidesteps 2s complement issue doing this rather than just u=-n. + *buf='-'; + } else if (positive_sign) + *buf='+'; + char * r=utoa(bufend,n); + assert(buf<r); + left_pad(buf+1,r,pad); + return buf; +} + +template <class Int> +inline std::string itos(Int n) { + char buf[signed_for_int<Int>::toa_bufsize]; + char *end=buf+signed_for_int<Int>::toa_bufsize; + char *p=itoa(end,n); return std::string(p,end); } -inline std::string itos(int32_t n) { - char buf[utoa_bufsize]; - char *end=buf+utoa_bufsize; +template <class Int> +inline std::string utos(Int n) { + char buf[signed_for_int<Int>::toa_bufsize]; + char *end=buf+signed_for_int<Int>::toa_bufsize; char *p=itoa(end,n); return std::string(p,end); } //returns position of '\0' terminating number written starting at to -inline char* append_utoa(char *to,uint32_t n) { - char buf[utoa_bufsize]; - char *end=buf+utoa_bufsize; - char *s=utoa(end,n); - int ns=end-s; - std::memcpy(to,s,ns); +template <class Int> +inline char* append_utoa(char *to,typename signed_for_int<Int>::unsigned_t n) { + char buf[signed_for_int<Int>::toa_bufsize]; + char *end=buf+signed_for_int<Int>::toa_bufsize; + char *p=itoa(end,n); + int ns=end-p; + std::memcpy(to,p,ns); to+=ns; - *to++=0; + *to=0; return to; } //returns position of '\0' terminating number written starting at to -inline char* append_itoa(char *to,int32_t n) { - char buf[utoa_bufsize]; - char *end=buf+utoa_bufsize; - char *s=itoa(end,n); - int ns=end-s; - std::memcpy(to,s,ns); +template <class Int> +inline char* append_itoa(char *to,typename signed_for_int<Int>::signed_t n) { + char buf[signed_for_int<Int>::toa_bufsize]; + char *end=buf+signed_for_int<Int>::toa_bufsize; + char *p=utoa(end,n); + int ns=end-p; + std::memcpy(to,p,ns); to+=ns; - *to++=0; + *to=0; return to; } |