diff options
| -rwxr-xr-x | decoder/cfg.cc | 12 | ||||
| -rwxr-xr-x | decoder/cfg.h | 8 | ||||
| -rwxr-xr-x | decoder/cfg_format.h | 36 | ||||
| -rwxr-xr-x | decoder/program_options.h | 283 | ||||
| -rw-r--r-- | decoder/stringlib.h | 28 | ||||
| -rw-r--r-- | decoder/trule.cc | 24 | ||||
| -rw-r--r-- | decoder/trule.h | 2 | 
7 files changed, 388 insertions, 5 deletions
| diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 0f20ba0f..b83fc54d 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -1,5 +1,6 @@  #include "cfg.h"  #include "hg.h" +#include "cfg_format.h"  using namespace std; @@ -10,6 +11,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus    pushed_inside=push_weights ? goal_inside : prob_t(1);    int nn=hg.nodes_.size(),ne=hg.edges_.size();    nts.resize(nn); +  goal_nt=nn-1;    rules.resize(ne);    for (int i=0;i<nn;++i)      nts[i].ruleids=hg.nodes_[i].in_edges_; @@ -40,3 +42,13 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus      }    }  } + +namespace { +} + +void CFG::Print(std::ostream &o,CFGFormat const& f) const { +  char const* partsep=" ||| "; +  if (!f.goal_nt_name.empty()) +    o << '['<<f.goal_nt_name <<']' << partsep; // print rhs +  //TODO: +} diff --git a/decoder/cfg.h b/decoder/cfg.h index 9e1b2837..8d7a5eee 100755 --- a/decoder/cfg.h +++ b/decoder/cfg.h @@ -27,9 +27,7 @@  #include "small_vector.h"  class Hypergraph; -class CFG; - - +class CFGFormat; // #include "cfg_format.h"  struct CFG {    typedef int RuleHandle; @@ -48,7 +46,7 @@ struct CFG {    };    struct NT { -    Ruleids ruleids; // index into CFG rules with this lhs +    Ruleids ruleids; // index into CFG rules with lhs = this NT.  aka in_edges_    };    CFG() : hg_() {  } @@ -58,6 +56,7 @@ struct CFG {      Init(hg,target_side,copy_features,push_weights);    }    void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true); +  void Print(std::ostream &o,CFGFormat const& format) const; // see cfg_format.h  protected:    Hypergraph const* hg_; // shouldn't be used for anything, esp. after binarization    prob_t goal_inside,pushed_inside; // when we push viterbi weights to goal, we store the removed probability in pushed_inside @@ -66,6 +65,7 @@ protected:    Rules rules;    typedef std::vector<NT> NTs;    NTs nts; +  int goal_nt;  };  #endif diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h new file mode 100755 index 00000000..1bce3d06 --- /dev/null +++ b/decoder/cfg_format.h @@ -0,0 +1,36 @@ +#ifndef CFG_FORMAT_H +#define CFG_FORMAT_H + +#include <program_options.h> +#include <string> + +struct CFGFormat { +  bool identity_scfg;bool features;bool logprob_feat;bool cfg_comma_nt;std::string goal_nt_name;std::string nt_prefix; +  template <class Opts> // template to support both printable_opts and boost nonprintable +  void AddOptions(Opts *opts) { +    using namespace boost::program_options; +    using namespace std; +    opts->add_options() +      ("identity_scfg",defaulted_value(&identity_scfg),"output an identity SCFG: add an identity target side - '[X12] ||| [X13,1] a ||| [1] a ||| feat= ...' - the redundant target '[1] a |||' is omitted otherwise.") +      ("features",defaulted_value(&features),"print the CFG feature vector") +      ("logprob_feat",defaulted_value(&logprob_feat),"print a LogProb=-1.5 feature irrespective of --features.") +      ("cfg_comma_nt",defaulted_value(&cfg_comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side") +      ("goal_nt_name",defaulted_value(&goal_nt_name),"if nonempty, the first production will be '[goal_nt_name] ||| [x123] ||| LogProb=y' where x123 is the actual goal nt, and y is the pushed prob, if any") +      ("nt_prefix",defaulted_value(&nt_prefix),"NTs are [<nt_prefix>123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph") +      ; +  } +  void set_defaults() { +    identity_scfg=false; +    features=true; +    logprob_feat=true; +    cfg_comma_nt=true; +    goal_nt_name="S"; +    nt_prefix=""; +  } +  CFGFormat() { +    set_defaults(); +  } +}; + + +#endif diff --git a/decoder/program_options.h b/decoder/program_options.h new file mode 100755 index 00000000..251f5680 --- /dev/null +++ b/decoder/program_options.h @@ -0,0 +1,283 @@ +#ifndef PROGRAM_OPTIONS_H +#define PROGRAM_OPTIONS_H + +/* wraps boost options_description as printable_opts - show the options values used as well as defaults in usage or log messages.  boost program options library lacks any concept of printing configured values; it only supports parsing them from strings */ + +#ifdef _WIN32 +#include <iso646.h> +#endif +#include <boost/program_options.hpp> +#include <boost/function.hpp> +#include <boost/shared_ptr.hpp> +#include <stdexcept> +#include <iosfwd> + + +template <class T> +boost::program_options::typed_value<T>* +defaulted_value(T *v) +{ +  return boost::program_options::value<T>(v)->default_value(*v); +} + +inline void program_options_fatal(std::string const& msg) { +  throw std::runtime_error(msg); +} + + +inline std::string const& get_single_arg(boost::any& v,std::vector<std::string> const& values) +{ +  boost::program_options::validators::check_first_occurrence(v); +  return boost::program_options::validators::get_single_string(values); +} + +template <class I> +void must_complete_read(I &in,std::string const& msg="Couldn't parse") +{ +  char c; +  if (in.bad()) +    program_options_fatal(msg + " - failed input"); +  if (in >> c) +    program_options_fatal(msg + " - got extra char: " + std::string(c,1)); +} + +template <class Ostream> +struct any_printer  : public boost::function<void (Ostream &,boost::any const&)> +{ +  typedef boost::function<void (Ostream &,boost::any const&)> F; + +  template <class T> +  struct typed_print +  { +    void operator()(Ostream &o,boost::any const& t) const +    { +      o << *boost::any_cast<T const>(&t); +    } +  }; + +  template <class T> +  static +  void typed_print_template(Ostream &o,boost::any const& t) +  { +    o << *boost::any_cast<T const>(&t); +  } + +  any_printer() {} + +  any_printer(const any_printer& x) +    : F(static_cast<F const&>(x)) +  {} + +  template <class T> +  explicit any_printer(T const* tag) : F(typed_print<T>()) { +  } + +  template <class T> +  void set() +  { +    F f(typed_print<T>()); +    swap(f); +  } +}; + +// have to wrap regular options_description and store our own tables because +// author didn't make enough stuff protected/public or add a virtual print +// method to value_semantic +template <class Ostream> +struct printable_options_description +  : public boost::program_options::options_description +{ +  typedef printable_options_description<Ostream> self_type; +  typedef boost::program_options::options_description options_description; +  typedef boost::program_options::option_description option_description; +  typedef boost::shared_ptr<self_type> group_type; +  typedef std::vector<group_type > groups_type; + +  struct printable_option +  { +    typedef boost::shared_ptr<option_description> OD; + +    any_printer<Ostream> print; +    OD od; +    bool in_group; + +    std::string const& name() +    { return od->long_name(); } + +    std::string const& description() +    { return od->description(); } + +    std::string const& vmkey() +    { +      return od->key(name()); +    } +    template <class T> +    printable_option(T *tag, OD const& od) : print(tag),od(od),in_group(false) {} +    printable_option() : in_group(false) {} +  }; +  typedef std::vector<printable_option > options_type; +  BOOST_STATIC_CONSTANT(unsigned,default_linewrap=80); // options_description::m_default_line_length +  printable_options_description(unsigned line_length = default_linewrap) : +    options_description(line_length) {} + +  printable_options_description(const std::string& caption, +                                unsigned line_length = default_linewrap) +    : options_description(caption,line_length), caption(caption) {} + +  self_type &add_options() +  { return *this; } + +  template <class T,class C> +  self_type & +  operator()(char const* name, +             boost::program_options::typed_value<T,C> *val, +             char const*description=NULL) +  { +    printable_option opt((T *)0,simple_add(name,val,description)); +    pr_options.push_back(opt); +    return *this; +  } + +  self_type& +  add(self_type const& desc) +  { +    options_description::add(desc); +    groups.push_back(group_type(new self_type(desc))); +    for (typename options_type::const_iterator i=desc.pr_options.begin(),e=desc.pr_options.end(); +         i!=e;++i) { +      pr_options.push_back(*i); +      pr_options.back().in_group=true; +    } +    return *this; +  } + +  void print_option(Ostream &o, +                    printable_option &opt, +                    boost::program_options::variable_value const & var, +                    bool only_value=false) +  { +    using namespace boost; +    using namespace boost::program_options; +    using namespace std; +    string const& name=opt.name(); +    if (!only_value) { +      if (var.defaulted()) +        o << "#DEFAULTED# "; +      if (var.empty()) { +        o << "#EMPTY# "<<name; +        return; +      } +      o << name<<" = "; +    } +    opt.print(o,var.value()); +  } + +  enum { SHOW_DEFAULTED=0x1 +         , SHOW_EMPTY=0x2 +         , SHOW_DESCRIPTION=0x4 +         ,  SHOW_HIERARCHY=0x8 +         ,  SHOW_ALL=0x0FFF +         ,  SHOW_HELP=0x1000 +  }; + +  void print(Ostream &o, +             boost::program_options::variables_map &vm, +             int show_flags=SHOW_DESCRIPTION & SHOW_DEFAULTED & SHOW_HIERARCHY) +  { +    const bool show_defaulted=bool(show_flags & SHOW_DEFAULTED); +    const bool show_description=bool(show_flags & SHOW_DESCRIPTION); +    const bool hierarchy=bool(show_flags & SHOW_HIERARCHY); +    const bool show_empty=bool(show_flags & SHOW_EMPTY); +    const bool show_help=bool(show_flags & SHOW_HELP); + +    using namespace boost::program_options; +    using namespace std; +    o << "### " << caption << endl; +    for (typename options_type::iterator i=pr_options.begin(),e=pr_options.end(); +         i!=e;++i) { +      printable_option & opt=*i; +      if (!show_help && opt.name()=="help") +        continue; +      if (hierarchy and opt.in_group) +        continue; +      variable_value const & var=vm[opt.vmkey()]; +      if (var.defaulted() && !show_defaulted) +        continue; +      if (var.empty() && !show_empty) +        continue; +      if (show_description) +        o << "# " << opt.description() << endl; +      print_option(o,opt,var); +      o << endl; +    } +    o << endl; +    if (hierarchy) +      for (typename groups_type::iterator i=groups.begin(),e=groups.end(); +           i!=e;++i) +        (*i)->print(o,vm,show_flags); +  } + +/// parses arguments, then stores/notifies from opts->vm.  returns unparsed +/// options and positional arguments, but if not empty, throws exception unless +/// allow_unrecognized_positional is true +  std::vector<std::string> +  parse_options(int argc,char **argv, +                boost::program_options::variables_map &vm, +                boost::program_options::positional_options_description *po=NULL, +                bool allow_unrecognized_positional=false, +                bool allow_unrecognized_opts=false) +  { +    using namespace boost::program_options; +    using namespace std; +    command_line_parser cl(argc,argv); +    cl.options(*this); +    if (po) +      cl.positional(*po); +    if (allow_unrecognized_opts) +      cl.allow_unregistered(); +    parsed_options parsed=cl.run(); +    vector<string> unparsed=collect_unrecognized(parsed.options, +                                                 po ? exclude_positional : include_positional); +    if (!allow_unrecognized_positional) { +      if (!unparsed.empty()) +        program_options_fatal("Unrecognized argument: "+unparsed.front()); +    } +    store(parsed,vm); +    notify(vm); +    return unparsed; +  } + +  std::vector<std::string> +  parse_options(int argc,char const*argv[], +                boost::program_options::variables_map &vm, +                boost::program_options::positional_options_description *po=NULL, +                bool allow_unrecognized_positional=false, +                bool allow_unrecognized_opts=false) +  { +    return parse_options(argc,const_cast<char **>(argv),vm,po +                         ,allow_unrecognized_positional,allow_unrecognized_opts); +  } + +private: +  groups_type groups; +  options_type pr_options; +  std::string caption; +  boost::shared_ptr<option_description> +  simple_add(const char* name, +             const boost::program_options::value_semantic* s, +             const char * description = NULL) +  { +    typedef option_description OD; +    boost::shared_ptr<OD> od( +      (description ? new OD(name,s,description) : new OD(name,s)) +      ); +    options_description::add(od); +    return od; +  } +}; + +typedef printable_options_description<std::ostream> printable_opts; + + + +#endif diff --git a/decoder/stringlib.h b/decoder/stringlib.h index 53e6fe50..84e95d44 100644 --- a/decoder/stringlib.h +++ b/decoder/stringlib.h @@ -20,6 +20,34 @@  #include <sstream>  #include <algorithm> +inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") { +  return s.find_first_not_of(ws,starting); +} + +// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends +inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") { +  std::size_t n=s.find_last_not_of(ws,ending); +  if (n==std::string::npos) return n; +  else return n+1; +} + +//TEST: if string is all whitespace, make sure that string(a+npos,a+npos) can't segfault (i.e. won't access any memory because begin==end) +inline std::string strip_ws(std::string const& s) { +  return std::string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)); +} + + +inline bool is_single_line(std::string const& line) { +  return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks +} + +// is_single_line(strip_ws(line)) +inline bool is_single_line_stripped(std::string const& line) { +  std::size_t b=skip_ws(line),e=trailing_ws(line); +  std::size_t n=line.find('\n',b); +  return n==std::string::npos || n>=e; +} +  struct toupperc {    inline char operator()(char c) const {      return std::toupper(c); diff --git a/decoder/trule.cc b/decoder/trule.cc index 170e3a95..330db67f 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -4,6 +4,8 @@  #include "stringlib.h"  #include "tdict.h" +#include "rule_lexer.h" +#include "threadlocal.h"  using namespace std; @@ -91,7 +93,29 @@ TRule* TRule::CreateRuleMonolingual(const string& rule) {    return new TRule(rule, false, true);  } +namespace { +// callback for lexer +THREADLOCAL int n_assigned=0; +void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) { +  TRule *assignto=(TRule *)extra; +  *assignto=*new_rule; +} + +} +  bool TRule::ReadFromString(const string& line, bool strict, bool mono) { +  if (!is_single_line_stripped(line)) +    std::cerr<<"\nWARNING: building rule from multi-line string "<<line<<".\n"; +  if (!(mono||strict)) { +    // use lexer +    istringstream il(line); +    n_assigned=0; +    RuleLexer::ReadRules(&il,assign_trule,this); +    if (n_assigned>1) +      std::cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "<<line<<".\n"; +    return n_assigned; +  } +    e_.clear();    f_.clear();    scores_.clear(); diff --git a/decoder/trule.h b/decoder/trule.h index 6b98a8fa..04058a41 100644 --- a/decoder/trule.h +++ b/decoder/trule.h @@ -37,7 +37,7 @@ class TRule {    TRule(const TRule& other) :      e_(other.e_), f_(other.f_), lhs_(other.lhs_), scores_(other.scores_), arity_(other.arity_), prev_i(-1), prev_j(-1) {} -  // deprecated - this will be private soon +  // if mono or strict is true, then lexer won't be used, and //FIXME: > 9 variables won't work    explicit TRule(const std::string& text, bool strict = false, bool mono = false) : prev_i(-1), prev_j(-1) {      ReadFromString(text, strict, mono);    } | 
