diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-10 02:29:56 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-10 02:29:56 +0000 |
commit | 52c656a62d05135cf6ffd80249d5a44a07a40816 (patch) | |
tree | 7f9c7b0862dc02f2a0b3fbe2c6d6d4c3da9322f7 | |
parent | 4f99f17541c1fe104afbcf04e3d8d04ad9f1227a (diff) |
parse trule(string) using lexer - needs testing, affects earley_composer
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@497 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x | decoder/cfg.cc | 12 | ||||
-rwxr-xr-x | decoder/cfg.h | 8 | ||||
-rwxr-xr-x | decoder/cfg_format.h | 36 | ||||
-rwxr-xr-x | decoder/program_options.h | 283 | ||||
-rw-r--r-- | decoder/stringlib.h | 28 | ||||
-rw-r--r-- | decoder/trule.cc | 24 | ||||
-rw-r--r-- | decoder/trule.h | 2 |
7 files changed, 388 insertions, 5 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 0f20ba0f..b83fc54d 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -1,5 +1,6 @@ #include "cfg.h" #include "hg.h" +#include "cfg_format.h" using namespace std; @@ -10,6 +11,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus pushed_inside=push_weights ? goal_inside : prob_t(1); int nn=hg.nodes_.size(),ne=hg.edges_.size(); nts.resize(nn); + goal_nt=nn-1; rules.resize(ne); for (int i=0;i<nn;++i) nts[i].ruleids=hg.nodes_[i].in_edges_; @@ -40,3 +42,13 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus } } } + +namespace { +} + +void CFG::Print(std::ostream &o,CFGFormat const& f) const { + char const* partsep=" ||| "; + if (!f.goal_nt_name.empty()) + o << '['<<f.goal_nt_name <<']' << partsep; // print rhs + //TODO: +} diff --git a/decoder/cfg.h b/decoder/cfg.h index 9e1b2837..8d7a5eee 100755 --- a/decoder/cfg.h +++ b/decoder/cfg.h @@ -27,9 +27,7 @@ #include "small_vector.h" class Hypergraph; -class CFG; - - +class CFGFormat; // #include "cfg_format.h" struct CFG { typedef int RuleHandle; @@ -48,7 +46,7 @@ struct CFG { }; struct NT { - Ruleids ruleids; // index into CFG rules with this lhs + Ruleids ruleids; // index into CFG rules with lhs = this NT. aka in_edges_ }; CFG() : hg_() { } @@ -58,6 +56,7 @@ struct CFG { Init(hg,target_side,copy_features,push_weights); } void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true); + void Print(std::ostream &o,CFGFormat const& format) const; // see cfg_format.h protected: Hypergraph const* hg_; // shouldn't be used for anything, esp. after binarization prob_t goal_inside,pushed_inside; // when we push viterbi weights to goal, we store the removed probability in pushed_inside @@ -66,6 +65,7 @@ protected: Rules rules; typedef std::vector<NT> NTs; NTs nts; + int goal_nt; }; #endif diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h new file mode 100755 index 00000000..1bce3d06 --- /dev/null +++ b/decoder/cfg_format.h @@ -0,0 +1,36 @@ +#ifndef CFG_FORMAT_H +#define CFG_FORMAT_H + +#include <program_options.h> +#include <string> + +struct CFGFormat { + bool identity_scfg;bool features;bool logprob_feat;bool cfg_comma_nt;std::string goal_nt_name;std::string nt_prefix; + template <class Opts> // template to support both printable_opts and boost nonprintable + void AddOptions(Opts *opts) { + using namespace boost::program_options; + using namespace std; + opts->add_options() + ("identity_scfg",defaulted_value(&identity_scfg),"output an identity SCFG: add an identity target side - '[X12] ||| [X13,1] a ||| [1] a ||| feat= ...' - the redundant target '[1] a |||' is omitted otherwise.") + ("features",defaulted_value(&features),"print the CFG feature vector") + ("logprob_feat",defaulted_value(&logprob_feat),"print a LogProb=-1.5 feature irrespective of --features.") + ("cfg_comma_nt",defaulted_value(&cfg_comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side") + ("goal_nt_name",defaulted_value(&goal_nt_name),"if nonempty, the first production will be '[goal_nt_name] ||| [x123] ||| LogProb=y' where x123 is the actual goal nt, and y is the pushed prob, if any") + ("nt_prefix",defaulted_value(&nt_prefix),"NTs are [<nt_prefix>123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph") + ; + } + void set_defaults() { + identity_scfg=false; + features=true; + logprob_feat=true; + cfg_comma_nt=true; + goal_nt_name="S"; + nt_prefix=""; + } + CFGFormat() { + set_defaults(); + } +}; + + +#endif diff --git a/decoder/program_options.h b/decoder/program_options.h new file mode 100755 index 00000000..251f5680 --- /dev/null +++ b/decoder/program_options.h @@ -0,0 +1,283 @@ +#ifndef PROGRAM_OPTIONS_H +#define PROGRAM_OPTIONS_H + +/* wraps boost options_description as printable_opts - show the options values used as well as defaults in usage or log messages. boost program options library lacks any concept of printing configured values; it only supports parsing them from strings */ + +#ifdef _WIN32 +#include <iso646.h> +#endif +#include <boost/program_options.hpp> +#include <boost/function.hpp> +#include <boost/shared_ptr.hpp> +#include <stdexcept> +#include <iosfwd> + + +template <class T> +boost::program_options::typed_value<T>* +defaulted_value(T *v) +{ + return boost::program_options::value<T>(v)->default_value(*v); +} + +inline void program_options_fatal(std::string const& msg) { + throw std::runtime_error(msg); +} + + +inline std::string const& get_single_arg(boost::any& v,std::vector<std::string> const& values) +{ + boost::program_options::validators::check_first_occurrence(v); + return boost::program_options::validators::get_single_string(values); +} + +template <class I> +void must_complete_read(I &in,std::string const& msg="Couldn't parse") +{ + char c; + if (in.bad()) + program_options_fatal(msg + " - failed input"); + if (in >> c) + program_options_fatal(msg + " - got extra char: " + std::string(c,1)); +} + +template <class Ostream> +struct any_printer : public boost::function<void (Ostream &,boost::any const&)> +{ + typedef boost::function<void (Ostream &,boost::any const&)> F; + + template <class T> + struct typed_print + { + void operator()(Ostream &o,boost::any const& t) const + { + o << *boost::any_cast<T const>(&t); + } + }; + + template <class T> + static + void typed_print_template(Ostream &o,boost::any const& t) + { + o << *boost::any_cast<T const>(&t); + } + + any_printer() {} + + any_printer(const any_printer& x) + : F(static_cast<F const&>(x)) + {} + + template <class T> + explicit any_printer(T const* tag) : F(typed_print<T>()) { + } + + template <class T> + void set() + { + F f(typed_print<T>()); + swap(f); + } +}; + +// have to wrap regular options_description and store our own tables because +// author didn't make enough stuff protected/public or add a virtual print +// method to value_semantic +template <class Ostream> +struct printable_options_description + : public boost::program_options::options_description +{ + typedef printable_options_description<Ostream> self_type; + typedef boost::program_options::options_description options_description; + typedef boost::program_options::option_description option_description; + typedef boost::shared_ptr<self_type> group_type; + typedef std::vector<group_type > groups_type; + + struct printable_option + { + typedef boost::shared_ptr<option_description> OD; + + any_printer<Ostream> print; + OD od; + bool in_group; + + std::string const& name() + { return od->long_name(); } + + std::string const& description() + { return od->description(); } + + std::string const& vmkey() + { + return od->key(name()); + } + template <class T> + printable_option(T *tag, OD const& od) : print(tag),od(od),in_group(false) {} + printable_option() : in_group(false) {} + }; + typedef std::vector<printable_option > options_type; + BOOST_STATIC_CONSTANT(unsigned,default_linewrap=80); // options_description::m_default_line_length + printable_options_description(unsigned line_length = default_linewrap) : + options_description(line_length) {} + + printable_options_description(const std::string& caption, + unsigned line_length = default_linewrap) + : options_description(caption,line_length), caption(caption) {} + + self_type &add_options() + { return *this; } + + template <class T,class C> + self_type & + operator()(char const* name, + boost::program_options::typed_value<T,C> *val, + char const*description=NULL) + { + printable_option opt((T *)0,simple_add(name,val,description)); + pr_options.push_back(opt); + return *this; + } + + self_type& + add(self_type const& desc) + { + options_description::add(desc); + groups.push_back(group_type(new self_type(desc))); + for (typename options_type::const_iterator i=desc.pr_options.begin(),e=desc.pr_options.end(); + i!=e;++i) { + pr_options.push_back(*i); + pr_options.back().in_group=true; + } + return *this; + } + + void print_option(Ostream &o, + printable_option &opt, + boost::program_options::variable_value const & var, + bool only_value=false) + { + using namespace boost; + using namespace boost::program_options; + using namespace std; + string const& name=opt.name(); + if (!only_value) { + if (var.defaulted()) + o << "#DEFAULTED# "; + if (var.empty()) { + o << "#EMPTY# "<<name; + return; + } + o << name<<" = "; + } + opt.print(o,var.value()); + } + + enum { SHOW_DEFAULTED=0x1 + , SHOW_EMPTY=0x2 + , SHOW_DESCRIPTION=0x4 + , SHOW_HIERARCHY=0x8 + , SHOW_ALL=0x0FFF + , SHOW_HELP=0x1000 + }; + + void print(Ostream &o, + boost::program_options::variables_map &vm, + int show_flags=SHOW_DESCRIPTION & SHOW_DEFAULTED & SHOW_HIERARCHY) + { + const bool show_defaulted=bool(show_flags & SHOW_DEFAULTED); + const bool show_description=bool(show_flags & SHOW_DESCRIPTION); + const bool hierarchy=bool(show_flags & SHOW_HIERARCHY); + const bool show_empty=bool(show_flags & SHOW_EMPTY); + const bool show_help=bool(show_flags & SHOW_HELP); + + using namespace boost::program_options; + using namespace std; + o << "### " << caption << endl; + for (typename options_type::iterator i=pr_options.begin(),e=pr_options.end(); + i!=e;++i) { + printable_option & opt=*i; + if (!show_help && opt.name()=="help") + continue; + if (hierarchy and opt.in_group) + continue; + variable_value const & var=vm[opt.vmkey()]; + if (var.defaulted() && !show_defaulted) + continue; + if (var.empty() && !show_empty) + continue; + if (show_description) + o << "# " << opt.description() << endl; + print_option(o,opt,var); + o << endl; + } + o << endl; + if (hierarchy) + for (typename groups_type::iterator i=groups.begin(),e=groups.end(); + i!=e;++i) + (*i)->print(o,vm,show_flags); + } + +/// parses arguments, then stores/notifies from opts->vm. returns unparsed +/// options and positional arguments, but if not empty, throws exception unless +/// allow_unrecognized_positional is true + std::vector<std::string> + parse_options(int argc,char **argv, + boost::program_options::variables_map &vm, + boost::program_options::positional_options_description *po=NULL, + bool allow_unrecognized_positional=false, + bool allow_unrecognized_opts=false) + { + using namespace boost::program_options; + using namespace std; + command_line_parser cl(argc,argv); + cl.options(*this); + if (po) + cl.positional(*po); + if (allow_unrecognized_opts) + cl.allow_unregistered(); + parsed_options parsed=cl.run(); + vector<string> unparsed=collect_unrecognized(parsed.options, + po ? exclude_positional : include_positional); + if (!allow_unrecognized_positional) { + if (!unparsed.empty()) + program_options_fatal("Unrecognized argument: "+unparsed.front()); + } + store(parsed,vm); + notify(vm); + return unparsed; + } + + std::vector<std::string> + parse_options(int argc,char const*argv[], + boost::program_options::variables_map &vm, + boost::program_options::positional_options_description *po=NULL, + bool allow_unrecognized_positional=false, + bool allow_unrecognized_opts=false) + { + return parse_options(argc,const_cast<char **>(argv),vm,po + ,allow_unrecognized_positional,allow_unrecognized_opts); + } + +private: + groups_type groups; + options_type pr_options; + std::string caption; + boost::shared_ptr<option_description> + simple_add(const char* name, + const boost::program_options::value_semantic* s, + const char * description = NULL) + { + typedef option_description OD; + boost::shared_ptr<OD> od( + (description ? new OD(name,s,description) : new OD(name,s)) + ); + options_description::add(od); + return od; + } +}; + +typedef printable_options_description<std::ostream> printable_opts; + + + +#endif diff --git a/decoder/stringlib.h b/decoder/stringlib.h index 53e6fe50..84e95d44 100644 --- a/decoder/stringlib.h +++ b/decoder/stringlib.h @@ -20,6 +20,34 @@ #include <sstream> #include <algorithm> +inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") { + return s.find_first_not_of(ws,starting); +} + +// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends +inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") { + std::size_t n=s.find_last_not_of(ws,ending); + if (n==std::string::npos) return n; + else return n+1; +} + +//TEST: if string is all whitespace, make sure that string(a+npos,a+npos) can't segfault (i.e. won't access any memory because begin==end) +inline std::string strip_ws(std::string const& s) { + return std::string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)); +} + + +inline bool is_single_line(std::string const& line) { + return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks +} + +// is_single_line(strip_ws(line)) +inline bool is_single_line_stripped(std::string const& line) { + std::size_t b=skip_ws(line),e=trailing_ws(line); + std::size_t n=line.find('\n',b); + return n==std::string::npos || n>=e; +} + struct toupperc { inline char operator()(char c) const { return std::toupper(c); diff --git a/decoder/trule.cc b/decoder/trule.cc index 170e3a95..330db67f 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -4,6 +4,8 @@ #include "stringlib.h" #include "tdict.h" +#include "rule_lexer.h" +#include "threadlocal.h" using namespace std; @@ -91,7 +93,29 @@ TRule* TRule::CreateRuleMonolingual(const string& rule) { return new TRule(rule, false, true); } +namespace { +// callback for lexer +THREADLOCAL int n_assigned=0; +void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) { + TRule *assignto=(TRule *)extra; + *assignto=*new_rule; +} + +} + bool TRule::ReadFromString(const string& line, bool strict, bool mono) { + if (!is_single_line_stripped(line)) + std::cerr<<"\nWARNING: building rule from multi-line string "<<line<<".\n"; + if (!(mono||strict)) { + // use lexer + istringstream il(line); + n_assigned=0; + RuleLexer::ReadRules(&il,assign_trule,this); + if (n_assigned>1) + std::cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "<<line<<".\n"; + return n_assigned; + } + e_.clear(); f_.clear(); scores_.clear(); diff --git a/decoder/trule.h b/decoder/trule.h index 6b98a8fa..04058a41 100644 --- a/decoder/trule.h +++ b/decoder/trule.h @@ -37,7 +37,7 @@ class TRule { TRule(const TRule& other) : e_(other.e_), f_(other.f_), lhs_(other.lhs_), scores_(other.scores_), arity_(other.arity_), prev_i(-1), prev_j(-1) {} - // deprecated - this will be private soon + // if mono or strict is true, then lexer won't be used, and //FIXME: > 9 variables won't work explicit TRule(const std::string& text, bool strict = false, bool mono = false) : prev_i(-1), prev_j(-1) { ReadFromString(text, strict, mono); } |