From 52c656a62d05135cf6ffd80249d5a44a07a40816 Mon Sep 17 00:00:00 2001 From: graehl Date: Tue, 10 Aug 2010 02:29:56 +0000 Subject: parse trule(string) using lexer - needs testing, affects earley_composer git-svn-id: https://ws10smt.googlecode.com/svn/trunk@497 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/cfg.cc | 12 ++ decoder/cfg.h | 8 +- decoder/cfg_format.h | 36 ++++++ decoder/program_options.h | 283 ++++++++++++++++++++++++++++++++++++++++++++++ decoder/stringlib.h | 28 +++++ decoder/trule.cc | 24 ++++ decoder/trule.h | 2 +- 7 files changed, 388 insertions(+), 5 deletions(-) create mode 100755 decoder/cfg_format.h create mode 100755 decoder/program_options.h (limited to 'decoder') diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 0f20ba0f..b83fc54d 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -1,5 +1,6 @@ #include "cfg.h" #include "hg.h" +#include "cfg_format.h" using namespace std; @@ -10,6 +11,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus pushed_inside=push_weights ? goal_inside : prob_t(1); int nn=hg.nodes_.size(),ne=hg.edges_.size(); nts.resize(nn); + goal_nt=nn-1; rules.resize(ne); for (int i=0;i NTs; NTs nts; + int goal_nt; }; #endif diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h new file mode 100755 index 00000000..1bce3d06 --- /dev/null +++ b/decoder/cfg_format.h @@ -0,0 +1,36 @@ +#ifndef CFG_FORMAT_H +#define CFG_FORMAT_H + +#include +#include + +struct CFGFormat { + bool identity_scfg;bool features;bool logprob_feat;bool cfg_comma_nt;std::string goal_nt_name;std::string nt_prefix; + template // template to support both printable_opts and boost nonprintable + void AddOptions(Opts *opts) { + using namespace boost::program_options; + using namespace std; + opts->add_options() + ("identity_scfg",defaulted_value(&identity_scfg),"output an identity SCFG: add an identity target side - '[X12] ||| [X13,1] a ||| [1] a ||| feat= ...' - the redundant target '[1] a |||' is omitted otherwise.") + ("features",defaulted_value(&features),"print the CFG feature vector") + ("logprob_feat",defaulted_value(&logprob_feat),"print a LogProb=-1.5 feature irrespective of --features.") + ("cfg_comma_nt",defaulted_value(&cfg_comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side") + ("goal_nt_name",defaulted_value(&goal_nt_name),"if nonempty, the first production will be '[goal_nt_name] ||| [x123] ||| LogProb=y' where x123 is the actual goal nt, and y is the pushed prob, if any") + ("nt_prefix",defaulted_value(&nt_prefix),"NTs are [123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph") + ; + } + void set_defaults() { + identity_scfg=false; + features=true; + logprob_feat=true; + cfg_comma_nt=true; + goal_nt_name="S"; + nt_prefix=""; + } + CFGFormat() { + set_defaults(); + } +}; + + +#endif diff --git a/decoder/program_options.h b/decoder/program_options.h new file mode 100755 index 00000000..251f5680 --- /dev/null +++ b/decoder/program_options.h @@ -0,0 +1,283 @@ +#ifndef PROGRAM_OPTIONS_H +#define PROGRAM_OPTIONS_H + +/* wraps boost options_description as printable_opts - show the options values used as well as defaults in usage or log messages. boost program options library lacks any concept of printing configured values; it only supports parsing them from strings */ + +#ifdef _WIN32 +#include +#endif +#include +#include +#include +#include +#include + + +template +boost::program_options::typed_value* +defaulted_value(T *v) +{ + return boost::program_options::value(v)->default_value(*v); +} + +inline void program_options_fatal(std::string const& msg) { + throw std::runtime_error(msg); +} + + +inline std::string const& get_single_arg(boost::any& v,std::vector const& values) +{ + boost::program_options::validators::check_first_occurrence(v); + return boost::program_options::validators::get_single_string(values); +} + +template +void must_complete_read(I &in,std::string const& msg="Couldn't parse") +{ + char c; + if (in.bad()) + program_options_fatal(msg + " - failed input"); + if (in >> c) + program_options_fatal(msg + " - got extra char: " + std::string(c,1)); +} + +template +struct any_printer : public boost::function +{ + typedef boost::function F; + + template + struct typed_print + { + void operator()(Ostream &o,boost::any const& t) const + { + o << *boost::any_cast(&t); + } + }; + + template + static + void typed_print_template(Ostream &o,boost::any const& t) + { + o << *boost::any_cast(&t); + } + + any_printer() {} + + any_printer(const any_printer& x) + : F(static_cast(x)) + {} + + template + explicit any_printer(T const* tag) : F(typed_print()) { + } + + template + void set() + { + F f(typed_print()); + swap(f); + } +}; + +// have to wrap regular options_description and store our own tables because +// author didn't make enough stuff protected/public or add a virtual print +// method to value_semantic +template +struct printable_options_description + : public boost::program_options::options_description +{ + typedef printable_options_description self_type; + typedef boost::program_options::options_description options_description; + typedef boost::program_options::option_description option_description; + typedef boost::shared_ptr group_type; + typedef std::vector groups_type; + + struct printable_option + { + typedef boost::shared_ptr OD; + + any_printer print; + OD od; + bool in_group; + + std::string const& name() + { return od->long_name(); } + + std::string const& description() + { return od->description(); } + + std::string const& vmkey() + { + return od->key(name()); + } + template + printable_option(T *tag, OD const& od) : print(tag),od(od),in_group(false) {} + printable_option() : in_group(false) {} + }; + typedef std::vector options_type; + BOOST_STATIC_CONSTANT(unsigned,default_linewrap=80); // options_description::m_default_line_length + printable_options_description(unsigned line_length = default_linewrap) : + options_description(line_length) {} + + printable_options_description(const std::string& caption, + unsigned line_length = default_linewrap) + : options_description(caption,line_length), caption(caption) {} + + self_type &add_options() + { return *this; } + + template + self_type & + operator()(char const* name, + boost::program_options::typed_value *val, + char const*description=NULL) + { + printable_option opt((T *)0,simple_add(name,val,description)); + pr_options.push_back(opt); + return *this; + } + + self_type& + add(self_type const& desc) + { + options_description::add(desc); + groups.push_back(group_type(new self_type(desc))); + for (typename options_type::const_iterator i=desc.pr_options.begin(),e=desc.pr_options.end(); + i!=e;++i) { + pr_options.push_back(*i); + pr_options.back().in_group=true; + } + return *this; + } + + void print_option(Ostream &o, + printable_option &opt, + boost::program_options::variable_value const & var, + bool only_value=false) + { + using namespace boost; + using namespace boost::program_options; + using namespace std; + string const& name=opt.name(); + if (!only_value) { + if (var.defaulted()) + o << "#DEFAULTED# "; + if (var.empty()) { + o << "#EMPTY# "<print(o,vm,show_flags); + } + +/// parses arguments, then stores/notifies from opts->vm. returns unparsed +/// options and positional arguments, but if not empty, throws exception unless +/// allow_unrecognized_positional is true + std::vector + parse_options(int argc,char **argv, + boost::program_options::variables_map &vm, + boost::program_options::positional_options_description *po=NULL, + bool allow_unrecognized_positional=false, + bool allow_unrecognized_opts=false) + { + using namespace boost::program_options; + using namespace std; + command_line_parser cl(argc,argv); + cl.options(*this); + if (po) + cl.positional(*po); + if (allow_unrecognized_opts) + cl.allow_unregistered(); + parsed_options parsed=cl.run(); + vector unparsed=collect_unrecognized(parsed.options, + po ? exclude_positional : include_positional); + if (!allow_unrecognized_positional) { + if (!unparsed.empty()) + program_options_fatal("Unrecognized argument: "+unparsed.front()); + } + store(parsed,vm); + notify(vm); + return unparsed; + } + + std::vector + parse_options(int argc,char const*argv[], + boost::program_options::variables_map &vm, + boost::program_options::positional_options_description *po=NULL, + bool allow_unrecognized_positional=false, + bool allow_unrecognized_opts=false) + { + return parse_options(argc,const_cast(argv),vm,po + ,allow_unrecognized_positional,allow_unrecognized_opts); + } + +private: + groups_type groups; + options_type pr_options; + std::string caption; + boost::shared_ptr + simple_add(const char* name, + const boost::program_options::value_semantic* s, + const char * description = NULL) + { + typedef option_description OD; + boost::shared_ptr od( + (description ? new OD(name,s,description) : new OD(name,s)) + ); + options_description::add(od); + return od; + } +}; + +typedef printable_options_description printable_opts; + + + +#endif diff --git a/decoder/stringlib.h b/decoder/stringlib.h index 53e6fe50..84e95d44 100644 --- a/decoder/stringlib.h +++ b/decoder/stringlib.h @@ -20,6 +20,34 @@ #include #include +inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") { + return s.find_first_not_of(ws,starting); +} + +// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends +inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") { + std::size_t n=s.find_last_not_of(ws,ending); + if (n==std::string::npos) return n; + else return n+1; +} + +//TEST: if string is all whitespace, make sure that string(a+npos,a+npos) can't segfault (i.e. won't access any memory because begin==end) +inline std::string strip_ws(std::string const& s) { + return std::string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)); +} + + +inline bool is_single_line(std::string const& line) { + return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks +} + +// is_single_line(strip_ws(line)) +inline bool is_single_line_stripped(std::string const& line) { + std::size_t b=skip_ws(line),e=trailing_ws(line); + std::size_t n=line.find('\n',b); + return n==std::string::npos || n>=e; +} + struct toupperc { inline char operator()(char c) const { return std::toupper(c); diff --git a/decoder/trule.cc b/decoder/trule.cc index 170e3a95..330db67f 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -4,6 +4,8 @@ #include "stringlib.h" #include "tdict.h" +#include "rule_lexer.h" +#include "threadlocal.h" using namespace std; @@ -91,7 +93,29 @@ TRule* TRule::CreateRuleMonolingual(const string& rule) { return new TRule(rule, false, true); } +namespace { +// callback for lexer +THREADLOCAL int n_assigned=0; +void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) { + TRule *assignto=(TRule *)extra; + *assignto=*new_rule; +} + +} + bool TRule::ReadFromString(const string& line, bool strict, bool mono) { + if (!is_single_line_stripped(line)) + std::cerr<<"\nWARNING: building rule from multi-line string "<1) + std::cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "< 9 variables won't work explicit TRule(const std::string& text, bool strict = false, bool mono = false) : prev_i(-1), prev_j(-1) { ReadFromString(text, strict, mono); } -- cgit v1.2.3