summaryrefslogtreecommitdiff
path: root/decoder
diff options
context:
space:
mode:
authorgraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-10 02:29:56 +0000
committergraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-10 02:29:56 +0000
commit52c656a62d05135cf6ffd80249d5a44a07a40816 (patch)
tree7f9c7b0862dc02f2a0b3fbe2c6d6d4c3da9322f7 /decoder
parent4f99f17541c1fe104afbcf04e3d8d04ad9f1227a (diff)
parse trule(string) using lexer - needs testing, affects earley_composer
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@497 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder')
-rwxr-xr-xdecoder/cfg.cc12
-rwxr-xr-xdecoder/cfg.h8
-rwxr-xr-xdecoder/cfg_format.h36
-rwxr-xr-xdecoder/program_options.h283
-rw-r--r--decoder/stringlib.h28
-rw-r--r--decoder/trule.cc24
-rw-r--r--decoder/trule.h2
7 files changed, 388 insertions, 5 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index 0f20ba0f..b83fc54d 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -1,5 +1,6 @@
#include "cfg.h"
#include "hg.h"
+#include "cfg_format.h"
using namespace std;
@@ -10,6 +11,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus
pushed_inside=push_weights ? goal_inside : prob_t(1);
int nn=hg.nodes_.size(),ne=hg.edges_.size();
nts.resize(nn);
+ goal_nt=nn-1;
rules.resize(ne);
for (int i=0;i<nn;++i)
nts[i].ruleids=hg.nodes_[i].in_edges_;
@@ -40,3 +42,13 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus
}
}
}
+
+namespace {
+}
+
+// Write this grammar to o using the layout options in f (see cfg_format.h).
+// Unfinished: so far only the left-hand side of the optional goal production
+// and the field separator that follows it are emitted.
+void CFG::Print(std::ostream &o,CFGFormat const& f) const {
+  char const* const field_sep=" ||| ";
+  bool const have_goal_name=!f.goal_nt_name.empty();
+  if (have_goal_name) {
+    // '[name]' is the lhs; the rhs of the goal production is still to be written
+    o << '[' << f.goal_nt_name << ']' << field_sep;
+  }
+  //TODO:
+}
diff --git a/decoder/cfg.h b/decoder/cfg.h
index 9e1b2837..8d7a5eee 100755
--- a/decoder/cfg.h
+++ b/decoder/cfg.h
@@ -27,9 +27,7 @@
#include "small_vector.h"
class Hypergraph;
-class CFG;
-
-
+class CFGFormat; // #include "cfg_format.h"
struct CFG {
typedef int RuleHandle;
@@ -48,7 +46,7 @@ struct CFG {
};
struct NT {
- Ruleids ruleids; // index into CFG rules with this lhs
+ Ruleids ruleids; // index into CFG rules with lhs = this NT. aka in_edges_
};
CFG() : hg_() { }
@@ -58,6 +56,7 @@ struct CFG {
Init(hg,target_side,copy_features,push_weights);
}
void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true);
+ void Print(std::ostream &o,CFGFormat const& format) const; // see cfg_format.h
protected:
Hypergraph const* hg_; // shouldn't be used for anything, esp. after binarization
prob_t goal_inside,pushed_inside; // when we push viterbi weights to goal, we store the removed probability in pushed_inside
@@ -66,6 +65,7 @@ protected:
Rules rules;
typedef std::vector<NT> NTs;
NTs nts;
+ int goal_nt;
};
#endif
diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h
new file mode 100755
index 00000000..1bce3d06
--- /dev/null
+++ b/decoder/cfg_format.h
@@ -0,0 +1,36 @@
+#ifndef CFG_FORMAT_H
+#define CFG_FORMAT_H
+
+#include <program_options.h>
+#include <string>
+
+// Settings that control how a CFG is written as text (taken by CFG::Print).
+// Every field has a matching command-line option registered by AddOptions.
+struct CFGFormat {
+  bool identity_scfg;
+  bool features;
+  bool logprob_feat;
+  bool cfg_comma_nt;
+  std::string goal_nt_name;
+  std::string nt_prefix;
+
+  // starts out with the built-in defaults
+  CFGFormat() {
+    set_defaults();
+  }
+
+  // reset every setting to its built-in default
+  void set_defaults() {
+    identity_scfg=false;
+    features=logprob_feat=cfg_comma_nt=true;
+    goal_nt_name="S";
+    nt_prefix.clear();
+  }
+
+  // Register one option per setting on opts. defaulted_value reads the field at
+  // this point, so the option defaults are whatever the fields hold when
+  // AddOptions is called.
+  template <class Opts> // template to support both printable_opts and boost nonprintable
+  void AddOptions(Opts *opts) {
+    using namespace boost::program_options;
+    using namespace std;
+    opts->add_options()
+      ("identity_scfg",defaulted_value(&identity_scfg),"output an identity SCFG: add an identity target side - '[X12] ||| [X13,1] a ||| [1] a ||| feat= ...' - the redundant target '[1] a |||' is omitted otherwise.")
+      ("features",defaulted_value(&features),"print the CFG feature vector")
+      ("logprob_feat",defaulted_value(&logprob_feat),"print a LogProb=-1.5 feature irrespective of --features.")
+      ("cfg_comma_nt",defaulted_value(&cfg_comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side")
+      ("goal_nt_name",defaulted_value(&goal_nt_name),"if nonempty, the first production will be '[goal_nt_name] ||| [x123] ||| LogProb=y' where x123 is the actual goal nt, and y is the pushed prob, if any")
+      ("nt_prefix",defaulted_value(&nt_prefix),"NTs are [<nt_prefix>123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph")
+      ;
+  }
+};
+
+
+#endif
diff --git a/decoder/program_options.h b/decoder/program_options.h
new file mode 100755
index 00000000..251f5680
--- /dev/null
+++ b/decoder/program_options.h
@@ -0,0 +1,283 @@
+#ifndef PROGRAM_OPTIONS_H
+#define PROGRAM_OPTIONS_H
+
+/* wraps boost options_description as printable_opts - show the options values used as well as defaults in usage or log messages. boost program options library lacks any concept of printing configured values; it only supports parsing them from strings */
+
+#ifdef _WIN32
+#include <iso646.h>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/function.hpp>
+#include <boost/shared_ptr.hpp>
+#include <stdexcept>
+#include <iosfwd>
+
+
+// Build a boost option value that stores into *v and whose default is the value
+// *v holds at the time of this call.
+template <class T>
+boost::program_options::typed_value<T>*
+defaulted_value(T *v)
+{
+  namespace po=boost::program_options;
+  po::typed_value<T> *bound=po::value<T>(v);
+  return bound->default_value(*v);
+}
+
+// Report an unrecoverable option-handling error: throws std::runtime_error carrying msg.
+inline void program_options_fatal(std::string const& msg) {
+  std::runtime_error const failure(msg);
+  throw failure;
+}
+
+
+// Helper for custom option validators: runs boost's first-occurrence check on v,
+// then returns the single string boost extracts from values.
+// NOTE(review): both boost helpers presumably throw on violation - confirm against the boost validators docs.
+inline std::string const& get_single_arg(boost::any& v,std::vector<std::string> const& values)
+{
+  namespace validators=boost::program_options::validators;
+  validators::check_first_occurrence(v);
+  std::string const& only=validators::get_single_string(values);
+  return only;
+}
+
+// Check that a parse consumed all of `in`. Calls program_options_fatal (which
+// throws std::runtime_error) with a message prefixed by msg if the stream is in
+// the bad state, or if another character can still be extracted from it.
+template <class I>
+void must_complete_read(I &in,std::string const& msg="Couldn't parse")
+{
+  if (in.bad())
+    program_options_fatal(msg + " - failed input");
+  char c;
+  // formatted extraction: with the stream's default skipws setting, leftover whitespace is not reported
+  if (in >> c)
+    program_options_fatal(msg + " - got extra char: " + std::string(1,c)); // (count,char): exactly the one offending character
+}
+
+// Type-erased "stream the contents of a boost::any" callable. The value type T
+// is fixed when the printer is built from a T* tag or rebound with set<T>(); a
+// default-constructed printer is an empty boost::function.
+template <class Ostream>
+struct any_printer : public boost::function<void (Ostream &,boost::any const&)>
+{
+  typedef boost::function<void (Ostream &,boost::any const&)> F;
+
+  // function object that streams the T held in t
+  template <class T>
+  struct typed_print
+  {
+    void operator()(Ostream &o,boost::any const& t) const
+    {
+      // NOTE(review): the pointer form of any_cast yields null on a type mismatch, so t must really hold a T
+      o << *boost::any_cast<T const>(&t);
+    }
+  };
+
+  // plain-function equivalent of typed_print<T>
+  template <class T>
+  static
+  void typed_print_template(Ostream &o,boost::any const& t)
+  {
+    o << *boost::any_cast<T const>(&t);
+  }
+
+  any_printer() {}
+
+  any_printer(const any_printer& x)
+    : F(static_cast<F const&>(x))
+  {}
+
+  // tag only selects T; it is never dereferenced
+  template <class T>
+  explicit any_printer(T const* tag) : F(typed_print<T>()) {
+  }
+
+  // rebind this printer to values of type T
+  template <class T>
+  void set()
+  {
+    // a named functor avoids 'F f(typed_print<T>());', which declares a function rather than an object
+    typed_print<T> printer;
+    F replacement(printer);
+    // swap is a member of the dependent base F, so the call must be qualified to be found
+    this->swap(replacement);
+  }
+};
+
+// Wraps boost's options_description and keeps its own table of the registered
+// options so that configured values can be printed as well as parsed. We have
+// to store our own tables because the boost author didn't make enough stuff
+// protected/public or add a virtual print method to value_semantic.
+template <class Ostream>
+struct printable_options_description
+  : public boost::program_options::options_description
+{
+  typedef printable_options_description<Ostream> self_type;
+  typedef boost::program_options::options_description options_description;
+  typedef boost::program_options::option_description option_description;
+  typedef boost::shared_ptr<self_type> group_type;
+  typedef std::vector<group_type > groups_type;
+
+  // one registered option together with the printer for its value type
+  struct printable_option
+  {
+    typedef boost::shared_ptr<option_description> OD;
+
+    any_printer<Ostream> print; // streams the option's value out of a boost::any
+    OD od;
+    bool in_group; // set on the copies that add() pulls in from a nested group
+
+    std::string const& name() const
+    { return od->long_name(); }
+
+    std::string const& description() const
+    { return od->description(); }
+
+    // key used to look this option up in a variables_map (see print())
+    std::string const& vmkey() const
+    {
+      return od->key(name());
+    }
+    // tag only selects the value type T for the printer
+    template <class T>
+    printable_option(T *tag, OD const& od) : print(tag),od(od),in_group(false) {}
+    printable_option() : in_group(false) {}
+  };
+  typedef std::vector<printable_option > options_type;
+  BOOST_STATIC_CONSTANT(unsigned,default_linewrap=80); // options_description::m_default_line_length
+  printable_options_description(unsigned line_length = default_linewrap) :
+    options_description(line_length) {}
+
+  printable_options_description(const std::string& caption,
+                                unsigned line_length = default_linewrap)
+    : options_description(caption,line_length), caption(caption) {}
+
+  // allows the boost-style chain opts.add_options()("name",value,"help")...
+  self_type &add_options()
+  { return *this; }
+
+  // register one option with boost and record a printer for its value type T
+  template <class T,class C>
+  self_type &
+  operator()(char const* name,
+             boost::program_options::typed_value<T,C> *val,
+             char const*description=NULL)
+  {
+    printable_option opt((T *)0,simple_add(name,val,description));
+    pr_options.push_back(opt);
+    return *this;
+  }
+
+  // Add desc as a nested group: it is registered with boost, a copy of it is
+  // kept for hierarchical printing, and its options are also appended to this
+  // object's table marked in_group.
+  self_type&
+  add(self_type const& desc)
+  {
+    options_description::add(desc);
+    groups.push_back(group_type(new self_type(desc)));
+    for (typename options_type::const_iterator i=desc.pr_options.begin(),e=desc.pr_options.end();
+         i!=e;++i) {
+      pr_options.push_back(*i);
+      pr_options.back().in_group=true;
+    }
+    return *this;
+  }
+
+  // Print a single option as "name = value", prefixed by "#DEFAULTED# " when the
+  // value was defaulted; an empty value prints "#EMPTY# name" instead. With
+  // only_value set, just the value is printed.
+  void print_option(Ostream &o,
+                    printable_option &opt,
+                    boost::program_options::variable_value const & var,
+                    bool only_value=false)
+  {
+    using namespace boost;
+    using namespace boost::program_options;
+    using namespace std;
+    string const& name=opt.name();
+    if (!only_value) {
+      if (var.defaulted())
+        o << "#DEFAULTED# ";
+      if (var.empty()) {
+        o << "#EMPTY# "<<name;
+        return;
+      }
+      o << name<<" = ";
+    }
+    opt.print(o,var.value());
+  }
+
+  // bit flags for print(); combine with |
+  enum { SHOW_DEFAULTED=0x1
+         , SHOW_EMPTY=0x2
+         , SHOW_DESCRIPTION=0x4
+         , SHOW_HIERARCHY=0x8
+         , SHOW_ALL=0x0FFF
+         , SHOW_HELP=0x1000
+  };
+
+  // Print the values stored in vm for the options registered here, one per line
+  // after a "### caption" header. show_flags selects what is included:
+  // defaulted values, empty values, a "# description" line before each option,
+  // the "help" option, and whether nested groups are printed as their own
+  // sections (SHOW_HIERARCHY) instead of inline.
+  void print(Ostream &o,
+             boost::program_options::variables_map &vm,
+             int show_flags=SHOW_DESCRIPTION | SHOW_DEFAULTED | SHOW_HIERARCHY) // flags are combined with |; combining them with & always yields 0
+  {
+    const bool show_defaulted=bool(show_flags & SHOW_DEFAULTED);
+    const bool show_description=bool(show_flags & SHOW_DESCRIPTION);
+    const bool hierarchy=bool(show_flags & SHOW_HIERARCHY);
+    const bool show_empty=bool(show_flags & SHOW_EMPTY);
+    const bool show_help=bool(show_flags & SHOW_HELP);
+
+    using namespace boost::program_options;
+    using namespace std;
+    o << "### " << caption << endl;
+    for (typename options_type::iterator i=pr_options.begin(),e=pr_options.end();
+         i!=e;++i) {
+      printable_option & opt=*i;
+      if (!show_help && opt.name()=="help")
+        continue;
+      // grouped options are printed by their own group in the loop at the end
+      if (hierarchy && opt.in_group)
+        continue;
+      variable_value const & var=vm[opt.vmkey()];
+      if (var.defaulted() && !show_defaulted)
+        continue;
+      if (var.empty() && !show_empty)
+        continue;
+      if (show_description)
+        o << "# " << opt.description() << endl;
+      print_option(o,opt,var);
+      o << endl;
+    }
+    o << endl;
+    if (hierarchy)
+      for (typename groups_type::iterator i=groups.begin(),e=groups.end();
+           i!=e;++i)
+        (*i)->print(o,vm,show_flags);
+  }
+
+/// parses arguments, then stores/notifies into vm. returns unparsed
+/// options and positional arguments, but if not empty, throws exception unless
+/// allow_unrecognized_positional is true
+  std::vector<std::string>
+  parse_options(int argc,char **argv,
+                boost::program_options::variables_map &vm,
+                boost::program_options::positional_options_description *po=NULL,
+                bool allow_unrecognized_positional=false,
+                bool allow_unrecognized_opts=false)
+  {
+    using namespace boost::program_options;
+    using namespace std;
+    command_line_parser cl(argc,argv);
+    cl.options(*this);
+    if (po)
+      cl.positional(*po);
+    if (allow_unrecognized_opts)
+      cl.allow_unregistered();
+    parsed_options parsed=cl.run();
+    vector<string> unparsed=collect_unrecognized(parsed.options,
+                                                 po ? exclude_positional : include_positional);
+    // leftovers are rejected before anything is stored into vm
+    if (!allow_unrecognized_positional) {
+      if (!unparsed.empty())
+        program_options_fatal("Unrecognized argument: "+unparsed.front());
+    }
+    store(parsed,vm);
+    notify(vm);
+    return unparsed;
+  }
+
+  // same as above, for callers that hold argv as char const*[]
+  std::vector<std::string>
+  parse_options(int argc,char const*argv[],
+                boost::program_options::variables_map &vm,
+                boost::program_options::positional_options_description *po=NULL,
+                bool allow_unrecognized_positional=false,
+                bool allow_unrecognized_opts=false)
+  {
+    return parse_options(argc,const_cast<char **>(argv),vm,po
+                         ,allow_unrecognized_positional,allow_unrecognized_opts);
+  }
+
+private:
+  groups_type groups; // copies of the nested groups passed to add()
+  options_type pr_options; // every option registered here, including those from groups
+  std::string caption;
+  // create the boost option_description, register it with the base class, and
+  // hand it back so a printable_option can refer to it
+  boost::shared_ptr<option_description>
+  simple_add(const char* name,
+             const boost::program_options::value_semantic* s,
+             const char * description = NULL)
+  {
+    typedef option_description OD;
+    boost::shared_ptr<OD> od(
+      (description ? new OD(name,s,description) : new OD(name,s))
+      );
+    options_description::add(od);
+    return od;
+  }
+};
+
+typedef printable_options_description<std::ostream> printable_opts;
+
+
+
+#endif
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
index 53e6fe50..84e95d44 100644
--- a/decoder/stringlib.h
+++ b/decoder/stringlib.h
@@ -20,6 +20,34 @@
#include <sstream>
#include <algorithm>
+// Index of the first character at or after `starting` that is not one of the
+// characters in ws; std::string::npos when there is none.
+inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") {
+  std::size_t const first_kept=s.find_first_not_of(ws,starting);
+  return first_kept;
+}
+
+// Offset one past the last character that is not in ws, searching no further
+// right than index `ending`; std::string::npos when there is no such character.
+// Together with skip_ws(s) as the start this delimits s with both ends stripped.
+inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") {
+  std::size_t const last_kept=s.find_last_not_of(ws,ending);
+  return last_kept==std::string::npos ? last_kept : last_kept+1;
+}
+
+// Copy of s with leading and trailing whitespace (as defined by skip_ws and
+// trailing_ws) removed. An empty or all-whitespace s yields an empty string:
+// both helpers return npos in that case, which must not be used as an offset
+// from s.begin().
+inline std::string strip_ws(std::string const& s) {
+  std::size_t const b=skip_ws(s);
+  if (b==std::string::npos)
+    return std::string();
+  // a non-whitespace character exists, so trailing_ws(s) is a real offset greater than b
+  return s.substr(b,trailing_ws(s)-b);
+}
+
+
+// True when line contains no '\n' at all. A terminal newline also counts as a
+// second line here; is_single_line_stripped is the variant that tolerates
+// terminal newlines/blanks.
+inline bool is_single_line(std::string const& line) {
+  return line.find('\n')==std::string::npos;
+}
+
+// Same answer as is_single_line(strip_ws(line)) without building the stripped
+// copy: a newline only counts if it lies before the end of the last
+// non-whitespace character. An all-whitespace line is a single line.
+inline bool is_single_line_stripped(std::string const& line) {
+  std::size_t const first_kept=skip_ws(line);
+  std::size_t const kept_end=trailing_ws(line);
+  std::size_t const newline_at=line.find('\n',first_kept);
+  if (newline_at==std::string::npos)
+    return true;
+  return newline_at>=kept_end;
+}
+
struct toupperc {
inline char operator()(char c) const {
return std::toupper(c);
diff --git a/decoder/trule.cc b/decoder/trule.cc
index 170e3a95..330db67f 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -4,6 +4,8 @@
#include "stringlib.h"
#include "tdict.h"
+#include "rule_lexer.h"
+#include "threadlocal.h"
using namespace std;
@@ -91,7 +93,29 @@ TRule* TRule::CreateRuleMonolingual(const string& rule) {
return new TRule(rule, false, true);
}
+namespace {
+// Number of rules the lexer has handed to assign_trule since
+// TRule::ReadFromString last reset it to 0.
+THREADLOCAL int n_assigned=0;
+
+// Callback for RuleLexer::ReadRules: copies each parsed rule into the TRule
+// passed through `extra` and counts it. ctf_level and coarse_rule are not used.
+void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) {
+  TRule *assignto=static_cast<TRule *>(extra);
+  *assignto=*new_rule;
+  // ReadFromString returns this count as its success flag and warns when it exceeds 1
+  ++n_assigned;
+}
+
+}
+
bool TRule::ReadFromString(const string& line, bool strict, bool mono) {
+ if (!is_single_line_stripped(line))
+ std::cerr<<"\nWARNING: building rule from multi-line string "<<line<<".\n";
+ if (!(mono||strict)) {
+ // use lexer
+ istringstream il(line);
+ n_assigned=0;
+ RuleLexer::ReadRules(&il,assign_trule,this);
+ if (n_assigned>1)
+ std::cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "<<line<<".\n";
+ return n_assigned;
+ }
+
e_.clear();
f_.clear();
scores_.clear();
diff --git a/decoder/trule.h b/decoder/trule.h
index 6b98a8fa..04058a41 100644
--- a/decoder/trule.h
+++ b/decoder/trule.h
@@ -37,7 +37,7 @@ class TRule {
TRule(const TRule& other) :
e_(other.e_), f_(other.f_), lhs_(other.lhs_), scores_(other.scores_), arity_(other.arity_), prev_i(-1), prev_j(-1) {}
- // deprecated - this will be private soon
+ // If mono or strict is true, the rule lexer is not used. FIXME: in that case rules with more than 9 variables won't work.
explicit TRule(const std::string& text, bool strict = false, bool mono = false) : prev_i(-1), prev_j(-1) {
ReadFromString(text, strict, mono);
}