From b84c2f7a650949dd8fa2e77ccf0362fc025b3ca6 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 11 May 2012 18:50:23 -0400 Subject: Pedantic rename to .ll for C++ lex files --- decoder/Makefile.am | 2 +- decoder/rule_lexer.l | 316 -------------------------------------------------- decoder/rule_lexer.ll | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 317 insertions(+), 317 deletions(-) delete mode 100644 decoder/rule_lexer.l create mode 100644 decoder/rule_lexer.ll diff --git a/decoder/Makefile.am b/decoder/Makefile.am index d16a9147..00d01e53 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -21,7 +21,7 @@ cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libkl AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm -rule_lexer.cc: rule_lexer.l +rule_lexer.cc: rule_lexer.ll $(LEX) -s -CF -8 -o$@ $< noinst_LIBRARIES = libcdec.a diff --git a/decoder/rule_lexer.l b/decoder/rule_lexer.l deleted file mode 100644 index 083a5bb1..00000000 --- a/decoder/rule_lexer.l +++ /dev/null @@ -1,316 +0,0 @@ -%option nounput -%{ -#include "rule_lexer.h" - -#include -#include -#include -#include -#include -#include -#include "tdict.h" -#include "fdict.h" -#include "trule.h" -#include "verbose.h" - -int lex_line = 0; -std::istream* scfglex_stream = NULL; -RuleLexer::RuleCallback rule_callback = NULL; -void* rule_callback_extra = NULL; -std::vector scfglex_phrase_fnames; - -#undef YY_INPUT -#define YY_INPUT(buf, result, max_size) (result = scfglex_stream->read(buf, max_size).gcount()) - -#define YY_SKIP_YYWRAP 1 -int num_rules = 0; -int yywrap() { return 1; } -bool fl = true; -#define MAX_TOKEN_SIZE 255 -std::string scfglex_tmp_token(MAX_TOKEN_SIZE, '\0'); - -#define MAX_RULE_SIZE 200 -WordID scfglex_src_rhs[MAX_RULE_SIZE]; -WordID scfglex_trg_rhs[MAX_RULE_SIZE]; -int scfglex_src_rhs_size; -int scfglex_trg_rhs_size; -WordID scfglex_lhs; -int scfglex_src_arity; -int scfglex_trg_arity; - -#define MAX_FEATS 100 -int scfglex_feat_ids[MAX_FEATS]; -double scfglex_feat_vals[MAX_FEATS]; -int scfglex_num_feats; - -#define MAX_ARITY 200 -int scfglex_nt_sanity[MAX_ARITY]; -int scfglex_src_nts[MAX_ARITY]; -// float scfglex_nt_size_means[MAX_ARITY]; -// float scfglex_nt_size_vars[MAX_ARITY]; -std::stack ctf_rule_stack; -unsigned int ctf_level = 0; - -#define MAX_ALS 200 -AlignmentPoint scfglex_als[MAX_ALS]; -int scfglex_num_als; - -void sanity_check_trg_symbol(WordID nt, int index) { - if (scfglex_src_nts[index-1] != nt) { - std::cerr << "Target symbol with index " << index << " is of type " << TD::Convert(nt*-1) - << " but corresponding source is of type " - << TD::Convert(scfglex_src_nts[index-1] * -1) << std::endl; - abort(); - } -} - -void sanity_check_trg_index(int index) { - if (index > scfglex_src_arity) { - std::cerr << "Target index " << index << " exceeds source arity " << scfglex_src_arity << std::endl; - abort(); - } - int& flag = scfglex_nt_sanity[index - 1]; - if (flag) { - std::cerr << "Target index " << index << " used multiple times!" << std::endl; - abort(); - } - flag = 1; -} - -void scfglex_reset() { - scfglex_src_arity = 0; - scfglex_trg_arity = 0; - scfglex_num_feats = 0; - scfglex_src_rhs_size = 0; - scfglex_trg_rhs_size = 0; - scfglex_num_als = 0; -} - -void check_and_update_ctf_stack(const TRulePtr& rp) { - if (ctf_level > ctf_rule_stack.size()){ - std::cerr << "Found rule at projection level " << ctf_level << " but previous rule was at level " - << ctf_rule_stack.size()-1 << " (cannot exceed previous level by more than one; line " << lex_line << ")" << std::endl; - abort(); - } - while (ctf_rule_stack.size() > ctf_level) - ctf_rule_stack.pop(); - // ensure that rule has the same signature as parent (coarse) rule. Rules may *only* - // differ by the rhs nonterminals, not terminals or permutation of nonterminals. - if (ctf_rule_stack.size() > 0) { - TRulePtr& coarse_rp = ctf_rule_stack.top(); - if (rp->f_.size() != coarse_rp->f_.size() || rp->e_ != coarse_rp->e_) { - std::cerr << "Rule " << (rp->AsString()) << " is not a projection of " << - (coarse_rp->AsString()) << std::endl; - abort(); - } - for (int i=0; if_.size(); ++i) { - if (((rp->f_[i]<0) != (coarse_rp->f_[i]<0)) || - ((rp->f_[i]>0) && (rp->f_[i] != coarse_rp->f_[i]))) { - std::cerr << "Rule " << (rp->AsString()) << " is not a projection of " << - (coarse_rp->AsString()) << std::endl; - abort(); - } - } - } -} - -%} - -REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf -NT [^\t \[\],]+ - -%x LHS_END SRC TRG FEATS FEATVAL ALIGNS -%% - -[ \t] { - ctf_level++; - }; - -\[{NT}\] { - scfglex_tmp_token.assign(yytext + 1, yyleng - 2); - scfglex_lhs = -TD::Convert(scfglex_tmp_token); - // std::cerr << scfglex_tmp_token << "\n"; - BEGIN(LHS_END); - } - -\[{NT}\] { - scfglex_tmp_token.assign(yytext + 1, yyleng - 2); - scfglex_src_nts[scfglex_src_arity] = scfglex_src_rhs[scfglex_src_rhs_size] = -TD::Convert(scfglex_tmp_token); - ++scfglex_src_arity; - ++scfglex_src_rhs_size; - } - -\[{NT},[1-9][0-9]?\] { - int index = yytext[yyleng - 2] - '0'; - if (yytext[yyleng - 3] == ',') { - scfglex_tmp_token.assign(yytext + 1, yyleng - 4); - } else { - scfglex_tmp_token.assign(yytext + 1, yyleng - 5); - index += 10 * (yytext[yyleng - 3] - '0'); - } - if ((scfglex_src_arity+1) != index) { - std::cerr << "Src indices must go in order: expected " << scfglex_src_arity << " but got " << index << std::endl; - abort(); - } - scfglex_src_nts[scfglex_src_arity] = scfglex_src_rhs[scfglex_src_rhs_size] = -TD::Convert(scfglex_tmp_token); - ++scfglex_src_rhs_size; - ++scfglex_src_arity; - } - -\[{NT},[1-9][0-9]?\] { - int index = yytext[yyleng - 2] - '0'; - if (yytext[yyleng - 3] == ',') { - scfglex_tmp_token.assign(yytext + 1, yyleng - 4); - } else { - scfglex_tmp_token.assign(yytext + 1, yyleng - 5); - index += 10 * (yytext[yyleng - 3] - '0'); - } - ++scfglex_trg_arity; - // std::cerr << "TRG INDEX: " << index << std::endl; - sanity_check_trg_symbol(-TD::Convert(scfglex_tmp_token), index); - sanity_check_trg_index(index); - scfglex_trg_rhs[scfglex_trg_rhs_size] = 1 - index; - ++scfglex_trg_rhs_size; -} - -\[[1-9][0-9]?\] { - int index = yytext[yyleng - 2] - '0'; - if (yyleng == 4) { - index += 10 * (yytext[yyleng - 3] - '0'); - } - ++scfglex_trg_arity; - sanity_check_trg_index(index); - scfglex_trg_rhs[scfglex_trg_rhs_size] = 1 - index; - ++scfglex_trg_rhs_size; -} - -[ \t] { ; } -\|\|\| { - scfglex_reset(); - BEGIN(SRC); - } -. { - std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl; - abort(); - } - -\|\|\| { - memset(scfglex_nt_sanity, 0, scfglex_src_arity * sizeof(int)); - BEGIN(TRG); - } -[^ \t]+ { - scfglex_tmp_token.assign(yytext, yyleng); - scfglex_src_rhs[scfglex_src_rhs_size] = TD::Convert(scfglex_tmp_token); - ++scfglex_src_rhs_size; - } -[ \t]+ { ; } - -\|\|\| { - BEGIN(FEATS); - } -[^ \t]+ { - scfglex_tmp_token.assign(yytext, yyleng); - scfglex_trg_rhs[scfglex_trg_rhs_size] = TD::Convert(scfglex_tmp_token); - ++scfglex_trg_rhs_size; - } -[ \t]+ { ; } - -\n { - if (scfglex_src_arity != scfglex_trg_arity) { - std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n"; - abort(); - } - // const bool ignore_grammar_features = false; - // if (ignore_grammar_features) scfglex_num_feats = 0; - TRulePtr rp(new TRule(scfglex_lhs, scfglex_src_rhs, scfglex_src_rhs_size, scfglex_trg_rhs, scfglex_trg_rhs_size, scfglex_feat_ids, scfglex_feat_vals, scfglex_num_feats, scfglex_src_arity, scfglex_als, scfglex_num_als)); - check_and_update_ctf_stack(rp); - TRulePtr coarse_rp = ((ctf_level == 0) ? TRulePtr() : ctf_rule_stack.top()); - rule_callback(rp, ctf_level, coarse_rp, rule_callback_extra); - ctf_rule_stack.push(rp); - // std::cerr << rp->AsString() << std::endl; - num_rules++; - lex_line++; - if (!SILENT) { - if (num_rules % 50000 == 0) { std::cerr << '.' << std::flush; fl = true; } - if (num_rules % 2000000 == 0) { std::cerr << " [" << num_rules << "]\n"; fl = false; } - } - ctf_level = 0; - BEGIN(INITIAL); - } - -[ \t;] { ; } -[^ \t=;]+= { - scfglex_tmp_token.assign(yytext, yyleng - 1); - const int fid = FD::Convert(scfglex_tmp_token); - if (fid < 1) { - std::cerr << "\nUNWEIGHED FEATURE " << scfglex_tmp_token << std::endl; - abort(); - } - scfglex_feat_ids[scfglex_num_feats] = fid; - BEGIN(FEATVAL); - } -\|\|\| { - BEGIN(ALIGNS); - } -{REAL} { - scfglex_feat_vals[scfglex_num_feats] = strtod(yytext, NULL); - ++scfglex_num_feats; - BEGIN(FEATS); - } -. { - std::cerr << "Line " << lex_line << ": unexpected input in feature value: " << yytext << std::endl; - abort(); - } -{REAL} { - scfglex_feat_ids[scfglex_num_feats] = scfglex_phrase_fnames[scfglex_num_feats]; - scfglex_feat_vals[scfglex_num_feats] = strtod(yytext, NULL); - ++scfglex_num_feats; - } -. { - std::cerr << "Line " << lex_line << " unexpected input in features: " << yytext << std::endl; - abort(); - } -[0-9]+-[0-9]+ { - int i = 0; - int a = 0; - int b = 0; - while (i < yyleng) { - char c = yytext[i]; - if (c == '-') break; - a *= 10; - a += c - '0'; - ++i; - } - ++i; - while (i < yyleng) { - b *= 10; - b += yytext[i] - '0'; - ++i; - } - scfglex_als[scfglex_num_als++]=AlignmentPoint(a,b); - } -[ \t] ; -. { - std::cerr << "Line " << lex_line << ": unexpected input in alignment: " << yytext << std::endl; - abort(); - } -%% - -#include "filelib.h" - -void RuleLexer::ReadRules(std::istream* in, RuleLexer::RuleCallback func, void* extra) { - if (scfglex_phrase_fnames.empty()) { - scfglex_phrase_fnames.resize(100); - for (int i = 0; i < scfglex_phrase_fnames.size(); ++i) { - std::ostringstream os; - os << "PhraseModel_" << i; - scfglex_phrase_fnames[i] = FD::Convert(os.str()); - } - } - lex_line = 1; - scfglex_stream = in; - rule_callback_extra = extra, - rule_callback = func; - yylex(); -} - diff --git a/decoder/rule_lexer.ll b/decoder/rule_lexer.ll new file mode 100644 index 00000000..083a5bb1 --- /dev/null +++ b/decoder/rule_lexer.ll @@ -0,0 +1,316 @@ +%option nounput +%{ +#include "rule_lexer.h" + +#include +#include +#include +#include +#include +#include +#include "tdict.h" +#include "fdict.h" +#include "trule.h" +#include "verbose.h" + +int lex_line = 0; +std::istream* scfglex_stream = NULL; +RuleLexer::RuleCallback rule_callback = NULL; +void* rule_callback_extra = NULL; +std::vector scfglex_phrase_fnames; + +#undef YY_INPUT +#define YY_INPUT(buf, result, max_size) (result = scfglex_stream->read(buf, max_size).gcount()) + +#define YY_SKIP_YYWRAP 1 +int num_rules = 0; +int yywrap() { return 1; } +bool fl = true; +#define MAX_TOKEN_SIZE 255 +std::string scfglex_tmp_token(MAX_TOKEN_SIZE, '\0'); + +#define MAX_RULE_SIZE 200 +WordID scfglex_src_rhs[MAX_RULE_SIZE]; +WordID scfglex_trg_rhs[MAX_RULE_SIZE]; +int scfglex_src_rhs_size; +int scfglex_trg_rhs_size; +WordID scfglex_lhs; +int scfglex_src_arity; +int scfglex_trg_arity; + +#define MAX_FEATS 100 +int scfglex_feat_ids[MAX_FEATS]; +double scfglex_feat_vals[MAX_FEATS]; +int scfglex_num_feats; + +#define MAX_ARITY 200 +int scfglex_nt_sanity[MAX_ARITY]; +int scfglex_src_nts[MAX_ARITY]; +// float scfglex_nt_size_means[MAX_ARITY]; +// float scfglex_nt_size_vars[MAX_ARITY]; +std::stack ctf_rule_stack; +unsigned int ctf_level = 0; + +#define MAX_ALS 200 +AlignmentPoint scfglex_als[MAX_ALS]; +int scfglex_num_als; + +void sanity_check_trg_symbol(WordID nt, int index) { + if (scfglex_src_nts[index-1] != nt) { + std::cerr << "Target symbol with index " << index << " is of type " << TD::Convert(nt*-1) + << " but corresponding source is of type " + << TD::Convert(scfglex_src_nts[index-1] * -1) << std::endl; + abort(); + } +} + +void sanity_check_trg_index(int index) { + if (index > scfglex_src_arity) { + std::cerr << "Target index " << index << " exceeds source arity " << scfglex_src_arity << std::endl; + abort(); + } + int& flag = scfglex_nt_sanity[index - 1]; + if (flag) { + std::cerr << "Target index " << index << " used multiple times!" << std::endl; + abort(); + } + flag = 1; +} + +void scfglex_reset() { + scfglex_src_arity = 0; + scfglex_trg_arity = 0; + scfglex_num_feats = 0; + scfglex_src_rhs_size = 0; + scfglex_trg_rhs_size = 0; + scfglex_num_als = 0; +} + +void check_and_update_ctf_stack(const TRulePtr& rp) { + if (ctf_level > ctf_rule_stack.size()){ + std::cerr << "Found rule at projection level " << ctf_level << " but previous rule was at level " + << ctf_rule_stack.size()-1 << " (cannot exceed previous level by more than one; line " << lex_line << ")" << std::endl; + abort(); + } + while (ctf_rule_stack.size() > ctf_level) + ctf_rule_stack.pop(); + // ensure that rule has the same signature as parent (coarse) rule. Rules may *only* + // differ by the rhs nonterminals, not terminals or permutation of nonterminals. + if (ctf_rule_stack.size() > 0) { + TRulePtr& coarse_rp = ctf_rule_stack.top(); + if (rp->f_.size() != coarse_rp->f_.size() || rp->e_ != coarse_rp->e_) { + std::cerr << "Rule " << (rp->AsString()) << " is not a projection of " << + (coarse_rp->AsString()) << std::endl; + abort(); + } + for (int i=0; if_.size(); ++i) { + if (((rp->f_[i]<0) != (coarse_rp->f_[i]<0)) || + ((rp->f_[i]>0) && (rp->f_[i] != coarse_rp->f_[i]))) { + std::cerr << "Rule " << (rp->AsString()) << " is not a projection of " << + (coarse_rp->AsString()) << std::endl; + abort(); + } + } + } +} + +%} + +REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf +NT [^\t \[\],]+ + +%x LHS_END SRC TRG FEATS FEATVAL ALIGNS +%% + +[ \t] { + ctf_level++; + }; + +\[{NT}\] { + scfglex_tmp_token.assign(yytext + 1, yyleng - 2); + scfglex_lhs = -TD::Convert(scfglex_tmp_token); + // std::cerr << scfglex_tmp_token << "\n"; + BEGIN(LHS_END); + } + +\[{NT}\] { + scfglex_tmp_token.assign(yytext + 1, yyleng - 2); + scfglex_src_nts[scfglex_src_arity] = scfglex_src_rhs[scfglex_src_rhs_size] = -TD::Convert(scfglex_tmp_token); + ++scfglex_src_arity; + ++scfglex_src_rhs_size; + } + +\[{NT},[1-9][0-9]?\] { + int index = yytext[yyleng - 2] - '0'; + if (yytext[yyleng - 3] == ',') { + scfglex_tmp_token.assign(yytext + 1, yyleng - 4); + } else { + scfglex_tmp_token.assign(yytext + 1, yyleng - 5); + index += 10 * (yytext[yyleng - 3] - '0'); + } + if ((scfglex_src_arity+1) != index) { + std::cerr << "Src indices must go in order: expected " << scfglex_src_arity << " but got " << index << std::endl; + abort(); + } + scfglex_src_nts[scfglex_src_arity] = scfglex_src_rhs[scfglex_src_rhs_size] = -TD::Convert(scfglex_tmp_token); + ++scfglex_src_rhs_size; + ++scfglex_src_arity; + } + +\[{NT},[1-9][0-9]?\] { + int index = yytext[yyleng - 2] - '0'; + if (yytext[yyleng - 3] == ',') { + scfglex_tmp_token.assign(yytext + 1, yyleng - 4); + } else { + scfglex_tmp_token.assign(yytext + 1, yyleng - 5); + index += 10 * (yytext[yyleng - 3] - '0'); + } + ++scfglex_trg_arity; + // std::cerr << "TRG INDEX: " << index << std::endl; + sanity_check_trg_symbol(-TD::Convert(scfglex_tmp_token), index); + sanity_check_trg_index(index); + scfglex_trg_rhs[scfglex_trg_rhs_size] = 1 - index; + ++scfglex_trg_rhs_size; +} + +\[[1-9][0-9]?\] { + int index = yytext[yyleng - 2] - '0'; + if (yyleng == 4) { + index += 10 * (yytext[yyleng - 3] - '0'); + } + ++scfglex_trg_arity; + sanity_check_trg_index(index); + scfglex_trg_rhs[scfglex_trg_rhs_size] = 1 - index; + ++scfglex_trg_rhs_size; +} + +[ \t] { ; } +\|\|\| { + scfglex_reset(); + BEGIN(SRC); + } +. { + std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl; + abort(); + } + +\|\|\| { + memset(scfglex_nt_sanity, 0, scfglex_src_arity * sizeof(int)); + BEGIN(TRG); + } +[^ \t]+ { + scfglex_tmp_token.assign(yytext, yyleng); + scfglex_src_rhs[scfglex_src_rhs_size] = TD::Convert(scfglex_tmp_token); + ++scfglex_src_rhs_size; + } +[ \t]+ { ; } + +\|\|\| { + BEGIN(FEATS); + } +[^ \t]+ { + scfglex_tmp_token.assign(yytext, yyleng); + scfglex_trg_rhs[scfglex_trg_rhs_size] = TD::Convert(scfglex_tmp_token); + ++scfglex_trg_rhs_size; + } +[ \t]+ { ; } + +\n { + if (scfglex_src_arity != scfglex_trg_arity) { + std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n"; + abort(); + } + // const bool ignore_grammar_features = false; + // if (ignore_grammar_features) scfglex_num_feats = 0; + TRulePtr rp(new TRule(scfglex_lhs, scfglex_src_rhs, scfglex_src_rhs_size, scfglex_trg_rhs, scfglex_trg_rhs_size, scfglex_feat_ids, scfglex_feat_vals, scfglex_num_feats, scfglex_src_arity, scfglex_als, scfglex_num_als)); + check_and_update_ctf_stack(rp); + TRulePtr coarse_rp = ((ctf_level == 0) ? TRulePtr() : ctf_rule_stack.top()); + rule_callback(rp, ctf_level, coarse_rp, rule_callback_extra); + ctf_rule_stack.push(rp); + // std::cerr << rp->AsString() << std::endl; + num_rules++; + lex_line++; + if (!SILENT) { + if (num_rules % 50000 == 0) { std::cerr << '.' << std::flush; fl = true; } + if (num_rules % 2000000 == 0) { std::cerr << " [" << num_rules << "]\n"; fl = false; } + } + ctf_level = 0; + BEGIN(INITIAL); + } + +[ \t;] { ; } +[^ \t=;]+= { + scfglex_tmp_token.assign(yytext, yyleng - 1); + const int fid = FD::Convert(scfglex_tmp_token); + if (fid < 1) { + std::cerr << "\nUNWEIGHED FEATURE " << scfglex_tmp_token << std::endl; + abort(); + } + scfglex_feat_ids[scfglex_num_feats] = fid; + BEGIN(FEATVAL); + } +\|\|\| { + BEGIN(ALIGNS); + } +{REAL} { + scfglex_feat_vals[scfglex_num_feats] = strtod(yytext, NULL); + ++scfglex_num_feats; + BEGIN(FEATS); + } +. { + std::cerr << "Line " << lex_line << ": unexpected input in feature value: " << yytext << std::endl; + abort(); + } +{REAL} { + scfglex_feat_ids[scfglex_num_feats] = scfglex_phrase_fnames[scfglex_num_feats]; + scfglex_feat_vals[scfglex_num_feats] = strtod(yytext, NULL); + ++scfglex_num_feats; + } +. { + std::cerr << "Line " << lex_line << " unexpected input in features: " << yytext << std::endl; + abort(); + } +[0-9]+-[0-9]+ { + int i = 0; + int a = 0; + int b = 0; + while (i < yyleng) { + char c = yytext[i]; + if (c == '-') break; + a *= 10; + a += c - '0'; + ++i; + } + ++i; + while (i < yyleng) { + b *= 10; + b += yytext[i] - '0'; + ++i; + } + scfglex_als[scfglex_num_als++]=AlignmentPoint(a,b); + } +[ \t] ; +. { + std::cerr << "Line " << lex_line << ": unexpected input in alignment: " << yytext << std::endl; + abort(); + } +%% + +#include "filelib.h" + +void RuleLexer::ReadRules(std::istream* in, RuleLexer::RuleCallback func, void* extra) { + if (scfglex_phrase_fnames.empty()) { + scfglex_phrase_fnames.resize(100); + for (int i = 0; i < scfglex_phrase_fnames.size(); ++i) { + std::ostringstream os; + os << "PhraseModel_" << i; + scfglex_phrase_fnames[i] = FD::Convert(os.str()); + } + } + lex_line = 1; + scfglex_stream = in; + rule_callback_extra = extra, + rule_callback = func; + yylex(); +} + -- cgit v1.2.3