diff options
Diffstat (limited to 'extools/sg_lexer.l')
-rw-r--r-- | extools/sg_lexer.l | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/extools/sg_lexer.l b/extools/sg_lexer.l new file mode 100644 index 00000000..f115e5bd --- /dev/null +++ b/extools/sg_lexer.l @@ -0,0 +1,242 @@ +%{ +#include "rule_lexer.h" + +#include <string> +#include <iostream> +#include <sstream> +#include <cstring> +#include <cassert> +#include "tdict.h" +#include "fdict.h" +#include "trule.h" +#include "striped_grammar.h" + +int lex_line = 0; +std::istream* sglex_stream = NULL; +StripedGrammarLexer::GrammarCallback grammar_callback = NULL; +void* grammar_callback_extra = NULL; + +#undef YY_INPUT +#define YY_INPUT(buf, result, max_size) (result = sglex_stream->read(buf, max_size).gcount()) + +#define YY_SKIP_YYWRAP 1 +int num_rules = 0; +int yywrap() { return 1; } +bool fl = true; +#define MAX_TOKEN_SIZE 255 +std::string sglex_tmp_token(MAX_TOKEN_SIZE, '\0'); + +#define MAX_RULE_SIZE 48 +WordID sglex_src_rhs[MAX_RULE_SIZE]; +WordID sglex_trg_rhs[MAX_RULE_SIZE]; +int sglex_src_rhs_size; +int sglex_trg_rhs_size; +WordID sglex_lhs; +int sglex_src_arity; +int sglex_trg_arity; + +#define MAX_FEATS 100 +int sglex_feat_ids[MAX_FEATS]; +double sglex_feat_vals[MAX_FEATS]; +int sglex_num_feats; + +#define MAX_ARITY 20 +int sglex_nt_sanity[MAX_ARITY]; +int sglex_src_nts[MAX_ARITY]; +float sglex_nt_size_means[MAX_ARITY]; +float sglex_nt_size_vars[MAX_ARITY]; + +std::vector<WordID> cur_src_rhs; +std::vector<WordID> cur_trg_rhs; +ID2RuleStatistics cur_options; +RuleStatistics* cur_stats = NULL; +int sglex_cur_fid = 0; + +static void sanity_check_trg_index(int index) { + if (index > sglex_src_arity) { + std::cerr << "Target index " << index << " exceeds source arity " << sglex_src_arity << std::endl; + abort(); + } + int& flag = sglex_nt_sanity[index - 1]; + if (flag) { + std::cerr << "Target index " << index << " used multiple times!" << std::endl; + abort(); + } + flag = 1; +} + +static void sglex_reset() { + sglex_src_arity = 0; + sglex_trg_arity = 0; + sglex_num_feats = 0; + sglex_src_rhs_size = 0; + sglex_trg_rhs_size = 0; +} + +%} + +REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf +NT [\-#$A-Z_:=.",\\][\-#$".A-Z+/=_0-9!:@\\]* +ALIGN [0-9]+-[0-9]+ + +%x LHS_END SRC TRG FEATS FEATVAL ALIGNS +%% + +<INITIAL>[ ] ; + +<INITIAL>\[{NT}\] { + sglex_tmp_token.assign(yytext + 1, yyleng - 2); + sglex_lhs = -TD::Convert(sglex_tmp_token); + // std::cerr << sglex_tmp_token << "\n"; + BEGIN(LHS_END); + } + +<SRC>\[{NT}\] { + sglex_tmp_token.assign(yytext + 1, yyleng - 2); + sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token); + ++sglex_src_arity; + ++sglex_src_rhs_size; + } + +<LHS_END>[ ] { ; } +<LHS_END>\|\|\| { + sglex_reset(); + BEGIN(SRC); + } +<INITIAL,LHS_END>. { + std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl; + exit(1); + } + + +<SRC>\[{NT},[1-9][0-9]?\] { + int index = yytext[yyleng - 2] - '0'; + if (yytext[yyleng - 3] == ',') { + sglex_tmp_token.assign(yytext + 1, yyleng - 4); + } else { + sglex_tmp_token.assign(yytext + 1, yyleng - 5); + index += 10 * (yytext[yyleng - 3] - '0'); + } + if ((sglex_src_arity+1) != index) { + std::cerr << "Src indices must go in order: expected " << sglex_src_arity << " but got " << index << std::endl; + abort(); + } + sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token); + ++sglex_src_rhs_size; + ++sglex_src_arity; + } + +<SRC>[^ \t]+ { + sglex_tmp_token.assign(yytext, yyleng); + sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token); + ++sglex_src_rhs_size; + } +<SRC>[ ] { ; } +<SRC>\t { + //std::cerr << "LHS=" << TD::Convert(-sglex_lhs) << " "; + //std::cerr << " src_size: " << sglex_src_rhs_size << std::endl; + //std::cerr << " src_arity: " << sglex_src_arity << std::endl; + memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int)); + cur_options.clear(); + sglex_trg_rhs_size = 0; + BEGIN(TRG); + } + +<TRG>\[[1-9][0-9]?\] { + int index = yytext[yyleng - 2] - '0'; + if (yyleng == 4) { + index += 10 * (yytext[yyleng - 3] - '0'); + } + ++sglex_trg_arity; + sanity_check_trg_index(index); + sglex_trg_rhs[sglex_trg_rhs_size] = 1 - index; + ++sglex_trg_rhs_size; +} + +<TRG>\|\|\| { + assert(sglex_trg_rhs_size > 0); + cur_trg_rhs.resize(sglex_trg_rhs_size); + for (int i = 0; i < sglex_trg_rhs_size; ++i) + cur_trg_rhs[i] = sglex_trg_rhs[i]; + cur_stats = &cur_options[cur_trg_rhs]; + BEGIN(FEATS); + } + +<TRG>[^ ]+ { + sglex_tmp_token.assign(yytext, yyleng); + sglex_trg_rhs[sglex_trg_rhs_size] = TD::Convert(sglex_tmp_token); + + ++sglex_trg_rhs_size; + } +<TRG>[ ]+ { ; } + +<FEATS>\n { + assert(sglex_lhs < 0); + assert(sglex_src_rhs_size > 0); + cur_src_rhs.resize(sglex_src_rhs_size); + for (int i = 0; i < sglex_src_rhs_size; ++i) + cur_src_rhs[i] = sglex_src_rhs[i]; + grammar_callback(sglex_lhs, cur_src_rhs, cur_options, grammar_callback_extra); + cur_options.clear(); + BEGIN(INITIAL); + } +<FEATS>[ ]+ { ; } +<FEATS>\|\|\| { + memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int)); + sglex_trg_rhs_size = 0; + BEGIN(TRG); + } +<FEATS>[A-Z][A-Z_0-9]*= { + // std::cerr << "FV: " << yytext << std::endl; + sglex_tmp_token.assign(yytext, yyleng - 1); + sglex_cur_fid = FD::Convert(sglex_tmp_token); + static const int Afid = FD::Convert("A"); + if (sglex_cur_fid == Afid) { + BEGIN(ALIGNS); + } else { + BEGIN(FEATVAL); + } + } +<FEATVAL>{REAL} { + // std::cerr << "Feature val input: " << yytext << std::endl; + cur_stats->counts.set_value(sglex_cur_fid, strtod(yytext, NULL)); + BEGIN(FEATS); + } +<FEATVAL>. { + std::cerr << "Feature val unexpected input: " << yytext << std::endl; + exit(1); + } +<FEATS>. { + std::cerr << "Features unexpected input: " << yytext << std::endl; + exit(1); + } +<ALIGNS>{ALIGN}(,{ALIGN})* { + assert(cur_stats->aligns.empty()); + int i = 0; + while(i < yyleng) { + short a = 0; + short b = 0; + while (yytext[i] != '-') { a *= 10; a += yytext[i] - '0'; ++i; } + ++i; + while (yytext[i] != ',' && i < yyleng) { b *= 10; b += yytext[i] - '0'; ++i; } + ++i; + cur_stats->aligns.push_back(std::make_pair(a,b)); + } + BEGIN(FEATS); + } +<ALIGNS>. { + std::cerr << "Aligns unexpected input: " << yytext << std::endl; + exit(1); + } +%% + +#include "filelib.h" + +void StripedGrammarLexer::ReadStripedGrammar(std::istream* in, GrammarCallback func, void* extra) { + lex_line = 1; + sglex_stream = in; + grammar_callback_extra = extra; + grammar_callback = func; + yylex(); +} + |