%{
// Flex scanner for "striped" grammar / context files.
//
// Grammar mode (read_contexts == 0), one rule group per line:
//   [LHS] ||| src tokens <TAB> trg tokens ||| FEAT=val ... ||| trg tokens ||| FEAT=val ... <NL>
// Context mode (read_contexts == 1), one context per line:
//   src tokens <TAB> trg tokens ||| FEAT=val ... ||| trg tokens ||| FEAT=val ... <NL>
//
// At the end of each line the accumulated source side and all target options
// are handed to the registered callback.

#include <cassert>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "tdict.h"
#include "fdict.h"
#include "striped_grammar.h"

int lex_line = 0;
int read_contexts = 0;
std::istream* sglex_stream = NULL;
StripedGrammarLexer::GrammarCallback grammar_callback = NULL;
StripedGrammarLexer::ContextCallback context_callback = NULL;
void* grammar_callback_extra = NULL;
void* context_callback_extra = NULL;

// Pull scanner input from sglex_stream instead of yyin.
#undef YY_INPUT
#define YY_INPUT(buf, result, max_size) (result = sglex_stream->read(buf, max_size).gcount())

#define YY_SKIP_YYWRAP 1
int num_rules = 0;
int yywrap() { return 1; }
bool fl = true;
#define MAX_TOKEN_SIZE 255
std::string sglex_tmp_token(MAX_TOKEN_SIZE, '\0');

#define MAX_RULE_SIZE 48
WordID sglex_src_rhs[MAX_RULE_SIZE];
WordID sglex_trg_rhs[MAX_RULE_SIZE];
int sglex_src_rhs_size;
int sglex_trg_rhs_size;
WordID sglex_lhs;
int sglex_src_arity;
int sglex_trg_arity;

#define MAX_FEATS 100
int sglex_feat_ids[MAX_FEATS];
double sglex_feat_vals[MAX_FEATS];
int sglex_num_feats;

#define MAX_ARITY 20
int sglex_nt_sanity[MAX_ARITY];
int sglex_src_nts[MAX_ARITY];
float sglex_nt_size_means[MAX_ARITY];
float sglex_nt_size_vars[MAX_ARITY];

std::vector<WordID> cur_src_rhs;
std::vector<WordID> cur_trg_rhs;
ID2RuleStatistics cur_options;
RuleStatistics* cur_stats = NULL;
int sglex_cur_fid = 0;

// Verifies that a target-side index refers to an existing source non-terminal
// and has not been used before on this target side; aborts otherwise.
static void sanity_check_trg_index(int index) {
  if (index < 1 || index > sglex_src_arity) {
    std::cerr << "Target index " << index << " exceeds source arity " << sglex_src_arity << std::endl;
    abort();
  }
  int& flag = sglex_nt_sanity[index - 1];
  if (flag) {
    std::cerr << "Target index " << index << " used multiple times!" << std::endl;
    abort();
  }
  flag = 1;
}

// Clears the per-line accumulators (sizes, arities, feature count).
static void sglex_reset() {
  sglex_src_arity = 0;
  sglex_trg_arity = 0;
  sglex_num_feats = 0;
  sglex_src_rhs_size = 0;
  sglex_trg_rhs_size = 0;
}

// Appends a symbol to the source side, refusing to write past the fixed buffer.
static void sglex_push_src(WordID w) {
  if (sglex_src_rhs_size >= MAX_RULE_SIZE) {
    std::cerr << "Line " << lex_line << ": source side exceeds " << MAX_RULE_SIZE << " symbols\n";
    exit(1);
  }
  sglex_src_rhs[sglex_src_rhs_size] = w;
  ++sglex_src_rhs_size;
}

// Appends a symbol to the target side, refusing to write past the fixed buffer.
static void sglex_push_trg(WordID w) {
  if (sglex_trg_rhs_size >= MAX_RULE_SIZE) {
    std::cerr << "Line " << lex_line << ": target side exceeds " << MAX_RULE_SIZE << " symbols\n";
    exit(1);
  }
  sglex_trg_rhs[sglex_trg_rhs_size] = w;
  ++sglex_trg_rhs_size;
}

// Records a source non-terminal both in the NT table and on the source side.
// Bounding the arity here also keeps sglex_nt_sanity accesses in range.
static void sglex_push_src_nt(WordID w) {
  if (sglex_src_arity >= MAX_ARITY) {
    std::cerr << "Line " << lex_line << ": source arity exceeds " << MAX_ARITY << "\n";
    exit(1);
  }
  sglex_src_nts[sglex_src_arity] = w;
  ++sglex_src_arity;
  sglex_push_src(w);
}

%}

REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf
NT [^\t \[\],]+
ALIGN [0-9]+-[0-9]+

%x LHS_END SRC TRG FEATS FEATVAL ALIGNS
%%

<INITIAL>[ ]   ;

<INITIAL>[\t]   {
    // In context mode a tab ends the source side and starts the first target.
    if (read_contexts) {
      cur_options.clear();
      BEGIN(TRG);
    } else {
      std::cerr << "Unexpected tab while reading striped grammar\n";
      exit(1);
    }
  }

<INITIAL>\[{NT}\]   {
    if (read_contexts) {
      // Contexts keep the bracketed token verbatim as a source symbol.
      sglex_tmp_token.assign(yytext, yyleng);
      sglex_push_src(TD::Convert(sglex_tmp_token));
    } else {
      // Grammar mode: strip the brackets; the LHS is stored negated.
      sglex_tmp_token.assign(yytext + 1, yyleng - 2);
      sglex_lhs = -TD::Convert(sglex_tmp_token);
      // std::cerr << sglex_tmp_token << "\n";
      BEGIN(LHS_END);
    }
  }

<INITIAL>[^ \t]+   {
    if (read_contexts) {
      // std::cerr << "Context: " << yytext << std::endl;
      sglex_tmp_token.assign(yytext, yyleng);
      sglex_push_src(TD::Convert(sglex_tmp_token));
    } else {
      std::cerr << "Unexpected input: " << yytext << " when NT expected\n";
      exit(1);
    }
  }

<SRC>\[{NT}\]   {
    // Un-indexed source non-terminal.
    sglex_tmp_token.assign(yytext + 1, yyleng - 2);
    sglex_push_src_nt(-TD::Convert(sglex_tmp_token));
  }

<LHS_END>[ ]   { ; }

<LHS_END>\|\|\|   {
    sglex_reset();
    BEGIN(SRC);
  }

<LHS_END>.   {
    std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl;
    exit(1);
  }

<SRC>\[{NT},[1-9][0-9]?\]   {
    // Indexed source non-terminal "[X,i]" with a one- or two-digit index.
    int index = yytext[yyleng - 2] - '0';
    if (yytext[yyleng - 3] == ',') {
      sglex_tmp_token.assign(yytext + 1, yyleng - 4);
    } else {
      sglex_tmp_token.assign(yytext + 1, yyleng - 5);
      index += 10 * (yytext[yyleng - 3] - '0');
    }
    if ((sglex_src_arity + 1) != index) {
      std::cerr << "Src indices must go in order: expected " << (sglex_src_arity + 1) << " but got " << index << std::endl;
      abort();
    }
    sglex_push_src_nt(-TD::Convert(sglex_tmp_token));
  }

<SRC>[^ \t]+   {
    sglex_tmp_token.assign(yytext, yyleng);
    sglex_push_src(TD::Convert(sglex_tmp_token));
  }

<SRC>[ ]   { ; }

<SRC>\t   {
    //std::cerr << "LHS=" << TD::Convert(-sglex_lhs) << " ";
    //std::cerr << " src_size: " << sglex_src_rhs_size << std::endl;
    //std::cerr << " src_arity: " << sglex_src_arity << std::endl;
    cur_options.clear();
    memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int));
    sglex_trg_rhs_size = 0;
    BEGIN(TRG);
  }

<TRG>\[[1-9][0-9]?\]   {
    if (read_contexts) {
      sglex_tmp_token.assign(yytext, yyleng);
      sglex_push_trg(TD::Convert(sglex_tmp_token));
    } else {
      // "[i]" or "[ij]": a back-reference to the i-th source non-terminal,
      // stored as the non-positive value 1 - i.
      int index = yytext[yyleng - 2] - '0';
      if (yyleng == 4) {
        index += 10 * (yytext[yyleng - 3] - '0');
      }
      ++sglex_trg_arity;
      sanity_check_trg_index(index);
      sglex_push_trg(1 - index);
    }
  }

<TRG>\|\|\|   {
    //std::cerr << " trg_size: " << sglex_trg_rhs_size << std::endl;
    //std::cerr << " trg_arity: " << sglex_trg_arity << std::endl;
    assert(sglex_trg_rhs_size > 0);
    cur_trg_rhs.assign(sglex_trg_rhs, sglex_trg_rhs + sglex_trg_rhs_size);
    // Features that follow are accumulated into this target's statistics.
    cur_stats = &cur_options[cur_trg_rhs];
    BEGIN(FEATS);
  }

<TRG>[^ ]+   {
    sglex_tmp_token.assign(yytext, yyleng);
    sglex_push_trg(TD::Convert(sglex_tmp_token));
  }

<TRG>[ ]+   { ; }

<FEATS>\n   {
    // End of line: deliver the source side and all collected target options.
    assert(sglex_src_rhs_size > 0);
    cur_src_rhs.assign(sglex_src_rhs, sglex_src_rhs + sglex_src_rhs_size);
    if (read_contexts) {
      context_callback(cur_src_rhs, cur_options, context_callback_extra);
    } else {
      assert(sglex_lhs < 0);
      grammar_callback(sglex_lhs, cur_src_rhs, cur_options, grammar_callback_extra);
    }
    cur_options.clear();
    sglex_reset();
    ++lex_line;  // keep the line number used in diagnostics current
    BEGIN(INITIAL);
  }

<FEATS>[ ]+   { ; }

<FEATS>\|\|\|   {
    // Another target option for the same source side follows.
    memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int));
    sglex_trg_rhs_size = 0;
    BEGIN(TRG);
  }

<FEATS>[A-Z][A-Z_0-9]*=   {
    // std::cerr << "FV: " << yytext << std::endl;
    sglex_tmp_token.assign(yytext, yyleng - 1);
    sglex_cur_fid = FD::Convert(sglex_tmp_token);
    static const int Afid = FD::Convert("A");
    // The feature named "A" carries alignment points rather than a number.
    if (sglex_cur_fid == Afid) {
      BEGIN(ALIGNS);
    } else {
      BEGIN(FEATVAL);
    }
  }

<FEATVAL>{REAL}   {
    // std::cerr << "Feature val input: " << yytext << std::endl;
    cur_stats->counts.add_value(sglex_cur_fid, strtod(yytext, NULL));
    BEGIN(FEATS);
  }

<FEATVAL>.   {
    std::cerr << "Feature val unexpected input: " << yytext << std::endl;
    exit(1);
  }

<FEATS>.   {
    std::cerr << "Features unexpected input: " << yytext << std::endl;
    exit(1);
  }

<ALIGNS>{ALIGN}(,{ALIGN})*   {
    // Parse "a-b,a-b,..." into (a, b) pairs.
    assert(cur_stats->aligns.empty());
    int pos = 0;
    while (pos < yyleng) {
      short a = 0;
      short b = 0;
      while (pos < yyleng && yytext[pos] != '-') {
        a *= 10;
        a += yytext[pos] - '0';
        ++pos;
      }
      ++pos;  // skip '-'
      while (pos < yyleng && yytext[pos] != ',') {
        b *= 10;
        b += yytext[pos] - '0';
        ++pos;
      }
      ++pos;  // skip ',' (or step past the end)
      cur_stats->aligns.push_back(std::make_pair(a, b));
    }
    BEGIN(FEATS);
  }

<ALIGNS>.   {
    std::cerr << "Aligns unexpected input: " << yytext << std::endl;
    exit(1);
  }

%%

#include "filelib.h"

// Scans a striped grammar from `in`, invoking `func(lhs, src, options, extra)`
// once per input line.
// NOTE(review): re-running the scanner after a previous run hit end-of-file may
// additionally require yyrestart(); confirm against the flex manual.
void StripedGrammarLexer::ReadStripedGrammar(std::istream* in, GrammarCallback func, void* extra) {
  read_contexts = 0;
  lex_line = 1;
  sglex_stream = in;
  grammar_callback_extra = extra;
  grammar_callback = func;
  // Start from a clean state even if an earlier run stopped mid-line.
  cur_options.clear();
  sglex_reset();
  BEGIN(INITIAL);
  yylex();
}

// Scans context lines from `in`, invoking `func(src, options, extra)` once per
// input line.
void StripedGrammarLexer::ReadContexts(std::istream* in, ContextCallback func, void* extra) {
  read_contexts = 1;
  lex_line = 1;
  sglex_stream = in;
  context_callback_extra = extra;
  context_callback = func;
  // Start from a clean state even if an earlier run stopped mid-line.
  cur_options.clear();
  sglex_reset();
  BEGIN(INITIAL);
  yylex();
}