summary | refs | log | tree | commit | diff
path: root/extools/sg_lexer.l
diff options
context:
space:
mode:
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 06:29:00 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 06:29:00 +0000
commit: f47330182fdf2e44eb28d39d8db55deb31b54d1f (patch)
tree: 4b074d1f5a22d899a55ba2017ebc3ce5e65693ba /extools/sg_lexer.l
parent: 2dc76ceae3dfbe333b6b404e5b1298be99b211c9 (diff)
start moving toward striped grammar lexer
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@233 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools/sg_lexer.l')
-rw-r--r-- extools/sg_lexer.l 242
1 files changed, 242 insertions, 0 deletions
diff --git a/extools/sg_lexer.l b/extools/sg_lexer.l
new file mode 100644
index 00000000..f115e5bd
--- /dev/null
+++ b/extools/sg_lexer.l
@@ -0,0 +1,242 @@
+%{
+#include "rule_lexer.h"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <cassert>
+#include "tdict.h"
+#include "fdict.h"
+#include "trule.h"
+#include "striped_grammar.h"
+
+// Line number printed by the LHS error rule; ReadStripedGrammar sets it to 1
+// before scanning starts.
+int lex_line = 0;
+// Stream that YY_INPUT pulls raw characters from.
+std::istream* sglex_stream = NULL;
+// Callback invoked by the FEATS newline rule once a whole input line has been
+// read, plus the opaque pointer that is passed back to it unchanged.
+StripedGrammarLexer::GrammarCallback grammar_callback = NULL;
+void* grammar_callback_extra = NULL;
+
+// Replace the default flex input macro so the scanner reads from sglex_stream;
+// result receives the number of characters actually read (gcount).
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = sglex_stream->read(buf, max_size).gcount())
+
+#define YY_SKIP_YYWRAP 1
+// NOTE(review): num_rules is not referenced anywhere else in the visible code.
+int num_rules = 0;
+// Returns 1, which flex interprets as: there is no further input once the
+// current stream is exhausted.
+int yywrap() { return 1; }
+// NOTE(review): fl is not referenced anywhere else in the visible code.
+bool fl = true;
+#define MAX_TOKEN_SIZE 255
+// Scratch string reused by the rules to hold the text of the current token.
+std::string sglex_tmp_token(MAX_TOKEN_SIZE, '\0');
+
+// Fixed-size buffers holding the symbols of the source and target right-hand
+// sides while a rule is being scanned, with their current fill levels.
+#define MAX_RULE_SIZE 48
+WordID sglex_src_rhs[MAX_RULE_SIZE];
+WordID sglex_trg_rhs[MAX_RULE_SIZE];
+int sglex_src_rhs_size;
+int sglex_trg_rhs_size;
+// Left-hand-side nonterminal, stored as the negated TD::Convert id.
+WordID sglex_lhs;
+// Number of nonterminals seen on the source side, and number of nonterminal
+// references seen on the target side.
+int sglex_src_arity;
+int sglex_trg_arity;
+
+// NOTE(review): the feature arrays below are not used by the visible rules;
+// only sglex_num_feats is touched (zeroed in sglex_reset).
+#define MAX_FEATS 100
+int sglex_feat_ids[MAX_FEATS];
+double sglex_feat_vals[MAX_FEATS];
+int sglex_num_feats;
+
+#define MAX_ARITY 20
+// One flag per target index; sanity_check_trg_index sets it so that a second
+// use of the same index can be detected. Cleared when a target side starts.
+int sglex_nt_sanity[MAX_ARITY];
+// Negated ids of the source-side nonterminals, in order of appearance.
+int sglex_src_nts[MAX_ARITY];
+// NOTE(review): the two arrays below are not referenced in the visible code.
+float sglex_nt_size_means[MAX_ARITY];
+float sglex_nt_size_vars[MAX_ARITY];
+
+// Vector copies of the fixed-size buffers: cur_src_rhs is handed to the
+// callback, cur_trg_rhs is used as the key into cur_options.
+std::vector<WordID> cur_src_rhs;
+std::vector<WordID> cur_trg_rhs;
+// Statistics collected for the current source side, keyed by target side.
+ID2RuleStatistics cur_options;
+// Entry of cur_options that belongs to the target side currently being read.
+RuleStatistics* cur_stats = NULL;
+// Id of the feature whose value (or alignment list) is read next.
+int sglex_cur_fid = 0;
+
+// Validates a 1-based target-side nonterminal index and marks it as used.
+//
+// Aborts the process with a diagnostic on stderr when the index is larger
+// than the source arity, lies outside the range covered by sglex_nt_sanity
+// (1..MAX_ARITY), or has already been used on the current target side.
+static void sanity_check_trg_index(int index) {
+  if (index > sglex_src_arity) {
+    std::cerr << "Target index " << index << " exceeds source arity " << sglex_src_arity << std::endl;
+    abort();
+  }
+  // The source arity itself is not capped by every caller, so make sure the
+  // flag array is never indexed out of bounds.
+  if (index < 1 || index > MAX_ARITY) {
+    std::cerr << "Target index " << index << " is outside the supported range 1.." << MAX_ARITY << std::endl;
+    abort();
+  }
+  int& flag = sglex_nt_sanity[index - 1];
+  if (flag) {
+    std::cerr << "Target index " << index << " used multiple times!" << std::endl;
+    abort();
+  }
+  flag = 1;
+}
+
+// Clears the per-rule counters (right-hand-side sizes, arities and feature
+// count) so that a new rule can be scanned.
+static void sglex_reset() {
+  sglex_src_rhs_size = sglex_trg_rhs_size = 0;
+  sglex_src_arity = sglex_trg_arity = 0;
+  sglex_num_feats = 0;
+}
+
+%}
+
+/* REAL: optionally signed decimal number with an optional fraction and an
+ * optional exponent, or the literal inf with an optional sign. The exponent
+ * is accepted with or without a preceding fraction, so values such as 1e+06
+ * (the default stream rendering of one million) are recognised, and it takes
+ * at most one sign so that the whole match is consumed by strtod. */
+REAL [\-+]?[0-9]+(\.[0-9]*)?([eE][\-+]?[0-9]+)?|inf|[\-+]inf
+/* NT: nonterminal name as it appears between square brackets. */
+NT [\-#$A-Z_:=.",\\][\-#$".A-Z+/=_0-9!:@\\]*
+/* ALIGN: one alignment point, two non-negative integers joined by a dash. */
+ALIGN [0-9]+-[0-9]+
+
+/* Exclusive start conditions, one per section of an input line. */
+%x LHS_END SRC TRG FEATS FEATVAL ALIGNS
+%%
+
+<INITIAL>[ ] { /* blanks in front of the left-hand side are skipped */ }
+
+<INITIAL>\[{NT}\] {
+  // The match is the bracketed left-hand side; drop both brackets and
+  // store the category as a negated id.
+  const char* const name_start = yytext + 1;
+  const int name_length = yyleng - 2;
+  sglex_tmp_token.assign(name_start, name_length);
+  sglex_lhs = -TD::Convert(sglex_tmp_token);
+  BEGIN(LHS_END);
+  }
+
+<SRC>\[{NT}\] {
+  // Un-indexed source nonterminal. Both destination arrays have a fixed
+  // capacity, so refuse to write past their ends.
+  if (sglex_src_arity >= MAX_ARITY || sglex_src_rhs_size >= MAX_RULE_SIZE) {
+    std::cerr << "Source side is too long: at most " << MAX_ARITY << " nonterminals and " << MAX_RULE_SIZE << " symbols are supported" << std::endl;
+    abort();
+  }
+  // Strip the brackets; nonterminals are stored as negated ids.
+  sglex_tmp_token.assign(yytext + 1, yyleng - 2);
+  sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token);
+  ++sglex_src_arity;
+  ++sglex_src_rhs_size;
+  }
+
+<LHS_END>[ ] ;
+<LHS_END>\|\|\| {
+  // Separator after the left-hand side: clear the per-rule counters and
+  // start reading the source side.
+  sglex_reset();
+  BEGIN(SRC);
+  }
+<INITIAL,LHS_END>. {
+  // Any other character in the left-hand-side section is fatal.
+  std::cerr << "Line " << lex_line << ": unexpected input in LHS: ";
+  std::cerr << yytext << std::endl;
+  exit(1);
+  }
+
+
+<SRC>\[{NT},[1-9][0-9]?\] {
+  // Source nonterminal with an explicit one- or two-digit index. The last
+  // digit sits directly in front of the closing bracket.
+  int index = yytext[yyleng - 2] - '0';
+  if (yytext[yyleng - 3] == ',') {
+    // single digit: the name is the match minus brackets, comma and digit
+    sglex_tmp_token.assign(yytext + 1, yyleng - 4);
+  } else {
+    // two digits: one more character belongs to the index
+    sglex_tmp_token.assign(yytext + 1, yyleng - 5);
+    index += 10 * (yytext[yyleng - 3] - '0');
+  }
+  // Indices must count up from 1 in order of appearance, so the next valid
+  // index is always one more than the number of nonterminals seen so far.
+  const int expected_index = sglex_src_arity + 1;
+  if (expected_index != index) {
+    std::cerr << "Src indices must go in order: expected " << expected_index << " but got " << index << std::endl;
+    abort();
+  }
+  // Both destination arrays have a fixed capacity.
+  if (sglex_src_arity >= MAX_ARITY || sglex_src_rhs_size >= MAX_RULE_SIZE) {
+    std::cerr << "Source side is too long: at most " << MAX_ARITY << " nonterminals and " << MAX_RULE_SIZE << " symbols are supported" << std::endl;
+    abort();
+  }
+  sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token);
+  ++sglex_src_rhs_size;
+  ++sglex_src_arity;
+  }
+
+<SRC>[^ \t]+ {
+  // Source-side terminal: any run of characters without blank or tab.
+  // The right-hand-side buffer has a fixed capacity, so check it first.
+  if (sglex_src_rhs_size >= MAX_RULE_SIZE) {
+    std::cerr << "Source side is too long: at most " << MAX_RULE_SIZE << " symbols are supported" << std::endl;
+    abort();
+  }
+  sglex_tmp_token.assign(yytext, yyleng);
+  sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token);
+  ++sglex_src_rhs_size;
+  }
+<SRC>[ ] ;
+<SRC>\t {
+  // A tab ends the source side. Clear the used-index flags for every
+  // source nonterminal, forget the options of the previous source side
+  // and start collecting the first target side.
+  for (int k = 0; k < sglex_src_arity; ++k)
+    sglex_nt_sanity[k] = 0;
+  sglex_trg_rhs_size = 0;
+  cur_options.clear();
+  BEGIN(TRG);
+  }
+
+<TRG>\[[1-9][0-9]?\] {
+  // Target-side reference to a source nonterminal, written as a one- or
+  // two-digit index in brackets.
+  int index = yytext[yyleng - 2] - '0';
+  if (yyleng == 4) {
+    // two digits: add the tens
+    index += 10 * (yytext[yyleng - 3] - '0');
+  }
+  ++sglex_trg_arity;
+  sanity_check_trg_index(index);
+  // The target buffer has a fixed capacity, so check it before writing.
+  if (sglex_trg_rhs_size >= MAX_RULE_SIZE) {
+    std::cerr << "Target side is too long: at most " << MAX_RULE_SIZE << " symbols are supported" << std::endl;
+    abort();
+  }
+  // Index k is stored as 1 - k, that is 0, -1, -2 and so on.
+  sglex_trg_rhs[sglex_trg_rhs_size] = 1 - index;
+  ++sglex_trg_rhs_size;
+}
+
+<TRG>\|\|\| {
+  // End of a target side: copy the buffered symbols into a vector and use
+  // it as the key under which the following statistics are stored.
+  assert(sglex_trg_rhs_size > 0);
+  cur_trg_rhs.assign(sglex_trg_rhs, sglex_trg_rhs + sglex_trg_rhs_size);
+  cur_stats = &cur_options[cur_trg_rhs];
+  BEGIN(FEATS);
+  }
+
+<TRG>[^ ]+ {
+  // Target-side terminal: any run of non-blank characters.
+  // The target buffer has a fixed capacity, so check it before writing.
+  if (sglex_trg_rhs_size >= MAX_RULE_SIZE) {
+    std::cerr << "Target side is too long: at most " << MAX_RULE_SIZE << " symbols are supported" << std::endl;
+    abort();
+  }
+  sglex_tmp_token.assign(yytext, yyleng);
+  sglex_trg_rhs[sglex_trg_rhs_size] = TD::Convert(sglex_tmp_token);
+  ++sglex_trg_rhs_size;
+  }
+<TRG>[ ]+ { ; }
+
+<FEATS>\n {
+  // End of an input line: hand the left-hand side, the source side and all
+  // target options collected for it to the callback.
+  assert(sglex_lhs < 0);
+  assert(sglex_src_rhs_size > 0);
+  cur_src_rhs.resize(sglex_src_rhs_size);
+  for (int i = 0; i < sglex_src_rhs_size; ++i)
+    cur_src_rhs[i] = sglex_src_rhs[i];
+  grammar_callback(sglex_lhs, cur_src_rhs, cur_options, grammar_callback_extra);
+  cur_options.clear();
+  // Keep the line counter in step with the input so that diagnostics for
+  // the following line report the right number.
+  ++lex_line;
+  BEGIN(INITIAL);
+  }
+<FEATS>[ ]+ ;
+<FEATS>\|\|\| {
+  // Another target option for the same source side follows: empty the
+  // target buffer and clear the used-index flags.
+  sglex_trg_rhs_size = 0;
+  for (int k = 0; k < sglex_src_arity; ++k)
+    sglex_nt_sanity[k] = 0;
+  BEGIN(TRG);
+  }
+<FEATS>[A-Z][A-Z_0-9]*= {
+  // The feature name is the match without the trailing equals sign.
+  const int name_length = yyleng - 1;
+  sglex_tmp_token.assign(yytext, name_length);
+  sglex_cur_fid = FD::Convert(sglex_tmp_token);
+  // Feature A carries an alignment list; every other feature carries a
+  // number. The id of A is looked up once, on first use.
+  static const int alignment_fid = FD::Convert("A");
+  if (sglex_cur_fid != alignment_fid) {
+    BEGIN(FEATVAL);
+  } else {
+    BEGIN(ALIGNS);
+  }
+  }
+<FEATVAL>{REAL} {
+  // Numeric value of the feature named just before; store it in the
+  // statistics of the current target option.
+  const double value = strtod(yytext, NULL);
+  cur_stats->counts.set_value(sglex_cur_fid, value);
+  BEGIN(FEATS);
+  }
+<FEATVAL>. {
+  // Anything that is not a number after the equals sign is fatal.
+  std::cerr << "Feature val unexpected input: ";
+  std::cerr << yytext << std::endl;
+  exit(1);
+  }
+<FEATS>. {
+  // Anything in the feature section not matched by an earlier rule is fatal.
+  std::cerr << "Features unexpected input: ";
+  std::cerr << yytext << std::endl;
+  exit(1);
+  }
+<ALIGNS>{ALIGN}(,{ALIGN})* {
+  // Comma-separated list of alignment points, each written as two numbers
+  // joined by a dash. Every point is appended to the current statistics.
+  assert(cur_stats->aligns.empty());
+  int pos = 0;
+  while (pos < yyleng) {
+    // digits up to the dash form the first number
+    short first = 0;
+    for (; yytext[pos] != '-'; ++pos) {
+      first *= 10;
+      first += yytext[pos] - '0';
+    }
+    ++pos;  // step over the dash
+    // digits up to the next comma or the end form the second number
+    short second = 0;
+    for (; pos < yyleng && yytext[pos] != ','; ++pos) {
+      second *= 10;
+      second += yytext[pos] - '0';
+    }
+    ++pos;  // step over the comma, or past the end after the last point
+    cur_stats->aligns.push_back(std::make_pair(first, second));
+  }
+  BEGIN(FEATS);
+  }
+<ALIGNS>. {
+  // Anything that is not a well-formed alignment list is fatal.
+  std::cerr << "Aligns unexpected input: ";
+  std::cerr << yytext << std::endl;
+  exit(1);
+  }
+%%
+
+#include "filelib.h"
+
+// Scans a striped grammar from the given stream.
+//
+// in    - stream the scanner reads its input from
+// func  - callback invoked by the scanner for each completed input line
+// extra - opaque pointer passed through to func unchanged
+//
+// The arguments are stored in file-level globals read by the scanner rules,
+// the line counter is set to 1, and yylex() is run once.
+void StripedGrammarLexer::ReadStripedGrammar(std::istream* in, GrammarCallback func, void* extra) {
+  grammar_callback = func;
+  grammar_callback_extra = extra;
+  sglex_stream = in;
+  lex_line = 1;
+  yylex();
+}
+