summaryrefslogtreecommitdiff
path: root/extools/sg_lexer.l
diff options
context:
space:
mode:
Diffstat (limited to 'extools/sg_lexer.l')
-rw-r--r--extools/sg_lexer.l294
1 files changed, 0 insertions, 294 deletions
diff --git a/extools/sg_lexer.l b/extools/sg_lexer.l
deleted file mode 100644
index c85cdea7..00000000
--- a/extools/sg_lexer.l
+++ /dev/null
@@ -1,294 +0,0 @@
-%{
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <cstring>
-#include <cassert>
-#include "tdict.h"
-#include "fdict.h"
-#include "striped_grammar.h"
-
-int lex_line = 0;
-int read_contexts = 0;
-std::istream* sglex_stream = NULL;
-StripedGrammarLexer::GrammarCallback grammar_callback = NULL;
-StripedGrammarLexer::ContextCallback context_callback = NULL;
-void* grammar_callback_extra = NULL;
-void* context_callback_extra = NULL;
-
-#undef YY_INPUT
-#define YY_INPUT(buf, result, max_size) (result = sglex_stream->read(buf, max_size).gcount())
-
-#define YY_SKIP_YYWRAP 1
-int num_rules = 0;
-int yywrap() { return 1; }
-bool fl = true;
-#define MAX_TOKEN_SIZE 255
-std::string sglex_tmp_token(MAX_TOKEN_SIZE, '\0');
-
-#define MAX_RULE_SIZE 48
-WordID sglex_src_rhs[MAX_RULE_SIZE];
-WordID sglex_trg_rhs[MAX_RULE_SIZE];
-int sglex_src_rhs_size;
-int sglex_trg_rhs_size;
-WordID sglex_lhs;
-int sglex_src_arity;
-int sglex_trg_arity;
-
-#define MAX_FEATS 100
-int sglex_feat_ids[MAX_FEATS];
-double sglex_feat_vals[MAX_FEATS];
-int sglex_num_feats;
-
-#define MAX_ARITY 20
-int sglex_nt_sanity[MAX_ARITY];
-int sglex_src_nts[MAX_ARITY];
-float sglex_nt_size_means[MAX_ARITY];
-float sglex_nt_size_vars[MAX_ARITY];
-
-std::vector<WordID> cur_src_rhs;
-std::vector<WordID> cur_trg_rhs;
-ID2RuleStatistics cur_options;
-RuleStatistics* cur_stats = NULL;
-int sglex_cur_fid = 0;
-
-static void sanity_check_trg_index(int index) {
- if (index > sglex_src_arity) {
- std::cerr << "Target index " << index << " exceeds source arity " << sglex_src_arity << std::endl;
- abort();
- }
- int& flag = sglex_nt_sanity[index - 1];
- if (flag) {
- std::cerr << "Target index " << index << " used multiple times!" << std::endl;
- abort();
- }
- flag = 1;
-}
-
-static void sglex_reset() {
- sglex_src_arity = 0;
- sglex_trg_arity = 0;
- sglex_num_feats = 0;
- sglex_src_rhs_size = 0;
- sglex_trg_rhs_size = 0;
-}
-
-%}
-
-REAL [\-+]?[0-9]+(\.[0-9]*([eE][-+]*[0-9]+)?)?|inf|[\-+]inf
-NT [^\t \[\],]+
-ALIGN [0-9]+-[0-9]+
-
-%x LHS_END SRC TRG FEATS FEATVAL ALIGNS
-%%
-
-<INITIAL>[ ] ;
-<INITIAL>[\t] {
- if (read_contexts) {
- cur_options.clear();
- BEGIN(TRG);
- } else {
- std::cerr << "Unexpected tab while reading striped grammar\n";
- exit(1);
- }
- }
-
-<INITIAL>\[{NT}\] {
- if (read_contexts) {
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token);
- ++sglex_src_rhs_size;
- } else {
- sglex_tmp_token.assign(yytext + 1, yyleng - 2);
- sglex_lhs = -TD::Convert(sglex_tmp_token);
- // std::cerr << sglex_tmp_token << "\n";
- BEGIN(LHS_END);
- }
- }
-
-<INITIAL>[^ \t]+ {
- if (read_contexts) {
- // std::cerr << "Context: " << yytext << std::endl;
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token);
- ++sglex_src_rhs_size;
- } else {
- std::cerr << "Unexpected input: " << yytext << " when NT expected\n";
- exit(1);
- }
- }
-
-<SRC>\[{NT}\] {
- sglex_tmp_token.assign(yytext + 1, yyleng - 2);
- sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token);
- ++sglex_src_arity;
- ++sglex_src_rhs_size;
- }
-
-<LHS_END>[ ] { ; }
-<LHS_END>\|\|\| {
- sglex_reset();
- BEGIN(SRC);
- }
-
-<LHS_END>. {
- std::cerr << "Line " << lex_line << ": unexpected input in LHS: " << yytext << std::endl;
- exit(1);
- }
-
-
-<SRC>\[{NT},[1-9][0-9]?\] {
- int index = yytext[yyleng - 2] - '0';
- if (yytext[yyleng - 3] == ',') {
- sglex_tmp_token.assign(yytext + 1, yyleng - 4);
- } else {
- sglex_tmp_token.assign(yytext + 1, yyleng - 5);
- index += 10 * (yytext[yyleng - 3] - '0');
- }
- if ((sglex_src_arity+1) != index) {
- std::cerr << "Src indices must go in order: expected " << sglex_src_arity << " but got " << index << std::endl;
- abort();
- }
- sglex_src_nts[sglex_src_arity] = sglex_src_rhs[sglex_src_rhs_size] = -TD::Convert(sglex_tmp_token);
- ++sglex_src_rhs_size;
- ++sglex_src_arity;
- }
-
-<SRC>[^ \t]+ {
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_src_rhs[sglex_src_rhs_size] = TD::Convert(sglex_tmp_token);
- ++sglex_src_rhs_size;
- }
-<SRC>[ ] { ; }
-<SRC>\t {
- //std::cerr << "LHS=" << TD::Convert(-sglex_lhs) << " ";
- //std::cerr << " src_size: " << sglex_src_rhs_size << std::endl;
- //std::cerr << " src_arity: " << sglex_src_arity << std::endl;
- cur_options.clear();
- memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int));
- sglex_trg_rhs_size = 0;
- BEGIN(TRG);
- }
-
-<TRG>\[[1-9][0-9]?\] {
- if (read_contexts) {
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_trg_rhs[sglex_trg_rhs_size] = TD::Convert(sglex_tmp_token);
- ++sglex_trg_rhs_size;
- } else {
- int index = yytext[yyleng - 2] - '0';
- if (yyleng == 4) {
- index += 10 * (yytext[yyleng - 3] - '0');
- }
- ++sglex_trg_arity;
- sanity_check_trg_index(index);
- sglex_trg_rhs[sglex_trg_rhs_size] = 1 - index;
- ++sglex_trg_rhs_size;
- }
-}
-
-<TRG>\|\|\| {
- //std::cerr << " trg_size: " << sglex_trg_rhs_size << std::endl;
- //std::cerr << " trg_arity: " << sglex_trg_arity << std::endl;
- assert(sglex_trg_rhs_size > 0);
- cur_trg_rhs.resize(sglex_trg_rhs_size);
- for (int i = 0; i < sglex_trg_rhs_size; ++i)
- cur_trg_rhs[i] = sglex_trg_rhs[i];
- cur_stats = &cur_options[cur_trg_rhs];
- BEGIN(FEATS);
- }
-
-<TRG>[^ ]+ {
- sglex_tmp_token.assign(yytext, yyleng);
- sglex_trg_rhs[sglex_trg_rhs_size] = TD::Convert(sglex_tmp_token);
-
- ++sglex_trg_rhs_size;
- }
-<TRG>[ ]+ { ; }
-
-<FEATS>\n {
- assert(sglex_src_rhs_size > 0);
- cur_src_rhs.resize(sglex_src_rhs_size);
- for (int i = 0; i < sglex_src_rhs_size; ++i)
- cur_src_rhs[i] = sglex_src_rhs[i];
- if (read_contexts) {
- context_callback(cur_src_rhs, cur_options, context_callback_extra);
- } else {
- assert(sglex_lhs < 0);
- grammar_callback(sglex_lhs, cur_src_rhs, cur_options, grammar_callback_extra);
- }
- cur_options.clear();
- sglex_reset();
- BEGIN(INITIAL);
- }
-<FEATS>[ ]+ { ; }
-<FEATS>\|\|\| {
- memset(sglex_nt_sanity, 0, sglex_src_arity * sizeof(int));
- sglex_trg_rhs_size = 0;
- BEGIN(TRG);
- }
-<FEATS>[A-Z][A-Z_0-9]*= {
- // std::cerr << "FV: " << yytext << std::endl;
- sglex_tmp_token.assign(yytext, yyleng - 1);
- sglex_cur_fid = FD::Convert(sglex_tmp_token);
- static const int Afid = FD::Convert("A");
- if (sglex_cur_fid == Afid) {
- BEGIN(ALIGNS);
- } else {
- BEGIN(FEATVAL);
- }
- }
-<FEATVAL>{REAL} {
- // std::cerr << "Feature val input: " << yytext << std::endl;
- cur_stats->counts.add_value(sglex_cur_fid, strtod(yytext, NULL));
- BEGIN(FEATS);
- }
-<FEATVAL>. {
- std::cerr << "Feature val unexpected input: " << yytext << std::endl;
- exit(1);
- }
-<FEATS>. {
- std::cerr << "Features unexpected input: " << yytext << std::endl;
- exit(1);
- }
-<ALIGNS>{ALIGN}(,{ALIGN})* {
- assert(cur_stats->aligns.empty());
- int i = 0;
- while(i < yyleng) {
- short a = 0;
- short b = 0;
- while (yytext[i] != '-') { a *= 10; a += yytext[i] - '0'; ++i; }
- ++i;
- while (yytext[i] != ',' && i < yyleng) { b *= 10; b += yytext[i] - '0'; ++i; }
- ++i;
- cur_stats->aligns.push_back(std::make_pair(a,b));
- }
- BEGIN(FEATS);
- }
-<ALIGNS>. {
- std::cerr << "Aligns unexpected input: " << yytext << std::endl;
- exit(1);
- }
-%%
-
-#include "filelib.h"
-
-void StripedGrammarLexer::ReadStripedGrammar(std::istream* in, GrammarCallback func, void* extra) {
- read_contexts = 0;
- lex_line = 1;
- sglex_stream = in;
- grammar_callback_extra = extra;
- grammar_callback = func;
- yylex();
-}
-
-void StripedGrammarLexer::ReadContexts(std::istream* in, ContextCallback func, void* extra) {
- read_contexts = 1;
- lex_line = 1;
- sglex_stream = in;
- context_callback_extra = extra;
- context_callback = func;
- yylex();
-}
-
-