diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 15:01:17 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 15:01:17 +0000 |
commit | 1d089b02eff4fa8837faecf99021f624d8845e5d (patch) | |
tree | b6e3d20094514749c37485e154117871cdc8696f /gi/pyp-topics/src/contexts_lexer.l | |
parent | 088725c4708e83343154d1bed9dee18286446eaf (diff) |
Added contexts_corpus for reading text data files.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@36 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src/contexts_lexer.l')
-rw-r--r-- | gi/pyp-topics/src/contexts_lexer.l | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/gi/pyp-topics/src/contexts_lexer.l b/gi/pyp-topics/src/contexts_lexer.l
new file mode 100644
index 00000000..61189a73
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_lexer.l
@@ -0,0 +1,110 @@
+%{
+#include "contexts_lexer.h"  // Scanner for lines shaped like: phrase TAB context ||| C=count [||| context ||| C=count ...] NEWLINE
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <cassert>
+
+int lex_line = 0;  // set to 1 by ReadContexts; no rule in this file increments it
+std::istream* contextslex_stream = NULL;  // input source consumed by YY_INPUT; set by ReadContexts
+ContextsLexer::ContextsCallback contexts_callback = NULL;  // called once per completed line (see the COUNT_END newline rule)
+void* contexts_callback_extra = NULL;  // opaque pointer handed back to the callback as its second argument
+
+#undef YY_INPUT  // replace the default input macro so the scanner reads from contextslex_stream
+#define YY_INPUT(buf, result, max_size) (result = contextslex_stream->read(buf, max_size).gcount())  // result = number of characters actually read
+
+#define YY_SKIP_YYWRAP 1
+int num_phrases = 0;  // not referenced anywhere else in this file
+int yywrap() { return 1; }  // 1 tells the scanner that no further input follows end-of-file
+
+#define MAX_TOKEN_SIZE 255  // initial length of the scratch token string below
+std::string contextslex_tmp_token(MAX_TOKEN_SIZE, '\0');  // scratch buffer; overwritten via assign() for each token
+ContextsLexer::PhraseContextsType current_contexts;  // record being built for the current line: phrase, contexts, counts
+
+#define MAX_CONTEXT_SIZE 255  // not referenced anywhere else in this file
+//std::string tmp_context[MAX_CONTEXT_SIZE];
+ContextsLexer::Context tmp_context;  // tokens of the context currently being read
+
+
+void contextslex_reset() {  // clear all per-line state before a new phrase is stored
+  current_contexts.phrase.clear();
+  current_contexts.contexts.clear();
+  current_contexts.counts.clear();
+  tmp_context.clear();
+}
+
+%}
+
+INT [\-+]?[0-9]+|inf|[\-+]inf
+
+%x CONTEXT COUNT COUNT_END
+%%
+
+<INITIAL>[^\t]+ {  // phrase: everything up to the first tab. NOTE(review): [^\t] also matches newlines and this rule precedes the newline error rule - confirm that is intended
+    contextslex_reset();
+    current_contexts.phrase.assign(yytext, yyleng);
+    BEGIN(CONTEXT);  // the first context follows the phrase
+  }
+<INITIAL>\t {  // skip a tab that appears before any phrase text
+    ;
+  }
+
+<INITIAL,CONTEXT,COUNT>\n {  // a line is only allowed to end once a count has been read
+    std::cerr << "ERROR: contexts_lexer.l: unexpected newline while trying to read phrase|context|count." << std::endl;
+    abort();
+  }
+
+<CONTEXT>\|\|\| {  // separator: the context is complete and its count comes next
+    current_contexts.contexts.push_back(tmp_context);
+    tmp_context.clear();
+    BEGIN(COUNT);
+  }
+<CONTEXT>[^ \t]+ {  // one context token; placed after the separator rule so an equal-length separator match wins. NOTE(review): [^ \t] also matches newlines - confirm
+    contextslex_tmp_token.assign(yytext, yyleng);
+    tmp_context.push_back(contextslex_tmp_token);
+  }
+<CONTEXT>[ \t]+ { ; /* skip whitespace between context tokens */ }
+
+<COUNT>[ \t]+ { ; /* skip whitespace before the count */ }
+<COUNT>C={INT} {  // count field, for example C=3
+    current_contexts.counts.push_back(atoi(yytext+2));  // +2 skips the C= prefix. NOTE(review): INT also admits inf, which atoi converts to 0 - confirm
+    BEGIN(COUNT_END);
+  }
+<COUNT>. {  // any other character in the count field is fatal
+    std::cerr << "ERROR: contexts_lexer.l: unexpected content while reading count." << std::endl;
+    abort();
+  }
+
+<COUNT_END>[ \t]+ { ; /* skip whitespace after the count */ }
+<COUNT_END>\|\|\| {  // separator: another context follows on the same line
+    BEGIN(CONTEXT);
+  }
+<COUNT_END>\n {  // end of line: hand the finished record to the callback
+    //std::cerr << "READ:" << current_contexts.phrase << " with " << current_contexts.contexts.size()
+    // << " contexts, and " << current_contexts.counts.size() << " counts." << std::endl;
+    contexts_callback(current_contexts, contexts_callback_extra);
+    current_contexts.phrase.clear();
+    current_contexts.contexts.clear();
+    current_contexts.counts.clear();
+    BEGIN(INITIAL);  // ready for the phrase of the next line
+  }
+<COUNT_END>. {  // anything other than whitespace, a separator or a newline after a count is fatal
+    contextslex_tmp_token.assign(yytext, yyleng);  // stored value is not used before abort
+    std::cerr << "ERROR: contexts_lexer.l: unexpected content while looking for ||| closing count." << std::endl;
+    abort();
+  }
+
+%%
+
+#include "../../../decoder/filelib.h"
+
+void ContextsLexer::ReadContexts(std::istream* in, ContextsLexer::ContextsCallback func, void* extra) {  // scan all of in, calling func(record, extra) once per completed line
+  lex_line = 1;
+  contextslex_stream = in;
+  contexts_callback_extra = extra,  // NOTE(review): comma operator joins this with the next assignment; both still execute, but a semicolon was probably intended
+  contexts_callback = func;
+  yylex();  // no rule returns a token, so this runs until end of input or abort
+}
+ |