summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- decoder/Makefile.am 1
-rw-r--r-- decoder/dict.cc 11
-rw-r--r-- decoder/dict.h 8
-rw-r--r-- extools/mr_stripe_rule_reduce.cc 1
-rw-r--r-- extools/sentence_pair.cc 6
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 2
-rw-r--r-- gi/pyp-topics/src/Makefile.am 4
-rw-r--r-- gi/pyp-topics/src/contexts_corpus.hh 4
8 files changed, 24 insertions, 13 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 44d6adc8..fd4589e4 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -47,6 +47,7 @@ libcdec_a_SOURCES = \
rule_lexer.cc \
fst_translator.cc \
csplit.cc \
+ dict.cc \
translator.cc \
scfg_translator.cc \
hg.cc \
diff --git a/decoder/dict.cc b/decoder/dict.cc
new file mode 100644
index 00000000..485fa348
--- /dev/null
+++ b/decoder/dict.cc
@@ -0,0 +1,11 @@
+#include "dict.h"
+
+#include <string>
+#include <vector>
+#include <boost/regex.hpp>
+#include <boost/algorithm/string/regex.hpp>
+
+void Dict::AsVector(const WordID& id, std::vector<std::string>* results) const {
+ boost::algorithm::split_regex(*results, Convert(id), boost::regex("\\|\\|\\|"));
+}
+
diff --git a/decoder/dict.h b/decoder/dict.h
index 39baf6ed..1c8ebb67 100644
--- a/decoder/dict.h
+++ b/decoder/dict.h
@@ -8,8 +8,6 @@
#include <vector>
#include <boost/functional/hash.hpp>
-#include <boost/regex.hpp>
-#include <boost/algorithm/string/regex.hpp>
#include "wordid.h"
@@ -51,11 +49,7 @@ class Dict {
return words_[id-1];
}
- inline std::vector<std::string> AsVector(const WordID& id) const {
- std::vector<std::string> result;
- boost::algorithm::split_regex(result, Convert(id), boost::regex("\\|\\|\\|"));
- return result;
- }
+ void AsVector(const WordID& id, std::vector<std::string>* results) const;
void clear() { words_.clear(); d_.clear(); }
diff --git a/extools/mr_stripe_rule_reduce.cc b/extools/mr_stripe_rule_reduce.cc
index eaf1b6d7..902b6a07 100644
--- a/extools/mr_stripe_rule_reduce.cc
+++ b/extools/mr_stripe_rule_reduce.cc
@@ -73,6 +73,7 @@ int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end
if (w == kDIV) return ptr;
p->push_back(w);
}
+ assert(p->size() > 0);
return ptr;
}
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
index 91286059..5706398f 100644
--- a/extools/sentence_pair.cc
+++ b/extools/sentence_pair.cc
@@ -84,8 +84,10 @@ int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
short a, b;
ReadAlignmentPoint(buf, start, end, false, &a, &b);
- assert(a < f_len);
- assert(b < e_len);
+ if (a >= f_len || b >= e_len) {
+ cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl;
+ exit(1);
+ }
aligned(a,b) = true;
++f_aligned[a];
++e_aligned[b];
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index be91f9ad..27d2047c 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -8,7 +8,7 @@ my $GZIP = 'gzip';
my $ZCAT = 'gunzip -c';
my $BASE_PHRASE_MAX_SIZE = 10;
my $COMPLETE_CACHE = 1;
-my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors
+my $ITEMS_IN_MEMORY = 10000000; # cache size in extractors
my $NUM_TOPICS = 50;
my $NUM_SAMPLES = 100;
my $CONTEXT_SIZE = 1;
diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am
index 7ca269a5..e4c4c1b9 100644
--- a/gi/pyp-topics/src/Makefile.am
+++ b/gi/pyp-topics/src/Makefile.am
@@ -4,10 +4,10 @@ contexts_lexer.cc: contexts_lexer.l
$(LEX) -s -CF -8 -o$@ $<
pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc contexts_lexer.cc contexts_corpus.cc
-pyp_topics_train_LDADD = -lz
+pyp_topics_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
pyp_contexts_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc contexts_lexer.cc contexts_corpus.cc train-contexts.cc
-pyp_contexts_train_LDADD = -lz
+pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
index 9614e7e3..a55e52f2 100644
--- a/gi/pyp-topics/src/contexts_corpus.hh
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -60,7 +60,9 @@ public:
}
std::vector<std::string> context2string(const WordID& id) const {
- return m_dict.AsVector(id);
+ std::vector<std::string> res;
+ m_dict.AsVector(id, &res);
+ return res;
}
const std::string& key(const int& i) const {