summaryrefslogtreecommitdiff
path: root/decoder
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2012-03-13 09:24:47 +0100
committerPatrick Simianer <p@simianer.de>2012-03-13 09:24:47 +0100
commitc3a9ea64251605532c7954959662643a6a927bb7 (patch)
treefed6048a5acdaf3834740107771c2bc48f26fd4d /decoder
parent867bca3e5fa0cdd63bf032e5859fb5092d9a4ca1 (diff)
parenta45af4a3704531a8382cd231f6445b3a33b598a3 (diff)
merge with upstream
Diffstat (limited to 'decoder')
-rwxr-xr-xdecoder/1dev.ur1
-rw-r--r--decoder/Makefile.am2
-rwxr-xr-xdecoder/apply_fsa_models.README21
-rw-r--r--[-rwxr-xr-x]decoder/apply_fsa_models.h0
-rw-r--r--decoder/apply_models.cc9
-rwxr-xr-xdecoder/cdec-gz.ini7
-rwxr-xr-xdecoder/cdec-nolm-tuned.ini7
-rw-r--r--decoder/cdec_ff.cc2
-rw-r--r--[-rwxr-xr-x]decoder/cfg.cc0
-rw-r--r--[-rwxr-xr-x]decoder/cfg.h0
-rw-r--r--[-rwxr-xr-x]decoder/cfg_binarize.h0
-rw-r--r--[-rwxr-xr-x]decoder/cfg_format.h0
-rw-r--r--[-rwxr-xr-x]decoder/cfg_options.h0
-rw-r--r--[-rwxr-xr-x]decoder/cfg_test.cc0
-rwxr-xr-xdecoder/decode.sh10
-rw-r--r--decoder/decoder.cc4
-rwxr-xr-xdecoder/do.tests.sh1
-rw-r--r--decoder/earley_composer.cc5
-rw-r--r--decoder/ff_context.cc99
-rw-r--r--decoder/ff_context.h23
-rw-r--r--decoder/ff_csplit.cc2
-rw-r--r--decoder/ff_ngrams.cc43
-rw-r--r--[-rwxr-xr-x]decoder/ff_register.h0
-rw-r--r--[-rwxr-xr-x]decoder/ff_sample_fsa.h0
-rw-r--r--decoder/freqdict.cc29
-rw-r--r--decoder/freqdict.h37
-rwxr-xr-xdecoder/fsa-decode.sh3
-rwxr-xr-xdecoder/fsa-hiero.ini5
-rwxr-xr-xdecoder/fsa.ini2
-rw-r--r--decoder/fst_translator.cc5
-rwxr-xr-xdecoder/glue-lda.scfg8
-rwxr-xr-xdecoder/grammar.hiero151
-rw-r--r--[-rwxr-xr-x]decoder/hg_cfg.h0
-rw-r--r--decoder/hg_io.cc27
-rw-r--r--decoder/hg_io.h3
-rw-r--r--[-rwxr-xr-x]decoder/hg_test.h0
-rw-r--r--decoder/lattice.cc1
-rw-r--r--[-rwxr-xr-x]decoder/nt_span.h0
-rw-r--r--[-rwxr-xr-x]decoder/oracle_bleu.h0
-rwxr-xr-xdecoder/perro.sh1
-rwxr-xr-xdecoder/perro.ur1
-rw-r--r--[-rwxr-xr-x]decoder/program_options.h0
-rw-r--r--[-rwxr-xr-x]decoder/sentences.h0
-rwxr-xr-xdecoder/short.ur1
-rw-r--r--decoder/trule.cc16
-rw-r--r--decoder/weights-fsa14
-rwxr-xr-xdecoder/weights.hiero10
47 files changed, 247 insertions, 303 deletions
diff --git a/decoder/1dev.ur b/decoder/1dev.ur
deleted file mode 100755
index adeaa101..00000000
--- a/decoder/1dev.ur
+++ /dev/null
@@ -1 +0,0 @@
-krAcy ( AstRAf rpwrtRr ) krAcy myN pyr kw mxtlf HAdvAt myN xAtwn smyt 4 AfrAd hlAk hw gyY jbkh smndr sY Ayk $xS ky lA$ mly .
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 30eaf04d..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -63,6 +63,7 @@ libcdec_a_SOURCES = \
ff.cc \
ff_rules.cc \
ff_wordset.cc \
+ ff_context.cc \
ff_charset.cc \
ff_lm.cc \
ff_klm.cc \
@@ -75,7 +76,6 @@ libcdec_a_SOURCES = \
ff_source_syntax.cc \
ff_bleu.cc \
ff_factory.cc \
- freqdict.cc \
lexalign.cc \
lextrans.cc \
tagger.cc \
diff --git a/decoder/apply_fsa_models.README b/decoder/apply_fsa_models.README
deleted file mode 100755
index 7e116a62..00000000
--- a/decoder/apply_fsa_models.README
+++ /dev/null
@@ -1,21 +0,0 @@
-trie root and trie lhs2[lhs-nodeid] -> trie node
-
-trie node edges (adj) - list of w,dest,p. dest==0 means it's a completed rule (note: p is redundant with node e.dest->p-p, except in case of dest=0). we will also use null_wordid (max_int) for dest=0 edges, but that doesn't matter
-
-we intersect by iterating over adj and scoring w/ fsa. TODO: index for sparse fsa; for now we assume smoothed ngram fsa where all items are scorable.
-
-predicted items: we don't make copies of the pending predictions as we scan toward completion; instead, item backpointers are followed until the prediction (where backpointer=0). such backpointer=0 items have a queue of prediction-originating items.
-
-reusing completed items using a lookup on pair [NT,a] -> all [NT,a,b] lazy best-first. b-next (right state) index in lazy index.
-
-perhaps predictors need to register the # of items it has already mated with. (b-next index)
-
-comb-like (cube) t-next (position in trie node edge list), b-next? or just check chart and don't redup. depends on whether we want just 1best or kbest deriv - diff. ways of reaching same result are good in kbest.
-
-types of chart items:
-
-A->t.*,a,b (trie node t) with mutable state t-next for generating successor lazily (vs. all at once)
-
-A->t.B,a,b (t-next of A->t.* points to (B,t')): mutable state b-next for choosing which B->b,? to use. note: such an item can't be queued immediately on its own, but can be added to the pending list of B->b,? ; once any B->b,? is completed then we see if any more b-next are already known; if they're exhausted then we add back to pending list?
-
-A->a,? - list of all known (b,inside prob) such that A[a,b]. we may also choose to represent this as A->.*,a,a.
diff --git a/decoder/apply_fsa_models.h b/decoder/apply_fsa_models.h
index 6561c70c..6561c70c 100755..100644
--- a/decoder/apply_fsa_models.h
+++ b/decoder/apply_fsa_models.h
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 40fd27e4..9ba59d1b 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -270,7 +270,8 @@ public:
const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
const JVector j(edge.tail_nodes_.size(), 0);
cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal));
- assert(unique_cands.insert(cand.back()).second); // these should all be unique!
+ bool is_new = unique_cands.insert(cand.back()).second;
+ assert(is_new); // these should all be unique!
}
// cerr << " making heap of " << cand.size() << " candidates\n";
make_heap(cand.begin(), cand.end(), HeapCandCompare());
@@ -378,7 +379,8 @@ public:
pop_heap(cand.begin(), cand.end(), HeapCandCompare());
Candidate* item = cand.back();
cand.pop_back();
- assert(unique_accepted.insert(item).second); // these should all be unique!
+ bool is_new = unique_accepted.insert(item).second;
+ assert(is_new); // these should all be unique!
// cerr << "POPPED: " << *item << endl;
PushSuccFast2(*item, is_goal, &cand, &unique_accepted);
@@ -419,7 +421,8 @@ public:
Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal);
cand.push_back(new_cand);
push_heap(cand.begin(), cand.end(), HeapCandCompare());
- assert(cs->insert(new_cand).second); // insert into uniqueness set, sanity check
+ bool is_new = cs->insert(new_cand).second;
+ assert(is_new); // insert into uniqueness set, sanity check
}
}
}
diff --git a/decoder/cdec-gz.ini b/decoder/cdec-gz.ini
deleted file mode 100755
index f9b15420..00000000
--- a/decoder/cdec-gz.ini
+++ /dev/null
@@ -1,7 +0,0 @@
-cubepruning_pop_limit=200
-feature_function=WordPenalty
-feature_function=ArityPenalty
-add_pass_through_rules=true
-formalism=scfg
-grammar=mt09.grammar.gz
-weights=weights.tune.nolm
diff --git a/decoder/cdec-nolm-tuned.ini b/decoder/cdec-nolm-tuned.ini
deleted file mode 100755
index 5ebab747..00000000
--- a/decoder/cdec-nolm-tuned.ini
+++ /dev/null
@@ -1,7 +0,0 @@
-cubepruning_pop_limit=200
-feature_function=WordPenalty
-feature_function=ArityPenalty
-add_pass_through_rules=true
-formalism=scfg
-grammar=mt09.grammar
-weights=weights.tune.nolm
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 4ce5749e..b516c386 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -1,6 +1,7 @@
#include <boost/shared_ptr.hpp>
#include "ff.h"
+#include "ff_context.h"
#include "ff_spans.h"
#include "ff_lm.h"
#include "ff_klm.h"
@@ -42,6 +43,7 @@ void register_feature_functions() {
#endif
ff_registry.Register("SpanFeatures", new FFFactory<SpanFeatures>());
ff_registry.Register("NgramFeatures", new FFFactory<NgramDetector>());
+ ff_registry.Register("RuleContextFeatures", new FFFactory<RuleContextFeatures>());
ff_registry.Register("RuleIdentityFeatures", new FFFactory<RuleIdentityFeatures>());
ff_registry.Register("SourceSyntaxFeatures", new FFFactory<SourceSyntaxFeatures>);
ff_registry.Register("SourceSpanSizeFeatures", new FFFactory<SourceSpanSizeFeatures>);
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index cd7e66e9..cd7e66e9 100755..100644
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
diff --git a/decoder/cfg.h b/decoder/cfg.h
index 8cb29bb9..8cb29bb9 100755..100644
--- a/decoder/cfg.h
+++ b/decoder/cfg.h
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
index ae06f8bf..ae06f8bf 100755..100644
--- a/decoder/cfg_binarize.h
+++ b/decoder/cfg_binarize.h
diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h
index 2f40d483..2f40d483 100755..100644
--- a/decoder/cfg_format.h
+++ b/decoder/cfg_format.h
diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h
index 7b59c05c..7b59c05c 100755..100644
--- a/decoder/cfg_options.h
+++ b/decoder/cfg_options.h
diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc
index c61f9f2c..c61f9f2c 100755..100644
--- a/decoder/cfg_test.cc
+++ b/decoder/cfg_test.cc
diff --git a/decoder/decode.sh b/decoder/decode.sh
deleted file mode 100755
index 677e64ad..00000000
--- a/decoder/decode.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-d=$(dirname `readlink -f $0`)/
-decode() {
-if [ "$lm" ] ; then
- lmargs0=-F
- lmargs1="LanguageModel lm.gz -n LM"
-fi
-set -x
-$gdb ${cdec:=$d/cdec} -c $d/${cfg:=cdec-fsa}.ini -i $d/${in:=1dev.ur} $lmargs0 "$lmargs1" --show_features --show_config --show_weights "$@"
-set +x
-}
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index b93925d1..53c47d21 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -408,6 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
("show_partition,z", "Compute and show the partition (inside score)")
("show_conditional_prob", "Output the conditional log prob to STDOUT instead of a translation")
("show_cfg_search_space", "Show the search space as a CFG")
+ ("show_target_graph", "Output the target hypergraph")
("coarse_to_fine_beam_prune", po::value<double>(), "Prune paths from coarse parse forest before fine parse, keeping paths within exp(alpha>=0)")
("ctf_beam_widen", po::value<double>()->default_value(2.0), "Expand coarse pass beam by this factor if no fine parse is found")
("ctf_num_widenings", po::value<int>()->default_value(2), "Widen coarse beam this many times before backing off to full parse")
@@ -815,6 +816,9 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
abort();
}
+ if (conf.count("show_target_graph"))
+ HypergraphIO::WriteTarget(forest);
+
for (int pass = 0; pass < rescoring_passes.size(); ++pass) {
const RescoringPass& rp = rescoring_passes[pass];
const vector<weight_t>& cur_weights = *rp.weight_vector;
diff --git a/decoder/do.tests.sh b/decoder/do.tests.sh
deleted file mode 100755
index b3ddeb18..00000000
--- a/decoder/do.tests.sh
+++ /dev/null
@@ -1 +0,0 @@
-for f in *_test; do ./$f; done
diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc
index 48e94a31..b7af801a 100644
--- a/decoder/earley_composer.cc
+++ b/decoder/earley_composer.cc
@@ -329,7 +329,10 @@ class EarleyComposerImpl {
forest->ReserveNodes(kMAX_NODES);
assert(sit != g.end());
Edge* init = new Edge(start_cat_, &sit->second, q_0_);
- assert(IncorporateNewEdge(init));
+ if (!IncorporateNewEdge(init)) {
+ cerr << "Failed to create initial edge!\n";
+ abort();
+ }
while (exp_agenda.HasWork() || agenda.HasWork()) {
while(exp_agenda.HasWork()) {
const Edge* edge = exp_agenda.Next();
diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc
new file mode 100644
index 00000000..19f9a413
--- /dev/null
+++ b/decoder/ff_context.cc
@@ -0,0 +1,99 @@
+#include "ff_context.h"
+
+#include <sstream>
+#include <cassert>
+#include <cmath>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "sentence_metadata.h"
+#include "lattice.h"
+#include "fdict.h"
+#include "verbose.h"
+
+using namespace std;
+
+namespace {
+ string Escape(const string& x) {
+ string y = x;
+ for (int i = 0; i < y.size(); ++i) {
+ if (y[i] == '=') y[i]='_';
+ if (y[i] == ';') y[i]='_';
+ }
+ return y;
+ }
+}
+
+RuleContextFeatures::RuleContextFeatures(const std::string& param) {
+ kSOS = TD::Convert("<s>");
+ kEOS = TD::Convert("</s>");
+
+ // TODO param lets you pass in a string from the cdec.ini file
+}
+
+void RuleContextFeatures::PrepareForInput(const SentenceMetadata& smeta) {
+ const Lattice& sl = smeta.GetSourceLattice();
+ current_input.resize(sl.size());
+ for (unsigned i = 0; i < sl.size(); ++i) {
+ if (sl[i].size() != 1) {
+ cerr << "Context features not supported with lattice inputs!\nid=" << smeta.GetSentenceId() << endl;
+ abort();
+ }
+ current_input[i] = sl[i][0].label;
+ }
+}
+
+void RuleContextFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const {
+ const TRule& rule = *edge.rule_;
+
+ if (rule.Arity() != 0 || // arity = 0, no nonterminals
+ rule.e_.size() != 1) return; // size = 1, predicted label is a single token
+
+
+ // you can see the current label "for free"
+ const WordID cur_label = rule.e_[0];
+ // (if you want to see more labels, you have to be very careful, and muck
+ // about with contexts and ant_contexts)
+
+ // but... you can look at as much of the source as you want!
+ const int from_src_index = edge.i_; // start of the span in the input being labeled
+ const int to_src_index = edge.j_; // end of the span in the input
+ // (note: in the case of tagging the size of the spans being labeled will
+ // always be 1, but in other formalisms, you can have bigger spans.)
+
+ // this is the current token being labeled:
+ const WordID cur_input = current_input[from_src_index];
+
+ // let's get the previous token in the input (may be to the left of the start
+ // of the sentence!)
+ WordID prev_input = kSOS;
+ if (from_src_index > 0) { prev_input = current_input[from_src_index - 1]; }
+ // let's get the next token (may be to the left of the start of the sentence!)
+ WordID next_input = kEOS;
+ if (to_src_index < current_input.size()) { next_input = current_input[to_src_index]; }
+
+ // now, build a feature string
+ ostringstream os;
+ // TD::Convert converts from the internal integer representation of a token
+ // to the actual token
+ os << "C1:" << TD::Convert(prev_input) << '_'
+ << TD::Convert(cur_input) << '|' << TD::Convert(cur_label);
+ // C1 is just to prevent a name clash
+
+ // pick a value
+ double fval = 1.0; // can be any real value
+
+ // add it to the feature vector FD::Convert converts the feature string to a
+ // feature int, Escape makes sure the feature string doesn't have any bad
+ // symbols that could confuse a parser somewhere
+ features->add_value(FD::Convert(Escape(os.str())), fval);
+ // that's it!
+
+ // create more features if you like...
+}
+
diff --git a/decoder/ff_context.h b/decoder/ff_context.h
new file mode 100644
index 00000000..0d22b027
--- /dev/null
+++ b/decoder/ff_context.h
@@ -0,0 +1,23 @@
+#ifndef _FF_CONTEXT_H_
+#define _FF_CONTEXT_H_
+
+#include <vector>
+#include "ff.h"
+
+class RuleContextFeatures : public FeatureFunction {
+ public:
+ RuleContextFeatures(const std::string& param);
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+ virtual void PrepareForInput(const SentenceMetadata& smeta);
+ private:
+ std::vector<WordID> current_input;
+ WordID kSOS, kEOS;
+};
+
+#endif
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 3991d38f..c9ed996c 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl {
const int fl1_;
const int fl2_;
const int bad_;
- FreqDict freq_dict_;
+ FreqDict<float> freq_dict_;
set<WordID> bad_words_;
};
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index 04dd1906..d6d79f5e 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -57,6 +57,39 @@ namespace {
}
}
+static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) {
+ vector<string> const& argv=SplitOnWhitespace(in);
+ *explicit_markers = false;
+ *order = 3;
+#define LMSPEC_NEXTARG if (i==argv.end()) { \
+ cerr << "Missing argument for "<<*last<<". "; goto usage; \
+ } else { ++i; }
+
+ for (vector<string>::const_iterator last,i=argv.begin(),e=argv.end();i!=e;++i) {
+ string const& s=*i;
+ if (s[0]=='-') {
+ if (s.size()>2) goto fail;
+ switch (s[1]) {
+ case 'x':
+ *explicit_markers = true;
+ break;
+ case 'o':
+ LMSPEC_NEXTARG; *order=atoi((*i).c_str());
+ break;
+#undef LMSPEC_NEXTARG
+ default:
+ fail:
+ cerr<<"Unknown option on NgramFeatures "<<s<<" ; ";
+ goto usage;
+ }
+ }
+ }
+ return true;
+usage:
+ cerr << "NgramFeatures is incorrect!\n";
+ return false;
+}
+
class NgramDetectorImpl {
// returns the number of unscored words at the left edge of a span
@@ -264,10 +297,10 @@ class NgramDetectorImpl {
}
public:
- explicit NgramDetectorImpl(bool explicit_markers) :
+ explicit NgramDetectorImpl(bool explicit_markers, unsigned order) :
kCDEC_UNK(TD::Convert("<unk>")) ,
add_sos_eos_(!explicit_markers) {
- order_ = 3;
+ order_ = order;
state_size_ = (order_ - 1) * sizeof(WordID) + 2 + (order_ - 1) * sizeof(WordID);
unscored_size_offset_ = (order_ - 1) * sizeof(WordID);
is_complete_offset_ = unscored_size_offset_ + 1;
@@ -316,8 +349,10 @@ class NgramDetectorImpl {
NgramDetector::NgramDetector(const string& param) {
string filename, mapfile, featname;
- bool explicit_markers = (param == "-x");
- pimpl_ = new NgramDetectorImpl(explicit_markers);
+ bool explicit_markers = false;
+ unsigned order = 3;
+ ParseArgs(param, &explicit_markers, &order);
+ pimpl_ = new NgramDetectorImpl(explicit_markers, order);
SetStateSize(pimpl_->ReserveStateSize());
}
diff --git a/decoder/ff_register.h b/decoder/ff_register.h
index 80b1457e..80b1457e 100755..100644
--- a/decoder/ff_register.h
+++ b/decoder/ff_register.h
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index 74d71b6a..74d71b6a 100755..100644
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
- cerr << "Reading word frequencies: " << fname << endl;
- ReadFile rf(fname);
- istream& ifs = *rf.stream();
- int cc=0;
- while (ifs) {
- std::string word;
- ifs >> word;
- if (word.size() == 0) continue;
- if (word[0] == '#') continue;
- double count = 0;
- ifs >> count;
- assert(count > 0.0); // use -log(f)
- counts_[TD::Convert(word)]=count;
- ++cc;
- if (cc % 10000 == 0) { std::cerr << "."; }
- }
- std::cerr << "\n";
- std::cerr << "Loaded " << cc << " words\n";
-}
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
#ifndef _FREQDICT_H_
#define _FREQDICT_H_
+#include <iostream>
#include <map>
#include <string>
#include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
+template <typename T = float>
class FreqDict {
public:
- void Load(const std::string& fname);
- float LookUp(const WordID& word) const {
- std::map<WordID,float>::const_iterator i = counts_.find(word);
- if (i == counts_.end()) return 0;
+ FreqDict() : max_() {}
+ T Max() const { return max_; }
+ void Load(const std::string& fname) {
+ std::cerr << "Reading word statistics from: " << fname << std::endl;
+ ReadFile rf(fname);
+ std::istream& ifs = *rf.stream();
+ int cc=0;
+ std::string word;
+ while (ifs) {
+ ifs >> word;
+ if (word.size() == 0) continue;
+ if (word[0] == '#') continue;
+ T count = 0;
+ ifs >> count;
+ if (count > max_) max_ = count;
+ counts_[TD::Convert(word)]=count;
+ ++cc;
+ if (cc % 10000 == 0) { std::cerr << "."; }
+ }
+ std::cerr << "\n";
+ std::cerr << "Loaded " << cc << " words\n";
+ }
+
+ T LookUp(const WordID& word) const {
+ typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+ if (i == counts_.end()) return T();
return i->second;
}
private:
- std::map<WordID, float> counts_;
+ T max_;
+ std::map<WordID, T> counts_;
};
#endif
diff --git a/decoder/fsa-decode.sh b/decoder/fsa-decode.sh
deleted file mode 100755
index 66879523..00000000
--- a/decoder/fsa-decode.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-d=$(dirname `readlink -f $0`)/
-. $d/decode.sh
-in=1dev.ur cfg=cdec-fsa decode
diff --git a/decoder/fsa-hiero.ini b/decoder/fsa-hiero.ini
deleted file mode 100755
index 7c7d0347..00000000
--- a/decoder/fsa-hiero.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-formalism=scfg
-scfg_extra_glue_grammar=glue-lda.scfg
-grammar=grammar.hiero
-show_tree_structure=true
-weights=weights.hiero
diff --git a/decoder/fsa.ini b/decoder/fsa.ini
deleted file mode 100755
index 571a2e34..00000000
--- a/decoder/fsa.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-feature_function=ShorterThanPrev
-feature_function=LongerThanPrev
diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc
index 38dbd717..074de4c9 100644
--- a/decoder/fst_translator.cc
+++ b/decoder/fst_translator.cc
@@ -30,7 +30,10 @@ struct FSTTranslatorImpl {
if (input.find("{\"rules\"") == 0) {
istringstream is(input);
Hypergraph src_cfg_hg;
- assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg));
+ if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) {
+ cerr << "Failed to read HG from JSON.\n";
+ abort();
+ }
if (add_pass_through_rules) {
SparseVector<double> feats;
feats.set_value(FD::Convert("PassThrough"), 1);
diff --git a/decoder/glue-lda.scfg b/decoder/glue-lda.scfg
deleted file mode 100755
index 27489817..00000000
--- a/decoder/glue-lda.scfg
+++ /dev/null
@@ -1,8 +0,0 @@
-[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X0,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X1,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X1,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X2,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X2,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X3,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X3,1] ||| [1] ||| GlueTop=1
diff --git a/decoder/grammar.hiero b/decoder/grammar.hiero
deleted file mode 100755
index 79adf33a..00000000
--- a/decoder/grammar.hiero
+++ /dev/null
@@ -1,151 +0,0 @@
-[X] ||| . ||| . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] . ||| [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] anciano ||| [1] old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] anciano . ||| [1] old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] anciano [X,2] ||| [1] old man [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] feo ||| ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] feo . ||| ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] feo [X,2] ||| ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato ||| [1] cat ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato . ||| [1] cat . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] ||| [1] [2] cat ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] ||| [1] cat [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] . ||| [1] [2] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro ||| [1] black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro . ||| [1] black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro [X,2] ||| [1] black cat [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande ||| big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande . ||| big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande [X,2] ||| big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro ||| black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro . ||| black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro [X,2] ||| black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga ||| [1] caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga . ||| [1] caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga [X,2] ||| [1] caterpiller [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito [X,2] ||| [1] [2] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito [X,2] . ||| [1] [2] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo ||| [1] ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo . ||| [1] ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo [X,2] ||| [1] ugly duckling [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces ||| [1] fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces . ||| [1] fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces [X,2] ||| [1] fish [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro ||| [1] dog ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro . ||| [1] dog . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] ||| [1] dog [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] ||| [1] [2] dog ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] . ||| [1] [2] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande ||| [1] big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande . ||| [1] big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande [X,2] ||| [1] big dog [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro [X,2] ||| [1] [2] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro [X,2] . ||| [1] [2] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro ||| [1] black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro . ||| [1] black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro [X,2] ||| [1] black bird [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| anciano ||| old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| anciano . ||| old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| anciano [X,1] ||| old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| el ||| the ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] ||| the [1] ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] . ||| the [1] . ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo ||| the ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo . ||| the ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo [X,2] ||| the ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande ||| the big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande . ||| the big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande [X,2] ||| the big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro ||| the black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro . ||| the black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro [X,2] ||| the black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato ||| the cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato . ||| the cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] ||| the [1] cat ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] ||| the cat [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] . ||| the [1] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro ||| the black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro . ||| the black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro [X,1] ||| the black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito [X,1] ||| the [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito [X,1] . ||| the [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo ||| the ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo . ||| the ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo [X,1] ||| the ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro ||| the dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro . ||| the dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] ||| the [1] dog ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] ||| the dog [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] . ||| the [1] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande ||| the big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande . ||| the big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande [X,1] ||| the big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro [X,1] ||| the [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro [X,1] . ||| the [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro ||| the black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro . ||| the black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro [X,1] ||| the black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| eso ||| that ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso [X,1] ||| that [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso [X,1] . ||| that [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro ||| that dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro . ||| that dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro [X,1] ||| that dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este ||| this ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este [X,1] ||| this [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este [X,1] . ||| this [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este anciano ||| this old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este anciano . ||| this old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este anciano [X,1] ||| this old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este gato ||| this cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este gato . ||| this cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este gato [X,1] ||| this cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| feo ||| ugly ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato ||| cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato . ||| cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] ||| [1] cat ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] ||| cat [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] . ||| [1] cat . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro ||| black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro . ||| black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro [X,1] ||| black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| grande ||| big ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| la ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga ||| the caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga . ||| the caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga [X,1] ||| the caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces ||| the fish ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces . ||| the fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces [X,1] ||| the fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| negro ||| black ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga ||| caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga . ||| caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga [X,1] ||| caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito ||| duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito [X,1] ||| [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito [X,1] . ||| [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo ||| ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo . ||| ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo [X,1] ||| ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces ||| fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces . ||| fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces [X,1] ||| fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro ||| dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro . ||| dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] ||| [1] dog ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] ||| dog [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] . ||| [1] dog . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande ||| big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande . ||| big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande [X,1] ||| big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro ||| bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro [X,1] ||| [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro [X,1] . ||| [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro ||| black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro . ||| black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro [X,1] ||| black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h
index b90aca47..b90aca47 100755..100644
--- a/decoder/hg_cfg.h
+++ b/decoder/hg_cfg.h
diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc
index c1c93933..9f0f50fa 100644
--- a/decoder/hg_io.cc
+++ b/decoder/hg_io.cc
@@ -624,3 +624,30 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) {
}
}
+/* Output format:
+ * #vertices
+ * for each vertex in bottom-up topological order:
+ * #downward_edges
+ * for each downward edge:
+ * RHS with [vertex_index] for NTs ||| scores
+ */
+void HypergraphIO::WriteTarget(const Hypergraph& hg) {
+ cout << hg.nodes_.size() << ' ' << hg.edges_.size() << '\n';
+ for (unsigned int i = 0; i < hg.nodes_.size(); ++i) {
+ const Hypergraph::EdgesVector &edges = hg.nodes_[i].in_edges_;
+ cout << edges.size() << '\n';
+ for (unsigned int j = 0; j < edges.size(); ++j) {
+ const Hypergraph::Edge &edge = hg.edges_[edges[j]];
+ const std::vector<WordID> &e = edge.rule_->e();
+ for (std::vector<WordID>::const_iterator word = e.begin(); word != e.end(); ++word) {
+ if (*word <= 0) {
+ cout << '[' << edge.tail_nodes_[-*word] << "] ";
+ } else {
+ cout << TD::Convert(*word) << ' ';
+ }
+ }
+ cout << "||| " << edge.rule_->scores_ << '\n';
+ }
+ }
+}
+
diff --git a/decoder/hg_io.h b/decoder/hg_io.h
index 082489d8..44817157 100644
--- a/decoder/hg_io.h
+++ b/decoder/hg_io.h
@@ -23,6 +23,9 @@ struct HypergraphIO {
static void WriteAsCFG(const Hypergraph& hg);
+ // Write only the target size information in bottom-up order.
+ static void WriteTarget(const Hypergraph& hg);
+
// serialization utils
static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0);
// return PLF string representation (undefined behavior on non-lattices)
diff --git a/decoder/hg_test.h b/decoder/hg_test.h
index 3da6533c..3da6533c 100755..100644
--- a/decoder/hg_test.h
+++ b/decoder/hg_test.h
diff --git a/decoder/lattice.cc b/decoder/lattice.cc
index e3631e59..89da3cd0 100644
--- a/decoder/lattice.cc
+++ b/decoder/lattice.cc
@@ -46,6 +46,7 @@ void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) {
Lattice& l = *pl;
vector<WordID> ids;
TD::ConvertSentence(text, &ids);
+ l.clear();
l.resize(ids.size());
for (int i = 0; i < l.size(); ++i)
l[i].push_back(LatticeArc(ids[i], 0.0, 1));
diff --git a/decoder/nt_span.h b/decoder/nt_span.h
index a918f301..a918f301 100755..100644
--- a/decoder/nt_span.h
+++ b/decoder/nt_span.h
diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h
index b603e27a..b603e27a 100755..100644
--- a/decoder/oracle_bleu.h
+++ b/decoder/oracle_bleu.h
diff --git a/decoder/perro.sh b/decoder/perro.sh
deleted file mode 100755
index 3e54ac71..00000000
--- a/decoder/perro.sh
+++ /dev/null
@@ -1 +0,0 @@
-$gdb $cdec "$@" -k 30 --show_features -c fsa-hiero.ini -i perro.ur
diff --git a/decoder/perro.ur b/decoder/perro.ur
deleted file mode 100755
index 6c5da6d7..00000000
--- a/decoder/perro.ur
+++ /dev/null
@@ -1 +0,0 @@
-eso perro feo
diff --git a/decoder/program_options.h b/decoder/program_options.h
index 87afb320..87afb320 100755..100644
--- a/decoder/program_options.h
+++ b/decoder/program_options.h
diff --git a/decoder/sentences.h b/decoder/sentences.h
index 54b5ffb3..54b5ffb3 100755..100644
--- a/decoder/sentences.h
+++ b/decoder/sentences.h
diff --git a/decoder/short.ur b/decoder/short.ur
deleted file mode 100755
index 48612801..00000000
--- a/decoder/short.ur
+++ /dev/null
@@ -1 +0,0 @@
-krAcy myN pyr kw mxtlf HAdvAt
diff --git a/decoder/trule.cc b/decoder/trule.cc
index 40235542..141b8faa 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -232,16 +232,6 @@ void TRule::ComputeArity() {
arity_ = 1 - min;
}
-static string AnonymousStrVar(int i) {
- string res("[v]");
- if(!(i <= 0 && i >= -8)) {
- cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl;
- abort();
- }
- res[1] = '1' - i;
- return res;
-}
-
string TRule::AsString(bool verbose) const {
ostringstream os;
int idx = 0;
@@ -259,15 +249,11 @@ string TRule::AsString(bool verbose) const {
}
}
os << " ||| ";
- if (idx > 9) {
- cerr << "Too many non-terminals!\n partial: " << os.str() << endl;
- exit(1);
- }
for (int i =0; i<e_.size(); ++i) {
if (i) os << ' ';
const WordID& w = e_[i];
if (w < 1)
- os << AnonymousStrVar(w);
+ os << '[' << (1-w) << ']';
else
os << TD::Convert(w);
}
diff --git a/decoder/weights-fsa b/decoder/weights-fsa
deleted file mode 100644
index 3cc96c2f..00000000
--- a/decoder/weights-fsa
+++ /dev/null
@@ -1,14 +0,0 @@
-Arity_0 1.70741473606976
-Arity_1 1.12426238048012
-Arity_2 1.14986187839554
-Glue -0.04589037041388
-LanguageModel 1.09051
-LM 1.09051
-PassThrough -3.66226367902928
-PhraseModel_0 -1.94633451863252
-PhraseModel_1 -0.1475347695476
-PhraseModel_2 -1.614818994946
-WordPenalty -3.0
-WordPenaltyFsa -0.56028442964748
-ShorterThanPrev -10
-LongerThanPrev -10
diff --git a/decoder/weights.hiero b/decoder/weights.hiero
deleted file mode 100755
index 6747f059..00000000
--- a/decoder/weights.hiero
+++ /dev/null
@@ -1,10 +0,0 @@
-SameFirstLetter 1
-LongerThanPrev 1
-ShorterThanPrev 1
-GlueTop 0.0
-Glue -1.0
-EgivenF -0.5
-FgivenE -0.5
-LexEgivenF -0.5
-LexFgivenE -0.5
-LM 1