From 73f2592edc0c8f510029e4834acd899751c85862 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 20 Sep 2013 20:01:03 +0200
Subject: loo
---
extractor/grammar_extractor.cc | 6 ++++--
extractor/grammar_extractor.h | 3 ++-
extractor/rule_factory.cc | 5 +++--
extractor/rule_factory.h | 3 ++-
extractor/run_extractor.cc | 13 +++++++++++--
extractor/sample_alignment.txt | 3 +++
extractor/sample_bitext.txt | 3 +++
extractor/sampler.cc | 35 ++++++++++++++++++++++++++++++-----
extractor/sampler.h | 5 ++++-
9 files changed, 62 insertions(+), 14 deletions(-)
(limited to 'extractor')
diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc
index 8050ce7b..1fbdee5b 100644
--- a/extractor/grammar_extractor.cc
+++ b/extractor/grammar_extractor.cc
@@ -3,11 +3,13 @@
#include
#include
#include
+#include
#include "grammar.h"
#include "rule.h"
#include "rule_factory.h"
#include "vocabulary.h"
+#include "data_array.h"
using namespace std;
@@ -32,10 +34,10 @@ GrammarExtractor::GrammarExtractor(
vocabulary(vocabulary),
rule_factory(rule_factory) {}
-Grammar GrammarExtractor::GetGrammar(const string& sentence) {
+Grammar GrammarExtractor::GetGrammar(const string& sentence, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array) {
vector words = TokenizeSentence(sentence);
vector word_ids = AnnotateWords(words);
- return rule_factory->GetGrammar(word_ids);
+ return rule_factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array);
}
vector GrammarExtractor::TokenizeSentence(const string& sentence) {
diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h
index b36ceeb9..6c0aafbf 100644
--- a/extractor/grammar_extractor.h
+++ b/extractor/grammar_extractor.h
@@ -4,6 +4,7 @@
#include
#include
#include
+#include
using namespace std;
@@ -44,7 +45,7 @@ class GrammarExtractor {
// Converts the sentence to a vector of word ids and uses the RuleFactory to
// extract the SCFG rules which may be used to decode the sentence.
- Grammar GetGrammar(const string& sentence);
+ Grammar GetGrammar(const string& sentence, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array);
private:
// Splits the sentence in a vector of words.
diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc
index 8c30fb9e..e52019ae 100644
--- a/extractor/rule_factory.cc
+++ b/extractor/rule_factory.cc
@@ -17,6 +17,7 @@
#include "suffix_array.h"
#include "time_util.h"
#include "vocabulary.h"
+#include "data_array.h"
using namespace std;
using namespace chrono;
@@ -100,7 +101,7 @@ HieroCachingRuleFactory::HieroCachingRuleFactory() {}
HieroCachingRuleFactory::~HieroCachingRuleFactory() {}
-Grammar HieroCachingRuleFactory::GetGrammar(const vector& word_ids) {
+Grammar HieroCachingRuleFactory::GetGrammar(const vector& word_ids, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array) {
Clock::time_point start_time = Clock::now();
double total_extract_time = 0;
double total_intersect_time = 0;
@@ -192,7 +193,7 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector& word_ids) {
Clock::time_point extract_start = Clock::now();
if (!state.starts_with_x) {
// Extract rules for the sampled set of occurrences.
- PhraseLocation sample = sampler->Sample(next_node->matchings);
+ PhraseLocation sample = sampler->Sample(next_node->matchings, blacklisted_sentence_ids, source_data_array);
vector new_rules =
rule_extractor->ExtractRules(next_phrase, sample);
rules.insert(rules.end(), new_rules.begin(), new_rules.end());
diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h
index 52e8712a..c7332720 100644
--- a/extractor/rule_factory.h
+++ b/extractor/rule_factory.h
@@ -3,6 +3,7 @@
#include
#include
+#include
#include "matchings_trie.h"
@@ -71,7 +72,7 @@ class HieroCachingRuleFactory {
// Constructs SCFG rules for a given sentence.
// (See class description for more details.)
- virtual Grammar GetGrammar(const vector& word_ids);
+ virtual Grammar GetGrammar(const vector& word_ids, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array);
protected:
HieroCachingRuleFactory();
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 8a9ca89d..6eb55073 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -75,7 +75,9 @@ int main(int argc, char** argv) {
("max_samples", po::value()->default_value(300),
"Maximum number of samples")
("tight_phrases", po::value()->default_value(true),
- "False if phrases may be loose (better, but slower)");
+ "False if phrases may be loose (better, but slower)")
+ ("leave_one_out", po::value()->zero_tokens(),
+ "do leave-one-out estimation of grammars (e.g. for extracting grammars for the training set");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
@@ -96,6 +98,11 @@ int main(int argc, char** argv) {
return 1;
}
+ bool leave_one_out = false;
+ if (vm.count("leave_one_out")) {
+ leave_one_out = true;
+ }
+
int num_threads = vm["threads"].as();
cerr << "Grammar extraction will use " << num_threads << " threads." << endl;
@@ -223,7 +230,9 @@ int main(int argc, char** argv) {
}
suffixes[i] = suffix;
- Grammar grammar = extractor.GetGrammar(sentences[i]);
+ unordered_set blacklisted_sentence_ids;
+ if (leave_one_out) blacklisted_sentence_ids.insert(i);
+ Grammar grammar = extractor.GetGrammar(sentences[i], blacklisted_sentence_ids, source_data_array);
ofstream output(GetGrammarFilePath(grammar_path, i).c_str());
output << grammar;
}
diff --git a/extractor/sample_alignment.txt b/extractor/sample_alignment.txt
index 80b446a4..f0292b01 100644
--- a/extractor/sample_alignment.txt
+++ b/extractor/sample_alignment.txt
@@ -1,2 +1,5 @@
0-0 1-1 2-2
1-0 2-1
+0-0
+0-0 1-1
+0-0 1-1
diff --git a/extractor/sample_bitext.txt b/extractor/sample_bitext.txt
index 93d6b39d..2b7c8e40 100644
--- a/extractor/sample_bitext.txt
+++ b/extractor/sample_bitext.txt
@@ -1,2 +1,5 @@
+asdf ||| dontseeme
+qqq asdf ||| zzz fdsa
+asdf qqq ||| fdsa zzz
ana are mere . ||| anna has apples .
ana bea mult lapte . ||| anna drinks a lot of milk .
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index d81956b5..2f7738db 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -12,18 +12,43 @@ Sampler::Sampler() {}
Sampler::~Sampler() {}
-PhraseLocation Sampler::Sample(const PhraseLocation& location) const {
+PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array) const {
vector sample;
int num_subpatterns;
if (location.matchings == NULL) {
// Sample suffix array range.
num_subpatterns = 1;
int low = location.sa_low, high = location.sa_high;
- double step = max(1.0, (double) (high - low) / max_samples);
- for (double i = low; i < high && sample.size() < max_samples; i += step) {
- sample.push_back(suffix_array->GetSuffix(Round(i)));
+ double step = Round(max(1.0, (double) (high - low) / max_samples));
+ int i = location.sa_low;
+ bool found = false;
+ while (sample.size() < max_samples && i <= location.sa_high) {
+ int x = suffix_array->GetSuffix(i);
+ int id = source_data_array->GetSentenceId(x);
+ if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) {
+ int backoff_step = 1;
+ while (true) {
+ int j = i - backoff_step;
+ x = suffix_array->GetSuffix(j);
+ id = source_data_array->GetSentenceId(x);
+ if ((j >= location.sa_low) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end())
+ && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; }
+ int k = i + backoff_step;
+ x = suffix_array->GetSuffix(k);
+ id = source_data_array->GetSentenceId(x);
+ if ((k <= location.sa_high) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end())
+ && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; }
+ if (j <= location.sa_low && k >= location.sa_high) break;
+ backoff_step++;
+ }
+ } else {
+ found = true;
+ }
+ if (found && (find(sample.begin(), sample.end(), x) == sample.end())) sample.push_back(x);
+ i += step;
+ found = false;
}
- } else {
+ } else { // when do we get here?
// Sample vector of occurrences.
num_subpatterns = location.num_subpatterns;
int num_matchings = location.matchings->size() / num_subpatterns;
diff --git a/extractor/sampler.h b/extractor/sampler.h
index be4aa1bb..30e747fd 100644
--- a/extractor/sampler.h
+++ b/extractor/sampler.h
@@ -2,6 +2,9 @@
#define _SAMPLER_H_
#include
+#include
+
+#include "data_array.h"
using namespace std;
@@ -20,7 +23,7 @@ class Sampler {
virtual ~Sampler();
// Samples uniformly at most max_samples phrase occurrences.
- virtual PhraseLocation Sample(const PhraseLocation& location) const;
+ virtual PhraseLocation Sample(const PhraseLocation& location, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array) const;
protected:
Sampler();
--
cgit v1.2.3
From eca30edc48af8922e96512f27a98f74d1c8b9721 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 20 Sep 2013 20:10:19 +0200
Subject: example file
---
extractor/sample_source.txt | 5 +++++
1 file changed, 5 insertions(+)
create mode 100644 extractor/sample_source.txt
(limited to 'extractor')
diff --git a/extractor/sample_source.txt b/extractor/sample_source.txt
new file mode 100644
index 00000000..9b46dd6a
--- /dev/null
+++ b/extractor/sample_source.txt
@@ -0,0 +1,5 @@
+asdf
+qqq asdf
+asdf qqq
+ana are mere .
+ana bea mult lapte .
--
cgit v1.2.3
From 077f83a44c420b42541159271c6482cf068fc972 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Tue, 24 Sep 2013 15:50:03 +0200
Subject: loo #2
---
extractor/sampler.cc | 26 +++++++++++++++-----------
1 file changed, 15 insertions(+), 11 deletions(-)
(limited to 'extractor')
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index 2f7738db..cb470962 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -20,35 +20,39 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_setGetSuffix(i);
int id = source_data_array->GetSentenceId(x);
if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) {
+ found = false;
int backoff_step = 1;
while (true) {
+ if ((double)backoff_step >= step) break;
int j = i - backoff_step;
x = suffix_array->GetSuffix(j);
id = source_data_array->GetSentenceId(x);
- if ((j >= location.sa_low) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end())
- && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; }
+ if (j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+ found = true; last = i; break;
+ }
int k = i + backoff_step;
x = suffix_array->GetSuffix(k);
id = source_data_array->GetSentenceId(x);
- if ((k <= location.sa_high) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end())
- && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; }
- if (j <= location.sa_low && k >= location.sa_high) break;
+ if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+ found = true; last = k; break;
+ }
+ if (j <= last && k >= high) break;
backoff_step++;
}
} else {
found = true;
+ last = i;
}
- if (found && (find(sample.begin(), sample.end(), x) == sample.end())) sample.push_back(x);
+ if (found) sample.push_back(x);
i += step;
- found = false;
}
- } else { // when do we get here?
+ } else {
// Sample vector of occurrences.
num_subpatterns = location.num_subpatterns;
int num_matchings = location.matchings->size() / num_subpatterns;
--
cgit v1.2.3
From 2d3948b98bb9e8c7bad60f1acd99ff0b42b3ae30 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sun, 10 Nov 2013 00:58:44 -0500
Subject: guard against direct includes of tr1
---
configure.ac | 3 +-
decoder/Makefile.am | 4 -
decoder/apply_models.cc | 11 +-
decoder/cdec_ff.cc | 11 -
decoder/decoder.cc | 10 +-
decoder/dwarf.cc | 3209 -----------------------------------
decoder/dwarf.h | 286 ----
decoder/earley_composer.cc | 11 +-
decoder/factored_lexicon_helper.cc | 1 +
decoder/factored_lexicon_helper.h | 3 +-
decoder/ff_dwarf.cc | 894 ----------
decoder/ff_dwarf.h | 100 --
decoder/ff_lm.cc | 101 --
decoder/ff_lm.h | 22 -
decoder/ff_source_syntax.cc | 6 +
decoder/ff_source_syntax2.cc | 4 +-
decoder/ff_source_syntax2_p.cc | 10 +-
decoder/ff_source_syntax_p.cc | 9 +-
decoder/ff_wordalign.cc | 9 +-
decoder/ff_wordalign.h | 13 +-
decoder/ff_wordset.cc | 52 +-
decoder/ff_wordset.h | 73 +-
decoder/grammar.cc | 11 +-
decoder/hg_intersect.cc | 8 +-
decoder/kbest.h | 11 +-
decoder/maxtrans_blunsom.cc | 11 +-
decoder/phrasebased_translator.cc | 11 +-
decoder/scfg_translator.cc | 10 +-
decoder/sentence_metadata.h | 4 +-
extractor/Makefile.am | 27 +-
klm/lm/builder/Makefile.am | 6 +-
mteval/mbr_kbest.cc | 8 +-
mteval/ns_ter.cc | 8 +-
mteval/scorer.cc | 23 +-
mteval/ter.cc | 8 +-
python/cdec/sa/strmap.cc | 11 +-
training/crf/mpi_online_optimize.cc | 8 +-
training/mira/kbest_cut_mira.cc | 65 +-
training/mira/kbest_mira.cc | 18 +-
training/mira/mira.py | 98 +-
training/pro/mr_pro_map.cc | 1 -
training/utils/candidate_set.cc | 11 +-
training/utils/online_optimizer.h | 8 +-
training/utils/optimize_test.cc | 6 +-
utils/hash.h | 12 +-
word-aligner/fast_align.cc | 8 +-
word-aligner/ttables.cc | 1 -
word-aligner/ttables.h | 11 +-
48 files changed, 369 insertions(+), 4877 deletions(-)
delete mode 100644 decoder/dwarf.cc
delete mode 100644 decoder/dwarf.h
delete mode 100644 decoder/ff_dwarf.cc
delete mode 100644 decoder/ff_dwarf.h
(limited to 'extractor')
diff --git a/configure.ac b/configure.ac
index 37d8cced..c3b749e3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cdec],[2013-03-08])
+AC_INIT([cdec],[2013-11-10])
AC_CONFIG_SRCDIR([decoder/cdec.cc])
AM_INIT_AUTOMAKE
AC_CONFIG_HEADERS(config.h)
@@ -7,7 +7,6 @@ AC_PROG_LEX
case $LEX in
:) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);;
esac
-# CPPFLAGS="$CPPFLAGS -std=c++0x"
AC_PROG_CC
AC_PROG_CXX
AX_CXX_COMPILE_STDCXX_11
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 914faaea..39a13ad8 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -41,7 +41,6 @@ libcdec_a_SOURCES = \
cfg_options.h \
csplit.h \
decoder.h \
- dwarf.h \
earley_composer.h \
exp_semiring.h \
factored_lexicon_helper.h \
@@ -51,7 +50,6 @@ libcdec_a_SOURCES = \
ff_charset.h \
ff_context.h \
ff_csplit.h \
- ff_dwarf.h \
ff_external.h \
ff_factory.h \
ff_klm.h \
@@ -103,8 +101,6 @@ libcdec_a_SOURCES = \
maxtrans_blunsom.cc \
cdec_ff.cc \
cfg.cc \
- dwarf.cc \
- ff_dwarf.cc \
ff_external.cc \
rule_lexer.cc \
fst_translator.cc \
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 330de9e2..2e093d6a 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -8,8 +8,14 @@
#include
#include
-#include
-#include
+#ifdef HAVE_CXX11
+# include
+# include
+#else
+# include
+# include
+namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; }
+#endif
#include
@@ -23,7 +29,6 @@
#define FAST_CP_2 3
using namespace std;
-using namespace std::tr1;
struct Candidate;
typedef SmallVectorInt JVector;
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index e7b31f50..09597e87 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -29,12 +29,8 @@
#include "ff_register.h"
#include "ff_charset.h"
#include "ff_wordset.h"
-#include "ff_dwarf.h"
#include "ff_external.h"
-#ifdef HAVE_GLC
-#include
-#endif
void register_feature_functions() {
static bool registered = false;
@@ -51,9 +47,6 @@ void register_feature_functions() {
RegisterFF();
//TODO: use for all features the new Register which requires static FF::usage(false,false) give name
-#ifdef HAVE_RANDLM
- ff_registry.Register("RandLM", new FFFactory);
-#endif
ff_registry.Register("SpanFeatures", new FFFactory());
ff_registry.Register("NgramFeatures", new FFFactory());
ff_registry.Register("RuleContextFeatures", new FFFactory());
@@ -98,10 +91,6 @@ void register_feature_functions() {
ff_registry.Register("WordPairFeatures", new FFFactory);
ff_registry.Register("SourcePathFeatures", new FFFactory);
ff_registry.Register("WordSet", new FFFactory);
- ff_registry.Register("Dwarf", new FFFactory);
ff_registry.Register("External", new FFFactory);
-#ifdef HAVE_GLC
- ff_registry.Register("ContextCRF", new FFFactory);
-#endif
}
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 31e6dc46..2c0e07b7 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -1,6 +1,11 @@
#include "decoder.h"
-#include
+#ifdef HAVE_CXX11
+# include
+#else
+# include
+namespace std { using std::tr1::unordered_map; }
+#endif
#include
#include
#include
@@ -61,7 +66,6 @@
static const double kMINUS_EPSILON = -1e-6; // don't be too strict
using namespace std;
-using namespace std::tr1;
namespace po = boost::program_options;
static bool verbose_feature_functions=true;
@@ -90,7 +94,7 @@ struct ELengthWeightFunction {
}
};
inline void ShowBanner() {
- cerr << "cdec v1.0 (c) 2009-2011 by Chris Dyer\n";
+ cerr << "cdec (c) 2009--2013 by Chris Dyer\n";
}
inline string str(char const* name,po::variables_map const& conf) {
diff --git a/decoder/dwarf.cc b/decoder/dwarf.cc
deleted file mode 100644
index fb0404a6..00000000
--- a/decoder/dwarf.cc
+++ /dev/null
@@ -1,3209 +0,0 @@
-#include "dwarf.h"
-#include "tdict.h"
-#include "wordid.h"
-#include "lattice.h"
-#include "ff_dwarf.h"
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include