diff options
-rw-r--r-- | configure.ac | 7 | ||||
-rw-r--r-- | decoder/Makefile.am | 2 | ||||
-rw-r--r-- | mteval/Makefile.am | 3 | ||||
-rw-r--r-- | mteval/levenshtein.h | 29 | ||||
-rw-r--r-- | mteval/ns.cc | 3 | ||||
-rw-r--r-- | mteval/ns_cer.cc | 26 | ||||
-rw-r--r-- | mteval/ns_cer.h | 3 | ||||
-rw-r--r-- | mteval/ns_wer.cc | 35 | ||||
-rw-r--r-- | mteval/ns_wer.h | 20 | ||||
-rw-r--r-- | utils/Makefile.am | 1 | ||||
-rw-r--r-- | word-aligner/Makefile.am | 1 | ||||
-rwxr-xr-x | word-aligner/force_align.py | 69 |
12 files changed, 168 insertions, 31 deletions
diff --git a/configure.ac b/configure.ac index d7ced0ea..eae2f32e 100644 --- a/configure.ac +++ b/configure.ac @@ -177,6 +177,13 @@ then AM_CONDITIONAL([HAVE_GTEST], true) fi +# Enable static linking +AC_ARG_WITH( + [static], + AS_HELP_STRING([--with-static], [Statically link binaries when possible]), + AC_SUBST(AS_TR_CPP([STATIC_FLAGS]), ["-all-static"]), +) + #BOOST_THREADS CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_REGEX_LDFLAGS $BOOST_SERIALIZATION_LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS" diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 8e61c13e..e46a7120 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -20,7 +20,7 @@ trule_test_SOURCES = trule_test.cc trule_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a cdec_SOURCES = cdec.cc -cdec_LDFLAGS= -rdynamic +cdec_LDFLAGS= -rdynamic $(STATIC_FLAGS) cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/search/libksearch.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../klm/util/double-conversion/libklm_util_double.a AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/decoder/test_data\" -DBOOST_TEST_DYN_LINK -W -Wno-sign-compare -I$(top_srcdir) -I$(top_srcdir)/mteval -I$(top_srcdir)/utils -I$(top_srcdir)/klm diff --git a/mteval/Makefile.am b/mteval/Makefile.am index c833eb01..aac3e6b5 100644 --- a/mteval/Makefile.am +++ b/mteval/Makefile.am @@ -14,6 +14,7 @@ libmteval_a_SOURCES = \ aer_scorer.h \ comb_scorer.h \ external_scorer.h \ + levenshtein.h \ ns.h \ ns_cer.h \ ns_comb.h \ @@ -21,6 +22,7 @@ libmteval_a_SOURCES = \ ns_ext.h \ ns_ssk.h \ ns_ter.h \ + ns_wer.h \ scorer.h \ ter.h \ aer_scorer.cc \ @@ -34,6 +36,7 @@ libmteval_a_SOURCES = \ ns_ext.cc \ ns_ssk.cc \ ns_ter.cc \ + ns_wer.cc \ scorer.cc \ ter.cc diff --git a/mteval/levenshtein.h b/mteval/levenshtein.h new file mode 100644 index 00000000..13a97047 --- /dev/null +++ b/mteval/levenshtein.h @@ -0,0 +1,29 @@ +#ifndef _LEVENSHTEIN_H_ +#define _LEVENSHTEIN_H_ + +namespace cdec { + +template <typename V> +inline unsigned LevenshteinDistance(const V& a, const V& b) { + const unsigned m = a.size(), n = b.size(); + std::vector<unsigned> edit((m + 1) * 2); + for (unsigned i = 0; i <= n; i++) { + for (unsigned j = 0; j <= m; j++) { + if (i == 0) + edit[j] = j; + else if (j == 0) + edit[(i % 2) * (m + 1)] = i; + else + edit[(i % 2) * (m + 1) + j] = std::min(std::min( + edit[(i % 2) * (m + 1) + j - 1] + 1, + edit[((i - 1) % 2) * (m + 1) + j] + 1), + edit[((i - 1) % 2) * (m + 1) + (j - 1)] + + (a[j - 1] == b[i - 1] ? 0 : 1)); + } + } + return edit[(n % 2) * (m + 1) + m]; +} + +} + +#endif diff --git a/mteval/ns.cc b/mteval/ns.cc index c1ea238b..075e0121 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -3,6 +3,7 @@ #include "ns_ext.h" #include "ns_comb.h" #include "ns_cer.h" +#include "ns_wer.h" #include "ns_ssk.h" #include <cstdio> @@ -285,6 +286,8 @@ EvaluationMetric* EvaluationMetric::Instance(const string& imetric_id) { m = new CombinationMetric(metric_id); } else if (metric_id == "CER") { m = new CERMetric; + } else if (metric_id == "WER") { + m = new WERMetric; } else { cerr << "Implement please: " << metric_id << endl; abort(); diff --git a/mteval/ns_cer.cc b/mteval/ns_cer.cc index a843d471..da6683b1 100644 --- a/mteval/ns_cer.cc +++ b/mteval/ns_cer.cc @@ -1,5 +1,6 @@ #include "ns_cer.h" #include "tdict.h" +#include "levenshtein.h" static const unsigned kNUMFIELDS = 2; static const unsigned kEDITDISTANCE = 0; @@ -13,27 +14,6 @@ unsigned CERMetric::SufficientStatisticsVectorSize() const { return 2; } -unsigned CERMetric::EditDistance(const std::string& hyp, - const std::string& ref) const { - const unsigned m = hyp.size(), n = ref.size(); - std::vector<unsigned> edit((m + 1) * 2); - for(unsigned i = 0; i < n + 1; i++) { - for(unsigned j = 0; j < m + 1; j++) { - if(i == 0) - edit[j] = j; - else if(j == 0) - edit[(i%2)*(m+1)] = i; - else - edit[(i%2)*(m+1) + j] = std::min(std::min(edit[(i%2)*(m+1) + j-1] + 1, - edit[((i-1)%2)*(m+1) + j] + 1), - edit[((i-1)%2)*(m+1) + (j-1)] - + (hyp[j-1] == ref[i-1] ? 0 : 1)); - - } - } - return edit[(n%2)*(m+1) + m]; -} - void CERMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp, const std::vector<std::vector<WordID> >& refs, SufficientStats* out) const { @@ -42,7 +22,7 @@ void CERMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp, float best_score = hyp_str.size(); for (size_t i = 0; i < refs.size(); ++i) { std::string ref_str(TD::GetString(refs[i])); - float score = EditDistance(hyp_str, ref_str); + float score = cdec::LevenshteinDistance(hyp_str, ref_str); if (score < best_score) { out->fields[kEDITDISTANCE] = score; out->fields[kCHARCOUNT] = ref_str.size(); @@ -50,6 +30,8 @@ void CERMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp, } } } + float CERMetric::ComputeScore(const SufficientStats& stats) const { return stats.fields[kEDITDISTANCE] / stats.fields[kCHARCOUNT]; } + diff --git a/mteval/ns_cer.h b/mteval/ns_cer.h index 9d211181..cb2b4b4a 100644 --- a/mteval/ns_cer.h +++ b/mteval/ns_cer.h @@ -5,9 +5,6 @@ class CERMetric : public EvaluationMetric { friend class EvaluationMetric; - private: - unsigned EditDistance(const std::string& hyp, - const std::string& ref) const; protected: CERMetric() : EvaluationMetric("CER") {} diff --git a/mteval/ns_wer.cc b/mteval/ns_wer.cc new file mode 100644 index 00000000..f9b2bbbb --- /dev/null +++ b/mteval/ns_wer.cc @@ -0,0 +1,35 @@ +#include "ns_wer.h" +#include "tdict.h" +#include "levenshtein.h" + +static const unsigned kNUMFIELDS = 2; +static const unsigned kEDITDISTANCE = 0; +static const unsigned kCHARCOUNT = 1; + +bool WERMetric::IsErrorMetric() const { + return true; +} + +unsigned WERMetric::SufficientStatisticsVectorSize() const { + return 2; +} + +void WERMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp, + const std::vector<std::vector<WordID> >& refs, + SufficientStats* out) const { + out->fields.resize(kNUMFIELDS); + float best_score = hyp.size(); + for (size_t i = 0; i < refs.size(); ++i) { + float score = cdec::LevenshteinDistance(hyp, refs[i]); + if (score < best_score) { + out->fields[kEDITDISTANCE] = score; + out->fields[kCHARCOUNT] = refs[i].size(); + best_score = score; + } + } +} + +float WERMetric::ComputeScore(const SufficientStats& stats) const { + return stats.fields[kEDITDISTANCE] / stats.fields[kCHARCOUNT]; +} + diff --git a/mteval/ns_wer.h b/mteval/ns_wer.h new file mode 100644 index 00000000..24c85d83 --- /dev/null +++ b/mteval/ns_wer.h @@ -0,0 +1,20 @@ +#ifndef _NS_WER_H_ +#define _NS_WER_H_ + +#include "ns.h" + +class WERMetric : public EvaluationMetric { + friend class EvaluationMetric; + protected: + WERMetric() : EvaluationMetric("WER") {} + + public: + virtual bool IsErrorMetric() const; + virtual unsigned SufficientStatisticsVectorSize() const; + virtual void ComputeSufficientStatistics(const std::vector<WordID>& hyp, + const std::vector<std::vector<WordID> >& refs, + SufficientStats* out) const; + virtual float ComputeScore(const SufficientStats& stats) const; +}; + +#endif diff --git a/utils/Makefile.am b/utils/Makefile.am index 18495c3a..727fa8a5 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -92,6 +92,7 @@ reconstruct_weights_SOURCES = reconstruct_weights.cc reconstruct_weights_LDADD = libutils.a atools_SOURCES = atools.cc atools_LDADD = libutils.a +atools_LDFLAGS = $(STATIC_FLAGS) phmt_SOURCES = phmt.cc phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) diff --git a/word-aligner/Makefile.am b/word-aligner/Makefile.am index 075ad009..071e4977 100644 --- a/word-aligner/Makefile.am +++ b/word-aligner/Makefile.am @@ -2,6 +2,7 @@ bin_PROGRAMS = fast_align binderiv fast_align_SOURCES = fast_align.cc ttables.cc da.h ttables.h fast_align_LDADD = ../utils/libutils.a +fast_align_LDFLAGS = $(STATIC_FLAGS) binderiv_SOURCES = binderiv.cc binderiv_LDADD = ../utils/libutils.a diff --git a/word-aligner/force_align.py b/word-aligner/force_align.py index 8386e6a5..5cef9026 100755 --- a/word-aligner/force_align.py +++ b/word-aligner/force_align.py @@ -1,11 +1,68 @@ #!/usr/bin/env python import os +import subprocess import sys +import threading -# Hook into realtime -sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'realtime', 'rt')) -from aligner import ForceAligner +# Simplified, non-threadsafe version for force_align.py +# Use the version in realtime for development +class Aligner: + + def __init__(self, fwd_params, fwd_err, rev_params, rev_err, heuristic='grow-diag-final-and'): + + cdec_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align') + atools = os.path.join(cdec_root, 'utils', 'atools') + + (fwd_T, fwd_m) = self.read_err(fwd_err) + (rev_T, rev_m) = self.read_err(rev_err) + + fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params] + rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r'] + tools_cmd = [atools, '-i', '-', '-j', '-', '-c', heuristic] + + self.fwd_align = popen_io(fwd_cmd) + self.rev_align = popen_io(rev_cmd) + self.tools = popen_io(tools_cmd) + + def align(self, line): + self.fwd_align.stdin.write('{}\n'.format(line)) + self.rev_align.stdin.write('{}\n'.format(line)) + # f words ||| e words ||| links ||| score + fwd_line = self.fwd_align.stdout.readline().split('|||')[2].strip() + rev_line = self.rev_align.stdout.readline().split('|||')[2].strip() + self.tools.stdin.write('{}\n'.format(fwd_line)) + self.tools.stdin.write('{}\n'.format(rev_line)) + al_line = self.tools.stdout.readline().strip() + return al_line + + def close(self): + self.fwd_align.stdin.close() + self.fwd_align.wait() + self.rev_align.stdin.close() + self.rev_align.wait() + self.tools.stdin.close() + self.tools.wait() + + def read_err(self, err): + (T, m) = ('', '') + for line in open(err): + # expected target length = source length * N + if 'expected target length' in line: + m = line.split()[-1] + # final tension: N + elif 'final tension' in line: + T = line.split()[-1] + return (T, m) + +def popen_io(cmd): + p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + def consume(s): + for _ in s: + pass + threading.Thread(target=consume, args=(p.stderr,)).start() + return p def main(): @@ -20,16 +77,18 @@ def main(): sys.stderr.write('where heuristic is one of: (intersect union grow-diag grow-diag-final grow-diag-final-and) default=grow-diag-final-and\n') sys.exit(2) - aligner = ForceAligner(*sys.argv[1:]) + aligner = Aligner(*sys.argv[1:]) while True: line = sys.stdin.readline() if not line: break - sys.stdout.write('{}\n'.format(aligner.align_formatted(line.strip()))) + sys.stdout.write('{}\n'.format(aligner.align(line.strip()))) sys.stdout.flush() aligner.close() if __name__ == '__main__': main() + + |