summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--configure.ac7
-rw-r--r--decoder/Makefile.am2
-rw-r--r--mteval/Makefile.am3
-rw-r--r--mteval/levenshtein.h29
-rw-r--r--mteval/ns.cc3
-rw-r--r--mteval/ns_cer.cc26
-rw-r--r--mteval/ns_cer.h3
-rw-r--r--mteval/ns_wer.cc35
-rw-r--r--mteval/ns_wer.h20
-rw-r--r--utils/Makefile.am1
-rw-r--r--word-aligner/Makefile.am1
-rwxr-xr-xword-aligner/force_align.py69
12 files changed, 168 insertions, 31 deletions
diff --git a/configure.ac b/configure.ac
index d7ced0ea..eae2f32e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -177,6 +177,13 @@ then
AM_CONDITIONAL([HAVE_GTEST], true)
fi
+# Enable static linking.
+# AC_ARG_WITH runs its action-if-given for --without-static and
+# --with-static=no as well (withval is then "no"), so STATIC_FLAGS is only
+# set when the option was not negated. AC_SUBST still registers the variable
+# unconditionally, so $(STATIC_FLAGS) expands to nothing otherwise.
+AC_ARG_WITH(
+ [static],
+ AS_HELP_STRING([--with-static], [Statically link binaries when possible]),
+ [AS_IF([test "x$withval" != xno],
+        [AC_SUBST(AS_TR_CPP([STATIC_FLAGS]), ["-all-static"])])],
+)
+
#BOOST_THREADS
CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_REGEX_LDFLAGS $BOOST_SERIALIZATION_LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS"
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 8e61c13e..e46a7120 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -20,7 +20,7 @@ trule_test_SOURCES = trule_test.cc
trule_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
cdec_SOURCES = cdec.cc
-cdec_LDFLAGS= -rdynamic
+cdec_LDFLAGS= -rdynamic $(STATIC_FLAGS)
cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/search/libksearch.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../klm/util/double-conversion/libklm_util_double.a
AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/decoder/test_data\" -DBOOST_TEST_DYN_LINK -W -Wno-sign-compare -I$(top_srcdir) -I$(top_srcdir)/mteval -I$(top_srcdir)/utils -I$(top_srcdir)/klm
diff --git a/mteval/Makefile.am b/mteval/Makefile.am
index c833eb01..aac3e6b5 100644
--- a/mteval/Makefile.am
+++ b/mteval/Makefile.am
@@ -14,6 +14,7 @@ libmteval_a_SOURCES = \
aer_scorer.h \
comb_scorer.h \
external_scorer.h \
+ levenshtein.h \
ns.h \
ns_cer.h \
ns_comb.h \
@@ -21,6 +22,7 @@ libmteval_a_SOURCES = \
ns_ext.h \
ns_ssk.h \
ns_ter.h \
+ ns_wer.h \
scorer.h \
ter.h \
aer_scorer.cc \
@@ -34,6 +36,7 @@ libmteval_a_SOURCES = \
ns_ext.cc \
ns_ssk.cc \
ns_ter.cc \
+ ns_wer.cc \
scorer.cc \
ter.cc
diff --git a/mteval/levenshtein.h b/mteval/levenshtein.h
new file mode 100644
index 00000000..13a97047
--- /dev/null
+++ b/mteval/levenshtein.h
@@ -0,0 +1,29 @@
+#ifndef _LEVENSHTEIN_H_
+#define _LEVENSHTEIN_H_
+
+// std::min and std::vector are used below; include them here so this header
+// compiles regardless of what the including file pulled in first.
+#include <algorithm>
+#include <vector>
+
+namespace cdec {
+
+// Returns the Levenshtein (edit) distance between the sequences a and b: the
+// minimum number of single-element insertions, deletions and substitutions
+// needed to turn one into the other.
+//
+// V must provide size() and operator[], and its elements must be comparable
+// with operator==. Only two rows of the dynamic-programming table are stored,
+// so the scratch buffer holds 2 * (a.size() + 1) counters.
+template <typename V>
+inline unsigned LevenshteinDistance(const V& a, const V& b) {
+  const unsigned m = a.size(), n = b.size();
+  const unsigned width = m + 1;  // cells per row
+  std::vector<unsigned> edit(width * 2);
+  for (unsigned i = 0; i <= n; i++) {
+    // Rows alternate between the two halves of the buffer.
+    const unsigned cur = (i % 2) * width;
+    const unsigned prev = ((i + 1) % 2) * width;
+    for (unsigned j = 0; j <= m; j++) {
+      if (i == 0) {
+        edit[j] = j;  // first row: distance from the empty prefix of b
+      } else if (j == 0) {
+        edit[cur] = i;  // first column: distance from the empty prefix of a
+      } else {
+        const unsigned left = edit[cur + j - 1] + 1;
+        const unsigned up = edit[prev + j] + 1;
+        const unsigned diag = edit[prev + (j - 1)]
+            + (a[j - 1] == b[i - 1] ? 0 : 1);
+        edit[cur + j] = std::min(std::min(left, up), diag);
+      }
+    }
+  }
+  return edit[(n % 2) * width + m];
+}
+
+}
+
+#endif
diff --git a/mteval/ns.cc b/mteval/ns.cc
index c1ea238b..075e0121 100644
--- a/mteval/ns.cc
+++ b/mteval/ns.cc
@@ -3,6 +3,7 @@
#include "ns_ext.h"
#include "ns_comb.h"
#include "ns_cer.h"
+#include "ns_wer.h"
#include "ns_ssk.h"
#include <cstdio>
@@ -285,6 +286,8 @@ EvaluationMetric* EvaluationMetric::Instance(const string& imetric_id) {
m = new CombinationMetric(metric_id);
} else if (metric_id == "CER") {
m = new CERMetric;
+ } else if (metric_id == "WER") {
+ m = new WERMetric;
} else {
cerr << "Implement please: " << metric_id << endl;
abort();
diff --git a/mteval/ns_cer.cc b/mteval/ns_cer.cc
index a843d471..da6683b1 100644
--- a/mteval/ns_cer.cc
+++ b/mteval/ns_cer.cc
@@ -1,5 +1,6 @@
#include "ns_cer.h"
#include "tdict.h"
+#include "levenshtein.h"
static const unsigned kNUMFIELDS = 2;
static const unsigned kEDITDISTANCE = 0;
@@ -13,27 +14,6 @@ unsigned CERMetric::SufficientStatisticsVectorSize() const {
return 2;
}
-unsigned CERMetric::EditDistance(const std::string& hyp,
- const std::string& ref) const {
- const unsigned m = hyp.size(), n = ref.size();
- std::vector<unsigned> edit((m + 1) * 2);
- for(unsigned i = 0; i < n + 1; i++) {
- for(unsigned j = 0; j < m + 1; j++) {
- if(i == 0)
- edit[j] = j;
- else if(j == 0)
- edit[(i%2)*(m+1)] = i;
- else
- edit[(i%2)*(m+1) + j] = std::min(std::min(edit[(i%2)*(m+1) + j-1] + 1,
- edit[((i-1)%2)*(m+1) + j] + 1),
- edit[((i-1)%2)*(m+1) + (j-1)]
- + (hyp[j-1] == ref[i-1] ? 0 : 1));
-
- }
- }
- return edit[(n%2)*(m+1) + m];
-}
-
void CERMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp,
const std::vector<std::vector<WordID> >& refs,
SufficientStats* out) const {
@@ -42,7 +22,7 @@ void CERMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp,
float best_score = hyp_str.size();
for (size_t i = 0; i < refs.size(); ++i) {
std::string ref_str(TD::GetString(refs[i]));
- float score = EditDistance(hyp_str, ref_str);
+ float score = cdec::LevenshteinDistance(hyp_str, ref_str);
if (score < best_score) {
out->fields[kEDITDISTANCE] = score;
out->fields[kCHARCOUNT] = ref_str.size();
@@ -50,6 +30,8 @@ void CERMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp,
}
}
}
+
float CERMetric::ComputeScore(const SufficientStats& stats) const {
return stats.fields[kEDITDISTANCE] / stats.fields[kCHARCOUNT];
}
+
diff --git a/mteval/ns_cer.h b/mteval/ns_cer.h
index 9d211181..cb2b4b4a 100644
--- a/mteval/ns_cer.h
+++ b/mteval/ns_cer.h
@@ -5,9 +5,6 @@
class CERMetric : public EvaluationMetric {
friend class EvaluationMetric;
- private:
- unsigned EditDistance(const std::string& hyp,
- const std::string& ref) const;
protected:
CERMetric() : EvaluationMetric("CER") {}
diff --git a/mteval/ns_wer.cc b/mteval/ns_wer.cc
new file mode 100644
index 00000000..f9b2bbbb
--- /dev/null
+++ b/mteval/ns_wer.cc
@@ -0,0 +1,35 @@
+#include "ns_wer.h"
+#include "tdict.h"
+#include "levenshtein.h"
+
+// Layout of the sufficient-statistics vector filled in by WERMetric.
+// Number of fields the vector is resized to.
+static const unsigned kNUMFIELDS = 2;
+// Index of the edit distance to the selected reference.
+static const unsigned kEDITDISTANCE = 0;
+// Index of the selected reference's length. Despite the name, this file
+// stores refs[i].size() here, i.e. a count of WordIDs rather than characters.
+static const unsigned kCHARCOUNT = 1;
+
+// Always reports true for this metric.
+// NOTE(review): presumably this tells callers that lower scores are better;
+// confirm against the EvaluationMetric base class.
+bool WERMetric::IsErrorMetric() const {
+ return true;
+}
+
+// Number of entries in the statistics vector; the same constant that
+// ComputeSufficientStatistics resizes the vector to.
+unsigned WERMetric::SufficientStatisticsVectorSize() const {
+  return kNUMFIELDS;
+}
+
+// Fills *out with the statistics of the reference closest to hyp in
+// word-level edit distance: fields[kEDITDISTANCE] receives that distance and
+// fields[kCHARCOUNT] receives that reference's length in words.
+//
+// The first reference is always recorded and later ones replace it only when
+// strictly closer. Seeding the search with hyp.size() instead would skip
+// every reference whose distance is >= hyp.size() (for example a one-word
+// hypothesis that matches nothing, or an empty hypothesis), leaving both
+// fields unassigned. If refs is empty the fields are resized but not assigned.
+void WERMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp,
+                                            const std::vector<std::vector<WordID> >& refs,
+                                            SufficientStats* out) const {
+  out->fields.resize(kNUMFIELDS);
+  bool have_best = false;
+  float best_score = 0;
+  for (size_t i = 0; i < refs.size(); ++i) {
+    const float score = cdec::LevenshteinDistance(hyp, refs[i]);
+    if (!have_best || score < best_score) {
+      out->fields[kEDITDISTANCE] = score;
+      out->fields[kCHARCOUNT] = refs[i].size();
+      best_score = score;
+      have_best = true;
+    }
+  }
+}
+
+// Returns fields[kEDITDISTANCE] divided by fields[kCHARCOUNT], i.e. the
+// stored edit distance over the stored reference length.
+// There is no guard for a zero length field in this function.
+float WERMetric::ComputeScore(const SufficientStats& stats) const {
+ return stats.fields[kEDITDISTANCE] / stats.fields[kCHARCOUNT];
+}
+
diff --git a/mteval/ns_wer.h b/mteval/ns_wer.h
new file mode 100644
index 00000000..24c85d83
--- /dev/null
+++ b/mteval/ns_wer.h
@@ -0,0 +1,20 @@
+#ifndef _NS_WER_H_
+#define _NS_WER_H_
+
+#include "ns.h"
+
+// Evaluation metric constructed with the id "WER".
+// The constructor is protected and EvaluationMetric is a friend, so objects
+// are created from inside EvaluationMetric rather than by arbitrary callers.
+class WERMetric : public EvaluationMetric {
+ friend class EvaluationMetric;
+ protected:
+ WERMetric() : EvaluationMetric("WER") {}
+
+ public:
+ // Overrides of the EvaluationMetric interface; defined in ns_wer.cc.
+ virtual bool IsErrorMetric() const;
+ virtual unsigned SufficientStatisticsVectorSize() const;
+ // hyp is one hypothesis, refs its references, out receives the statistics.
+ virtual void ComputeSufficientStatistics(const std::vector<WordID>& hyp,
+ const std::vector<std::vector<WordID> >& refs,
+ SufficientStats* out) const;
+ virtual float ComputeScore(const SufficientStats& stats) const;
+};
+
+#endif
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 18495c3a..727fa8a5 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -92,6 +92,7 @@ reconstruct_weights_SOURCES = reconstruct_weights.cc
reconstruct_weights_LDADD = libutils.a
atools_SOURCES = atools.cc
atools_LDADD = libutils.a
+atools_LDFLAGS = $(STATIC_FLAGS)
phmt_SOURCES = phmt.cc
phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
diff --git a/word-aligner/Makefile.am b/word-aligner/Makefile.am
index 075ad009..071e4977 100644
--- a/word-aligner/Makefile.am
+++ b/word-aligner/Makefile.am
@@ -2,6 +2,7 @@ bin_PROGRAMS = fast_align binderiv
fast_align_SOURCES = fast_align.cc ttables.cc da.h ttables.h
fast_align_LDADD = ../utils/libutils.a
+fast_align_LDFLAGS = $(STATIC_FLAGS)
binderiv_SOURCES = binderiv.cc
binderiv_LDADD = ../utils/libutils.a
diff --git a/word-aligner/force_align.py b/word-aligner/force_align.py
index 8386e6a5..5cef9026 100755
--- a/word-aligner/force_align.py
+++ b/word-aligner/force_align.py
@@ -1,11 +1,68 @@
#!/usr/bin/env python
import os
+import subprocess
import sys
+import threading
-# Hook into realtime
-sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'realtime', 'rt'))
-from aligner import ForceAligner
+# Simplified, non-threadsafe version for force_align.py
+# Use the version in realtime for development
+class Aligner:
+    """Drives two fast_align processes and one atools process over pipes.
+
+    The processes are started once in __init__; each align() call sends one
+    line through them and reads back one result line. Call close() when done.
+    """
+
+    def __init__(self, fwd_params, fwd_err, rev_params, rev_err, heuristic='grow-diag-final-and'):
+        """Start the subprocesses.
+
+        fwd_params / rev_params are passed to fast_align with '-f' (the second
+        process additionally gets '-r'). fwd_err / rev_err are files from which
+        read_err() extracts the '-T' and '-m' values. heuristic is passed to
+        atools with '-c'.
+        """
+
+        # Binaries are located relative to this script's parent directory.
+        cdec_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
+        atools = os.path.join(cdec_root, 'utils', 'atools')
+
+        (fwd_T, fwd_m) = self.read_err(fwd_err)
+        (rev_T, rev_m) = self.read_err(rev_err)
+
+        fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
+        rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
+        tools_cmd = [atools, '-i', '-', '-j', '-', '-c', heuristic]
+
+        self.fwd_align = popen_io(fwd_cmd)
+        self.rev_align = popen_io(rev_cmd)
+        self.tools = popen_io(tools_cmd)
+
+    def align(self, line):
+        """Send line to both fast_align processes, pass their link fields to
+        atools, and return the stripped line atools answers with."""
+        self.fwd_align.stdin.write('{}\n'.format(line))
+        self.rev_align.stdin.write('{}\n'.format(line))
+        # Flush explicitly: the readline() calls below block until the child
+        # has seen the request, which must not depend on the pipe's buffering.
+        self.fwd_align.stdin.flush()
+        self.rev_align.stdin.flush()
+        # f words ||| e words ||| links ||| score
+        fwd_line = self.fwd_align.stdout.readline().split('|||')[2].strip()
+        rev_line = self.rev_align.stdout.readline().split('|||')[2].strip()
+        self.tools.stdin.write('{}\n'.format(fwd_line))
+        self.tools.stdin.write('{}\n'.format(rev_line))
+        self.tools.stdin.flush()
+        al_line = self.tools.stdout.readline().strip()
+        return al_line
+
+    def close(self):
+        """Close each subprocess's stdin and wait for it to exit, in turn."""
+        self.fwd_align.stdin.close()
+        self.fwd_align.wait()
+        self.rev_align.stdin.close()
+        self.rev_align.wait()
+        self.tools.stdin.close()
+        self.tools.wait()
+
+    def read_err(self, err):
+        """Return (T, m) parsed from the file at path err.
+
+        Each value is the last whitespace-separated token of the last matching
+        line; a value stays '' when no line matches.
+        """
+        (T, m) = ('', '')
+        # The with-block closes the file even if reading fails part-way.
+        with open(err) as err_file:
+            for line in err_file:
+                # expected target length = source length * N
+                if 'expected target length' in line:
+                    m = line.split()[-1]
+                # final tension: N
+                elif 'final tension' in line:
+                    T = line.split()[-1]
+        return (T, m)
+
+def popen_io(cmd):
+    """Start cmd with stdin, stdout and stderr all connected to pipes.
+
+    A helper thread reads the child's stderr and discards it. The Popen
+    object is returned.
+    """
+    proc = subprocess.Popen(cmd,
+                            stdin=subprocess.PIPE,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
+
+    def drain(stream):
+        # Iterate to end-of-file, throwing every line away.
+        for _unused in stream:
+            continue
+
+    drainer = threading.Thread(target=drain, args=(proc.stderr,))
+    drainer.start()
+    return proc
def main():
@@ -20,16 +77,18 @@ def main():
sys.stderr.write('where heuristic is one of: (intersect union grow-diag grow-diag-final grow-diag-final-and) default=grow-diag-final-and\n')
sys.exit(2)
- aligner = ForceAligner(*sys.argv[1:])
+ aligner = Aligner(*sys.argv[1:])
while True:
line = sys.stdin.readline()
if not line:
break
- sys.stdout.write('{}\n'.format(aligner.align_formatted(line.strip())))
+ sys.stdout.write('{}\n'.format(aligner.align(line.strip())))
sys.stdout.flush()
aligner.close()
if __name__ == '__main__':
main()
+
+