summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Denkowski <mdenkows@cs.cmu.edu> 2013-07-30 16:41:53 -0700
committer Michael Denkowski <mdenkows@cs.cmu.edu> 2013-07-30 16:41:53 -0700
commit 8730f338e6bd0e361d9f6ff574a7000042e984d2 (patch)
tree a1f0794f214c8d3517d0dfcbade46522eedfb9d4
parent b45013c6272375220a24c6c1bb9867b1d5055f06 (diff)
Allow reading pairs of lines from stdin, easy force alignment script
-rw-r--r-- utils/atools.cc 3
-rwxr-xr-x word-aligner/force_align.py 59
2 files changed, 60 insertions, 2 deletions
diff --git a/utils/atools.cc b/utils/atools.cc
index 24406b71..1726c4ac 100644
--- a/utils/atools.cc
+++ b/utils/atools.cc
@@ -299,8 +299,7 @@ void InitCommandLine(unsigned argc, char** argv, po::variables_map* conf) {
exit(1);
}
if ((*conf)["input_1"].as<string>() == "-" && (*conf)["input_2"].as<string>() == "-") {
- cerr << "Both inputs cannot be STDIN\n";
- exit(1);
+ cerr << "Both inputs STDIN, reading PAIRS of lines\n";
}
} else {
if (conf->count("input_2") != 0) {
diff --git a/word-aligner/force_align.py b/word-aligner/force_align.py
new file mode 100755
index 00000000..f404fb54
--- /dev/null
+++ b/word-aligner/force_align.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import subprocess
+
+def main():
+    """Force-align sentence pairs from stdin with previously trained models.
+
+    Takes four command-line arguments: the forward parameter file, the
+    forward stderr log, the reverse parameter file and the reverse stderr
+    log.  Every stdin line is sent to a forward and a reverse fast_align
+    process; the link field of each reply is passed to atools
+    (grow-diag-final-and) and the symmetrized line is written to stdout.
+
+    Exits with status 2 on a usage error, and with status 1 when a log does
+    not yield both values or a child process returns an unusable line.
+    """
+
+    if len(sys.argv[1:]) != 4:
+        sys.stderr.write('run:\n')
+        sys.stderr.write('  fast_align -i corpus.f-e -d -v -o -p fwd_params >fwd_align 2>fwd_err\n')
+        sys.stderr.write('  fast_align -i corpus.f-e -r -d -v -o -p rev_params >rev_align 2>rev_err\n')
+        sys.stderr.write('\n')
+        sys.stderr.write('then run:\n')
+        sys.stderr.write('  {} fwd_params fwd_err rev_params rev_err <in.f-e >out.f-e.gdfa\n'.format(sys.argv[0]))
+        sys.exit(2)
+
+    (f_p, f_err, r_p, r_err) = sys.argv[1:]
+
+    (f_T, f_m) = find_Tm(f_err)
+    (r_T, r_m) = find_Tm(r_err)
+    # find_Tm yields '' for a value it did not see; stop here instead of
+    # handing an empty option value to fast_align.
+    for (log, T, m) in ((f_err, f_T, f_m), (r_err, r_T, r_m)):
+        if not T or not m:
+            sys.stderr.write('error: tension or length ratio not found in {}\n'.format(log))
+            sys.exit(1)
+
+    # Use the absolute script location so the tool paths do not depend on the
+    # working directory or on how the script was invoked.
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    fast_align = os.path.join(script_dir, 'fast_align')
+    f_cmd = [fast_align, '-i', '-', '-d', '-T', f_T, '-m', f_m, '-f', f_p]
+    r_cmd = [fast_align, '-i', '-', '-d', '-T', r_T, '-m', r_m, '-f', r_p, '-r']
+
+    atools = os.path.join(os.path.dirname(script_dir), 'utils', 'atools')
+    tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']
+
+    sys.stderr.write('running: {}\n'.format(' '.join(f_cmd)))
+    sys.stderr.write('running: {}\n'.format(' '.join(r_cmd)))
+    sys.stderr.write('running: {}\n'.format(' '.join(tools_cmd)))
+
+    procs = []
+    try:
+        # Text-mode pipes so the same str lines can be exchanged whether the
+        # interpreter uses byte or unicode strings.
+        for cmd in (f_cmd, r_cmd, tools_cmd):
+            procs.append(subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                                          universal_newlines=True))
+        (f_a, r_a, tools) = procs
+
+        for line in sys.stdin:
+            # A last line without a newline would leave the children waiting
+            # for the rest of the line.
+            if not line.endswith('\n'):
+                line += '\n'
+            for proc in (f_a, r_a):
+                proc.stdin.write(line)
+                # Push the line out now; we block on the reply just below.
+                proc.stdin.flush()
+            links = []
+            for proc in (f_a, r_a):
+                # f words ||| e words ||| links ||| score
+                fields = proc.stdout.readline().split(' ||| ')
+                if len(fields) < 3:
+                    sys.stderr.write('error: unexpected output from fast_align\n')
+                    sys.exit(1)
+                links.append(fields[2].strip())
+            for link_line in links:
+                tools.stdin.write('{}\n'.format(link_line))
+            tools.stdin.flush()
+            merged = tools.stdout.readline()
+            if not merged:
+                sys.stderr.write('error: no output from atools\n')
+                sys.exit(1)
+            sys.stdout.write(merged)
+    finally:
+        # Close both pipe ends before waiting so a child can neither wait for
+        # more input nor block on output nobody reads.
+        for proc in procs:
+            try:
+                proc.stdin.close()
+            except (IOError, OSError):
+                # The child is already gone; there is nothing left to flush.
+                pass
+            proc.stdout.close()
+            proc.wait()
+
+def find_Tm(err):
+    """Read the tension and length-ratio values from a fast_align log.
+
+    err is the path of the log file to scan.  Returns a tuple (T, m) of
+    strings: T is the last whitespace-separated token of the last line
+    containing 'final tension', m the last token of the last line containing
+    'expected target length'.  A value that never appears is returned as ''.
+    """
+    (T, m) = ('', '')
+    # The with-block closes the log as soon as the scan ends, including when
+    # a line cannot be parsed.
+    with open(err) as log:
+        for line in log:
+            # expected target length = source length * N
+            if 'expected target length' in line:
+                m = line.split()[-1]
+            elif 'final tension' in line:
+                T = line.split()[-1]
+    return (T, m)
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+ main()