diff options
| author | Michael Denkowski <mdenkows@cs.cmu.edu> | 2013-07-30 16:41:53 -0700 | 
|---|---|---|
| committer | Michael Denkowski <mdenkows@cs.cmu.edu> | 2013-07-30 16:41:53 -0700 | 
| commit | 8730f338e6bd0e361d9f6ff574a7000042e984d2 (patch) | |
| tree | a1f0794f214c8d3517d0dfcbade46522eedfb9d4 /word-aligner | |
| parent | b45013c6272375220a24c6c1bb9867b1d5055f06 (diff) | |
Allow reading pairs of lines from stdin, easy force alignment script
Diffstat (limited to 'word-aligner')
| -rwxr-xr-x | word-aligner/force_align.py | 59 | 
1 files changed, 59 insertions, 0 deletions
| diff --git a/word-aligner/force_align.py b/word-aligner/force_align.py new file mode 100755 index 00000000..f404fb54 --- /dev/null +++ b/word-aligner/force_align.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +import os +import sys +import subprocess + +def main(): + +    if len(sys.argv[1:]) != 4: +        sys.stderr.write('run:\n') +        sys.stderr.write('  fast_align -i corpus.f-e -d -v -o -p fwd_params >fwd_align 2>fwd_err\n') +        sys.stderr.write('  fast_align -i corpus.f-e -r -d -v -o -p rev_params >rev_align 2>rev_err\n') +        sys.stderr.write('\n') +        sys.stderr.write('then run:\n') +        sys.stderr.write('  {} fwd_params fwd_err rev_params rev_err <in.f-e >out.f-e.gdfa\n'.format(sys.argv[0])) +        sys.exit(2) + +    (f_p, f_err, r_p, r_err) = sys.argv[1:] +     +    (f_T, f_m) = find_Tm(f_err) +    (r_T, r_m) = find_Tm(r_err) +     +    fast_align = os.path.join(os.path.dirname(__file__), 'fast_align') +    f_cmd = [fast_align, '-i', '-', '-d', '-T', f_T, '-m', f_m, '-f', f_p] +    r_cmd = [fast_align, '-i', '-', '-d', '-T', r_T, '-m', r_m, '-f', r_p, '-r'] + +    atools = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'utils', 'atools') +    tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and'] + +    sys.stderr.write('running: {}\n'.format(' '.join(f_cmd))) +    sys.stderr.write('running: {}\n'.format(' '.join(r_cmd))) +    sys.stderr.write('running: {}\n'.format(' '.join(tools_cmd))) + +    f_a = subprocess.Popen(f_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) +    r_a = subprocess.Popen(r_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) +    tools = subprocess.Popen(tools_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + +    for line in sys.stdin: +        f_a.stdin.write(line) +        r_a.stdin.write(line) +        # f words ||| e words ||| links ||| score +        f_line = f_a.stdout.readline().split(' ||| ')[2] +        r_line = r_a.stdout.readline().split(' ||| ')[2] +        tools.stdin.write('{}\n'.format(f_line)) +        tools.stdin.write('{}\n'.format(r_line)) +        sys.stdout.write(tools.stdout.readline()) + +def find_Tm(err): +    (T, m) = ('', '') +    for line in open(err): +        # expected target length = source length * N +        if 'expected target length' in line: +            m = line.split()[-1] +        elif 'final tension' in line: +            T = line.split()[-1] +    return (T, m) + +if __name__ == '__main__': +    main() | 
