summary | refs | log | tree | commit | diff
path: root/python/pkg
diff options
context:
space:
mode:
author: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-07-28 17:12:40 -0400
committer: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-07-28 17:12:40 -0400
commit: 2ed119792bc81f56b682613cbacab8aef8a693da (patch)
tree: d03b21cc7b61dc0f347a32a36cfc479b5fca1f1b /python/pkg
parent: ec7b8d08993072030e4749d850ead9edaf9deb5c (diff)
[python] Suffix array compiler can read bitext (-b)
Diffstat (limited to 'python/pkg')
-rw-r--r-- python/pkg/cdec/sa/compile.py | 24
1 files changed, 18 insertions, 6 deletions
diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py
index 30e605a6..2a89243b 100644
--- a/python/pkg/cdec/sa/compile.py
+++ b/python/pkg/cdec/sa/compile.py
@@ -37,16 +37,22 @@ def main():
help='Number of pre-computed super-frequent patterns)')
parser.add_argument('-c', '--config', default='/dev/stdout',
help='Output configuration')
- parser.add_argument('-o', '--output', required=True,
- help='Output path')
- parser.add_argument('-f', '--source', required=True,
+ parser.add_argument('-f', '--source',
help='Source language corpus')
- parser.add_argument('-e', '--target', required=True,
+ parser.add_argument('-e', '--target',
help='Target language corpus')
+ parser.add_argument('-b', '--bitext',
+ help='Parallel text (source ||| target)')
parser.add_argument('-a', '--alignment', required=True,
help='Bitext word alignment')
+ parser.add_argument('-o', '--output', required=True,
+ help='Output path')
args = parser.parse_args()
+ if not ((args.source and args.target) or args.bitext):
+ parser.error('a parallel corpus is required\n'
+ '\tuse -f (source) with -e (target) or -b (bitext)')
+
param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2")
params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2)
@@ -61,11 +67,17 @@ def main():
lex_bin = os.path.join(args.output, 'lex.bin')
logger.info('Compiling source suffix array')
- f_sa = cdec.sa.SuffixArray(from_text=args.source)
+ if args.bitext:
+ f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')
+ else:
+ f_sa = cdec.sa.SuffixArray(from_text=args.source)
f_sa.write_binary(f_sa_bin)
logger.info('Compiling target data array')
- e = cdec.sa.DataArray(from_text=args.target)
+ if args.bitext:
+ e = cdec.sa.DataArray(from_text=args.bitext, side='target')
+ else:
+ e = cdec.sa.DataArray(from_text=args.target)
e.write_binary(e_bin)
logger.info('Precomputing frequent phrases')