diff options
Diffstat (limited to 'sa-extract/sa-compile.pl')
| -rwxr-xr-x | sa-extract/sa-compile.pl | 322 | 
1 files changed, 0 insertions, 322 deletions
| diff --git a/sa-extract/sa-compile.pl b/sa-extract/sa-compile.pl deleted file mode 100755 index 1cae83a7..00000000 --- a/sa-extract/sa-compile.pl +++ /dev/null @@ -1,322 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use Getopt::Long; - -my $cwd; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $cwd = cwd(); } - -my $rootdir = `dirname $0`; chomp $rootdir; -my $compile = "$rootdir/compile_bin.py"; -my $lcp = "$rootdir/lcp_ops.py"; -die "Can't find $compile" unless -f $compile; -die "Can't execute $compile" unless -x $compile; - -sub print_help; -sub cleanup; - -my $alignment; -my $bitext; -my $catalog; -my $dryrun = 0; -my $group; -my $help = 0; -my $ini = "$rootdir/extract.ini"; -my $lm; -my $precomp; -my $no_ini = 0; -my $remove; -my $type; -my $local_only = 1; -my $output; - -# Process command-line options -if (GetOptions( -  "alignment=s" => \$alignment, -  "bitext=s" => \$bitext, -  "help" => \$help, -  "ini=s" => \$ini, -  "output=s" => \$output, -  "precomp-options=s" => \$precomp, -  "no-ini" => \$no_ini, -) == 0 || $help == 1 || @ARGV > 0){ -  print_help; -  die "\n"; -} - -open(INI, $ini) or die "Can't read $ini: $!"; - -$bitext || die "You must specify a bitext with -b\n"; -$alignment || die "You must specify an alignment with -a\n"; - -my $top_dir; -if (defined $output) { -  $top_dir = $output; -} else { -  $top_dir = "$cwd/sa-compiled"; -} - -my $type_dir = "$top_dir"; - -my $bitext_name; -my $bitext_f_file; -my $bitext_e_file; -my $bitext_dir; -if ($bitext){ -  if ($bitext =~ /(.*)=(.*),(.*)/){ -    $bitext_name = $1; -    $bitext_f_file = $2; -    $bitext_e_file = $3; -    -e $bitext_f_file || die "Could not find file $bitext_f_file\n"; -    -e $bitext_e_file || die "Could not find file $bitext_e_file\n";     -  } else { -    $bitext_name = $bitext; -  } - -  $bitext_dir = "$type_dir/bitext/$bitext_name"; -  if ($bitext_f_file){ -    if (-e $bitext_dir) { -      die "Bitext $bitext_name already exists\n"; -    } -  } else { -    unless (-e $bitext_dir){ -      die "No bitext $bitext_name. You must specify bitext files with -b\n"; -    } -  } -} - -my $max_nt = 2; -my $max_len = 5; -my $max_size = 15; -my $min_gap = 1; -my $rank1 = 100; -my $rank2 = 10; -my $precomp_file; -if ($precomp){ -  unless ($bitext_name){ -    die "You must specify a bitext with -b if using -p\n"; -  } -  my @precomp_args = split(/,/, $precomp); -  my $precomp_arg; -  for $precomp_arg (@precomp_args){ -    if ($precomp_arg =~ /(.*)=(.*)/){ -      my $key = $1; -      my $value = $2; -      unless ($value =~ /^\d+$/){ -        die "Value for -p option must be a positive integer, found $value\n"; -      } -      if ($key eq "max-len"){ $max_len = $value; } -      elsif ($key eq "max-nt"){ $max_nt = $value; } -      elsif ($key eq "max-size"){ $max_size = $value; } -      elsif ($key eq "min-gap"){ $min_gap = $value; } -      elsif ($key eq "rank1"){ $rank1 = $value; } -      elsif ($key eq "rank2"){ $rank2 = $value; } -      else{ -        die "Unknown option $key given for -p\n"; -      } -    } else { -      die "When using -p, you must specify key-value pairs using syntax: <key1>=<value1>,...,<keyN>=<valueN>\n"; -    } -  } -} -my $precomp_compile_needed = 0; -if ($bitext_name){ -  $precomp_file = "$bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; -  unless (-e $precomp_file){ -    $precomp_compile_needed = 1; -  } -} - -my $alignment_name; -my $alignment_file; -my $alignment_dir; -if ($alignment){ -  $bitext || die "Specified alignment $alignment without specifying bitext using -b\n"; -  if ($alignment =~ /(.*)=(.*)/){ -    $alignment_name = $1; -    $alignment_file = $2; -    -e $alignment_file || die "Could not find file $alignment_file\n"; -  } else { -    $alignment_name = $alignment; -  } - -  $alignment_dir = "$bitext_dir/a/$alignment_name"; -  if ($alignment_file){ -    if (-e $alignment_dir){ -      die "Alignment $alignment_name already exists for bitext $bitext_name\n"; -    } -  } else { -    require_top_dirs(); -    unless (-e $alignment_dir){ -      die "No alignment $alignment_name for bitext $bitext_name\n"; -    } -  } -} - -if ($bitext_name){ -  print STDERR " from files $bitext_f_file and $bitext_e_file\n"; -} else { -  print " No bitext\n"; -} -if ($precomp_compile_needed){ -  print STDERR "   Precompilation needed: max-len=$max_len, max-nt=$max_nt, max-size=$max_size, min-gap=$min_gap, rank1=$rank1, rank2=$rank2\n"; -} -if ($alignment_name){ -  print STDERR " Alignment = $alignment_name"; -  if ($alignment_file){ -    print STDERR " from file $alignment_file\n"; -  } -} else { -  print STDERR " No alignment\n"; -} - -my $script; -my $compile_dir; -$SIG{INT} = "cleanup"; -$SIG{TERM} = "cleanup";  -$SIG{HUP} = "cleanup"; - -  if ($bitext_e_file || $precomp_compile_needed || $alignment_file){ -    my $compiled_e_file; -    my $compiled_f_file; - -    $compile_dir = $top_dir; -    my $compile_top_dir = "$compile_dir"; - -    my $compile_bitext_dir = "$compile_top_dir/bitext/$bitext_name"; -    if ($bitext_e_file){ -      `mkdir -p $compile_bitext_dir`; -      print STDERR "\nCompiling bitext (f side)...\n"; -      `$compile -s $bitext_f_file $compile_bitext_dir/f.sa.bin`; -      die "Command failed: $!" unless $? == 0; -      print STDERR "\nCompiling bitext (e side)...\n"; -      `$compile -d $bitext_e_file $compile_bitext_dir/e.bin`; -      die "Command failed: $!" unless $? == 0; - -      $compiled_f_file = "$compile_bitext_dir/f.sa.bin"; -      $compiled_e_file = "$compile_bitext_dir/e.bin"; -    } else { # bitext already compiled -      $compiled_f_file = "$bitext_dir/f.sa.bin"; -      $compiled_e_file = "$bitext_dir/e.bin"; -    } - -    if ($precomp_compile_needed){ -      `mkdir -p $compile_bitext_dir`; -      my $top_stats_file = "$compile_bitext_dir/f.top.$rank1"; -      my $compiled_precomp_file = "$compile_bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; -      my $cmd = "$lcp -t 4 $compiled_f_file | sort -nr | head -$rank1 > $top_stats_file"; -      print STDERR "$cmd\n"; -      `$cmd`; -      die "Command failed: $cmd" unless $? == 0; -      `$compile -r max-len=$max_len max-nt=$max_nt max-size=$max_size min-gap=$min_gap rank1=$rank1 rank2=$rank2 sa=$compiled_f_file $top_stats_file $compiled_precomp_file`; -      die "Command failed: $!" unless $? == 0; -    } - -    if ($alignment_file){ -      my $compile_alignment_dir = "$compile_top_dir/bitext/$bitext_name/a/$alignment_name"; -      `mkdir -p $compile_alignment_dir`; -      print STDERR "\nCompiling alignment...\n"; -      my $cmd= "$compile -a $alignment_file $compile_alignment_dir/a.bin"; -      print STDERR "  $cmd\n"; -      `$cmd`; -      die "Command failed: $!" unless $? == 0; - -      print STDERR "\nCompiling lexical weights file...\n"; -      $cmd="$compile -x $compiled_f_file $compiled_e_file $compile_alignment_dir/a.bin $compile_alignment_dir/lex.bin"; -      print STDERR "  $cmd\n"; -      `$cmd`; -      die "Command failed: $!" unless $? == 0; -    } - -    chdir $compile_dir; -    print STDERR "Compiling done: $compile_dir\n"; -  } -   -  unless ($no_ini){ -    my $line; -    while($line=<INI>){ -      $line =~ s/^([^#]*a_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/a.bin$3/; -      $line =~ s/^([^#]*lex_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/lex.bin$3/; -      $line =~ s/^([^#]*f_sa_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/f.sa.bin$3/; -      $line =~ s/^([^#]*e_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/e.bin$3/; -      $line =~ s/^([^#]*precompute_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin$3/; - -      $line =~ s/^([^#]*max_len\s*=\s*)(.*)(\s*)$/$1$max_len$3/; -      $line =~ s/^([^#]*max_nt\s*=\s*)(.*)(\s*)$/$1$max_nt$3/; -      $line =~ s/^([^#]*max_size\s*=\s*)(.*)(\s*)$/$1$max_size$3/; -      $line =~ s/^([^#]*min_gap\s*=\s*)(.*)(\s*)$/$1$min_gap$3/; -      $line =~ s/^([^#]*rank1\s*=\s*)(.*)(\s*)$/$1$rank1$3/; -      $line =~ s/^([^#]*rank2\s*=\s*)(.*)(\s*)$/$1$rank2$3/; - -      print $line; -    } -  } - -exit(0); - -sub cleanup { -  die "Cleanup.\n"; -} - -sub print_help -{ -  my $name = `basename $0`; chomp $name; -  print << "Help"; - -usage: $name [options] - -  Manage compilation of SA-Hiero files and creation of ini files. -  In the default usage, the command deploys a set of files needed -  to create a system, and writes an ini for the system on stdout. - -options: - -  -a, --alignment <name>[=<filename>] -    Name of an alignment of a bitext (which must be specified  -    with -b unless using the -c flag).  If used with -r, the  -    alignment is removed from the deployment.  If used with -c,  -    only alignments with this name are listed.  If a filename is  -    given, then the file will be deployed using the name. - -  -b, --bitext <name>[=<f file>,<e file>] -    Name of a bitext for a particular system type (which must be -    specified with -t unless using the -c flag).  If used with -r, -    the bitext is removed from the deployment.  If used with -c, -    only bitexts with the name are listed.  If a filename is given, -    then the file will be deployed using the name. - -  -h, --help -    Prints this message. - -  -i, --ini <filename> -    Use a specific ini file as the template for a system, rather than -    the default ini file. - -  -p, --precomp-options <key1>=<value1>[,<key2>=<value2>,...,<keyN>=<valueN>] -    Set parameters of the grammar.  This must be set by $name because -    many parameters involve precomputation.  There are six keys that can -    be set:  -      max-len: maximum number of symbols (T and NT) in a grammar rule -      max-nt: maximum number of nonterminals in a grammar rule -      max-size: maximum span of a grammar rule extracted from training -      min-gap: minimum gap spanned by a nonterminal in training -      rank1: number of frequent words to precompute collocations for. -      rank2: number of super-frequent words to precompute triple  -        collocations for. -    All values must be positive integers.  If not specified, defaults are: -      max-len = 5 -      max-nt = 2  (>2 not supported) -      max-size = 10 -      min-gap = 2 -      rank1 = 100 (>300 not recommended) -      rank2 = 10  (>10 not recommended) - -  -n, --no-ini -    Do not generate an ini file on stdout.  If this option is used, then -    the requirement to specify a full system is relaxed.  Therefore, this -    option can be used when the sole objective is deployment of files. - -  -o, --output-dir -    Write the compiled model to this directory. - -Help -} | 
