diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
commit | 9339c80d465545aec5a6dccfef7c83ca715bf11f (patch) | |
tree | 64c56d558331edad1db3832018c80e799551c39a /gi/morf-segmentation/vocabextractor.sh | |
parent | 438dac41810b7c69fa10203ac5130d20efa2da9f (diff) | |
parent | afd7da3b2338661657ad0c4e9eec681e014d37bf (diff) |
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'gi/morf-segmentation/vocabextractor.sh')
-rwxr-xr-x | gi/morf-segmentation/vocabextractor.sh | 40 |
1 files changed, 0 insertions, 40 deletions
diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh deleted file mode 100755 index 00ae7109..00000000 --- a/gi/morf-segmentation/vocabextractor.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -d=$(dirname `readlink -f $0`) -if [ $# -lt 1 ]; then - echo "Extracts unique words and their frequencies from a subset of a corpus." - echo - echo "Usage: `basename $0` input_file [number_of_lines] > output_file" - echo -e "\tinput_file contains a sentence per line." - echo - echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor." - echo - exit -fi - -srcname=$1 -reallen=0 - -if [[ $# -gt 1 ]]; then - reallen=$2 -fi - -pattern_file=$d/invalid_vocab.patterns - -if [[ ! -f $pattern_file ]]; then - echo "Pattern file missing" - exit 1 -fi - -#this awk strips entries from the vocabulary if they contain invalid characters -#invalid characters are digits and punctuation marks, and words beginning or ending with a dash -#uniq -c extracts the unique words and counts the occurrences - -if [[ $reallen -eq 0 ]]; then - #when a zero is passed, use the whole file - zcat -f $srcname | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^ *//' - -else - zcat -f $srcname | head -n $reallen | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^ *//' -fi - |