diff options
author | bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 18:03:47 +0000 |
---|---|---|
committer | bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 18:03:47 +0000 |
commit | c57c05d19fb306f7f50cc02516a8a2901c920cca (patch) | |
tree | 1120643e63ea2b46d6a3bc0b338fb225682c9dd7 /gi/morf-segmentation/vocabextractor.sh | |
parent | 58681ee5816d13c04002ca8aebe23c2768da4e5b (diff) |
Adding morphology-segmentation stuff. Changes include: local-gi-pipeline (--morf arg), eval-pipeline (--oov-grammar, --lmorder)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@382 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/morf-segmentation/vocabextractor.sh')
-rwxr-xr-x | gi/morf-segmentation/vocabextractor.sh | 40 |
1 files changed, 40 insertions, 0 deletions
#!/bin/bash
#
# vocabextractor.sh - extract the unique words of a corpus and their
# frequencies, in a format suitable for use with Morfessor.
#
# Usage: vocabextractor.sh input_file [number_of_lines] > output_file
#   input_file       One sentence per line. Read through `zcat -f`, so it
#                    may be plain text or gzip-compressed.
#   number_of_lines  Only the first number_of_lines lines are used.
#                    0 (the default) means the whole file.
#
# Words matching any regular expression listed in invalid_vocab.patterns
# (which must sit in the same directory as this script) are removed.
#
# Output: one "<count> <word>" entry per line on stdout.
# Exit status: 1 on a usage error, an invalid number_of_lines or a missing
# pattern file; otherwise the status of the last pipeline stage.
#
# `set -o pipefail` is deliberately not enabled: when number_of_lines is
# given, `head` closes the pipe early and zcat is expected to die of SIGPIPE.

# Directory holding this script (symlinks resolved); the pattern file lives here.
script_dir=$(dirname -- "$(readlink -f -- "$0")")

# Print the help text to stdout; callers redirect it as needed.
usage() {
  printf '%s\n' \
    "Extracts unique words and their frequencies from a subset of a corpus." \
    "" \
    "Usage: ${0##*/} input_file [number_of_lines] > output_file"
  printf '\t%s\n' "input_file contains a sentence per line."
  printf '%s\n' \
    "" \
    "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor." \
    ""
}

# Pass stdin through unchanged, or only its first $reallen lines when a
# non-zero limit was requested.
take_lines() {
  if ((reallen == 0)); then
    cat
  else
    head -n "$reallen"
  fi
}

if (($# < 1)); then
  # stdout is the data channel (normally redirected to output_file), so the
  # help text goes to stderr and the call is reported as a failure.
  usage >&2
  exit 1
fi

srcname=$1
reallen=${2:-0}

if [[ ! $reallen =~ ^[0-9]+$ ]]; then
  printf 'number_of_lines must be a non-negative integer, got: %s\n' \
    "$reallen" >&2
  exit 1
fi
# Force base 10 so that values such as "08" are not parsed as invalid octal.
reallen=$((10#$reallen))

pattern_file=$script_dir/invalid_vocab.patterns

if [[ ! -f "$pattern_file" ]]; then
  echo "Pattern file missing" >&2
  exit 1
fi

# Pipeline stages:
#   1. decompress (or just cat) the corpus and optionally truncate it;
#   2. put one token per line by turning every space into a newline
#      (runs of spaces yield empty tokens, which survive unless the
#      pattern file excludes them);
#   3. drop every token that matches a pattern in $pattern_file; per the
#      original author's note these cover digits, punctuation marks and
#      words beginning or ending with a dash;
#   4. sort | uniq -c counts the occurrences of each remaining token;
#   5. strip the leading padding that uniq -c puts before the count.
zcat -f -- "$srcname" \
  | take_lines \
  | tr ' ' '\n' \
  | grep -E -v -f "$pattern_file" \
  | sort \
  | uniq -c \
  | sed 's/^ *//'