summaryrefslogtreecommitdiff
path: root/gi/morf-segmentation/vocabextractor.sh
diff options
context:
space:
mode:
authorAvneesh Saluja <asaluja@gmail.com>2013-03-28 18:28:16 -0700
committerAvneesh Saluja <asaluja@gmail.com>2013-03-28 18:28:16 -0700
commit3d8d656fa7911524e0e6885647173474524e0784 (patch)
tree81b1ee2fcb67980376d03f0aa48e42e53abff222 /gi/morf-segmentation/vocabextractor.sh
parentbe7f57fdd484e063775d7abf083b9fa4c403b610 (diff)
parent96fedabebafe7a38a6d5928be8fff767e411d705 (diff)
fixed conflicts
Diffstat (limited to 'gi/morf-segmentation/vocabextractor.sh')
-rwxr-xr-xgi/morf-segmentation/vocabextractor.sh40
1 files changed, 0 insertions, 40 deletions
diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh
deleted file mode 100755
index 00ae7109..00000000
--- a/gi/morf-segmentation/vocabextractor.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-
-d=$(dirname `readlink -f $0`)
-if [ $# -lt 1 ]; then
- echo "Extracts unique words and their frequencies from a subset of a corpus."
- echo
- echo "Usage: `basename $0` input_file [number_of_lines] > output_file"
- echo -e "\tinput_file contains a sentence per line."
- echo
- echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor."
- echo
- exit
-fi
-
-srcname=$1
-reallen=0
-
-if [[ $# -gt 1 ]]; then
- reallen=$2
-fi
-
-pattern_file=$d/invalid_vocab.patterns
-
-if [[ ! -f $pattern_file ]]; then
- echo "Pattern file missing"
- exit 1
-fi
-
-#this awk strips entries from the vocabulary if they contain invalid characters
-#invalid characters are digits and punctuation marks, and words beginning or ending with a dash
-#uniq -c extracts the unique words and counts the occurrences
-
-if [[ $reallen -eq 0 ]]; then
- #when a zero is passed, use the whole file
- zcat -f $srcname | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^ *//'
-
-else
- zcat -f $srcname | head -n $reallen | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^ *//'
-fi
-