Merge remote branch 'upstream/master'

Conflicts: Jamroot bjam decoder/Jamfile decoder/cdec.cc dpmert/Jamfile jam-files/sanity.jam klm/lm/Jamfile klm/util/Jamfile mira/Jamfile
author: Kenneth Heafield <github@kheafield.com> 2012-10-22 12:07:20 +0100
committer: Kenneth Heafield <github@kheafield.com> 2012-10-22 12:07:20 +0100
commit: 5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch)
tree: 9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/morf-segmentation/vocabextractor.sh
parent: cf9994131993b40be62e90e213b1e11e6b550143 (diff)
parent: 21825a09d97c2e0afd20512f306fb25fed55e529 (diff)
1 files changed, 0 insertions, 40 deletions
diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh
deleted file mode 100755
index 00ae7109..00000000
--- a/gi/morf-segmentation/vocabextractor.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-
-d=$(dirname `readlink -f $0`)
-if [ $# -lt 1 ]; then
-	echo "Extracts unique words and their frequencies from a subset of a corpus."
-	echo
-	echo "Usage: `basename $0` input_file [number_of_lines] > output_file"
-	echo -e "\tinput_file contains a sentence per line."
-	echo
-	echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor."
-	echo
-	exit
-fi
-
-srcname=$1
-reallen=0
-
-if [[ $# -gt 1 ]]; then
-  reallen=$2
-fi
-
-pattern_file=$d/invalid_vocab.patterns
-
-if [[ ! -f $pattern_file ]]; then
-  echo "Pattern file missing"
-  exit 1 
-fi
-
-#this awk strips entries from the vocabulary if they contain invalid characters
-#invalid characters are digits and punctuation marks, and words beginning or ending with a dash
-#uniq -c extracts the unique words and counts the occurrences
-
-if [[ $reallen -eq 0 ]]; then
-	#when a zero is passed, use the whole file
-  zcat -f $srcname | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^  *//' 
-
-else
-	zcat -f $srcname | head -n $reallen | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^  *//'
-fi
-
author	Kenneth Heafield <github@kheafield.com>	2012-10-22 12:07:20 +0100
committer	Kenneth Heafield <github@kheafield.com>	2012-10-22 12:07:20 +0100
commit	5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch)
tree	9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/morf-segmentation/vocabextractor.sh
parent	cf9994131993b40be62e90e213b1e11e6b550143 (diff)
parent	21825a09d97c2e0afd20512f306fb25fed55e529 (diff)