From 8d222e20d8f253aa2c73d139d8ae6cc69483d071 Mon Sep 17 00:00:00 2001
From: bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Fri, 23 Jul 2010 18:03:47 +0000
Subject: Adding morphology-segmentation stuff. Changes include:
 local-gi-pipeline (--morf arg), eval-pipeline (--oov-grammar, --lmorder)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@382 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/morf-segmentation/vocabextractor.sh | 40 ++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100755 gi/morf-segmentation/vocabextractor.sh

(limited to 'gi/morf-segmentation/vocabextractor.sh')

diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh
new file mode 100755
index 00000000..00ae7109
--- /dev/null
+++ b/gi/morf-segmentation/vocabextractor.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Prints "<count> <word>" for each distinct word of a corpus, for use with Morfessor.
+# $1: corpus with one sentence per line; read with zcat -f (plain or gzip-compressed).
+# $2: optional number of leading lines to use; 0 or omitted means the whole file.
+# Words matching a regex in invalid_vocab.patterns (beside this script) are dropped.
+
+d=$(dirname -- "$(readlink -f -- "$0")")
+if [[ $# -lt 1 ]]; then
+	echo "Extracts unique words and their frequencies from a subset of a corpus."
+	echo
+	echo "Usage: $(basename -- "$0") input_file [number_of_lines] > output_file"
+	echo -e "\tinput_file contains a sentence per line."
+	echo
+	echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor."
+	echo
+	exit
+fi
+
+srcname=$1
+reallen=${2:-0}
+pattern_file=$d/invalid_vocab.patterns
+
+# Diagnostics go to stderr: stdout is normally redirected into the vocabulary file.
+if [[ ! $reallen =~ ^[0-9]+$ ]]; then
+  echo "number_of_lines must be a non-negative integer: $reallen" >&2
+  exit 1
+fi
+if [[ ! -f $pattern_file ]]; then
+  echo "Pattern file missing" >&2
+  exit 1
+fi
+
+# Pipeline: select the lines, put one word per line, drop every word that matches a
+# pattern (grep -v), count each distinct word (uniq -c), strip uniq's left padding.
+zcat -f -- "$srcname" |
+  if (( 10#$reallen == 0 )); then cat; else head -n "$((10#$reallen))"; fi |
+  tr ' ' '\n' |
+  grep -E -v -f "$pattern_file" |
+  sort | uniq -c | sed 's/^  *//'
+
-- 
cgit v1.2.3