gi/morf-segmentation/vocabextractor.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40

#!/bin/bash
#
# Extracts unique words and their frequencies from a subset of a corpus.
# Usage: vocabextractor.sh input_file [number_of_lines] > output_file

# Directory containing this script (symlinks resolved); used further down to
# locate the pattern file that lives next to the script.
d=$(dirname -- "$(readlink -f -- "$0")")

if [[ $# -lt 1 ]]; then
  # Usage goes to stderr: stdout is the data stream that callers redirect
  # into output_file, so help text must never end up there.
  {
    echo "Extracts unique words and their frequencies from a subset of a corpus."
    echo
    echo "Usage: $(basename -- "$0") input_file [number_of_lines] > output_file"
    printf '\t%s\n' "input_file contains a sentence per line."
    echo
    echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor."
    echo
  } >&2
  # A missing required argument is an invocation error, so report failure.
  exit 1
fi

# Positional arguments:
#   $1 - corpus file, one sentence per line (read through zcat -f below)
#   $2 - optional number of leading lines to process; 0 (the default)
#        selects the whole file
srcname=$1
reallen=${2:-0}

# The count is later used in an arithmetic test and handed to head -n.
# Inside [[ ... -eq ... ]] bash evaluates the operand as an arithmetic
# expression, so anything other than plain digits must be rejected up front.
if [[ ! $reallen =~ ^[0-9]+$ ]]; then
  echo "number_of_lines must be a non-negative integer, got: $reallen" >&2
  exit 1
fi
# Force base 10 so that values with leading zeros (e.g. 08) are not
# misparsed as invalid octal numbers by later arithmetic.
reallen=$((10#$reallen))

# Regular expressions describing invalid vocabulary entries; the file is
# expected to sit in the same directory as this script.
pattern_file=$d/invalid_vocab.patterns

if [[ ! -f "$pattern_file" ]]; then
  echo "Pattern file missing: $pattern_file" >&2
  exit 1
fi

# Vocabulary extraction pipeline:
#   1. zcat -f reads the corpus whether or not it is gzip-compressed;
#      when a non-zero line count was given, head keeps only that many lines.
#   2. sed puts every space-separated token on its own line.
#   3. grep -E -v -f drops every token matching a regex in the pattern file
#      (per the original note: entries containing digits or punctuation
#      marks, and words beginning or ending with a dash).
#   4. sort | uniq -c collapses duplicates and prefixes each word with its
#      number of occurrences.
#   5. the final sed strips the leading padding added by uniq -c, leaving
#      "count word" lines.
{
  if [[ "$reallen" -eq 0 ]]; then
    # When a zero is passed, use the whole file.
    zcat -f -- "$srcname"
  else
    zcat -f -- "$srcname" | head -n "$reallen"
  fi
} \
  | sed 's/ /\n/g' \
  | grep -E -v -f "$pattern_file" \
  | sort \
  | uniq -c \
  | sed 's/^  *//'