blob: 00ae71093227bb453ac1e71131d943bf598ca10b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
#!/bin/bash
#
# Extracts unique words and their frequencies from a subset of a corpus.
#
# Usage: SCRIPT input_file [number_of_lines] > output_file
#   input_file       one sentence per line; read through `zcat -f`, so it may
#                    be gzip-compressed or plain text
#   number_of_lines  use only the first N lines; 0 or omitted = whole file
#
# Words matching any regular expression listed in invalid_vocab.patterns
# (looked up in the directory this script lives in) are dropped from the
# vocabulary. The result is written to stdout as "COUNT WORD" lines.
#
# stdout is the data channel, so every diagnostic goes to stderr.
# `set -o pipefail` is deliberately not enabled: when number_of_lines is
# given, `head` closes the pipe early and the reader may end with SIGPIPE.

#######################################
# Prints the help text.
# Outputs: help text on stdout (callers redirect it as needed)
#######################################
usage() {
  printf '%s\n' "Extracts unique words and their frequencies from a subset of a corpus."
  printf '\n'
  printf 'Usage: %s input_file [number_of_lines] > output_file\n' "${0##*/}"
  printf '\tinput_file contains a sentence per line.\n'
  printf '\n'
  printf '%s\n' "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor."
  printf '\n'
}

#######################################
# Splits the corpus into words, filters them and counts the occurrences.
# Arguments:
#   $1 - corpus file (plain or gzip-compressed)
#   $2 - number of leading lines to use; must be a non-negative decimal
#        integer, 0 means the whole file
#   $3 - file with one extended regular expression per line; every word
#        matching any of them is discarded
# Outputs: "COUNT WORD" lines on stdout, ordered by `sort` on the word
#######################################
count_words() {
  local src=$1 limit=$2 patterns=$3

  {
    # String test instead of an arithmetic one: "007" or huge values must
    # not be interpreted (octal / overflow) by the shell.
    if [[ "$limit" =~ ^0+$ ]]; then
      zcat -f -- "$src"
    else
      zcat -f -- "$src" | head -n "$limit"
    fi
  } |
    tr ' ' '\n' |                   # one word per line
    grep -E -v -f "$patterns" |     # drop words matching an invalid pattern
    sort |
    uniq -c |                       # collapse duplicates, prefix with count
    sed 's/^ *//'                   # strip the padding uniq puts before the count
}

#######################################
# Entry point: validates the arguments and runs the extraction.
# Arguments: input_file [number_of_lines]
# Returns:   0 on success or when only the usage text was shown (kept at 0
#            for backward compatibility), 1 on invalid arguments or when
#            the pattern file is missing
#######################################
main() {
  if (( $# < 1 )); then
    usage >&2
    return 0
  fi

  local src=$1
  local limit=${2:-0}

  # Reject anything but digits up front; the value is later handed to
  # `head -n` and must never be evaluated as a shell arithmetic expression.
  if [[ ! "$limit" =~ ^[0-9]+$ ]]; then
    printf '%s: number_of_lines must be a non-negative integer, got "%s"\n' \
      "${0##*/}" "$limit" >&2
    return 1
  fi

  local script_dir
  script_dir=$(dirname -- "$(readlink -f -- "$0")") || return 1

  local pattern_file=$script_dir/invalid_vocab.patterns
  if [[ ! -f "$pattern_file" ]]; then
    echo "Pattern file missing" >&2
    return 1
  fi

  count_words "$src" "$limit" "$pattern_file"
}

main "$@"
|