summaryrefslogtreecommitdiff
path: root/gi/morf-segmentation/filter_docs.pl
diff options
context:
space:
mode:
Diffstat (limited to 'gi/morf-segmentation/filter_docs.pl')
-rwxr-xr-x gi/morf-segmentation/filter_docs.pl 24
1 files changed, 24 insertions, 0 deletions
diff --git a/gi/morf-segmentation/filter_docs.pl b/gi/morf-segmentation/filter_docs.pl
new file mode 100755
index 00000000..a78575da
--- /dev/null
+++ b/gi/morf-segmentation/filter_docs.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl
+
+# Filters the phrase&cluster document set to retain only documents that correspond to words or morphs, i.e. not crossing word boundaries.
+# Usage: filter_docs.pl [mark]
+#  STDIN: data in the doc.txt format (i.e. phrase\t blahblah ), most likely from cdec extractor
+#  STDOUT: the matching subset, same format (each kept line is echoed byte-for-byte)
+
+use strict;
+use warnings;
+use utf8;
+use Encode qw(decode);
+
+my $letter = qr/\p{L}\p{M}*/;    # a letter plus any combining marks; see http://www.regular-expressions.info/unicode.html
+my $morph  = qr/$letter+/;
+
+my $m = @ARGV ? shift @ARGV : "##";    # marker used to indicate morphemes: first argument if given, "##" otherwise
+print STDERR "Using $m to filter for morphemes\n";
+my $mark = decode('UTF-8', $m);    # character form of the marker, so it can match inside decoded input lines
+# Phrase shape: optional "morph<mark>", inner "<mark>morph<mark>" items, optional "<mark>morph", then a tab. \Q..\E escapes the marker.
+# NOTE(review): inner items repeat only when directly adjacent (no space between them) -- confirm that this is intended.
+my $expr = qr/^(?:$morph\Q$mark\E)? ?(?:\Q$mark\E$morph\Q$mark\E)* ?(?:\Q$mark\E$morph)?\t/;
+while (my $line = <>) {
+    print $line if decode('UTF-8', $line) =~ $expr;    # \p{L}/\p{M} need characters, so match a decoded copy, not the raw bytes
+}