cdec cleanup, remove bayesian stuff, parsing stuff

author: Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
committer: Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
commit: e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree: d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/morf-segmentation/filter_docs.pl
parent: 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
1 files changed, 0 insertions, 24 deletions
diff --git a/gi/morf-segmentation/filter_docs.pl b/gi/morf-segmentation/filter_docs.pl
deleted file mode 100755
index a78575da..00000000
--- a/gi/morf-segmentation/filter_docs.pl
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/perl
-
-#Filters the phrase&cluster document set to retain only documents that correspond to words or morphs, i.e. not crossing word boundaries.
-
-#Usage: filter_docs.pl [mark]
-#  STDIN: data in the doc.txt format (i.e. phrase\t blahblah ), most likely from cdec extractor
-#  STDOUT: the matching subset, same format
-
-use utf8;
-my $letter=qr/\p{L}\p{M}*/; # see http://www.regular-expressions.info/unicode.html
-
-my $morph=qr/$letter+/;
-
-my $m = "##"; # marker used to indicate morphemes
-if ((scalar @ARGV) >= 1) {
-   $m = $ARGV[0];
-   shift;
-}
-print STDERR "Using $m to filter for morphemes\n";
-
-my $expr = qr/^($morph\Q$m\E)? ?(\Q$m\E$morph\Q$m\E)* ?(\Q$m\E$morph)?\t/; #\Q and \E bounded sections are escaped
-while(<>) {
-   /$expr/ && print;
-}
author	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
committer	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
commit	e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree	d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/morf-segmentation/filter_docs.pl
parent	0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)