summary refs log tree commit diff
path: root/de-sgm
diff options
context:
space:
mode:
author Patrick Simianer <pks@pks.rocks> 2020-08-12 07:32:06 +0200
committer Patrick Simianer <pks@pks.rocks> 2020-08-12 07:32:06 +0200
commit 64e8bdba930479249b8dfbc4b5d4b659a95433f0 (patch)
tree e26969b03d8380ee8d2cbc1328f851772006133c /de-sgm
parent 74e20e00dfbffdcf117778049e47acd79e320110 (diff)
parent 4732fb3be94ba3f88b18295cf1c00e8c616eec73 (diff)
Merge branch 'master' of ssh://github.com/pks/nlp_scripts
Diffstat (limited to 'de-sgm')
-rwxr-xr-x de-sgm 14
1 files changed, 10 insertions, 4 deletions
diff --git a/de-sgm b/de-sgm
index 664c18c..fd4546e 100755
--- a/de-sgm
+++ b/de-sgm
@@ -1,7 +1,13 @@
#!/bin/sh
-
-egrep -v "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \
- | egrep -v "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \
- | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||"
+egrep -v -i "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \
+ | egrep -v -i "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \
+ | sed "s|<seg[^>]*>\s*||" \
+ | sed "s|\s*</seg>\s*$||" \
+ | egrep -v -i "^[[:space:]]*<p>[[:space:]]*$|^[[:space:]]*</p>[[:space:]]*$" \
+ | sed "s|<speaker>\s*||" \
+ | sed "s|\s*</speaker>\s*$||" \
+ | sed "s|\s*<hl>\s*$||" \
+ | sed "s|\s*</hl>\s*$||" \
+ | grep -v -P "^\s*$"