diff options
author | Patrick Simianer <patrick@lilt.com> | 2020-03-09 17:48:55 +0000 |
---|---|---|
committer | Patrick Simianer <patrick@lilt.com> | 2020-03-09 17:48:55 +0000 |
commit | 7d2fd2bf643671377e990b1c944aa3650397e3da (patch) | |
tree | 13e2b7b00d122a90988231f60b74945587ad86f6 | |
parent | 5178c4f31dd3b8eb1f1cba2b632863f8a92af029 (diff) |
de-sgm: match more stuff
-rwxr-xr-x | de-sgm | 11 |
1 files changed, 7 insertions, 4 deletions
```diff
@@ -1,10 +1,13 @@
 #!/bin/sh
-egrep -v "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \
- | egrep -v "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \
+egrep -v -i "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \
+ | egrep -v -i "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \
  | sed "s|<seg[^>]*>\s*||" \
  | sed "s|\s*</seg>\s*$||" \
- | egrep -v "^[[:space:]]*<p>[[:space:]]*$|^[[:space:]]*</p>[[:space:]]*$" \
+ | egrep -v -i "^[[:space:]]*<p>[[:space:]]*$|^[[:space:]]*</p>[[:space:]]*$" \
  | sed "s|<speaker>\s*||" \
- | sed "s|\s*</speaker>\s*$||"
+ | sed "s|\s*</speaker>\s*$||" \
+ | sed "s|\s*<hl>\s*$||" \
+ | sed "s|\s*</hl>\s*$||" \
+ | grep -v -P "^\s*$"
```