summaryrefslogtreecommitdiff
path: root/extractor/matching_comparator.cc
diff options
context:
space:
mode:
authorPaul Baltescu <pauldb89@gmail.com>2013-01-28 11:56:31 +0000
committerPaul Baltescu <pauldb89@gmail.com>2013-01-28 11:56:31 +0000
commit4ab84a0be28fdb6c0c421fe5ba5e09cfa298f2d1 (patch)
tree61a9790298659944650e16121c28dc04397b07ba /extractor/matching_comparator.cc
parentae1bd3257aafba586f874c55e7e51e8776879434 (diff)
Initial working commit.
Diffstat (limited to 'extractor/matching_comparator.cc')
-rw-r--r--extractor/matching_comparator.cc46
1 files changed, 46 insertions, 0 deletions
diff --git a/extractor/matching_comparator.cc b/extractor/matching_comparator.cc
new file mode 100644
index 00000000..03db95c0
--- /dev/null
+++ b/extractor/matching_comparator.cc
@@ -0,0 +1,46 @@
+#include "matching_comparator.h"
+
+#include "matching.h"
+#include "vocabulary.h"
+
+MatchingComparator::MatchingComparator(int min_gap_size, int max_rule_span) :
+ min_gap_size(min_gap_size), max_rule_span(max_rule_span) {}
+
+int MatchingComparator::Compare(const Matching& left,
+ const Matching& right,
+ int last_chunk_len,
+ bool offset) const {
+ if (left.sentence_id != right.sentence_id) {
+ return left.sentence_id < right.sentence_id ? -1 : 1;
+ }
+
+ if (left.positions.size() == 1 && right.positions.size() == 1) {
+ // The prefix and the suffix must be separated by a non-terminal, otherwise
+ // we would be doing a suffix array lookup.
+ if (right.positions[0] - left.positions[0] <= min_gap_size) {
+ return 1;
+ }
+ } else if (offset) {
+ for (size_t i = 1; i < left.positions.size(); ++i) {
+ if (left.positions[i] != right.positions[i - 1]) {
+ return left.positions[i] < right.positions[i - 1] ? -1 : 1;
+ }
+ }
+ } else {
+ if (left.positions[0] + 1 != right.positions[0]) {
+ return left.positions[0] + 1 < right.positions[0] ? -1 : 1;
+ }
+ for (size_t i = 1; i < left.positions.size(); ++i) {
+ if (left.positions[i] != right.positions[i]) {
+ return left.positions[i] < right.positions[i] ? -1 : 1;
+ }
+ }
+ }
+
+ if (right.positions.back() + last_chunk_len - left.positions.front() >
+ max_rule_span) {
+ return -1;
+ }
+
+ return 0;
+}