From 5530575ae0ad939e17f08d6bd49978acea388ab7 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Mon, 28 Jan 2013 11:56:31 +0000 Subject: Initial working commit. --- extractor/matching_comparator.cc | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 extractor/matching_comparator.cc (limited to 'extractor/matching_comparator.cc') diff --git a/extractor/matching_comparator.cc b/extractor/matching_comparator.cc new file mode 100644 index 00000000..03db95c0 --- /dev/null +++ b/extractor/matching_comparator.cc @@ -0,0 +1,46 @@ +#include "matching_comparator.h" + +#include "matching.h" +#include "vocabulary.h" + +MatchingComparator::MatchingComparator(int min_gap_size, int max_rule_span) : + min_gap_size(min_gap_size), max_rule_span(max_rule_span) {} + +int MatchingComparator::Compare(const Matching& left, + const Matching& right, + int last_chunk_len, + bool offset) const { + if (left.sentence_id != right.sentence_id) { + return left.sentence_id < right.sentence_id ? -1 : 1; + } + + if (left.positions.size() == 1 && right.positions.size() == 1) { + // The prefix and the suffix must be separated by a non-terminal, otherwise + // we would be doing a suffix array lookup. + if (right.positions[0] - left.positions[0] <= min_gap_size) { + return 1; + } + } else if (offset) { + for (size_t i = 1; i < left.positions.size(); ++i) { + if (left.positions[i] != right.positions[i - 1]) { + return left.positions[i] < right.positions[i - 1] ? -1 : 1; + } + } + } else { + if (left.positions[0] + 1 != right.positions[0]) { + return left.positions[0] + 1 < right.positions[0] ? -1 : 1; + } + for (size_t i = 1; i < left.positions.size(); ++i) { + if (left.positions[i] != right.positions[i]) { + return left.positions[i] < right.positions[i] ? -1 : 1; + } + } + } + + if (right.positions.back() + last_chunk_len - left.positions.front() > + max_rule_span) { + return -1; + } + + return 0; +} -- cgit v1.2.3