diff options
Diffstat (limited to 'utils/alignment.h')
-rw-r--r-- | utils/alignment.h | 200 |
1 files changed, 0 insertions, 200 deletions
diff --git a/utils/alignment.h b/utils/alignment.h deleted file mode 100644 index 456577ca..00000000 --- a/utils/alignment.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - * alignment.h - * - * Created on: May 23, 2013 - * Author: lijunhui - */ - -#ifndef ALIGNMENT_H_ -#define ALIGNMENT_H_ - -#include <string> -#include <assert.h> -#include <stdio.h> -#include <string.h> - -#include "stringlib.h" - -/* - * Note: - * m_vec_s_align.size() may not be equal to the length of source side - *sentence - * due to the last words may not be aligned - * - */ -struct SAlignment { - typedef std::vector<int> SingleAlign; - SAlignment(const char* pszAlign) { fnInitializeAlignment(pszAlign); } - ~SAlignment() {} - - bool fnIsAligned(int i, bool s) const { - const std::vector<SingleAlign>* palign; - if (s == true) - palign = &m_vec_s_align; - else - palign = &m_vec_t_align; - if ((*palign)[i].size() == 0) return false; - return true; - } - - /* - * return true if [b, e] is aligned phrases on source side (if s==true) or on - * the target side (if s==false); - * return false, otherwise. - */ - bool fnIsAlignedPhrase(int b, int e, bool s, int* pob, int* poe) const { - int ob, oe; //[b, e] on the other side - if (s == true) - fnGetLeftRightMost(b, e, m_vec_s_align, ob, oe); - else - fnGetLeftRightMost(b, e, m_vec_t_align, ob, oe); - - if (ob == -1) { - if (pob != NULL) (*pob) = -1; - if (poe != NULL) (*poe) = -1; - return false; // no aligned word among [b, e] - } - if (pob != NULL) (*pob) = ob; - if (poe != NULL) (*poe) = oe; - - int bb, be; //[b, e] back given [ob, oe] on the other side - if (s == true) - fnGetLeftRightMost(ob, oe, m_vec_t_align, bb, be); - else - fnGetLeftRightMost(ob, oe, m_vec_s_align, bb, be); - - if (bb < b || be > e) return false; - return true; - } - - bool fnIsAlignedTightPhrase(int b, int e, bool s, int* pob, int* poe) const { - const std::vector<SingleAlign>* palign; - if (s == true) - palign = &m_vec_s_align; - else - palign = &m_vec_t_align; - - if ((*palign).size() <= e || (*palign)[b].size() == 0 || - (*palign)[e].size() == 0) - return false; - - return fnIsAlignedPhrase(b, e, s, pob, poe); - } - - void fnGetLeftRightMost(int b, int e, bool s, int& ob, int& oe) const { - if (s == true) - fnGetLeftRightMost(b, e, m_vec_s_align, ob, oe); - else - fnGetLeftRightMost(b, e, m_vec_t_align, ob, oe); - } - - /* - * look the translation of source[b, e] is continuous or not - * 1) return "Unaligned": if the source[b, e] is translated silently; - * 2) return "Con't": if none of target words in target[.., ..] is exclusively - * aligned to any word outside source[b, e] - * 3) return "Discon't": otherwise; - */ - std::string fnIsContinuous(int b, int e) const { - int ob, oe; - fnGetLeftRightMost(b, e, true, ob, oe); - if (ob == -1) return "Unaligned"; - - for (int i = ob; i <= oe; i++) { - if (!fnIsAligned(i, false)) continue; - const SingleAlign& a = m_vec_t_align[i]; - int j; - for (j = 0; j < a.size(); j++) - if (a[j] >= b && a[j] <= e) break; - if (j == a.size()) return "Discon't"; - } - return "Con't"; - } - - const SingleAlign* fnGetSingleWordAlign(int i, bool s) const { - if (s == true) { - if (i >= m_vec_s_align.size()) return NULL; - return &(m_vec_s_align[i]); - } else { - if (i >= m_vec_t_align.size()) return NULL; - return &(m_vec_t_align[i]); - } - } - - private: - void fnGetLeftRightMost(int b, int e, const std::vector<SingleAlign>& align, - int& ob, int& oe) const { - ob = oe = -1; - for (int i = b; i <= e && i < align.size(); i++) { - if (align[i].size() > 0) { - if (align[i][0] < ob || ob == -1) ob = align[i][0]; - if (oe < align[i][align[i].size() - 1]) - oe = align[i][align[i].size() - 1]; - } - } - } - void fnInitializeAlignment(const char* pszAlign) { - m_vec_s_align.clear(); - m_vec_t_align.clear(); - - std::vector<std::string> terms = SplitOnWhitespace(std::string(pszAlign)); - int si, ti; - for (size_t i = 0; i < terms.size(); i++) { - sscanf(terms[i].c_str(), "%d-%d", &si, &ti); - - while (m_vec_s_align.size() <= si) { - SingleAlign sa; - m_vec_s_align.push_back(sa); - } - while (m_vec_t_align.size() <= ti) { - SingleAlign sa; - m_vec_t_align.push_back(sa); - } - - m_vec_s_align[si].push_back(ti); - m_vec_t_align[ti].push_back(si); - } - - // sort - for (size_t i = 0; i < m_vec_s_align.size(); i++) { - std::sort(m_vec_s_align[i].begin(), m_vec_s_align[i].end()); - } - for (size_t i = 0; i < m_vec_t_align.size(); i++) { - std::sort(m_vec_t_align[i].begin(), m_vec_t_align[i].end()); - } - } - - private: - std::vector<SingleAlign> m_vec_s_align; // source side words' alignment - std::vector<SingleAlign> m_vec_t_align; // target side words' alignment -}; - -struct SAlignmentReader { - SAlignmentReader(const char* pszFname) { - m_fpIn = fopen(pszFname, "r"); - assert(m_fpIn != NULL); - } - ~SAlignmentReader() { - if (m_fpIn != NULL) fclose(m_fpIn); - } - SAlignment* fnReadNextAlignment() { - if (feof(m_fpIn) == true) return NULL; - char* pszLine = new char[100001]; - pszLine[0] = '\0'; - fgets(pszLine, 10001, m_fpIn); - int iLen = strlen(pszLine); - if (iLen == 0) return NULL; - while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { - pszLine[iLen - 1] = '\0'; - iLen--; - } - SAlignment* pAlign = new SAlignment(pszLine); - delete[] pszLine; - return pAlign; - } - - private: - FILE* m_fpIn; -}; - -#endif /* ALIGNMENT_H_ */ |