diff options
Diffstat (limited to 'training')
19 files changed, 1057 insertions, 22 deletions
| diff --git a/training/Makefile.am b/training/Makefile.am index 8ef3c939..2812a9be 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -8,5 +8,5 @@ SUBDIRS = \    dtrain \    latent_svm \    mira \ -  rampion - +  rampion \ +  const_reorder diff --git a/training/const_reorder/Makefile.am b/training/const_reorder/Makefile.am new file mode 100644 index 00000000..2c681398 --- /dev/null +++ b/training/const_reorder/Makefile.am @@ -0,0 +1,8 @@ +bin_PROGRAMS = const_reorder_model_trainer argument_reorder_model_trainer + +AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder + +const_reorder_model_trainer_SOURCES = constituent_reorder_model.cc trainer.h trainer.cc +const_reorder_model_trainer_LDADD = ../../utils/libutils.a +argument_reorder_model_trainer_SOURCES = argument_reorder_model.cc trainer.h trainer.cc +argument_reorder_model_trainer_LDADD = ../../utils/libutils.a diff --git a/training/const_reorder/argument_reorder_model.cc b/training/const_reorder/argument_reorder_model.cc new file mode 100644 index 00000000..87f2ce2f --- /dev/null +++ b/training/const_reorder/argument_reorder_model.cc @@ -0,0 +1,307 @@ +/* + * argument_reorder_model.cc + * + *  Created on: Dec 15, 2013 + *      Author: lijunhui + */ + +#include <boost/program_options.hpp> +#include <iostream> +#include <fstream> +#include <sstream> +#include <string> +#include <vector> + +#include "utils/filelib.h" + +#include "trainer.h" + +using namespace std; +using namespace const_reorder; + +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, +                                    const char* pszNewFName) { +  Map hashPredicate; +  { +    ReadFile in(pszFName); +    string line; +    while (getline(*in.stream(), line)) { +      if (!line.size()) continue; +      vector<string> terms; +      SplitOnWhitespace(line, &terms); +      for (const auto& i : terms) { +        ++hashPredicate[i]; +      } +    } +  } + +  { +    ReadFile in(pszFName); +    
WriteFile out(pszNewFName); +    string line; +    while (getline(*in.stream(), line)) { +      if (!line.size()) continue; +      vector<string> terms; +      SplitOnWhitespace(line, &terms); +      bool written = false; +      for (const auto& i : terms) { +        if (hashPredicate[i] >= iCutoff) { +          (*out.stream()) << i << " "; +          written = true; +        } +      } +      if (written) { +        (*out.stream()) << "\n"; +      } +    } +  } +} + +struct SArgumentReorderTrainer { +  SArgumentReorderTrainer( +      const char* pszSRLFname,           // source-side srl tree file name +      const char* pszAlignFname,         // alignment filename +      const char* pszSourceFname,        // source file name +      const char* pszTargetFname,        // target file name +      const char* pszTopPredicateFname,  // target file name +      const char* pszInstanceFname,      // training instance file name +      const char* pszModelFname,         // classifier model file name +      int iCutoff) { +    fnGenerateInstanceFiles(pszSRLFname, pszAlignFname, pszSourceFname, +                            pszTargetFname, pszTopPredicateFname, +                            pszInstanceFname); + +    string strInstanceFname, strModelFname; +    strInstanceFname = string(pszInstanceFname) + string(".left"); +    strModelFname = string(pszModelFname) + string(".left"); +    fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); +    strInstanceFname = string(pszInstanceFname) + string(".right"); +    strModelFname = string(pszModelFname) + string(".right"); +    fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); +  } + +  ~SArgumentReorderTrainer() {} + + private: +  void fnTraining(const char* pszInstanceFname, const char* pszModelFname, +                  int iCutoff) { +    char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +    if (iCutoff > 0) { +      sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); 
+      fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +    } else { +      strcpy(pszNewInstanceFName, pszInstanceFname); +    } + +    Tsuruoka_Maxent_Trainer* pMaxent = new Tsuruoka_Maxent_Trainer; +    pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname); +    delete pMaxent; + +    if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +      sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +      system(pszNewInstanceFName); +    } +    delete[] pszNewInstanceFName; +  } + +  void fnGenerateInstanceFiles( +      const char* pszSRLFname,     // source-side flattened parse tree file name +      const char* pszAlignFname,   // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszTopPredicateFname,  // top predicate file name (we only +                                         // consider predicates with 100+ +                                         // occurrences +      const char* pszInstanceFname       // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SSrlSentenceReader* pSRLReader = new SSrlSentenceReader(pszSRLFname); +    ReadFile source_file(pszSourceFname); +    ReadFile target_file(pszTargetFname); + +    Map* pMapPredicate; +    if (pszTopPredicateFname != NULL) +      pMapPredicate = fnLoadTopPredicates(pszTopPredicateFname); +    else +      pMapPredicate = NULL; + +    string line; + +    WriteFile left_file(pszInstanceFname + string(".left")); +    WriteFile right_file(pszInstanceFname + string(".right")); + +    // read sentence by sentence +    SAlignment* pAlign; +    SSrlSentence* pSRL; +    SParsedTree* pTree; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pSRL = pSRLReader->fnReadNextSrlSentence(); +      assert(pSRL != NULL); +      pTree = pSRL->m_pTree; +      
assert(getline(*source_file.stream(), line)); +      vector<string> vecSTerms; +      SplitOnWhitespace(line, &vecSTerms); +      assert(getline(*target_file.stream(), line)); +      vector<string> vecTTerms; +      SplitOnWhitespace(line, &vecTTerms); +      // vecTPOSTerms.size() == 0, given the case when an english sentence fails +      // parsing + +      if (pTree != NULL) { +        for (size_t i = 0; i < pSRL->m_vecPred.size(); i++) { +          SPredicate* pPred = pSRL->m_vecPred[i]; +          if (strcmp(pTree->m_vecTerminals[pPred->m_iPosition] +                         ->m_ptParent->m_pszTerm, +                     "VA") == 0) +            continue; +          string strPred = +              string(pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm); +          if (pMapPredicate != NULL) { +            Map::iterator iter_map = pMapPredicate->find(strPred); +            if (pMapPredicate != NULL && iter_map == pMapPredicate->end()) +              continue; +          } + +          SPredicateItem* pPredItem = new SPredicateItem(pTree, pPred); + +          vector<string> vecStrBlock; +          for (size_t j = 0; j < pPredItem->vec_items_.size(); j++) { +            SSRLItem* pItem1 = pPredItem->vec_items_[j]; +            vecStrBlock.push_back(SArgumentReorderModel::fnGetBlockOutcome( +                pItem1->tree_item_->m_iBegin, pItem1->tree_item_->m_iEnd, +                pAlign)); +          } + +          vector<string> vecStrLeftReorderType; +          vector<string> vecStrRightReorderType; +          SArgumentReorderModel::fnGetReorderType( +              pPredItem, pAlign, vecStrLeftReorderType, vecStrRightReorderType); +          for (int j = 1; j < pPredItem->vec_items_.size(); j++) { +            string strLeftOutcome, strRightOutcome; +            strLeftOutcome = vecStrLeftReorderType[j - 1]; +            strRightOutcome = vecStrRightReorderType[j - 1]; +            ostringstream ostr; +            
SArgumentReorderModel::fnGenerateFeature(pTree, pPred, pPredItem, j, +                                                     vecStrBlock[j - 1], +                                                     vecStrBlock[j], ostr); + +            // fprintf(stderr, "%s %s\n", ostr.str().c_str(), +            // strOutcome.c_str()); +            // fprintf(fpOut, "sentid=%d %s %s\n", iSentNum, ostr.str().c_str(), +            // strOutcome.c_str()); +            (*left_file.stream()) << ostr.str() << " " << strLeftOutcome +                                  << "\n"; +            (*right_file.stream()) << ostr.str() << " " << strRightOutcome +                                   << "\n"; +          } +        } +      } +      delete pSRL; + +      delete pAlign; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    delete pAlignReader; +    delete pSRLReader; +  } + +  Map* fnLoadTopPredicates(const char* pszTopPredicateFname) { +    if (pszTopPredicateFname == NULL) return NULL; + +    Map* pMapPredicate = new Map(); +    // STxtFileReader* pReader = new STxtFileReader(pszTopPredicateFname); +    ReadFile in(pszTopPredicateFname); +    // char* pszLine = new char[50001]; +    string line; +    int iNumCount = 0; +    while (getline(*in.stream(), line)) { +      if (line.size() && line[0] == '#') continue; +      auto p = line.find(' '); +      assert(p != string::npos); +      int iCount = atoi(line.substr(p + 1).c_str()); +      if (iCount < 100) break; +      (*pMapPredicate)[line] = iNumCount++; +    } +    return pMapPredicate; +  } +}; + +namespace po = boost::program_options; + +inline void print_options(std::ostream& out, +                          po::options_description const& opts) { +  typedef std::vector<boost::shared_ptr<po::option_description> > Ds; +  Ds const& ds = opts.options(); +  out << '"'; +  for (unsigned i = 0; i < ds.size(); ++i) { +    if (i) out << ' '; +    out << "--" << ds[i]->long_name(); +  } +  
out << '\n'; +} +inline string str(char const* name, po::variables_map const& conf) { +  return conf[name].as<string>(); +} + +//--srl_file /scratch0/mt_exp/gale-align/gale-align.nw.srl.cn --align_file +/// scratch0/mt_exp/gale-align/gale-align.nw.al --source_file +/// scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file +/// scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file +/// scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix +/// scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2 +//--srl_file /scratch0/mt_exp/gale-ctb/gale-ctb.srl.cn --align_file +/// scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file +/// scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file +/// scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file +/// scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix +/// scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2 +int main(int argc, char** argv) { + +  po::options_description opts("Configuration options"); +  opts.add_options()("srl_file", po::value<string>(), "srl file path (input)")( +      "align_file", po::value<string>(), "Alignment file path (input)")( +      "source_file", po::value<string>(), "Source text file path (input)")( +      "target_file", po::value<string>(), "Target text file path (input)")( +      "instance_file", po::value<string>(), "Instance file path (output)")( +      "model_prefix", po::value<string>(), +      "Model file path prefix (output): three files will be generated")( +      "feature_cutoff", po::value<int>()->default_value(100), +      "Feature cutoff threshold")("help", "produce help message"); + +  po::variables_map vm; +  if (argc) { +    po::store(po::parse_command_line(argc, argv, opts), vm); +    po::notify(vm); +  } + +  if (vm.count("help")) { +    print_options(cout, opts); +    return 1; +  } + +  if (!vm.count("srl_file") || !vm.count("align_file") || +      !vm.count("source_file") || 
!vm.count("target_file") || +      !vm.count("instance_file") || !vm.count("model_prefix")) { +    print_options(cout, opts); +    if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; +    if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; +    if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; +    if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; +    if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; +    if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; +    exit(0); +  } + +  SArgumentReorderTrainer* pTrainer = new SArgumentReorderTrainer( +      str("srl_file", vm).c_str(), str("align_file", vm).c_str(), +      str("source_file", vm).c_str(), str("target_file", vm).c_str(), NULL, +      str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), +      vm["feature_cutoff"].as<int>()); +  delete pTrainer; + +  return 1; +} diff --git a/training/const_reorder/constituent_reorder_model.cc b/training/const_reorder/constituent_reorder_model.cc new file mode 100644 index 00000000..d3ad0f2b --- /dev/null +++ b/training/const_reorder/constituent_reorder_model.cc @@ -0,0 +1,636 @@ +/* + * constituent_reorder_model.cc + * + *  Created on: Jul 10, 2013 + *      Author: junhuili + */ + +#include <string> +#include <unordered_map> + +#include <boost/program_options.hpp> + +#include "utils/filelib.h" + +#include "trainer.h" + +using namespace std; +using namespace const_reorder; + +typedef std::unordered_map<std::string, int> Map; +typedef std::unordered_map<std::string, int>::iterator Iterator; + +namespace po = boost::program_options; + +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, +                                    const char* pszNewFName) { +  Map hashPredicate; +  { +    ReadFile f(pszFName); +    string line; +    while (getline(*f.stream(), line)) { +      if (!line.size()) continue; +      vector<string> terms; +      
SplitOnWhitespace(line, &terms); +      for (const auto& i : terms) { +        ++hashPredicate[i]; +      } +    } +  } + +  { +    ReadFile in(pszFName); +    WriteFile out(pszNewFName); +    string line; +    while (getline(*in.stream(), line)) { +      if (!line.size()) continue; +      vector<string> terms; +      SplitOnWhitespace(line, &terms); +      bool written = false; +      for (const auto& i : terms) { +        if (hashPredicate[i] >= iCutoff) { +          (*out.stream()) << i << " "; +          written = true; +        } +      } +      if (written) { +        (*out.stream()) << "\n"; +      } +    } +  } +} + +struct SConstReorderTrainer { +  SConstReorderTrainer( +      const char* pszSynFname,     // source-side flattened parse tree file name +      const char* pszAlignFname,   // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname,  // training instance file name +      const char* pszModelPrefix,    // classifier model file name prefix +      int iCutoff,                   // feature count threshold +      const char* /*pszOption*/  // other classifier parameters (for svmlight) +      ) { +    fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, +                           pszTargetFname, pszInstanceFname); + +    string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); +    string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + +    string strModelLeftFname = string(pszModelPrefix) + string(".left"); +    string strModelRightFname = string(pszModelPrefix) + string(".right"); + +    fprintf(stdout, "...Training the left ordering model\n"); +    fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), +               iCutoff); +    fprintf(stdout, "...Training the right ordering model\n"); +    fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), +       
        iCutoff); +  } +  ~SConstReorderTrainer() {} + + private: +  void fnTraining(const char* pszInstanceFname, const char* pszModelFname, +                  int iCutoff) { +    char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +    if (iCutoff > 0) { +      sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +      fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +    } else { +      strcpy(pszNewInstanceFName, pszInstanceFname); +    } + +    /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); +pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); +delete pZhangleMaxent;*/ + +    Tsuruoka_Maxent_Trainer* pMaxent = new Tsuruoka_Maxent_Trainer; +    pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname); +    delete pMaxent; + +    if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +      sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +      system(pszNewInstanceFName); +    } +    delete[] pszNewInstanceFName; +  } + +  inline bool fnIsVerbPOS(const char* pszTerm) { +    if (strcmp(pszTerm, "VV") == 0 || strcmp(pszTerm, "VA") == 0 || +        strcmp(pszTerm, "VC") == 0 || strcmp(pszTerm, "VE") == 0) +      return true; +    return false; +  } + +  inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, +                           const SAlignment* /*pAlign*/, string& strOutcome) { +    if (iL1 == -1 && iL2 == -1) +      strOutcome = "BU";  // 1. both are untranslated +    else if (iL1 == -1) +      strOutcome = "1U";  // 2. XP1 is untranslated +    else if (iL2 == -1) +      strOutcome = "2U";  // 3. XP2 is untranslated +    else if (iL1 == iL2 && iR1 == iR2) +      strOutcome = "SS";  // 4. Have same scope +    else if (iL1 <= iL2 && iR1 >= iR2) +      strOutcome = "1C2";  // 5. XP1's translation covers XP2's +    else if (iL1 >= iL2 && iR1 <= iR2) +      strOutcome = "2C1";  // 6. 
/**
 * Converts a list of positions into ranks.
 *
 * Every entry of vecLeft is first mapped to a sort key: the value itself,
 * or, for an entry equal to -1, the raw value of its left neighbour plus
 * 0.1 (just -1 when it is the first entry). vecPosition[i] is then the
 * number of other entries whose key is smaller than entry i's key, with
 * equal keys ordered by index.
 *
 * @param vecLeft      input positions; -1 marks an entry without a position
 * @param vecPosition  output; cleared, then filled with one rank per entry
 */
inline void fnGetRelativePosition(const std::vector<int>& vecLeft,
                                  std::vector<int>& vecPosition) {
  vecPosition.clear();

  // Step 1: one sort key per entry.
  std::vector<float> keys;
  for (size_t k = 0; k < vecLeft.size(); ++k) {
    float key;
    if (vecLeft[k] != -1) {
      key = vecLeft[k];
    } else if (k == 0) {
      key = -1;
    } else {
      key = vecLeft[k - 1] + 0.1;
    }
    keys.push_back(key);
  }

  // Step 2: the rank of an entry is the number of entries sorting before it.
  for (size_t cur = 0; cur < vecLeft.size(); ++cur) {
    int rank = 0;
    for (size_t other = 0; other < vecLeft.size(); ++other) {
      if (other == cur) continue;
      const bool bSmaller = keys[other] < keys[cur];
      const bool bTieEarlier = keys[other] == keys[cur] && other < cur;
      if (bSmaller || bTieEarlier) ++rank;
    }
    vecPosition.push_back(rank);
  }
}
vec_other_left_sibling.push_back(string("NULL")); + +    // generate features +    // f1 +    ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; +    // f2 +    for (int i = 0; i < vec_other_right_sibling.size(); i++) +      ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label +           << "_" << vec_other_right_sibling[i]; +    // f3 +    for (int i = 0; i < vec_other_left_sibling.size(); i++) +      ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label +           << "_" << vec_other_left_sibling[i]; +    // f4 +    ostr << " f4=" << left_label << "_" << right_label << "_" +         << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; +    // f5 +    ostr << " f5=" << left_label << "_" << right_label << "_" +         << vecSTerms[pCon1->m_iHeadWord]; +    // f6 +    ostr << " f6=" << left_label << "_" << right_label << "_" +         << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; +    // f7 +    ostr << " f7=" << left_label << "_" << right_label << "_" +         << vecSTerms[pCon2->m_iHeadWord]; +    // f8 +    ostr << " f8=" << left_label << "_" << right_label << "_" +         << vecChunkStatus[iPos - 1]; +    // f9 +    ostr << " f9=" << left_label << "_" << right_label << "_" +         << vecChunkStatus[iPos]; +    // f10 +    ostr << " f10=" << left_label << "_" << parent_label; +    // f11 +    ostr << " f11=" << right_label << "_" << parent_label; +  } + +  /* +   * Source side (11 features): +   * f1: the categories of XP1 and XP2 (f1_1, f1_2) +   * f2: the head words of XP1 and XP2 (f2_1, f2_2) +   * f3: the first and last word of XP1 (f3_f, f3_l) +   * f4: the first and last word of XP2 (f4_f, f4_l) +   * f5: is XP1 or XP2 the head node (f5_1, f5_2) +   * f6: the category of the common parent +   * Target side (6 features): +   * f7: the first and the last word of XP1's translation (f7_f, f7_l) +   * f8: the first and the last word of XP2's translation 
(f8_f, f8_l) +   * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) +   */ +  void fnGenerateInstance(const SParsedTree* /*pTree*/, const STreeItem* pParent, +                          const STreeItem* pCon1, const STreeItem* pCon2, +                          const SAlignment* pAlign, +                          const vector<string>& vecSTerms, +                          const vector<string>& /*vecTTerms*/, string& strOutcome, +                          ostringstream& ostr) { + +    int iLeft1, iRight1, iLeft2, iRight2; +    pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, +                               iRight1); +    pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, +                               iRight2); + +    fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); + +    // generate features +    // f1 +    ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; +    // f2 +    ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" +         << vecSTerms[pCon2->m_iHeadWord]; +    // f3 +    ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] +         << " f3_l=" << vecSTerms[pCon1->m_iEnd]; +    // f4 +    ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] +         << " f4_l=" << vecSTerms[pCon2->m_iEnd]; +    // f5 +    if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) +      ostr << " f5_1=1"; +    else +      ostr << " f5_1=0"; +    if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) +      ostr << " f5_2=1"; +    else +      ostr << " f5_2=0"; +    // f6 +    ostr << " f6=" << pParent->m_pszTerm; + +    /*//f7 +    if (iLeft1 != -1) { +            ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << +    vecTTerms[iRight1]; +    } +    if (iLeft2 != -1) { +            ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << +    vecTTerms[iRight2]; +    } + +    const vector<int>* pvecTarget = +    pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); +    string str = 
""; +    for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { +            str += vecTTerms[(*pvecTarget)[i]] + "_"; +    } +    if (str.length() > 0) { +            ostr << " f9_1=" << str.substr(0, str.size()-1); +    } +    pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); +    str = ""; +    for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { +            str += vecTTerms[(*pvecTarget)[i]] + "_"; +    } +    if (str.length() > 0) { +            ostr << " f9_2=" << str.substr(0, str.size()-1); +    } */ +  } + +  void fnGetFocusedParentNodes(const SParsedTree* pTree, +                               vector<STreeItem*>& vecFocused) { +    for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { +      STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; + +      while (pParent != NULL) { +        // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - +        // pParent->m_iBegin > 5) { +        if (pParent->m_vecChildren.size() > 1) { +          // do constituent reordering for all children of pParent +          vecFocused.push_back(pParent); +        } +        if (pParent->m_iBrotherIndex != 0) break; +        pParent = pParent->m_ptParent; +      } +    } +  } + +  void fnGenerateInstanceFile( +      const char* pszSynFname,     // source-side flattened parse tree file name +      const char* pszAlignFname,   // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SParseReader* pParseReader = new SParseReader(pszSynFname, false); + +    ReadFile source_file(pszSourceFname); +    ReadFile target_file(pszTargetFname); +    string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); +    string strInstanceRightFname = string(pszInstanceFname) + 
string(".right"); +    WriteFile left_file(strInstanceLeftFname); +    WriteFile right_file(strInstanceRightFname); + +    // read sentence by sentence +    SAlignment* pAlign; +    SParsedTree* pTree; +    string line; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pTree = pParseReader->fnReadNextParseTree(); + +      assert(getline(*source_file.stream(), line)); +      vector<string> vecSTerms; +      SplitOnWhitespace(line, &vecSTerms); + +      assert(getline(*target_file.stream(), line)); +      vector<string> vecTTerms; +      SplitOnWhitespace(line, &vecTTerms); + +      if (pTree != NULL) { + +        vector<STreeItem*> vecFocused; +        fnGetFocusedParentNodes(pTree, vecFocused); + +        for (size_t i = 0; i < vecFocused.size(); i++) { + +          STreeItem* pParent = vecFocused[i]; + +          vector<int> vecLeft, vecRight; +          for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +            STreeItem* pCon1 = pParent->m_vecChildren[j]; +            int iLeft1, iRight1; +            pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, +                                       iLeft1, iRight1); +            vecLeft.push_back(iLeft1); +            vecRight.push_back(iRight1); +          } +          vector<int> vecLeftPosition; +          fnGetRelativePosition(vecLeft, vecLeftPosition); +          vector<int> vecRightPosition; +          fnGetRelativePosition(vecRight, vecRightPosition); + +          vector<string> vecChunkStatus; +          for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +            string strOutcome = +                pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, +                                       pParent->m_vecChildren[j]->m_iEnd); +            vecChunkStatus.push_back(strOutcome); +          } + +          for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { +            // children[j-1] vs. 
children[j] reordering + +            string strLeftOutcome; +            ostringstream ostr; + +            fnGenerateInstance(pTree, pParent, j, vecChunkStatus, +                               vecLeftPosition, vecSTerms, vecTTerms, +                               strLeftOutcome, ostr); + +            string ostr_str = ostr.str(); + +            // fprintf(stderr, "%s %s\n", ostr.str().c_str(), +            // strLeftOutcome.c_str()); +            (*left_file.stream()) << ostr_str << " " << strLeftOutcome << "\n"; + +            string strRightOutcome; +            fnGetOutcome(vecRightPosition[j - 1], vecRightPosition[j], +                         strRightOutcome); +            (*right_file.stream()) << ostr_str +                                   << " LeftOrder=" << strLeftOutcome << " " +                                   << strRightOutcome << "\n"; +          } +        } +        delete pTree; +      } + +      delete pAlign; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    delete pAlignReader; +    delete pParseReader; +  } + +  void fnGenerateInstanceFile2( +      const char* pszSynFname,     // source-side flattened parse tree file name +      const char* pszAlignFname,   // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SParseReader* pParseReader = new SParseReader(pszSynFname, false); + +    ReadFile source_file(pszSourceFname); +    ReadFile target_file(pszTargetFname); + +    WriteFile output_file(pszInstanceFname); + +    // read sentence by sentence +    SAlignment* pAlign; +    SParsedTree* pTree; +    string line; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pTree = 
pParseReader->fnReadNextParseTree(); +      assert(getline(*source_file.stream(), line)); +      vector<string> vecSTerms; +      SplitOnWhitespace(line, &vecSTerms); + +      assert(getline(*target_file.stream(), line)); +      vector<string> vecTTerms; +      SplitOnWhitespace(line, &vecTTerms); + +      if (pTree != NULL) { + +        vector<STreeItem*> vecFocused; +        fnGetFocusedParentNodes(pTree, vecFocused); + +        for (size_t i = 0; +             i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + +          STreeItem* pParent = vecFocused[i]; + +          for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { +            // children[j-1] vs. children[j] reordering + +            string strOutcome; +            ostringstream ostr; + +            fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j - 1], +                               pParent->m_vecChildren[j], pAlign, vecSTerms, +                               vecTTerms, strOutcome, ostr); + +            // fprintf(stderr, "%s %s\n", ostr.str().c_str(), +            // strOutcome.c_str()); +            (*output_file.stream()) << ostr.str() << " " << strOutcome << "\n"; +          } +        } +        delete pTree; +      } + +      delete pAlign; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    delete pAlignReader; +    delete pParseReader; +  } +}; + +inline void print_options(std::ostream& out, +                          po::options_description const& opts) { +  typedef std::vector<boost::shared_ptr<po::option_description> > Ds; +  Ds const& ds = opts.options(); +  out << '"'; +  for (unsigned i = 0; i < ds.size(); ++i) { +    if (i) out << ' '; +    out << "--" << ds[i]->long_name(); +  } +  out << '\n'; +} +inline string str(char const* name, po::variables_map const& conf) { +  return conf[name].as<string>(); +} + +//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file +/// 
scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file +/// scratch0/mt_exp/gq-ctb/data/train.cn --target_file +/// scratch0/mt_exp/gq-ctb/data/train.en --instance_file +/// scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix +/// scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 +int main(int argc, char** argv) { + +  po::options_description opts("Configuration options"); +  opts.add_options()("parse_file", po::value<string>(), +                     "parse file path (input)")( +      "align_file", po::value<string>(), "Alignment file path (input)")( +      "source_file", po::value<string>(), "Source text file path (input)")( +      "target_file", po::value<string>(), "Target text file path (input)")( +      "instance_file", po::value<string>(), "Instance file path (output)")( +      "model_prefix", po::value<string>(), +      "Model file path prefix (output): three files will be generated")( +      "feature_cutoff", po::value<int>()->default_value(100), +      "Feature cutoff threshold")("svm_option", po::value<string>(), +                                  "Parameters for SVMLight classifier")( +      "help", "produce help message"); + +  po::variables_map vm; +  if (argc) { +    po::store(po::parse_command_line(argc, argv, opts), vm); +    po::notify(vm); +  } + +  if (vm.count("help")) { +    print_options(cout, opts); +    return 1; +  } + +  if (!vm.count("parse_file") || !vm.count("align_file") || +      !vm.count("source_file") || !vm.count("target_file") || +      !vm.count("instance_file") || !vm.count("model_prefix")) { +    print_options(cout, opts); +    if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; +    if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; +    if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; +    if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; +    if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; +    if 
(!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; +    exit(0); +  } + +  const char* pOption; +  if (vm.count("svm_option")) +    pOption = str("svm_option", vm).c_str(); +  else +    pOption = NULL; + +  SConstReorderTrainer* pTrainer = new SConstReorderTrainer( +      str("parse_file", vm).c_str(), str("align_file", vm).c_str(), +      str("source_file", vm).c_str(), str("target_file", vm).c_str(), +      str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), +      vm["feature_cutoff"].as<int>(), pOption); +  delete pTrainer; + +  return 0; +} diff --git a/training/const_reorder/trainer.cc b/training/const_reorder/trainer.cc new file mode 100644 index 00000000..1d388eec --- /dev/null +++ b/training/const_reorder/trainer.cc @@ -0,0 +1,69 @@ +#include "trainer.h" + +#include "utils/maxent.h" + +Tsuruoka_Maxent_Trainer::Tsuruoka_Maxent_Trainer() +    : const_reorder::Tsuruoka_Maxent(NULL) {} + +void Tsuruoka_Maxent_Trainer::fnTrain(const char* pszInstanceFName, +                                      const char* pszAlgorithm, +                                      const char* pszModelFName) { +  assert(strcmp(pszAlgorithm, "l1") == 0 || strcmp(pszAlgorithm, "l2") == 0 || +         strcmp(pszAlgorithm, "sgd") == 0 || strcmp(pszAlgorithm, "SGD") == 0); +  FILE* fpIn = fopen(pszInstanceFName, "r"); + +  maxent::ME_Model* pModel = new maxent::ME_Model(); + +  char* pszLine = new char[100001]; +  int iNumInstances = 0; +  int iLen; +  while (!feof(fpIn)) { +    pszLine[0] = '\0'; +    fgets(pszLine, 20000, fpIn); +    if (strlen(pszLine) == 0) { +      continue; +    } + +    iLen = strlen(pszLine); +    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { +      pszLine[iLen - 1] = '\0'; +      iLen--; +    } + +    iNumInstances++; + +    maxent::ME_Sample* pmes = new maxent::ME_Sample(); + +    char* p = strrchr(pszLine, ' '); +    assert(p != NULL); +    p[0] = '\0'; +    p++; +    std::vector<std::string> vecContext; +   
 SplitOnWhitespace(std::string(pszLine), &vecContext); + +    pmes->label = std::string(p); +    for (size_t i = 0; i < vecContext.size(); i++) +      pmes->add_feature(vecContext[i]); +    pModel->add_training_sample((*pmes)); +    if (iNumInstances % 100000 == 0) +      fprintf(stdout, "......Reading #Instances: %1d\n", iNumInstances); +    delete pmes; +  } +  fprintf(stdout, "......Reading #Instances: %1d\n", iNumInstances); +  fclose(fpIn); + +  if (strcmp(pszAlgorithm, "l1") == 0) +    pModel->use_l1_regularizer(1.0); +  else if (strcmp(pszAlgorithm, "l2") == 0) +    pModel->use_l2_regularizer(1.0); +  else +    pModel->use_SGD(); + +  pModel->train(); +  pModel->save_to_file(pszModelFName); + +  delete pModel; +  fprintf(stdout, "......Finished Training\n"); +  fprintf(stdout, "......Model saved as %s\n", pszModelFName); +  delete[] pszLine; +} diff --git a/training/const_reorder/trainer.h b/training/const_reorder/trainer.h new file mode 100644 index 00000000..e574a536 --- /dev/null +++ b/training/const_reorder/trainer.h @@ -0,0 +1,12 @@ +#ifndef TRAINING_CONST_REORDER_TRAINER_H_ +#define TRAINING_CONST_REORDER_TRAINER_H_ + +#include "decoder/ff_const_reorder_common.h" + +struct Tsuruoka_Maxent_Trainer : const_reorder::Tsuruoka_Maxent { +  Tsuruoka_Maxent_Trainer(); +  void fnTrain(const char* pszInstanceFName, const char* pszAlgorithm, +               const char* pszModelFName); +}; + +#endif  // TRAINING_CONST_REORDER_TRAINER_H_ diff --git a/training/dpmert/lo_test.cc b/training/dpmert/lo_test.cc index b8776169..69e5aa3f 100644 --- a/training/dpmert/lo_test.cc +++ b/training/dpmert/lo_test.cc @@ -56,10 +56,11 @@ BOOST_AUTO_TEST_CASE(TestConvexHull) {  }  BOOST_AUTO_TEST_CASE(TestConvexHullInside) { -  const string json = "{\"rules\":[1,\"[X] ||| a ||| a\",2,\"[X] ||| A [X] ||| A [1]\",3,\"[X] ||| c ||| c\",4,\"[X] ||| C [X] ||| C [1]\",5,\"[X] ||| [X] B [X] ||| [1] B [2]\",6,\"[X] ||| [X] b [X] ||| [1] b [2]\",7,\"[X] ||| X [X] ||| X [1]\",8,\"[X] ||| Z 
[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; +  std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA);    Hypergraph hg; -  istringstream instr(json); -  HypergraphIO::ReadFromJSON(&instr, &hg); +  ReadFile rf(path + "/test-ch-inside.bin.gz"); +  assert(rf); +  HypergraphIO::ReadFromBinary(rf.stream(), &hg);    SparseVector<double> wts;    wts.set_value(FD::Convert("f1"), 0.4);    wts.set_value(FD::Convert("f2"), 1.0); @@ -121,13 +122,13 @@ BOOST_AUTO_TEST_CASE( TestS1) {    std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? 
boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA);    Hypergraph hg; -  ReadFile rf(path + "/0.json.gz"); -  HypergraphIO::ReadFromJSON(rf.stream(), &hg); +  ReadFile rf(path + "/0.bin.gz"); +  HypergraphIO::ReadFromBinary(rf.stream(), &hg);    hg.Reweight(wts);    Hypergraph hg2; -  ReadFile rf2(path + "/1.json.gz"); -  HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); +  ReadFile rf2(path + "/1.bin.gz"); +  HypergraphIO::ReadFromBinary(rf2.stream(), &hg2);    hg2.Reweight(wts);    vector<vector<WordID> > refs1(4); @@ -193,10 +194,11 @@ BOOST_AUTO_TEST_CASE( TestS1) {  }  BOOST_AUTO_TEST_CASE(TestZeroOrigin) { -  const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| 
[1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; +  std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? 
boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA); +  ReadFile rf(path + "/test-zero-origin.bin.gz"); +  assert(rf);    Hypergraph hg; -  istringstream instr(json); -  HypergraphIO::ReadFromJSON(&instr, &hg); +  HypergraphIO::ReadFromBinary(rf.stream(), &hg);    SparseVector<double> wts;    wts.set_value(FD::Convert("PassThrough"), -0.929201533002898);    hg.Reweight(wts); diff --git a/training/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc index 199cd23a..3fa2f476 100644 --- a/training/dpmert/mr_dpmert_generate_mapper_input.cc +++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc @@ -70,7 +70,7 @@ int main(int argc, char** argv) {    unsigned dev_set_size = conf["dev_set_size"].as<unsigned>();    for (unsigned i = 0; i < dev_set_size; ++i) {      for (unsigned j = 0; j < directions.size(); ++j) { -      cout << forest_repository << '/' << i << ".json.gz " << i << ' '; +      cout << forest_repository << '/' << i << ".bin.gz " << i << ' ';        print(cout, origin, "=", ";");        cout << ' ';        print(cout, directions[j], "=", ";"); diff --git a/training/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc index d1efcf96..2bf3f8fc 100644 --- a/training/dpmert/mr_dpmert_map.cc +++ b/training/dpmert/mr_dpmert_map.cc @@ -83,7 +83,7 @@ int main(int argc, char** argv) {      istringstream is(line);      int sent_id;      string file, s_origin, s_direction; -    // path-to-file (JSON) sent_ed starting-point search-direction +    // path-to-file sent_ed starting-point search-direction      is >> file >> sent_id >> s_origin >> s_direction;      SparseVector<double> origin;      ReadSparseVectorString(s_origin, &origin); @@ -93,7 +93,7 @@ int main(int argc, char** argv) {      if (last_file != file) {        last_file = file;        ReadFile rf(file); -      HypergraphIO::ReadFromJSON(rf.stream(), &hg); +      HypergraphIO::ReadFromBinary(rf.stream(), &hg);      }      const 
ConvexHullWeightFunction wf(origin, direction);      const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); diff --git a/training/dpmert/test_data/0.bin.gz b/training/dpmert/test_data/0.bin.gzBinary files differ new file mode 100644 index 00000000..388298e9 --- /dev/null +++ b/training/dpmert/test_data/0.bin.gz diff --git a/training/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gzBinary files differ deleted file mode 100644 index 30f8dd77..00000000 --- a/training/dpmert/test_data/0.json.gz +++ /dev/null diff --git a/training/dpmert/test_data/1.bin.gz b/training/dpmert/test_data/1.bin.gzBinary files differ new file mode 100644 index 00000000..44f9e0ff --- /dev/null +++ b/training/dpmert/test_data/1.bin.gz diff --git a/training/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gzBinary files differ deleted file mode 100644 index c82cc179..00000000 --- a/training/dpmert/test_data/1.json.gz +++ /dev/null diff --git a/training/dpmert/test_data/test-ch-inside.bin.gz b/training/dpmert/test_data/test-ch-inside.bin.gzBinary files differ new file mode 100644 index 00000000..392f08c6 --- /dev/null +++ b/training/dpmert/test_data/test-ch-inside.bin.gz diff --git a/training/dpmert/test_data/test-zero-origin.bin.gz b/training/dpmert/test_data/test-zero-origin.bin.gzBinary files differ new file mode 100644 index 00000000..c641faaf --- /dev/null +++ b/training/dpmert/test_data/test-zero-origin.bin.gz diff --git a/training/minrisk/minrisk_optimize.cc b/training/minrisk/minrisk_optimize.cc index da8b5260..a2938fb0 100644 --- a/training/minrisk/minrisk_optimize.cc +++ b/training/minrisk/minrisk_optimize.cc @@ -178,7 +178,7 @@ int main(int argc, char** argv) {      ReadFile rf(file);      if (kis.size() % 5 == 0) { cerr << '.'; }      if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; } -    HypergraphIO::ReadFromJSON(rf.stream(), &hg); +    HypergraphIO::ReadFromBinary(rf.stream(), &hg);      
hg.Reweight(weights);      curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);      if (kbest_file.size()) diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc index da58cd24..b142fd05 100644 --- a/training/pro/mr_pro_map.cc +++ b/training/pro/mr_pro_map.cc @@ -203,7 +203,7 @@ int main(int argc, char** argv) {      const string kbest_file = os.str();      if (FileExists(kbest_file))        J_i.ReadFromFile(kbest_file); -    HypergraphIO::ReadFromJSON(rf.stream(), &hg); +    HypergraphIO::ReadFromBinary(rf.stream(), &hg);      hg.Reweight(weights);      J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]);      J_i.WriteToFile(kbest_file); diff --git a/training/rampion/rampion_cccp.cc b/training/rampion/rampion_cccp.cc index 1e36dc51..1c45bac5 100644 --- a/training/rampion/rampion_cccp.cc +++ b/training/rampion/rampion_cccp.cc @@ -136,7 +136,7 @@ int main(int argc, char** argv) {      ReadFile rf(file);      if (kis.size() % 5 == 0) { cerr << '.'; }      if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; } -    HypergraphIO::ReadFromJSON(rf.stream(), &hg); +    HypergraphIO::ReadFromBinary(rf.stream(), &hg);      hg.Reweight(weights);      curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);      if (kbest_file.size()) diff --git a/training/utils/grammar_convert.cc b/training/utils/grammar_convert.cc index 5c1b4d4a..04f1eb77 100644 --- a/training/utils/grammar_convert.cc +++ b/training/utils/grammar_convert.cc @@ -43,7 +43,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {    po::notify(*conf);    if (conf->count("help") || conf->count("input") == 0) { -    cerr << "\nUsage: grammar_convert [-options]\n\nConverts a grammar file (in Hiero format) into JSON hypergraph.\n"; +    cerr << "\nUsage: grammar_convert [-options]\n\nConverts a grammar file (in Hiero format) into serialized hypergraph.\n";      cerr << dcmdline_options << endl;      exit(1);    } @@ -254,7 +254,8 @@ void ProcessHypergraph(const 
vector<double>& w, const po::variables_map& conf, c    if (w.size() > 0) { hg->Reweight(w); }    if (conf.count("collapse_weights")) CollapseWeights(hg);    if (conf["output"].as<string>() == "json") { -    HypergraphIO::WriteToJSON(*hg, false, &cout); +    cerr << "NOT IMPLEMENTED ... talk to cdyer if you need this functionality\n"; +    // HypergraphIO::WriteToBinary(*hg, &cout);      if (!ref.empty()) { cerr << "REF: " << ref << endl; }    } else {      vector<WordID> onebest; @@ -315,11 +316,11 @@ int main(int argc, char **argv) {          line = line.substr(0, pos + 2);        }        istringstream is(line); -      if (HypergraphIO::ReadFromJSON(&is, &hg)) { +      if (HypergraphIO::ReadFromBinary(&is, &hg)) {          ProcessHypergraph(w, conf, ref, &hg);          hg.clear();        } else { -        cerr << "Error reading grammar from JSON: line " << lc << endl; +        cerr << "Error reading grammar line " << lc << endl;          exit(1);        }      } else { | 
