From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- rst_parser/dep_training.cc | 76 ---------------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 rst_parser/dep_training.cc (limited to 'rst_parser/dep_training.cc') diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc deleted file mode 100644 index ef97798b..00000000 --- a/rst_parser/dep_training.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include "dep_training.h" - -#include -#include - -#include "stringlib.h" -#include "filelib.h" -#include "tdict.h" -#include "picojson.h" - -using namespace std; - -static void ParseInstance(const string& line, int start, TrainingInstance* out, int lc = 0) { - picojson::value obj; - string err; - picojson::parse(obj, line.begin() + start, line.end(), &err); - if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } - TrainingInstance& cur = *out; - TaggedSentence& ts = cur.ts; - EdgeSubset& tree = cur.tree; - ts.pos.clear(); - ts.words.clear(); - tree.roots.clear(); - tree.h_m_pairs.clear(); - assert(obj.is()); - const picojson::object& d = obj.get(); - const picojson::array& ta = d.find("tokens")->second.get(); - for (unsigned i = 0; i < ta.size(); ++i) { - ts.words.push_back(TD::Convert(ta[i].get()[0].get())); - ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); - } - if (d.find("deps") != d.end()) { - const picojson::array& da = d.find("deps")->second.get(); - for (unsigned i = 0; i < da.size(); ++i) { - const picojson::array& thm = da[i].get(); - // get dep type here - short h = thm[2].get(); - short m = thm[1].get(); - if (h < 0) - tree.roots.push_back(m); - else - tree.h_m_pairs.push_back(make_pair(h,m)); - } - } - //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; -} - -bool TrainingInstance::ReadInstance(std::istream* in, TrainingInstance* instance) { - string line; - if (!getline(*in, line)) return false; - size_t pos = line.rfind('\t'); - assert(pos != string::npos); - static int lc = 0; ++lc; - ParseInstance(line, pos + 1, instance, lc); - return true; -} - -void TrainingInstance::ReadTrainingCorpus(const string& fname, vector* corpus, int rank, int size) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int lc = 0; - bool flag = false; - while(getline(in, line)) { - ++lc; - if ((lc-1) % size != rank) continue; - if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; } - if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; } - size_t pos = line.rfind('\t'); - assert(pos != string::npos); - corpus->push_back(TrainingInstance()); - ParseInstance(line, pos + 1, &corpus->back(), lc); - } - if (flag) cerr << "\nRead " << lc << " training instances\n"; -} - -- cgit v1.2.3