From 22e05bf807ad59bfad38fcdf35bb51524034e23b Mon Sep 17 00:00:00 2001
From: redpony
Date: Thu, 28 Oct 2010 00:22:42 +0000
Subject: change stem handling

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@693 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 word-aligner/makefiles/makefile.grammars | 34 +++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

(limited to 'word-aligner/makefiles')

diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index f4b956bc..8a10cb19 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,8 +1,7 @@
-all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f
+all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map
 
 clean:
-	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar*
-
+	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem*
 SUPPORT_DIR = $(SCRIPT_DIR)/support
 GZIP = /usr/bin/gzip
 ZCAT = zcat
@@ -12,21 +11,42 @@ SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl
 EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
 ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl
 ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl
+STEM_F = $(SCRIPT_DIR)/stemmers/$(F_LANG).pl
+STEM_E = $(SCRIPT_DIR)/stemmers/$(E_LANG).pl
+
 CLASSIFY = $(SUPPORT_DIR)/classify.pl
 MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl
 MODEL1 = $(TRAINING_DIR)/model1
 MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl
 
-orthonorm-dict.e: corpus.e
-	$(EXTRACT_VOCAB) corpus.e > e.voc
+e.voc: corpus.e
+	$(EXTRACT_VOCAB) < corpus.e > $@
+
+f.voc: corpus.f
+	$(EXTRACT_VOCAB) < corpus.f > $@
+
+orthonorm-dict.e: corpus.e e.voc
 	$(ORTHONORM_E) < e.voc > e.ortho-voc
 	$(MERGE_CORPUS) e.voc e.ortho-voc > $@
 
-orthonorm-dict.f: corpus.f
-	$(EXTRACT_VOCAB) corpus.f > f.voc
+orthonorm-dict.f: corpus.f f.voc
 	$(ORTHONORM_F) < f.voc > f.ortho-voc
 	$(MERGE_CORPUS) f.voc f.ortho-voc > $@
 
+# this is just a "stem" map
+estem.map: e.voc
+	$(STEM_E) --vocab < e.voc > $@
+
+fstem.map: f.voc
+	$(STEM_F) --vocab < f.voc > $@
+
+# corpus.stemmed.f can use context to do "stemming"
+corpus.stemmed.f: corpus.f
+	$(STEM_F) < corpus.f > $@
+
+corpus.stemmed.e: corpus.e
+	$(STEM_E) < corpus.e > $@
+
 voc2class.e: corpus.e $(MKCLS)
 	$(MKCLS) -c$(NCLASSES) -n10 -pcorpus.e -Vvoc2class.e opt
-- 
cgit v1.2.3