diff options
Diffstat (limited to 'gi')
-rw-r--r-- | gi/scfg/abc/Release/IConv.d | 3 | ||||
-rw-r--r-- | gi/scfg/abc/Release/Util.d | 8 | ||||
-rw-r--r-- | gi/scfg/abc/Release/agrammar.d | 193 | ||||
-rwxr-xr-x | gi/scfg/abc/Release/dict_test | bin | 0 -> 1485797 bytes | |||
-rw-r--r-- | gi/scfg/abc/Release/grammar | 13 | ||||
-rw-r--r-- | gi/scfg/abc/Release/grammar.pr | 13 | ||||
-rw-r--r-- | gi/scfg/abc/Release/makefile | 66 | ||||
-rw-r--r-- | gi/scfg/abc/Release/process_grammar.pl | 36 | ||||
-rwxr-xr-x | gi/scfg/abc/Release/scfg | bin | 0 -> 4277125 bytes | |||
-rw-r--r-- | gi/scfg/abc/Release/scfg.d | 209 | ||||
-rw-r--r-- | gi/scfg/abc/Release/sources.mk | 27 | ||||
-rw-r--r-- | gi/scfg/abc/Release/subdir.mk | 59 | ||||
-rw-r--r-- | gi/scfg/abc/Release/tmp.grammar | 2 | ||||
l--------- | gi/scfg/abc/Release/toy-grammar | 1 | ||||
-rwxr-xr-x | gi/scfg/abc/a.out | bin | 0 -> 22639 bytes | |||
-rw-r--r-- | gi/scfg/abc/agrammar.cc | 378 | ||||
-rw-r--r-- | gi/scfg/abc/agrammar.h | 68 | ||||
-rw-r--r-- | gi/scfg/abc/old_agrammar.cc | 383 | ||||
-rw-r--r-- | gi/scfg/abc/old_agrammar.h | 45 | ||||
-rw-r--r-- | gi/scfg/abc/scfg.cpp | 152 | ||||
-rw-r--r-- | gi/scfg/abc/tmp.cpp | 36 |
21 files changed, 1692 insertions, 0 deletions
diff --git a/gi/scfg/abc/Release/IConv.d b/gi/scfg/abc/Release/IConv.d new file mode 100644 index 00000000..082cb15b --- /dev/null +++ b/gi/scfg/abc/Release/IConv.d @@ -0,0 +1,3 @@ +IConv.d IConv.o: ../../utils/IConv.cc ../../utils/IConv.hpp + +../../utils/IConv.hpp: diff --git a/gi/scfg/abc/Release/Util.d b/gi/scfg/abc/Release/Util.d new file mode 100644 index 00000000..586d4d60 --- /dev/null +++ b/gi/scfg/abc/Release/Util.d @@ -0,0 +1,8 @@ +Util.d Util.o: ../../utils/Util.cc ../../utils/Util.h \ + ../../utils/UtfConverter.h ../../utils/ConvertUTF.h + +../../utils/Util.h: + +../../utils/UtfConverter.h: + +../../utils/ConvertUTF.h: diff --git a/gi/scfg/abc/Release/agrammar.d b/gi/scfg/abc/Release/agrammar.d new file mode 100644 index 00000000..6cf14f0d --- /dev/null +++ b/gi/scfg/abc/Release/agrammar.d @@ -0,0 +1,193 @@ +agrammar.d agrammar.o: ../agrammar.cc \ + /home/tnguyen/ws10smt/decoder/rule_lexer.h \ + /home/tnguyen/ws10smt/decoder/trule.h \ + /export/ws10smt/software/include/boost/shared_ptr.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/shared_ptr.hpp \ + /export/ws10smt/software/include/boost/config.hpp \ + /export/ws10smt/software/include/boost/config/user.hpp \ + /export/ws10smt/software/include/boost/config/select_compiler_config.hpp \ + /export/ws10smt/software/include/boost/config/compiler/gcc.hpp \ + /export/ws10smt/software/include/boost/config/select_stdlib_config.hpp \ + /export/ws10smt/software/include/boost/config/no_tr1/utility.hpp \ + /export/ws10smt/software/include/boost/config/stdlib/libstdcpp3.hpp \ + /export/ws10smt/software/include/boost/config/select_platform_config.hpp \ + /export/ws10smt/software/include/boost/config/platform/linux.hpp \ + /export/ws10smt/software/include/boost/config/posix_features.hpp \ + /export/ws10smt/software/include/boost/config/suffix.hpp \ + /export/ws10smt/software/include/boost/config/no_tr1/memory.hpp \ + /export/ws10smt/software/include/boost/assert.hpp \ + /export/ws10smt/software/include/boost/checked_delete.hpp \ + /export/ws10smt/software/include/boost/throw_exception.hpp \ + /export/ws10smt/software/include/boost/exception/detail/attribute_noreturn.hpp \ + /export/ws10smt/software/include/boost/detail/workaround.hpp \ + /export/ws10smt/software/include/boost/exception/exception.hpp \ + /export/ws10smt/software/include/boost/current_function.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/shared_count.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/bad_weak_ptr.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_base.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_has_sync.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp \ + /export/ws10smt/software/include/boost/detail/sp_typeinfo.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_impl.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_convertible.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/spinlock_pool.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/spinlock.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/spinlock_sync.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/yield_k.hpp \ + /export/ws10smt/software/include/boost/memory_order.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/operator_bool.hpp \ + /home/tnguyen/ws10smt/decoder/sparse_vector.h \ + /home/tnguyen/ws10smt/decoder/fdict.h \ + /home/tnguyen/ws10smt/decoder/dict.h \ + /export/ws10smt/software/include/boost/functional/hash.hpp \ + /export/ws10smt/software/include/boost/functional/hash/hash.hpp \ + /export/ws10smt/software/include/boost/functional/hash/hash_fwd.hpp \ + /export/ws10smt/software/include/boost/functional/hash/detail/hash_float.hpp \ + /export/ws10smt/software/include/boost/functional/hash/detail/float_functions.hpp \ + /export/ws10smt/software/include/boost/config/no_tr1/cmath.hpp \ + /export/ws10smt/software/include/boost/functional/hash/detail/limits.hpp \ + /export/ws10smt/software/include/boost/limits.hpp \ + /export/ws10smt/software/include/boost/integer/static_log2.hpp \ + /export/ws10smt/software/include/boost/integer_fwd.hpp \ + /export/ws10smt/software/include/boost/cstdint.hpp \ + /export/ws10smt/software/include/boost/functional/hash/detail/hash_float_generic.hpp \ + /export/ws10smt/software/include/boost/functional/hash/extensions.hpp \ + /export/ws10smt/software/include/boost/detail/container_fwd.hpp \ + /home/tnguyen/ws10smt/decoder/wordid.h \ + /home/tnguyen/ws10smt/decoder/filelib.h \ + /home/tnguyen/ws10smt/decoder/gzstream.h \ + /home/tnguyen/ws10smt/decoder/tdict.h ../agrammar.h \ + /home/tnguyen/ws10smt/decoder/grammar.h \ + /home/tnguyen/ws10smt/decoder/lattice.h \ + /home/tnguyen/ws10smt/decoder/array2d.h ../../utils/Util.h \ + ../../utils/UtfConverter.h ../../utils/ConvertUTF.h + +/home/tnguyen/ws10smt/decoder/rule_lexer.h: + +/home/tnguyen/ws10smt/decoder/trule.h: + +/export/ws10smt/software/include/boost/shared_ptr.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/shared_ptr.hpp: + +/export/ws10smt/software/include/boost/config.hpp: + +/export/ws10smt/software/include/boost/config/user.hpp: + +/export/ws10smt/software/include/boost/config/select_compiler_config.hpp: + +/export/ws10smt/software/include/boost/config/compiler/gcc.hpp: + +/export/ws10smt/software/include/boost/config/select_stdlib_config.hpp: + +/export/ws10smt/software/include/boost/config/no_tr1/utility.hpp: + +/export/ws10smt/software/include/boost/config/stdlib/libstdcpp3.hpp: + +/export/ws10smt/software/include/boost/config/select_platform_config.hpp: + +/export/ws10smt/software/include/boost/config/platform/linux.hpp: + +/export/ws10smt/software/include/boost/config/posix_features.hpp: + +/export/ws10smt/software/include/boost/config/suffix.hpp: + +/export/ws10smt/software/include/boost/config/no_tr1/memory.hpp: + +/export/ws10smt/software/include/boost/assert.hpp: + +/export/ws10smt/software/include/boost/checked_delete.hpp: + +/export/ws10smt/software/include/boost/throw_exception.hpp: + +/export/ws10smt/software/include/boost/exception/detail/attribute_noreturn.hpp: + +/export/ws10smt/software/include/boost/detail/workaround.hpp: + +/export/ws10smt/software/include/boost/exception/exception.hpp: + +/export/ws10smt/software/include/boost/current_function.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/shared_count.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/bad_weak_ptr.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_base.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_has_sync.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp: + +/export/ws10smt/software/include/boost/detail/sp_typeinfo.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_impl.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_convertible.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/spinlock_pool.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/spinlock.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/spinlock_sync.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/yield_k.hpp: + +/export/ws10smt/software/include/boost/memory_order.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/operator_bool.hpp: + +/home/tnguyen/ws10smt/decoder/sparse_vector.h: + +/home/tnguyen/ws10smt/decoder/fdict.h: + +/home/tnguyen/ws10smt/decoder/dict.h: + +/export/ws10smt/software/include/boost/functional/hash.hpp: + +/export/ws10smt/software/include/boost/functional/hash/hash.hpp: + +/export/ws10smt/software/include/boost/functional/hash/hash_fwd.hpp: + +/export/ws10smt/software/include/boost/functional/hash/detail/hash_float.hpp: + +/export/ws10smt/software/include/boost/functional/hash/detail/float_functions.hpp: + +/export/ws10smt/software/include/boost/config/no_tr1/cmath.hpp: + +/export/ws10smt/software/include/boost/functional/hash/detail/limits.hpp: + +/export/ws10smt/software/include/boost/limits.hpp: + +/export/ws10smt/software/include/boost/integer/static_log2.hpp: + +/export/ws10smt/software/include/boost/integer_fwd.hpp: + +/export/ws10smt/software/include/boost/cstdint.hpp: + +/export/ws10smt/software/include/boost/functional/hash/detail/hash_float_generic.hpp: + +/export/ws10smt/software/include/boost/functional/hash/extensions.hpp: + +/export/ws10smt/software/include/boost/detail/container_fwd.hpp: + +/home/tnguyen/ws10smt/decoder/wordid.h: + +/home/tnguyen/ws10smt/decoder/filelib.h: + +/home/tnguyen/ws10smt/decoder/gzstream.h: + +/home/tnguyen/ws10smt/decoder/tdict.h: + +../agrammar.h: + +/home/tnguyen/ws10smt/decoder/grammar.h: + +/home/tnguyen/ws10smt/decoder/lattice.h: + +/home/tnguyen/ws10smt/decoder/array2d.h: + +../../utils/Util.h: + +../../utils/UtfConverter.h: + +../../utils/ConvertUTF.h: diff --git a/gi/scfg/abc/Release/dict_test b/gi/scfg/abc/Release/dict_test Binary files differnew file mode 100755 index 00000000..1ba94218 --- /dev/null +++ b/gi/scfg/abc/Release/dict_test diff --git a/gi/scfg/abc/Release/grammar b/gi/scfg/abc/Release/grammar new file mode 100644 index 00000000..75fac3a0 --- /dev/null +++ b/gi/scfg/abc/Release/grammar @@ -0,0 +1,13 @@ +[X] ||| . ||| . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 +[X] ||| [X,1] . ||| [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 +[X] ||| [X,1] anciano ||| [1] old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 +[X] ||| [X,1] anciano . ||| [1] old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 +[X] ||| [X,1] anciano [X,2] ||| [1] old man [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 +[X] ||| [X,1] feo ||| ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 +[X] ||| [X,1] feo . ||| ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 +[X] ||| [X,1] feo [X,2] ||| ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 +[X] ||| [X,1] gato ||| [1] cat ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 +[X] ||| [X,1] gato . ||| [1] cat . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 +[X] ||| el ||| the ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 +[X] ||| el [X,1] ||| the [1] ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 +[X] ||| el [X,1] . ||| the [1] . ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 diff --git a/gi/scfg/abc/Release/grammar.pr b/gi/scfg/abc/Release/grammar.pr new file mode 100644 index 00000000..e4e327cf --- /dev/null +++ b/gi/scfg/abc/Release/grammar.pr @@ -0,0 +1,13 @@ +[X] ||| . ||| . ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] . ||| [1] . ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] anciano ||| [1] old man ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] anciano . ||| [1] old man . ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] anciano [X,2] ||| [1] old man [2] ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] feo ||| ugly [1] ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] feo . ||| ugly [1] . ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] feo [X,2] ||| ugly [1] [2] ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] gato ||| [1] cat ||| MinusLogP=2.56494935746154 +[X] ||| [X,1] gato . ||| [1] cat . ||| MinusLogP=2.56494935746154 +[X] ||| el ||| the ||| MinusLogP=2.56494935746154 +[X] ||| el [X,1] ||| the [1] ||| MinusLogP=2.56494935746154 +[X] ||| el [X,1] . ||| the [1] . ||| MinusLogP=2.56494935746154 diff --git a/gi/scfg/abc/Release/makefile b/gi/scfg/abc/Release/makefile new file mode 100644 index 00000000..25949e74 --- /dev/null +++ b/gi/scfg/abc/Release/makefile @@ -0,0 +1,66 @@ +################################################################################ +# Automatically-generated file. Do not edit! +################################################################################ + +#-include ../makefile.init + +RM := rm -rf + +# All of the sources participating in the build are defined here +-include sources.mk +-include subdir.mk +-include objects.mk + +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(strip $(C++_DEPS)),) +-include $(C++_DEPS) +endif +ifneq ($(strip $(CC_DEPS)),) +-include $(CC_DEPS) +endif +ifneq ($(strip $(C_DEPS)),) +-include $(C_DEPS) +endif +ifneq ($(strip $(CPP_DEPS)),) +-include $(CPP_DEPS) +endif +ifneq ($(strip $(CXX_DEPS)),) +-include $(CXX_DEPS) +endif +ifneq ($(strip $(C_UPPER_DEPS)),) +-include $(C_UPPER_DEPS) +endif +endif + +#-include ../makefile.defs + +# Add inputs and outputs from these tool invocations to the build variables + +# All Target +all: scfg + +# Tool invocations + +# scfg.o: ../scfg.cpp +# @echo 'Building file: $<' +# @echo 'Invoking: GCC C++ Compiler' +# g++ -O3 -g3 -Wall -c -fmessage-length=0 -I../../openfst-1.1/src/include/ -L../../openfst-1.1/src/lib/ -lfst -lpthread -ldl -lm -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<" +# @echo 'Finished building: $<' +# @echo ' ' + +scfg: $(OBJS) $(USER_OBJS) + @echo 'Building target: $@' + @echo 'Invoking: GCC C++ Linker' + /bin/sh ../../../../libtool --tag=CXX --mode=link g++ -g -O2 -lz -L/export/ws10smt/software/lib -R/export/ws10smt/software/lib -L/export/ws10smt/software/srilm-1.5.10/lib/i686 -o scfg $(OBJS) -L/export/ws10smt/software/lib -lgtest -pthread ../../../../decoder/libcdec.a -lboost_program_options -loolm -ldstruct -lmisc + @echo 'Finished building target: $@' + @echo ' ' +#g++ -I/home/tnguyen/ws10smt/gi/scfg/cdec/ -I/export/ws10smt/software/srilm-1.5.10/include/ -L/home/tnguyen/ws10smt/decoder -lpthread -ldl -lm $(OBJS) $(USER_OBJS) $(LIBS) -o"scfg" +# Other Targets +clean: + -$(RM) $(OBJS)$(C++_DEPS)$(EXECUTABLES)$(CC_DEPS)$(C_DEPS)$(CPP_DEPS)$(CXX_DEPS)$(C_UPPER_DEPS) scfg + -@echo ' ' + +.PHONY: all clean dependents +.SECONDARY: + +-include ../makefile.targets diff --git a/gi/scfg/abc/Release/process_grammar.pl b/gi/scfg/abc/Release/process_grammar.pl new file mode 100644 index 00000000..f82a8e5a --- /dev/null +++ b/gi/scfg/abc/Release/process_grammar.pl @@ -0,0 +1,36 @@ +#!perl + +use warnings; +use strict; + +my $grammar_file = $ARGV[0]; + +my %nt_count; #maps nt--> count rules whose lhs is nt + +open(G, "<$grammar_file") or die "Can't open file $grammar_file"; + +while (<G>){ + + chomp(); + + s/\|\|\|.*//g; + s/\s//g; + + $nt_count{$_}++; +} + + +close (G); + +open(G, "<$grammar_file") or die "Can't open file $grammar_file"; + +while (<G>){ + + chomp(); + + (my $nt = $_) =~ s/\|\|\|.*//g; + $nt =~ s/\s//g; + + s/(.+\|\|\|.+\|\|\|.+\|\|\|).+/$1/g; + print $_ . " MinusLogP=" .(log($nt_count{$nt})) ."\n"; +} diff --git a/gi/scfg/abc/Release/scfg b/gi/scfg/abc/Release/scfg Binary files differnew file mode 100755 index 00000000..4b6cfb19 --- /dev/null +++ b/gi/scfg/abc/Release/scfg diff --git a/gi/scfg/abc/Release/scfg.d b/gi/scfg/abc/Release/scfg.d new file mode 100644 index 00000000..ae7a87bb --- /dev/null +++ b/gi/scfg/abc/Release/scfg.d @@ -0,0 +1,209 @@ +scfg.d scfg.o: ../scfg.cpp /home/tnguyen/ws10smt/decoder/lattice.h \ + /home/tnguyen/ws10smt/decoder/wordid.h \ + /home/tnguyen/ws10smt/decoder/array2d.h \ + /home/tnguyen/ws10smt/decoder/tdict.h ../agrammar.h \ + /home/tnguyen/ws10smt/decoder/grammar.h \ + /export/ws10smt/software/include/boost/shared_ptr.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/shared_ptr.hpp \ + /export/ws10smt/software/include/boost/config.hpp \ + /export/ws10smt/software/include/boost/config/user.hpp \ + /export/ws10smt/software/include/boost/config/select_compiler_config.hpp \ + /export/ws10smt/software/include/boost/config/compiler/gcc.hpp \ + /export/ws10smt/software/include/boost/config/select_stdlib_config.hpp \ + /export/ws10smt/software/include/boost/config/no_tr1/utility.hpp \ + /export/ws10smt/software/include/boost/config/stdlib/libstdcpp3.hpp \ + /export/ws10smt/software/include/boost/config/select_platform_config.hpp \ + /export/ws10smt/software/include/boost/config/platform/linux.hpp \ + /export/ws10smt/software/include/boost/config/posix_features.hpp \ + /export/ws10smt/software/include/boost/config/suffix.hpp \ + /export/ws10smt/software/include/boost/config/no_tr1/memory.hpp \ + /export/ws10smt/software/include/boost/assert.hpp \ + /export/ws10smt/software/include/boost/checked_delete.hpp \ + /export/ws10smt/software/include/boost/throw_exception.hpp \ + /export/ws10smt/software/include/boost/exception/detail/attribute_noreturn.hpp \ + /export/ws10smt/software/include/boost/detail/workaround.hpp \ + /export/ws10smt/software/include/boost/exception/exception.hpp \ + /export/ws10smt/software/include/boost/current_function.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/shared_count.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/bad_weak_ptr.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_base.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_has_sync.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp \ + /export/ws10smt/software/include/boost/detail/sp_typeinfo.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_impl.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/sp_convertible.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/spinlock_pool.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/spinlock.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/spinlock_sync.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/yield_k.hpp \ + /export/ws10smt/software/include/boost/memory_order.hpp \ + /export/ws10smt/software/include/boost/smart_ptr/detail/operator_bool.hpp \ + /home/tnguyen/ws10smt/decoder/lattice.h \ + /home/tnguyen/ws10smt/decoder/trule.h \ + /home/tnguyen/ws10smt/decoder/sparse_vector.h \ + /home/tnguyen/ws10smt/decoder/fdict.h \ + /home/tnguyen/ws10smt/decoder/dict.h \ + /export/ws10smt/software/include/boost/functional/hash.hpp \ + /export/ws10smt/software/include/boost/functional/hash/hash.hpp \ + /export/ws10smt/software/include/boost/functional/hash/hash_fwd.hpp \ + /export/ws10smt/software/include/boost/functional/hash/detail/hash_float.hpp \ + /export/ws10smt/software/include/boost/functional/hash/detail/float_functions.hpp \ + /export/ws10smt/software/include/boost/config/no_tr1/cmath.hpp \ + /export/ws10smt/software/include/boost/functional/hash/detail/limits.hpp \ + /export/ws10smt/software/include/boost/limits.hpp \ + /export/ws10smt/software/include/boost/integer/static_log2.hpp \ + /export/ws10smt/software/include/boost/integer_fwd.hpp \ + /export/ws10smt/software/include/boost/cstdint.hpp \ + /export/ws10smt/software/include/boost/functional/hash/detail/hash_float_generic.hpp \ + /export/ws10smt/software/include/boost/functional/hash/extensions.hpp \ + /export/ws10smt/software/include/boost/detail/container_fwd.hpp \ + /home/tnguyen/ws10smt/decoder/bottom_up_parser.h \ + /home/tnguyen/ws10smt/decoder/grammar.h \ + /home/tnguyen/ws10smt/decoder/hg.h \ + /home/tnguyen/ws10smt/decoder/small_vector.h \ + /home/tnguyen/ws10smt/decoder/prob.h \ + /home/tnguyen/ws10smt/decoder/logval.h \ + /home/tnguyen/ws10smt/decoder/hg_intersect.h ../../utils/ParamsArray.h \ + ../../utils/Util.h ../../utils/UtfConverter.h ../../utils/ConvertUTF.h + +/home/tnguyen/ws10smt/decoder/lattice.h: + +/home/tnguyen/ws10smt/decoder/wordid.h: + +/home/tnguyen/ws10smt/decoder/array2d.h: + +/home/tnguyen/ws10smt/decoder/tdict.h: + +../agrammar.h: + +/home/tnguyen/ws10smt/decoder/grammar.h: + +/export/ws10smt/software/include/boost/shared_ptr.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/shared_ptr.hpp: + +/export/ws10smt/software/include/boost/config.hpp: + +/export/ws10smt/software/include/boost/config/user.hpp: + +/export/ws10smt/software/include/boost/config/select_compiler_config.hpp: + +/export/ws10smt/software/include/boost/config/compiler/gcc.hpp: + +/export/ws10smt/software/include/boost/config/select_stdlib_config.hpp: + +/export/ws10smt/software/include/boost/config/no_tr1/utility.hpp: + +/export/ws10smt/software/include/boost/config/stdlib/libstdcpp3.hpp: + +/export/ws10smt/software/include/boost/config/select_platform_config.hpp: + +/export/ws10smt/software/include/boost/config/platform/linux.hpp: + +/export/ws10smt/software/include/boost/config/posix_features.hpp: + +/export/ws10smt/software/include/boost/config/suffix.hpp: + +/export/ws10smt/software/include/boost/config/no_tr1/memory.hpp: + +/export/ws10smt/software/include/boost/assert.hpp: + +/export/ws10smt/software/include/boost/checked_delete.hpp: + +/export/ws10smt/software/include/boost/throw_exception.hpp: + +/export/ws10smt/software/include/boost/exception/detail/attribute_noreturn.hpp: + +/export/ws10smt/software/include/boost/detail/workaround.hpp: + +/export/ws10smt/software/include/boost/exception/exception.hpp: + +/export/ws10smt/software/include/boost/current_function.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/shared_count.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/bad_weak_ptr.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_base.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_has_sync.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp: + +/export/ws10smt/software/include/boost/detail/sp_typeinfo.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_counted_impl.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/sp_convertible.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/spinlock_pool.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/spinlock.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/spinlock_sync.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/yield_k.hpp: + +/export/ws10smt/software/include/boost/memory_order.hpp: + +/export/ws10smt/software/include/boost/smart_ptr/detail/operator_bool.hpp: + +/home/tnguyen/ws10smt/decoder/lattice.h: + +/home/tnguyen/ws10smt/decoder/trule.h: + +/home/tnguyen/ws10smt/decoder/sparse_vector.h: + +/home/tnguyen/ws10smt/decoder/fdict.h: + +/home/tnguyen/ws10smt/decoder/dict.h: + +/export/ws10smt/software/include/boost/functional/hash.hpp: + +/export/ws10smt/software/include/boost/functional/hash/hash.hpp: + +/export/ws10smt/software/include/boost/functional/hash/hash_fwd.hpp: + +/export/ws10smt/software/include/boost/functional/hash/detail/hash_float.hpp: + +/export/ws10smt/software/include/boost/functional/hash/detail/float_functions.hpp: + +/export/ws10smt/software/include/boost/config/no_tr1/cmath.hpp: + +/export/ws10smt/software/include/boost/functional/hash/detail/limits.hpp: + +/export/ws10smt/software/include/boost/limits.hpp: + +/export/ws10smt/software/include/boost/integer/static_log2.hpp: + +/export/ws10smt/software/include/boost/integer_fwd.hpp: + +/export/ws10smt/software/include/boost/cstdint.hpp: + +/export/ws10smt/software/include/boost/functional/hash/detail/hash_float_generic.hpp: + +/export/ws10smt/software/include/boost/functional/hash/extensions.hpp: + +/export/ws10smt/software/include/boost/detail/container_fwd.hpp: + +/home/tnguyen/ws10smt/decoder/bottom_up_parser.h: + +/home/tnguyen/ws10smt/decoder/grammar.h: + +/home/tnguyen/ws10smt/decoder/hg.h: + +/home/tnguyen/ws10smt/decoder/small_vector.h: + +/home/tnguyen/ws10smt/decoder/prob.h: + +/home/tnguyen/ws10smt/decoder/logval.h: + +/home/tnguyen/ws10smt/decoder/hg_intersect.h: + +../../utils/ParamsArray.h: + +../../utils/Util.h: + +../../utils/UtfConverter.h: + +../../utils/ConvertUTF.h: diff --git a/gi/scfg/abc/Release/sources.mk b/gi/scfg/abc/Release/sources.mk new file mode 100644 index 00000000..6c7070aa --- /dev/null +++ b/gi/scfg/abc/Release/sources.mk @@ -0,0 +1,27 @@ +################################################################################ +# Automatically-generated file. Do not edit! +################################################################################ + +C_UPPER_SRCS := +C_SRCS := +CPP_SRCS := +O_SRCS := +ASM_SRCS := +S_SRCS := +C++_SRCS := +CXX_SRCS := +CC_SRCS := +OBJ_SRCS := +OBJS := +C++_DEPS := +EXECUTABLES := +CC_DEPS := +C_DEPS := +CPP_DEPS := +CXX_DEPS := +C_UPPER_DEPS := + +# Every subdirectory with source files must be described here +SUBDIRS := \ +. \ + diff --git a/gi/scfg/abc/Release/subdir.mk b/gi/scfg/abc/Release/subdir.mk new file mode 100644 index 00000000..49080b36 --- /dev/null +++ b/gi/scfg/abc/Release/subdir.mk @@ -0,0 +1,59 @@ + +################################################################################ +# Automatically-generated file. Do not edit! +################################################################################ + +# Add inputs and outputs from these tool invocations to the build variables +CPP_SRCS += \ +../../utils/Util.cc \ +../agrammar.cc \ +../scfg.cpp + + +OBJS += \ +./Util.o \ +./agrammar.o \ +./scfg.o + + +CPP_DEPS += \ +./Util.d \ +./agrammar.d \ +./scfg.d + +# Each subdirectory must supply rules for building sources it contributes +# %.o: ../%.cpp +# @echo 'Building file: $<' +# @echo 'Invoking: GCC C++ Compiler' +# g++ -g -p -g3 -Wall -c -fmessage-length=0 -I../../openfst-1.1/src/include/ -L../../openfst-1.1/src/lib/ -lfst -lpthread -ldl -lm -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<" +# +# @echo ' ' + +%.o: ../../utils/%.cc + @echo 'Building file: $<' + @echo 'Invoking: GCC C++ Compiler' + g++ -g -p -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<" + @echo 'Finished building: $<' + @echo ' ' + +%.o: ../../utils/%.c + @echo 'Building file: $<' + @echo 'Invoking: GCC C++ Compiler' + g++ -g -p -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<" + @echo 'Finished building: $<' + @echo ' ' + +%.o: ../%.cpp + @echo 'Building file: $<' + @echo 'Invoking: GCC C++ Compiler' + g++ -O3 -g3 -Wall -c -fmessage-length=0 -I../../utils/ -I/home/tnguyen/ws10smt/decoder -I/export/ws10smt/software/include -I/export/ws10smt/software/srilm-1.5.10/include -lpthread -ldl -lm -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<" + @echo 'Finished building: $<' + @echo ' ' + +%.o: ../%.cc + @echo 'Building file: $<' + @echo 'Invoking: GCC C++ Compiler' + g++ -O3 -g3 -Wall -c -fmessage-length=0 -I../../utils/ -I/home/tnguyen/ws10smt/decoder -I/export/ws10smt/software/include -I/export/ws10smt/software/srilm-1.5.10/include -lpthread -ldl -lm -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<" + @echo 'Finished building: $<' + @echo ' ' + diff --git a/gi/scfg/abc/Release/tmp.grammar b/gi/scfg/abc/Release/tmp.grammar new file mode 100644 index 00000000..9df1b77d --- /dev/null +++ b/gi/scfg/abc/Release/tmp.grammar @@ -0,0 +1,2 @@ +[A] ||| [B] [C] . ||| [B] [C]. ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 +[A] ||| [B] asd . ||| [B] asd . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
\ No newline at end of file diff --git a/gi/scfg/abc/Release/toy-grammar b/gi/scfg/abc/Release/toy-grammar new file mode 120000 index 00000000..50dea8df --- /dev/null +++ b/gi/scfg/abc/Release/toy-grammar @@ -0,0 +1 @@ +/export/ws10smt/toy-grammar/
\ No newline at end of file diff --git a/gi/scfg/abc/a.out b/gi/scfg/abc/a.out Binary files differnew file mode 100755 index 00000000..0467acf0 --- /dev/null +++ b/gi/scfg/abc/a.out diff --git a/gi/scfg/abc/agrammar.cc b/gi/scfg/abc/agrammar.cc new file mode 100644 index 00000000..585255e3 --- /dev/null +++ b/gi/scfg/abc/agrammar.cc @@ -0,0 +1,378 @@ +#include <algorithm> +#include <utility> +#include <map> + +#include "rule_lexer.h" +#include "filelib.h" +#include "tdict.h" +#include "agrammar.h" +#include "../utils/Util.h" + +bool equal(TRulePtr const & rule1, TRulePtr const & rule2){ + if (rule1->lhs_ != rule2->lhs_) return false; + if (rule1->f_.size() != rule2->f_.size()) return false; + if (rule1->e_.size() != rule2->e_.size()) return false; + + for (int i=0; i<rule1->f_.size(); i++) + if (rule1->f_.at(i) != rule2->f_.at(i)) return false; + for (int i=0; i<rule1->e_.size(); i++) + if (rule1->e_.at(i) != rule2->e_.at(i)) return false; + return true; +} + +//const vector<TRulePtr> Grammar::NO_RULES; + +void aRemoveRule(vector<TRulePtr> & v, const TRulePtr & rule){ // remove rule from v if found + for (int i=0; i< v.size(); i++) + if (equal(v[i], rule )){ + cout<<"erase rule from vector:"<<rule->AsString()<<endl; + v.erase(v.begin()+i); + } +} + +struct aTextRuleBin : public RuleBin { + int GetNumRules() const { + return rules_.size(); + } + TRulePtr GetIthRule(int i) const { + return rules_[i]; + } + void AddRule(TRulePtr t) { + rules_.push_back(t); + } + void RemoveRule(TRulePtr t){ + for (int i=0; i<rules_.size(); i++){ + if (equal(rules_.at(i), t)){ + rules_.erase(rules_.begin() + i); + //cout<<"IntextRulebin removerulle\n"; + return; + } + } + } + + + int Arity() const { + return rules_.front()->Arity(); + } + void Dump() const { + for (int i = 0; i < rules_.size(); ++i) + cerr << rules_[i]->AsString() << endl; + } + private: + vector<TRulePtr> rules_; +}; + +struct aTextGrammarNode : public GrammarIter { + aTextGrammarNode() : rb_(NULL) {} + ~aTextGrammarNode() { + delete rb_; + } + const GrammarIter* Extend(int symbol) const { + map<WordID, aTextGrammarNode>::const_iterator i = tree_.find(symbol); + if (i == tree_.end()) return NULL; + return &i->second; + } + + const RuleBin* GetRules() const { + if (rb_) { + //rb_->Dump(); + } + return rb_; + } + + map<WordID, aTextGrammarNode> tree_; + aTextRuleBin* rb_; +}; + +struct aTGImpl { + aTextGrammarNode root_; +}; + +aTextGrammar::aTextGrammar() : max_span_(10), pimpl_(new aTGImpl) {} +aTextGrammar::aTextGrammar(const string& file) : + max_span_(10), + pimpl_(new aTGImpl) { + ReadFromFile(file); +} + +const GrammarIter* aTextGrammar::GetRoot() const { + return &pimpl_->root_; +} + +void aTextGrammar::SetGoalNT(const string & goal_str){ + goalID = TD::Convert(goal_str); + +} +void getNTRule( const TRulePtr & rule, map<WordID, NTRule> & ntrule_map){ + + NTRule lhs_ntrule(rule, rule->lhs_ * -1); + ntrule_map[rule->lhs_ * -1] = lhs_ntrule; + + for (int i=0; i< (rule->f_).size(); i++) + if (ntrule_map.find((rule->f_).at(i) * -1) == ntrule_map.end() && (rule->f_).at(i) <0 ){ + NTRule rhs_ntrule(rule, rule->f_.at(i) * -1); + ntrule_map[(rule->f_).at(i) *-1] = rhs_ntrule; + } + + +} +void aTextGrammar::AddRule(const TRulePtr& rule) { + if (rule->IsUnary()) { + rhs2unaries_[rule->f().front()].push_back(rule); + unaries_.push_back(rule); + } else { + aTextGrammarNode* cur = &pimpl_->root_; + for (int i = 0; i < rule->f_.size(); ++i) + cur = &cur->tree_[rule->f_[i]]; + if (cur->rb_ == NULL) + cur->rb_ = new aTextRuleBin; + cur->rb_->AddRule(rule); + } + + //add the rule to lhs_rules_ + lhs_rules_[rule->lhs_* -1].push_back(rule); + + //add the rule to nt_rules_ + map<WordID, NTRule> ntrule_map; + getNTRule (rule, ntrule_map); + for (map<WordID,NTRule>::const_iterator it= ntrule_map.begin(); it != ntrule_map.end(); it++){ + nt_rules_[it->first].push_back(it->second); + } +} + +void aTextGrammar::RemoveRule(const TRulePtr & rule){ + cout<<"Remove rule: "<<rule->AsString()<<endl; + if (rule->IsUnary()) { + aRemoveRule(rhs2unaries_[rule->f().front()], rule); + aRemoveRule(unaries_, rule); + } else { + aTextGrammarNode* cur = &pimpl_->root_; + for (int i = 0; i < rule->f_.size(); ++i) + cur = &cur->tree_[rule->f_[i]]; +// if (cur->rb_ == NULL) +// cur->rb_ = new aTextRuleBin; + cur->rb_->RemoveRule(rule); + } + + //remove rules from lhs_rules_ + + aRemoveRule(lhs_rules_[rule->lhs_ * -1] , rule); + +} + +void aTextGrammar::RemoveNonterminal(WordID wordID){ + vector<NTRule> rules = nt_rules_[wordID]; +// // remove the nonterminal from ntrules_ + nt_rules_.erase(wordID); + for (int i =0; i<rules.size(); i++) + RemoveRule(rules[i].rule_); + +} + +void aTextGrammar::setMaxSplit(int max_split){max_split_ = max_split;} + + + + +void aTextGrammar::AddSplitNonTerminal(WordID nt_old, vector<WordID> & nts){ + + vector<NTRule> rules = nt_rules_[nt_old]; + + // cout<<"\n\n\n start add splitting rules"<<endl; + + const double epsilon = 0.001; + for (int i=0; i<rules.size(); i++){ + NTRule old_rule = rules.at(i); + vector<int> ntPos = old_rule.ntPos_; //in rule old_rule, ntPos is the positions of nonterminal nt_old + //we have to substitute each nt in these positions by the list of new nonterminals in the input vector 'nts' + //there are cnt =size_of(nts)^ size_of(ntPos) possibilities for the substitutions, + //hence the rules' new probabilities have to divide to cnt also + // cout<<"splitting NT in rule "<<old_rule.rule_->AsString()<<endl; + +// cout<<"nt position in the rules"<<endl; +// for (int j=0; j<ntPos.size();j++) cout<<ntPos[j]<<" "; cout<<endl; + + int cnt_newrules = pow( nts.size(), ntPos.size() ); + // cout<<"cnt_newrules="<<cnt_newrules<<endl; + + double log_nts_size = log(nts.size()); + + + map<WordID, int> cnt_addepsilon; //cnt_addepsilon and cont_minusepsilon to track the number of rules epsilon is added or minus for each lhs nonterminal, ideally we want these two numbers are equal + map<WordID, int> cnt_minusepsilon; //these two number also use to control the random generated add epsilon/minus epsilon of a new rule + cnt_addepsilon[old_rule.rule_->lhs_] = 0; + cnt_minusepsilon[old_rule.rule_->lhs_] = 0; + for (int j =0; j<nts.size(); j++) { cnt_addepsilon[nts[j] ] = 0; cnt_minusepsilon[nts[j] ] = 0;} + + + for (int j=0; j<cnt_newrules; j++){ //each j represents a new rule + //convert j to a vector of size ntPos.size(), each entry in the vector >=0 and <nts.size() + int mod = nts.size(); + vector <int> j_vector(ntPos.size(), 0); //initiate the vector to all 0 + int j_tmp =j; + for (int k=0; k<ntPos.size(); k++){ + j_vector[k] = j_tmp % mod; + j_tmp = (j_tmp - j_vector[k]) / mod; + } + // cout<<"print vector j_vector"<<endl; + // for (int k=0; k<ntPos.size();k++) cout<<j_vector[k]<<" "; cout<<endl; + //now use the vector to create a new rule + TRulePtr newrule(new TRule()); + + newrule -> e_ = (old_rule.rule_)->e_; + newrule -> f_ = old_rule.rule_->f_; + newrule->lhs_ = old_rule.rule_->lhs_; + newrule -> arity_ = old_rule.rule_->arity_; + newrule -> scores_ = old_rule.rule_->scores_; + + // cout<<"end up update score\n"; + if (ntPos[0] == -1){ //update the lhs + newrule->lhs_ = nts[j_vector[0]] * -1; + + //score has to randomly add/minus a small epsilon to break the balance + if (nts.size() >1 && ntPos.size() >1){ + // cout<<"start to add/minus epsilon"<<endl; + if ( cnt_addepsilon[newrule->lhs_] >= cnt_newrules / (2*ntPos.size()) ) //there are enough rules added epsilon, the new rules has to minus epsilon + newrule-> scores_ -= epsilon; + else if ( cnt_minusepsilon[newrule->lhs_] >= cnt_newrules / (2*ntPos.size()) ) + newrule-> scores_ += epsilon; + else{ + double random = rand()/RAND_MAX; + if (random > .5){ + newrule-> scores_ += epsilon; + cnt_addepsilon[newrule->lhs_]++; + } + else{ + newrule-> scores_ -= epsilon; + cnt_minusepsilon[newrule->lhs_]++; + } + } + } + + + for (int k=1; k<ntPos.size(); k++){//update f_ + // cout<<"ntPos[k]="<<ntPos[k]<<endl; + newrule->f_[ntPos[k]] = nts[j_vector[k]] * -1; //update the ntPos[k-1]-th nonterminal in f_ to the j_vector[k] NT in nts + } + newrule -> scores_ += (ntPos.size() -1) * log_nts_size; + + + } + else{ + //score has to randomly add/minus a small epsilon to break the balance + if ( ntPos.size() >0 && nts.size()>1){ + // cout<<"start to add/minus epsilon"<<endl; + if ( cnt_addepsilon[newrule->lhs_] >= cnt_newrules / 2 ) //there are enough rules added epsilon, the new rules has to minus epsilon + newrule-> scores_ -= epsilon; + else if ( cnt_minusepsilon[newrule->lhs_] >= cnt_newrules /2 ) + newrule-> scores_ += epsilon; + else{ + double random = rand()/RAND_MAX; + if (random > .5){ + newrule-> scores_ += epsilon; + cnt_addepsilon[newrule->lhs_]++; + } + else{ + newrule-> scores_ -= epsilon; + cnt_minusepsilon[newrule->lhs_]++; + } + } + } + + + for (int k=0; k<ntPos.size(); k++){ //update f_ + // cout<<"ntPos[k]="<<ntPos[k]<<endl; + newrule->f_[ntPos[k]] = nts[j_vector[k]] * -1; + } + newrule -> scores_ += ntPos.size() * log_nts_size; + } + this->AddRule (newrule); + }//add new rules for each grammar rules + + } //iterate through all grammar rules + +} + + +void aTextGrammar::splitNonterminal(WordID wordID){ + + //first added the splits nonterminal into the TD dictionary + + string old_str = TD::Convert(wordID); //get the nonterminal label of wordID, the new nonterminals will be old_str+t where t=1..max_split + + vector<WordID> v_splits;//split nonterminal wordID into the list of nonterminals in v_splits + for (int i =0; i< this->max_split_; i++){ + string split_str = old_str + "+" + itos(i); + WordID splitID = TD::Convert(split_str); + v_splits.push_back(splitID); + + } + + // grSplitNonterminals[wordID] = v_splits; + + //print split nonterminas of wordID + // v_splits = grSplitNonterminals[wordID]; + // cout<<"print split nonterminals\n"; + // for (int i =0; i<v_splits.size(); i++) + // cout<<v_splits[i]<<"\t"<<TD::Convert(v_splits[i])<<endl; + + AddSplitNonTerminal(wordID, v_splits); + RemoveNonterminal(wordID); + + // grSplitNonterminals.erase (grSplitNonterminals.find(WordID) ); + + if (wordID == goalID){ //add rule X-> X1; X->X2,... if X is the goal NT + for (int i =0; i<v_splits.size(); i++){ + TRulePtr rule (new TRule()); + rule ->lhs_ = goalID * -1; + rule ->f_.push_back(v_splits[i] * -1); + rule->e_.push_back(0); + + rule->scores_.set_value(FD::Convert("MinusLogP"), log(v_splits.size()) ); + AddRule(rule); + } + + } + + +} + + + +void aTextGrammar::PrintAllRules() const{ + map<WordID, vector<TRulePtr> >::const_iterator it; + for (it= lhs_rules_.begin(); it != lhs_rules_.end(); it++){ + + vector<TRulePtr> v = it-> second; + for (int i =0; i< v.size(); i++){ + cout<<v[i]->AsString()<<"\t"<<endl; + } + } +} + + +void aTextGrammar::PrintNonterminalRules(WordID nt) const{ + vector< NTRule > v; + map<WordID, vector<NTRule> >::const_iterator mit= nt_rules_.find(nt); + if (mit == nt_rules_.end()) + return; + + v = mit->second; + + for (vector<NTRule>::const_iterator it = v.begin(); it != v.end(); it++) + cout<<it->rule_->AsString()<<endl; +} + +static void AddRuleHelper(const TRulePtr& new_rule, void* extra) { + static_cast<aTextGrammar*>(extra)->AddRule(new_rule); +} + +void aTextGrammar::ReadFromFile(const string& filename) { + ReadFile in(filename); + RuleLexer::ReadRules(in.stream(), &AddRuleHelper, this); +} + +bool aTextGrammar::HasRuleForSpan(int i, int j, int distance) const { + return (max_span_ >= distance); +} + diff --git a/gi/scfg/abc/agrammar.h b/gi/scfg/abc/agrammar.h new file mode 100644 index 00000000..8a7186bf --- /dev/null +++ b/gi/scfg/abc/agrammar.h @@ -0,0 +1,68 @@ +#ifndef AGRAMMAR_H_ +#define AGRAMMAR_H_ + +#include "grammar.h" + + +using namespace std; + +class aTGImpl; +struct NTRule{ + + NTRule(){}; + NTRule(const TRulePtr & rule, WordID nt){ + nt_ = nt; + rule_ = rule; + + if (rule->lhs_ * -1 == nt) + ntPos_.push_back(-1); + + for (int i=0; i< rule->f().size(); i++) + if (rule->f().at(i) * -1 == nt) + ntPos_.push_back(i); + } + + TRulePtr rule_; + WordID nt_; //the labelID of the nt (WordID>0); + + vector<int> ntPos_; //position of nt_ -1: lhs, from 0...f_.size() for nt of f_() + //i.e the rules is: NP-> DET NP; if nt_=5 is the labelID of NP then ntPos_ = (-1, 1): the indexes of nonterminal NP + + +}; + +struct aTextGrammar : public Grammar { + aTextGrammar(); + aTextGrammar(const std::string& file); + void SetMaxSpan(int m) { max_span_ = m; } + + virtual const GrammarIter* GetRoot() const; + void AddRule(const TRulePtr& rule); + void ReadFromFile(const std::string& filename); + virtual bool HasRuleForSpan(int i, int j, int distance) const; + const std::vector<TRulePtr>& GetUnaryRules(const WordID& cat) const; + + void AddSplitNonTerminal(WordID nt_old, vector<WordID> & nts); + void setMaxSplit(int max_split); + void splitNonterminal(WordID wordID); + + void PrintAllRules() const; + void PrintNonterminalRules(WordID nt) const; + void SetGoalNT(const string & goal_str); + private: + + void RemoveRule(const TRulePtr & rule); + void RemoveNonterminal(WordID wordID); + + int max_span_; + int max_split_; + boost::shared_ptr<aTGImpl> pimpl_; + map <WordID, vector<TRulePtr> > lhs_rules_;// WordID >0 + map <WordID, vector<NTRule> > nt_rules_; + + // map<WordID, vector<WordID> > grSplitNonterminals; + WordID goalID; +}; + + +#endif diff --git a/gi/scfg/abc/old_agrammar.cc b/gi/scfg/abc/old_agrammar.cc new file mode 100644 index 00000000..33d70dfc --- /dev/null +++ b/gi/scfg/abc/old_agrammar.cc @@ -0,0 +1,383 @@ +#include "agrammar.h" +#include "Util.h" + +#include <algorithm> +#include <utility> +#include <map> + +#include "rule_lexer.h" +#include "filelib.h" +#include "tdict.h" +#include <iostream> +#include <fstream> + +map<WordID, vector<WordID> > grSplitNonterminals; +//const vector<TRulePtr> Grammar::NO_RULES; + + +// vector<TRulePtr> substituteF(TRulePtr & rule, WordID wordID, vector<WordID> & v){ +// vector<TRulePtr> vRules; //outputs + +// vector<WordID> f = rule->f(); +// vector<vector<WordID> > newfvector; +// for (int i =0; i< f.size(); i++){ +// if (f[i] == wordID){ +// newfvector.push_back(v); +// } +// else +// newfvector.push_back(vector<WordID> (1, f[i])); +// } + +// //now creates new rules; + + +// return vRules; +// } + + +struct aTextRuleBin : public RuleBin { + int GetNumRules() const { + return rules_.size(); + } + TRulePtr GetIthRule(int i) const { + return rules_[i]; + } + void AddRule(TRulePtr t) { + rules_.push_back(t); + } + int Arity() const { + return rules_.front()->Arity(); + } + void Dump() const { + for (int i = 0; i < rules_.size(); ++i) + cerr << rules_[i]->AsString() << endl; + } + + + vector<TRulePtr> getRules(){ return rules_;} + + + void substituteF(vector<WordID> & f_path, map<WordID, vector<WordID> > & grSplitNonterminals){ + //this substituteF method is different with substituteF procedure found in cdec code; + // + //aTextRuleBin has a collection of rules with the same f() on the rhs, + //substituteF() replaces the f_ of all the rules with f_path vector, + //the grSplitNonterminals input to split the lhs_ nonterminals of the rules incase the lhs_ nonterminal found in grSplitNonterminals + + vector <TRulePtr> newrules; + for (vector<TRulePtr>::iterator it = rules_.begin() ; it != rules_.end(); it++){ + assert(f_path.size() == (*it)->f_.size()); + + if (grSplitNonterminals.find( (*it)->lhs_) == grSplitNonterminals.end()){ + (*it)->f_ = f_path; + } + else{ // split the lhs NT, + vector<WordID> new_lhs = grSplitNonterminals[ (*it)->lhs_ ]; + for (vector<WordID>::iterator vit = new_lhs.begin(); vit != new_lhs.end(); vit++){ + TRulePtr newrule; + newrule -> e_ = (*it)->e_; + newrule -> f_ = (*it)->f_; + newrule->lhs_ = *vit; + newrule -> scores_ = (*it)->scores_; + newrule -> arity_ = (*it)->arity_; + newrules.push_back (newrule); + } + rules_.erase(it); + } + } + + //now add back newrules(output of splitting lhs_) to rules_ + rules_.insert(newrules.begin(),newrules.begin(), newrules.end()); + } + +private: + vector<TRulePtr> rules_; +}; + + + +struct aTextGrammarNode : public GrammarIter { + aTextGrammarNode() : rb_(NULL) {} + + aTextGrammarNode(const aTextGrammarNode & a){ + nonterminals_ = a.nonterminals_; + tree_ = a.tree_; + rb_ = new aTextRuleBin(); //cp constructor: don't cp the set of rules over + } + + ~aTextGrammarNode() { + delete rb_; + } + const GrammarIter* Extend(int symbol) const { + map<WordID, aTextGrammarNode>::const_iterator i = tree_.find(symbol); + if (i == tree_.end()) return NULL; + return &i->second; + } + + const RuleBin* GetRules() const { + if (rb_) { + //rb_->Dump(); + } + return rb_; + } + + void DFS(); + + void visit (); //todo: make this as a function pointer + + vector <WordID > path_; //vector of f_ nonterminals/terminals from the top to the current node; + set<WordID> nonterminals_; //Linh added: the set of nonterminals extend the current TextGrammarNode, WordID is the label in the dict; i.e WordID>0 + map<WordID, aTextGrammarNode> tree_; + aTextRuleBin* rb_; + + void print_path(){ //for debug only + cout<<"path="<<endl; + for (int i =0; i< path_.size(); i++) + cout<<path_[i]<<" "; + cout<<endl; + } +}; + +void aTextGrammarNode::DFS(){ //because the grammar is a tree without circle, DFS does not require to color the nodes + + visit(); + + for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin(); it != tree_.end(); it++){ + (it->second).DFS(); + } +} + + +void aTextGrammarNode::visit( ){ + + cout<<"start visit()"<<endl; + + cout<<"got grSplitNonterminals"<<endl; +// if (grSplitNonterminals.find(*it) != grSplitNonterminals.end()){ //split this *it nonterminal +// vector<WordID> vsplits = grSplitNonterminals[*it]; //split *it into vsplits + + //iterate through next terminals/nonterminals in tree_ + vector<WordID> tobe_removedNTs; //the list of nonterminal children in tree_ were splited hence will be removed from tree_ + + for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin() ; it != tree_.end(); it++){ + cout<<"in visit(): inside for loop: wordID=="<<it->first<<endl; + + map<WordID, vector<WordID> >::const_iterator git = grSplitNonterminals.find(it->first * -1 ); + + if (git == grSplitNonterminals.end() || it->first >0){ //the next symbols is not to be split + cout<<"not split\n"; + tree_[it->first ].path_ = path_; + tree_[it->first ].path_.push_back(it->first); + cout<<"in visit() tree_[it->first ].path_= "; + tree_[it->first ].print_path(); + continue; + } + + + cout<<"tmp2"; + vector<WordID> vsplits = grSplitNonterminals[it->first * -1]; + // vector<WordID> vsplits = git->second; + cout<<"tmp3"; + // vector<WordID> vsplits = agrammar_ ->splitNonterminals_[it->first * -1]; + cout <<"got vsplits"<<endl; + for (int i =0 ; i<vsplits.size(); i++){ + // nonterminals_.insert(vsplits[i]); //add vsplits[i] into nonterminals_ of the current TextGrammarNode + tree_[vsplits[i] * -1] = aTextGrammarNode(tree_[it->first]); //cp the subtree to new nonterminal + tree_[vsplits[i] * -1].path_ = path_; //update the path if the subtrees + tree_[vsplits[i] * -1].path_.push_back(vsplits[i] * -1); + tree_[vsplits[i] * -1].print_path(); + } + + //remove the old node: + tobe_removedNTs.push_back(it->first); + + } + + for (int i =0; i<tobe_removedNTs.size(); i++) + tree_.erase(tobe_removedNTs[i]); + + if (tree_.size() ==0){ //the last (terminal/nonterminal + cout<<"inside visit(): the last terminal/nonterminal"<<endl; + rb_->substituteF(path_, grSplitNonterminals); + + } + cout<<"visit() end"<<endl; +} + +struct aTGImpl { + aTextGrammarNode root_; +}; + +aTextGrammar::aTextGrammar() : max_span_(10), pimpl_(new aTGImpl) {} +aTextGrammar::aTextGrammar(const std::string& file) : + max_span_(10), + pimpl_(new aTGImpl) { + ReadFromFile(file); +} + + +const GrammarIter* aTextGrammar::GetRoot() const { + return &pimpl_->root_; +} + + +void aTextGrammar::addNonterminal(WordID wordID){ + //addNonterminal add the nonterminal wordID (wordID<0) to the list of nonterminals (map<WordID, int>) nonterminals_ of grammar + //if the input parameter wordID<0 then do nothing + + if (wordID <0){ //it is a nonterminal + + map<WordID, int>::iterator it = nonterminals_.find(wordID * -1); + if (it == nonterminals_.end()) //if not found in the list of nonterminals(a new nonterminals) + nonterminals_[wordID * -1] = 1; + } +} + + + +void aTextGrammar::AddRule(const TRulePtr& rule) { + //add the LHS nonterminal to nonterminals_ map + + this->addNonterminal(rule->lhs_); + + if (rule->IsUnary()) { + rhs2unaries_[rule->f().front()].push_back(rule); + unaries_.push_back(rule); + if (rule->f().front() <0) + //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a nonterminal then multiply by -1) + this->addNonterminal(rule->f().front()); + + + } else { + aTextGrammarNode* cur = &pimpl_->root_; + for (int i = 0; i < rule->f_.size(); ++i){ + if (rule->f_[i] <0){ + cur->nonterminals_.insert(rule->f_[i] * -1); //add the next(extend) nonterminals to the current node's nonterminals_ set + this->addNonterminal(rule->f_[i]); //add the rhs nonterminal to the grammar's list of nonterminals + } + cur = &cur->tree_[rule->f_[i]]; + + } + if (cur->rb_ == NULL) + cur->rb_ = new aTextRuleBin; + cur->rb_->AddRule(rule); + + } +} + +static void aAddRuleHelper(const TRulePtr& new_rule, void* extra) { + static_cast<aTextGrammar*>(extra)->AddRule(new_rule); +} + + +void aTextGrammar::ReadFromFile(const string& filename) { + ReadFile in(filename); + RuleLexer::ReadRules(in.stream(), &aAddRuleHelper, this); +} + +bool aTextGrammar::HasRuleForSpan(int i, int j, int distance) const { + return (max_span_ >= distance); +} + + +////Linh added + +void aTextGrammar::setMaxSplit(int max_split){max_split_ = max_split;} + + +void aTextGrammar::printAllNonterminals() const{ + for (map<WordID, int>::const_iterator it =nonterminals_.begin(); + it != nonterminals_.end(); it++){ + if (it->second >0){ + cout <<it->first<<"\t"<<TD::Convert(it->first)<<endl; + } + } + +} + + +void aTextGrammar::splitNonterminal(WordID wordID){ + + //first added the splits nonterminal into the TD dictionary + + string old_str = TD::Convert(wordID); //get the nonterminal label of wordID, the new nonterminals will be old_str+t where t=1..max_split + + vector<WordID> v_splits;//split nonterminal wordID into the list of nonterminals in v_splits + for (int i =0; i< this->max_split_; i++){ + string split_str = old_str + "+" + itos(i); + WordID splitID = TD::Convert(split_str); + v_splits.push_back(splitID); + nonterminals_[splitID] = 1; + } + + grSplitNonterminals[wordID] = v_splits; + //set wordID to be an inactive nonterminal + nonterminals_[wordID] = 0; + + //print split nonterminas of wordID + v_splits = grSplitNonterminals[wordID]; + cout<<"print split nonterminals\n"; + for (int i =0; i<v_splits.size(); i++) + cout<<v_splits[i]<<"\t"<<TD::Convert(v_splits[i])<<endl; + + + //now update in grammar rules and gramar tree: + vector<TRulePtr> newrules; + //first unary rules: + //iterate through unary rules + for (int i =0; i < unaries_.size(); i++){ + TRulePtr rule = unaries_[i]; + WordID lhs = rule.lhs_; + if (grSplitNonterminals.find(rule->f().front() ) != grSplitNonterminals.end()//if the rhs is in the list of splitting nonterminal + && grSplitNonterminals.find(lhs ) != grSplitNonterminals.end() //and the lhs is in the list of splitting nonterminal too + ){ + vector<WordID> rhs_nonterminals = grSplitNonterminals[rule->f().front()]; //split the rhs nonterminal into the list of nonterminals in 'rhs_nonterminals' + vector<WordID> lhs_nonterminals = grSplitNonterminals[lhs]; //split the rhs nonterminal into the list of nonterminals in 'lhs_nonterminals' + for (int k =0; k <rhs_nonterminals.size(); k++) + for (int j =0; j <lhs_nonterminals.size(); j++){ + TRulePtr newrule; + newrule -> e_ = rule->e_; + newrule -> f_ = rhs_nonterminals[k]->f_; + newrule->lhs_ = lhs_nonterminals[j]->lhs_; + newrule -> scores_ = rule->scores_; + newrule -> arity_ = (*it)->arity_; + newrules.push_back (newrule); + + //update + } + } + else{//the rhs terminal/nonterminal is not in the list of splitting nonterminal + + + } + } + + // for (Cat2Rule::const_iterator it = rhs2unaries_.begin(); it != rhs2unaries_.end(); it++){ + + // } + // if (rule->IsUnary()) { + // rhs2unaries_[rule->f().front()].push_back(rule); + // unaries_.push_back(rule); + // if (rule->f().front() <0) + // //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a nonterminal then multiply by -1) + // this->addNonterminal(rule->f().front()); + + + pimpl_->root_.DFS(); + +} + + +// void aTextGrammar::splitNonterminal0(WordID wordID){ + +// TextGrammarNode* cur = &pimpl_->root_; +// for (int i = 0; i < rule->f_.size(); ++i) +// cur = &cur->tree_[rule->f_[i]]; + +// } + +void aTextGrammar::splitAllNonterminals(){ + + +} + diff --git a/gi/scfg/abc/old_agrammar.h b/gi/scfg/abc/old_agrammar.h new file mode 100644 index 00000000..d68c2548 --- /dev/null +++ b/gi/scfg/abc/old_agrammar.h @@ -0,0 +1,45 @@ +#ifndef _AGRAMMAR_H_ +#define _AGRAMMAR_H_ + +#include "grammar.h" + +using namespace std; + +class aTGImpl; + +struct aTextGrammar : public Grammar { + aTextGrammar(); + aTextGrammar(const std::string& file); + void SetMaxSpan(int m) { max_span_ = m; } + + virtual const GrammarIter* GetRoot() const; + void AddRule(const TRulePtr& rule); + void ReadFromFile(const std::string& filename); + virtual bool HasRuleForSpan(int i, int j, int distance) const; + const std::vector<TRulePtr>& GetUnaryRules(const WordID& cat) const; + + void setMaxSplit(int max_split); + + void printAllNonterminals() const; + void addNonterminal(WordID wordID); + + void splitAllNonterminals(); + void splitNonterminal(WordID wordID); + + // inline map<WordID, vector<WordID> > & getSplitNonterminals(){return splitNonterminals_;} + // map<WordID, vector<WordID> > splitNonterminals_; + private: + int max_span_; + boost::shared_ptr<aTGImpl> pimpl_; + int max_split_; + + map<WordID, int> nonterminals_; //list of nonterminals of the grammar if nonterminals_[WordID] > 0 the nonterminal WordID is found in the grammar + + + +}; + + + + +#endif diff --git a/gi/scfg/abc/scfg.cpp b/gi/scfg/abc/scfg.cpp new file mode 100644 index 00000000..4d094488 --- /dev/null +++ b/gi/scfg/abc/scfg.cpp @@ -0,0 +1,152 @@ +#include "lattice.h" +#include "tdict.h" +#include "agrammar.h" +#include "bottom_up_parser.h" +#include "hg.h" +#include "hg_intersect.h" +#include "../utils/ParamsArray.h" + + +using namespace std; + +typedef aTextGrammar aGrammar; +aGrammar * load_grammar(string & grammar_filename){ + cerr<<"start_load_grammar "<<grammar_filename<<endl; + + aGrammar * test = new aGrammar(grammar_filename); + + + return test; +} + +Lattice convertSentenceToLattice(const string & str){ + + std::vector<WordID> vID; + TD::ConvertSentence(str , &vID); + Lattice lsentence; + lsentence.resize(vID.size()); + + + for (int i=0; i<vID.size(); i++){ + + lsentence[i].push_back( LatticeArc(vID[i], 0.0, 1) ); + } + + // if(!lsentence.IsSentence()) + // cout<<"not a sentence"<<endl; + + return lsentence; + +} + +bool parseSentencePair(const string & goal_sym, const string & src, const string & tgt, GrammarPtr & g, Hypergraph &hg){ + + Lattice lsource = convertSentenceToLattice(src); + + //parse the source sentence by the grammar + + vector<GrammarPtr> grammars(1, g); + + ExhaustiveBottomUpParser parser = ExhaustiveBottomUpParser(goal_sym, grammars); + + if (!parser.Parse(lsource, &hg)){ + + cerr<<"source sentence does not parse by the grammar!"<<endl; + return false; + } + + //intersect the hg with the target sentence + Lattice ltarget = convertSentenceToLattice(tgt); + + //forest.PrintGraphviz(); + return HG::Intersect(ltarget, & hg); + +} + + + + +int main(int argc, char** argv){ + + ParamsArray params(argc, argv); + params.setDescription("scfg models"); + + params.addConstraint("grammar_file", "grammar file ", true); // optional + + params.addConstraint("input_file", "parallel input file", true); //optional + + if (!params.runConstraints("scfg")) { + return 0; + } + cerr<<"get parametters\n\n\n"; + + string input_file = params.asString("input_file", "parallel_corpora"); + string grammar_file = params.asString("grammar_file", "./grammar.pr"); + + + string src = "el gato ."; + + string tgt = "the cat ."; + + + string goal_sym = "X"; + srand(123); + /*load grammar*/ + + + aGrammar * agrammar = load_grammar(grammar_file); + agrammar->SetGoalNT(goal_sym); + cout<<"before split nonterminal"<<endl; + GrammarPtr g( agrammar); + + Hypergraph hg; + if (! parseSentencePair(goal_sym, src, tgt, g, hg) ){ + cerr<<"target sentence is not parsed by the grammar!\n"; + return 1; + + } + hg.PrintGraphviz(); + + if (! parseSentencePair(goal_sym, src, tgt, g, hg) ){ + cerr<<"target sentence is not parsed by the grammar!\n"; + return 1; + + } + hg.PrintGraphviz(); + //hg.clear(); + + if (1==1) return 1; + + agrammar->PrintAllRules(); + /*split grammar*/ + cout<<"split NTs\n"; + cerr<<"first of all write all nonterminals"<<endl; + // agrammar->printAllNonterminals(); + agrammar->setMaxSplit(2); + agrammar->splitNonterminal(4); + cout<<"after split nonterminal"<<endl; + agrammar->PrintAllRules(); + Hypergraph hg1; + if (! parseSentencePair(goal_sym, src, tgt, g, hg1) ){ + cerr<<"target sentence is not parsed by the grammar!\n"; + return 1; + + } + + hg1.PrintGraphviz(); + + + agrammar->splitNonterminal(15); + cout<<"after split nonterminal"<<TD::Convert(15)<<endl; + agrammar->PrintAllRules(); + + + /*load training corpus*/ + + + /*for each sentence pair in training corpus*/ + + // forest.PrintGraphviz(); + /*calculate expected count*/ + +} diff --git a/gi/scfg/abc/tmp.cpp b/gi/scfg/abc/tmp.cpp new file mode 100644 index 00000000..967a601d --- /dev/null +++ b/gi/scfg/abc/tmp.cpp @@ -0,0 +1,36 @@ +#include <iostream> +#include <set> +#include <vector> +using namespace std; + +int x = 5; + +class A{A(){x++;}}; +// { +// int a_; + +// }; + +class B: public A{ + + int b_; +}; + +int main(){ + + cout<<"Hello World"; + set<int> s; + + s.insert(1); + s.insert(2); + + x++; + cout<<"x="<<x<<endl; + + vector<int> t; + t.push_back(2); t.push_back(1); t.push_back(2); t.push_back(3); t.push_back(2); t.push_back(4); + for(vector<int>::iterator it = t.begin(); it != t.end(); it++){ + if (*it ==2) t.erase(it); + cout <<*it<<endl; + } +} |