"""Toy NumPy walk-through of multi-head scaled dot-product self-attention.

A tiny batch of ``nwords`` constant word vectors is pushed through
``num_heads`` independent attention heads with random projection matrices.
The per-head results are concatenated along the feature axis and projected
back to ``embedding_dim`` with a random output matrix. The final array and
its shape are printed.
"""
import math

import numpy as np

dmodel = 32        # combined width of all heads after concatenation
embedding_dim = 8  # width of every input word vector
nwords = 3
num_heads = 4

# Width of a single head. The script requires it to equal the input width.
head_dim = dmodel // num_heads
if dmodel % num_heads != 0 or head_dim != embedding_dim:
    raise ValueError(
        "dmodel / num_heads must be an integer equal to embedding_dim"
    )

# Row i is a vector filled with the value i + 1.
states = np.outer(np.arange(1, nwords + 1), np.ones(embedding_dim))  # num. words x embedding dim

Wqs = []
Wks = []
Wvs = []
scores = []


def softmax(m):
    """Return the softmax of *m* computed along its last axis.

    Each row of a 2-D input (or the whole vector for a 1-D input) is turned
    into a probability distribution that sums to 1.

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing on
    large logits. ``keepdims=True`` keeps the normaliser shaped ``(..., 1)``
    so it divides each row by its own sum.
    """
    m = np.asarray(m, dtype=float)
    shifted = m - np.max(m, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)


for _ in range(num_heads):
    # Fresh random projections for this head.
    Wq = np.random.rand(embedding_dim, head_dim)
    Wk = np.random.rand(embedding_dim, head_dim)
    Wv = np.random.rand(embedding_dim, head_dim)

    queries = np.matmul(states, Wq)
    keys = np.matmul(states, Wk)
    values = np.matmul(states, Wv)

    # Scaled dot-product attention: scale by sqrt of the per-head key width.
    out = np.matmul(queries, np.transpose(keys)) / math.sqrt(head_dim)
    out = softmax(out)            # nwords x nwords, every row sums to 1
    out = np.matmul(out, values)  # nwords x head_dim

    Wqs.append(Wq)
    Wks.append(Wk)
    Wvs.append(Wv)
    scores.append(out)

# Join the heads along the feature axis so each word keeps its own row,
# then project the combined features back to the input width.
out = np.concatenate(scores, axis=1)  # nwords x dmodel
Wo = np.random.rand(out.shape[1], embedding_dim)
out = np.matmul(out, Wo)              # nwords x embedding_dim

print(out.shape)
print(out)