|
@@ -0,0 +1,111 @@
|
|
|
/*
Package word2vec is a Go implementation of the original word2vec model.
Currently only loading a trained model and querying it are supported.
*/
|
|
|
+package word2vec
|
|
|
+
|
|
|
+import (
|
|
|
+ "bufio"
|
|
|
+ "encoding/binary"
|
|
|
+ "fmt"
|
|
|
+ "os"
|
|
|
+)
|
|
|
+
|
|
|
// Model holds a loaded word2vec model: the vocabulary and one fixed-width
// vector per word.
type Model struct {
	// Layer1Size is the number of float32 components in every word vector.
	Layer1Size int
	// Vocab maps each word to the index of its vector.
	Vocab map[string]int

	// data stores all vectors back to back; the vector with index i occupies
	// data[Layer1Size*i : Layer1Size*(i+1)].
	data []float32
}
|
|
|
+
|
|
|
// Pair couples a word with a similarity score. It is the element type of the
// slice returned by Model.MostSimilar.
type Pair struct {
	// Word is the vocabulary entry.
	Word string
	// Sim is the similarity score computed for Word.
	Sim float32
}
|
|
|
+
|
|
|
+// Load the model generated by the original word2vec.
|
|
|
+func Load(filename string) (*Model, error) {
|
|
|
+ file, err := os.Open(filename)
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+ reader := bufio.NewReader(file)
|
|
|
+ var vocabSize, layer1Size int
|
|
|
+ fmt.Fscanln(reader, &vocabSize, &layer1Size)
|
|
|
+ var word string
|
|
|
+ model := &Model{
|
|
|
+ Layer1Size: layer1Size,
|
|
|
+ Vocab: make(map[string]int),
|
|
|
+ data: make([]float32, layer1Size*vocabSize),
|
|
|
+ }
|
|
|
+ for i := 0; i < vocabSize; i++ {
|
|
|
+ var vector = model.Vector(i)
|
|
|
+ bytes, err := reader.ReadBytes(' ')
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+ word = string(bytes[:len(bytes)-1])
|
|
|
+ err = binary.Read(reader, binary.LittleEndian, vector)
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+ vector.Normalize()
|
|
|
+ reader.ReadByte()
|
|
|
+
|
|
|
+ model.Vocab[word] = i
|
|
|
+ }
|
|
|
+ return model, nil
|
|
|
+}
|
|
|
+
|
|
|
+// Vector returns the vector of i-th word.
|
|
|
+func (m *Model) Vector(i int) Vector {
|
|
|
+ return Vector(m.data[m.Layer1Size*i : m.Layer1Size*(i+1)])
|
|
|
+}
|
|
|
+
|
|
|
+// Similarity returns the similarity of the two words.
|
|
|
+func (m *Model) Similarity(x, y string) (float32, error) {
|
|
|
+ id1, ok := m.Vocab[x]
|
|
|
+ if !ok {
|
|
|
+ return 0, fmt.Errorf("Word not found: %s", x)
|
|
|
+ }
|
|
|
+ id2, ok := m.Vocab[y]
|
|
|
+ if !ok {
|
|
|
+ return 0, fmt.Errorf("Word not found: %s", y)
|
|
|
+ }
|
|
|
+ return m.Vector(id1).Dot(m.Vector(id2)), nil
|
|
|
+}
|
|
|
+
|
|
|
+// MostSimilar returns the most similiar n words to sum(positives) - sum(negatives).
|
|
|
+func (m *Model) MostSimilar(positives, negatives []string, n int) ([]Pair, error) {
|
|
|
+ // Construct the target vector.
|
|
|
+ vec := Vector(make([]float32, m.Layer1Size))
|
|
|
+ for _, word := range positives {
|
|
|
+ if wordId, ok := m.Vocab[word]; !ok {
|
|
|
+ return nil, fmt.Errorf("Word not found: %s", word)
|
|
|
+ } else {
|
|
|
+ vec.Add(1, m.Vector(wordId))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for _, word := range negatives {
|
|
|
+ if wordId, ok := m.Vocab[word]; !ok {
|
|
|
+ return nil, fmt.Errorf("Word not found: %s", word)
|
|
|
+ } else {
|
|
|
+ vec.Add(-1, m.Vector(wordId))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ vec.Normalize()
|
|
|
+
|
|
|
+ // Find the top similar words.
|
|
|
+ r := make([]Pair, n)
|
|
|
+ for w, i := range m.Vocab {
|
|
|
+ sim := vec.Dot(m.Vector(i))
|
|
|
+ this := Pair{w, sim}
|
|
|
+ for j := 0; j < n; j++ {
|
|
|
+ if this.Sim > r[j].Sim {
|
|
|
+ this, r[j] = r[j], this
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return r, nil
|
|
|
+}
|