wangchuanjin 9 år sedan
förälder
incheckning
50c39907fa

+ 1 - 0
common/src/github.com/thinxer/go-word2vec/README.md

@@ -0,0 +1 @@
+A simple library for loading word2vec binary models.

+ 19 - 0
common/src/github.com/thinxer/go-word2vec/vector.go

@@ -0,0 +1,19 @@
+package word2vec
+
+import "github.com/ziutek/blas"
+
+// Vector is a float32 vector; its methods delegate the arithmetic to the blas package.
+type Vector []float32
+
+// Normalize scales v in place so that its Euclidean norm becomes 1.
+//
+// A vector whose norm is zero is left untouched: dividing by that norm
+// would multiply every component by +Inf and fill the vector with NaN.
+func (v Vector) Normalize() {
+	norm := blas.Snrm2(len(v), v, 1)
+	if norm == 0 {
+		return
+	}
+	blas.Sscal(len(v), 1/norm, v, 1)
+}
+
+// Add performs y += alpha*x in place.
+//
+// Only the components present in both vectors are touched: a shorter x
+// behaves as if it were padded with zeros, and the extra components of a
+// longer x are ignored.
+func (y Vector) Add(alpha float32, x Vector) {
+	// The blas routine is told how many elements to process, so never ask
+	// for more than the shorter operand actually holds.
+	n := len(y)
+	if len(x) < n {
+		n = len(x)
+	}
+	blas.Saxpy(n, alpha, x, 1, y, 1)
+}
+
+// Dot returns the dot product of x and y.
+//
+// If the vectors differ in length, only the components present in both
+// contribute, as if the shorter one were padded with zeros.
+func (y Vector) Dot(x Vector) float32 {
+	// The blas routine is told how many elements to process, so never ask
+	// for more than the shorter operand actually holds.
+	n := len(y)
+	if len(x) < n {
+		n = len(x)
+	}
+	return blas.Sdot(n, x, 1, y, 1)
+}

+ 111 - 0
common/src/github.com/thinxer/go-word2vec/word2vec.go

@@ -0,0 +1,111 @@
+/*
+Package word2vec is a Go implementation of the original word2vec model.
+Currently only model loading is supported.
+*/
+package word2vec
+
+import (
+	"bufio"
+	"encoding/binary"
+	"fmt"
+	"os"
+)
+
+// Model is the Word2vec model: a vocabulary plus one vector of
+// Layer1Size components per word, all stored in a single flat slice.
+type Model struct {
+	Layer1Size int            // number of components in each word vector
+	Vocab      map[string]int // word -> index i accepted by Vector(i)
+
+	data []float32 // all vectors back to back; word i occupies data[Layer1Size*i : Layer1Size*(i+1)]
+}
+
+// Pair couples a vocabulary word with a similarity score; MostSimilar returns a slice of them.
+type Pair struct {
+	Word string  // the word, as keyed in Model.Vocab
+	Sim  float32 // dot-product similarity computed by MostSimilar
+}
+
+// Load reads a model file generated by the original word2vec tool.
+//
+// The file is expected to start with a text header line holding two
+// integers (vocabulary size and layer size), followed by one record per
+// word: the word's bytes terminated by a single space, then layer-size
+// little-endian float32 values. Every vector is normalized as it is loaded.
+//
+// It returns an error if the file cannot be opened, if the header cannot
+// be parsed or holds sizes that are negative or too large to allocate, or
+// if a record cannot be read in full.
+func Load(filename string) (*Model, error) {
+	file, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	// The whole model is copied into memory, so the file can be released
+	// on every return path.
+	defer file.Close()
+
+	reader := bufio.NewReader(file)
+	var vocabSize, layer1Size int
+	if _, err := fmt.Fscanln(reader, &vocabSize, &layer1Size); err != nil {
+		return nil, fmt.Errorf("word2vec: reading header of %q: %v", filename, err)
+	}
+	// Reject sizes that would make the allocation below panic.
+	const maxInt = int(^uint(0) >> 1)
+	if vocabSize < 0 || layer1Size < 0 || (layer1Size > 0 && vocabSize > maxInt/layer1Size) {
+		return nil, fmt.Errorf("word2vec: invalid header in %q: vocab size %d, layer size %d", filename, vocabSize, layer1Size)
+	}
+
+	model := &Model{
+		Layer1Size: layer1Size,
+		Vocab:      make(map[string]int),
+		data:       make([]float32, layer1Size*vocabSize),
+	}
+	for i := 0; i < vocabSize; i++ {
+		raw, err := reader.ReadBytes(' ')
+		if err != nil {
+			return nil, err
+		}
+		word := string(raw[:len(raw)-1]) // drop the trailing space delimiter
+
+		// vector is a view into model.data, so decoding fills the model directly.
+		vector := model.Vector(i)
+		if err := binary.Read(reader, binary.LittleEndian, vector); err != nil {
+			return nil, err
+		}
+		vector.Normalize()
+
+		// One separator byte may follow each vector (presumably the newline
+		// written by the word2vec tool; TODO confirm against the format).
+		// Consume it only when it is really a newline, so a file without the
+		// separator does not lose the first byte of the next word. A Peek
+		// error, e.g. end of file after the last record, is not a failure.
+		if next, err := reader.Peek(1); err == nil && next[0] == '\n' {
+			reader.ReadByte() // cannot fail: Peek just buffered this byte
+		}
+
+		model.Vocab[word] = i
+	}
+	return model, nil
+}
+
+// Vector returns the vector of the i-th word. The result is a slice of the
+// model's own storage, not a copy.
+func (m *Model) Vector(i int) Vector {
+	width := m.Layer1Size
+	start := width * i
+	return Vector(m.data[start : start+width])
+}
+
+// Similarity returns the similarity of the two words.
+func (m *Model) Similarity(x, y string) (float32, error) {
+	id1, ok := m.Vocab[x]
+	if !ok {
+		return 0, fmt.Errorf("Word not found: %s", x)
+	}
+	id2, ok := m.Vocab[y]
+	if !ok {
+		return 0, fmt.Errorf("Word not found: %s", y)
+	}
+	return m.Vector(id1).Dot(m.Vector(id2)), nil
+}
+
+// MostSimilar returns up to n words most similar to
+// sum(positives) - sum(negatives), ordered from most to least similar.
+//
+// Similarity is the dot product between the normalized target vector and
+// each word's stored vector. Every word in the vocabulary is a candidate,
+// including the query words themselves. Words with equal similarity come
+// back in unspecified order because the vocabulary map is iterated directly.
+//
+// Fewer than n pairs are returned when the vocabulary holds fewer than n
+// rankable words. An empty result is returned when n <= 0 or when the
+// target vector is all zeros (no query words, or positives and negatives
+// cancelling out). An error is returned if any query word is missing from
+// the vocabulary.
+func (m *Model) MostSimilar(positives, negatives []string, n int) ([]Pair, error) {
+	// Construct the target vector.
+	vec := Vector(make([]float32, m.Layer1Size))
+	for _, word := range positives {
+		wordID, ok := m.Vocab[word]
+		if !ok {
+			return nil, fmt.Errorf("Word not found: %s", word)
+		}
+		vec.Add(1, m.Vector(wordID))
+	}
+	for _, word := range negatives {
+		wordID, ok := m.Vocab[word]
+		if !ok {
+			return nil, fmt.Errorf("Word not found: %s", word)
+		}
+		vec.Add(-1, m.Vector(wordID))
+	}
+
+	// Never reserve more slots than there are words to fill them.
+	limit := n
+	if limit > len(m.Vocab) {
+		limit = len(m.Vocab)
+	}
+	// A zero target has no direction to compare against, and normalizing
+	// it would divide by a zero norm.
+	allZero := true
+	for _, c := range vec {
+		if c != 0 {
+			allZero = false
+			break
+		}
+	}
+	if limit <= 0 || allZero {
+		return []Pair{}, nil
+	}
+	vec.Normalize()
+
+	// Find the top similar words. top is kept sorted by descending Sim and
+	// starts empty rather than pre-filled with zero-valued Pairs, so words
+	// with similarity <= 0 can still be ranked and no blank entries leak
+	// into the result.
+	top := make([]Pair, 0, limit)
+	for w, i := range m.Vocab {
+		sim := vec.Dot(m.Vector(i))
+		if sim != sim {
+			// NaN compares false against everything and cannot be ranked.
+			continue
+		}
+		switch {
+		case len(top) < limit:
+			top = append(top, Pair{Word: w, Sim: sim})
+		case sim > top[len(top)-1].Sim:
+			// Better than the current worst entry: take its place.
+			top[len(top)-1] = Pair{Word: w, Sim: sim}
+		default:
+			continue
+		}
+		// Move the new entry up to its sorted position.
+		for j := len(top) - 1; j > 0 && top[j].Sim > top[j-1].Sim; j-- {
+			top[j], top[j-1] = top[j-1], top[j]
+		}
+	}
+	return top, nil
+}