wangchuanjin 9 年之前
父节点
当前提交
c1b9ec8946
共有 56 个文件被更改,包括 4552 次插入0 次删除
  1. 9 0
      common/src/github.com/ziutek/blas/.gitignore
  2. 24 0
      common/src/github.com/ziutek/blas/LICENSE
  3. 40 0
      common/src/github.com/ziutek/blas/README.md
  4. 13 0
      common/src/github.com/ziutek/blas/common.go
  5. 402 0
      common/src/github.com/ziutek/blas/d_test.go
  6. 20 0
      common/src/github.com/ziutek/blas/dasum.go
  7. 118 0
      common/src/github.com/ziutek/blas/dasum_amd64.s
  8. 50 0
      common/src/github.com/ziutek/blas/daxpy.go
  9. 302 0
      common/src/github.com/ziutek/blas/daxpy_amd64.s
  10. 17 0
      common/src/github.com/ziutek/blas/dcopy.go
  11. 51 0
      common/src/github.com/ziutek/blas/dcopy_amd64.s
  12. 34 0
      common/src/github.com/ziutek/blas/ddot.go
  13. 136 0
      common/src/github.com/ziutek/blas/ddot_amd64.s
  14. 8 0
      common/src/github.com/ziutek/blas/dgemv.go
  15. 31 0
      common/src/github.com/ziutek/blas/dnrm2.go
  16. 115 0
      common/src/github.com/ziutek/blas/dnrm2_amd64.s
  17. 2 0
      common/src/github.com/ziutek/blas/doc.go
  18. 16 0
      common/src/github.com/ziutek/blas/drot.go
  19. 131 0
      common/src/github.com/ziutek/blas/drot_amd64.s
  20. 43 0
      common/src/github.com/ziutek/blas/drotg.go
  21. 91 0
      common/src/github.com/ziutek/blas/drotg_amd64.s
  22. 105 0
      common/src/github.com/ziutek/blas/drotmg.go
  23. 17 0
      common/src/github.com/ziutek/blas/dscal.go
  24. 106 0
      common/src/github.com/ziutek/blas/dscal_amd64.s
  25. 13 0
      common/src/github.com/ziutek/blas/dswap.go
  26. 131 0
      common/src/github.com/ziutek/blas/dswap_amd64.s
  27. 24 0
      common/src/github.com/ziutek/blas/idamax.go
  28. 178 0
      common/src/github.com/ziutek/blas/idamax_amd64-simd_broken
  29. 58 0
      common/src/github.com/ziutek/blas/idamax_amd64.s
  30. 24 0
      common/src/github.com/ziutek/blas/isamax.go
  31. 58 0
      common/src/github.com/ziutek/blas/isamax_amd64.s
  32. 393 0
      common/src/github.com/ziutek/blas/s_test.go
  33. 20 0
      common/src/github.com/ziutek/blas/sasum.go
  34. 124 0
      common/src/github.com/ziutek/blas/sasum_amd64.s
  35. 50 0
      common/src/github.com/ziutek/blas/saxpy.go
  36. 290 0
      common/src/github.com/ziutek/blas/saxpy_amd64.s
  37. 17 0
      common/src/github.com/ziutek/blas/scopy.go
  38. 51 0
      common/src/github.com/ziutek/blas/scopy_amd64.s
  39. 34 0
      common/src/github.com/ziutek/blas/sdot.go
  40. 143 0
      common/src/github.com/ziutek/blas/sdot_amd64.s
  41. 34 0
      common/src/github.com/ziutek/blas/sdsdot.go
  42. 168 0
      common/src/github.com/ziutek/blas/sdsdot_amd64.s
  43. 4 0
      common/src/github.com/ziutek/blas/simd.txt
  44. 31 0
      common/src/github.com/ziutek/blas/snrm2.go
  45. 121 0
      common/src/github.com/ziutek/blas/snrm2_amd64.s
  46. 16 0
      common/src/github.com/ziutek/blas/srot.go
  47. 183 0
      common/src/github.com/ziutek/blas/srot_amd64.s
  48. 43 0
      common/src/github.com/ziutek/blas/srotg.go
  49. 92 0
      common/src/github.com/ziutek/blas/srotg_amd64.s
  50. 17 0
      common/src/github.com/ziutek/blas/sscal.go
  51. 118 0
      common/src/github.com/ziutek/blas/sscal_amd64.s
  52. 13 0
      common/src/github.com/ziutek/blas/sswap.go
  53. 124 0
      common/src/github.com/ziutek/blas/sswap_amd64.s
  54. 15 0
      common/src/github.com/ziutek/blas/stubs.bash
  55. 42 0
      common/src/github.com/ziutek/blas/stubs_386.s
  56. 42 0
      common/src/github.com/ziutek/blas/stubs_arm.s

+ 9 - 0
common/src/github.com/ziutek/blas/.gitignore

@@ -0,0 +1,9 @@
+*.6
+*.8
+*.a
+*.o
+*.so
+*.out
+*.go~
+_obj
+_testmain.go

+ 24 - 0
common/src/github.com/ziutek/blas/LICENSE

@@ -0,0 +1,24 @@
+Copyright (c) 2011, Michal Derkacz
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 40 - 0
common/src/github.com/ziutek/blas/README.md

@@ -0,0 +1,40 @@
+### Go implementation of BLAS (Basic Linear Algebra Subprograms)
+
+Every function is implemented in generic Go and, where justified, it is
+optimized for AMD64 (using SSE2 instructions).
+
+The AMD64 implementation uses MOVUPS/MOVUPD instructions if all strides are equal
+to 1, so it runs fast on Nehalem, Sandy Bridge and newer processors but relatively
+slowly on older processors.
+
+Every implemented function has its own unit test and benchmark.
+
+#### Implemented functions
+
+*Level 1*
+
+Sdsdot, Sdot, Ddot, Snrm2, Dnrm2, Sasum, Dasum, Isamax, Idamax, Sswap, Dswap,
+Scopy, Dcopy, Saxpy, Daxpy, Sscal, Dscal, Srotg, Drotg, Srot, Drot
+
+*Level 2*
+
+not implemented
+
+*Level 3*
+
+not implemented
+
+#### Example benchmarks
+
+<table>
+    <tr><th>Function</th><th>Generic Go</th><th>Optimized for AMD64</th></tr>
+    <tr><td>Ddot</td><td>2825 ns/op</td><td>895 ns/op</td></tr>
+    <tr><td>Dnrm2</td><td>2787 ns/op</td><td>597 ns/op</td></tr>
+    <tr><td>Dasum</td><td>3145 ns/op</td><td>560 ns/op</td></tr>
+    <tr><td>Sdsdot</td><td>3133 ns/op</td><td>1733 ns/op</td></tr>
+    <tr><td>Sdot</td><td>2832 ns/op</td><td>508 ns/op</td></tr>
+</table>
+
+#### Documentation
+
+http://godoc.org/github.com/ziutek/blas

+ 13 - 0
common/src/github.com/ziutek/blas/common.go

@@ -0,0 +1,13 @@
+package blas
+
+type Order int
+const (
+	RowMajor = Order(101)
+	ColMajor = Order(102)
+)
+
+type Transpose int
+const (
+	NoTrans = Transpose(111)
+	Trans   = Transpose(112)
+)

+ 402 - 0
common/src/github.com/ziutek/blas/d_test.go

@@ -0,0 +1,402 @@
+package blas
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+)
+
+func iCheck(t *testing.T, inc, N, r, e int) {
+	t.Logf("inc=%d N=%d : r=%d e=%d", inc, N, r, e)
+	if r != e {
+		t.FailNow()
+	}
+}
+
+func dCheck(t *testing.T, inc, N int, r, e float64) {
+	t.Logf("inc=%d N=%d : r=%f e=%f e-r=%g", inc, N, r, e, e-r)
+	if r != e {
+		t.FailNow()
+	}
+}
+
+var (
+	xd = []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+	yd = []float64{1e17, 1e16, 1e15, 1e14, 1e13, 1e12, 1e11, 1e10, 1e9, 1e8,
+		1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1}
+)
+
+func TestDdot(t *testing.T) {
+	for inc := 1; inc < 9; inc++ {
+		e := 0.0
+		k := 0
+		for N := 0; N <= len(xd)/inc; N++ {
+			if N > 0 {
+				e += xd[k] * yd[k]
+				k += inc
+			}
+			r := Ddot(N, xd, inc, yd, inc)
+			dCheck(t, inc, N, r, e)
+		}
+	}
+}
+
+func TestDnrm2(t *testing.T) {
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xd)/inc; N++ {
+			e := math.Sqrt(Ddot(N, xd, inc, xd, inc))
+			r := Dnrm2(N, xd, inc)
+			dCheck(t, inc, N, r, e)
+		}
+	}
+}
+
+func TestDasum(t *testing.T) {
+	xd := []float64{-1, -2, 3, -4, -5, -6, 7, -8, 9, 1, -2, 3, -4, 5, -6, -7, 8}
+	for inc := 1; inc < 9; inc++ {
+		e := 0.0
+		k := 0
+		for N := 0; N <= len(xd)/inc; N++ {
+			if N > 0 {
+				e += math.Abs(xd[k])
+				k += inc
+			}
+			r := Dasum(N, xd, inc)
+			dCheck(t, inc, N, r, e)
+		}
+	}
+}
+
+func TestIdamax(t *testing.T) {
+	xd := []float64{-1, -2, 3, -4, -5, 0, -5, 0, 4, 2, 3, -1, 4, -2, -9, 0,
+		-1, 0, 0, 2, 2, -8, 2, 1, 0, 2, 4, 5, 8, 1, -7, 2, 9, 0, 1, -1}
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xd)/inc; N++ {
+			i_max := 0
+			x_max := 0.0
+			for i := 0; i < N; i++ {
+				x := math.Abs(xd[i*inc])
+				if x > x_max {
+					x_max = x
+					i_max = i
+				}
+			}
+			r := Idamax(N, xd, inc)
+			iCheck(t, inc, N, r, i_max)
+		}
+	}
+}
+
+func TestDswap(t *testing.T) {
+	a := make([]float64, len(xd))
+	b := make([]float64, len(yd))
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xd)/inc; N++ {
+			copy(a, xd)
+			copy(b, yd)
+			Dswap(N, a, inc, b, inc)
+			for i := 0; i < len(a); i++ {
+				if i <= inc*(N-1) && i%inc == 0 {
+					if a[i] != yd[i] || b[i] != xd[i] {
+						t.Fatalf("inc=%d N=%d", inc, N)
+					}
+				} else {
+					if a[i] != xd[i] || b[i] != yd[i] {
+						t.Fatalf("inc=%d N=%d", inc, N)
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestDcopy(t *testing.T) {
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xd)/inc; N++ {
+			a := make([]float64, len(xd))
+			Dcopy(N, xd, inc, a, inc)
+			for i := 0; i < inc*N; i++ {
+				if i%inc == 0 {
+					if a[i] != xd[i] {
+						t.Fatalf("inc=%d N=%d i=%d r=%f e=%f", inc, N, i, a[i],
+							xd[i])
+					}
+				} else {
+					if a[i] != 0 {
+						t.Fatalf("inc=%d N=%d i=%d r=%f e=0", inc, N, i, a[i])
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestDaxpy(t *testing.T) {
+	for _, alpha := range []float64{0, -1, 1, 3} {
+		for inc := 1; inc < 9; inc++ {
+			for N := 0; N <= len(xd)/inc; N++ {
+				r := make([]float64, len(xd))
+				e := make([]float64, len(xd))
+				copy(r, xd)
+				copy(e, xd)
+				Daxpy(N, alpha, xd, inc, r, inc)
+				for i := 0; i < N; i++ {
+					e[i*inc] += alpha * xd[i*inc]
+				}
+				for i := 0; i < len(xd); i++ {
+					if r[i] != e[i] {
+						t.Fatalf("alpha=%f inc=%d N=%d i=%d r=%f e=%f",
+							alpha, inc, N, i, r[i], e[i])
+					}
+				}
+			}
+		}
+
+		/* This works only with assembler version.
+		TODO: Write general test for bounds checking
+		// Test bounds checks.
+		panicked := func(f func()) (panicked bool) {
+			panicked = false
+			defer func() {
+				if recover() != nil {
+					panicked = true
+				}
+			}()
+			f()
+			return panicked
+		}
+		d2 := []float64{1.0, 1.0}
+		d3 := []float64{1.0, 1.0, 1.0}
+		if !panicked(func() { Daxpy(3, alpha, d2, 1, d2, 1) }) {
+			t.Fatalf("Expected panic on index out of range.")
+		}
+		if !panicked(func() { Daxpy(2, alpha, d2, 2, d2, 1) }) {
+			t.Fatalf("Expected panic on index out of range.")
+		}
+		if !panicked(func() { Daxpy(2, alpha, d2, 1, d2, 2) }) {
+			t.Fatalf("Expected panic on index out of range.")
+		}
+		if !panicked(func() { Daxpy(3, alpha, d3, 1, d2, 1) }) {
+			t.Fatalf("Expected panic on index out of range.")
+		}
+		if !panicked(func() { Daxpy(3, alpha, d2, 1, d3, 2) }) {
+			t.Fatalf("Expected panic on index out of range.")
+		}*/
+	}
+}
+
+func TestDscal(t *testing.T) {
+	alpha := 3.0
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xd)/inc; N++ {
+			r := make([]float64, len(xd))
+			e := make([]float64, len(xd))
+			copy(r, xd)
+			copy(e, xd)
+			Dscal(N, alpha, r, inc)
+			for i := 0; i < N; i++ {
+				e[i*inc] = alpha * xd[i*inc]
+			}
+			for i := 0; i < len(xd); i++ {
+				if r[i] != e[i] {
+					t.Fatalf("inc=%d N=%d i=%d r=%f e=%f", inc, N, i, r[i],
+						e[i])
+				}
+			}
+		}
+	}
+}
+
+func dEq(a, b, p float64) bool {
+	eps := math.SmallestNonzeroFloat64 * 2
+	r := math.Abs(a) + math.Abs(b)
+	if r <= eps {
+		return true
+	}
+	return math.Abs(a-b)/r < p
+}
+
+// Reference implementation of Drotg
+func refDrotg(a, b float64) (c, s, r, z float64) {
+	roe := b
+	if math.Abs(a) > math.Abs(b) {
+		roe = a
+	}
+	scale := math.Abs(a) + math.Abs(b)
+	if scale == 0 {
+		c = 1
+	} else {
+		r = scale * math.Sqrt((a/scale)*(a/scale)+(b/scale)*(b/scale))
+		if math.Signbit(roe) {
+			r = -r
+		}
+		c = a / r
+		s = b / r
+		z = 1
+		if math.Abs(a) > math.Abs(b) {
+			z = s
+		}
+		if math.Abs(b) >= math.Abs(a) && c != 0 {
+			z = 1 / c
+		}
+	}
+	return
+}
+
+func TestDrotg(t *testing.T) {
+	vs := []struct{ a, b float64 }{
+		{0, 0}, {0, 1}, {0, -1},
+		{1, 0}, {1, 1}, {1, -1},
+		{-1, 0}, {-1, 1}, {-1, -1},
+		{2, 0}, {2, 1}, {2, -1},
+		{-2, 0}, {-2, 1}, {-2, -1},
+		{0, 2}, {1, 2}, {-1, 2},
+		{0, -2}, {1, -2}, {-1, 2},
+	}
+	for _, v := range vs {
+		c, s, _, _ := Drotg(v.a, v.b)
+		ec, es, _, _ := refDrotg(v.a, v.b)
+		if !dEq(c, ec, 1e-30) || !dEq(s, es, 1e-30) {
+			t.Fatalf("a=%g b=%g c=%g ec=%g s=%g es=%g", v.a, v.b, c, ec, s, es)
+		}
+	}
+}
+
+func TestDrot(t *testing.T) {
+	s2 := math.Sqrt(2)
+	vs := []struct{ c, s float64 }{
+		{0, 0}, {0, 1}, {0, -1}, {1, 0}, {-1, 0},
+		{s2, s2}, {s2, -s2}, {-s2, s2}, {-s2, -s2},
+	}
+	x := make([]float64, len(xd))
+	y := make([]float64, len(yd))
+	ex := make([]float64, len(xd))
+	ey := make([]float64, len(yd))
+	for _, v := range vs {
+		for inc := 1; inc < 9; inc++ {
+			for N := 0; N <= len(xd)/inc; N++ {
+				copy(x, xd)
+				copy(y, yd)
+				copy(ex, xd)
+				copy(ey, yd)
+				Dscal(N, v.c, ex, inc)          // ex *= c
+				Daxpy(N, v.s, y, inc, ex, inc)  // ex += s*y
+				Dscal(N, v.c, ey, inc)          // ey *= c
+				Daxpy(N, -v.s, x, inc, ey, inc) // ey += (-s)*x
+
+				// (x, y) = (c*x + s*y, c*y - s*x)
+				Drot(N, x, inc, y, inc, v.c, v.s)
+
+				for i, _ := range x {
+					if !dEq(x[i], ex[i], 1e-20) || !dEq(y[i], ey[i], 1e-20) {
+						t.Fatalf(
+							"N=%d inc=%d c=%f s=%f i=%d x=%f ex=%f y=%f ey=%f",
+							N, inc, v.c, v.s, i, x[i], ex[i], y[i], ey[i],
+						)
+					}
+				}
+			}
+		}
+	}
+}
+
+var vd, wd []float64
+
+func init() {
+	vd = make([]float64, 514)
+	wd = make([]float64, len(vd))
+	for i := 0; i < len(vd); i++ {
+		vd[i] = rand.Float64()
+		wd[i] = rand.Float64()
+	}
+}
+
+func BenchmarkDdot(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Ddot(len(vd), vd, 1, wd, 1)
+	}
+}
+
+func BenchmarkDnrm2(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Dnrm2(len(vd), vd, 1)
+	}
+}
+
+func BenchmarkDasum(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Dasum(len(vd), vd, 1)
+	}
+}
+
+func BenchmarkIdamax(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Idamax(len(vd), vd, 1)
+	}
+}
+
+func BenchmarkDswap(b *testing.B) {
+	b.StopTimer()
+	x := make([]float64, len(vd))
+	y := make([]float64, len(vd))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Dswap(len(x), x, 1, y, 1)
+	}
+}
+
+func BenchmarkDcopy(b *testing.B) {
+	b.StopTimer()
+	y := make([]float64, len(vd))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Dcopy(len(vd), vd, 1, y, 1)
+	}
+}
+
+func BenchmarkDaxpy(b *testing.B) {
+	b.StopTimer()
+	y := make([]float64, len(vd))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Daxpy(len(vd), -1.0, vd, 1, y, 1)
+	}
+}
+
+func BenchmarkDscal(b *testing.B) {
+	b.StopTimer()
+	y := make([]float64, len(vd))
+	copy(y, vd)
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Dscal(len(y), -1.0, y, 1)
+	}
+}
+
+func BenchmarkDrotg(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Drotg(0, 0)
+		Drotg(0, 1)
+		Drotg(0, -1)
+		Drotg(1, 0)
+		Drotg(1, 1)
+		Drotg(1, -1)
+		Drotg(-1, 0)
+		Drotg(-1, 1)
+		Drotg(-1, -1)
+	}
+}
+
+func BenchmarkDrot(b *testing.B) {
+	b.StopTimer()
+	x := make([]float64, len(vd))
+	y := make([]float64, len(vd))
+	copy(x, vd)
+	copy(y, vd)
+	c := math.Sqrt(2)
+	s := c
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Drot(len(x), x, 1, y, 1, c, s)
+	}
+}

+ 20 - 0
common/src/github.com/ziutek/blas/dasum.go

@@ -0,0 +1,20 @@
+package blas
+
+// Absolute sum: \sum |X_i|
+func Dasum(N int, X []float64, incX int) float64
+
+func dasum(N int, X []float64, incX int) float64 {
+	var (
+		a  float64
+		xi int
+	)
+	for ; N > 0; N-- {
+		x := X[xi]
+		if x < 0 {
+			x = -x
+		}
+		a += x
+		xi += incX
+	}
+	return a
+}

+ 118 - 0
common/src/github.com/ziutek/blas/dasum_amd64.s

@@ -0,0 +1,118 @@
+// func Dasum(N int, X []float64, incX int) float64
+TEXT ·Dasum(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X+8(FP), SI	// X.data
+	MOVQ	incX+32(FP), AX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+
+	// Clear accumulators
+	XORPD	X0, X0
+	XORPD	X1, X1
+
+	// Setup mask for sign bit clear
+	PCMPEQL	X3, X3 
+	PSRLQ	$1, X3
+
+	// Setup stride
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+
+	// Check that there are 4 or more values for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 values to process
+
+	// Check if incX != 1
+	CMPQ	AX, $8
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Clear sign on first two values
+		MOVUPD	(SI), X2
+		ANDPD	X3, X2
+		// Clear sign on second two values
+		MOVUPD	16(SI), X4
+		ANDPD	X3, X4
+
+		// Update data pointer
+		ADDQ	$32, SI
+
+		// Accumulate the results
+		ADDPD	X2, X0
+		ADDPD	X4, X1
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more values to process
+
+	JMP	hsum
+	
+with_stride:
+	// Setup long stride
+	MOVQ	AX, CX
+	SALQ	$1, CX 	// CX = 16 * incX
+
+	half_simd_loop:
+		// Clear sign on first two values
+		MOVLPD	(SI), X2
+		MOVHPD	(SI)(AX*1), X2
+		ANDPD	X3, X2
+
+		// Update data pointer using long stride
+		ADDQ	CX, SI
+
+		// Clear sign on second two values
+		MOVLPD	(SI), X4
+		MOVHPD	(SI)(AX*1), X4
+		ANDPD	X3, X4
+
+		// Update data pointer using long stride
+		ADDQ	CX, SI
+
+		// Accumulate the results
+		ADDPD	X2, X0
+		ADDPD	X4, X1
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more values to process
+
+hsum:
+	// Summ intermediate results from SIMD operations
+	ADDPD	X0, X1
+	// Horizontal sum
+	MOVHLPS X1, X0
+	ADDSD	X1, X0
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+loop:
+	// Clear sign bit
+	MOVSD	(SI), X2
+	ANDPD	X3, X2
+
+	// Update data pointers
+	ADDQ	AX, SI
+
+	// Accumulate the results of multiplication
+	ADDSD	X2, X0
+
+	DECQ	BP
+	JNE	loop
+
+end:
+	// Return the square root of sum
+	MOVSD	X0, r+40(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 50 - 0
common/src/github.com/ziutek/blas/daxpy.go

@@ -0,0 +1,50 @@
+package blas
+
+// Compute the sum Y = \alpha X + Y for the vectors X and Y 
+func Daxpy(N int, alpha float64, X []float64, incX int, Y []float64, incY int)
+
+func daxpy(N int, alpha float64, X []float64, incX int, Y []float64, incY int) {
+	var xi, yi int
+	switch alpha {
+	case 0:
+	case 1:
+		for ; N >= 2; N -= 2 {
+			Y[yi] += X[xi]
+			xi += incX
+			yi += incY
+
+			Y[yi] += X[xi]
+			xi += incX
+			yi += incY
+		}
+		if N != 0 {
+			Y[yi] += alpha * X[xi]
+		}
+	case -1:
+		for ; N >= 2; N -= 2 {
+			Y[yi] -= X[xi]
+			xi += incX
+			yi += incY
+
+			Y[yi] -= X[xi]
+			xi += incX
+			yi += incY
+		}
+		if N != 0 {
+			Y[yi] -= X[xi]
+		}
+	default:
+		for ; N >= 2; N -= 2 {
+			Y[yi] += alpha * X[xi]
+			xi += incX
+			yi += incY
+
+			Y[yi] += alpha * X[xi]
+			xi += incX
+			yi += incY
+		}
+		if N != 0 {
+			Y[yi] += alpha * X[xi]
+		}
+	}
+}

+ 302 - 0
common/src/github.com/ziutek/blas/daxpy_amd64.s

@@ -0,0 +1,302 @@
+//func Daxpy(N int, alpha float64, X []float64, incX int, Y []float64, incY int)
+TEXT ·Daxpy(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVSD	alpha+8(FP), X0
+	MOVQ	X_data+16(FP), SI
+	MOVQ	incX+40(FP), AX
+	MOVQ	Y_data+48(FP), DI
+	MOVQ	incY+72(FP), BX
+
+	// Setup 0, 1, -1
+	PCMPEQL	X1, X1
+	PCMPEQL	X7, X7
+	XORPD	X6, X6	// 0
+	PSLLQ	$54, X1
+	PSRLQ	$2, X1	// 1
+	PSLLQ	$63, X7
+	ORPD	X1, X7	// -1
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+24(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+56(FP)
+	JGE		panic
+
+	// Check that is there any work to do
+	UCOMISD	X0, X6	
+	JE	end	// alpha == 0
+
+	// Setup strides
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+	SALQ	$3, BX	// BX = sizeof(float64) * incY
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Setup two alphas in X0
+	MOVLHPS	X0, X0
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $8
+	JNE	with_stride
+	CMPQ	BX, $8
+	JNE	with_stride
+
+	// Fully optimized loops (for incX == incY == 1)
+	UCOMISD	X0, X1
+	JE	full_simd_loop_sum	// alpha == 1
+	UCOMISD	X0, X7
+	JE	full_simd_loop_diff	// alpha == -1
+
+	full_simd_loop:
+		// Load first two pairs and scale
+		MOVUPD	(SI), X2
+		MOVUPD	(DI), X3
+		MULPD	X0, X2
+		// Load second two pairs and scale
+		MOVUPD	16(SI), X4
+		MOVUPD	16(DI), X5
+		MULPD	X0, X4
+		// Save sum of first two pairs
+		ADDPD	X2, X3
+		MOVUPD	X3, (DI)
+		// Save sum of second two pairs
+		ADDPD	X4, X5
+		MOVUPD	X5, 16(DI)
+
+		// Update data pointers
+		ADDQ	$32, SI
+		ADDQ	$32, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+	JMP	rest
+
+	full_simd_loop_sum:
+		// Load first two pairs
+		MOVUPD	(SI), X2
+		MOVUPD	(DI), X3
+		// Load second two pairs
+		MOVUPD	16(SI), X4
+		MOVUPD	16(DI), X5
+		// Save a sum of first two pairs
+		ADDPD	X2, X3
+		MOVUPD	X3, (DI)
+		// Save a sum of second two pairs
+		ADDPD	X4, X5
+		MOVUPD	X5, 16(DI)
+
+		// Update data pointers
+		ADDQ	$32, SI
+		ADDQ	$32, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop_sum	// There are 4 or more pairs to process
+	JMP	rest_sum
+
+	full_simd_loop_diff:
+		// Load first two pairs
+		MOVUPD	(SI), X2
+		MOVUPD	(DI), X3
+		// Load second two pairs
+		MOVUPD	16(SI), X4
+		MOVUPD	16(DI), X5
+		// Save a difference of first two pairs
+		SUBPD	X2, X3
+		MOVUPD	X3, (DI)
+		// Save a difference of second two pairs
+		SUBPD	X4, X5
+		MOVUPD	X5, 16(DI)
+
+		// Update data pointers
+		ADDQ	$32, SI
+		ADDQ	$32, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop_diff	// There are 4 or more pairs to process
+	JMP	rest_diff
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 16 * incX
+	SALQ	$1, DX 	// DX = 16 * incY
+
+	UCOMISD	X0, X1
+	JE	half_simd_loop_sum	// alpha == 1
+	UCOMISD	X0, X7
+	JE	half_simd_loop_diff	// alpha == -1
+
+	half_simd_loop:
+		// Load first two pairs and scale
+		MOVLPD	(SI), X2
+		MOVHPD	(SI)(AX*1), X2
+		MOVLPD	(DI), X3
+		MOVHPD	(DI)(BX*1), X3
+		MULPD	X0, X2
+		// Save sum of first two pairs
+		ADDPD	X2, X3
+		MOVLPD	X3, (DI)
+		MOVHPD	X3, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load second two pairs and scale
+		MOVLPD	(SI), X4
+		MOVHPD	(SI)(AX*1), X4
+		MOVLPD	(DI), X5
+		MOVHPD	(DI)(BX*1), X5
+		MULPD	X0, X4
+		// Save sum of second two pairs
+		ADDPD	X4, X5
+		MOVLPD	X5, (DI)
+		MOVHPD	X5, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+	JMP rest
+
+	half_simd_loop_sum:
+		// Load first two pairs
+		MOVLPD	(SI), X2
+		MOVHPD	(SI)(AX*1), X2
+		MOVLPD	(DI), X3
+		MOVHPD	(DI)(BX*1), X3
+		// Save a sum of first two pairs
+		ADDPD	X2, X3
+		MOVLPD	X3, (DI)
+		MOVHPD	X3, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load second two pairs
+		MOVLPD	(SI), X4
+		MOVHPD	(SI)(AX*1), X4
+		MOVLPD	(DI), X5
+		MOVHPD	(DI)(BX*1), X5
+		// Save a sum of second two pairs
+		ADDPD	X4, X5
+		MOVLPD	X5, (DI)
+		MOVHPD	X5, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+	JMP rest_sum
+
+	half_simd_loop_diff:
+		// Load first two pairs
+		MOVLPD	(SI), X2
+		MOVHPD	(SI)(AX*1), X2
+		MOVLPD	(DI), X3
+		MOVHPD	(DI)(BX*1), X3
+		// Save a difference of first two pairs
+		SUBPD	X2, X3
+		MOVLPD	X3, (DI)
+		MOVHPD	X3, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load second two pairs
+		MOVLPD	(SI), X4
+		MOVHPD	(SI)(AX*1), X4
+		MOVLPD	(DI), X5
+		MOVHPD	(DI)(BX*1), X5
+		// Save a difference of second two pairs
+		SUBPD	X4, X5
+		MOVLPD	X5, (DI)
+		MOVHPD	X5, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+	JMP	rest_diff
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+	// Check that are there any value to process
+	JE	end
+	loop:
+		// Load from X and scale
+		MOVSD	(SI), X2
+		MULSD	X0, X2
+		// Save a sum in Y
+		ADDSD	(DI), X2
+		MOVSD	X2, (DI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+
+		DECQ	BP
+		JNE	loop
+	RET
+
+rest_sum:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+	// Check that are there any value to process
+	JE	end
+	loop_sum:
+		// Load from X
+		MOVSD	(SI), X2
+		// Save a sum in Y
+		ADDSD	(DI), X2
+		MOVSD	X2, (DI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+
+		DECQ	BP
+		JNE	loop_sum
+	RET
+
+rest_diff:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+	// Check that are there any value to process
+	JE	end
+	loop_diff:
+		// Load from Y 
+		MOVSD	(DI), X2
+		// Save sum in Y
+		SUBSD	(SI), X2
+		MOVSD	X2, (DI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+
+		DECQ	BP
+		JNE	loop_diff
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+end:
+	RET

+ 17 - 0
common/src/github.com/ziutek/blas/dcopy.go

@@ -0,0 +1,17 @@
+package blas
+
+// Copy the  elements of the vectors X and Y.
+func Dcopy(N int, X []float64, incX int, Y []float64, incY int)
+
+func dcopy(N int, X []float64, incX int, Y []float64, incY int) {
+	if incX == 1 && incY == 1 {
+		copy(Y[:N], X[:N])
+		return
+	}
+	var xi, yi int
+	for ; N > 0; N-- {
+		Y[yi] = X[xi]
+		xi += incX
+		yi += incY
+	}
+}

+ 51 - 0
common/src/github.com/ziutek/blas/dcopy_amd64.s

@@ -0,0 +1,51 @@
+// func Dcopy(N int, X []float64, incX int, Y []float64, incY int)
+TEXT ·Dcopy(SB), 7, $0
+	MOVQ	N+0(FP), CX
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+	MOVQ	Y_data+40(FP), DI
+	MOVQ	incY+64(FP), BX
+
+	// Check data bounaries
+	MOVQ	CX, BP
+	DECQ	BP
+	MOVQ	BP, DX
+	IMULQ	AX, BP	// BP = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	BP, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+48(FP)
+	JGE		panic
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $1
+	JNE	with_stride
+	CMPQ	BX, $1
+	JNE	with_stride
+
+	// Optimized copy for incX == incY == 1
+	REP; MOVSQ
+	RET
+
+with_stride:
+	// Setup strides
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+	SALQ	$3, BX	// BX = sizeof(float64) * incY
+
+	CMPQ	CX, $0
+	JE	end
+
+	loop:
+		MOVQ	(SI), DX
+		MOVQ	DX, (DI)
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+		DECQ	CX
+		JNE	loop
+
+end:
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 34 - 0
common/src/github.com/ziutek/blas/ddot.go

@@ -0,0 +1,34 @@
+package blas
+
+// Scalar product: X^T Y 
+func Ddot(N int, X []float64, incX int, Y []float64, incY int) float64
+
+func ddot(N int, X []float64, incX int, Y []float64, incY int) float64 {
+	var (
+		a, b, c, d float64
+		xi, yi     int
+	)
+	for ; N >= 4; N -= 4 {
+		a += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+
+		b += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+
+		c += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+
+		d += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+	}
+	for ; N > 0; N-- {
+		a += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+	}
+	return (b + c) + (d + a)
+}

+ 136 - 0
common/src/github.com/ziutek/blas/ddot_amd64.s

@@ -0,0 +1,136 @@
+// func Ddot(N int, X []float64, incX int, Y []float64, incY int) float64
+TEXT ·Ddot(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+	MOVQ	Y_data+40(FP), DI
+	MOVQ	incY+64(FP), BX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+48(FP)
+	JGE		panic
+
+	// Clear accumulators
+	XORPD	X0, X0
+	XORPD	X1, X1
+
+	// Setup strides
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+	SALQ	$3, BX	// BX = sizeof(float64) * incY
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $8
+	JNE	with_stride
+	CMPQ	BX, $8
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Multiply first two pairs
+		MOVUPD	(SI), X2
+		MOVUPD	(DI), X3
+		MULPD	X2, X3
+		// Multiply second two values
+		MOVUPD	16(SI), X4
+		MOVUPD	16(DI), X5
+		MULPD	X4, X5
+
+		// Update data pointers
+		ADDQ	$32, SI
+		ADDQ	$32, DI
+
+		// Accumulate the results of multiplications
+		ADDPD	X3, X0
+		ADDPD	X5, X1
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP hsum
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 16 * incX
+	SALQ	$1, DX 	// DX = 16 * incY
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Multiply first two pairs
+		MOVLPD	(SI), X2
+		MOVHPD	(SI)(AX*1), X2
+		MOVLPD	(DI), X3
+		MOVHPD	(DI)(BX*1), X3
+		MULPD	X2, X3
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Multiply second two pairs
+		MOVLPD	(SI), X4
+		MOVHPD	(SI)(AX*1), X4
+		MOVLPD	(DI), X5
+		MOVHPD	(DI)(BX*1), X5
+		MULPD	X4, X5
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Accumulate the results of multiplications
+		ADDPD	X3, X0
+		ADDPD	X5, X1
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+
+hsum:
+	// Summ intermediate results from SIMD operations
+	ADDPD	X0, X1
+	// Horizontal sum
+	MOVHLPS X1, X0
+	ADDSD	X1, X0
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Multiply one pair
+		MOVSD	(SI), X2
+		MULSD	(DI), X2
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX,	DI
+
+		// Accumulate the results of multiplication
+		ADDSD	X2, X0
+
+		DECQ	BP
+		JNE	loop
+
+end:
+	// Return the sum
+	MOVSD	X0, r+72(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 8 - 0
common/src/github.com/ziutek/blas/dgemv.go

@@ -0,0 +1,8 @@
+package blas
+
+// Performs: y = alpha * A * x + beta * y  or y = alpha * A^T * x + beta * y
+/*func Dgemv(order Order, TransA Transpose, N, M, KL, KU int,
+	alpha float64, A []float64, lda int, X []float64, incX int,
+	beta float64, Y []float64, incY int) {
+
+}*/

+ 31 - 0
common/src/github.com/ziutek/blas/dnrm2.go

@@ -0,0 +1,31 @@
+package blas
+
+import "math"
+
+// Euclidean norm: ||X||_2 = \sqrt {\sum X_i^2}
+func Dnrm2(N int, X []float64, incX int) float64
+
+func dnrm2(N int, X []float64, incX int) float64 {
+	var (
+		a, b, c, d float64
+		xi         int
+	)
+	for ; N >= 4; N -= 4 {
+		a += X[xi] * X[xi]
+		xi += incX
+
+		b += X[xi] * X[xi]
+		xi += incX
+
+		c += X[xi] * X[xi]
+		xi += incX
+
+		d += X[xi] * X[xi]
+		xi += incX
+	}
+	for ; N > 0; N-- {
+		a += X[xi] * X[xi]
+		xi += incX
+	}
+	return math.Sqrt((b + c) + (d + a))
+}

+ 115 - 0
common/src/github.com/ziutek/blas/dnrm2_amd64.s

@@ -0,0 +1,115 @@
+// func Dnrm2(N int, X []float64, incX int) float64
+TEXT ·Dnrm2(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X+8(FP), SI	// X.data
+	MOVQ	incX+32(FP), AX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+
+	// Clear accumulators
+	XORPD	X0, X0
+	XORPD	X1, X1
+
+	// Setup stride
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+
+	// Check that there ar 4 or more values for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 values to process
+
+	// Check if incX != 1
+	CMPQ	AX, $8
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Multiply first two values
+		MOVUPD	(SI), X2
+		MULPD	X2, X2
+		// Multiply second two values
+		MOVUPD	16(SI), X4
+		MULPD	X4, X4
+
+		// Update data pointer
+		ADDQ	$32, SI
+
+		// Accumulate the results of multiplications
+		ADDPD	X2, X0
+		ADDPD	X4, X1
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more values to process
+
+	JMP	hsum
+	
+with_stride:
+	// Setup long stride
+	MOVQ	AX, CX
+	SALQ	$1, CX 	// CX = 16 * incX
+
+	half_simd_loop:
+		// Square first two values
+		MOVLPD	(SI), X2
+		MOVHPD	(SI)(AX*1), X2
+		MULPD	X2, X2
+
+		// Update data pointer using long stride
+		ADDQ	CX, SI
+
+		// Square second two values
+		MOVLPD	(SI), X4
+		MOVHPD	(SI)(AX*1), X4
+		MULPD	X4, X4
+
+		// Update data pointer using long stride
+		ADDQ	CX, SI
+
+		// Accumulate the results of multiplications
+		ADDPD	X2, X0
+		ADDPD	X4, X1
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more values to process
+
+hsum:
+	// Summ intermediate results from SIMD operations
+	ADDPD	X0, X1
+	// Horizontal sum
+	MOVHLPS X1, X0
+	ADDSD	X1, X0
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+loop:
+	// Square one value
+	MOVSD	(SI), X2
+	MULSD	X2, X2
+
+	// Update data pointers
+	ADDQ	AX, SI
+
+	// Accumulate the results of multiplication
+	ADDSD	X2, X0
+
+	DECQ	BP
+	JNE	loop
+
+end:
+	// Return the square root of sum
+	SQRTSD	X0, X0
+	MOVSD	X0, r+40(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 2 - 0
common/src/github.com/ziutek/blas/doc.go

@@ -0,0 +1,2 @@
+// Go implementation of BLAS (Basic Linear Algebra Subprograms)
+package blas

+ 16 - 0
common/src/github.com/ziutek/blas/drot.go

@@ -0,0 +1,16 @@
+package blas
+
+// Apply a Givens rotation (X', Y') = (c X + s Y, c Y - s X) to the vectors X, Y
+func Drot(N int, X []float64, incX int, Y []float64, incY int, c, s float64)
+
// drot is the pure-Go fallback for Drot: it applies the Givens rotation
// (c, s) in place to the vector pair, X_i <- c*X_i + s*Y_i and
// Y_i <- c*Y_i - s*X_i, over N elements with strides incX and incY.
func drot(N int, X []float64, incX int, Y []float64, incY int, c, s float64) {
	for xi, yi := 0, 0; N > 0; N-- {
		oldX, oldY := X[xi], Y[yi]
		X[xi] = c*oldX + s*oldY
		Y[yi] = c*oldY - s*oldX
		xi, yi = xi+incX, yi+incY
	}
}

+ 131 - 0
common/src/github.com/ziutek/blas/drot_amd64.s

@@ -0,0 +1,131 @@
+// func Drot(N int, X []float64, incX int, Y []float64, incY int, c, s float64)
+TEXT ·Drot(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+	MOVQ	Y_data+40(FP), DI
+	MOVQ	incY+64(FP), BX
+	MOVSD	c+72(FP), X0
+	MOVSD	s+80(FP), X1
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+48(FP)
+	JGE		panic
+
+	// Setup strides
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+	SALQ	$3, BX	// BX = sizeof(float64) * incY
+
+	// Check that there are 2 or more pairs for SIMD calculations
+	SUBQ	$2, BP
+	JL		rest	// There are less than 2 pairs to process
+
+	// Setup two c in X0, and two s in X1
+	MOVLHPS	X0, X0 // (c, c)
+	MOVLHPS	X1, X1 // (s, s)
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $8
+	JNE	with_stride
+	CMPQ	BX, $8
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Load two pairs
+		MOVUPD	(SI), X2	// x
+		MOVUPD	(DI), X3	// y
+		MOVAPD	X2, X4	// x
+		MOVAPD	X3, X5	// y
+		// Givens rotation
+		MULPD	X0, X2	// c * x
+		MULPD	X1, X3	// s * y
+		MULPD	X1, X4	// s * x
+		MULPD	X0, X5	// c * y
+		ADDPD	X2, X3	// s * y + c * x
+		SUBPD	X4, X5	// c * y - s * x
+		// Save the result
+		MOVUPD	X3, (SI)
+		MOVUPD	X5, (DI)
+		// Update data pointers
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+
+		SUBQ	$2, BP
+		JGE		full_simd_loop	// There are 2 or more pairs to process
+
+	JMP	rest
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 16 * incX
+	SALQ	$1, DX 	// DX = 16 * incY
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load two pairs
+		MOVLPD	(SI), X2
+		MOVHPD	(SI)(AX*1), X2
+		MOVLPD	(DI), X3
+		MOVHPD	(DI)(BX*1), X3
+		MOVAPD	X2, X4	// x
+		MOVAPD	X3, X5	// y
+		// Givens rotation
+		MULPD	X0, X2	// c * x
+		MULPD	X1, X3	// s * y
+		MULPD	X1, X4	// s * x
+		MULPD	X0, X5	// c * y
+		ADDPD	X2, X3	// s * y + c * x
+		SUBPD	X4, X5	// c * y - s * x
+		// Save the result
+		MOVLPD	X3, (SI)
+		MOVHPD	X3, (SI)(AX*1)
+		MOVLPD	X5, (DI)
+		MOVHPD	X5, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$2, BP
+		JGE		half_simd_loop	// There are 2 or more pairs to process
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$2,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	// Load one pair
+	MOVSD	(SI), X2
+	MOVSD	(DI), X3
+
+	MOVSD	X2, X4	// x
+	MOVSD	X3, X5	// y
+	// Givens rotation
+	MULSD	X0, X2	// c * x
+	MULSD	X1, X3	// s * y
+	MULSD	X1, X4	// s * x
+	MULSD	X0, X5	// c * y
+	ADDSD	X2, X3	// s * y + c * x
+	SUBSD	X4, X5	// c * y - s * x
+	// Save the result
+	MOVSD	X3, (SI)
+	MOVSD	X5, (DI)
+
+end:
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 43 - 0
common/src/github.com/ziutek/blas/drotg.go

@@ -0,0 +1,43 @@
+package blas
+
+import "math"
+
+// Compute a Givens rotation (c,s) which zeroes the vector (a,b)
+func Drotg(a, b float64) (c, s, r, z float64)
+
// drotg is the pure-Go fallback for Drotg. It computes the parameters
// (c, s) of a Givens rotation that annihilates b in the vector (a, b),
// together with r = +/-||(a, b)|| (sign taken from the dominant
// component) and the reconstruction value z used by the reference BLAS.
// The arithmetic and its order are identical to the original.
func drotg(a, b float64) (c, s, r, z float64) {
	absA := math.Abs(a)
	absB := math.Abs(b)

	roe := b
	if absA > absB {
		roe = a
	}
	scale := absA + absB
	if scale == 0 {
		// (a, b) is the zero vector: identity rotation, r = z = 0.
		c = 1
		return
	}
	sa, sb := a/scale, b/scale
	r = scale * math.Sqrt(sa*sa+sb*sb)
	if roe < 0 {
		r = -r
	}
	c = a / r
	s = b / r
	z = 1
	switch {
	case absA > absB:
		z = s
	case c != 0:
		z /= c
	}
	return
}

+ 91 - 0
common/src/github.com/ziutek/blas/drotg_amd64.s

@@ -0,0 +1,91 @@
+// func Drotg(a, b float64) (c, s, r, z float64)
+TEXT ·Drotg(SB), 7, $0
+	MOVSD   a+0(FP), X0
+	MOVSD   b+8(FP), X1
+
+	// Setup mask for sign bit clear
+	PCMPEQL	X6, X6
+	PSRLQ	$1, X6
+
+	// Setup 0
+	XORPD	X8, X8
+
+	// Setup 1
+	PCMPEQL	X7, X7
+	PSLLQ	$54, X7
+	PSRLQ	$2, X7 
+
+	// Compute |a|, |b|
+	MOVSD	X0, X2
+	MOVSD	X1, X3
+	ANDPD	X6, X2
+	ANDPD	X6, X3
+
+	// Compute roe
+	MOVSD	X1, X4	// roe = b
+	UCOMISD	X3,	X2	// cmp(abs_b, abs_a)
+	JBE	roe_b
+
+	MOVSD	X0, X4	// roe = a
+
+roe_b:
+
+	// Compute scale
+	MOVSD	X2, X5
+	ADDSD	X3, X5
+
+	UCOMISD	X8,	X5	// cmp(0, scale)
+	JNE	scale_NE_zero
+	
+	MOVSD	X7, c+16(FP)	// c = 1
+	MOVHPD	X8,	s+24(FP)	// s = 0
+	MOVSD	X8,	r+32(FP)	// r = 0
+	MOVSD	X8, z+40(FP)	// z = 0
+	RET
+
+scale_NE_zero:
+
+	MOVLHPS	X5, X5	// (scale, scale)
+	MOVLHPS	X1, X0	// (a, b)
+	MOVAPD	X0, X1	// (a, b)
+	DIVPD	X5, X0	// (a/scale, b/scale)
+	MULPD	X0, X0	// ((a/scale)^2, (b/scale)^2)
+	MOVHLPS	X0, X6
+	ADDSD	X6, X0	// (a/scale)^2 + (b/scale)^2
+	SQRTSD	X0, X0
+	MULSD	X5, X0	// r
+	MOVLHPS	X0, X0	// (r, r)
+
+	UCOMISD	X8, X4 // cmp(0, roe)
+	JAE	roe_GE_zero
+
+	PCMPEQL X4, X4
+	PSLLQ	$63, X4 // Sign bit
+	XORPD	X4, X0	// (r, r) = (-r, -r)
+
+roe_GE_zero:
+
+	DIVPD	X0, X1			// (a/r, b/r) 	
+	MOVLPD	X1, c+16(FP)	// c = a/r
+	MOVHPD	X1,	s+24(FP)	// s = b/r
+	MOVSD	X0,	r+32(FP)
+
+	MOVSD	X7, X4	// z = 1
+
+	UCOMISD	X3,	X2	// cmp(abs_b, abs_a)
+	JBE	abs_a_LE_abs_b
+
+	MOVHLPS	X1, X4	// z = s
+
+	JMP end
+	
+abs_a_LE_abs_b:
+
+	UCOMISD	X8, X1
+	JE	end
+
+	DIVSD	X1, X4	// z /= c
+
+end:
+	MOVSD	X4, z+40(FP)
+	RET

+ 105 - 0
common/src/github.com/ziutek/blas/drotmg.go

@@ -0,0 +1,105 @@
+package blas
+
+import "math"
+
// DrotmgParam holds the flag and the packed 2x2 matrix H produced by
// Drotmg, in the encoding used by the reference BLAS drotm/drotmg:
// flag == -2 identity, flag == -1 full matrix, flag == 0 (h21, h12
// stored, unit diagonal), flag == 1 (h11, h22 stored, off-diagonal 1/-1).
type DrotmgParam struct {
	flag, h11, h21, h12, h22 float64
}

// Drotmg computes a modified Givens transformation (reference BLAS
// drotmg). Given the scaled vector (sqrt(d1)*x1, sqrt(d2)*y1) it builds
// the matrix H (packed in p) that zeroes the second component, and
// returns the updated scale factors rd1, rd2 and first component rx1.
func Drotmg(d1, d2, x1, y1 float64) (p DrotmgParam, rd1, rd2, rx1 float64) {
	var p1, p2, q1, q2, u float64

	// Rescaling thresholds from the reference implementation:
	// gamsq = gam^2, rgamsq = 1/gam^2.
	gam := 4096.0
	gamsq := 16777216.0
	rgamsq := 5.9604645e-8

	rd1 = d1
	rd2 = d2
	rx1 = x1

	if d1 < 0 {
		// Reference BLAS: negative d1 yields the error flag and
		// zeroed outputs (the original code left rd1/rd2/rx1 unset).
		p.flag = -1
		rd1 = 0
		rd2 = 0
		rx1 = 0
	} else {
		p2 = rd2 * y1
		if p2 == 0 {
			// Second component already zero: identity transformation.
			p.flag = -2
			return
		}
		p1 = rd1 * x1
		q2 = p2 * y1
		q1 = p1 * x1
		if math.Abs(q1) > math.Abs(q2) {
			p.h21 = -y1 / x1
			p.h12 = p2 / p1
			// u = det(H) = 1 - h12*h21 for the flag == 0 encoding.
			u = 1 - p.h12*p.h21
			if u > 0 {
				p.flag = 0
				rd1 /= u
				rd2 /= u
				rx1 /= u
			} else {
				// u <= 0 can only arise through rounding; follow the
				// reference and fall back to the error flag.
				p.flag = -1
				p.h11, p.h12, p.h21, p.h22 = 0, 0, 0, 0
				rd1, rd2, rx1 = 0, 0, 0
			}
		} else {
			if q2 < 0 {
				p.flag = -1
				rd1 = 0
				rd2 = 0
				rx1 = 0
			} else {
				p.flag = 1
				p.h11 = p1 / p2
				p.h22 = x1 / y1
				// u = det(H) = 1 + h11*h22 for the flag == 1 encoding.
				// (Was "1 + h11 + h22" — wrong determinant.)
				u = 1 + p.h11*p.h22
				rd1, rd2 = rd2/u, rd1/u
				// Reference: x1 is scaled UP by u here (was "y1 / u").
				rx1 = y1 * u
			}
		}
		// Rescale rd1 into [1/gam^2, gam^2], compensating in rx1 and
		// the first row of H.
		if rd1 != 0 {
			for rd1 <= rgamsq || rd1 >= gamsq {
				if p.flag == 0 {
					p.h11 = 1
					p.h22 = 1
					p.flag = -1
				} else {
					p.h21 = -1
					p.h12 = 1
					p.flag = -1
				}
				if rd1 <= rgamsq {
					rd1 *= gam * gam
					rx1 /= gam
					p.h11 /= gam
					p.h12 /= gam
				} else {
					rd1 /= gam * gam
					rx1 *= gam
					p.h11 *= gam
					p.h12 *= gam
				}
			}
		}
		// Rescale |rd2| into the same range, compensating in the
		// second row of H.
		if rd2 != 0 {
			for math.Abs(rd2) <= rgamsq || math.Abs(rd2) >= gamsq {
				if p.flag == 0 {
					p.h11 = 1
					p.h22 = 1
					p.flag = -1
				} else {
					p.h21 = -1
					p.h12 = 1
					p.flag = -1
				}
				if math.Abs(rd2) <= rgamsq {
					rd2 *= gam * gam
					p.h21 /= gam
					p.h22 /= gam
				} else {
					rd2 /= gam * gam
					p.h21 *= gam
					p.h22 *= gam
				}
			}
		}
	}
	return
}

+ 17 - 0
common/src/github.com/ziutek/blas/dscal.go

@@ -0,0 +1,17 @@
+package blas
+
+// Rescale the vector X by the multiplicative factor alpha
+func Dscal(N int, alpha float64, X []float64, incX int)
+
// dscal is the pure-Go fallback for Dscal: it multiplies N elements of
// X, taken with stride incX, by alpha in place. The loop is unrolled by
// two like the original.
func dscal(N int, alpha float64, X []float64, incX int) {
	i := 0
	for ; N > 1; N -= 2 {
		X[i] *= alpha
		i += incX
		X[i] *= alpha
		i += incX
	}
	if N == 1 {
		X[i] *= alpha
	}
}

+ 106 - 0
common/src/github.com/ziutek/blas/dscal_amd64.s

@@ -0,0 +1,106 @@
+// func Dscal(N int, alpha float64, X []float64, incX int)
+TEXT ·Dscal(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVSD	alpha+8(FP), X0
+	MOVQ	X_data+16(FP), SI
+	MOVQ	incX+40(FP), AX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	CMPQ	CX, X_len+24(FP)
+	JGE		panic
+
+	// Setup strides
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+
+	// Check that there are 4 or more values for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 values to process
+
+	// Setup two alphas in X0
+	MOVLHPS	X0, X0
+
+	// Check if incX != 1
+	CMPQ	AX, $8
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == 1)
+	full_simd_loop:
+		// Load first two values and scale
+		MOVUPD	(SI), X2
+		MULPD	X0, X2
+		// Load second two values and scale
+		MOVUPD	16(SI), X4
+		MULPD	X0, X4
+		// Save scaled values
+		MOVUPD	X2, (SI)
+		MOVUPD	X4, 16(SI)
+
+		// Update data pointers
+		ADDQ	$32, SI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP	rest
+
+with_stride:
+	// Setup long stride
+	MOVQ	AX, CX
+	SALQ	$1, CX 	// CX = 16 * incX
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load first two values and scale
+		MOVLPD	(SI), X2
+		MOVHPD	(SI)(AX*1), X2
+		MULPD	X0, X2
+		// Save scaled values
+		MOVLPD	X2, (SI)
+		MOVHPD	X2, (SI)(AX*1)
+
+		// Update data pointer using long stride
+		ADDQ	CX, SI
+
+		// Load second two values and scale
+		MOVLPD	(SI), X4
+		MOVHPD	(SI)(AX*1), X4
+		MULPD	X0, X4
+		// Save scaled values
+		MOVLPD	X4, (SI)
+		MOVHPD	X4, (SI)(AX*1)
+
+		// Update data pointer using long stride
+		ADDQ	CX, SI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Load from X and scale
+		MOVSD	(SI), X2
+		MULSD	X0, X2
+		// Save scaled value
+		MOVSD	X2, (SI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+
+		DECQ	BP
+		JNE	loop
+
+end:
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 13 - 0
common/src/github.com/ziutek/blas/dswap.go

@@ -0,0 +1,13 @@
+package blas
+
+// Exchange the elements of the vectors X and Y.
+func Dswap(N int, X []float64, incX int, Y []float64, incY int)
+
// dswap is the pure-Go fallback for Dswap: it exchanges N elements of X
// and Y, walking X with stride incX and Y with stride incY.
func dswap(N int, X []float64, incX int, Y []float64, incY int) {
	xi, yi := 0, 0
	for n := 0; n < N; n++ {
		t := X[xi]
		X[xi] = Y[yi]
		Y[yi] = t
		xi, yi = xi+incX, yi+incY
	}
}

+ 131 - 0
common/src/github.com/ziutek/blas/dswap_amd64.s

@@ -0,0 +1,131 @@
+// 6g -B: 1926 ns/op, 6a MOVUPD: 2397 ns/op, 6a MOVAPD: 474 ns/op
+// MOVUPD is very inefficient on my U4100. Can it work better on other CPU?
+
+// func Dswap(N int, X []float64, incX int, Y []float64, incY int)
+TEXT ·Dswap(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+	MOVQ	Y_data+40(FP), DI
+	MOVQ	incY+64(FP), BX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+48(FP)
+	JGE		panic
+
+	// Setup strides
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+	SALQ	$3, BX	// BX = sizeof(float64) * incY
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $8
+	JNE	with_stride
+	CMPQ	BX, $8
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Load four values from X
+		MOVUPD	(SI), X0
+		MOVUPD	16(SI), X1
+		// Load four values from Y
+		MOVUPD	(DI), X2
+		MOVUPD	16(DI), X3
+		// Save them
+		MOVUPD	X0, (DI)
+		MOVUPD	X1, 16(DI)
+		MOVUPD	X2, (SI)
+		MOVUPD	X3, 16(SI)
+
+		// Update data pointers
+		ADDQ	$32, SI
+		ADDQ	$32, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP rest
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 16 * incX
+	SALQ	$1, DX 	// DX = 16 * incY
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load two values from X
+		MOVSD	(SI), X0
+		MOVSD	(SI)(AX*1), X1
+		// Load two values from Y
+		MOVSD	(DI), X2
+		MOVSD	(DI)(BX*1), X3
+		// Save them
+		MOVSD	X0, (DI)
+		MOVSD	X1, (DI)(BX*1)
+		MOVSD	X2, (SI)
+		MOVSD	X3, (SI)(AX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load two values from X
+		MOVSD	(SI), X0
+		MOVSD	(SI)(AX*1), X1
+		// Load two values from Y
+		MOVSD	(DI), X2
+		MOVSD	(DI)(BX*1), X3
+		// Save them
+		MOVSD	X0, (DI)
+		MOVSD	X1, (DI)(BX*1)
+		MOVSD	X2, (SI)
+		MOVSD	X3, (SI)(AX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Load values from X and Y
+		MOVSD	(SI), X0
+		MOVSD	(DI), X1
+		// Save them
+		MOVSD	X0, (DI)
+		MOVSD	X1, (SI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX,	DI
+
+		DECQ	BP
+		JNE	loop
+
+end:
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 24 - 0
common/src/github.com/ziutek/blas/idamax.go

@@ -0,0 +1,24 @@
+package blas
+
+// Index of largest (absoulute) element of the vector X
+func Idamax(N int, X []float64, incX int) int
+
// idamax is the pure-Go fallback for Idamax: it returns the position
// (in element counts, 0-based) of the first element with the largest
// absolute value among the N strided elements of X; 0 when N <= 0.
func idamax(N int, X []float64, incX int) int {
	best := 0
	bestAbs := 0.0
	for n, xi := 0, 0; n < N; n, xi = n+1, xi+incX {
		v := X[xi]
		if v < 0 {
			v = -v
		}
		// Strict > keeps the earliest index on ties.
		if v > bestAbs {
			bestAbs = v
			best = n
		}
	}
	return best
}

+ 178 - 0
common/src/github.com/ziutek/blas/idamax_amd64-simd_broken

@@ -0,0 +1,178 @@
+// func Idamax(N int, X []float64, incX int) int
+TEXT ·Idamax(SB), 7, $0
+	MOVL	N+0(FP), BP
+	MOVQ	X+8(FP), SI	// X.data
+	MOVL	incX+24(FP), AX
+
+	// Check data bounaries
+	MOVL	BP, CX
+	DECL	CX
+	IMULL	AX, CX	// CX = incX * (N - 1)
+	CMPL	CX, X_len+16(FP)
+	JGE		panic
+
+	// Max fp value
+	XORPD	X0, X0	// X0 = {0, 0}
+	// Max index
+	PXOR	X1, X1	// X1 = {0, 0}
+	// Increment register
+	PCMPEQL X2, X2
+	PSRLQ	$63, X2	// X2 = {1, 1}
+	// Index register
+	MOVQ	X2, X3
+	PSLLDQ	$8, X3	// X3 = {0, 1}
+	// Mask for sign bit clear
+	PCMPEQL	X4, X4 
+	PSRLQ	$1, X4
+
+	// Setup stride
+	SALQ	$3, AX	// AX = sizeof(float64) * incX
+
+	// Check that there are 2 or more values for SIMD calculations
+	SUBQ	$2, BP
+	JL		rest	// There are less than 2 values to process
+
+	// Increment register
+	PSLLQ	$1, X2	// X2 = (2, 2)
+
+	// Check if incX != 1
+	CMPQ	AX, $8
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Load two values
+		MOVUPD	(SI), X5
+
+		// Clear sign on two values
+		ANDPD	X4, X5
+
+		// Update data pointer
+		ADDQ	$16, SI
+
+		// Compare first two values with max values
+		MOVAPD	X0, X6
+		CMPPD	X5, X6, 5 // NLT
+
+		// Clear previous max indexes
+		PAND	X6,	X1
+		// Select indexes of max values
+		PANDN	X3,	X6
+		// Update max indexes
+		POR		X6, X1
+
+		// Update max values
+		MAXPD	X5, X0
+
+		// Update indexes
+		PADDQ	X2, X3
+
+		SUBQ	$2, BP
+		JGE		full_simd_loop	// There are 2 or more values to process
+
+	JMP	hsum
+	
+with_stride:
+	// Setup long stride
+	MOVQ	AX, CX
+	SALQ	$1, CX 	// CX = 16 * incX
+
+	half_simd_loop:
+		// Load two values
+		MOVLPD	(SI), X5
+		MOVHPD	(SI)(AX*1), X5
+
+		// Clear sign on two values
+		ANDPD	X4, X5
+
+		// Update data pointer using long stride
+		ADDQ	CX, SI
+
+		// Compare first two values with max values
+		MOVAPD	X0, X6
+		CMPPD	X5, X6, 5	// NLT
+
+		// Clear previous max indexes
+		PAND	X6,	X1
+		// Select indexes of max values
+		PANDN	X3,	X6
+		// Update max indexes
+		POR		X6, X1
+
+		// Update max values
+		MAXPD	X5, X0
+
+		// Update indexes
+		PADDQ	X2, X3
+
+		SUBQ	$2, BP
+		JGE		half_simd_loop	// There are 2 or more values to process
+
+hsum:
+	// Increment register
+	//PSRLQ	$1, X2
+
+	// Uvectorize max values and indexes
+	MOVHLPS X0, X5
+	PSHUFL	$0xee, X1, X7
+
+	// BUG: we need to return less index if there are two identical max vals
+	// Compare max values
+	MOVSD	X0, X6
+	CMPSD	X5, X6, 5	// NLT
+
+	// Clear previous max indexes
+	PAND	X6,	X1
+	// Select indexes of max values
+	PANDN	X7,	X6
+	// Update max indexes
+	POR		X6, X1
+
+	// Update max value
+	MAXSD	X5, X0
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$2,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+//loop:
+	// Load value
+	MOVSD	(SI), X5
+	// Clear sign
+	ANDPD	X4, X5
+
+	// Update data pointers
+	//ADDQ	AX, SI
+
+	// Compare current value with max value
+	MOVSD	X0, X6
+	CMPSD	X5, X6, 5	// NLT
+
+	// Clear previous max indexes
+	PAND	X6,	X1
+
+	// Select indexes of max values
+	PANDN	X3,	X6
+	// Update max indexes
+	POR		X6, X1
+
+	// Update max value
+	//MAXSD	X5, X0
+
+	// Update indexes
+	//PADDQ	X2, X3
+
+	//DECQ	BP
+	//JNE	loop
+
+end:
+	// Return the max index
+	MOVSD	X1, r+32(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 58 - 0
common/src/github.com/ziutek/blas/idamax_amd64.s

@@ -0,0 +1,58 @@
// func Idamax(N int, X []float64, incX int) int
//
// Scalar amd64 implementation: scans N elements of X with stride incX
// and returns the 0-based position of the first element with the
// largest absolute value.
TEXT ·Idamax(SB), 7, $0
	MOVQ	N+0(FP), BP
	MOVQ	X+8(FP), SI	// X.data
	MOVQ	incX+32(FP), AX

	// Check data boundaries: the highest touched element index,
	// incX * (N - 1), must be below len(X), else panic.
	MOVQ	BP, CX
	DECQ	CX
	IMULQ	AX, CX	// CX = incX * (N - 1)
	CMPQ	CX, X_len+16(FP)
	JGE		panic

	// Max absolute value seen so far (0, so any |x| > 0 beats it)
	XORPD	X0, X0
	// Current element counter
	XORQ	DI, DI
	// Position of the max element
	XORQ	BX, BX

	// Mask with every bit but the sign bit set, used to compute |x|
	PCMPEQL	X4, X4 
	PSRLQ	$1, X4

	// Setup stride
	SALQ	$3, AX	// AX = sizeof(float64) * incX

loop:
	CMPQ	BP, DI
	JE	end

	// Load value
	MOVSD	(SI), X1
	// Clear sign of loaded value: X1 = |X1|
	ANDPD	X4, X1

	// Is loaded value less or equal to max value?
	// (strict greater-than keeps the earliest index on ties)
	UCOMISD	X0,	X1
	JBE	less_or_equal

	// Save max index and value
	MOVQ	DI, BX
	MOVSD	X1, X0

less_or_equal:
	// Advance data pointer and element counter
	ADDQ	AX, SI
	INCQ	DI
	JMP	loop

end:
	// Return the max index
	MOVQ	BX, r+40(FP)
	RET

panic:
	CALL	runtime·panicindex(SB)
	RET

+ 24 - 0
common/src/github.com/ziutek/blas/isamax.go

@@ -0,0 +1,24 @@
+package blas
+
+// Index of largest (absoulute) element of the vector X
+func Isamax(N int, X []float32, incX int) int
+
// isamax is the pure-Go fallback for Isamax: it returns the position
// (in element counts, 0-based) of the first element with the largest
// absolute value among the N strided elements of X; 0 when N <= 0.
func isamax(N int, X []float32, incX int) int {
	best := 0
	bestAbs := float32(0)
	for n, xi := 0, 0; n < N; n, xi = n+1, xi+incX {
		v := X[xi]
		if v < 0 {
			v = -v
		}
		// Strict > keeps the earliest index on ties.
		if v > bestAbs {
			bestAbs = v
			best = n
		}
	}
	return best
}

+ 58 - 0
common/src/github.com/ziutek/blas/isamax_amd64.s

@@ -0,0 +1,58 @@
+// func Isamax(N int, X []float32, incX int) int
+TEXT ·Isamax(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X+8(FP), SI	// X.data
+	MOVQ	incX+32(FP), AX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+
+	// Max value
+	XORPS	X0, X0
+	// Index
+	XORQ	DI, DI
+	// Max index
+	XORQ	BX, BX
+
+	// Mask for sign bit clear
+	PCMPEQW	X4, X4 
+	PSRLL	$1, X4
+
+	// Setup stride
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+
+loop:
+	CMPQ	BP, DI
+	JE	end
+
+	// Load value
+	MOVSS	(SI), X1
+	// Clear sign of loaded value
+	ANDPS	X4, X1
+
+	// Is loaded value less or equal to max value?
+	UCOMISS	X0,	X1
+	JBE	less_or_equal
+
+	// Save max index and value
+	MOVQ	DI, BX
+	MOVSS	X1, X0
+
+less_or_equal:
+	// Update data pointers
+	ADDQ	AX, SI
+	INCQ	DI
+	JMP	loop
+
+end:
+	// Return the max index
+	MOVQ	BX, r+40(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 393 - 0
common/src/github.com/ziutek/blas/s_test.go

@@ -0,0 +1,393 @@
+package blas
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+)
+
+func fabs(f float32) float32 {
+	return float32(math.Abs(float64(f)))
+}
+
+func fsqrt(f float32) float32 {
+	return float32(math.Sqrt(float64(f)))
+}
+
+func fCheck(t *testing.T, inc, N int, r, e float32) {
+	t.Logf("inc=%d N=%d : r=%f e=%f e-r=%g", inc, N, r, e, e-r)
+	if r != e {
+		t.FailNow()
+	}
+}
+
+var (
+	xf = []float32{1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7}
+	yf = []float32{1e6, 1e5, 1e4, 1e3, 100, 10, 1,
+		1e6, 1e5, 1e4, 1e3, 100, 10, 1}
+)
+
+func TestSdsdot(t *testing.T) {
+	for inc := 1; inc < 9; inc++ {
+		e := float64(0)
+		k := 0
+		for N := 0; N <= len(xf)/inc; N++ {
+			if N > 0 {
+				e += float64(xf[k]) * float64(yf[k])
+				k += inc
+			}
+			r := Sdsdot(N, 10, xf, inc, yf, inc)
+			fCheck(t, inc, N, r, float32(e+10))
+		}
+	}
+}
+
+func TestSdot(t *testing.T) {
+	for inc := 1; inc < 9; inc++ {
+		e := float32(0)
+		k := 0
+		for N := 0; N <= len(xf)/inc; N++ {
+			if N > 0 {
+				e += xf[k] * yf[k]
+				k += inc
+			}
+			r := Sdot(N, xf, inc, yf, inc)
+			fCheck(t, inc, N, r, e)
+		}
+	}
+}
+
+func TestSnrm2(t *testing.T) {
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xf)/inc; N++ {
+			e := fsqrt(Sdot(N, xf, inc, xf, inc))
+			r := Snrm2(N, xf, inc)
+			fCheck(t, inc, N, r, e)
+		}
+	}
+}
+
+func TestSasum(t *testing.T) {
+	xf := []float32{-1, -2, 3, -4, -5, -6, 7, -8, 9}
+	for inc := 1; inc < 9; inc++ {
+		e := float32(0)
+		k := 0
+		for N := 0; N <= len(xf)/inc; N++ {
+			if N > 0 {
+				e += fabs(xf[k])
+				k += inc
+			}
+			r := Sasum(N, xf, inc)
+			fCheck(t, inc, N, r, e)
+		}
+	}
+}
+
+func TestIsamax(t *testing.T) {
+	xf := []float32{-1, -2, 3, -4, -5, 0, -5, 0, 4, 2, 3, -1, 4, -2, -9, 0,
+		-1, 0, 0, 2, 2, -8, 2, 1, 0, 2, 4, 5, 8, 1, -7, 2, 9, 0, 1, -1}
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xf)/inc; N++ {
+			i_max := 0
+			x_max := float32(0.0)
+			for i := 0; i < N; i++ {
+				x := fabs(xf[i*inc])
+				if x > x_max {
+					x_max = x
+					i_max = i
+				}
+			}
+			r := Isamax(N, xf, inc)
+			iCheck(t, inc, N, r, i_max)
+		}
+	}
+}
+
+func TestSswap(t *testing.T) {
+	a := make([]float32, len(xf))
+	b := make([]float32, len(yf))
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xf)/inc; N++ {
+			copy(a, xf)
+			copy(b, yf)
+			Sswap(N, a, inc, b, inc)
+			for i := 0; i < len(a); i++ {
+				if i <= inc*(N-1) && i%inc == 0 {
+					if a[i] != yf[i] || b[i] != xf[i] {
+						t.Fatalf("inc=%d N=%d", inc, N)
+					}
+				} else {
+					if a[i] != xf[i] || b[i] != yf[i] {
+						t.Fatalf("inc=%d N=%d", inc, N)
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestScopy(t *testing.T) {
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xf)/inc; N++ {
+			a := make([]float32, len(xf))
+			Scopy(N, xf, inc, a, inc)
+			for i := 0; i < inc*N; i++ {
+				if i%inc == 0 {
+					if a[i] != xf[i] {
+						t.Fatalf("inc=%d N=%d i=%d r=%f e=%f", inc, N, i, a[i],
+							xf[i])
+					}
+				} else {
+					if a[i] != 0 {
+						t.Fatalf("inc=%d N=%d i=%d r=%f e=0", inc, N, i, a[i])
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestSaxpy(t *testing.T) {
+	for _, alpha := range []float32{0, -1, 1, 3} {
+		for inc := 1; inc < 9; inc++ {
+			for N := 0; N <= len(xf)/inc; N++ {
+				r := make([]float32, len(xf))
+				e := make([]float32, len(xf))
+				copy(r, xf)
+				copy(e, xf)
+				Saxpy(N, alpha, xf, inc, r, inc)
+				for i := 0; i < N; i++ {
+					e[i*inc] += alpha * xf[i*inc]
+				}
+				for i := 0; i < len(xf); i++ {
+					if r[i] != e[i] {
+						t.Fatalf("alpha=%f inc=%d N=%d i=%d r=%f e=%f",
+							alpha, inc, N, i, r[i], e[i])
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestSscal(t *testing.T) {
+	alpha := float32(3.0)
+	for inc := 1; inc < 9; inc++ {
+		for N := 0; N <= len(xf)/inc; N++ {
+			r := make([]float32, len(xf))
+			e := make([]float32, len(xf))
+			copy(r, xf)
+			copy(e, xf)
+			Sscal(N, alpha, r, inc)
+			for i := 0; i < N; i++ {
+				e[i*inc] = alpha * xf[i*inc]
+			}
+			for i := 0; i < len(xf); i++ {
+				if r[i] != e[i] {
+					t.Fatalf("inc=%d N=%d i=%d r=%f e=%f", inc, N, i, r[i],
+						e[i])
+				}
+			}
+		}
+	}
+}
+
+func fEq(a, b, p float32) bool {
+	eps := float32(math.SmallestNonzeroFloat32 * 2)
+	r := fabs(a) + fabs(b)
+	if r <= eps {
+		return true
+	}
+	return fabs(a-b)/r < p
+}
+
+// Reference implementation of Srotg
+func refSrotg(a, b float32) (c, s, r, z float32) {
+	roe := b
+	if fabs(a) > fabs(b) {
+		roe = a
+	}
+	scale := fabs(a) + fabs(b)
+	if scale == 0 {
+		c = 1
+	} else {
+		r = scale * fsqrt((a/scale)*(a/scale)+(b/scale)*(b/scale))
+		if math.Signbit(float64(roe)) {
+			r = -r
+		}
+		c = a / r
+		s = b / r
+		z = 1
+		if fabs(a) > fabs(b) {
+			z = s
+		}
+		if fabs(b) >= fabs(a) && c != 0 {
+			z = 1 / c
+		}
+	}
+	return
+}
+
+func TestSrotg(t *testing.T) {
+	vs := []struct{ a, b float32 }{
+		{0, 0}, {0, 1}, {0, -1},
+		{1, 0}, {1, 1}, {1, -1},
+		{-1, 0}, {-1, 1}, {-1, -1},
+		{2, 0}, {2, 1}, {2, -1},
+		{-2, 0}, {-2, 1}, {-2, -1},
+		{0, 2}, {1, 2}, {-1, 2},
+		{0, -2}, {1, -2}, {-1, 2},
+	}
+	for _, v := range vs {
+		c, s, _, _ := Srotg(v.a, v.b)
+		ec, es, _, _ := refSrotg(v.a, v.b)
+		if !fEq(c, ec, 1e-20) || !fEq(s, es, 1e-20) {
+			t.Fatalf("a=%g b=%g c=%g ec=%g s=%g es=%g", v.a, v.b, c, ec, s, es)
+		}
+	}
+}
+
+func TestSrot(t *testing.T) {
+	s2 := fsqrt(2)
+	vs := []struct{ c, s float32 }{
+		{0, 0}, {0, 1}, {0, -1}, {1, 0}, {-1, 0},
+		{s2, s2}, {s2, -s2}, {-s2, s2}, {-s2, -s2},
+	}
+	x := make([]float32, len(xf))
+	y := make([]float32, len(yf))
+	ex := make([]float32, len(xf))
+	ey := make([]float32, len(yf))
+	for _, v := range vs {
+		for inc := 1; inc < 9; inc++ {
+			for N := 0; N <= len(xf)/inc; N++ {
+				copy(x, xf)
+				copy(y, yf)
+				copy(ex, xf)
+				copy(ey, yf)
+				Sscal(N, v.c, ex, inc)          // ex *= c
+				Saxpy(N, v.s, y, inc, ex, inc)  // ex += s*y
+				Sscal(N, v.c, ey, inc)          // ey *= c
+				Saxpy(N, -v.s, x, inc, ey, inc) // ey += (-s)*x
+
+				// (x, y) = (c*x + s*y, c*y - s*x) 
+				Srot(N, x, inc, y, inc, v.c, v.s)
+
+				for i, _ := range x {
+					if !fEq(x[i], ex[i], 1e-7) || !fEq(y[i], ey[i], 1e-7) {
+						t.Fatalf(
+							"N=%d inc=%d c=%f s=%f i=%d x=%f ex=%f y=%f ey=%f",
+							N, inc, v.c, v.s, i, x[i], ex[i], y[i], ey[i],
+						)
+					}
+				}
+			}
+		}
+	}
+}
+
+var vf, wf []float32
+
+func init() {
+	vf = make([]float32, 514)
+	wf = make([]float32, len(vf))
+	for i := 0; i < len(vf); i++ {
+		vf[i] = rand.Float32()
+		wf[i] = rand.Float32()
+	}
+}
+
+func BenchmarkSdsdot(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Sdsdot(len(vf), 10, vf, 1, wf, 1)
+	}
+}
+
+func BenchmarkSdot(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Sdot(len(vf), vf, 1, wf, 1)
+	}
+}
+
+func BenchmarkSnrm2(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Snrm2(len(vf), vf, 1)
+	}
+}
+
+func BenchmarkSasum(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Sasum(len(vf), vf, 1)
+	}
+}
+
+func BenchmarkIsamax(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Isamax(len(vf), vf, 1)
+	}
+}
+
+func BenchmarkSswap(b *testing.B) {
+	b.StopTimer()
+	x := make([]float32, len(vd))
+	y := make([]float32, len(vd))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Sswap(len(x), x, 1, y, 1)
+	}
+}
+
+func BenchmarkScopy(b *testing.B) {
+	b.StopTimer()
+	y := make([]float32, len(vf))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Scopy(len(vf), vf, 1, y, 1)
+	}
+}
+
+func BenchmarkSaxpy(b *testing.B) {
+	b.StopTimer()
+	y := make([]float32, len(vf))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Saxpy(len(vf), -1.0, vf, 1, y, 1)
+	}
+}
+
+func BenchmarkSscal(b *testing.B) {
+	b.StopTimer()
+	y := make([]float32, len(vf))
+	copy(y, vf)
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Sscal(len(y), -1.0, y, 1)
+	}
+}
+
+func BenchmarkSrotg(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		Srotg(0, 0)
+		Srotg(0, 1)
+		Srotg(0, -1)
+		Srotg(1, 0)
+		Srotg(1, 1)
+		Srotg(1, -1)
+		Srotg(-1, 0)
+		Srotg(-1, 1)
+		Srotg(-1, -1)
+	}
+}
+
+func BenchmarkSrot(b *testing.B) {
+	b.StopTimer()
+	x := make([]float32, len(vf))
+	y := make([]float32, len(vf))
+	copy(x, vf)
+	copy(y, vf)
+	c := fsqrt(2)
+	s := c
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Srot(len(x), x, 1, y, 1, c, s)
+	}
+}

+ 20 - 0
common/src/github.com/ziutek/blas/sasum.go

@@ -0,0 +1,20 @@
+package blas
+
+// Absolute sum: \sum |X_i|
+func Sasum(N int, X []float32, incX int) float32
+
// sasum is the pure-Go fallback for Sasum: the sum of absolute values
// of N elements of X taken with stride incX.
func sasum(N int, X []float32, incX int) float32 {
	var sum float32
	for i := 0; N > 0; N-- {
		if v := X[i]; v < 0 {
			sum -= v
		} else {
			sum += v
		}
		i += incX
	}
	return sum
}

+ 124 - 0
common/src/github.com/ziutek/blas/sasum_amd64.s

@@ -0,0 +1,124 @@
+// func Sasum(N int, X []float32, incX int) float32
+TEXT ·Sasum(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+
+	// Clear accumulators
+	XORPS	X0, X0
+
+	// Setup mask for sign bit clear
+	PCMPEQW	X4, X4
+	PSRLL	$1, X4
+
+	// Setup strides
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $4
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Clear sign on all four values
+		MOVUPS	(SI), X1
+		ANDPS	X4, X1
+
+		// Update data pointer
+		ADDQ	$16, SI
+
+		// Accumulate the results of multiplications
+		ADDPS	X1, X0
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP hsum
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	SALQ	$1, CX 	// CX = 8 * incX
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load first two values
+		MOVSS	(SI), X1
+		MOVSS	(SI)(AX*1), X2
+
+		// Create half-vector
+		UNPCKLPS	X2, X1
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+
+		// Load second two values
+		MOVSS	(SI), X2
+		MOVSS	(SI)(AX*1), X3
+
+		// Create half-vector
+		UNPCKLPS	X3, X2
+
+		// Update data pointer using long strides
+		ADDQ	CX, SI
+		
+		// Create full-vector
+		MOVLHPS	X2, X1
+
+		// Clear sign on all four values
+		ANDPS	X4, X1
+
+		// Accumulate the result of multiplication
+		ADDPS	X1, X0
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more values to process
+
+hsum:
+	// Horizontal sum
+	MOVHLPS X0, X1
+	ADDPS	X0, X1
+	MOVSS	X1, X0
+	SHUFPS	$0xe1, X1, X1
+	ADDSS	X1, X0
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Multiply one value
+		MOVSS	(SI), X1
+		ANDPS	X4, X1
+
+		// Update data pointers
+		ADDQ	AX, SI
+
+		// Accumulate the results of multiplication
+		ADDSS	X1, X0
+
+		DECQ	BP
+		JNE		loop
+
+end:
+	// Return the square root of sum
+	MOVSS	X0, r+40(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 50 - 0
common/src/github.com/ziutek/blas/saxpy.go

@@ -0,0 +1,50 @@
+package blas
+
+// Compute the sum Y = \alpha X + Y for the vectors X and Y 
+func Saxpy(N int, alpha float32, X []float32, incX int, Y []float32, incY int)
+
+func saxpy(N int, alpha float32, X []float32, incX int, Y []float32, incY int) {
+	var xi, yi int
+	switch alpha {
+	case 0:
+	case 1:
+		for ; N >= 2; N -= 2 {
+			Y[yi] += X[xi]
+			xi += incX
+			yi += incY
+
+			Y[yi] += X[xi]
+			xi += incX
+			yi += incY
+		}
+		if N != 0 {
+			Y[yi] += alpha * X[xi]
+		}
+	case -1:
+		for ; N >= 2; N -= 2 {
+			Y[yi] -= X[xi]
+			xi += incX
+			yi += incY
+
+			Y[yi] -= X[xi]
+			xi += incX
+			yi += incY
+		}
+		if N != 0 {
+			Y[yi] -= X[xi]
+		}
+	default:
+		for ; N >= 2; N -= 2 {
+			Y[yi] += alpha * X[xi]
+			xi += incX
+			yi += incY
+
+			Y[yi] += alpha * X[xi]
+			xi += incX
+			yi += incY
+		}
+		if N != 0 {
+			Y[yi] += alpha * X[xi]
+		}
+	}
+}

+ 290 - 0
common/src/github.com/ziutek/blas/saxpy_amd64.s

@@ -0,0 +1,290 @@
+//func Saxpy(N int, alpha float32, X []float32, incX int, Y []float32, incY int)
+TEXT ·Saxpy(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVSS	alpha+8(FP), X0
+	MOVQ	X_data+16(FP), SI
+	MOVQ	incX+40(FP), AX
+	MOVQ	Y_data+48(FP), DI
+	MOVQ	incY+72(FP), BX
+
+	// Setup 0, 1, -1
+	PCMPEQW	X1, X1
+	PCMPEQW	X8, X8
+	XORPS	X7, X7	// 0
+	PSLLL	$25, X1
+	PSRLL	$2, X1	// 1
+	PSLLL	$31, X8
+	ORPS	X1, X8	// -1
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+24(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+56(FP)
+	JGE		panic
+
+	// Check that is there any work to do
+	UCOMISS	X0, X7	
+	JE	end	// alpha == 0
+
+	// Setup strides
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+	SALQ	$2, BX	// BX = sizeof(float32) * incY
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Setup four alphas in X0
+	SHUFPS	$0, X0, X0
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $4
+	JNE	with_stride
+	CMPQ	BX, $4
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	UCOMISS	X0, X1
+	JE	full_simd_loop_sum	// alpha == 1
+	UCOMISS	X0, X8
+	JE	full_simd_loop_diff	// alpha == -1
+
+	full_simd_loop:
+		// Load four pairs and scale
+		MOVUPS	(SI), X2
+		MOVUPS	(DI), X3
+		MULPS	X0, X2
+		// Save sum
+		ADDPS	X2, X3
+		MOVUPS	X3, (DI)
+
+		// Update data pointers
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+	JMP	rest
+
+	full_simd_loop_sum:
+		// Load four pairs
+		MOVUPS	(SI), X2
+		MOVUPS	(DI), X3
+		// Save a sum
+		ADDPS	X2, X3
+		MOVUPS	X3, (DI)
+
+		// Update data pointers
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop_sum	// There are 4 or more pairs to process
+	JMP	rest_sum
+
+	full_simd_loop_diff:
+		// Load four pairs
+		MOVUPS	(SI), X2
+		MOVUPS	(DI), X3
+		// Save a difference
+		SUBPS	X2, X3
+		MOVUPS	X3, (DI)
+
+		// Update data pointers
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop_diff	// There are 4 or more pairs to process
+	JMP	rest_diff
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 8 * incX
+	SALQ	$1, DX 	// DX = 8 * incY
+
+	UCOMISS	X0, X1
+	JE	half_simd_loop_sum	// alpha == 1
+	UCOMISS	X0, X8
+	JE	half_simd_loop_diff	// alpha == -1
+
+	half_simd_loop:
+		// Load first two pairs
+		MOVSS	(SI), X2
+		MOVSS	(SI)(AX*1), X4
+		MOVSS	(DI), X3
+		MOVSS	(DI)(BX*1), X5
+
+		// Create two half-vectors
+		UNPCKLPS	X4, X2
+		UNPCKLPS	X5, X3
+
+		// Save data pointer for destination
+		MOVQ	DI, R8
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load second two pairs
+		MOVSS	(SI), X4
+		MOVSS	(SI)(AX*1), X6
+		MOVSS	(DI), X5
+		MOVSS	(DI)(BX*1), X7
+
+		// Create two half-vectors
+		UNPCKLPS	X6, X4
+		UNPCKLPS	X7, X5
+
+		// Create two full-vectors
+		MOVLHPS	X4, X2
+		MOVLHPS	X5, X3
+
+		// Scale and sum
+		MULPS	X0, X2
+		ADDPS	X2, X3
+
+		// Unvectorize and save the result
+		MOVHLPS	X3, X5
+		MOVSS	X3, X4
+		MOVSS	X5, X6
+		SHUFPS  $0xe1, X3, X3
+		SHUFPS  $0xe1, X5, X5
+		MOVSS	X4, (R8)
+		MOVSS	X3, (R8)(BX*1)
+		MOVSS	X6, (DI)
+		MOVSS	X5, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+	JMP rest
+
+	half_simd_loop_sum:
+		MOVSS	(DI), X2
+		MOVSS	(DI)(BX*1), X3
+		ADDSS	(SI), X2
+		ADDSS	(SI)(AX*1), X3
+		MOVSS	X2, (DI)
+		MOVSS	X3, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		MOVSS	(DI), X4
+		MOVSS	(DI)(BX*1), X5
+		ADDSS	(SI), X4
+		ADDSS	(SI)(AX*1), X5
+		MOVSS	X4, (DI)
+		MOVSS	X5, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop_sum	// There are 4 or more pairs to process
+	JMP rest_sum
+
+	half_simd_loop_diff:
+		MOVSS	(DI), X2
+		MOVSS	(DI)(BX*1), X3
+		SUBSS	(SI), X2
+		SUBSS	(SI)(AX*1), X3
+		MOVSS	X2, (DI)
+		MOVSS	X3, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		MOVSS	(DI), X4
+		MOVSS	(DI)(BX*1), X5
+		SUBSS	(SI), X4
+		SUBSS	(SI)(AX*1), X5
+		MOVSS	X4, (DI)
+		MOVSS	X5, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop_diff	// There are 4 or more pairs to process
+	JMP rest_diff
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+	// Check that are there any value to process
+	JE	end
+	loop:
+		// Load from X and scale
+		MOVSS	(SI), X2
+		MULSS	X0, X2
+		// Save sum in Y
+		ADDSS	(DI), X2
+		MOVSS	X2, (DI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+
+		DECQ	BP
+		JNE	loop
+	RET
+
+rest_sum:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+	// Check that are there any value to process
+	JE	end
+	loop_sum:
+		// Load from X
+		MOVSS	(SI), X2
+		// Save sum in Y
+		ADDSS	(DI), X2
+		MOVSS	X2, (DI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+
+		DECQ	BP
+		JNE	loop_sum
+	RET
+
+rest_diff:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+	// Check that are there any value to process
+	JE	end
+	loop_diff:
+		// Load from Y 
+		MOVSS	(DI), X2
+		// Save sum in Y
+		SUBSS	(SI), X2
+		MOVSS	X2, (DI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+
+		DECQ	BP
+		JNE	loop_diff
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+end:
+	RET

+ 17 - 0
common/src/github.com/ziutek/blas/scopy.go

@@ -0,0 +1,17 @@
+package blas
+
+// Copy the  elements of the vectors X and Y.
+func Scopy(N int, X []float32, incX int, Y []float32, incY int)
+
+func scopy(N int, X []float32, incX int, Y []float32, incY int) {
+	if incX == 1 && incY == 1 {
+		copy(Y[:N], X[:N])
+		return
+	}
+	var xi, yi int
+	for ; N > 0; N-- {
+		Y[yi] = X[xi]
+		xi += incX
+		yi += incY
+	}
+}

+ 51 - 0
common/src/github.com/ziutek/blas/scopy_amd64.s

@@ -0,0 +1,51 @@
+// func Scopy(N int, X []float32, incX int, Y []float32, incY int)
+TEXT ·Scopy(SB), 7, $0
+	MOVQ	N+0(FP), CX
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+	MOVQ	Y_data+40(FP), DI
+	MOVQ	incY+64(FP), BX
+
+	// Check data bounaries
+	MOVQ	CX, BP
+	DECQ	BP
+	MOVQ	BP, DX
+	IMULQ	AX, BP	// BP = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	BP, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+48(FP)
+	JGE		panic
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $1
+	JNE	with_stride
+	CMPQ	BX, $1
+	JNE	with_stride
+
+	// Optimized copy for incX == incY == 1
+	REP; MOVSL
+	RET
+
+with_stride:
+	// Setup strides
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+	SALQ	$2, BX	// BX = sizeof(float32) * incY
+
+	CMPQ	CX, $0
+	JE	end
+
+	loop:
+		MOVL	(SI), DX
+		MOVL	DX, (DI)
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+		DECQ	CX
+		JNE	loop
+
+end:
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 34 - 0
common/src/github.com/ziutek/blas/sdot.go

@@ -0,0 +1,34 @@
+package blas
+
+// Scalar product: X^T Y
+func Sdot(N int, X []float32, incX int, Y []float32, incY int) float32
+
+func sdot(N int, X []float32, incX int, Y []float32, incY int) float32 {
+	var (
+		a, b, c, d float32
+		xi, yi     int
+	)
+	for ; N >= 4; N -= 4 {
+		a += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+
+		b += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+
+		c += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+
+		d += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+	}
+	for ; N > 0; N-- {
+		a += X[xi] * Y[yi]
+		xi += incX
+		yi += incY
+	}
+	return (b + c) + (d + a)
+}

+ 143 - 0
common/src/github.com/ziutek/blas/sdot_amd64.s

@@ -0,0 +1,143 @@
+// func Sdot(N int, X []float32, incX int, Y []float32, incY int) float32
+TEXT ·Sdot(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+	MOVQ	Y_data+40(FP), DI
+	MOVQ	incY+64(FP), BX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+48(FP)
+	JGE		panic
+
+	// Clear accumulators
+	XORPS	X0, X0
+
+	// Setup strides
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+	SALQ	$2, BX	// BX = sizeof(float32) * incY
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $4
+	JNE	with_stride
+	CMPQ	BX, $4
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Multiply four pairs
+		MOVUPS	(SI), X2
+		MOVUPS	(DI), X3
+		MULPS	X2, X3
+
+		// Update data pointers
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+
+		// Accumulate the results of multiplications
+		ADDPS	X3, X0
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP hsum
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 8 * incX
+	SALQ	$1, DX 	// DX = 8 * incY
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load first two pairs
+		MOVSS	(SI), X2
+		MOVSS	(SI)(AX*1), X4
+		MOVSS	(DI), X3
+		MOVSS	(DI)(BX*1), X5
+
+		// Create two half-vectors
+		UNPCKLPS	X4, X2
+		UNPCKLPS	X5, X3
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load second two pairs
+		MOVSS	(SI), X4
+		MOVSS	(SI)(AX*1), X6
+		MOVSS	(DI), X5
+		MOVSS	(DI)(BX*1), X7
+
+		// Create two half-vectors
+		UNPCKLPS	X6, X4
+		UNPCKLPS	X7, X5
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+		
+		// Create two full-vectors
+		MOVLHPS	X4, X2
+		MOVLHPS	X5, X3
+
+		// Multiply them
+		MULPS	X2, X3
+
+		// Accumulate the result of multiplication
+		ADDPS	X3, X0
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+
+hsum:
+	// Horizontal sum
+	MOVHLPS X0, X1
+	ADDPS	X0, X1
+	MOVSS	X1, X0
+	SHUFPS	$0xe1, X1, X1
+	ADDSS	X1, X0
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Multiply one pair
+		MOVSS	(SI), X1
+		MULSS	(DI), X1
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX,	DI
+
+		// Accumulate the results of multiplication
+		ADDSS	X1, X0
+
+		DECQ	BP
+		JNE	loop
+
+end:
+	// Return the sum
+	MOVSS	X0, r+72(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 34 - 0
common/src/github.com/ziutek/blas/sdsdot.go

@@ -0,0 +1,34 @@
+package blas
+
+// \alpha + X^T Y  computed using float64
+func Sdsdot(N int, alpha float32, X []float32, incX int, Y []float32, incY int) float32
+
+func sdsdot(N int, alpha float32, X []float32, incX int, Y []float32, incY int) float32 {
+	var (
+		a, b, c, d float64
+		xi, yi     int
+	)
+	for ; N >= 4; N -= 4 {
+		a += float64(X[xi]) * float64(Y[yi])
+		xi += incX
+		yi += incY
+
+		b += float64(X[xi]) * float64(Y[yi])
+		xi += incX
+		yi += incY
+
+		c += float64(X[xi]) * float64(Y[yi])
+		xi += incX
+		yi += incY
+
+		d += float64(X[xi]) * float64(Y[yi])
+		xi += incX
+		yi += incY
+	}
+	for ; N > 0; N-- {
+		a += float64(X[xi]) * float64(Y[yi])
+		xi += incX
+		yi += incY
+	}
+	return float32(float64(alpha) + (b + c) + (d + a))
+}

+ 168 - 0
common/src/github.com/ziutek/blas/sdsdot_amd64.s

@@ -0,0 +1,168 @@
+// func Sdsdot(N int, alpha float32, X []float32, incX int, Y []float32, incY int) float32
+TEXT ·Sdsdot(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+16(FP), SI
+	MOVQ	incX+40(FP), AX
+	MOVQ	Y_data+48(FP), DI
+	MOVQ	incY+72(FP), BX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+56(FP)
+	JGE		panic
+
+	// Clear accumulators
+	XORPD	X0, X0
+	XORPD	X1, X1
+
+	// Setup strides
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+	SALQ	$2, BX	// BX = sizeof(float32) * incY
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $4
+	JNE	with_stride
+	CMPQ	BX, $4
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Load four pairs
+		MOVUPS	(SI), X2
+		MOVUPS	(DI), X3
+
+		// Move two high pairs to low part of another registers
+		MOVHLPS	X2, X4
+		MOVHLPS	X3, X5
+		
+		// Convert to float64
+		CVTPS2PD X2, X2
+		CVTPS2PD X3, X3
+		CVTPS2PD X4, X4
+		CVTPS2PD X5, X5
+
+		// Multiply converted values
+		MULPD	X2, X3
+		MULPD	X4, X5
+
+		// Update data pointers
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+
+		// Accumulate the results of multiplications
+		ADDPD	X3, X0
+		ADDPD	X5, X1
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP hsum
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 16 * incX
+	SALQ	$1, DX 	// DX = 16 * incY
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load first two pairs
+		MOVSS		(SI), X2
+		MOVSS		(SI)(AX*1), X6
+		MOVSS		(DI), X3
+		MOVSS		(DI)(BX*1), X7
+
+		// Convert them to float64 and multiply
+		UNPCKLPS	X6, X2
+		UNPCKLPS	X7, X3
+		CVTPS2PD	X2, X2
+		CVTPS2PD	X3, X3
+		MULPD		X2, X3
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load second two pairs
+		MOVSS		(SI), X4
+		MOVSS		(SI)(AX*1), X6
+		MOVSS		(DI), X5
+		MOVSS		(DI)(BX*1), X7
+
+		// Convert them to float64 and multiply
+		UNPCKLPS	X6, X4
+		UNPCKLPS	X7, X5
+		CVTPS2PD	X4, X4
+		CVTPS2PD	X5, X5
+		MULPD		X4, X5
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Accumulate the results of multiplications
+		ADDPD	X3, X0
+		ADDPD	X5, X1
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+
+hsum:
+	// Summ intermediate results from SIMD operations
+	ADDPD	X0, X1
+	// Horizontal sum
+	MOVHLPS X1, X0
+	ADDSD	X1, X0
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Load one pair
+		MOVSS	(SI), X2
+		MOVSS	(DI), X3
+		
+		// Convert them to float64 and multiply
+		CVTSS2SD	X2, X2
+		CVTSS2SD	X3, X3
+		MULSD		X3, X2
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX,	DI
+
+		// Accumulate the results of multiplication
+		ADDSD	X2, X0
+
+		DECQ	BP
+		JNE	loop
+
+end:
+	// Add alpha
+	MOVSS		alpha+8(FP), X1
+	CVTSS2SD	X1, X1
+	ADDSD		X1, X0
+
+	// Convert result to float32 and return
+	CVTSD2SS	X0, X0	
+	MOVSS		X0, r+80(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 4 - 0
common/src/github.com/ziutek/blas/simd.txt

@@ -0,0 +1,4 @@
+Core 2 SSE latency/rate
+MULPD:       5/1
+ADDPD:       1/0.33
+ADDPD+MULPD: 5/1

+ 31 - 0
common/src/github.com/ziutek/blas/snrm2.go

@@ -0,0 +1,31 @@
+package blas
+
+import "math"
+
+// Euclidean norm: ||X||_2 = \sqrt {\sum X_i^2}
+func Snrm2(N int, X []float32, incX int) float32
+
+func snrm2(N int, X []float32, incX int) float32 {
+	var (
+		a, b, c, d float32
+		xi         int
+	)
+	for ; N >= 4; N -= 4 {
+		a += X[xi] * X[xi]
+		xi += incX
+
+		b += X[xi] * X[xi]
+		xi += incX
+
+		c += X[xi] * X[xi]
+		xi += incX
+
+		d += X[xi] * X[xi]
+		xi += incX
+	}
+	for ; N > 0; N-- {
+		a += X[xi] * X[xi]
+		xi += incX
+	}
+	return float32(math.Sqrt(float64((b + c) + (d + a))))
+}

+ 121 - 0
common/src/github.com/ziutek/blas/snrm2_amd64.s

@@ -0,0 +1,121 @@
+// func Snrm2(N int, X []float32, incX int) float32
+TEXT ·Snrm2(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+
+	// Clear accumulators
+	XORPS	X0, X0
+
+	// Setup strides
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $4
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Multiply four values
+		MOVUPS	(SI), X1
+		MULPS	X1, X1
+
+		// Update data pointer
+		ADDQ	$16, SI
+
+		// Accumulate the results of multiplications
+		ADDPS	X1, X0
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP hsum
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	SALQ	$1, CX 	// CX = 8 * incX
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load first two values
+		MOVSS	(SI), X1
+		MOVSS	(SI)(AX*1), X2
+
+		// Create half-vector
+		UNPCKLPS	X2, X1
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+
+		// Load second two values
+		MOVSS	(SI), X2
+		MOVSS	(SI)(AX*1), X3
+
+		// Create half-vector
+		UNPCKLPS	X3, X2
+
+		// Update data pointer using long strides
+		ADDQ	CX, SI
+		
+		// Create full-vector
+		MOVLHPS	X2, X1
+
+		// Square it
+		MULPS	X1, X1
+
+		// Accumulate the result of multiplication
+		ADDPS	X1, X0
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more values to process
+
+hsum:
+	// Horizontal sum
+	MOVHLPS X0, X1
+	ADDPS	X0, X1
+	MOVSS	X1, X0
+	SHUFPS	$0xe1, X1, X1
+	ADDSS	X1, X0
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Multiply one value
+		MOVSS	(SI), X1
+		MULSS	X1, X1
+
+		// Update data pointers
+		ADDQ	AX, SI
+
+		// Accumulate the results of multiplication
+		ADDSS	X1, X0
+
+		DECQ	BP
+		JNE		loop
+
+end:
+	// Return the square root of sum
+	SQRTSS	X0, X0
+	MOVSS	X0, r+40(FP)
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 16 - 0
common/src/github.com/ziutek/blas/srot.go

@@ -0,0 +1,16 @@
+package blas
+
+// Apply a Givens rotation (X', Y') = (c X + s Y, c Y - s X) to the vectors X, Y
+func Srot(N int, X []float32, incX int, Y []float32, incY int, c, s float32)
+
+func srot(N int, X []float32, incX int, Y []float32, incY int, c, s float32) {
+	var xi, yi int
+	for ; N > 0; N-- {
+		x := X[xi]
+		y := Y[yi]
+		X[xi] = c*x + s*y
+		Y[yi] = c*y - s*x
+		xi += incX
+		yi += incY
+	}
+}

+ 183 - 0
common/src/github.com/ziutek/blas/srot_amd64.s

@@ -0,0 +1,183 @@
+// func Srot(N int, X []float32, incX int, Y []float32, incY int, c, s float32)
+TEXT ·Srot(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+	MOVQ	Y_data+40(FP), DI
+	MOVQ	incY+64(FP), BX
+	MOVSS	c+72(FP), X0
+	MOVSS	s+76(FP), X1
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+48(FP)
+	JGE		panic
+
+	// Setup strides
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+	SALQ	$2, BX	// BX = sizeof(float32) * incY
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 2 pairs to process
+
+	// Setup four c in X0, and four s in X1
+	SHUFPS	$0,	X0, X0 // (c, c, c, c)
+	SHUFPS	$0,	X1, X1 // (s, s, s, s)
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $4
+	JNE	with_stride
+	CMPQ	BX, $4
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Load four pairs
+		MOVUPS	(SI), X2	// x
+		MOVUPS	(DI), X3	// y
+		MOVAPS	X2, X4	// x
+		MOVAPS	X3, X5	// y
+		// Givens rotation
+		MULPS	X0, X2	// c * x
+		MULPS	X1, X3	// s * y
+		MULPS	X1, X4	// s * x
+		MULPS	X0, X5	// c * y
+		ADDPS	X2, X3	// s * y + c * x
+		SUBPS	X4, X5	// c * y - s * x
+		// Save the result
+		MOVUPS	X3, (SI)
+		MOVUPS	X5, (DI)
+		// Update data pointers
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP	rest
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 8 * incX
+	SALQ	$1, DX 	// DX = 8 * incY
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load first two pairs
+		MOVSS	(SI), X2
+		MOVSS	(SI)(AX*1), X4
+		MOVSS	(DI), X3
+		MOVSS	(DI)(BX*1), X5
+
+		// Create two half-vectors
+		UNPCKLPS	X4, X2
+		UNPCKLPS	X5, X3
+	
+		// Save data pointers for destination
+		MOVQ	SI, R8
+		MOVQ	DI, R9
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load second two pairs
+		MOVSS	(SI), X4
+		MOVSS	(SI)(AX*1), X6
+		MOVSS	(DI), X5
+		MOVSS	(DI)(BX*1), X7
+
+		// Create two half-vectors
+		UNPCKLPS	X6, X4
+		UNPCKLPS	X7, X5
+
+		// Create two full-vectors
+		MOVLHPS	X4, X2
+		MOVLHPS	X5, X3
+		// Copy them for Y calculaction second
+		MOVAPS	X2, X4	// x
+		MOVAPS	X3, X5	// y
+
+		// Givens rotation
+		MULPS	X0, X2	// c * x
+		MULPS	X1, X3	// s * y
+		MULPS	X1, X4	// s * x
+		MULPS	X0, X5	// c * y
+		ADDPS	X2, X3	// s * y + c * x
+		SUBPS	X4, X5	// c * y - s * x
+
+		// Unvectorize and save the X
+		MOVHLPS	X3, X2
+		MOVSS	X3, X4
+		MOVSS	X2, X6
+		SHUFPS  $0xe1, X3, X3
+		SHUFPS  $0xe1, X2, X2
+		MOVSS	X4, (R8)
+		MOVSS	X3, (R8)(AX*1)
+		MOVSS	X6, (SI)
+		MOVSS	X2, (SI)(AX*1)
+
+		// Unvectorize and save the Y
+		MOVHLPS	X5, X2
+		MOVSS	X5, X4
+		MOVSS	X2, X6
+		SHUFPS  $0xe1, X5, X5
+		SHUFPS  $0xe1, X2, X2
+		MOVSS	X4, (R9)
+		MOVSS	X5, (R9)(BX*1)
+		MOVSS	X6, (DI)
+		MOVSS	X2, (DI)(BX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 2 or more pairs to process
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Load one pair
+		MOVSS	(SI), X2
+		MOVSS	(DI), X3
+	
+		MOVSS	X2, X4	// x
+		MOVSS	X3, X5	// y
+		// Givens rotation
+		MULSS	X0, X2	// c * x
+		MULSS	X1, X3	// s * y
+		MULSS	X1, X4	// s * x
+		MULSS	X0, X5	// c * y
+		ADDSS	X2, X3	// s * y + c * x
+		SUBSS	X4, X5	// c * y - s * x
+		// Save the result
+		MOVSS	X3, (SI)
+		MOVSS	X5, (DI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX, DI
+
+		DECQ	BP
+		JNE	loop
+
+end:
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 43 - 0
common/src/github.com/ziutek/blas/srotg.go

@@ -0,0 +1,43 @@
+package blas
+
+import "math"
+
+// Compute a Givens rotation (c,s) which zeroes the vector (a,b)
+func Srotg(a, b float32) (c, s, r, z float32)
+
+func srotg(a, b float32) (c, s, r, z float32) {
+	abs_a := a
+	if a < 0 {
+		abs_a = -a
+	}
+	abs_b := b
+	if b < 0 {
+		abs_b = -b
+	}
+	roe := b
+	if abs_a > abs_b {
+		roe = a
+	}
+	scale := abs_a + abs_b
+	if scale == 0 {
+		c = 1
+	} else {
+		sa := a / scale
+		sb := b / scale
+		r = scale * float32(math.Sqrt(float64(sa*sa+sb*sb)))
+		if roe < 0 {
+			r = -r
+		}
+		c = a / r
+		s = b / r
+		z = 1
+		if abs_a > abs_b {
+			z = s
+		} else {
+			if c != 0 {
+				z /= c
+			}
+		}
+	}
+	return
+}

+ 92 - 0
common/src/github.com/ziutek/blas/srotg_amd64.s

@@ -0,0 +1,92 @@
+// func Srotg(a, b float32) (c, s, r, z float32)
+TEXT ·Srotg(SB), 7, $0
+	MOVSS   a+0(FP), X0
+	MOVSS   b+4(FP), X1
+
+	// Setup mask for sign bit clear
+	PCMPEQW	X6, X6
+	PSRLL	$1, X6
+
+	// Setup 0
+	XORPS	X8, X8
+
+	// Setup 1
+	PCMPEQW	X7, X7
+	PSLLL	$25, X7
+	PSRLL	$2, X7 
+
+	// Compute |a|, |b|
+	MOVAPS	X0, X2
+	MOVAPS	X1, X3
+	ANDPS	X6, X2
+	ANDPS	X6, X3
+
+	// Compute roe
+	MOVSS	X1, X4	// roe = b
+	UCOMISS	X3,	X2	// cmp(abs_b, abs_a)
+	JBE	roe_b
+
+	MOVSS	X0, X4	// roe = a
+
+roe_b:
+
+	// Compute scale
+	MOVSS	X2, X5
+	ADDSS	X3, X5
+
+	UCOMISS	X8,	X5	// cmp(0, scale)
+	JNE	scale_NE_zero
+	
+	MOVSS	X7, c+8(FP)	// c = 1
+	MOVSS	X8,	s+12(FP)	// s = 0
+	MOVSS	X8,	r+16(FP)	// r = 0
+	MOVSS	X8, z+20(FP)	// z = 0
+	RET
+
+scale_NE_zero:
+
+	SHUFPS	$0, X5, X5	// (scale, scale, scale, scale)
+	MOVLHPS	X1, X0	// (a, b)
+	MOVAPS	X0, X1	// (a, b)
+	DIVPS	X5, X0	// (a/scale, b/scale)
+	MULPS	X0, X0	// ((a/scale)^2, (b/scale)^2)
+	MOVHLPS	X0, X6
+	ADDSS	X6, X0	// (a/scale)^2 + (b/scale)^2
+	SQRTSS	X0, X0
+	MULSS	X5, X0	// r
+	SHUFPS	$0, X0, X0	// (r, r, r, r)
+
+	UCOMISS	X8, X4 // cmp(0, roe)
+	JAE	roe_GE_zero
+
+	PCMPEQW X4, X4
+	PSLLL	$31, X4 // Sign bit
+	XORPS	X4, X0	// (r, r) = (-r, -r)
+
+roe_GE_zero:
+
+	DIVPS	X0, X1		// (a/r, b/r) 	
+	MOVSS	X1, c+8(FP)	// c = a/r
+	MOVHLPS	X1, X1
+	MOVSS	X1,	s+12(FP)	// s = b/r
+	MOVSS	X0,	r+16(FP)
+
+	MOVSS	X7, X4	// z = 1
+
+	UCOMISS	X3,	X2	// cmp(abs_b, abs_a)
+	JBE	abs_a_LE_abs_b
+
+	MOVSS	X1, X4	// z = s
+
+	JMP end
+	
+abs_a_LE_abs_b:
+
+	UCOMISS	X8, X1
+	JE	end
+
+	DIVSS	X1, X4	// z /= c
+
+end:
+	MOVSS	X4, z+20(FP)
+	RET

+ 17 - 0
common/src/github.com/ziutek/blas/sscal.go

@@ -0,0 +1,17 @@
+package blas
+
+// Rescale the vector X by the multiplicative factor alpha
+func Sscal(N int, alpha float32, X []float32, incX int)
+
+func sscal(N int, alpha float32, X []float32, incX int) {
+	var xi int
+	for ; N >= 2; N -= 2 {
+		X[xi] = alpha * X[xi]
+		xi += incX
+		X[xi] = alpha * X[xi]
+		xi += incX
+	}
+	if N != 0 {
+		X[xi] = alpha * X[xi]
+	}
+}

+ 118 - 0
common/src/github.com/ziutek/blas/sscal_amd64.s

@@ -0,0 +1,118 @@
+//func Sscal(N int, alpha float32, X []float32, incX int)
+TEXT ·Sscal(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVSS	alpha+8(FP), X0
+	MOVQ	X_data+16(FP), SI
+	MOVQ	incX+40(FP), AX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	CMPQ	CX, X_len+24(FP)
+	JGE		panic
+
+	// Setup stride
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 values to process
+
+	// Setup four alphas in X0
+	SHUFPS	$0, X0, X0
+
+	// Check if incX != 1
+	CMPQ	AX, $4
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == 1)
+	full_simd_loop:
+		// Load four values and scale
+		MOVUPS	(SI), X2
+		MULPS	X0, X2
+		// Save scaled values
+		MOVUPS	X2, (SI)
+
+		// Update data pointers
+		ADDQ	$16, SI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP	rest
+
+with_stride:
+	// Setup long stride
+	MOVQ	AX, CX
+	SALQ	$1, CX 	// CX = 8 * incX
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load first two values
+		MOVSS	(SI), X2
+		MOVSS	(SI)(AX*1), X4
+
+		// Create a half-vector
+		UNPCKLPS	X4, X2
+
+		// Save data pointer
+		MOVQ	SI, DI
+		// Update data pointer using long stride
+		ADDQ	CX, SI
+
+		// Load second two values
+		MOVSS	(SI), X4
+		MOVSS	(SI)(AX*1), X6
+
+		// Create a half-vector
+		UNPCKLPS	X6, X4
+
+		// Create a full-vector
+		MOVLHPS	X4, X2
+
+		// Scale the full-vector
+		MULPS	X0, X2
+
+		// Unvectorize and save the result
+		MOVHLPS	X2, X3
+		MOVSS	X2, X4
+		MOVSS	X3, X5
+		SHUFPS  $0xe1, X2, X2
+		SHUFPS  $0xe1, X3, X3
+		MOVSS	X4, (DI)
+		MOVSS	X2, (DI)(AX*1)
+		MOVSS	X5, (SI)
+		MOVSS	X3, (SI)(AX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Load from X and save scaled
+		MOVSS	(SI), X2
+		MULSS	X0, X2
+		MOVSS	X2, (SI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+
+		DECQ	BP
+		JNE	loop
+
+end:
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 13 - 0
common/src/github.com/ziutek/blas/sswap.go

@@ -0,0 +1,13 @@
+package blas
+
+// Exchange the elements of the vectors X and Y.
+func Sswap(N int, X []float32, incX int, Y []float32, incY int)
+
+func sswap(N int, X []float32, incX int, Y []float32, incY int) {
+	var xi, yi int
+	for ; N > 0; N-- {
+		X[xi], Y[yi] = Y[yi], X[xi]
+		xi += incX
+		yi += incY
+	}
+}

+ 124 - 0
common/src/github.com/ziutek/blas/sswap_amd64.s

@@ -0,0 +1,124 @@
+// func Sswap(N int, X []float32, incX int, Y []float32, incY int)
+TEXT ·Sswap(SB), 7, $0
+	MOVQ	N+0(FP), BP
+	MOVQ	X_data+8(FP), SI
+	MOVQ	incX+32(FP), AX
+	MOVQ	Y_data+40(FP), DI
+	MOVQ	incY+64(FP), BX
+
+	// Check data bounaries
+	MOVQ	BP, CX
+	DECQ	CX
+	MOVQ	CX, DX
+	IMULQ	AX, CX	// CX = incX * (N - 1)
+	IMULQ	BX, DX	// DX = incY * (N - 1)
+	CMPQ	CX, X_len+16(FP)
+	JGE		panic
+	CMPQ	DX, Y_len+48(FP)
+	JGE		panic
+
+	// Setup strides
+	SALQ	$2, AX	// AX = sizeof(float32) * incX
+	SALQ	$2, BX	// BX = sizeof(float32) * incY
+
+	// Check that there are 4 or more pairs for SIMD calculations
+	SUBQ	$4, BP
+	JL		rest	// There are less than 4 pairs to process
+
+	// Check if incX != 1 or incY != 1
+	CMPQ	AX, $4
+	JNE	with_stride
+	CMPQ	BX, $4
+	JNE	with_stride
+
+	// Fully optimized loop (for incX == incY == 1)
+	full_simd_loop:
+		// Load four values from X
+		MOVUPS	(SI), X0
+		// Load four values from Y
+		MOVUPS	(DI), X2
+		// Save them
+		MOVUPS	X0, (DI)
+		MOVUPS	X2, (SI)
+
+		// Update data pointers
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+
+		SUBQ	$4, BP
+		JGE		full_simd_loop	// There are 4 or more pairs to process
+
+	JMP rest
+
+with_stride:
+	// Setup long strides
+	MOVQ	AX, CX
+	MOVQ	BX, DX
+	SALQ	$1, CX 	// CX = 8 * incX
+	SALQ	$1, DX 	// DX = 8 * incY
+
+	// Partially optimized loop
+	half_simd_loop:
+		// Load two values from X
+		MOVSS	(SI), X0
+		MOVSS	(SI)(AX*1), X1
+		// Load two values from Y
+		MOVSS	(DI), X2
+		MOVSS	(DI)(BX*1), X3
+		// Save them
+		MOVSS	X0, (DI)
+		MOVSS	X1, (DI)(BX*1)
+		MOVSS	X2, (SI)
+		MOVSS	X3, (SI)(AX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		// Load two values from X
+		MOVSS	(SI), X0
+		MOVSS	(SI)(AX*1), X1
+		// Load two values from Y
+		MOVSS	(DI), X2
+		MOVSS	(DI)(BX*1), X3
+		// Save them
+		MOVSS	X0, (DI)
+		MOVSS	X1, (DI)(BX*1)
+		MOVSS	X2, (SI)
+		MOVSS	X3, (SI)(AX*1)
+
+		// Update data pointers using long strides
+		ADDQ	CX, SI
+		ADDQ	DX, DI
+
+		SUBQ	$4, BP
+		JGE		half_simd_loop	// There are 4 or more pairs to process
+
+rest:
+	// Undo last SUBQ
+	ADDQ	$4,	BP
+
+	// Check that are there any value to process
+	JE	end
+
+	loop:
+		// Load values from X and Y
+		MOVSS	(SI), X0
+		MOVSS	(DI), X1
+		// Save them
+		MOVSS	X0, (DI)
+		MOVSS	X1, (SI)
+
+		// Update data pointers
+		ADDQ	AX, SI
+		ADDQ	BX,	DI
+
+		DECQ	BP
+		JNE	loop
+
+end:
+	RET
+
+panic:
+	CALL	runtime·panicindex(SB)
+	RET

+ 15 - 0
common/src/github.com/ziutek/blas/stubs.bash

@@ -0,0 +1,15 @@
+#!/bin/bash
+
+dst() {
+	p=${2:0:2}
+	 echo ${p,,}${2:2:-1}
+}
+
+>stubs_386.s
+>stubs_arm.s
+
+grep -h TEXT *_amd64.s |while read line; do
+	d=$(dst $line)
+	echo -e "$line\n\tJMP\t$d" >>stubs_386.s
+	echo -e "$line\n\tB\t$d" >>stubs_arm.s
+done

+ 42 - 0
common/src/github.com/ziutek/blas/stubs_386.s

@@ -0,0 +1,42 @@
+TEXT ·Dasum(SB), 7, $0
+	JMP	·dasum(SB)
+TEXT ·Daxpy(SB), 7, $0
+	JMP	·daxpy(SB)
+TEXT ·Dcopy(SB), 7, $0
+	JMP	·dcopy(SB)
+TEXT ·Ddot(SB), 7, $0
+	JMP	·ddot(SB)
+TEXT ·Dnrm2(SB), 7, $0
+	JMP	·dnrm2(SB)
+TEXT ·Drot(SB), 7, $0
+	JMP	·drot(SB)
+TEXT ·Drotg(SB), 7, $0
+	JMP	·drotg(SB)
+TEXT ·Dscal(SB), 7, $0
+	JMP	·dscal(SB)
+TEXT ·Dswap(SB), 7, $0
+	JMP	·dswap(SB)
+TEXT ·Idamax(SB), 7, $0
+	JMP	·idamax(SB)
+TEXT ·Isamax(SB), 7, $0
+	JMP	·isamax(SB)
+TEXT ·Sasum(SB), 7, $0
+	JMP	·sasum(SB)
+TEXT ·Saxpy(SB), 7, $0
+	JMP	·saxpy(SB)
+TEXT ·Scopy(SB), 7, $0
+	JMP	·scopy(SB)
+TEXT ·Sdot(SB), 7, $0
+	JMP	·sdot(SB)
+TEXT ·Sdsdot(SB), 7, $0
+	JMP	·sdsdot(SB)
+TEXT ·Snrm2(SB), 7, $0
+	JMP	·snrm2(SB)
+TEXT ·Srot(SB), 7, $0
+	JMP	·srot(SB)
+TEXT ·Srotg(SB), 7, $0
+	JMP	·srotg(SB)
+TEXT ·Sscal(SB), 7, $0
+	JMP	·sscal(SB)
+TEXT ·Sswap(SB), 7, $0
+	JMP	·sswap(SB)

+ 42 - 0
common/src/github.com/ziutek/blas/stubs_arm.s

@@ -0,0 +1,42 @@
+TEXT ·Dasum(SB), 7, $0
+	B	·dasum(SB)
+TEXT ·Daxpy(SB), 7, $0
+	B	·daxpy(SB)
+TEXT ·Dcopy(SB), 7, $0
+	B	·dcopy(SB)
+TEXT ·Ddot(SB), 7, $0
+	B	·ddot(SB)
+TEXT ·Dnrm2(SB), 7, $0
+	B	·dnrm2(SB)
+TEXT ·Drot(SB), 7, $0
+	B	·drot(SB)
+TEXT ·Drotg(SB), 7, $0
+	B	·drotg(SB)
+TEXT ·Dscal(SB), 7, $0
+	B	·dscal(SB)
+TEXT ·Dswap(SB), 7, $0
+	B	·dswap(SB)
+TEXT ·Idamax(SB), 7, $0
+	B	·idamax(SB)
+TEXT ·Isamax(SB), 7, $0
+	B	·isamax(SB)
+TEXT ·Sasum(SB), 7, $0
+	B	·sasum(SB)
+TEXT ·Saxpy(SB), 7, $0
+	B	·saxpy(SB)
+TEXT ·Scopy(SB), 7, $0
+	B	·scopy(SB)
+TEXT ·Sdot(SB), 7, $0
+	B	·sdot(SB)
+TEXT ·Sdsdot(SB), 7, $0
+	B	·sdsdot(SB)
+TEXT ·Snrm2(SB), 7, $0
+	B	·snrm2(SB)
+TEXT ·Srot(SB), 7, $0
+	B	·srot(SB)
+TEXT ·Srotg(SB), 7, $0
+	B	·srotg(SB)
+TEXT ·Sscal(SB), 7, $0
+	B	·sscal(SB)
+TEXT ·Sswap(SB), 7, $0
+	B	·sswap(SB)