crypto/bn256: full switchover to cloudflare's code (#16301)

* crypto/bn256: full switchover to cloudflare's code * crypto/bn256: only use cloudflare for optimized architectures * crypto/bn256: upstream fallback for non-optimized code * .travis, build: drop support for Go 1.8 (need type aliases) * crypto/bn256/cloudflare: enable curve mul lattice optimization
2018-03-20 01:13:54 +09:00 · 2018-03-20 01:13:54 +09:00 · 1203c6a237
commit 1203c6a237
parent 0965761a45
22 changed files with 783 additions and 174 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -3,17 +3,6 @@ go_import_path: github.com/ethereum/go-ethereum
 sudo: false
 matrix:
  include:
    - os: linux
      dist: trusty
      sudo: required
      go: 1.8.x
      script:
        - sudo modprobe fuse
        - sudo chmod 666 /dev/fuse
        - sudo chown root:$USER /etc/fuse.conf
        - go run build/ci.go install
        - go run build/ci.go test -coverage
    - os: linux
      dist: trusty
      sudo: required
--- a/build/ci.go
+++ b/build/ci.go
@ -182,13 +182,13 @@ func doInstall(cmdline []string) {
 	// Check Go version. People regularly open issues about compilation
 	// failure with outdated Go. This should save them the trouble.
 	if !strings.Contains(runtime.Version(), "devel") {
-		// Figure out the minor version number since we can't textually compare (1.10 < 1.8)
+		// Figure out the minor version number since we can't textually compare (1.10 < 1.9)
 		var minor int
 		fmt.Sscanf(strings.TrimPrefix(runtime.Version(), "go1."), "%d", &minor)
-		if minor < 8 {
+		if minor < 9 {
 			log.Println("You have Go version", runtime.Version())
-			log.Println("go-ethereum requires at least Go version 1.8 and cannot")
+			log.Println("go-ethereum requires at least Go version 1.9 and cannot")
 			log.Println("be compiled with an earlier version. Please upgrade your Go installation.")
 			os.Exit(1)
 		}
@ -262,16 +262,6 @@ func goTool(subcmd string, args ...string) *exec.Cmd {
 func goToolArch(arch string, cc string, subcmd string, args ...string) *exec.Cmd {
 	cmd := build.GoTool(subcmd, args...)
 	if subcmd == "build" || subcmd == "install" || subcmd == "test" {
 		// Go CGO has a Windows linker error prior to 1.8 (https://github.com/golang/go/issues/8756).
 		// Work around issue by allowing multiple definitions for <1.8 builds.
 		var minor int
 		fmt.Sscanf(strings.TrimPrefix(runtime.Version(), "go1."), "%d", &minor)
 		if runtime.GOOS == "windows" && minor < 8 {
 			cmd.Args = append(cmd.Args, []string{"-ldflags", "-extldflags -Wl,--allow-multiple-definition"}...)
 		}
 	}
 	cmd.Env = []string{"GOPATH=" + build.GOPATH()}
 	if arch == "" || arch == runtime.GOARCH {
 		cmd.Env = append(cmd.Env, "GOBIN="+GOBIN)
--- a/crypto/bn256/bn256_other.go
+++ b/crypto/bn256/bn256_other.go
@ -14,50 +14,22 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
-// +build !amd64 appengine gccgo
+// +build amd64 arm64
 // Package bn256 implements the Optimal Ate pairing over a 256-bit Barreto-Naehrig curve.
 package bn256
-import (
+import "github.com/ethereum/go-ethereum/crypto/bn256/cloudflare"
 	"math/big"
 	"github.com/ethereum/go-ethereum/crypto/bn256/google"
 )
 // G1 is an abstract cyclic group. The zero value is suitable for use as the
 // output of an operation, but cannot be used as an input.
-type G1 struct {
+type G1 = bn256.G1
 	bn256.G1
 }
 // Add sets e to a+b and then returns e.
 func (e *G1) Add(a, b *G1) *G1 {
 	e.G1.Add(&a.G1, &b.G1)
 	return e
 }
 // ScalarMult sets e to a*k and then returns e.
 func (e *G1) ScalarMult(a *G1, k *big.Int) *G1 {
 	e.G1.ScalarMult(&a.G1, k)
 	return e
 }
 // G2 is an abstract cyclic group. The zero value is suitable for use as the
 // output of an operation, but cannot be used as an input.
-type G2 struct {
+type G2 = bn256.G2
 	bn256.G2
 }
 // PairingCheck calculates the Optimal Ate pairing for a set of points.
 func PairingCheck(a []*G1, b []*G2) bool {
-	as := make([]*bn256.G1, len(a))
+	return bn256.PairingCheck(a, b)
 	for i, p := range a {
 		as[i] = &p.G1
 	}
 	bs := make([]*bn256.G2, len(b))
 	for i, p := range b {
 		bs[i] = &p.G2
 	}
 	return bn256.PairingCheck(as, bs)
 }
--- a/crypto/bn256/bn256_fuzz.go
+++ b/crypto/bn256/bn256_fuzz.go
@ -0,0 +1,138 @@
 // Copyright 2018 The go-ethereum Authors
 // This file is part of the go-ethereum library.
 //
 // The go-ethereum library is free software: you can redistribute it and/or modify
 // it under the terms of the GNU Lesser General Public License as published by
 // the Free Software Foundation, either version 3 of the License, or
 // (at your option) any later version.
 //
 // The go-ethereum library is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 // GNU Lesser General Public License for more details.
 //
 // You should have received a copy of the GNU Lesser General Public License
 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
 // +build gofuzz
 package bn256
 import (
 	"bytes"
 	"math/big"
 	cloudflare "github.com/ethereum/go-ethereum/crypto/bn256/cloudflare"
 	google "github.com/ethereum/go-ethereum/crypto/bn256/google"
 )
 // FuzzAdd fuzzes bn256 point addition, cross-checking the Cloudflare library
 // against the Google one. It panics as soon as the two disagree and returns 0
 // in every other case.
 func FuzzAdd(data []byte) int {
 	// The input has to hold exactly two 64 byte encoded points
 	if len(data) != 128 {
 		return 0
 	}
 	first, second := data[:64], data[64:]
 	// Decode the first operand with both libraries; they must agree on validity
 	xc, xg := new(cloudflare.G1), new(google.G1)
 	_, errc := xc.Unmarshal(first)
 	_, errg := xg.Unmarshal(first)
 	switch {
 	case (errc == nil) != (errg == nil):
 		panic("parse mismatch")
 	case errc != nil:
 		return 0
 	}
 	// Decode the second operand under the same rules
 	yc, yg := new(cloudflare.G1), new(google.G1)
 	_, errc = yc.Unmarshal(second)
 	_, errg = yg.Unmarshal(second)
 	switch {
 	case (errc == nil) != (errg == nil):
 		panic("parse mismatch")
 	case errc != nil:
 		return 0
 	}
 	// Add the points in each library and compare the serialized sums
 	sumc := new(cloudflare.G1)
 	sumc.Add(xc, yc)
 	sumg := new(google.G1)
 	sumg.Add(xg, yg)
 	if encc, encg := sumc.Marshal(), sumg.Marshal(); !bytes.Equal(encc, encg) {
 		panic("add mismatch")
 	}
 	return 0
 }
 // FuzzMul fuzzes bn256 scalar multiplication, cross-checking the Cloudflare
 // library against the Google one. It panics as soon as the two disagree and
 // returns 0 in every other case.
 func FuzzMul(data []byte) int {
 	// The input has to hold a 64 byte encoded point followed by a 32 byte scalar
 	if len(data) != 96 {
 		return 0
 	}
 	point, scalar := data[:64], data[64:]
 	// Decode the point with both libraries; they must agree on validity
 	pc, pg := new(cloudflare.G1), new(google.G1)
 	_, errc := pc.Unmarshal(point)
 	_, errg := pg.Unmarshal(point)
 	switch {
 	case (errc == nil) != (errg == nil):
 		panic("parse mismatch")
 	case errc != nil:
 		return 0
 	}
 	// Multiply the point by the scalar in each library (each call gets its own
 	// big.Int) and compare the serialized products
 	prodc := new(cloudflare.G1)
 	prodc.ScalarMult(pc, new(big.Int).SetBytes(scalar))
 	prodg := new(google.G1)
 	prodg.ScalarMult(pg, new(big.Int).SetBytes(scalar))
 	if encc, encg := prodc.Marshal(), prodg.Marshal(); !bytes.Equal(encc, encg) {
 		panic("scalar mul mismatch")
 	}
 	return 0
 }
 // FuzzPair fuzzes the bn256 pairing check, cross-checking the Cloudflare
 // library against the Google one. It panics as soon as the two disagree and
 // returns 0 in every other case.
 func FuzzPair(data []byte) int {
 	// The input has to hold a 64 byte curve point followed by a 128 byte twist point
 	if len(data) != 192 {
 		return 0
 	}
 	curve, twist := data[:64], data[64:]
 	// Decode the curve point with both libraries; they must agree on validity
 	pc, pg := new(cloudflare.G1), new(google.G1)
 	_, errc := pc.Unmarshal(curve)
 	_, errg := pg.Unmarshal(curve)
 	switch {
 	case (errc == nil) != (errg == nil):
 		panic("parse mismatch")
 	case errc != nil:
 		return 0
 	}
 	// Decode the twist point under the same rules
 	tc, tg := new(cloudflare.G2), new(google.G2)
 	_, errc = tc.Unmarshal(twist)
 	_, errg = tg.Unmarshal(twist)
 	switch {
 	case (errc == nil) != (errg == nil):
 		panic("parse mismatch")
 	case errc != nil:
 		return 0
 	}
 	// Run the pairing check in each library and ensure they reach the same verdict
 	okc := cloudflare.PairingCheck([]*cloudflare.G1{pc}, []*cloudflare.G2{tc})
 	okg := google.PairingCheck([]*google.G1{pg}, []*google.G2{tg})
 	if okc != okg {
 		panic("pair mismatch")
 	}
 	return 0
 }
--- a/crypto/bn256/bn256_amd64.go
+++ b/crypto/bn256/bn256_amd64.go
@ -14,50 +14,22 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
-// +build amd64,!appengine,!gccgo
+// +build !amd64,!arm64
 // Package bn256 implements the Optimal Ate pairing over a 256-bit Barreto-Naehrig curve.
 package bn256
-import (
+import "github.com/ethereum/go-ethereum/crypto/bn256/google"
 	"math/big"
 	"github.com/ethereum/go-ethereum/crypto/bn256/cloudflare"
 )
 // G1 is an abstract cyclic group. The zero value is suitable for use as the
 // output of an operation, but cannot be used as an input.
-type G1 struct {
+type G1 = bn256.G1
 	bn256.G1
 }
 // Add sets e to a+b and then returns e.
 func (e *G1) Add(a, b *G1) *G1 {
 	e.G1.Add(&a.G1, &b.G1)
 	return e
 }
 // ScalarMult sets e to a*k and then returns e.
 func (e *G1) ScalarMult(a *G1, k *big.Int) *G1 {
 	e.G1.ScalarMult(&a.G1, k)
 	return e
 }
 // G2 is an abstract cyclic group. The zero value is suitable for use as the
 // output of an operation, but cannot be used as an input.
-type G2 struct {
+type G2 = bn256.G2
 	bn256.G2
 }
 // PairingCheck calculates the Optimal Ate pairing for a set of points.
 func PairingCheck(a []*G1, b []*G2) bool {
-	as := make([]*bn256.G1, len(a))
+	return bn256.PairingCheck(a, b)
 	for i, p := range a {
 		as[i] = &p.G1
 	}
 	bs := make([]*bn256.G2, len(b))
 	for i, p := range b {
 		bs[i] = &p.G2
 	}
 	return bn256.PairingCheck(as, bs)
 }
--- a/crypto/bn256/cloudflare/bn256_test.go
+++ b/crypto/bn256/cloudflare/bn256_test.go
@ -1,5 +1,3 @@
 // +build amd64,!appengine,!gccgo
 package bn256
 import (
--- a/crypto/bn256/cloudflare/curve.go
+++ b/crypto/bn256/cloudflare/curve.go
@ -183,15 +183,24 @@ func (c *curvePoint) Double(a *curvePoint) {
 }
 func (c *curvePoint) Mul(a *curvePoint, scalar *big.Int) {
-	sum, t := &curvePoint{}, &curvePoint{}
+	precomp := [1 << 2]*curvePoint{nil, {}, {}, {}}
-	sum.SetInfinity()
+	precomp[1].Set(a)
 	precomp[2].Set(a)
 	gfpMul(&precomp[2].x, &precomp[2].x, xiTo2PSquaredMinus2Over3)
 	precomp[3].Add(precomp[1], precomp[2])
-	for i := scalar.BitLen(); i >= 0; i-- {
+	multiScalar := curveLattice.Multi(scalar)
 	sum := &curvePoint{}
 	sum.SetInfinity()
 	t := &curvePoint{}
 	for i := len(multiScalar) - 1; i >= 0; i-- {
 		t.Double(sum)
-		if scalar.Bit(i) != 0 {
+		if multiScalar[i] == 0 {
 			sum.Add(t, a)
 		} else {
 			sum.Set(t)
 		} else {
 			sum.Add(t, precomp[multiScalar[i]])
 		}
 	}
 	c.Set(sum)
--- a/crypto/bn256/cloudflare/example_test.go
+++ b/crypto/bn256/cloudflare/example_test.go
@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build amd64,!appengine,!gccgo
 package bn256
 import (
--- a/crypto/bn256/cloudflare/gfp.h
+++ b/crypto/bn256/cloudflare/gfp.h
@ -1,32 +0,0 @@
 #define storeBlock(a0,a1,a2,a3, r) \
 	MOVQ a0,  0+r \
 	MOVQ a1,  8+r \
 	MOVQ a2, 16+r \
 	MOVQ a3, 24+r
 #define loadBlock(r, a0,a1,a2,a3) \
 	MOVQ  0+r, a0 \
 	MOVQ  8+r, a1 \
 	MOVQ 16+r, a2 \
 	MOVQ 24+r, a3
 #define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
 	\ // b = a-p
 	MOVQ a0, b0 \
 	MOVQ a1, b1 \
 	MOVQ a2, b2 \
 	MOVQ a3, b3 \
 	MOVQ a4, b4 \
 	\
 	SUBQ ·p2+0(SB), b0 \
 	SBBQ ·p2+8(SB), b1 \
 	SBBQ ·p2+16(SB), b2 \
 	SBBQ ·p2+24(SB), b3 \
 	SBBQ $0, b4 \
 	\
 	\ // if b is negative then return a
 	\ // else return b
 	CMOVQCC b0, a0 \
 	CMOVQCC b1, a1 \
 	CMOVQCC b2, a2 \
 	CMOVQCC b3, a3
--- a/crypto/bn256/cloudflare/gfp_amd64.go
+++ b/crypto/bn256/cloudflare/gfp_amd64.go
@ -1,15 +0,0 @@
 // +build amd64,!appengine,!gccgo
 package bn256
 // go:noescape
 func gfpNeg(c, a *gfP)
 //go:noescape
 func gfpAdd(c, a, b *gfP)
 //go:noescape
 func gfpSub(c, a, b *gfP)
 //go:noescape
 func gfpMul(c, a, b *gfP)
--- a/crypto/bn256/cloudflare/gfp_amd64.s
+++ b/crypto/bn256/cloudflare/gfp_amd64.s
@ -1,8 +1,40 @@
-// +build amd64,!appengine,!gccgo
+// +build amd64,!generic
-#include "gfp.h"
+#define storeBlock(a0,a1,a2,a3, r) \
-#include "mul.h"
+	MOVQ a0,  0+r \
-#include "mul_bmi2.h"
+	MOVQ a1,  8+r \
 	MOVQ a2, 16+r \
 	MOVQ a3, 24+r
 #define loadBlock(r, a0,a1,a2,a3) \
 	MOVQ  0+r, a0 \
 	MOVQ  8+r, a1 \
 	MOVQ 16+r, a2 \
 	MOVQ 24+r, a3
 #define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
 	\ // b = a-p
 	MOVQ a0, b0 \
 	MOVQ a1, b1 \
 	MOVQ a2, b2 \
 	MOVQ a3, b3 \
 	MOVQ a4, b4 \
 	\
 	SUBQ ·p2+0(SB), b0 \
 	SBBQ ·p2+8(SB), b1 \
 	SBBQ ·p2+16(SB), b2 \
 	SBBQ ·p2+24(SB), b3 \
 	SBBQ $0, b4 \
 	\
 	\ // if b is negative then return a
 	\ // else return b
 	CMOVQCC b0, a0 \
 	CMOVQCC b1, a1 \
 	CMOVQCC b2, a2 \
 	CMOVQCC b3, a3
 #include "mul_amd64.h"
 #include "mul_bmi2_amd64.h"
 TEXT ·gfpNeg(SB),0,$0-16
 	MOVQ ·p2+0(SB), R8
--- a/crypto/bn256/cloudflare/gfp_arm64.s
+++ b/crypto/bn256/cloudflare/gfp_arm64.s
@ -0,0 +1,113 @@
 // +build arm64,!generic
 // storeBlock writes the four registers a0..a3 to the 32 bytes addressed by r,
 // a0 at offset 0 up to a3 at offset 24.
 #define storeBlock(a0,a1,a2,a3, r) \
 	MOVD a0,  0+r \
 	MOVD a1,  8+r \
 	MOVD a2, 16+r \
 	MOVD a3, 24+r
 // loadBlock reads the 32 bytes addressed by r into the four registers a0..a3,
 // offset 0 into a0 up to offset 24 into a3.
 #define loadBlock(r, a0,a1,a2,a3) \
 	MOVD  0+r, a0 \
 	MOVD  8+r, a1 \
 	MOVD 16+r, a2 \
 	MOVD 24+r, a3
 // loadModulus copies the four 64-bit words of the package-level symbol p2
 // (the field modulus) into the registers p0..p3.
 #define loadModulus(p0,p1,p2,p3) \
 	MOVD ·p2+0(SB), p0 \
 	MOVD ·p2+8(SB), p1 \
 	MOVD ·p2+16(SB), p2 \
 	MOVD ·p2+24(SB), p3
 #include "mul_arm64.h"
 // func gfpNeg(c, a *gfP)
 //
 // Stores modulus - a into c. A second, trial subtraction of the modulus is
 // kept only when it does not borrow.
 TEXT ·gfpNeg(SB),0,$0-16
 	MOVD a+8(FP), R0
 	loadBlock(0(R0), R1,R2,R3,R4)
 	loadModulus(R5,R6,R7,R8)
 	// R1:R4 = modulus - a (in Go arm64 syntax the last operand is the destination)
 	SUBS R1, R5, R1
 	SBCS R2, R6, R2
 	SBCS R3, R7, R3
 	SBCS R4, R8, R4
 	// R5:R8 = R1:R4 - modulus; the flags of the last SBCS drive the select below
 	SUBS R5, R1, R5
 	SBCS R6, R2, R6
 	SBCS R7, R3, R7
 	SBCS R8, R4, R8
 	// Carry set (no borrow): take the reduced R5:R8, otherwise keep R1:R4
 	CSEL CS, R5, R1, R1
 	CSEL CS, R6, R2, R2
 	CSEL CS, R7, R3, R3
 	CSEL CS, R8, R4, R4
 	MOVD c+0(FP), R0
 	storeBlock(R1,R2,R3,R4, 0(R0))
 	RET
 // func gfpAdd(c, a, b *gfP)
 //
 // Stores a + b into c, subtracting the modulus once when the sum (including
 // its carry out, tracked in R0) is not smaller than the modulus.
 TEXT ·gfpAdd(SB),0,$0-24
 	MOVD a+8(FP), R0
 	loadBlock(0(R0), R1,R2,R3,R4)
 	MOVD b+16(FP), R0
 	loadBlock(0(R0), R5,R6,R7,R8)
 	loadModulus(R9,R10,R11,R12)
 	// R0 is cleared so it can collect the carry out of the 256-bit addition
 	MOVD ZR, R0
 	// R1:R4 += R5:R8, carry out into R0
 	ADDS R5, R1
 	ADCS R6, R2
 	ADCS R7, R3
 	ADCS R8, R4
 	ADCS ZR, R0
 	// R5:R8 = sum - modulus, with the borrow propagated through the carry word R0
 	SUBS  R9, R1, R5
 	SBCS R10, R2, R6
 	SBCS R11, R3, R7
 	SBCS R12, R4, R8
 	SBCS  ZR, R0, R0
 	// Carry set (no borrow): take the reduced R5:R8, otherwise keep the raw sum
 	CSEL CS, R5, R1, R1
 	CSEL CS, R6, R2, R2
 	CSEL CS, R7, R3, R3
 	CSEL CS, R8, R4, R4
 	MOVD c+0(FP), R0
 	storeBlock(R1,R2,R3,R4, 0(R0))
 	RET
 // func gfpSub(c, a, b *gfP)
 //
 // Stores a - b into c, adding the modulus back when the subtraction borrowed.
 TEXT ·gfpSub(SB),0,$0-24
 	MOVD a+8(FP), R0
 	loadBlock(0(R0), R1,R2,R3,R4)
 	MOVD b+16(FP), R0
 	loadBlock(0(R0), R5,R6,R7,R8)
 	loadModulus(R9,R10,R11,R12)
 	// R1:R4 -= R5:R8
 	SUBS R5, R1
 	SBCS R6, R2
 	SBCS R7, R3
 	SBCS R8, R4
 	// Carry set (no borrow): replace the correction R9:R12 with zero,
 	// otherwise leave the modulus in place
 	CSEL CS, ZR,  R9,  R9
 	CSEL CS, ZR, R10, R10
 	CSEL CS, ZR, R11, R11
 	CSEL CS, ZR, R12, R12
 	// Add the correction (zero or the modulus) to the difference
 	ADDS  R9, R1
 	ADCS R10, R2
 	ADCS R11, R3
 	ADCS R12, R4
 	MOVD c+0(FP), R0
 	storeBlock(R1,R2,R3,R4, 0(R0))
 	RET
 // func gfpMul(c, a, b *gfP)
 //
 // Multiplies a (loaded into R1:R4) by b (loaded into R5:R8) with the mul macro,
 // which leaves the eight product words in R9..R16, then runs gfpReduce and
 // stores the four words it leaves in R1:R4 into c.
 TEXT ·gfpMul(SB),0,$0-24
 	MOVD a+8(FP), R0
 	loadBlock(0(R0), R1,R2,R3,R4)
 	MOVD b+16(FP), R0
 	loadBlock(0(R0), R5,R6,R7,R8)
 	mul(R9,R10,R11,R12,R13,R14,R15,R16)
 	gfpReduce()
 	MOVD c+0(FP), R0
 	storeBlock(R1,R2,R3,R4, 0(R0))
 	RET
--- a/crypto/bn256/cloudflare/gfp_decl.go
+++ b/crypto/bn256/cloudflare/gfp_decl.go
@ -0,0 +1,18 @@
 // +build amd64,!generic arm64,!generic
 package bn256
 // This file contains forward declarations for the architecture-specific
 // assembly implementations of these functions, provided that they exist.
 // gfpNeg sets c to the negation of a modulo the field prime.
 //
 // The directive below must be spelled without a space after the slashes;
 // "// go:noescape" is an ordinary comment and is ignored by the compiler.
 //
 //go:noescape
 func gfpNeg(c, a *gfP)
 // gfpAdd sets c to a+b modulo the field prime.
 //
 //go:noescape
 func gfpAdd(c, a, b *gfP)
 // gfpSub sets c to a-b modulo the field prime.
 //
 //go:noescape
 func gfpSub(c, a, b *gfP)
 // gfpMul sets c to the Montgomery product of a and b.
 //
 //go:noescape
 func gfpMul(c, a, b *gfP)
--- a/crypto/bn256/cloudflare/gfp_generic.go
+++ b/crypto/bn256/cloudflare/gfp_generic.go
@ -0,0 +1,173 @@
 // +build !amd64,!arm64 generic
 package bn256
 // gfpCarry reduces a in place by conditionally subtracting the modulus p2.
 // head is the carry bit (0 or 1) left over by the computation that produced a:
 // when it is set the subtraction is always applied, otherwise it is applied
 // only if a - p2 does not borrow. The selection uses bit masks, not a branch.
 func gfpCarry(a *gfP, head uint64) {
 	var (
 		reduced gfP
 		borrow  uint64
 	)
 	// reduced = a - p2, propagating the borrow from word to word
 	for i := range p2 {
 		word, mod := a[i], p2[i]
 		diff := word - mod - borrow
 		reduced[i] = diff
 		borrow = (mod&^word | (mod|^word)&diff) >> 63
 	}
 	// A pending head bit cancels the borrow
 	borrow &^= head
 	// keep is all ones when a has to stay as it is, all zeroes when the
 	// reduced value has to replace it
 	keep := -borrow
 	for i := range reduced {
 		a[i] = (a[i] & keep) | (reduced[i] &^ keep)
 	}
 }
 // gfpNeg sets c to p2 - a and lets gfpCarry fold the result back into range.
 func gfpNeg(c, a *gfP) {
 	var borrow uint64
 	for i := range p2 {
 		mod, word := p2[i], a[i]
 		diff := mod - word - borrow
 		c[i] = diff
 		borrow = (word&^mod | (word|^mod)&diff) >> 63
 	}
 	gfpCarry(c, 0)
 }
 // gfpAdd sets c to a + b and hands the carry out of the top word to gfpCarry,
 // which subtracts the modulus when required.
 func gfpAdd(c, a, b *gfP) {
 	var carry uint64
 	for i := range a {
 		x, y := a[i], b[i]
 		sum := x + y + carry
 		c[i] = sum
 		carry = (x&y | (x|y)&^sum) >> 63
 	}
 	gfpCarry(c, carry)
 }
 // gfpSub sets c to a - b, computed as a + (p2 - b) followed by a conditional
 // reduction in gfpCarry.
 func gfpSub(c, a, b *gfP) {
 	// negB = p2 - b; the borrow out of the top word is discarded
 	var (
 		negB   gfP
 		borrow uint64
 	)
 	for i := range p2 {
 		mod, word := p2[i], b[i]
 		diff := mod - word - borrow
 		negB[i] = diff
 		borrow = (word&^mod | (word|^mod)&diff) >> 63
 	}
 	// c = a + negB
 	var carry uint64
 	for i := range a {
 		x, y := a[i], negB[i]
 		sum := x + y + carry
 		c[i] = sum
 		carry = (x&y | (x|y)&^sum) >> 63
 	}
 	gfpCarry(c, carry)
 }
 // mul returns the full 512-bit product of a and b. Inputs and output are
 // little-endian arrays of 64-bit words (index 0 is the least significant).
 func mul(a, b [4]uint64) [8]uint64 {
 	const (
 		mask16 uint64 = 0x0000ffff
 		mask32 uint64 = 0xffffffff
 	)
 	// Accumulate 16x32-bit partial products. Every output word owns four
 	// consecutive slots; slot s collects terms that still have to be shifted
 	// left by 16*s bits before they are added to the word.
 	var acc [32]uint64
 	for i := 0; i < len(a); i++ {
 		w := a[i]
 		w0 := w & mask16
 		w1 := (w >> 16) & mask16
 		w2 := (w >> 32) & mask16
 		w3 := w >> 48
 		for j := 0; j < len(b); j++ {
 			lo := b[j] & mask32
 			hi := b[j] >> 32
 			base := 4 * (i + j)
 			acc[base] += w0 * lo
 			acc[base+1] += w1 * lo
 			acc[base+2] += w2*lo + w0*hi
 			acc[base+3] += w3*lo + w1*hi
 			acc[base+4] += w2 * hi
 			acc[base+5] += w3 * hi
 		}
 	}
 	// Fold slots 1..3 into slot 0 of their word, carrying both the addition
 	// overflow and the bits shifted out of the top into the next word.
 	for slot := uint(1); slot < 4; slot++ {
 		shift := 16 * slot
 		var spill, carry uint64
 		for word := uint(0); word < 8; word++ {
 			base := 4 * word
 			pending := acc[base+slot]
 			x := acc[base]
 			y := pending<<shift + spill
 			sum := x + y + carry
 			acc[base] = sum
 			carry = (x&y | (x|y)&^sum) >> 63
 			spill = pending >> (64 - shift)
 		}
 	}
 	var out [8]uint64
 	for k := range out {
 		out[k] = acc[4*k]
 	}
 	return out
 }
 // halfMul returns the low 256 bits of the product of a and b. Inputs and
 // output are little-endian arrays of 64-bit words.
 func halfMul(a, b [4]uint64) [4]uint64 {
 	const (
 		mask16 uint64 = 0x0000ffff
 		mask32 uint64 = 0xffffffff
 	)
 	// Same slot layout as in mul, but word pairs whose product lands entirely
 	// above the fourth output word are never computed.
 	var acc [18]uint64
 	for i := 0; i < len(a); i++ {
 		w := a[i]
 		w0 := w & mask16
 		w1 := (w >> 16) & mask16
 		w2 := (w >> 32) & mask16
 		w3 := w >> 48
 		for j := 0; j < len(b) && i+j <= 3; j++ {
 			lo := b[j] & mask32
 			hi := b[j] >> 32
 			base := 4 * (i + j)
 			acc[base] += w0 * lo
 			acc[base+1] += w1 * lo
 			acc[base+2] += w2*lo + w0*hi
 			acc[base+3] += w3*lo + w1*hi
 			acc[base+4] += w2 * hi
 			acc[base+5] += w3 * hi
 		}
 	}
 	// Fold slots 1..3 into slot 0 of the four retained words
 	for slot := uint(1); slot < 4; slot++ {
 		shift := 16 * slot
 		var spill, carry uint64
 		for word := uint(0); word < 4; word++ {
 			base := 4 * word
 			pending := acc[base+slot]
 			x := acc[base]
 			y := pending<<shift + spill
 			sum := x + y + carry
 			acc[base] = sum
 			carry = (x&y | (x|y)&^sum) >> 63
 			spill = pending >> (64 - shift)
 		}
 	}
 	var out [4]uint64
 	for k := range out {
 		out[k] = acc[4*k]
 	}
 	return out
 }
 // gfpMul sets c to the Montgomery product of a and b: with T = a*b and
 // m = low256(low256(T) * np), it stores the upper four words of T + m*p2 in c
 // and lets gfpCarry apply the final conditional subtraction.
 func gfpMul(c, a, b *gfP) {
 	prod := mul(*a, *b)
 	// m only depends on the low half of the product
 	var low [4]uint64
 	copy(low[:], prod[:4])
 	m := halfMul(low, np)
 	corr := mul(m, p2)
 	// prod += corr over all eight words
 	var carry uint64
 	for i := range prod {
 		x, y := prod[i], corr[i]
 		sum := x + y + carry
 		prod[i] = sum
 		carry = (x&y | (x|y)&^sum) >> 63
 	}
 	*c = gfP{prod[4], prod[5], prod[6], prod[7]}
 	gfpCarry(c, carry)
 }
--- a/crypto/bn256/cloudflare/gfp_pure.go
+++ b/crypto/bn256/cloudflare/gfp_pure.go
@ -1,19 +0,0 @@
 //  +build !amd64 appengine gccgo
 package bn256
 func gfpNeg(c, a *gfP) {
 	panic("unsupported architecture")
 }
 func gfpAdd(c, a, b *gfP) {
 	panic("unsupported architecture")
 }
 func gfpSub(c, a, b *gfP) {
 	panic("unsupported architecture")
 }
 func gfpMul(c, a, b *gfP) {
 	panic("unsupported architecture")
 }
--- a/crypto/bn256/cloudflare/gfp_test.go
+++ b/crypto/bn256/cloudflare/gfp_test.go
@ -1,5 +1,3 @@
 // +build amd64,!appengine,!gccgo
 package bn256
 import (
--- a/crypto/bn256/cloudflare/lattice.go
+++ b/crypto/bn256/cloudflare/lattice.go
@ -0,0 +1,115 @@
 package bn256
 import (
 	"math/big"
 )
 // half is Order shifted right by one bit. round uses it as the threshold above
 // which a remainder causes the quotient to be rounded up.
 var half = new(big.Int).Rsh(Order, 1)
 // curveLattice is the two-dimensional lattice whose Multi decomposition is
 // consumed by curvePoint.Mul. vectors holds the two basis vectors, inverse the
 // per-dimension multipliers and det the divisor that decompose applies when
 // projecting a scalar onto the basis.
 var curveLattice = &lattice{
 	vectors: [][]*big.Int{
 		{bigFromBase10("147946756881789319000765030803803410728"), bigFromBase10("147946756881789319010696353538189108491")},
 		{bigFromBase10("147946756881789319020627676272574806254"), bigFromBase10("-147946756881789318990833708069417712965")},
 	},
 	inverse: []*big.Int{
 		bigFromBase10("147946756881789318990833708069417712965"),
 		bigFromBase10("147946756881789319010696353538189108491"),
 	},
 	det: bigFromBase10("43776485743678550444492811490514550177096728800832068687396408373151616991234"),
 }
 // targetLattice is a four-dimensional lattice with the same layout as
 // curveLattice; its det is a copy of Order.
 // NOTE(review): presumably meant for exponentiation in the pairing target
 // group - no caller is visible here, confirm against the rest of the package.
 var targetLattice = &lattice{
 	vectors: [][]*big.Int{
 		{bigFromBase10("9931322734385697761"), bigFromBase10("9931322734385697761"), bigFromBase10("9931322734385697763"), bigFromBase10("9931322734385697764")},
 		{bigFromBase10("4965661367192848881"), bigFromBase10("4965661367192848881"), bigFromBase10("4965661367192848882"), bigFromBase10("-9931322734385697762")},
 		{bigFromBase10("-9931322734385697762"), bigFromBase10("-4965661367192848881"), bigFromBase10("4965661367192848881"), bigFromBase10("-4965661367192848882")},
 		{bigFromBase10("9931322734385697763"), bigFromBase10("-4965661367192848881"), bigFromBase10("-4965661367192848881"), bigFromBase10("-4965661367192848881")},
 	},
 	inverse: []*big.Int{
 		bigFromBase10("734653495049373973658254490726798021314063399421879442165"),
 		bigFromBase10("147946756881789319000765030803803410728"),
 		bigFromBase10("-147946756881789319005730692170996259609"),
 		bigFromBase10("1469306990098747947464455738335385361643788813749140841702"),
 	},
 	det: new(big.Int).Set(Order),
 }
 // lattice bundles the precomputed values decompose needs to split a scalar
 // into several smaller components.
 type lattice struct {
 	// vectors holds the basis, one row per basis vector.
 	vectors [][]*big.Int
 	// inverse holds one multiplier per dimension; decompose computes
 	// round(k*inverse[i]/det) as the coefficient of basis vector i.
 	inverse []*big.Int
 	// det is the divisor used in that rounding step.
 	det     *big.Int
 }
 // decompose splits the scalar k (taken mod Order) into len(l.inverse) integer
 // components with respect to the lattice basis; the components are meant to be
 // short and positive.
 func (l *lattice) decompose(k *big.Int) []*big.Int {
 	dims := len(l.inverse)
 	// Babai's rounding: coeffs[i] = round(k*inverse[i]/det) picks the lattice
 	// vector closest to <k,0,0,...>.
 	coeffs := make([]*big.Int, dims)
 	for i, inv := range l.inverse {
 		coeffs[i] = new(big.Int).Mul(k, inv)
 		round(coeffs[i], l.det)
 	}
 	// Subtract that lattice vector component by component, then add the first
 	// basis vector twice.
 	parts := make([]*big.Int, dims)
 	term := new(big.Int)
 	for col := range parts {
 		acc := new(big.Int)
 		for row, coeff := range coeffs {
 			term.Mul(coeff, l.vectors[row][col])
 			acc.Sub(acc, term)
 		}
 		basis := l.vectors[0][col]
 		parts[col] = acc.Add(acc, basis).Add(acc, basis)
 	}
 	// Only the first coordinate of <k,0,0,...> is non-zero
 	parts[0].Add(parts[0], k)
 	return parts
 }
 // Precompute walks every pair (i, j) where i indexes a basis vector, j ranges
 // over all 2^len(l.vectors) subsets encoded as bit masks, and bit i of j is
 // set, invoking add(i, j) for each. Pairs are visited with i as the outer and
 // j as the inner counter, both ascending.
 func (l *lattice) Precompute(add func(i, j uint)) {
 	dims := uint(len(l.vectors))
 	limit := uint(1) << dims
 	for bit := uint(0); bit < dims; bit++ {
 		for set := uint(0); set < limit; set++ {
 			if (set>>bit)&1 != 1 {
 				continue
 			}
 			add(bit, set)
 		}
 	}
 }
 // Multi decomposes scalar and interleaves the bits of the components: bit j of
 // element i of the result is bit i of component j. The result is as long as
 // the widest component.
 func (l *lattice) Multi(scalar *big.Int) []uint8 {
 	parts := l.decompose(scalar)
 	width := 0
 	for _, part := range parts {
 		if n := part.BitLen(); n > width {
 			width = n
 		}
 	}
 	digits := make([]uint8, width)
 	for pos := range digits {
 		for idx, part := range parts {
 			digits[pos] += uint8(part.Bit(pos)) << uint(idx)
 		}
 	}
 	return digits
 }
 // round replaces num with the quotient num/denom and bumps it by one when the
 // remainder exceeds half.
 // NOTE(review): the threshold is the package-level half (Order >> 1), not
 // denom/2. The two coincide only when denom equals Order, as for
 // targetLattice.det; confirm this is intended for curveLattice.
 func round(num, denom *big.Int) {
 	rem := new(big.Int)
 	num.DivMod(num, denom, rem)
 	if rem.Cmp(half) > 0 {
 		num.Add(num, big.NewInt(1))
 	}
 }
--- a/crypto/bn256/cloudflare/lattice_test.go
+++ b/crypto/bn256/cloudflare/lattice_test.go
@ -0,0 +1,29 @@
 package bn256
 import (
 	"crypto/rand"
 	"testing"
 )
 // TestLatticeReduceCurve decomposes a random scalar over curveLattice and
 // checks that both components are non-negative and at most 130 bits wide.
 func TestLatticeReduceCurve(t *testing.T) {
 	// A failed read would leave k nil and crash inside decompose, so bail out
 	// with a readable message instead.
 	k, err := rand.Int(rand.Reader, Order)
 	if err != nil {
 		t.Fatalf("generating random scalar: %v", err)
 	}
 	ks := curveLattice.decompose(k)
 	if ks[0].BitLen() > 130 || ks[1].BitLen() > 130 {
 		t.Fatalf("reduction too large (k=%v)", k)
 	}
 	if ks[0].Sign() < 0 || ks[1].Sign() < 0 {
 		t.Fatalf("reduction must be positive (k=%v)", k)
 	}
 }
 // TestLatticeReduceTarget decomposes a random scalar over targetLattice and
 // checks that all four components are non-negative and at most 66 bits wide.
 func TestLatticeReduceTarget(t *testing.T) {
 	// A failed read would leave k nil and crash inside decompose, so bail out
 	// with a readable message instead.
 	k, err := rand.Int(rand.Reader, Order)
 	if err != nil {
 		t.Fatalf("generating random scalar: %v", err)
 	}
 	ks := targetLattice.decompose(k)
 	if ks[0].BitLen() > 66 || ks[1].BitLen() > 66 || ks[2].BitLen() > 66 || ks[3].BitLen() > 66 {
 		t.Fatalf("reduction too large (k=%v)", k)
 	}
 	if ks[0].Sign() < 0 || ks[1].Sign() < 0 || ks[2].Sign() < 0 || ks[3].Sign() < 0 {
 		t.Fatalf("reduction must be positive (k=%v)", k)
 	}
 }
--- a/crypto/bn256/cloudflare/main_test.go
+++ b/crypto/bn256/cloudflare/main_test.go
@ -1,5 +1,3 @@
 // +build amd64,!appengine,!gccgo
 package bn256
 import (
--- a/crypto/bn256/cloudflare/mul_amd64.h
+++ b/crypto/bn256/cloudflare/mul_amd64.h
--- a/crypto/bn256/cloudflare/mul_arm64.h
+++ b/crypto/bn256/cloudflare/mul_arm64.h
@ -0,0 +1,133 @@
 // mul multiplies the four words in R1:R2:R3:R4 by the four words in
 // R5:R6:R7:R8 and leaves the eight product words in c0..c7 (c0 is built from
 // the low half of R1*R5). Each of the four sections handles one word of the
 // first operand: the row R1*(R5..R8) is written straight into c0..c4, the
 // following rows are formed in R25,R26,R27,R29 plus the next c register and
 // then added one word higher. R0 is used as scratch for the low halves.
 #define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
 	MUL R1, R5, c0 \
 	UMULH R1, R5, c1 \
 	MUL R1, R6, R0 \
 	ADDS R0, c1 \
 	UMULH R1, R6, c2 \
 	MUL R1, R7, R0 \
 	ADCS R0, c2 \
 	UMULH R1, R7, c3 \
 	MUL R1, R8, R0 \
 	ADCS R0, c3 \
 	UMULH R1, R8, c4 \
 	ADCS ZR, c4 \
 	\
 	MUL R2, R5, R25 \
 	UMULH R2, R5, R26 \
 	MUL R2, R6, R0 \
 	ADDS R0, R26 \
 	UMULH R2, R6, R27 \
 	MUL R2, R7, R0 \
 	ADCS R0, R27 \
 	UMULH R2, R7, R29 \
 	MUL R2, R8, R0 \
 	ADCS R0, R29 \
 	UMULH R2, R8, c5 \
 	ADCS ZR, c5 \
 	ADDS R25, c1 \
 	ADCS R26, c2 \
 	ADCS R27, c3 \
 	ADCS R29, c4 \
 	ADCS  ZR, c5 \
 	\
 	MUL R3, R5, R25 \
 	UMULH R3, R5, R26 \
 	MUL R3, R6, R0 \
 	ADDS R0, R26 \
 	UMULH R3, R6, R27 \
 	MUL R3, R7, R0 \
 	ADCS R0, R27 \
 	UMULH R3, R7, R29 \
 	MUL R3, R8, R0 \
 	ADCS R0, R29 \
 	UMULH R3, R8, c6 \
 	ADCS ZR, c6 \
 	ADDS R25, c2 \
 	ADCS R26, c3 \
 	ADCS R27, c4 \
 	ADCS R29, c5 \
 	ADCS  ZR, c6 \
 	\
 	MUL R4, R5, R25 \
 	UMULH R4, R5, R26 \
 	MUL R4, R6, R0 \
 	ADDS R0, R26 \
 	UMULH R4, R6, R27 \
 	MUL R4, R7, R0 \
 	ADCS R0, R27 \
 	UMULH R4, R7, R29 \
 	MUL R4, R8, R0 \
 	ADCS R0, R29 \
 	UMULH R4, R8, c7 \
 	ADCS ZR, c7 \
 	ADDS R25, c3 \
 	ADCS R26, c4 \
 	ADCS R27, c5 \
 	ADCS R29, c6 \
 	ADCS  ZR, c7
 // gfpReduce takes the eight-word value T held in R9..R16 and leaves four
 // result words in R1:R2:R3:R4. It first forms m, the low four words of
 // T[0..3] * np, in R1:R4, then multiplies m by the modulus (loaded into R5:R8)
 // through the mul macro into R17..R24, adds T to that product and finally
 // selects between the upper four words R21:R24 and those words minus the
 // modulus. R0 and the registers clobbered by mul serve as scratch.
 // NOTE(review): R25 collects the carry out of the eight-word addition but is
 // not consulted by the final select - confirm that carry can never be set for
 // valid inputs.
 #define gfpReduce() \
 	\ // m = (T * N') mod R, store m in R1:R2:R3:R4
 	MOVD ·np+0(SB), R17 \
 	MOVD ·np+8(SB), R18 \
 	MOVD ·np+16(SB), R19 \
 	MOVD ·np+24(SB), R20 \
 	\
 	MUL R9, R17, R1 \
 	UMULH R9, R17, R2 \
 	MUL R9, R18, R0 \
 	ADDS R0, R2 \
 	UMULH R9, R18, R3 \
 	MUL R9, R19, R0 \
 	ADCS R0, R3 \
 	UMULH R9, R19, R4 \
 	MUL R9, R20, R0 \
 	ADCS R0, R4 \
 	\
 	MUL R10, R17, R21 \
 	UMULH R10, R17, R22 \
 	MUL R10, R18, R0 \
 	ADDS R0, R22 \
 	UMULH R10, R18, R23 \
 	MUL R10, R19, R0 \
 	ADCS R0, R23 \
 	ADDS R21, R2 \
 	ADCS R22, R3 \
 	ADCS R23, R4 \
 	\
 	MUL R11, R17, R21 \
 	UMULH R11, R17, R22 \
 	MUL R11, R18, R0 \
 	ADDS R0, R22 \
 	ADDS R21, R3 \
 	ADCS R22, R4 \
 	\
 	MUL R12, R17, R21 \
 	ADDS R21, R4 \
 	\
 	\ // m * N
 	loadModulus(R5,R6,R7,R8) \
 	mul(R17,R18,R19,R20,R21,R22,R23,R24) \
 	\
 	\ // Add the 512-bit intermediate to m*N
 	MOVD  ZR, R25 \
 	ADDS  R9, R17 \
 	ADCS R10, R18 \
 	ADCS R11, R19 \
 	ADCS R12, R20 \
 	ADCS R13, R21 \
 	ADCS R14, R22 \
 	ADCS R15, R23 \
 	ADCS R16, R24 \
 	ADCS  ZR, R25 \
 	\
 	\ // Our output is R21:R22:R23:R24. Reduce mod p if necessary.
 	SUBS R5, R21, R10 \
 	SBCS R6, R22, R11 \
 	SBCS R7, R23, R12 \
 	SBCS R8, R24, R13 \
 	\
 	CSEL CS, R10, R21, R1 \
 	CSEL CS, R11, R22, R2 \
 	CSEL CS, R12, R23, R3 \
 	CSEL CS, R13, R24, R4
--- a/crypto/bn256/cloudflare/mul_bmi2_amd64.h
+++ b/crypto/bn256/cloudflare/mul_bmi2_amd64.h