//
// hamming distance calculations in Go
//
// https://github.com/steakknife/hamming
//
// Copyright © 2014, 2015, 2016 Barry Allard
//
// MIT license
//

#include "textflag.h"

// type SliceHeader struct {
//	Data uintptr	// offset  0
//	Len  int	// offset  8
//	Cap  int	// offset 16
// }
//
// Argument frame offsets for a function taking one slice x and returning ret:
//	 0	x.Data
//	 8	x.Len
//	16	x.Cap
//	24	ret

// type StringHeader struct {
//	Data uintptr	// offset 0
//	Len  int	// offset 8
// }
//
// Argument frame offsets for a function taking one string x and returning ret:
//	 0	x.Data
//	 8	x.Len
//	16	ret

// func CountBitsInt8sPopCnt(x []int8) (ret int)
//
// Tail-jumps to CountBitsUint8sPopCnt without setting up a frame, so the
// target reads this function's argument frame (x at 0..23, ret at 24) and
// its RET returns straight to this function's caller.
TEXT ·CountBitsInt8sPopCnt(SB),NOSPLIT,$0-32
	JMP	·CountBitsUint8sPopCnt(SB)

// func CountBitsInt16sPopCnt(x []int16) (ret int)
//
// Tail-jumps to CountBitsUint16sPopCnt without setting up a frame, so the
// target reads this function's argument frame (x at 0..23, ret at 24) and
// its RET returns straight to this function's caller.
TEXT ·CountBitsInt16sPopCnt(SB),NOSPLIT,$0-32
	JMP	·CountBitsUint16sPopCnt(SB)

// func CountBitsInt32sPopCnt(x []int32) (ret int)
//
// Tail-jumps to CountBitsUint32sPopCnt without setting up a frame, so the
// target reads this function's argument frame (x at 0..23, ret at 24) and
// its RET returns straight to this function's caller.
TEXT ·CountBitsInt32sPopCnt(SB),NOSPLIT,$0-32
	JMP	·CountBitsUint32sPopCnt(SB)

// func CountBitsInt64sPopCnt(x []int64) (ret int)
//
// Tail-jumps to CountBitsUint64sPopCnt without setting up a frame, so the
// target reads this function's argument frame (x at 0..23, ret at 24) and
// its RET returns straight to this function's caller.
TEXT ·CountBitsInt64sPopCnt(SB),NOSPLIT,$0-32
	JMP	·CountBitsUint64sPopCnt(SB)

// func CountBitsUint8sPopCnt(x []uint8) (ret int)
//
// Stores in ret the total number of set bits in the bytes of x.
// A negative or zero length yields 0.
//
// Arguments (offsets from FP):
//	x_base+0	data pointer
//	x_len+8		length in elements (== bytes here)
//	x_cap+16	capacity, not read
//	ret+24		result
//
// Register roles:
//	DI		read cursor
//	CX		bytes still to process
//	AX, R9, BX, DX	four independent partial sums in the unrolled loop;
//			AX is also the final total
//	R10-R13		per-quadword popcounts
//
// No CPU feature check is performed here; POPCNTQ is executed
// unconditionally once the length test passes.
TEXT ·CountBitsUint8sPopCnt(SB),NOSPLIT,$0-32
	XORQ	AX, AX			// ret = 0
	MOVQ	x_len+8(FP), CX		// CX = len(x)

test_negative_slice_len:
	MOVQ	CX, BX			// len(x) < 0  <=>  bit 63 of len(x) is set
	SHRQ	$63, BX
	JNZ	done

	MOVQ	x_base+0(FP), DI	// DI = data pointer

	CMPQ	CX, $32			// fewer than 32 bytes: no unrolled pass
	JL	unrolled_loop_skip

unrolled_loop_setup:
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX

unrolled_loop:				// 4 quadwords (32 bytes) per iteration
	SUBQ	$32, CX

	POPCNTQ	0(DI), R10
	ADDQ	R10, R9
	POPCNTQ	8(DI), R11
	ADDQ	R11, AX
	POPCNTQ	16(DI), R12
	ADDQ	R12, BX
	POPCNTQ	24(DI), R13
	ADDQ	R13, DX

	ADDQ	$32, DI
	CMPQ	CX, $32			// another full 32 bytes left?
	JGE	unrolled_loop

unrolled_loop_done:
	ADDQ	R9, AX			// AX = (AX + R9) + (BX + DX)
	ADDQ	BX, DX
	ADDQ	DX, AX

unrolled_loop_skip:
	TESTQ	CX, CX			// nothing left over
	JZ	done

remainder_loop:				// one byte per iteration, CX > 0 on entry
	MOVBQZX	0(DI), DX		// zero-extending load, no partial-register write
	POPCNTQ	DX, BX
	ADDQ	BX, AX

	INCQ	DI
	DECQ	CX
	JNZ	remainder_loop

done:
	MOVQ	AX, ret+24(FP)
	RET

// func CountBitsUint16sPopCnt(x []uint16) (ret int)
//
// Stores in ret the total number of set bits in the elements of x.
// Yields 0 when len(x) is zero, negative, or so large that len(x)*2 would
// not fit in a non-negative int64.
//
// Arguments (offsets from FP):
//	x_base+0	data pointer
//	x_len+8		length in elements
//	x_cap+16	capacity, not read
//	ret+24		result
//
// Register roles:
//	DI		read cursor
//	CX		bytes still to process (len(x)*2)
//	AX, R9, BX, DX	four independent partial sums in the unrolled loop;
//			AX is also the final total
//	R10-R13		per-quadword popcounts
//
// No CPU feature check is performed here; POPCNTQ is executed
// unconditionally once the length test passes.
TEXT ·CountBitsUint16sPopCnt(SB),NOSPLIT,$0-32
	XORQ	AX, AX			// ret = 0
	MOVQ	x_len+8(FP), CX		// CX = len(x)

test_negative_slice_len:
	MOVQ	CX, BX			// reject if len(x)[63:62] != 0:
	SHLQ	$1, CX			//   CX = len(x)*2 = byte count
	SHRQ	$62, BX			//   flags come from this shift
	JNZ	done

	MOVQ	x_base+0(FP), DI	// DI = data pointer

	CMPQ	CX, $32			// fewer than 32 bytes: no unrolled pass
	JL	unrolled_loop_skip

unrolled_loop_setup:
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX

unrolled_loop:				// 4 quadwords (32 bytes) per iteration
	SUBQ	$32, CX

	POPCNTQ	0(DI), R10
	ADDQ	R10, R9
	POPCNTQ	8(DI), R11
	ADDQ	R11, AX
	POPCNTQ	16(DI), R12
	ADDQ	R12, BX
	POPCNTQ	24(DI), R13
	ADDQ	R13, DX

	ADDQ	$32, DI
	CMPQ	CX, $32			// another full 32 bytes left?
	JGE	unrolled_loop

unrolled_loop_done:
	ADDQ	R9, AX			// AX = (AX + R9) + (BX + DX)
	ADDQ	BX, DX
	ADDQ	DX, AX

unrolled_loop_skip:
	TESTQ	CX, CX			// nothing left over
	JZ	done

remainder_loop:				// one uint16 per iteration; CX is a multiple of 2
	MOVWQZX	0(DI), DX		// zero-extending load, no partial-register write
	POPCNTQ	DX, BX
	ADDQ	BX, AX

	ADDQ	$2, DI
	SUBQ	$2, CX
	JNZ	remainder_loop

done:
	MOVQ	AX, ret+24(FP)
	RET

// func CountBitsUint32sPopCnt(x []uint32) (ret int)
//
// Stores in ret the total number of set bits in the elements of x.
// Yields 0 when len(x) is zero, negative, or so large that len(x)*4 would
// not fit in a non-negative int64.
//
// Arguments (offsets from FP):
//	x_base+0	data pointer
//	x_len+8		length in elements
//	x_cap+16	capacity, not read
//	ret+24		result
//
// Register roles:
//	DI		read cursor
//	CX		bytes still to process (len(x)*4)
//	AX, R9, BX, DX	four independent partial sums in the unrolled loop;
//			AX is also the final total
//	R10-R13		per-quadword popcounts
//
// No CPU feature check is performed here; POPCNTQ is executed
// unconditionally once the length test passes.
TEXT ·CountBitsUint32sPopCnt(SB),NOSPLIT,$0-32
	XORQ	AX, AX			// ret = 0
	MOVQ	x_len+8(FP), CX		// CX = len(x)

test_negative_slice_len:
	MOVQ	CX, BX			// reject if len(x)[63:61] != 0:
	SHLQ	$2, CX			//   CX = len(x)*4 = byte count
	SHRQ	$61, BX			//   flags come from this shift
	JNZ	done

	MOVQ	x_base+0(FP), DI	// DI = data pointer

	CMPQ	CX, $32			// fewer than 32 bytes: no unrolled pass
	JL	unrolled_loop_skip

unrolled_loop_setup:
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX

unrolled_loop:				// 4 quadwords (32 bytes) per iteration
	SUBQ	$32, CX

	POPCNTQ	0(DI), R10		// r9 += popcntq(QW DI+0)
	ADDQ	R10, R9
	POPCNTQ	8(DI), R11		// ax += popcntq(QW DI+8)
	ADDQ	R11, AX
	POPCNTQ	16(DI), R12		// bx += popcntq(QW DI+16)
	ADDQ	R12, BX
	POPCNTQ	24(DI), R13		// dx += popcntq(QW DI+24)
	ADDQ	R13, DX

	ADDQ	$32, DI
	CMPQ	CX, $32			// another full 32 bytes left?
	JGE	unrolled_loop

unrolled_loop_done:
	ADDQ	R9, AX			// ax = (ax + r9) + (bx + dx)
	ADDQ	BX, DX
	ADDQ	DX, AX

unrolled_loop_skip:
	TESTQ	CX, CX			// nothing left over
	JZ	done

remainder_loop:				// one uint32 per iteration; CX is a multiple of 4
	MOVL	0(DI), DX		// 32-bit load zero-extends into the full register
	POPCNTQ	DX, BX
	ADDQ	BX, AX

	ADDQ	$4, DI
	SUBQ	$4, CX
	JNZ	remainder_loop

done:
	MOVQ	AX, ret+24(FP)
	RET

// func CountBitsUint64sPopCnt(x []uint64) (ret int)
//
// Stores in ret the total number of set bits in the elements of x.
// Yields 0 when len(x) is zero, negative, or so large that len(x)*8 would
// not fit in a non-negative int64.
//
// Arguments (offsets from FP):
//	x_base+0	data pointer
//	x_len+8		length in elements
//	x_cap+16	capacity, not read
//	ret+24		result
//
// Register roles:
//	DI		read cursor
//	CX		bytes still to process (len(x)*8)
//	AX, R9, BX, DX	four independent partial sums in the unrolled loop;
//			AX is also the final total
//	R10-R13		per-quadword popcounts
//
// No CPU feature check is performed here; POPCNTQ is executed
// unconditionally once the length test passes.
TEXT ·CountBitsUint64sPopCnt(SB),NOSPLIT,$0-32
	XORQ	AX, AX			// ret = 0
	MOVQ	x_len+8(FP), CX		// CX = len(x)

test_negative_slice_len:
	MOVQ	CX, BX			// reject if len(x)[63:60] != 0:
	SHLQ	$3, CX			//   CX = len(x)*8 = byte count
	SHRQ	$60, BX			//   flags come from this shift
	JNZ	done

	MOVQ	x_base+0(FP), DI	// DI = data pointer

	CMPQ	CX, $32			// fewer than 32 bytes: no unrolled pass
	JL	unrolled_loop_skip

unrolled_loop_setup:
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX

unrolled_loop:				// 4 quadwords (32 bytes) per iteration
	SUBQ	$32, CX

	POPCNTQ	0(DI), R10
	ADDQ	R10, R9
	POPCNTQ	8(DI), R11
	ADDQ	R11, AX
	POPCNTQ	16(DI), R12
	ADDQ	R12, BX
	POPCNTQ	24(DI), R13
	ADDQ	R13, DX

	ADDQ	$32, DI
	CMPQ	CX, $32			// x.Len*8 >= 32: another full pass
	JGE	unrolled_loop

unrolled_loop_done:
	ADDQ	R9, AX			// AX = (AX + R9) + (BX + DX)
	ADDQ	BX, DX
	ADDQ	DX, AX

unrolled_loop_skip:
	TESTQ	CX, CX			// nothing left over
	JZ	done

remainder_loop:				// one uint64 per iteration; CX is a multiple of 8
	POPCNTQ	0(DI), BX
	ADDQ	BX, AX

	ADDQ	$8, DI
	SUBQ	$8, CX
	JNZ	remainder_loop

done:
	MOVQ	AX, ret+24(FP)
	RET

// func CountBitsBytesPopCnt(x []byte) (ret int)
//
// Tail-jumps to CountBitsUint8sPopCnt without setting up a frame, so the
// target reads this function's argument frame (x at 0..23, ret at 24) and
// its RET returns straight to this function's caller.
TEXT ·CountBitsBytesPopCnt(SB),NOSPLIT,$0-32
	JMP	·CountBitsUint8sPopCnt(SB)

// func CountBitsRunesPopCnt(x []rune) (ret int)
//
// Tail-jumps to CountBitsUint32sPopCnt without setting up a frame, so the
// target reads this function's argument frame (x at 0..23, ret at 24) and
// its RET returns straight to this function's caller.
TEXT ·CountBitsRunesPopCnt(SB),NOSPLIT,$0-32
	JMP	·CountBitsUint32sPopCnt(SB)

// func CountBitsStringPopCnt(s string) (ret int)
//
// Stores in ret the total number of set bits in the bytes of s.
// A negative or zero length yields 0.
//
// Arguments (offsets from FP):
//	s_base+0	data pointer
//	s_len+8		length in bytes
//	ret+16		result
//
// Register roles:
//	DI		read cursor
//	CX		bytes still to process
//	AX, R9, BX, DX	four independent partial sums in the unrolled loop;
//			AX is also the final total
//	R10-R13		per-quadword popcounts
//
// No CPU feature check is performed here; POPCNTQ is executed
// unconditionally once the length test passes.
TEXT ·CountBitsStringPopCnt(SB),NOSPLIT,$0-24
	XORQ	AX, AX			// ret = 0
	MOVQ	s_len+8(FP), CX		// CX = len(s)

test_negative_slice_len:
	MOVQ	CX, BX			// len(s) < 0  <=>  bit 63 of len(s) is set
	SHRQ	$63, BX
	JNZ	done

	MOVQ	s_base+0(FP), DI	// DI = data pointer

	CMPQ	CX, $32			// fewer than 32 bytes: no unrolled pass
	JL	unrolled_loop_skip

unrolled_loop_setup:
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX

unrolled_loop:				// 4 quadwords (32 bytes) per iteration
	SUBQ	$32, CX

	POPCNTQ	0(DI), R10
	ADDQ	R10, R9
	POPCNTQ	8(DI), R11
	ADDQ	R11, AX
	POPCNTQ	16(DI), R12
	ADDQ	R12, BX
	POPCNTQ	24(DI), R13
	ADDQ	R13, DX

	ADDQ	$32, DI
	CMPQ	CX, $32			// another full 32 bytes left?
	JGE	unrolled_loop

unrolled_loop_done:
	ADDQ	R9, AX			// AX = (AX + R9) + (BX + DX)
	ADDQ	BX, DX
	ADDQ	DX, AX

unrolled_loop_skip:
	TESTQ	CX, CX			// nothing left over
	JZ	done

remainder_loop:				// one byte per iteration, CX > 0 on entry
	MOVBQZX	0(DI), DX		// zero-extending load, no partial-register write
	POPCNTQ	DX, BX
	ADDQ	BX, AX

	INCQ	DI
	DECQ	CX
	JNZ	remainder_loop

done:
	MOVQ	AX, ret+16(FP)
	RET