123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244 |
- // Copyright 2017 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- //go:build amd64 && gc && !purego
- // +build amd64,gc,!purego
- #include "textflag.h"
- DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
- DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
- GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
- DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
- DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
- GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
- #define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
- MOVO v4, t1; \
- MOVO v5, v4; \
- MOVO t1, v5; \
- MOVO v6, t1; \
- PUNPCKLQDQ v6, t2; \
- PUNPCKHQDQ v7, v6; \
- PUNPCKHQDQ t2, v6; \
- PUNPCKLQDQ v7, t2; \
- MOVO t1, v7; \
- MOVO v2, t1; \
- PUNPCKHQDQ t2, v7; \
- PUNPCKLQDQ v3, t2; \
- PUNPCKHQDQ t2, v2; \
- PUNPCKLQDQ t1, t2; \
- PUNPCKHQDQ t2, v3
- #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
- MOVO v4, t1; \
- MOVO v5, v4; \
- MOVO t1, v5; \
- MOVO v2, t1; \
- PUNPCKLQDQ v2, t2; \
- PUNPCKHQDQ v3, v2; \
- PUNPCKHQDQ t2, v2; \
- PUNPCKLQDQ v3, t2; \
- MOVO t1, v3; \
- MOVO v6, t1; \
- PUNPCKHQDQ t2, v3; \
- PUNPCKLQDQ v7, t2; \
- PUNPCKHQDQ t2, v6; \
- PUNPCKLQDQ t1, t2; \
- PUNPCKHQDQ t2, v7
- #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
- MOVO v0, t0; \
- PMULULQ v2, t0; \
- PADDQ v2, v0; \
- PADDQ t0, v0; \
- PADDQ t0, v0; \
- PXOR v0, v6; \
- PSHUFD $0xB1, v6, v6; \
- MOVO v4, t0; \
- PMULULQ v6, t0; \
- PADDQ v6, v4; \
- PADDQ t0, v4; \
- PADDQ t0, v4; \
- PXOR v4, v2; \
- PSHUFB c40, v2; \
- MOVO v0, t0; \
- PMULULQ v2, t0; \
- PADDQ v2, v0; \
- PADDQ t0, v0; \
- PADDQ t0, v0; \
- PXOR v0, v6; \
- PSHUFB c48, v6; \
- MOVO v4, t0; \
- PMULULQ v6, t0; \
- PADDQ v6, v4; \
- PADDQ t0, v4; \
- PADDQ t0, v4; \
- PXOR v4, v2; \
- MOVO v2, t0; \
- PADDQ v2, t0; \
- PSRLQ $63, v2; \
- PXOR t0, v2; \
- MOVO v1, t0; \
- PMULULQ v3, t0; \
- PADDQ v3, v1; \
- PADDQ t0, v1; \
- PADDQ t0, v1; \
- PXOR v1, v7; \
- PSHUFD $0xB1, v7, v7; \
- MOVO v5, t0; \
- PMULULQ v7, t0; \
- PADDQ v7, v5; \
- PADDQ t0, v5; \
- PADDQ t0, v5; \
- PXOR v5, v3; \
- PSHUFB c40, v3; \
- MOVO v1, t0; \
- PMULULQ v3, t0; \
- PADDQ v3, v1; \
- PADDQ t0, v1; \
- PADDQ t0, v1; \
- PXOR v1, v7; \
- PSHUFB c48, v7; \
- MOVO v5, t0; \
- PMULULQ v7, t0; \
- PADDQ v7, v5; \
- PADDQ t0, v5; \
- PADDQ t0, v5; \
- PXOR v5, v3; \
- MOVO v3, t0; \
- PADDQ v3, t0; \
- PSRLQ $63, v3; \
- PXOR t0, v3
- #define LOAD_MSG_0(block, off) \
- MOVOU 8*(off+0)(block), X0; \
- MOVOU 8*(off+2)(block), X1; \
- MOVOU 8*(off+4)(block), X2; \
- MOVOU 8*(off+6)(block), X3; \
- MOVOU 8*(off+8)(block), X4; \
- MOVOU 8*(off+10)(block), X5; \
- MOVOU 8*(off+12)(block), X6; \
- MOVOU 8*(off+14)(block), X7
- #define STORE_MSG_0(block, off) \
- MOVOU X0, 8*(off+0)(block); \
- MOVOU X1, 8*(off+2)(block); \
- MOVOU X2, 8*(off+4)(block); \
- MOVOU X3, 8*(off+6)(block); \
- MOVOU X4, 8*(off+8)(block); \
- MOVOU X5, 8*(off+10)(block); \
- MOVOU X6, 8*(off+12)(block); \
- MOVOU X7, 8*(off+14)(block)
- #define LOAD_MSG_1(block, off) \
- MOVOU 8*off+0*8(block), X0; \
- MOVOU 8*off+16*8(block), X1; \
- MOVOU 8*off+32*8(block), X2; \
- MOVOU 8*off+48*8(block), X3; \
- MOVOU 8*off+64*8(block), X4; \
- MOVOU 8*off+80*8(block), X5; \
- MOVOU 8*off+96*8(block), X6; \
- MOVOU 8*off+112*8(block), X7
- #define STORE_MSG_1(block, off) \
- MOVOU X0, 8*off+0*8(block); \
- MOVOU X1, 8*off+16*8(block); \
- MOVOU X2, 8*off+32*8(block); \
- MOVOU X3, 8*off+48*8(block); \
- MOVOU X4, 8*off+64*8(block); \
- MOVOU X5, 8*off+80*8(block); \
- MOVOU X6, 8*off+96*8(block); \
- MOVOU X7, 8*off+112*8(block)
- #define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
- LOAD_MSG_0(block, off); \
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
- SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
- STORE_MSG_0(block, off)
- #define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
- LOAD_MSG_1(block, off); \
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
- SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
- STORE_MSG_1(block, off)
- // func blamkaSSE4(b *block)
- TEXT ·blamkaSSE4(SB), 4, $0-8
- MOVQ b+0(FP), AX
- MOVOU ·c40<>(SB), X10
- MOVOU ·c48<>(SB), X11
- BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
- RET
- // func mixBlocksSSE2(out, a, b, c *block)
- TEXT ·mixBlocksSSE2(SB), 4, $0-32
- MOVQ out+0(FP), DX
- MOVQ a+8(FP), AX
- MOVQ b+16(FP), BX
- MOVQ a+24(FP), CX
- MOVQ $128, BP
- loop:
- MOVOU 0(AX), X0
- MOVOU 0(BX), X1
- MOVOU 0(CX), X2
- PXOR X1, X0
- PXOR X2, X0
- MOVOU X0, 0(DX)
- ADDQ $16, AX
- ADDQ $16, BX
- ADDQ $16, CX
- ADDQ $16, DX
- SUBQ $2, BP
- JA loop
- RET
- // func xorBlocksSSE2(out, a, b, c *block)
- TEXT ·xorBlocksSSE2(SB), 4, $0-32
- MOVQ out+0(FP), DX
- MOVQ a+8(FP), AX
- MOVQ b+16(FP), BX
- MOVQ a+24(FP), CX
- MOVQ $128, BP
- loop:
- MOVOU 0(AX), X0
- MOVOU 0(BX), X1
- MOVOU 0(CX), X2
- MOVOU 0(DX), X3
- PXOR X1, X0
- PXOR X2, X0
- PXOR X3, X0
- MOVOU X0, 0(DX)
- ADDQ $16, AX
- ADDQ $16, BX
- ADDQ $16, CX
- ADDQ $16, DX
- SUBQ $2, BP
- JA loop
- RET
|