// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego
// +build amd64,gc,!purego

#include "textflag.h"
  7. DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
  8. DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
  9. GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
  10. DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
  11. DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
  12. GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
  13. #define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
  14. MOVO v4, t1; \
  15. MOVO v5, v4; \
  16. MOVO t1, v5; \
  17. MOVO v6, t1; \
  18. PUNPCKLQDQ v6, t2; \
  19. PUNPCKHQDQ v7, v6; \
  20. PUNPCKHQDQ t2, v6; \
  21. PUNPCKLQDQ v7, t2; \
  22. MOVO t1, v7; \
  23. MOVO v2, t1; \
  24. PUNPCKHQDQ t2, v7; \
  25. PUNPCKLQDQ v3, t2; \
  26. PUNPCKHQDQ t2, v2; \
  27. PUNPCKLQDQ t1, t2; \
  28. PUNPCKHQDQ t2, v3
  29. #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
  30. MOVO v4, t1; \
  31. MOVO v5, v4; \
  32. MOVO t1, v5; \
  33. MOVO v2, t1; \
  34. PUNPCKLQDQ v2, t2; \
  35. PUNPCKHQDQ v3, v2; \
  36. PUNPCKHQDQ t2, v2; \
  37. PUNPCKLQDQ v3, t2; \
  38. MOVO t1, v3; \
  39. MOVO v6, t1; \
  40. PUNPCKHQDQ t2, v3; \
  41. PUNPCKLQDQ v7, t2; \
  42. PUNPCKHQDQ t2, v6; \
  43. PUNPCKLQDQ t1, t2; \
  44. PUNPCKHQDQ t2, v7
  45. #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
  46. MOVO v0, t0; \
  47. PMULULQ v2, t0; \
  48. PADDQ v2, v0; \
  49. PADDQ t0, v0; \
  50. PADDQ t0, v0; \
  51. PXOR v0, v6; \
  52. PSHUFD $0xB1, v6, v6; \
  53. MOVO v4, t0; \
  54. PMULULQ v6, t0; \
  55. PADDQ v6, v4; \
  56. PADDQ t0, v4; \
  57. PADDQ t0, v4; \
  58. PXOR v4, v2; \
  59. PSHUFB c40, v2; \
  60. MOVO v0, t0; \
  61. PMULULQ v2, t0; \
  62. PADDQ v2, v0; \
  63. PADDQ t0, v0; \
  64. PADDQ t0, v0; \
  65. PXOR v0, v6; \
  66. PSHUFB c48, v6; \
  67. MOVO v4, t0; \
  68. PMULULQ v6, t0; \
  69. PADDQ v6, v4; \
  70. PADDQ t0, v4; \
  71. PADDQ t0, v4; \
  72. PXOR v4, v2; \
  73. MOVO v2, t0; \
  74. PADDQ v2, t0; \
  75. PSRLQ $63, v2; \
  76. PXOR t0, v2; \
  77. MOVO v1, t0; \
  78. PMULULQ v3, t0; \
  79. PADDQ v3, v1; \
  80. PADDQ t0, v1; \
  81. PADDQ t0, v1; \
  82. PXOR v1, v7; \
  83. PSHUFD $0xB1, v7, v7; \
  84. MOVO v5, t0; \
  85. PMULULQ v7, t0; \
  86. PADDQ v7, v5; \
  87. PADDQ t0, v5; \
  88. PADDQ t0, v5; \
  89. PXOR v5, v3; \
  90. PSHUFB c40, v3; \
  91. MOVO v1, t0; \
  92. PMULULQ v3, t0; \
  93. PADDQ v3, v1; \
  94. PADDQ t0, v1; \
  95. PADDQ t0, v1; \
  96. PXOR v1, v7; \
  97. PSHUFB c48, v7; \
  98. MOVO v5, t0; \
  99. PMULULQ v7, t0; \
  100. PADDQ v7, v5; \
  101. PADDQ t0, v5; \
  102. PADDQ t0, v5; \
  103. PXOR v5, v3; \
  104. MOVO v3, t0; \
  105. PADDQ v3, t0; \
  106. PSRLQ $63, v3; \
  107. PXOR t0, v3
  108. #define LOAD_MSG_0(block, off) \
  109. MOVOU 8*(off+0)(block), X0; \
  110. MOVOU 8*(off+2)(block), X1; \
  111. MOVOU 8*(off+4)(block), X2; \
  112. MOVOU 8*(off+6)(block), X3; \
  113. MOVOU 8*(off+8)(block), X4; \
  114. MOVOU 8*(off+10)(block), X5; \
  115. MOVOU 8*(off+12)(block), X6; \
  116. MOVOU 8*(off+14)(block), X7
  117. #define STORE_MSG_0(block, off) \
  118. MOVOU X0, 8*(off+0)(block); \
  119. MOVOU X1, 8*(off+2)(block); \
  120. MOVOU X2, 8*(off+4)(block); \
  121. MOVOU X3, 8*(off+6)(block); \
  122. MOVOU X4, 8*(off+8)(block); \
  123. MOVOU X5, 8*(off+10)(block); \
  124. MOVOU X6, 8*(off+12)(block); \
  125. MOVOU X7, 8*(off+14)(block)
  126. #define LOAD_MSG_1(block, off) \
  127. MOVOU 8*off+0*8(block), X0; \
  128. MOVOU 8*off+16*8(block), X1; \
  129. MOVOU 8*off+32*8(block), X2; \
  130. MOVOU 8*off+48*8(block), X3; \
  131. MOVOU 8*off+64*8(block), X4; \
  132. MOVOU 8*off+80*8(block), X5; \
  133. MOVOU 8*off+96*8(block), X6; \
  134. MOVOU 8*off+112*8(block), X7
  135. #define STORE_MSG_1(block, off) \
  136. MOVOU X0, 8*off+0*8(block); \
  137. MOVOU X1, 8*off+16*8(block); \
  138. MOVOU X2, 8*off+32*8(block); \
  139. MOVOU X3, 8*off+48*8(block); \
  140. MOVOU X4, 8*off+64*8(block); \
  141. MOVOU X5, 8*off+80*8(block); \
  142. MOVOU X6, 8*off+96*8(block); \
  143. MOVOU X7, 8*off+112*8(block)
  144. #define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
  145. LOAD_MSG_0(block, off); \
  146. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
  147. SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
  148. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
  149. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
  150. STORE_MSG_0(block, off)
  151. #define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
  152. LOAD_MSG_1(block, off); \
  153. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
  154. SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
  155. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
  156. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
  157. STORE_MSG_1(block, off)
  158. // func blamkaSSE4(b *block)
  159. TEXT ·blamkaSSE4(SB), 4, $0-8
  160. MOVQ b+0(FP), AX
  161. MOVOU ·c40<>(SB), X10
  162. MOVOU ·c48<>(SB), X11
  163. BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
  164. BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
  165. BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
  166. BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
  167. BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
  168. BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
  169. BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
  170. BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
  171. BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
  172. BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
  173. BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
  174. BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
  175. BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
  176. BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
  177. BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
  178. BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
  179. RET
  180. // func mixBlocksSSE2(out, a, b, c *block)
  181. TEXT ·mixBlocksSSE2(SB), 4, $0-32
  182. MOVQ out+0(FP), DX
  183. MOVQ a+8(FP), AX
  184. MOVQ b+16(FP), BX
  185. MOVQ a+24(FP), CX
  186. MOVQ $128, BP
  187. loop:
  188. MOVOU 0(AX), X0
  189. MOVOU 0(BX), X1
  190. MOVOU 0(CX), X2
  191. PXOR X1, X0
  192. PXOR X2, X0
  193. MOVOU X0, 0(DX)
  194. ADDQ $16, AX
  195. ADDQ $16, BX
  196. ADDQ $16, CX
  197. ADDQ $16, DX
  198. SUBQ $2, BP
  199. JA loop
  200. RET
  201. // func xorBlocksSSE2(out, a, b, c *block)
  202. TEXT ·xorBlocksSSE2(SB), 4, $0-32
  203. MOVQ out+0(FP), DX
  204. MOVQ a+8(FP), AX
  205. MOVQ b+16(FP), BX
  206. MOVQ a+24(FP), CX
  207. MOVQ $128, BP
  208. loop:
  209. MOVOU 0(AX), X0
  210. MOVOU 0(BX), X1
  211. MOVOU 0(CX), X2
  212. MOVOU 0(DX), X3
  213. PXOR X1, X0
  214. PXOR X2, X0
  215. PXOR X3, X0
  216. MOVOU X0, 0(DX)
  217. ADDQ $16, AX
  218. ADDQ $16, BX
  219. ADDQ $16, CX
  220. ADDQ $16, DX
  221. SUBQ $2, BP
  222. JA loop
  223. RET