sha256blockSha_amd64.s 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. //+build !noasm,!appengine
  2. // SHA intrinsic version of SHA256
  3. // Kristofer Peterson, (C) 2018.
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License");
  6. // you may not use this file except in compliance with the License.
  7. // You may obtain a copy of the License at
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS,
  13. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. // See the License for the specific language governing permissions and
  15. // limitations under the License.
  16. //
  17. #include "textflag.h"
  // K<> holds the 64 32-bit SHA-256 round constants, 256 bytes in total.
  // The table is laid out as sixteen 16-byte rows; blockSha loads row n with
  // MOVO n*16(BX), X0 and uses it for rounds 4n .. 4n+3.

  // row 0: rounds 0-3
  DATA K<>+0(SB)/4, $0x428a2f98
  DATA K<>+4(SB)/4, $0x71374491
  DATA K<>+8(SB)/4, $0xb5c0fbcf
  DATA K<>+12(SB)/4, $0xe9b5dba5
  // row 1: rounds 4-7
  DATA K<>+16(SB)/4, $0x3956c25b
  DATA K<>+20(SB)/4, $0x59f111f1
  DATA K<>+24(SB)/4, $0x923f82a4
  DATA K<>+28(SB)/4, $0xab1c5ed5
  // row 2: rounds 8-11
  DATA K<>+32(SB)/4, $0xd807aa98
  DATA K<>+36(SB)/4, $0x12835b01
  DATA K<>+40(SB)/4, $0x243185be
  DATA K<>+44(SB)/4, $0x550c7dc3
  // row 3: rounds 12-15
  DATA K<>+48(SB)/4, $0x72be5d74
  DATA K<>+52(SB)/4, $0x80deb1fe
  DATA K<>+56(SB)/4, $0x9bdc06a7
  DATA K<>+60(SB)/4, $0xc19bf174
  // row 4: rounds 16-19
  DATA K<>+64(SB)/4, $0xe49b69c1
  DATA K<>+68(SB)/4, $0xefbe4786
  DATA K<>+72(SB)/4, $0x0fc19dc6
  DATA K<>+76(SB)/4, $0x240ca1cc
  // row 5: rounds 20-23
  DATA K<>+80(SB)/4, $0x2de92c6f
  DATA K<>+84(SB)/4, $0x4a7484aa
  DATA K<>+88(SB)/4, $0x5cb0a9dc
  DATA K<>+92(SB)/4, $0x76f988da
  // row 6: rounds 24-27
  DATA K<>+96(SB)/4, $0x983e5152
  DATA K<>+100(SB)/4, $0xa831c66d
  DATA K<>+104(SB)/4, $0xb00327c8
  DATA K<>+108(SB)/4, $0xbf597fc7
  // row 7: rounds 28-31
  DATA K<>+112(SB)/4, $0xc6e00bf3
  DATA K<>+116(SB)/4, $0xd5a79147
  DATA K<>+120(SB)/4, $0x06ca6351
  DATA K<>+124(SB)/4, $0x14292967
  // row 8: rounds 32-35
  DATA K<>+128(SB)/4, $0x27b70a85
  DATA K<>+132(SB)/4, $0x2e1b2138
  DATA K<>+136(SB)/4, $0x4d2c6dfc
  DATA K<>+140(SB)/4, $0x53380d13
  // row 9: rounds 36-39
  DATA K<>+144(SB)/4, $0x650a7354
  DATA K<>+148(SB)/4, $0x766a0abb
  DATA K<>+152(SB)/4, $0x81c2c92e
  DATA K<>+156(SB)/4, $0x92722c85
  // row 10: rounds 40-43
  DATA K<>+160(SB)/4, $0xa2bfe8a1
  DATA K<>+164(SB)/4, $0xa81a664b
  DATA K<>+168(SB)/4, $0xc24b8b70
  DATA K<>+172(SB)/4, $0xc76c51a3
  // row 11: rounds 44-47
  DATA K<>+176(SB)/4, $0xd192e819
  DATA K<>+180(SB)/4, $0xd6990624
  DATA K<>+184(SB)/4, $0xf40e3585
  DATA K<>+188(SB)/4, $0x106aa070
  // row 12: rounds 48-51
  DATA K<>+192(SB)/4, $0x19a4c116
  DATA K<>+196(SB)/4, $0x1e376c08
  DATA K<>+200(SB)/4, $0x2748774c
  DATA K<>+204(SB)/4, $0x34b0bcb5
  // row 13: rounds 52-55
  DATA K<>+208(SB)/4, $0x391c0cb3
  DATA K<>+212(SB)/4, $0x4ed8aa4a
  DATA K<>+216(SB)/4, $0x5b9cca4f
  DATA K<>+220(SB)/4, $0x682e6ff3
  // row 14: rounds 56-59
  DATA K<>+224(SB)/4, $0x748f82ee
  DATA K<>+228(SB)/4, $0x78a5636f
  DATA K<>+232(SB)/4, $0x84c87814
  DATA K<>+236(SB)/4, $0x8cc70208
  // row 15: rounds 60-63
  DATA K<>+240(SB)/4, $0x90befffa
  DATA K<>+244(SB)/4, $0xa4506ceb
  DATA K<>+248(SB)/4, $0xbef9a3f7
  DATA K<>+252(SB)/4, $0xc67178f2

  GLOBL K<>(SB), NOPTR|RODATA, $256
  // SHUF_MASK<> is the PSHUFB control vector blockSha keeps in X15. It reverses
  // the byte order inside each of the four 32-bit lanes, converting the
  // big-endian message words to the little-endian lane format. It is written
  // here as four 32-bit entries, one per lane; each entry is stored
  // little-endian, so lane 0 selects source bytes 3,2,1,0 and so on.
  DATA SHUF_MASK<>+0(SB)/4, $0x00010203
  DATA SHUF_MASK<>+4(SB)/4, $0x04050607
  DATA SHUF_MASK<>+8(SB)/4, $0x08090a0b
  DATA SHUF_MASK<>+12(SB)/4, $0x0c0d0e0f
  GLOBL SHUF_MASK<>(SB), NOPTR|RODATA, $16
  // blockSha(h, message)
  //
  // Folds every complete 64-byte block of message into the eight-word SHA-256
  // state that h points to, using the x86 SHA extensions. Bytes after the last
  // complete block are ignored. If message holds fewer than 64 bytes the
  // function returns immediately and h is left untouched; this also covers a
  // nil/empty slice, for which the end-pointer arithmetic below would wrap
  // around and the loop would read from an invalid address.
  //
  // Arguments (Go stack ABI, 32 bytes of arguments, no locals, no results):
  //   h+0(FP)             pointer to the 32-byte hash state (two 16-byte halves)
  //   message_base+8(FP)  pointer to the first message byte
  //   message_len+16(FP)  message length in bytes
  //   (message capacity at 24(FP) is not read)
  // NOTE(review): the matching Go stub is presumably
  //   func blockSha(h *[8]uint32, message []uint8)
  // -- confirm against the Go declaration.
  //
  // SHA256RNDS2, SHA256MSG1, SHA256MSG2, PALIGNR and PBLENDW are emitted as raw
  // opcode bytes via LONG/WORD; the intended instruction is given in Intel
  // operand order (destination first) in the comment beside each encoding.
  // Nothing in this routine checks that the CPU supports these instructions;
  // the caller is responsible for that.
  //
  // Register Usage
  // BX   base address of constant table K<> (constant)
  // DX   hash state pointer h (constant)
  // SI   address of the block currently being processed
  // DI   message_base + message_len - 64: last address at which a complete
  //      block can start (constant)
  // X0   round constants + message words; implicit operand of SHA256RNDS2
  // X1   scratch
  // X2   working hash state // ABEF
  // X3   working hash state // CDGH
  // X4   first 16 bytes of block
  // X5   second 16 bytes of block
  // X6   third 16 bytes of block
  // X7   fourth 16 bytes of block
  // X12  saved hash state // ABEF
  // X13  saved hash state // CDGH
  // X15  data shuffle mask (constant)

  // ROUNDabc performs four rounds plus message-schedule work. On entry X0 must
  // hold the four round constants for these rounds. Digits name the message
  // registers involved: "a" (reversed operand order aside) is the register
  // receiving SHA256MSG1, "b" supplies this step's four schedule words, and
  // "c" is the register completed by PADDL + SHA256MSG2 for later rounds.
  #define ROUND456 \
      PADDL X5, X0 \
      LONG $0xdacb380f \ // SHA256RNDS2 XMM3, XMM2
      MOVO X5, X1 \
      LONG $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1, XMM4, 4
      PADDL X1, X6 \
      LONG $0xf5cd380f \ // SHA256MSG2 XMM6, XMM5
      PSHUFD $0x4e, X0, X0 \
      LONG $0xd3cb380f \ // SHA256RNDS2 XMM2, XMM3
      LONG $0xe5cc380f // SHA256MSG1 XMM4, XMM5

  #define ROUND567 \
      PADDL X6, X0 \
      LONG $0xdacb380f \ // SHA256RNDS2 XMM3, XMM2
      MOVO X6, X1 \
      LONG $0x0f3a0f66; WORD $0x04cd \ // PALIGNR XMM1, XMM5, 4
      PADDL X1, X7 \
      LONG $0xfecd380f \ // SHA256MSG2 XMM7, XMM6
      PSHUFD $0x4e, X0, X0 \
      LONG $0xd3cb380f \ // SHA256RNDS2 XMM2, XMM3
      LONG $0xeecc380f // SHA256MSG1 XMM5, XMM6

  #define ROUND674 \
      PADDL X7, X0 \
      LONG $0xdacb380f \ // SHA256RNDS2 XMM3, XMM2
      MOVO X7, X1 \
      LONG $0x0f3a0f66; WORD $0x04ce \ // PALIGNR XMM1, XMM6, 4
      PADDL X1, X4 \
      LONG $0xe7cd380f \ // SHA256MSG2 XMM4, XMM7
      PSHUFD $0x4e, X0, X0 \
      LONG $0xd3cb380f \ // SHA256RNDS2 XMM2, XMM3
      LONG $0xf7cc380f // SHA256MSG1 XMM6, XMM7

  #define ROUND745 \
      PADDL X4, X0 \
      LONG $0xdacb380f \ // SHA256RNDS2 XMM3, XMM2
      MOVO X4, X1 \
      LONG $0x0f3a0f66; WORD $0x04cf \ // PALIGNR XMM1, XMM7, 4
      PADDL X1, X5 \
      LONG $0xeccd380f \ // SHA256MSG2 XMM5, XMM4
      PSHUFD $0x4e, X0, X0 \
      LONG $0xd3cb380f \ // SHA256RNDS2 XMM2, XMM3
      LONG $0xfccc380f // SHA256MSG1 XMM7, XMM4

  TEXT ·blockSha(SB), NOSPLIT, $0-32
      MOVQ h+0(FP), DX
      MOVQ message_base+8(FP), SI
      MOVQ message_len+16(FP), DI

      // No complete block: return before forming the end pointer. Without
      // this check base+len-64 can wrap (e.g. nil base, zero length) and the
      // unsigned loop test below would then admit an out-of-bounds read.
      CMPQ DI, $64
      JB   done

      // DI = last address at which a complete 64-byte block can start.
      LEAQ -64(SI)(DI*1), DI

      // Load the state and transpose it from memory order (first 16 bytes,
      // second 16 bytes) into the ABEF / CDGH register layout: interleave the
      // dwords of the two halves, then swap the outer dwords of each result.
      MOVOU (DX), X2
      MOVOU 16(DX), X1
      MOVO X2, X3
      PUNPCKLLQ X1, X2
      PUNPCKHLQ X1, X3
      PSHUFD $0x27, X2, X2
      PSHUFD $0x27, X3, X3

      MOVO SHUF_MASK<>(SB), X15
      LEAQ K<>(SB), BX

      // At least one complete block is available here, so the loop is entered
      // unconditionally and tested at the bottom.
  blockLoop:
      // keep the incoming state for the feed-forward addition after round 63
      MOVO X2, X12
      MOVO X3, X13

      // load block and byte-swap each 32-bit word
      MOVOU (SI), X4
      MOVOU 16(SI), X5
      MOVOU 32(SI), X6
      MOVOU 48(SI), X7
      PSHUFB X15, X4
      PSHUFB X15, X5
      PSHUFB X15, X6
      PSHUFB X15, X7

      // Each SHA256RNDS2 consumes the low two dwords of X0; PSHUFD $0x4e swaps
      // the 64-bit halves of X0 so the second SHA256RNDS2 of a group sees the
      // other two.

      // rounds 0-3
      MOVO (BX), X0
      PADDL X4, X0
      LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
      PSHUFD $0x4e, X0, X0
      LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3

      // rounds 4-7
      MOVO 1*16(BX), X0
      PADDL X5, X0
      LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
      PSHUFD $0x4e, X0, X0
      LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3
      LONG $0xe5cc380f // SHA256MSG1 XMM4, XMM5

      // rounds 8-11
      MOVO 2*16(BX), X0
      PADDL X6, X0
      LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
      PSHUFD $0x4e, X0, X0
      LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3
      LONG $0xeecc380f // SHA256MSG1 XMM5, XMM6

      MOVO 3*16(BX), X0; ROUND674 // rounds 12-15
      MOVO 4*16(BX), X0; ROUND745 // rounds 16-19
      MOVO 5*16(BX), X0; ROUND456 // rounds 20-23
      MOVO 6*16(BX), X0; ROUND567 // rounds 24-27
      MOVO 7*16(BX), X0; ROUND674 // rounds 28-31
      MOVO 8*16(BX), X0; ROUND745 // rounds 32-35
      MOVO 9*16(BX), X0; ROUND456 // rounds 36-39
      MOVO 10*16(BX), X0; ROUND567 // rounds 40-43
      MOVO 11*16(BX), X0; ROUND674 // rounds 44-47
      MOVO 12*16(BX), X0; ROUND745 // rounds 48-51

      // rounds 52-55 (same as ROUND456 without the trailing SHA256MSG1)
      MOVO 13*16(BX), X0
      PADDL X5, X0
      LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
      MOVO X5, X1
      LONG $0x0f3a0f66; WORD $0x04cc // PALIGNR XMM1, XMM4, 4
      PADDL X1, X6
      LONG $0xf5cd380f // SHA256MSG2 XMM6, XMM5
      PSHUFD $0x4e, X0, X0
      LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3

      // rounds 56-59 (same as ROUND567 without the trailing SHA256MSG1)
      MOVO 14*16(BX), X0
      PADDL X6, X0
      LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
      MOVO X6, X1
      LONG $0x0f3a0f66; WORD $0x04cd // PALIGNR XMM1, XMM5, 4
      PADDL X1, X7
      LONG $0xfecd380f // SHA256MSG2 XMM7, XMM6
      PSHUFD $0x4e, X0, X0
      LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3

      // rounds 60-63 (no further message scheduling needed)
      MOVO 15*16(BX), X0
      PADDL X7, X0
      LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
      PSHUFD $0x4e, X0, X0
      LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3

      // feed-forward: add the state saved at the top of the loop
      PADDL X12, X2
      PADDL X13, X3

      // advance to the next block; continue while a complete block remains
      ADDQ $64, SI
      CMPQ SI, DI
      JBE blockLoop

      // Transpose ABEF / CDGH back into memory order and store the result.
      PSHUFD $0x4e, X3, X0
      LONG $0x0e3a0f66; WORD $0xf0c2 // PBLENDW XMM0, XMM2, 0xf0
      PSHUFD $0x4e, X2, X1
      LONG $0x0e3a0f66; WORD $0x0fcb // PBLENDW XMM1, XMM3, 0x0f
      PSHUFD $0x1b, X0, X0
      PSHUFD $0x1b, X1, X1
      MOVOU X0, (DX)
      MOVOU X1, 16(DX)

  done:
      RET