sha256blockAvx_amd64.s

//+build !noasm,!appengine

// SHA256 implementation for AVX
//
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// This code is based on an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
//
// together with the reference implementation from the following authors:
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// For Golang it has been converted to Plan 9 assembly with the help of
// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
// equivalents
//

#include "textflag.h"
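
// ROTATE_XS cycles the message-schedule registers X4..X7 one position, so
// that after every FOUR_ROUNDS_AND_SCHED the 16 most recent message words
// are again held in X4..X7, with X4 the oldest group and X7 the four words
// just computed.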
#define ROTATE_XS \
    MOVOU X4, X15 \
    MOVOU X5, X4  \
    MOVOU X6, X5  \
    MOVOU X7, X6  \
    MOVOU X15, X7

// compute s0 four at a time and s1 two at a time
// compute W[-16] + W[-7] 4 at a time
#define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \
    MOVL e, R13 \ // y0 = e
    ROLL $18, R13 \ // y0 = e >> (25-11)
    MOVL a, R14 \ // y1 = a
    LONG $0x0f41e3c4; WORD $0x04c6 \ // VPALIGNR XMM0,XMM7,XMM6,0x4 /* XTMP0 = W[-7] */
    ROLL $23, R14 \ // y1 = a >> (22-13)
    XORL e, R13 \ // y0 = e ^ (e >> (25-11))
    MOVL f, R15 \ // y2 = f
    ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
    XORL a, R14 \ // y1 = a ^ (a >> (22-13))
    XORL g, R15 \ // y2 = f^g
    LONG $0xc4fef9c5 \ // VPADDD XMM0,XMM0,XMM4 /* XTMP0 = W[-7] + W[-16] */
    XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ANDL e, R15 \ // y2 = (f^g)&e
    ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
    \
    \ // compute s0
    \
    LONG $0x0f51e3c4; WORD $0x04cc \ // VPALIGNR XMM1,XMM5,XMM4,0x4 /* XTMP1 = W[-15] */
    XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
    ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    ADDL R13, R15 \ // y2 = S1 + CH
    ADDL _xfer+48(FP), R15 \ // y2 = k + w + S1 + CH
    MOVL a, R13 \ // y0 = a
    ADDL R15, h \ // h = h + S1 + CH + k + w
    \ // ROTATE_ARGS
    MOVL a, R15 \ // y2 = a
    LONG $0xd172e9c5; BYTE $0x07 \ // VPSRLD XMM2,XMM1,0x7 /* W[-15] >> 7 */
    ORL c, R13 \ // y0 = a|c
    ADDL h, d \ // d = d + h + S1 + CH + k + w
    ANDL c, R15 \ // y2 = a&c
    LONG $0xf172e1c5; BYTE $0x19 \ // VPSLLD XMM3,XMM1,0x19 /* W[-15] << 25 */
    ANDL b, R13 \ // y0 = (a|c)&b
    ADDL R14, h \ // h = h + S1 + CH + k + w + S0
    LONG $0xdaebe1c5 \ // VPOR XMM3,XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 */
    ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
    ADDL R13, h \ // h = h + S1 + CH + k + w + S0 + MAJ
    \ // ROTATE_ARGS
    MOVL d, R13 \ // y0 = e
    MOVL h, R14 \ // y1 = a
    ROLL $18, R13 \ // y0 = e >> (25-11)
    XORL d, R13 \ // y0 = e ^ (e >> (25-11))
    MOVL e, R15 \ // y2 = f
    ROLL $23, R14 \ // y1 = a >> (22-13)
    LONG $0xd172e9c5; BYTE $0x12 \ // VPSRLD XMM2,XMM1,0x12 /* W[-15] >> 18 */
    XORL h, R14 \ // y1 = a ^ (a >> (22-13))
    ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
    XORL f, R15 \ // y2 = f^g
    LONG $0xd172b9c5; BYTE $0x03 \ // VPSRLD XMM8,XMM1,0x3 /* XTMP4 = W[-15] >> 3 */
    ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
    XORL d, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ANDL d, R15 \ // y2 = (f^g)&e
    ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    LONG $0xf172f1c5; BYTE $0x0e \ // VPSLLD XMM1,XMM1,0xe /* W[-15] << 14 */
    XORL h, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    XORL f, R15 \ // y2 = CH = ((f^g)&e)^g
    LONG $0xd9efe1c5 \ // VPXOR XMM3,XMM3,XMM1 /* W[-15] MY_ROR 7 ^ W[-15] << 14 */
    ADDL R13, R15 \ // y2 = S1 + CH
    ADDL _xfer+52(FP), R15 \ // y2 = k + w + S1 + CH
    ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    LONG $0xdaefe1c5 \ // VPXOR XMM3,XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */
    MOVL h, R13 \ // y0 = a
    ADDL R15, g \ // h = h + S1 + CH + k + w
    MOVL h, R15 \ // y2 = a
    LONG $0xef61c1c4; BYTE $0xc8 \ // VPXOR XMM1,XMM3,XMM8 /* XTMP1 = s0 */
    ORL b, R13 \ // y0 = a|c
    ADDL g, c \ // d = d + h + S1 + CH + k + w
    ANDL b, R15 \ // y2 = a&c
    \
    \ // compute low s1
    \
    LONG $0xd770f9c5; BYTE $0xfa \ // VPSHUFD XMM2,XMM7,0xfa /* XTMP2 = W[-2] {BBAA} */
    ANDL a, R13 \ // y0 = (a|c)&b
    ADDL R14, g \ // h = h + S1 + CH + k + w + S0
    LONG $0xc1fef9c5 \ // VPADDD XMM0,XMM0,XMM1 /* XTMP0 = W[-16] + W[-7] + s0 */
    ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
    ADDL R13, g \ // h = h + S1 + CH + k + w + S0 + MAJ
    \ // ROTATE_ARGS
    MOVL c, R13 \ // y0 = e
    MOVL g, R14 \ // y1 = a
    ROLL $18, R13 \ // y0 = e >> (25-11)
    XORL c, R13 \ // y0 = e ^ (e >> (25-11))
    ROLL $23, R14 \ // y1 = a >> (22-13)
    MOVL d, R15 \ // y2 = f
    XORL g, R14 \ // y1 = a ^ (a >> (22-13))
    ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
    LONG $0xd272b9c5; BYTE $0x0a \ // VPSRLD XMM8,XMM2,0xa /* XTMP4 = W[-2] >> 10 {BBAA} */
    XORL e, R15 \ // y2 = f^g
    LONG $0xd273e1c5; BYTE $0x13 \ // VPSRLQ XMM3,XMM2,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */
    XORL c, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ANDL c, R15 \ // y2 = (f^g)&e
    LONG $0xd273e9c5; BYTE $0x11 \ // VPSRLQ XMM2,XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */
    ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
    XORL g, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    XORL e, R15 \ // y2 = CH = ((f^g)&e)^g
    ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    LONG $0xd3efe9c5 \ // VPXOR XMM2,XMM2,XMM3 /* XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA} */
    ADDL R13, R15 \ // y2 = S1 + CH
    ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    ADDL _xfer+56(FP), R15 \ // y2 = k + w + S1 + CH
    LONG $0xc2ef39c5 \ // VPXOR XMM8,XMM8,XMM2 /* XTMP4 = s1 {xBxA} */
    MOVL g, R13 \ // y0 = a
    ADDL R15, f \ // h = h + S1 + CH + k + w
    MOVL g, R15 \ // y2 = a
    LONG $0x003942c4; BYTE $0xc2 \ // VPSHUFB XMM8,XMM8,XMM10 /* XTMP4 = s1 {00BA} */
    ORL a, R13 \ // y0 = a|c
    ADDL f, b \ // d = d + h + S1 + CH + k + w
    ANDL a, R15 \ // y2 = a&c
    LONG $0xfe79c1c4; BYTE $0xc0 \ // VPADDD XMM0,XMM0,XMM8 /* XTMP0 = {..., ..., W[1], W[0]} */
    ANDL h, R13 \ // y0 = (a|c)&b
    ADDL R14, f \ // h = h + S1 + CH + k + w + S0
    \
    \ // compute high s1
    \
    LONG $0xd070f9c5; BYTE $0x50 \ // VPSHUFD XMM2,XMM0,0x50 /* XTMP2 = W[-2] {DDCC} */
    ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
    ADDL R13, f \ // h = h + S1 + CH + k + w + S0 + MAJ
    \ // ROTATE_ARGS
    MOVL b, R13 \ // y0 = e
    ROLL $18, R13 \ // y0 = e >> (25-11)
    MOVL f, R14 \ // y1 = a
    ROLL $23, R14 \ // y1 = a >> (22-13)
    XORL b, R13 \ // y0 = e ^ (e >> (25-11))
    MOVL c, R15 \ // y2 = f
    ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
    LONG $0xd272a1c5; BYTE $0x0a \ // VPSRLD XMM11,XMM2,0xa /* XTMP5 = W[-2] >> 10 {DDCC} */
    XORL f, R14 \ // y1 = a ^ (a >> (22-13))
    XORL d, R15 \ // y2 = f^g
    LONG $0xd273e1c5; BYTE $0x13 \ // VPSRLQ XMM3,XMM2,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */
    XORL b, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ANDL b, R15 \ // y2 = (f^g)&e
    ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
    LONG $0xd273e9c5; BYTE $0x11 \ // VPSRLQ XMM2,XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */
    XORL f, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    XORL d, R15 \ // y2 = CH = ((f^g)&e)^g
    LONG $0xd3efe9c5 \ // VPXOR XMM2,XMM2,XMM3 /* XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC} */
    ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    ADDL R13, R15 \ // y2 = S1 + CH
    ADDL _xfer+60(FP), R15 \ // y2 = k + w + S1 + CH
    LONG $0xdaef21c5 \ // VPXOR XMM11,XMM11,XMM2 /* XTMP5 = s1 {xDxC} */
    MOVL f, R13 \ // y0 = a
    ADDL R15, e \ // h = h + S1 + CH + k + w
    MOVL f, R15 \ // y2 = a
    LONG $0x002142c4; BYTE $0xdc \ // VPSHUFB XMM11,XMM11,XMM12 /* XTMP5 = s1 {DC00} */
    ORL h, R13 \ // y0 = a|c
    ADDL e, a \ // d = d + h + S1 + CH + k + w
    ANDL h, R15 \ // y2 = a&c
    LONG $0xe0fea1c5 \ // VPADDD XMM4,XMM11,XMM0 /* X0 = {W[3], W[2], W[1], W[0]} */
    ANDL g, R13 \ // y0 = (a|c)&b
    ADDL R14, e \ // h = h + S1 + CH + k + w + S0
    ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
    ADDL R13, e \ // h = h + S1 + CH + k + w + S0 + MAJ
    \ // ROTATE_ARGS
    ROTATE_XS
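
// DO_ROUND performs a single plain SHA256 round (no message scheduling).
// The pre-computed k + w value for the round is read from the stack slot at
// the given offset, where it was spilled from X9 by the VPADDD/MOVOU pair in
// the main loop.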
#define DO_ROUND(a, b, c, d, e, f, g, h, offset) \
    MOVL e, R13 \ // y0 = e
    ROLL $18, R13 \ // y0 = e >> (25-11)
    MOVL a, R14 \ // y1 = a
    XORL e, R13 \ // y0 = e ^ (e >> (25-11))
    ROLL $23, R14 \ // y1 = a >> (22-13)
    MOVL f, R15 \ // y2 = f
    XORL a, R14 \ // y1 = a ^ (a >> (22-13))
    ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
    XORL g, R15 \ // y2 = f^g
    XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
    ANDL e, R15 \ // y2 = (f^g)&e
    XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
    ADDL R13, R15 \ // y2 = S1 + CH
    ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    ADDL _xfer+offset(FP), R15 \ // y2 = k + w + S1 + CH
    MOVL a, R13 \ // y0 = a
    ADDL R15, h \ // h = h + S1 + CH + k + w
    MOVL a, R15 \ // y2 = a
    ORL c, R13 \ // y0 = a|c
    ADDL h, d \ // d = d + h + S1 + CH + k + w
    ANDL c, R15 \ // y2 = a&c
    ANDL b, R13 \ // y0 = (a|c)&b
    ADDL R14, h \ // h = h + S1 + CH + k + w + S0
    ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
    ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ

// func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
TEXT ·blockAvx(SB), 7, $0-80
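    // The four reserved uint64 arguments double as scratch space in the
    // caller's frame: reserved0/reserved1 (+48..+63) receive the 16 bytes of
    // k + w values spilled from X9, reserved2 (+64) the end-of-message
    // pointer, and reserved3 (+72) the pointer to the current 64-byte block.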
    MOVQ h+0(FP), SI             // SI: &h
    MOVQ message_base+24(FP), R8 // &message
    MOVQ message_len+32(FP), R9  // length of message
    CMPQ R9, $0
    JEQ  done_hash
    ADDQ R8, R9
    MOVQ R9, reserved2+64(FP) // store end of message

    // Register definition
    //  a -->  eax
    //  b -->  ebx
    //  c -->  ecx
    //  d -->  r8d
    //  e -->  edx
    //  f -->  r9d
    //  g --> r10d
    //  h --> r11d
    //
    // y0 --> r13d
    // y1 --> r14d
    // y2 --> r15d

    MOVL (0*4)(SI), AX  // a = H0
    MOVL (1*4)(SI), BX  // b = H1
    MOVL (2*4)(SI), CX  // c = H2
    MOVL (3*4)(SI), R8  // d = H3
    MOVL (4*4)(SI), DX  // e = H4
    MOVL (5*4)(SI), R9  // f = H5
    MOVL (6*4)(SI), R10 // g = H6
    MOVL (7*4)(SI), R11 // h = H7

    MOVOU bflipMask<>(SB), X13
    MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
    MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00

    MOVQ message_base+24(FP), SI // SI: &message
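
    // Outer loop: processes one 64-byte message block per iteration.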
loop0:
    LEAQ constants<>(SB), BP

    // byte swap first 16 dwords
    MOVOU 0*16(SI), X4
    LONG $0x0059c2c4; BYTE $0xe5 // VPSHUFB XMM4, XMM4, XMM13
    MOVOU 1*16(SI), X5
    LONG $0x0051c2c4; BYTE $0xed // VPSHUFB XMM5, XMM5, XMM13
    MOVOU 2*16(SI), X6
    LONG $0x0049c2c4; BYTE $0xf5 // VPSHUFB XMM6, XMM6, XMM13
    MOVOU 3*16(SI), X7
    LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13

    MOVQ SI, reserved3+72(FP)
    MOVD $0x3, DI

    // schedule 48 input dwords, by doing 3 rounds of 16 each
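    // Each loop1 iteration runs 16 rounds (4x FOUR_ROUNDS_AND_SCHED) and
    // consumes 64 bytes of round constants, so BP advances by 64 per pass.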
loop1:
    LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP]  /* Add 1st constant to first part of message */
    MOVOU X9, reserved0+48(FP)
    FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)

    LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */
    MOVOU X9, reserved0+48(FP)
    FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)

    LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */
    MOVOU X9, reserved0+48(FP)
    FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)

    LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */
    MOVOU X9, reserved0+48(FP)
    ADDQ $64, BP
    FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)

    SUBQ $1, DI
    JNE  loop1
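
    // Last 16 rounds need no further scheduling: the remaining W values are
    // already in X4..X7, so each loop2 iteration does 8 plain rounds and then
    // shifts X6/X7 down into X4/X5 for the next pass.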
    MOVD $0x2, DI

loop2:
    LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP]  /* Add 1st constant to first part of message */
    MOVOU X9, reserved0+48(FP)
    DO_ROUND( AX,  BX,  CX,  R8,  DX,  R9, R10, R11, 48)
    DO_ROUND(R11,  AX,  BX,  CX,  R8,  DX,  R9, R10, 52)
    DO_ROUND(R10, R11,  AX,  BX,  CX,  R8,  DX,  R9, 56)
    DO_ROUND( R9, R10, R11,  AX,  BX,  CX,  R8,  DX, 60)

    LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */
    MOVOU X9, reserved0+48(FP)
    ADDQ $32, BP
    DO_ROUND( DX,  R9, R10, R11,  AX,  BX,  CX,  R8, 48)
    DO_ROUND( R8,  DX,  R9, R10, R11,  AX,  BX,  CX, 52)
    DO_ROUND( CX,  R8,  DX,  R9, R10, R11,  AX,  BX, 56)
    DO_ROUND( BX,  CX,  R8,  DX,  R9, R10, R11,  AX, 60)

    MOVOU X6, X4
    MOVOU X7, X5

    SUBQ $1, DI
    JNE  loop2
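
    // Feed-forward: add the working variables back into the running digest.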
    MOVQ h+0(FP), SI    // SI: &h
    ADDL (0*4)(SI), AX  // H0 = a + H0
    MOVL AX, (0*4)(SI)
    ADDL (1*4)(SI), BX  // H1 = b + H1
    MOVL BX, (1*4)(SI)
    ADDL (2*4)(SI), CX  // H2 = c + H2
    MOVL CX, (2*4)(SI)
    ADDL (3*4)(SI), R8  // H3 = d + H3
    MOVL R8, (3*4)(SI)
    ADDL (4*4)(SI), DX  // H4 = e + H4
    MOVL DX, (4*4)(SI)
    ADDL (5*4)(SI), R9  // H5 = f + H5
    MOVL R9, (5*4)(SI)
    ADDL (6*4)(SI), R10 // H6 = g + H6
    MOVL R10, (6*4)(SI)
    ADDL (7*4)(SI), R11 // H7 = h + H7
    MOVL R11, (7*4)(SI)

    MOVQ reserved3+72(FP), SI
    ADDQ $64, SI
    CMPQ reserved2+64(FP), SI
    JNE  loop0

done_hash:
    RET
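
// The tables below hold the 64 SHA256 round constants K (packed two 32-bit
// words per 8-byte DATA entry, low word first), the byte-flip mask used to
// load the big-endian message words, and the two shuffle masks that place
// the s1 results into the {00BA} and {DC00} lane layouts.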
// Constants table
DATA constants<>+0x0(SB)/8, $0x71374491428a2f98
DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b
DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4
DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98
DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be
DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74
DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7
DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1
DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6
DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f
DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc
DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152
DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8
DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3
DATA constants<>+0x78(SB)/8, $0x1429296706ca6351
DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85
DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc
DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354
DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e
DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1
DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70
DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819
DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585
DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116
DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c
DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f
DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee
DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814
DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa
DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7

DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203
DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b

DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF

DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100

GLOBL constants<>(SB), 8, $256
GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16
GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16
GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16
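
// A minimal sketch (not part of this file) of how the companion Go
// declaration for this routine typically looks; the package and file names
// are assumptions, while the signature matches the TEXT directive above:
//
//     package sha256
//
//     //go:noescape
//     func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
//
// The caller passes the current digest state in h and a message whose length
// is a multiple of 64 bytes; the reserved arguments only exist to give the
// routine scratch space in the argument frame.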