// sha256blockSsse_amd64.s
//+build !noasm,!appengine

// SHA256 implementation for SSSE3
//
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This code is based on an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
//
// together with the reference implementation from the following authors:
//    James Guilford <james.guilford@intel.com>
//    Kirk Yap <kirk.s.yap@intel.com>
//    Tim Chen <tim.c.chen@linux.intel.com>
//
// For Golang it has been converted to Plan 9 assembly with the help of
// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
// equivalents

#include "textflag.h"
  32. #define ROTATE_XS \
  33. MOVOU X4, X15 \
  34. MOVOU X5, X4 \
  35. MOVOU X6, X5 \
  36. MOVOU X7, X6 \
  37. MOVOU X15, X7
  38. // compute s0 four at a time and s1 two at a time
  39. // compute W[-16] + W[-7] 4 at a time
  40. #define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \
  41. MOVL e, R13 \ // y0 = e
  42. ROLL $18, R13 \ // y0 = e >> (25-11)
  43. MOVL a, R14 \ // y1 = a
  44. MOVOU X7, X0 \
  45. LONG $0x0f3a0f66; WORD $0x04c6 \ // PALIGNR XMM0,XMM6,0x4 /* XTMP0 = W[-7] */
  46. ROLL $23, R14 \ // y1 = a >> (22-13)
  47. XORL e, R13 \ // y0 = e ^ (e >> (25-11))
  48. MOVL f, R15 \ // y2 = f
  49. ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
  50. XORL a, R14 \ // y1 = a ^ (a >> (22-13)
  51. XORL g, R15 \ // y2 = f^g
  52. LONG $0xc4fe0f66 \ // PADDD XMM0,XMM4 /* XTMP0 = W[-7] + W[-16] */
  53. XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6) )
  54. ANDL e, R15 \ // y2 = (f^g)&e
  55. ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
  56. \
  57. \ // compute s0
  58. \
  59. MOVOU X5, X1 \
  60. LONG $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1,XMM4,0x4 /* XTMP1 = W[-15] */
  61. XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  62. ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  63. XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
  64. ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  65. ADDL R13, R15 \ // y2 = S1 + CH
  66. ADDL _xfer+48(FP), R15 \ // y2 = k + w + S1 + CH
  67. MOVL a, R13 \ // y0 = a
  68. ADDL R15, h \ // h = h + S1 + CH + k + w
  69. \ // ROTATE_ARGS
  70. MOVL a, R15 \ // y2 = a
  71. MOVOU X1, X2 \
  72. LONG $0xd2720f66; BYTE $0x07 \ // PSRLD XMM2,0x7 /* */
  73. ORL c, R13 \ // y0 = a|c
  74. ADDL h, d \ // d = d + h + S1 + CH + k + w
  75. ANDL c, R15 \ // y2 = a&c
  76. MOVOU X1, X3 \
  77. LONG $0xf3720f66; BYTE $0x19 \ // PSLLD XMM3,0x19 /* */
  78. ANDL b, R13 \ // y0 = (a|c)&b
  79. ADDL R14, h \ // h = h + S1 + CH + k + w + S0
  80. LONG $0xdaeb0f66 \ // POR XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 */
  81. ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
  82. ADDL R13, h \ // h = h + S1 + CH + k + w + S0 + MAJ
  83. \ // ROTATE_ARGS
  84. MOVL d, R13 \ // y0 = e
  85. MOVL h, R14 \ // y1 = a
  86. ROLL $18, R13 \ // y0 = e >> (25-11)
  87. XORL d, R13 \ // y0 = e ^ (e >> (25-11))
  88. MOVL e, R15 \ // y2 = f
  89. ROLL $23, R14 \ // y1 = a >> (22-13)
  90. MOVOU X1, X2 \
  91. LONG $0xd2720f66; BYTE $0x12 \ // PSRLD XMM2,0x12 /* */
  92. XORL h, R14 \ // y1 = a ^ (a >> (22-13)
  93. ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
  94. XORL f, R15 \ // y2 = f^g
  95. MOVOU X1, X8 \
  96. LONG $0x720f4166; WORD $0x03d0 \ // PSRLD XMM8,0x3 /* XTMP4 = W[-15] >> 3 */
  97. ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
  98. XORL d, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  99. ANDL d, R15 \ // y2 = (f^g)&e
  100. ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  101. LONG $0xf1720f66; BYTE $0x0e \ // PSLLD XMM1,0xe /* */
  102. XORL h, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  103. XORL f, R15 \ // y2 = CH = ((f^g)&e)^g
  104. LONG $0xd9ef0f66 \ // PXOR XMM3,XMM1 /* */
  105. ADDL R13, R15 \ // y2 = S1 + CH
  106. ADDL _xfer+52(FP), R15 \ // y2 = k + w + S1 + CH
  107. ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  108. LONG $0xdaef0f66 \ // PXOR XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR */
  109. MOVL h, R13 \ // y0 = a
  110. ADDL R15, g \ // h = h + S1 + CH + k + w
  111. MOVL h, R15 \ // y2 = a
  112. MOVOU X3, X1 \
  113. LONG $0xef0f4166; BYTE $0xc8 \ // PXOR XMM1,XMM8 /* XTMP1 = s0 */
  114. ORL b, R13 \ // y0 = a|c
  115. ADDL g, c \ // d = d + h + S1 + CH + k + w
  116. ANDL b, R15 \ // y2 = a&c
  117. \
  118. \ // compute low s1
  119. \
  120. LONG $0xd7700f66; BYTE $0xfa \ // PSHUFD XMM2,XMM7,0xfa /* XTMP2 = W[-2] {BBAA} */
  121. ANDL a, R13 \ // y0 = (a|c)&b
  122. ADDL R14, g \ // h = h + S1 + CH + k + w + S0
  123. LONG $0xc1fe0f66 \ // PADDD XMM0,XMM1 /* XTMP0 = W[-16] + W[-7] + s0 */
  124. ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
  125. ADDL R13, g \ // h = h + S1 + CH + k + w + S0 + MAJ
  126. \ // ROTATE_ARGS
  127. MOVL c, R13 \ // y0 = e
  128. MOVL g, R14 \ // y1 = a
  129. ROLL $18, R13 \ // y0 = e >> (25-11)
  130. XORL c, R13 \ // y0 = e ^ (e >> (25-11))
  131. ROLL $23, R14 \ // y1 = a >> (22-13)
  132. MOVL d, R15 \ // y2 = f
  133. XORL g, R14 \ // y1 = a ^ (a >> (22-13)
  134. ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
  135. MOVOU X2, X8 \
  136. LONG $0x720f4166; WORD $0x0ad0 \ // PSRLD XMM8,0xa /* XTMP4 = W[-2] >> 10 {BBAA} */
  137. XORL e, R15 \ // y2 = f^g
  138. MOVOU X2, X3 \
  139. LONG $0xd3730f66; BYTE $0x13 \ // PSRLQ XMM3,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */
  140. XORL c, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  141. ANDL c, R15 \ // y2 = (f^g)&e
  142. LONG $0xd2730f66; BYTE $0x11 \ // PSRLQ XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */
  143. ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
  144. XORL g, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  145. XORL e, R15 \ // y2 = CH = ((f^g)&e)^g
  146. ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  147. LONG $0xd3ef0f66 \ // PXOR XMM2,XMM3 /* */
  148. ADDL R13, R15 \ // y2 = S1 + CH
  149. ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  150. ADDL _xfer+56(FP), R15 \ // y2 = k + w + S1 + CH
  151. LONG $0xef0f4466; BYTE $0xc2 \ // PXOR XMM8,XMM2 /* XTMP4 = s1 {xBxA} */
  152. MOVL g, R13 \ // y0 = a
  153. ADDL R15, f \ // h = h + S1 + CH + k + w
  154. MOVL g, R15 \ // y2 = a
  155. LONG $0x380f4566; WORD $0xc200 \ // PSHUFB XMM8,XMM10 /* XTMP4 = s1 {00BA} */
  156. ORL a, R13 \ // y0 = a|c
  157. ADDL f, b \ // d = d + h + S1 + CH + k + w
  158. ANDL a, R15 \ // y2 = a&c
  159. LONG $0xfe0f4166; BYTE $0xc0 \ // PADDD XMM0,XMM8 /* XTMP0 = {..., ..., W[1], W[0]} */
  160. ANDL h, R13 \ // y0 = (a|c)&b
  161. ADDL R14, f \ // h = h + S1 + CH + k + w + S0
  162. \
  163. \ // compute high s1
  164. \
  165. LONG $0xd0700f66; BYTE $0x50 \ // PSHUFD XMM2,XMM0,0x50 /* XTMP2 = W[-2] {DDCC} */
  166. ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
  167. ADDL R13, f \ // h = h + S1 + CH + k + w + S0 + MAJ
  168. \ // ROTATE_ARGS
  169. MOVL b, R13 \ // y0 = e
  170. ROLL $18, R13 \ // y0 = e >> (25-11)
  171. MOVL f, R14 \ // y1 = a
  172. ROLL $23, R14 \ // y1 = a >> (22-13)
  173. XORL b, R13 \ // y0 = e ^ (e >> (25-11))
  174. MOVL c, R15 \ // y2 = f
  175. ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
  176. MOVOU X2, X11 \
  177. LONG $0x720f4166; WORD $0x0ad3 \ // PSRLD XMM11,0xa /* XTMP5 = W[-2] >> 10 {DDCC} */
  178. XORL f, R14 \ // y1 = a ^ (a >> (22-13)
  179. XORL d, R15 \ // y2 = f^g
  180. MOVOU X2, X3 \
  181. LONG $0xd3730f66; BYTE $0x13 \ // PSRLQ XMM3,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */
  182. XORL b, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  183. ANDL b, R15 \ // y2 = (f^g)&e
  184. ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
  185. LONG $0xd2730f66; BYTE $0x11 \ // PSRLQ XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */
  186. XORL f, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  187. ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  188. XORL d, R15 \ // y2 = CH = ((f^g)&e)^g
  189. LONG $0xd3ef0f66 \ // PXOR XMM2,XMM3 /* */
  190. ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  191. ADDL R13, R15 \ // y2 = S1 + CH
  192. ADDL _xfer+60(FP), R15 \ // y2 = k + w + S1 + CH
  193. LONG $0xef0f4466; BYTE $0xda \ // PXOR XMM11,XMM2 /* XTMP5 = s1 {xDxC} */
  194. MOVL f, R13 \ // y0 = a
  195. ADDL R15, e \ // h = h + S1 + CH + k + w
  196. MOVL f, R15 \ // y2 = a
  197. LONG $0x380f4566; WORD $0xdc00 \ // PSHUFB XMM11,XMM12 /* XTMP5 = s1 {DC00} */
  198. ORL h, R13 \ // y0 = a|c
  199. ADDL e, a \ // d = d + h + S1 + CH + k + w
  200. ANDL h, R15 \ // y2 = a&c
  201. MOVOU X11, X4 \
  202. LONG $0xe0fe0f66 \ // PADDD XMM4,XMM0 /* X0 = {W[3], W[2], W[1], W[0]} */
  203. ANDL g, R13 \ // y0 = (a|c)&b
  204. ADDL R14, e \ // h = h + S1 + CH + k + w + S0
  205. ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
  206. ADDL R13, e \ // h = h + S1 + CH + k + w + S0 + MAJ
  207. \ // ROTATE_ARGS
  208. ROTATE_XS
  209. #define DO_ROUND(a, b, c, d, e, f, g, h, offset) \
  210. MOVL e, R13 \ // y0 = e
  211. ROLL $18, R13 \ // y0 = e >> (25-11)
  212. MOVL a, R14 \ // y1 = a
  213. XORL e, R13 \ // y0 = e ^ (e >> (25-11))
  214. ROLL $23, R14 \ // y1 = a >> (22-13)
  215. MOVL f, R15 \ // y2 = f
  216. XORL a, R14 \ // y1 = a ^ (a >> (22-13)
  217. ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
  218. XORL g, R15 \ // y2 = f^g
  219. XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  220. ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
  221. ANDL e, R15 \ // y2 = (f^g)&e
  222. XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  223. ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  224. XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
  225. ADDL R13, R15 \ // y2 = S1 + CH
  226. ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  227. ADDL _xfer+offset(FP), R15 \ // y2 = k + w + S1 + CH
  228. MOVL a, R13 \ // y0 = a
  229. ADDL R15, h \ // h = h + S1 + CH + k + w
  230. MOVL a, R15 \ // y2 = a
  231. ORL c, R13 \ // y0 = a|c
  232. ADDL h, d \ // d = d + h + S1 + CH + k + w
  233. ANDL c, R15 \ // y2 = a&c
  234. ANDL b, R13 \ // y0 = (a|c)&b
  235. ADDL R14, h \ // h = h + S1 + CH + k + w + S0
  236. ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
  237. ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
  238. // func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
  239. TEXT ·blockSsse(SB), 7, $0-80
  240. MOVQ h+0(FP), SI // SI: &h
  241. MOVQ message_base+24(FP), R8 // &message
  242. MOVQ message_len+32(FP), R9 // length of message
  243. CMPQ R9, $0
  244. JEQ done_hash
  245. ADDQ R8, R9
  246. MOVQ R9, reserved2+64(FP) // store end of message
  247. // Register definition
  248. // a --> eax
  249. // b --> ebx
  250. // c --> ecx
  251. // d --> r8d
  252. // e --> edx
  253. // f --> r9d
  254. // g --> r10d
  255. // h --> r11d
  256. //
  257. // y0 --> r13d
  258. // y1 --> r14d
  259. // y2 --> r15d
  260. MOVL (0*4)(SI), AX // a = H0
  261. MOVL (1*4)(SI), BX // b = H1
  262. MOVL (2*4)(SI), CX // c = H2
  263. MOVL (3*4)(SI), R8 // d = H3
  264. MOVL (4*4)(SI), DX // e = H4
  265. MOVL (5*4)(SI), R9 // f = H5
  266. MOVL (6*4)(SI), R10 // g = H6
  267. MOVL (7*4)(SI), R11 // h = H7
  268. MOVOU bflipMask<>(SB), X13
  269. MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
  270. MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
  271. MOVQ message_base+24(FP), SI // SI: &message
  272. loop0:
  273. LEAQ constants<>(SB), BP
  274. // byte swap first 16 dwords
  275. MOVOU 0*16(SI), X4
  276. LONG $0x380f4166; WORD $0xe500 // PSHUFB XMM4, XMM13
  277. MOVOU 1*16(SI), X5
  278. LONG $0x380f4166; WORD $0xed00 // PSHUFB XMM5, XMM13
  279. MOVOU 2*16(SI), X6
  280. LONG $0x380f4166; WORD $0xf500 // PSHUFB XMM6, XMM13
  281. MOVOU 3*16(SI), X7
  282. LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13
  283. MOVQ SI, reserved3+72(FP)
  284. MOVD $0x3, DI
  285. // Align
  286. // nop WORD PTR [rax+rax*1+0x0]
  287. // schedule 48 input dwords, by doing 3 rounds of 16 each
  288. loop1:
  289. MOVOU X4, X9
  290. LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
  291. MOVOU X9, reserved0+48(FP)
  292. FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
  293. MOVOU X4, X9
  294. LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
  295. MOVOU X9, reserved0+48(FP)
  296. FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
  297. MOVOU X4, X9
  298. LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */
  299. MOVOU X9, reserved0+48(FP)
  300. FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
  301. MOVOU X4, X9
  302. LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */
  303. MOVOU X9, reserved0+48(FP)
  304. ADDQ $64, BP
  305. FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
  306. SUBQ $1, DI
  307. JNE loop1
  308. MOVD $0x2, DI
  309. loop2:
  310. MOVOU X4, X9
  311. LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
  312. MOVOU X9, reserved0+48(FP)
  313. DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
  314. DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
  315. DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
  316. DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60)
  317. MOVOU X5, X9
  318. LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
  319. MOVOU X9, reserved0+48(FP)
  320. ADDQ $32, BP
  321. DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
  322. DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
  323. DO_ROUND( CX, R8, DX, R9, R10, R11, AX, BX, 56)
  324. DO_ROUND( BX, CX, R8, DX, R9, R10, R11, AX, 60)
  325. MOVOU X6, X4
  326. MOVOU X7, X5
  327. SUBQ $1, DI
  328. JNE loop2
  329. MOVQ h+0(FP), SI // SI: &h
  330. ADDL (0*4)(SI), AX // H0 = a + H0
  331. MOVL AX, (0*4)(SI)
  332. ADDL (1*4)(SI), BX // H1 = b + H1
  333. MOVL BX, (1*4)(SI)
  334. ADDL (2*4)(SI), CX // H2 = c + H2
  335. MOVL CX, (2*4)(SI)
  336. ADDL (3*4)(SI), R8 // H3 = d + H3
  337. MOVL R8, (3*4)(SI)
  338. ADDL (4*4)(SI), DX // H4 = e + H4
  339. MOVL DX, (4*4)(SI)
  340. ADDL (5*4)(SI), R9 // H5 = f + H5
  341. MOVL R9, (5*4)(SI)
  342. ADDL (6*4)(SI), R10 // H6 = g + H6
  343. MOVL R10, (6*4)(SI)
  344. ADDL (7*4)(SI), R11 // H7 = h + H7
  345. MOVL R11, (7*4)(SI)
  346. MOVQ reserved3+72(FP), SI
  347. ADDQ $64, SI
  348. CMPQ reserved2+64(FP), SI
  349. JNE loop0
  350. done_hash:
  351. RET
  352. // Constants table
  353. DATA constants<>+0x0(SB)/8, $0x71374491428a2f98
  354. DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
  355. DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b
  356. DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4
  357. DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98
  358. DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be
  359. DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74
  360. DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7
  361. DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1
  362. DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6
  363. DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f
  364. DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc
  365. DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152
  366. DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8
  367. DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3
  368. DATA constants<>+0x78(SB)/8, $0x1429296706ca6351
  369. DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85
  370. DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc
  371. DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354
  372. DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e
  373. DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1
  374. DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70
  375. DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819
  376. DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585
  377. DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116
  378. DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c
  379. DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
  380. DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f
  381. DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee
  382. DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814
  383. DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa
  384. DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7
  385. DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203
  386. DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
  387. DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100
  388. DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
  389. DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
  390. DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100
  391. GLOBL constants<>(SB), 8, $256
  392. GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16
  393. GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16
  394. GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16