// block16_amd64.s

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// This is the AVX512 implementation of the MD5 block function (16-way parallel)
// prep(index): gather message word `index` (the dword at byte offset index*4
// past each lane's offset in `ptrs`, relative to `base`) from all 16 lanes
// into `mem`. VPGATHERDD consumes (zeroes) its completion mask as elements
// finish, so the persistent lane mask `kmask` is first copied into the
// scratch mask register `ktmp`.
#define prep(index) \
	KMOVQ kmask, ktmp \
	VPGATHERDD index*4(base)(ptrs*1), ktmp, mem
// ROUND1: one MD5 round-1 step (per RFC 1321) on all 16 lanes, fused with the
// gather of a future message word via prep(index) to hide gather latency.
//
// Invariant: `tmp` holds this step's d on entry (the trailing VMOVAPD sets up
// the next step, where the current c plays the role of d). The selector
// F(b,c,d) = (b&c)|(~b&d) is computed as d ^ (b & (c^d)): the VXORPS forms
// c^d in tmp, and VPTERNLOGD $0x6C finishes d ^ (b & tmp) in one op.
//
// Step: a += K[const] + X[index-1] (`mem`, gathered by the previous step)
//       + F(b,c,d); a = ROL32(a, shift); a += b.
#define ROUND1(a, b, c, d, index, const, shift) \
	VXORPS c, tmp, tmp \
	VPADDD 64*const(consts), a, a \
	VPADDD mem, a, a \
	VPTERNLOGD $0x6C, b, d, tmp \
	prep(index) \
	VPADDD tmp, a, a \
	VPROLD $shift, a, a \
	VMOVAPD c, tmp \
	VPADDD b, a, a
// ROUND1noload: identical to ROUND1 but without the embedded prep() gather.
// Used for the 16th (last) round-1 step, when all message words have already
// been gathered. Same tmp invariant as ROUND1.
#define ROUND1noload(a, b, c, d, const, shift) \
	VXORPS c, tmp, tmp \
	VPADDD 64*const(consts), a, a \
	VPADDD mem, a, a \
	VPTERNLOGD $0x6C, b, d, tmp \
	VPADDD tmp, a, a \
	VPROLD $shift, a, a \
	VMOVAPD c, tmp \
	VPADDD b, a, a
// ROUND2: one MD5 round-2 step on all 16 lanes. The message word comes from
// the register cache (`zreg`, filled during round 1), not from memory.
//
// Invariant: both `tmp` and `tmp2` hold this step's d on entry (the two
// trailing VMOVAPDs set up the next step). The selector
// G(b,c,d) = (b&d)|(c&~d) is computed in two ops: VANDNPS puts ~d&c in tmp,
// then VPTERNLOGD $0xEC forms (d&b) | tmp into tmp2. The two-register
// pipeline lets the next step's tmp be reloaded early.
//
// Step: a += K[const] + X (`zreg`) + G(b,c,d); a = ROL32(a, shift); a += b.
#define ROUND2(a, b, c, d, zreg, const, shift) \
	VPADDD 64*const(consts), a, a \
	VPADDD zreg, a, a \
	VANDNPS c, tmp, tmp \
	VPTERNLOGD $0xEC, b, tmp, tmp2 \
	VMOVAPD c, tmp \
	VPADDD tmp2, a, a \
	VMOVAPD c, tmp2 \
	VPROLD $shift, a, a \
	VPADDD b, a, a
// ROUND3: one MD5 round-3 step on all 16 lanes, message word from the
// register cache (`zreg`).
//
// Invariant: `tmp` holds this step's c on entry. The selector
// H(b,c,d) = b^c^d is a single 3-input XOR: VPTERNLOGD $0x96 computes
// tmp ^ b ^ d. The trailing VMOVAPD stores b, which is the next step's c.
// (The `d` argument is unused directly; it is folded into tmp.)
//
// Step: a += K[const] + X (`zreg`) + H(b,c,d); a = ROL32(a, shift); a += b.
#define ROUND3(a, b, c, d, zreg, const, shift) \
	VPADDD 64*const(consts), a, a \
	VPADDD zreg, a, a \
	VPTERNLOGD $0x96, b, d, tmp \
	VPADDD tmp, a, a \
	VPROLD $shift, a, a \
	VMOVAPD b, tmp \
	VPADDD b, a, a
// ROUND4: one MD5 round-4 step on all 16 lanes, message word from the
// register cache (`zreg`).
//
// Invariant: `tmp` holds ~d (this step's d, complemented) on entry. The
// selector I(b,c,d) = c^(b|~d) is finished by VPTERNLOGD $0x36 as
// c ^ (tmp | b). The trailing VXORPS with `ones` (all-1s) complements c,
// which is the next step's d — maintaining the invariant.
//
// Step: a += K[const] + X (`zreg`) + I(b,c,d); a = ROL32(a, shift); a += b.
#define ROUND4(a, b, c, d, zreg, const, shift) \
	VPADDD 64*const(consts), a, a \
	VPADDD zreg, a, a \
	VPTERNLOGD $0x36, b, c, tmp \
	VPADDD tmp, a, a \
	VPROLD $shift, a, a \
	VXORPS c, ones, tmp \
	VPADDD b, a, a
// func block16 — 16-way parallel MD5 block function (AVX-512).
//
// Runs the MD5 compression loop over n bytes of input for 16 independent
// lanes at once, one lane per 32-bit element of each ZMM register.
//
// Frame layout ($0-40, flag 4 = NOSPLIT):
//   state+0(FP)  pointer to the 16-lane digest state: four rows of 16 dwords
//                (a/b/c/d rows at byte offsets 0x00/0x40/0x80/0xc0)
//   base+8(FP)   base address for the gathers
//   ptrs+16(FP)  pointer to 16 dword per-lane offsets, gathered relative to base
//   mask+24(FP)  gather mask selecting the active lanes
//   n+32(FP)     bytes to process per lane — NOTE(review): the loop exits only
//                when the counter hits exactly 0, so n is assumed to be a
//                non-zero multiple of 64; confirm against the Go caller.
TEXT ·block16(SB),4,$0-40
	MOVQ state+0(FP), BX
	MOVQ base+8(FP), SI
	MOVQ ptrs+16(FP), AX
	KMOVQ mask+24(FP), K1
	MOVQ n+32(FP), DX
	// Table of the 64 MD5 K constants, each pre-broadcast to a full 64-byte
	// row (hence the 64*const indexing in the ROUND* macros).
	MOVQ ·avx512md5consts+0(SB), DI

	// ---- register allocation ----
	// Working MD5 state, one dword per lane.
	#define a Z0
	#define b Z1
	#define c Z2
	#define d Z3
	// Saved copy of the state at the top of each 64-byte block.
	#define sa Z4
	#define sb Z5
	#define sc Z6
	#define sd Z7
	// Scratch vectors threaded through the ROUND* macros.
	#define tmp Z8
	#define tmp2 Z9
	// Per-lane input offsets (gather indices).
	#define ptrs Z10
	// All-ones vector; XOR against it computes bitwise NOT (round 4).
	#define ones Z12
	// Destination of the most recent prep() gather.
	#define mem Z15
	// Persistent lane mask and per-gather scratch mask (gathers consume K3).
	#define kmask K1
	#define ktmp K3

// ----------------------------------------------------------
// Registers Z16 through to Z31 are used for caching purposes
// ----------------------------------------------------------

	#define dig BX
	#define count DX
	#define base SI
	#define consts DI

	// load digest into state registers
	VMOVUPD (dig), a
	VMOVUPD 0x40(dig), b
	VMOVUPD 0x80(dig), c
	VMOVUPD 0xc0(dig), d

	// load source pointers
	VMOVUPD 0x00(AX), ptrs

	// AX is free after the load above; reuse it to build the all-ones vector.
	MOVQ $-1, AX
	VPBROADCASTQ AX, ones

loop:
	// One iteration hashes one 64-byte MD5 block for all 16 lanes.
	// Snapshot the state for the final Davies-Meyer addition.
	VMOVAPD a, sa
	VMOVAPD b, sb
	VMOVAPD c, sc
	VMOVAPD d, sd

	// Round 1 (shifts 7,12,17,22). Gather word 0 and seed ROUND1's tmp
	// invariant (tmp = d). Each ROUND1 consumes the word gathered by the
	// previous step and gathers the next one; every word is also cached in
	// Z16-Z31 so rounds 2-4 run entirely from registers.
	prep(0)
	VMOVAPD d, tmp
	VMOVAPD mem, Z16
	ROUND1(a,b,c,d, 1,0x00, 7)
	VMOVAPD mem, Z17
	ROUND1(d,a,b,c, 2,0x01,12)
	VMOVAPD mem, Z18
	ROUND1(c,d,a,b, 3,0x02,17)
	VMOVAPD mem, Z19
	ROUND1(b,c,d,a, 4,0x03,22)
	VMOVAPD mem, Z20
	ROUND1(a,b,c,d, 5,0x04, 7)
	VMOVAPD mem, Z21
	ROUND1(d,a,b,c, 6,0x05,12)
	VMOVAPD mem, Z22
	ROUND1(c,d,a,b, 7,0x06,17)
	VMOVAPD mem, Z23
	ROUND1(b,c,d,a, 8,0x07,22)
	VMOVAPD mem, Z24
	ROUND1(a,b,c,d, 9,0x08, 7)
	VMOVAPD mem, Z25
	ROUND1(d,a,b,c,10,0x09,12)
	VMOVAPD mem, Z26
	ROUND1(c,d,a,b,11,0x0a,17)
	VMOVAPD mem, Z27
	ROUND1(b,c,d,a,12,0x0b,22)
	VMOVAPD mem, Z28
	ROUND1(a,b,c,d,13,0x0c, 7)
	VMOVAPD mem, Z29
	ROUND1(d,a,b,c,14,0x0d,12)
	VMOVAPD mem, Z30
	ROUND1(c,d,a,b,15,0x0e,17)
	VMOVAPD mem, Z31
	// Last round-1 step: all 16 words are gathered, so skip the prep().
	ROUND1noload(b,c,d,a, 0x0f,22)

	// Round 2 (shifts 5,9,14,20), word order X[(1+5i) mod 16] via the Z-cache.
	// Seed ROUND2's invariant: tmp = tmp2 = d.
	VMOVAPD d, tmp
	VMOVAPD d, tmp2
	ROUND2(a,b,c,d, Z17,0x10, 5)
	ROUND2(d,a,b,c, Z22,0x11, 9)
	ROUND2(c,d,a,b, Z27,0x12,14)
	ROUND2(b,c,d,a, Z16,0x13,20)
	ROUND2(a,b,c,d, Z21,0x14, 5)
	ROUND2(d,a,b,c, Z26,0x15, 9)
	ROUND2(c,d,a,b, Z31,0x16,14)
	ROUND2(b,c,d,a, Z20,0x17,20)
	ROUND2(a,b,c,d, Z25,0x18, 5)
	ROUND2(d,a,b,c, Z30,0x19, 9)
	ROUND2(c,d,a,b, Z19,0x1a,14)
	ROUND2(b,c,d,a, Z24,0x1b,20)
	ROUND2(a,b,c,d, Z29,0x1c, 5)
	ROUND2(d,a,b,c, Z18,0x1d, 9)
	ROUND2(c,d,a,b, Z23,0x1e,14)
	ROUND2(b,c,d,a, Z28,0x1f,20)

	// Round 3 (shifts 4,11,16,23), word order X[(5+3i) mod 16].
	// Seed ROUND3's invariant: tmp = c.
	VMOVAPD c, tmp
	ROUND3(a,b,c,d, Z21,0x20, 4)
	ROUND3(d,a,b,c, Z24,0x21,11)
	ROUND3(c,d,a,b, Z27,0x22,16)
	ROUND3(b,c,d,a, Z30,0x23,23)
	ROUND3(a,b,c,d, Z17,0x24, 4)
	ROUND3(d,a,b,c, Z20,0x25,11)
	ROUND3(c,d,a,b, Z23,0x26,16)
	ROUND3(b,c,d,a, Z26,0x27,23)
	ROUND3(a,b,c,d, Z29,0x28, 4)
	ROUND3(d,a,b,c, Z16,0x29,11)
	ROUND3(c,d,a,b, Z19,0x2a,16)
	ROUND3(b,c,d,a, Z22,0x2b,23)
	ROUND3(a,b,c,d, Z25,0x2c, 4)
	ROUND3(d,a,b,c, Z28,0x2d,11)
	ROUND3(c,d,a,b, Z31,0x2e,16)
	ROUND3(b,c,d,a, Z18,0x2f,23)

	// Round 4 (shifts 6,10,15,21), word order X[(7i) mod 16].
	// Seed ROUND4's invariant: tmp = ~d.
	VXORPS d, ones, tmp
	ROUND4(a,b,c,d, Z16,0x30, 6)
	ROUND4(d,a,b,c, Z23,0x31,10)
	ROUND4(c,d,a,b, Z30,0x32,15)
	ROUND4(b,c,d,a, Z21,0x33,21)
	ROUND4(a,b,c,d, Z28,0x34, 6)
	ROUND4(d,a,b,c, Z19,0x35,10)
	ROUND4(c,d,a,b, Z26,0x36,15)
	ROUND4(b,c,d,a, Z17,0x37,21)
	ROUND4(a,b,c,d, Z24,0x38, 6)
	ROUND4(d,a,b,c, Z31,0x39,10)
	ROUND4(c,d,a,b, Z22,0x3a,15)
	ROUND4(b,c,d,a, Z29,0x3b,21)
	ROUND4(a,b,c,d, Z20,0x3c, 6)
	ROUND4(d,a,b,c, Z27,0x3d,10)
	ROUND4(c,d,a,b, Z18,0x3e,15)
	ROUND4(b,c,d,a, Z25,0x3f,21)

	// Davies-Meyer: add the saved input state back into the output state.
	VPADDD sa, a, a
	VPADDD sb, b, b
	VPADDD sc, c, c
	VPADDD sd, d, d

	// Advance all lanes by one 64-byte block (gather offsets in `ptrs` are
	// relative to `base`, so bumping base advances every lane at once).
	LEAQ 64(base), base
	SUBQ $64, count
	JNE loop

	// Write the updated 16-lane digest back.
	VMOVUPD a, (dig)
	VMOVUPD b, 0x40(dig)
	VMOVUPD c, 0x80(dig)
	VMOVUPD d, 0xc0(dig)

	// Clear the upper ZMM/YMM state before returning to Go/SSE code.
	VZEROUPPER
	RET