block_amd64.go

//+build !noasm,!appengine,gc

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

package md5simd

import (
	"fmt"
	"math"
	"sync"
	"unsafe"

	"github.com/klauspost/cpuid"
)

var hasAVX512 bool

//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)

//go:noescape
func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int)
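
// Note (inferred from the call sites below; the assembly is authoritative):
// state points at the first uint32 of a digest8/digest16, base is the start
// of the shared buffer arena, bufs/ptrs holds per-lane byte offsets relative
// to base, and n is the number of bytes each active lane advances, always a
// multiple of 64 (the MD5 block size). block16 additionally takes a bitmask
// selecting which of its 16 lanes are active.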

// 8-way 4x uint32 digests in 4 ymm registers
// (ymm0, ymm1, ymm2, ymm3)
type digest8 struct {
	v0, v1, v2, v3 [8]uint32
}

// Stack cache for 8 lanes of 64 (md5.BlockSize) bytes each.
// Must be 32-byte aligned, so allocate 512+32 and
// align upwards at runtime.
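// (The Go side passes the raw, possibly unaligned pointer, so the upward
// rounding, e.g. p = (p + 31) &^ 31, presumably happens in the kernel.)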
type cache8 [512 + 32]byte

// MD5 magic numbers for one lane of hashing; inflated
// 8x below at init time.
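// These are the T constants of RFC 1321: md5consts[i] = floor(2^32 * |sin(i+1)|).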
var md5consts = [64]uint32{
	0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
	0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
	0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
	0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
	0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
	0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
	0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
	0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
	0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
	0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
	0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
	0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
	0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
	0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
	0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
	0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
}

// inflate the consts 8-way for 8x md5 (256 bit ymm registers)
var avx256md5consts = func(c []uint32) []uint32 {
	inf := make([]uint32, 8*len(c))
	for i := range c {
		for j := 0; j < 8; j++ {
			inf[(i*8)+j] = c[i]
		}
	}
	return inf
}(md5consts[:])
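
// Layout example: md5consts[i] occupies avx256md5consts[i*8 : i*8+8], so
// round i can feed all 8 lanes from a single 32-byte load at byte offset
// i*32 (assuming the assembly indexes the table this way).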

// 16-way 4x uint32 digests in 4 zmm registers
type digest16 struct {
	v0, v1, v2, v3 [16]uint32
}

// inflate the consts 16-way for 16x md5 (512 bit zmm registers)
var avx512md5consts = func(c []uint32) []uint32 {
	inf := make([]uint32, 16*len(c))
	for i := range c {
		for j := 0; j < 16; j++ {
			inf[(i*16)+j] = c[i]
		}
	}
	return inf
}(md5consts[:])
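
// Same layout, 16 lanes wide: md5consts[i] fills avx512md5consts[i*16 : i*16+16],
// i.e. one 64-byte zmm load per round under the same assumption.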

func init() {
	hasAVX512 = cpuid.CPU.AVX512F()
}

// Interface function to assembly code. When half is set, only lanes 0-7
// carry input and a single 8-lane call suffices; otherwise the two 8-lane
// halves are hashed concurrently (or in one 16-lane call when AVX512 is
// available).
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
	if hasAVX512 {
		blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
	} else {
		// Split the 16-lane digest into two 8-lane digests for the AVX2 path.
		d8a, d8b := digest8{}, digest8{}
		for i := range d8a.v0 {
			j := i + 8
			d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
			if !half {
				d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
			}
		}

		i8 := [2][8][]byte{}
		for i := range i8[0] {
			i8[0][i], i8[1][i] = input[i], input[8+i]
		}
		if half {
			blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
		} else {
			// Hash both 8-lane halves concurrently.
			wg := sync.WaitGroup{}
			wg.Add(2)
			go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
			go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
			wg.Wait()
		}

		// Merge the 8-lane results back into the 16-lane digest.
		for i := range d8a.v0 {
			j := i + 8
			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
			if !half {
				d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
			}
		}
	}
}

// Interface function to AVX512 assembly code
func blockMd5_avx512(s *digest16, input [16][]byte, base []byte, maskRounds *[16]maskRounds) {
	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0]))))
	ptrs := [16]int32{}
	for i := range ptrs {
		if len(input[i]) > 0 {
			if len(input[i]) > internalBlockSize {
				panic(fmt.Sprintf("sanity check failed for lane %d: input length exceeds internalBlockSize", i))
			}
			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
			if off > math.MaxUint32 {
				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
			}
			ptrs[i] = int32(off)
		}
	}

	sdup := *s // create copy of initial states to receive intermediate updates

	rounds := generateMaskAndRounds16(input, maskRounds)
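	// Each maskRounds entry schedules one kernel call: m.mask carries one bit
	// per lane that still has data, and m.rounds is the number of 64-byte
	// blocks every active lane advances in that call. For example (assuming
	// generateMaskAndRounds16 groups lanes by remaining length), lanes of 128
	// and 192 bytes yield two entries: {mask: 0b11, rounds: 2}, then
	// {mask: 0b10, rounds: 1}.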
	for r := 0; r < rounds; r++ {
		m := maskRounds[r]
		block16(&sdup.v0[0], uintptr(baseMin), &ptrs[0], m.mask, int(64*m.rounds))

		for j := 0; j < len(ptrs); j++ {
			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
			if m.mask&(1<<j) != 0 {         // update digest if still masked as active
				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
			}
		}
	}
}

// Interface function to AVX2 assembly code
func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {
	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0])))) - 4
	ptrs := [8]int32{}
	for i := range ptrs {
		if len(input[i]) > 0 {
			if len(input[i]) > internalBlockSize {
				panic(fmt.Sprintf("sanity check failed for lane %d: input length exceeds internalBlockSize", i))
			}
			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
			if off > math.MaxUint32 {
				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
			}
			ptrs[i] = int32(off)
		}
	}

	sdup := *s // create copy of initial states to receive intermediate updates

	// Same mask/rounds scheduling as the AVX512 path above, 8 lanes wide.
	rounds := generateMaskAndRounds8(input, maskRounds)

	for r := 0; r < rounds; r++ {
		m := maskRounds[r]
		var cache cache8 // stack storage for block8 tmp state
		block8(&sdup.v0[0], uintptr(baseMin), &ptrs[0], &cache[0], int(64*m.rounds))

		for j := 0; j < len(ptrs); j++ {
			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
			if m.mask&(1<<j) != 0 {         // update digest if still masked as active
				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
			}
		}
	}
}