SDL_blit_A_avx2.c 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. #include "SDL_internal.h"
  2. #if SDL_HAVE_BLIT_A
  3. #ifdef SDL_AVX2_INTRINSICS
  4. #define SDL_blit_A_avx2_c
  5. #include "SDL_blit.h"
  6. #include "SDL_blit_A_sse4_1.h"
  7. __m256i SDL_TARGETING("avx2") GetSDL_PixelFormatAlphaMask_AVX2(const SDL_PixelFormat* dstfmt) {
  8. Uint8 index = dstfmt->Ashift / 4;
  9. /* Handle case where bad input sent */
  10. if (dstfmt->Ashift == dstfmt->Bshift && dstfmt->Ashift == 0) {
  11. index = 6;
  12. }
  13. return _mm256_set_epi8(
  14. -1, index + 24, -1, index + 24, -1, index + 24, -1, index + 24,
  15. -1, index + 16, -1, index + 16, -1, index + 16, -1, index + 16,
  16. -1, index + 8, -1, index + 8, -1, index + 8, -1, index + 8,
  17. -1, index, -1, index, -1, index, -1, index);
  18. }
  19. /**
  20. * Using the AVX2 instruction set, blit eight pixels with alpha blending
  21. * @param src A pointer to four 32-bit pixels of ARGB format to blit into dst
  22. * @param dst A pointer to four 32-bit pixels of ARGB format to retain visual data for while alpha blending
  23. * @return A 128-bit wide vector of four alpha-blended pixels in ARGB format
  24. */
  25. __m128i SDL_TARGETING("avx2") MixRGBA_AVX2(const __m128i src, const __m128i dst, const __m256i alphaMask) {
  26. __m256i src_color = _mm256_cvtepu8_epi16(src);
  27. __m256i dst_color = _mm256_cvtepu8_epi16(dst);
  28. __m256i alpha = _mm256_shuffle_epi8(src_color, alphaMask);
  29. __m256i sub = _mm256_sub_epi16(src_color, dst_color);
  30. __m256i mul = _mm256_mullo_epi16(sub, alpha);
  31. /**
  32. * With an 8-bit shuffle, one can only move integers within a lane. The 256-bit AVX2 lane is actually 4 64-bit
  33. * lanes. We pack the integers into the start of each lane. The second shuffle operates on these 64-bit integers to
  34. * put them into the correct order for transport back to the surface in the correct format.
  35. */
  36. const __m256i SHUFFLE_REDUCE = _mm256_set_epi8(
  37. -1, -1, -1, -1, -1, -1, -1, -1,
  38. 31, 29, 27, 25, 23, 21, 19, 17,
  39. -1, -1, -1, -1, -1, -1, -1, -1,
  40. 15, 13, 11, 9, 7, 5, 3, 1);
  41. __m256i reduced = _mm256_shuffle_epi8(mul, SHUFFLE_REDUCE);
  42. __m256i packed = _mm256_permute4x64_epi64(reduced, _MM_SHUFFLE(3, 1, 2, 0));
  43. __m128i mix = _mm256_castsi256_si128(packed);
  44. return _mm_add_epi8(mix, dst);
  45. }
  46. void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
  47. {
  48. int width = info->dst_w;
  49. int height = info->dst_h;
  50. Uint8 *src = info->src;
  51. int srcskip = info->src_skip;
  52. Uint8 *dst = info->dst;
  53. int dstskip = info->dst_skip;
  54. SDL_PixelFormat *srcfmt = info->src_fmt;
  55. SDL_PixelFormat *dstfmt = info->dst_fmt;
  56. int chunks = width / 4;
  57. const __m128i colorShiftMask = GetSDL_PixelFormatShuffleMask(srcfmt, dstfmt);
  58. const __m256i alphaMask = GetSDL_PixelFormatAlphaMask_AVX2(dstfmt);
  59. const __m128i sse4_1AlphaMask = GetSDL_PixelFormatAlphaMask_SSE4_1(dstfmt);
  60. while (height--) {
  61. /* Process 4-wide chunks of source color data that may be in wrong format */
  62. for (int i = 0; i < chunks; i += 1) {
  63. __m128i c_src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *) (src + i * 16)), colorShiftMask);
  64. /* Alpha-blend in 4-wide chunk from src into destination */
  65. __m128i c_dst = _mm_loadu_si128((__m128i*) (dst + i * 16));
  66. __m128i c_mix = MixRGBA_AVX2(c_src, c_dst, alphaMask);
  67. _mm_storeu_si128((__m128i*) (dst + i * 16), c_mix);
  68. }
  69. /* Handle remaining pixels when width is not a multiple of 4 */
  70. if (width % 4 != 0) {
  71. int remaining_pixels = width % 4;
  72. int offset = width - remaining_pixels;
  73. if (remaining_pixels >= 2) {
  74. Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
  75. Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
  76. __m128i c_src = _mm_loadu_si64(src_ptr);
  77. c_src = _mm_shuffle_epi8(c_src, colorShiftMask);
  78. __m128i c_dst = _mm_loadu_si64(dst_ptr);
  79. __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask);
  80. _mm_storeu_si64(dst_ptr, c_mix);
  81. remaining_pixels -= 2;
  82. offset += 2;
  83. }
  84. if (remaining_pixels == 1) {
  85. Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
  86. Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
  87. Uint32 pixel = AlignPixelToSDL_PixelFormat(*src_ptr, srcfmt, dstfmt);
  88. /* Old GCC has bad or no _mm_loadu_si32 */
  89. #if defined(__GNUC__) && (__GNUC__ < 11)
  90. __m128i c_src = _mm_set_epi32(0, 0, 0, pixel);
  91. __m128i c_dst = _mm_set_epi32(0, 0, 0, *dst_ptr);
  92. #else
  93. __m128i c_src = _mm_loadu_si32(&pixel);
  94. __m128i c_dst = _mm_loadu_si32(dst_ptr);
  95. #endif
  96. __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask);
  97. /* Old GCC has bad or no _mm_storeu_si32 */
  98. #if defined(__GNUC__) && (__GNUC__ < 11)
  99. *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
  100. #else
  101. _mm_storeu_si32(dst_ptr, mixed_pixel);
  102. #endif
  103. }
  104. }
  105. src += 4 * width;
  106. dst += 4 * width;
  107. src += srcskip;
  108. dst += dstskip;
  109. }
  110. }
  111. #endif
  112. #endif