SDL_blit_A.c 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461
  1. /*
  2. Simple DirectMedia Layer
  3. Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
  4. This software is provided 'as-is', without any express or implied
  5. warranty. In no event will the authors be held liable for any damages
  6. arising from the use of this software.
  7. Permission is granted to anyone to use this software for any purpose,
  8. including commercial applications, and to alter it and redistribute it
  9. freely, subject to the following restrictions:
  10. 1. The origin of this software must not be misrepresented; you must not
  11. claim that you wrote the original software. If you use this software
  12. in a product, an acknowledgment in the product documentation would be
  13. appreciated but is not required.
  14. 2. Altered source versions must be plainly marked as such, and must not be
  15. misrepresented as being the original software.
  16. 3. This notice may not be removed or altered from any source distribution.
  17. */
  18. #include "SDL_internal.h"
  19. #if SDL_HAVE_BLIT_A
  20. #include "SDL_surface_c.h"
  21. // Functions to perform alpha blended blitting
  22. // N->1 blending with per-surface alpha
  23. static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
  24. {
  25. int width = info->dst_w;
  26. int height = info->dst_h;
  27. Uint8 *src = info->src;
  28. int srcskip = info->src_skip;
  29. Uint8 *dst = info->dst;
  30. int dstskip = info->dst_skip;
  31. Uint8 *palmap = info->table;
  32. const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
  33. const SDL_Color *dstpal = info->dst_pal->colors;
  34. int srcbpp = srcfmt->bytes_per_pixel;
  35. Uint32 Pixel;
  36. unsigned sR, sG, sB;
  37. unsigned dR, dG, dB;
  38. const unsigned A = info->a;
  39. while (height--) {
  40. /* *INDENT-OFF* */ // clang-format off
  41. DUFFS_LOOP4(
  42. {
  43. DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  44. dR = dstpal[*dst].r;
  45. dG = dstpal[*dst].g;
  46. dB = dstpal[*dst].b;
  47. ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
  48. dR &= 0xff;
  49. dG &= 0xff;
  50. dB &= 0xff;
  51. // Pack RGB into 8bit pixel
  52. if ( palmap == NULL ) {
  53. *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
  54. } else {
  55. *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
  56. }
  57. dst++;
  58. src += srcbpp;
  59. },
  60. width);
  61. /* *INDENT-ON* */ // clang-format on
  62. src += srcskip;
  63. dst += dstskip;
  64. }
  65. }
  66. // N->1 blending with pixel alpha
  67. static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
  68. {
  69. int width = info->dst_w;
  70. int height = info->dst_h;
  71. Uint8 *src = info->src;
  72. int srcskip = info->src_skip;
  73. Uint8 *dst = info->dst;
  74. int dstskip = info->dst_skip;
  75. Uint8 *palmap = info->table;
  76. const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
  77. const SDL_Color *dstpal = info->dst_pal->colors;
  78. int srcbpp = srcfmt->bytes_per_pixel;
  79. Uint32 Pixel;
  80. unsigned sR, sG, sB, sA;
  81. unsigned dR, dG, dB;
  82. while (height--) {
  83. /* *INDENT-OFF* */ // clang-format off
  84. DUFFS_LOOP4(
  85. {
  86. DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
  87. dR = dstpal[*dst].r;
  88. dG = dstpal[*dst].g;
  89. dB = dstpal[*dst].b;
  90. ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
  91. dR &= 0xff;
  92. dG &= 0xff;
  93. dB &= 0xff;
  94. // Pack RGB into 8bit pixel
  95. if ( palmap == NULL ) {
  96. *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
  97. } else {
  98. *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
  99. }
  100. dst++;
  101. src += srcbpp;
  102. },
  103. width);
  104. /* *INDENT-ON* */ // clang-format on
  105. src += srcskip;
  106. dst += dstskip;
  107. }
  108. }
  109. // colorkeyed N->1 blending with per-surface alpha
  110. static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
  111. {
  112. int width = info->dst_w;
  113. int height = info->dst_h;
  114. Uint8 *src = info->src;
  115. int srcskip = info->src_skip;
  116. Uint8 *dst = info->dst;
  117. int dstskip = info->dst_skip;
  118. Uint8 *palmap = info->table;
  119. const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
  120. const SDL_Color *dstpal = info->dst_pal->colors;
  121. int srcbpp = srcfmt->bytes_per_pixel;
  122. Uint32 ckey = info->colorkey;
  123. Uint32 Pixel;
  124. unsigned sR, sG, sB;
  125. unsigned dR, dG, dB;
  126. const unsigned A = info->a;
  127. while (height--) {
  128. /* *INDENT-OFF* */ // clang-format off
  129. DUFFS_LOOP(
  130. {
  131. DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  132. if ( Pixel != ckey ) {
  133. dR = dstpal[*dst].r;
  134. dG = dstpal[*dst].g;
  135. dB = dstpal[*dst].b;
  136. ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
  137. dR &= 0xff;
  138. dG &= 0xff;
  139. dB &= 0xff;
  140. // Pack RGB into 8bit pixel
  141. if ( palmap == NULL ) {
  142. *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
  143. } else {
  144. *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
  145. }
  146. }
  147. dst++;
  148. src += srcbpp;
  149. },
  150. width);
  151. /* *INDENT-ON* */ // clang-format on
  152. src += srcskip;
  153. dst += dstskip;
  154. }
  155. }
  #ifdef SDL_SSE2_INTRINSICS

  // 32-bit -> 32-bit blending with per-surface alpha, SSE2 version.
  //
  // Each row is processed four pixels (16 bytes) at a time with unaligned
  // SSE2 loads/stores; the remaining 0-3 pixels of the row are blended one at
  // a time with FACTOR_BLEND_8888. The alpha byte of every written pixel is
  // forced to 0xff.
  static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *info)
  {
      int width = info->dst_w;
      int height = info->dst_h;
      Uint8 *src = info->src;
      int srcskip = info->src_skip;
      Uint8 *dst = info->dst;
      int dstskip = info->dst_skip;
      Uint8 alpha = info->a;

      const __m128i alpha_fill_mask = _mm_set1_epi32((int)0xff000000);
      const __m128i srcA = _mm_set1_epi16(alpha);

      while (height--) {
          int i = 0;

          for (; i + 4 <= width; i += 4) {
              // Load 4 src pixels
              __m128i src128 = _mm_loadu_si128((__m128i *)src);

              // Load 4 dst pixels
              __m128i dst128 = _mm_loadu_si128((__m128i *)dst);

              // Widen every 8-bit channel to 16 bits so the products fit
              __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
              __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());

              __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
              __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());

              // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
              dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srcA),
                                     _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
              dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srcA),
                                     _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));

              // dst += 0x1U (use 0x80 to round instead of floor)
              dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
              dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));

              // dst = (dst + (dst >> 8)) >> 8
              dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
              dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);

              // Narrow back to 8 bits per channel
              dst128 = _mm_packus_epi16(dst_lo, dst_hi);

              // Set the alpha channels of dst to 255
              dst128 = _mm_or_si128(dst128, alpha_fill_mask);

              _mm_storeu_si128((__m128i *)dst, dst128);

              src += 16;
              dst += 16;
          }

          // Scalar tail for the last (width % 4) pixels of the row
          for (; i < width; ++i) {
              Uint32 src32 = *(Uint32 *)src;
              Uint32 dst32 = *(Uint32 *)dst;

              FACTOR_BLEND_8888(src32, dst32, alpha);

              // dst is a byte pointer: the store must go through a Uint32
              // pointer, otherwise only the lowest byte of the blended pixel
              // would be written.
              *(Uint32 *)dst = dst32 | 0xff000000;

              src += 4;
              dst += 4;
          }

          src += srcskip;
          dst += dstskip;
      }
  }

  #endif
  210. // fast RGB888->(A)RGB888 blending with surface alpha=128 special case
  211. static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
  212. {
  213. int width = info->dst_w;
  214. int height = info->dst_h;
  215. Uint32 *srcp = (Uint32 *)info->src;
  216. int srcskip = info->src_skip >> 2;
  217. Uint32 *dstp = (Uint32 *)info->dst;
  218. int dstskip = info->dst_skip >> 2;
  219. while (height--) {
  220. /* *INDENT-OFF* */ // clang-format off
  221. DUFFS_LOOP4({
  222. Uint32 s = *srcp++;
  223. Uint32 d = *dstp;
  224. *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  225. + (s & d & 0x00010101)) | 0xff000000;
  226. }, width);
  227. /* *INDENT-ON* */ // clang-format on
  228. srcp += srcskip;
  229. dstp += dstskip;
  230. }
  231. }
  232. // fast RGB888->(A)RGB888 blending with surface alpha
  233. static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
  234. {
  235. unsigned alpha = info->a;
  236. if (alpha == 128) {
  237. BlitRGBtoRGBSurfaceAlpha128(info);
  238. } else {
  239. int width = info->dst_w;
  240. int height = info->dst_h;
  241. Uint32 *srcp = (Uint32 *)info->src;
  242. int srcskip = info->src_skip >> 2;
  243. Uint32 *dstp = (Uint32 *)info->dst;
  244. int dstskip = info->dst_skip >> 2;
  245. Uint32 s;
  246. Uint32 d;
  247. while (height--) {
  248. /* *INDENT-OFF* */ // clang-format off
  249. DUFFS_LOOP4({
  250. s = *srcp;
  251. d = *dstp;
  252. FACTOR_BLEND_8888(s, d, alpha);
  253. *dstp = d | 0xff000000;
  254. ++srcp;
  255. ++dstp;
  256. }, width);
  257. /* *INDENT-ON* */ // clang-format on
  258. srcp += srcskip;
  259. dstp += dstskip;
  260. }
  261. }
  262. }
  // 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel

  // Blend a single 16 bit pixel at 50%.
  // `mask` selects every channel bit except the lowest bit of each channel, so
  // the masked values can be added and halved without bleeding into the
  // neighbouring channel; the low bit shared by both inputs is added back.
  // Every argument is parenthesized so that expressions may be passed safely.
  #define BLEND16_50(d, s, mask) \
      (((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))

  // Blend two 16 bit pixels (packed in one 32 bit word) at 50%.
  // The 16-bit mask is widened to Uint32 before being replicated into the
  // upper half: shifting a promoted (signed int) value with bit 15 set left
  // by 16 would shift into the sign bit.
  #define BLEND2x16_50(d, s, mask)                                          \
      ((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +           \
       (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +           \
       ((s) & (d) & ~((Uint32)(mask) | ((Uint32)(mask) << 16))))
  270. static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
  271. {
  272. int width = info->dst_w;
  273. int height = info->dst_h;
  274. Uint16 *srcp = (Uint16 *)info->src;
  275. int srcskip = info->src_skip >> 1;
  276. Uint16 *dstp = (Uint16 *)info->dst;
  277. int dstskip = info->dst_skip >> 1;
  278. while (height--) {
  279. if (((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
  280. /*
  281. * Source and destination not aligned, pipeline it.
  282. * This is mostly a win for big blits but no loss for
  283. * small ones
  284. */
  285. Uint32 prev_sw;
  286. int w = width;
  287. // handle odd destination
  288. if ((uintptr_t)dstp & 2) {
  289. Uint16 d = *dstp, s = *srcp;
  290. *dstp = BLEND16_50(d, s, mask);
  291. dstp++;
  292. srcp++;
  293. w--;
  294. }
  295. srcp++; // srcp is now 32-bit aligned
  296. // bootstrap pipeline with first halfword
  297. prev_sw = ((Uint32 *)srcp)[-1];
  298. while (w > 1) {
  299. Uint32 sw, dw, s;
  300. sw = *(Uint32 *)srcp;
  301. dw = *(Uint32 *)dstp;
  302. #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  303. s = (prev_sw << 16) + (sw >> 16);
  304. #else
  305. s = (prev_sw >> 16) + (sw << 16);
  306. #endif
  307. prev_sw = sw;
  308. *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
  309. dstp += 2;
  310. srcp += 2;
  311. w -= 2;
  312. }
  313. // final pixel if any
  314. if (w) {
  315. Uint16 d = *dstp, s;
  316. #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  317. s = (Uint16)prev_sw;
  318. #else
  319. s = (Uint16)(prev_sw >> 16);
  320. #endif
  321. *dstp = BLEND16_50(d, s, mask);
  322. srcp++;
  323. dstp++;
  324. }
  325. srcp += srcskip - 1;
  326. dstp += dstskip;
  327. } else {
  328. // source and destination are aligned
  329. int w = width;
  330. // first odd pixel?
  331. if ((uintptr_t)srcp & 2) {
  332. Uint16 d = *dstp, s = *srcp;
  333. *dstp = BLEND16_50(d, s, mask);
  334. srcp++;
  335. dstp++;
  336. w--;
  337. }
  338. // srcp and dstp are now 32-bit aligned
  339. while (w > 1) {
  340. Uint32 sw = *(Uint32 *)srcp;
  341. Uint32 dw = *(Uint32 *)dstp;
  342. *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
  343. srcp += 2;
  344. dstp += 2;
  345. w -= 2;
  346. }
  347. // last odd pixel?
  348. if (w) {
  349. Uint16 d = *dstp, s = *srcp;
  350. *dstp = BLEND16_50(d, s, mask);
  351. srcp++;
  352. dstp++;
  353. }
  354. srcp += srcskip;
  355. dstp += dstskip;
  356. }
  357. }
  358. }
  359. #ifdef SDL_MMX_INTRINSICS
  360. // fast RGB565->RGB565 blending with surface alpha
  361. static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
  362. {
  363. unsigned alpha = info->a;
  364. if (alpha == 128) {
  365. Blit16to16SurfaceAlpha128(info, 0xf7de);
  366. } else {
  367. int width = info->dst_w;
  368. int height = info->dst_h;
  369. Uint16 *srcp = (Uint16 *)info->src;
  370. int srcskip = info->src_skip >> 1;
  371. Uint16 *dstp = (Uint16 *)info->dst;
  372. int dstskip = info->dst_skip >> 1;
  373. Uint32 s, d;
  374. #ifdef USE_DUFFS_LOOP
  375. __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  376. alpha &= ~(1 + 2 + 4); // cut alpha to get the exact same behaviour
  377. mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha
  378. alpha >>= 3; // downscale alpha to 5 bits
  379. mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
  380. mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
  381. /* position alpha to allow for mullo and mulhi on diff channels
  382. to reduce the number of operations */
  383. mm_alpha = _mm_slli_si64(mm_alpha, 3);
  384. // Setup the 565 color channel masks
  385. gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); // MASKGREEN -> gmask
  386. bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask
  387. #endif
  388. while (height--) {
  389. /* *INDENT-OFF* */ // clang-format off
  390. DUFFS_LOOP_124(
  391. {
  392. s = *srcp++;
  393. d = *dstp;
  394. /*
  395. * shift out the middle component (green) to
  396. * the high 16 bits, and process all three RGB
  397. * components at the same time.
  398. */
  399. s = (s | s << 16) & 0x07e0f81f;
  400. d = (d | d << 16) & 0x07e0f81f;
  401. d += (s - d) * alpha >> 5;
  402. d &= 0x07e0f81f;
  403. *dstp++ = (Uint16)(d | d >> 16);
  404. },{
  405. s = *srcp++;
  406. d = *dstp;
  407. /*
  408. * shift out the middle component (green) to
  409. * the high 16 bits, and process all three RGB
  410. * components at the same time.
  411. */
  412. s = (s | s << 16) & 0x07e0f81f;
  413. d = (d | d << 16) & 0x07e0f81f;
  414. d += (s - d) * alpha >> 5;
  415. d &= 0x07e0f81f;
  416. *dstp++ = (Uint16)(d | d >> 16);
  417. s = *srcp++;
  418. d = *dstp;
  419. /*
  420. * shift out the middle component (green) to
  421. * the high 16 bits, and process all three RGB
  422. * components at the same time.
  423. */
  424. s = (s | s << 16) & 0x07e0f81f;
  425. d = (d | d << 16) & 0x07e0f81f;
  426. d += (s - d) * alpha >> 5;
  427. d &= 0x07e0f81f;
  428. *dstp++ = (Uint16)(d | d >> 16);
  429. },{
  430. src1 = *(__m64*)srcp; // 4 src pixels -> src1
  431. dst1 = *(__m64*)dstp; // 4 dst pixels -> dst1
  432. // red
  433. src2 = src1;
  434. src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2 [000r 000r 000r 000r]
  435. dst2 = dst1;
  436. dst2 = _mm_srli_pi16(dst2, 11); // dst2 >> 11 -> dst2 [000r 000r 000r 000r]
  437. // blend
  438. src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
  439. src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  440. src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
  441. dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
  442. dst2 = _mm_slli_pi16(dst2, 11); // dst2 << 11 -> dst2
  443. mm_res = dst2; // RED -> mm_res
  444. // green -- process the bits in place
  445. src2 = src1;
  446. src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2
  447. dst2 = dst1;
  448. dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2
  449. // blend
  450. src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
  451. src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  452. src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
  453. dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
  454. mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res
  455. // blue
  456. src2 = src1;
  457. src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]
  458. dst2 = dst1;
  459. dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]
  460. // blend
  461. src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
  462. src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  463. src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
  464. dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
  465. dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2
  466. mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res
  467. *(__m64*)dstp = mm_res; // mm_res -> 4 dst pixels
  468. srcp += 4;
  469. dstp += 4;
  470. }, width);
  471. /* *INDENT-ON* */ // clang-format on
  472. srcp += srcskip;
  473. dstp += dstskip;
  474. }
  475. _mm_empty();
  476. }
  477. }
  478. // fast RGB555->RGB555 blending with surface alpha
  479. static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
  480. {
  481. unsigned alpha = info->a;
  482. if (alpha == 128) {
  483. Blit16to16SurfaceAlpha128(info, 0xfbde);
  484. } else {
  485. int width = info->dst_w;
  486. int height = info->dst_h;
  487. Uint16 *srcp = (Uint16 *)info->src;
  488. int srcskip = info->src_skip >> 1;
  489. Uint16 *dstp = (Uint16 *)info->dst;
  490. int dstskip = info->dst_skip >> 1;
  491. Uint32 s, d;
  492. #ifdef USE_DUFFS_LOOP
  493. __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  494. alpha &= ~(1 + 2 + 4); // cut alpha to get the exact same behaviour
  495. mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha
  496. alpha >>= 3; // downscale alpha to 5 bits
  497. mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
  498. mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
  499. /* position alpha to allow for mullo and mulhi on diff channels
  500. to reduce the number of operations */
  501. mm_alpha = _mm_slli_si64(mm_alpha, 3);
  502. // Setup the 555 color channel masks
  503. rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); // MASKRED -> rmask
  504. gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); // MASKGREEN -> gmask
  505. bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask
  506. #endif
  507. while (height--) {
  508. /* *INDENT-OFF* */ // clang-format off
  509. DUFFS_LOOP_124(
  510. {
  511. s = *srcp++;
  512. d = *dstp;
  513. /*
  514. * shift out the middle component (green) to
  515. * the high 16 bits, and process all three RGB
  516. * components at the same time.
  517. */
  518. s = (s | s << 16) & 0x03e07c1f;
  519. d = (d | d << 16) & 0x03e07c1f;
  520. d += (s - d) * alpha >> 5;
  521. d &= 0x03e07c1f;
  522. *dstp++ = (Uint16)(d | d >> 16);
  523. },{
  524. s = *srcp++;
  525. d = *dstp;
  526. /*
  527. * shift out the middle component (green) to
  528. * the high 16 bits, and process all three RGB
  529. * components at the same time.
  530. */
  531. s = (s | s << 16) & 0x03e07c1f;
  532. d = (d | d << 16) & 0x03e07c1f;
  533. d += (s - d) * alpha >> 5;
  534. d &= 0x03e07c1f;
  535. *dstp++ = (Uint16)(d | d >> 16);
  536. s = *srcp++;
  537. d = *dstp;
  538. /*
  539. * shift out the middle component (green) to
  540. * the high 16 bits, and process all three RGB
  541. * components at the same time.
  542. */
  543. s = (s | s << 16) & 0x03e07c1f;
  544. d = (d | d << 16) & 0x03e07c1f;
  545. d += (s - d) * alpha >> 5;
  546. d &= 0x03e07c1f;
  547. *dstp++ = (Uint16)(d | d >> 16);
  548. },{
  549. src1 = *(__m64*)srcp; // 4 src pixels -> src1
  550. dst1 = *(__m64*)dstp; // 4 dst pixels -> dst1
  551. // red -- process the bits in place
  552. src2 = src1;
  553. src2 = _mm_and_si64(src2, rmask); // src & MASKRED -> src2
  554. dst2 = dst1;
  555. dst2 = _mm_and_si64(dst2, rmask); // dst & MASKRED -> dst2
  556. // blend
  557. src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
  558. src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  559. src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
  560. dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
  561. dst2 = _mm_and_si64(dst2, rmask); // dst2 & MASKRED -> dst2
  562. mm_res = dst2; // RED -> mm_res
  563. // green -- process the bits in place
  564. src2 = src1;
  565. src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2
  566. dst2 = dst1;
  567. dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2
  568. // blend
  569. src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
  570. src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  571. src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
  572. dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
  573. mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res
  574. // blue
  575. src2 = src1; // src -> src2
  576. src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]
  577. dst2 = dst1; // dst -> dst2
  578. dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]
  579. // blend
  580. src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
  581. src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  582. src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
  583. dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
  584. dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2
  585. mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res
  586. *(__m64*)dstp = mm_res; // mm_res -> 4 dst pixels
  587. srcp += 4;
  588. dstp += 4;
  589. }, width);
  590. /* *INDENT-ON* */ // clang-format on
  591. srcp += srcskip;
  592. dstp += dstskip;
  593. }
  594. _mm_empty();
  595. }
  596. }
  597. #endif // SDL_MMX_INTRINSICS
  598. // fast RGB565->RGB565 blending with surface alpha
  599. static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
  600. {
  601. unsigned alpha = info->a;
  602. if (alpha == 128) {
  603. Blit16to16SurfaceAlpha128(info, 0xf7de);
  604. } else {
  605. int width = info->dst_w;
  606. int height = info->dst_h;
  607. Uint16 *srcp = (Uint16 *)info->src;
  608. int srcskip = info->src_skip >> 1;
  609. Uint16 *dstp = (Uint16 *)info->dst;
  610. int dstskip = info->dst_skip >> 1;
  611. alpha >>= 3; // downscale alpha to 5 bits
  612. while (height--) {
  613. /* *INDENT-OFF* */ // clang-format off
  614. DUFFS_LOOP4({
  615. Uint32 s = *srcp++;
  616. Uint32 d = *dstp;
  617. /*
  618. * shift out the middle component (green) to
  619. * the high 16 bits, and process all three RGB
  620. * components at the same time.
  621. */
  622. s = (s | s << 16) & 0x07e0f81f;
  623. d = (d | d << 16) & 0x07e0f81f;
  624. d += (s - d) * alpha >> 5;
  625. d &= 0x07e0f81f;
  626. *dstp++ = (Uint16)(d | d >> 16);
  627. }, width);
  628. /* *INDENT-ON* */ // clang-format on
  629. srcp += srcskip;
  630. dstp += dstskip;
  631. }
  632. }
  633. }
  634. // fast RGB555->RGB555 blending with surface alpha
  635. static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
  636. {
  637. unsigned alpha = info->a; // downscale alpha to 5 bits
  638. if (alpha == 128) {
  639. Blit16to16SurfaceAlpha128(info, 0xfbde);
  640. } else {
  641. int width = info->dst_w;
  642. int height = info->dst_h;
  643. Uint16 *srcp = (Uint16 *)info->src;
  644. int srcskip = info->src_skip >> 1;
  645. Uint16 *dstp = (Uint16 *)info->dst;
  646. int dstskip = info->dst_skip >> 1;
  647. alpha >>= 3; // downscale alpha to 5 bits
  648. while (height--) {
  649. /* *INDENT-OFF* */ // clang-format off
  650. DUFFS_LOOP4({
  651. Uint32 s = *srcp++;
  652. Uint32 d = *dstp;
  653. /*
  654. * shift out the middle component (green) to
  655. * the high 16 bits, and process all three RGB
  656. * components at the same time.
  657. */
  658. s = (s | s << 16) & 0x03e07c1f;
  659. d = (d | d << 16) & 0x03e07c1f;
  660. d += (s - d) * alpha >> 5;
  661. d &= 0x03e07c1f;
  662. *dstp++ = (Uint16)(d | d >> 16);
  663. }, width);
  664. /* *INDENT-ON* */ // clang-format on
  665. srcp += srcskip;
  666. dstp += dstskip;
  667. }
  668. }
  669. }
  670. // fast ARGB8888->RGB565 blending with pixel alpha
  671. static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
  672. {
  673. int width = info->dst_w;
  674. int height = info->dst_h;
  675. Uint32 *srcp = (Uint32 *)info->src;
  676. int srcskip = info->src_skip >> 2;
  677. Uint16 *dstp = (Uint16 *)info->dst;
  678. int dstskip = info->dst_skip >> 1;
  679. while (height--) {
  680. /* *INDENT-OFF* */ // clang-format off
  681. DUFFS_LOOP4({
  682. Uint32 s = *srcp;
  683. unsigned alpha = s >> 27; // downscale alpha to 5 bits
  684. /* Here we special-case opaque alpha since the
  685. compositioning used (>>8 instead of /255) doesn't handle
  686. it correctly. */
  687. if (alpha) {
  688. if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  689. *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
  690. } else {
  691. Uint32 d = *dstp;
  692. /*
  693. * convert source and destination to G0RAB65565
  694. * and blend all components at the same time
  695. */
  696. s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) + (s >> 3 & 0x1f);
  697. d = (d | d << 16) & 0x07e0f81f;
  698. d += (s - d) * alpha >> 5;
  699. d &= 0x07e0f81f;
  700. *dstp = (Uint16)(d | d >> 16);
  701. }
  702. }
  703. srcp++;
  704. dstp++;
  705. }, width);
  706. /* *INDENT-ON* */ // clang-format on
  707. srcp += srcskip;
  708. dstp += dstskip;
  709. }
  710. }
  711. // fast ARGB8888->RGB555 blending with pixel alpha
  712. static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
  713. {
  714. int width = info->dst_w;
  715. int height = info->dst_h;
  716. Uint32 *srcp = (Uint32 *)info->src;
  717. int srcskip = info->src_skip >> 2;
  718. Uint16 *dstp = (Uint16 *)info->dst;
  719. int dstskip = info->dst_skip >> 1;
  720. while (height--) {
  721. /* *INDENT-OFF* */ // clang-format off
  722. DUFFS_LOOP4({
  723. unsigned alpha;
  724. Uint32 s = *srcp;
  725. alpha = s >> 27; // downscale alpha to 5 bits
  726. /* Here we special-case opaque alpha since the
  727. compositioning used (>>8 instead of /255) doesn't handle
  728. it correctly. */
  729. if (alpha) {
  730. if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  731. *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
  732. } else {
  733. Uint32 d = *dstp;
  734. /*
  735. * convert source and destination to G0RAB55555
  736. * and blend all components at the same time
  737. */
  738. s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) + (s >> 3 & 0x1f);
  739. d = (d | d << 16) & 0x03e07c1f;
  740. d += (s - d) * alpha >> 5;
  741. d &= 0x03e07c1f;
  742. *dstp = (Uint16)(d | d >> 16);
  743. }
  744. }
  745. srcp++;
  746. dstp++;
  747. }, width);
  748. /* *INDENT-ON* */ // clang-format on
  749. srcp += srcskip;
  750. dstp += dstskip;
  751. }
  752. }
  753. // General (slow) N->N blending with per-surface alpha
  754. static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
  755. {
  756. int width = info->dst_w;
  757. int height = info->dst_h;
  758. Uint8 *src = info->src;
  759. int srcskip = info->src_skip;
  760. Uint8 *dst = info->dst;
  761. int dstskip = info->dst_skip;
  762. const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
  763. const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
  764. int srcbpp = srcfmt->bytes_per_pixel;
  765. int dstbpp = dstfmt->bytes_per_pixel;
  766. Uint32 Pixel;
  767. unsigned sR, sG, sB;
  768. unsigned dR, dG, dB, dA;
  769. const unsigned sA = info->a;
  770. if (sA) {
  771. while (height--) {
  772. /* *INDENT-OFF* */ // clang-format off
  773. DUFFS_LOOP4(
  774. {
  775. DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  776. DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  777. ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  778. ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  779. src += srcbpp;
  780. dst += dstbpp;
  781. },
  782. width);
  783. /* *INDENT-ON* */ // clang-format on
  784. src += srcskip;
  785. dst += dstskip;
  786. }
  787. }
  788. }
  789. // General (slow) colorkeyed N->N blending with per-surface alpha
  790. static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
  791. {
  792. int width = info->dst_w;
  793. int height = info->dst_h;
  794. Uint8 *src = info->src;
  795. int srcskip = info->src_skip;
  796. Uint8 *dst = info->dst;
  797. int dstskip = info->dst_skip;
  798. const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
  799. const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
  800. Uint32 ckey = info->colorkey;
  801. int srcbpp = srcfmt->bytes_per_pixel;
  802. int dstbpp = dstfmt->bytes_per_pixel;
  803. Uint32 Pixel;
  804. unsigned sR, sG, sB;
  805. unsigned dR, dG, dB, dA;
  806. const unsigned sA = info->a;
  807. while (height--) {
  808. /* *INDENT-OFF* */ // clang-format off
  809. DUFFS_LOOP4(
  810. {
  811. RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  812. if (sA && Pixel != ckey) {
  813. RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  814. DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  815. ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  816. ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  817. }
  818. src += srcbpp;
  819. dst += dstbpp;
  820. },
  821. width);
  822. /* *INDENT-ON* */ // clang-format on
  823. src += srcskip;
  824. dst += dstskip;
  825. }
  826. }
  827. // Fast 32-bit RGBA->RGBA blending with pixel alpha
  828. static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info)
  829. {
  830. int width = info->dst_w;
  831. int height = info->dst_h;
  832. Uint8 *src = info->src;
  833. int srcskip = info->src_skip;
  834. Uint8 *dst = info->dst;
  835. int dstskip = info->dst_skip;
  836. const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
  837. while (height--) {
  838. int i = 0;
  839. for (; i < width; ++i) {
  840. Uint32 src32 = *(Uint32 *)src;
  841. Uint32 dst32 = *(Uint32 *)dst;
  842. ALPHA_BLEND_8888(src32, dst32, srcfmt);
  843. *(Uint32 *)dst = dst32;
  844. src += 4;
  845. dst += 4;
  846. }
  847. src += srcskip;
  848. dst += dstskip;
  849. }
  850. }
  851. // Fast 32-bit RGBA->RGB(A) blending with pixel alpha and src swizzling
  852. static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo *info)
  853. {
  854. int width = info->dst_w;
  855. int height = info->dst_h;
  856. Uint8 *src = info->src;
  857. int srcskip = info->src_skip;
  858. Uint8 *dst = info->dst;
  859. int dstskip = info->dst_skip;
  860. const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
  861. const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
  862. while (height--) {
  863. int i = 0;
  864. for (; i < width; ++i) {
  865. Uint32 src32 = *(Uint32 *)src;
  866. Uint32 dst32 = *(Uint32 *)dst;
  867. ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
  868. *(Uint32 *)dst = dst32;
  869. src += 4;
  870. dst += 4;
  871. }
  872. src += srcskip;
  873. dst += dstskip;
  874. }
  875. }
  #ifdef SDL_SSE4_1_INTRINSICS
  /*
   * SSE4.1 version of the swizzling 32-bit per-pixel alpha blit.
   * Four pixels are blended per vector iteration; the leftover pixels of
   * each row (at most three) go through ALPHA_BLEND_SWIZZLE_8888.
   */
  static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info)
  {
      int width = info->dst_w;
      int height = info->dst_h;
      Uint8 *src = info->src;
      Uint8 *dst = info->dst;
      int srcskip = info->src_skip;
      int dstskip = info->dst_skip;
      const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
      const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;

      // Byte offset of the pixel each output byte belongs to
      const __m128i pixel_base = _mm_set_epi8(
          12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
      // For each dst color channel, the byte index of that channel inside a src pixel
      const int channel_select = ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
                                 ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
                                 ((srcfmt->Bshift >> 3) << dstfmt->Bshift);
      // Shuffle pattern that rearranges src bytes into dst channel order
      const __m128i swizzle_idx = _mm_add_epi32(_mm_set1_epi32(channel_select), pixel_base);
      // Shuffle pattern that copies each pixel's alpha byte over all four of its bytes
      const __m128i alpha_idx = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), pixel_base);
      // Bits of the dst alpha channel
      const __m128i opaque_bits = _mm_set1_epi32((int)dstfmt->Amask);
      // XOR pattern: turns the upper byte of each 16-bit lane from A into 255-A
      const __m128i flip_upper = _mm_set1_epi16(0xff00);
      // maddubs treats its second operand as signed, so pixels are biased by 128
      const __m128i bias = _mm_set1_epi8((Uint8)128);
      // +1 for the truncating divide (0x80 here would round instead) plus 128*255 to undo the bias
      const __m128i fixup = _mm_set1_epi16(1 + 128*255);
      // (v * 257) >> 16 is the same as (v + (v >> 8)) >> 8
      const __m128i mul257 = _mm_set1_epi16(257);

      for (; height != 0; --height) {
          int i;

          for (i = 0; i + 4 <= width; i += 4) {
              // Fetch four source and four destination pixels
              __m128i src_vec = _mm_loadu_si128((__m128i *)src);
              __m128i dst_vec = _mm_loadu_si128((__m128i *)dst);

              // Source alpha replicated across every channel of its pixel
              __m128i alpha = _mm_shuffle_epi8(src_vec, alpha_idx);

              // Reorder the source channels for dst and force its alpha channel on
              src_vec = _mm_shuffle_epi8(src_vec, swizzle_idx);
              src_vec = _mm_or_si128(src_vec, opaque_bits);

              // Per 16-bit lane: low byte = A, high byte = 255 - A
              __m128i weight_lo = _mm_xor_si128(_mm_unpacklo_epi8(alpha, alpha), flip_upper);
              __m128i weight_hi = _mm_xor_si128(_mm_unpackhi_epi8(alpha, alpha), flip_upper);

              // Bias both inputs into the signed range maddubs expects
              src_vec = _mm_sub_epi8(src_vec, bias);
              dst_vec = _mm_sub_epi8(dst_vec, bias);

              // A*(src-128) + (255-A)*(dst-128) = A*src + (255-A)*dst - 128*255
              __m128i sum_lo = _mm_maddubs_epi16(weight_lo, _mm_unpacklo_epi8(src_vec, dst_vec));
              __m128i sum_hi = _mm_maddubs_epi16(weight_hi, _mm_unpackhi_epi8(src_vec, dst_vec));

              // Undo the bias and add the +1 used by the divide
              sum_lo = _mm_add_epi16(sum_lo, fixup);
              sum_hi = _mm_add_epi16(sum_hi, fixup);

              // Scale the 16-bit sums back down to 8 bits
              sum_lo = _mm_mulhi_epu16(sum_lo, mul257);
              sum_hi = _mm_mulhi_epu16(sum_hi, mul257);

              // Pack to bytes and write the four blended pixels
              _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(sum_lo, sum_hi));

              src += 16;
              dst += 16;
          }

          // Remaining pixels of the row, one at a time
          for (; i < width; ++i) {
              Uint32 src32 = *(Uint32 *)src;
              Uint32 dst32 = *(Uint32 *)dst;
              ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
              *(Uint32 *)dst = dst32;
              src += 4;
              dst += 4;
          }

          src += srcskip;
          dst += dstskip;
      }
  }
  #endif
  #ifdef SDL_AVX2_INTRINSICS
  /*
   * AVX2 version of the swizzling 32-bit per-pixel alpha blit.
   * Eight pixels are blended per vector iteration; the leftover pixels of
   * each row (at most seven) go through ALPHA_BLEND_SWIZZLE_8888.
   */
  static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info)
  {
      int width = info->dst_w;
      int height = info->dst_h;
      Uint8 *src = info->src;
      Uint8 *dst = info->dst;
      int srcskip = info->src_skip;
      int dstskip = info->dst_skip;
      const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
      const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;

      // Byte offset of the pixel each output byte belongs to
      const __m256i pixel_base = _mm256_set_epi8(
          28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16,
          12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
      // For each dst color channel, the byte index of that channel inside a src pixel
      const int channel_select = ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
                                 ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
                                 ((srcfmt->Bshift >> 3) << dstfmt->Bshift);
      // Shuffle pattern that rearranges src bytes into dst channel order
      const __m256i swizzle_idx = _mm256_add_epi32(_mm256_set1_epi32(channel_select), pixel_base);
      // Shuffle pattern that copies each pixel's alpha byte over all four of its bytes
      const __m256i alpha_idx = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), pixel_base);
      // Bits of the dst alpha channel
      const __m256i opaque_bits = _mm256_set1_epi32((int)dstfmt->Amask);
      // XOR pattern: turns the upper byte of each 16-bit lane from A into 255-A
      const __m256i flip_upper = _mm256_set1_epi16(0xff00);
      // maddubs treats its second operand as signed, so pixels are biased by 128
      const __m256i bias = _mm256_set1_epi8((Uint8)128);
      // +1 for the truncating divide (0x80 here would round instead) plus 128*255 to undo the bias
      const __m256i fixup = _mm256_set1_epi16(1 + 128*255);
      // (v * 257) >> 16 is the same as (v + (v >> 8)) >> 8
      const __m256i mul257 = _mm256_set1_epi16(257);

      for (; height != 0; --height) {
          int i;

          for (i = 0; i + 8 <= width; i += 8) {
              // Fetch eight source and eight destination pixels
              __m256i src_vec = _mm256_loadu_si256((__m256i *)src);
              __m256i dst_vec = _mm256_loadu_si256((__m256i *)dst);

              // Source alpha replicated across every channel of its pixel
              __m256i alpha = _mm256_shuffle_epi8(src_vec, alpha_idx);

              // Reorder the source channels for dst and force its alpha channel on
              src_vec = _mm256_shuffle_epi8(src_vec, swizzle_idx);
              src_vec = _mm256_or_si256(src_vec, opaque_bits);

              // Per 16-bit lane: low byte = A, high byte = 255 - A
              __m256i weight_lo = _mm256_xor_si256(_mm256_unpacklo_epi8(alpha, alpha), flip_upper);
              __m256i weight_hi = _mm256_xor_si256(_mm256_unpackhi_epi8(alpha, alpha), flip_upper);

              // Bias both inputs into the signed range maddubs expects
              src_vec = _mm256_sub_epi8(src_vec, bias);
              dst_vec = _mm256_sub_epi8(dst_vec, bias);

              // A*(src-128) + (255-A)*(dst-128) = A*src + (255-A)*dst - 128*255
              __m256i sum_lo = _mm256_maddubs_epi16(weight_lo, _mm256_unpacklo_epi8(src_vec, dst_vec));
              __m256i sum_hi = _mm256_maddubs_epi16(weight_hi, _mm256_unpackhi_epi8(src_vec, dst_vec));

              // Undo the bias and add the +1 used by the divide
              sum_lo = _mm256_add_epi16(sum_lo, fixup);
              sum_hi = _mm256_add_epi16(sum_hi, fixup);

              // Scale the 16-bit sums back down to 8 bits
              sum_lo = _mm256_mulhi_epu16(sum_lo, mul257);
              sum_hi = _mm256_mulhi_epu16(sum_hi, mul257);

              // Pack to bytes and write the eight blended pixels
              _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(sum_lo, sum_hi));

              src += 32;
              dst += 32;
          }

          // Remaining pixels of the row, one at a time
          for (; i < width; ++i) {
              Uint32 src32 = *(Uint32 *)src;
              Uint32 dst32 = *(Uint32 *)dst;
              ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
              *(Uint32 *)dst = dst32;
              src += 4;
              dst += 4;
          }

          src += srcskip;
          dst += dstskip;
      }
  }
  #endif
  #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8)
  /*
   * NEON version of the swizzling 32-bit per-pixel alpha blit.
   * Four pixels are blended per vector iteration; the leftover pixels of
   * each row (at most three) repeat the same arithmetic on 64-bit vectors.
   */
  static void Blit8888to8888PixelAlphaSwizzleNEON(SDL_BlitInfo *info)
  {
      int width = info->dst_w;
      int height = info->dst_h;
      Uint8 *src = info->src;
      Uint8 *dst = info->dst;
      int srcskip = info->src_skip;
      int dstskip = info->dst_skip;
      const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
      const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;

      // Byte offset of the pixel each output byte belongs to
      const uint8x16_t pixel_base = vreinterpretq_u8_u64(vcombine_u64(
          vcreate_u64(0x0404040400000000), vcreate_u64(0x0c0c0c0c08080808)));
      // Table-lookup pattern that rearranges src bytes into dst channel order
      const uint8x16_t swizzle_idx = vreinterpretq_u8_u32(vaddq_u32(
          vreinterpretq_u32_u8(pixel_base),
          vdupq_n_u32(
              ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
              ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
              ((srcfmt->Bshift >> 3) << dstfmt->Bshift))));
      // Table-lookup pattern that copies each pixel's alpha byte over all four of its bytes
      const uint8x16_t alpha_idx = vaddq_u8(vdupq_n_u8(srcfmt->Ashift >> 3), pixel_base);
      // Bits of the dst alpha channel
      const uint8x16_t opaque_bits = vreinterpretq_u8_u32(vdupq_n_u32(dstfmt->Amask));
      // Low halves of the patterns above, for the single-pixel tail loop
      const uint8x8_t swizzle_idx_lo = vget_low_u8(swizzle_idx);
      const uint8x8_t alpha_idx_lo = vget_low_u8(alpha_idx);
      const uint8x8_t opaque_bits_lo = vget_low_u8(opaque_bits);

      for (; height != 0; --height) {
          int i;

          for (i = 0; i + 4 <= width; i += 4) {
              // Fetch four source and four destination pixels
              uint8x16_t src_vec = vld1q_u8(src);
              uint8x16_t dst_vec = vld1q_u8(dst);

              // Source alpha replicated across every channel of its pixel
              uint8x16_t alpha = vqtbl1q_u8(src_vec, alpha_idx);

              // Reorder the source channels for dst and force its alpha channel on
              src_vec = vorrq_u8(vqtbl1q_u8(src_vec, swizzle_idx), opaque_bits);

              // 255 - alpha is the bitwise complement of alpha
              uint8x16_t inv_alpha = vmvnq_u8(alpha);

              // acc = 1 + alpha * src + (255 - alpha) * dst
              // (the leading 1 is part of the truncating divide below)
              uint16x8_t acc_lo = vmlal_u8(vdupq_n_u16(1), vget_low_u8(alpha), vget_low_u8(src_vec));
              acc_lo = vmlal_u8(acc_lo, vget_low_u8(inv_alpha), vget_low_u8(dst_vec));
              uint16x8_t acc_hi = vmlal_high_u8(vdupq_n_u16(1), alpha, src_vec);
              acc_hi = vmlal_high_u8(acc_hi, inv_alpha, dst_vec);

              // result = (acc + (acc >> 8)) >> 8, narrowed back to bytes
              uint8x8_t narrow_lo = vaddhn_u16(acc_lo, vshrq_n_u16(acc_lo, 8));
              dst_vec = vaddhn_high_u16(narrow_lo, acc_hi, vshrq_n_u16(acc_hi, 8));

              /* To round instead of truncate: drop the initial 1, start the
                 accumulators with vmull_u8, and narrow with
                   narrow_lo = vraddhn_u16(acc_lo, vrshrq_n_u16(acc_lo, 8));
                   dst_vec = vraddhn_high_u16(narrow_lo, acc_hi, vrshrq_n_u16(acc_hi, 8)); */

              // Write the four blended pixels
              vst1q_u8(dst, dst_vec);

              src += 16;
              dst += 16;
          }

          // Remaining pixels of the row, one per iteration, same math as above
          for (; i < width; ++i) {
              // The pixel is duplicated into both halves; only the low 32 bits are stored
              uint8x8_t src32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32*)src));
              uint8x8_t dst32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32*)dst));

              uint8x8_t alpha = vtbl1_u8(src32, alpha_idx_lo);
              src32 = vorr_u8(vtbl1_u8(src32, swizzle_idx_lo), opaque_bits_lo);
              uint8x8_t inv_alpha = vmvn_u8(alpha);

              uint16x8_t acc = vmlal_u8(vdupq_n_u16(1), alpha, src32);
              acc = vmlal_u8(acc, inv_alpha, dst32);
              dst32 = vaddhn_u16(acc, vshrq_n_u16(acc, 8));

              // Store lane 0 only, i.e. a single pixel
              vst1_lane_u32((Uint32*)dst, vreinterpret_u32_u8(dst32), 0);

              src += 4;
              dst += 4;
          }

          src += srcskip;
          dst += dstskip;
      }
  }
  #endif
  1100. // General (slow) N->N blending with pixel alpha
  1101. static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
  1102. {
  1103. int width = info->dst_w;
  1104. int height = info->dst_h;
  1105. Uint8 *src = info->src;
  1106. int srcskip = info->src_skip;
  1107. Uint8 *dst = info->dst;
  1108. int dstskip = info->dst_skip;
  1109. const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
  1110. const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
  1111. int srcbpp;
  1112. int dstbpp;
  1113. Uint32 Pixel;
  1114. unsigned sR, sG, sB, sA;
  1115. unsigned dR, dG, dB, dA;
  1116. // Set up some basic variables
  1117. srcbpp = srcfmt->bytes_per_pixel;
  1118. dstbpp = dstfmt->bytes_per_pixel;
  1119. while (height--) {
  1120. DUFFS_LOOP4(
  1121. {
  1122. DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  1123. if (sA) {
  1124. DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1125. ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1126. ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1127. }
  1128. src += srcbpp;
  1129. dst += dstbpp;
  1130. },
  1131. width);
  1132. /* *INDENT-ON* */ // clang-format on
  1133. src += srcskip;
  1134. dst += dstskip;
  1135. }
  1136. }
  1137. SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
  1138. {
  1139. const SDL_PixelFormatDetails *sf = surface->fmt;
  1140. const SDL_PixelFormatDetails *df = surface->map.info.dst_fmt;
  1141. switch (surface->map.info.flags & ~SDL_COPY_RLE_MASK) {
  1142. case SDL_COPY_BLEND:
  1143. // Per-pixel alpha blits
  1144. switch (df->bytes_per_pixel) {
  1145. case 1:
  1146. if (surface->map.info.dst_pal) {
  1147. return BlitNto1PixelAlpha;
  1148. } else {
  1149. // RGB332 has no palette !
  1150. return BlitNtoNPixelAlpha;
  1151. }
  1152. case 2:
  1153. if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  1154. if (df->Gmask == 0x7e0) {
  1155. return BlitARGBto565PixelAlpha;
  1156. } else if (df->Gmask == 0x3e0 && !df->Amask) {
  1157. return BlitARGBto555PixelAlpha;
  1158. }
  1159. }
  1160. return BlitNtoNPixelAlpha;
  1161. case 4:
  1162. if (SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask &&
  1163. SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) {
  1164. #ifdef SDL_AVX2_INTRINSICS
  1165. if (SDL_HasAVX2()) {
  1166. return Blit8888to8888PixelAlphaSwizzleAVX2;
  1167. }
  1168. #endif
  1169. #ifdef SDL_SSE4_1_INTRINSICS
  1170. if (SDL_HasSSE41()) {
  1171. return Blit8888to8888PixelAlphaSwizzleSSE41;
  1172. }
  1173. #endif
  1174. #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8)
  1175. // To prevent "unused function" compiler warnings/errors
  1176. (void)Blit8888to8888PixelAlpha;
  1177. (void)Blit8888to8888PixelAlphaSwizzle;
  1178. return Blit8888to8888PixelAlphaSwizzleNEON;
  1179. #else
  1180. if (sf->format == df->format) {
  1181. return Blit8888to8888PixelAlpha;
  1182. } else {
  1183. return Blit8888to8888PixelAlphaSwizzle;
  1184. }
  1185. #endif
  1186. }
  1187. return BlitNtoNPixelAlpha;
  1188. case 3:
  1189. default:
  1190. break;
  1191. }
  1192. return BlitNtoNPixelAlpha;
  1193. case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1194. if (sf->Amask == 0) {
  1195. // Per-surface alpha blits
  1196. switch (df->bytes_per_pixel) {
  1197. case 1:
  1198. if (surface->map.info.dst_pal) {
  1199. return BlitNto1SurfaceAlpha;
  1200. } else {
  1201. // RGB332 has no palette !
  1202. return BlitNtoNSurfaceAlpha;
  1203. }
  1204. case 2:
  1205. if (surface->map.identity) {
  1206. if (df->Gmask == 0x7e0) {
  1207. #ifdef SDL_MMX_INTRINSICS
  1208. if (SDL_HasMMX()) {
  1209. return Blit565to565SurfaceAlphaMMX;
  1210. } else
  1211. #endif
  1212. {
  1213. return Blit565to565SurfaceAlpha;
  1214. }
  1215. } else if (df->Gmask == 0x3e0) {
  1216. #ifdef SDL_MMX_INTRINSICS
  1217. if (SDL_HasMMX()) {
  1218. return Blit555to555SurfaceAlphaMMX;
  1219. } else
  1220. #endif
  1221. {
  1222. return Blit555to555SurfaceAlpha;
  1223. }
  1224. }
  1225. }
  1226. return BlitNtoNSurfaceAlpha;
  1227. case 4:
  1228. if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) {
  1229. #ifdef SDL_SSE2_INTRINSICS
  1230. if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasSSE2()) {
  1231. return Blit888to888SurfaceAlphaSSE2;
  1232. }
  1233. #endif
  1234. if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  1235. return BlitRGBtoRGBSurfaceAlpha;
  1236. }
  1237. }
  1238. return BlitNtoNSurfaceAlpha;
  1239. case 3:
  1240. default:
  1241. return BlitNtoNSurfaceAlpha;
  1242. }
  1243. }
  1244. break;
  1245. case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1246. if (sf->Amask == 0) {
  1247. if (df->bytes_per_pixel == 1) {
  1248. if (surface->map.info.dst_pal) {
  1249. return BlitNto1SurfaceAlphaKey;
  1250. } else {
  1251. // RGB332 has no palette !
  1252. return BlitNtoNSurfaceAlphaKey;
  1253. }
  1254. } else {
  1255. return BlitNtoNSurfaceAlphaKey;
  1256. }
  1257. }
  1258. break;
  1259. }
  1260. return NULL;
  1261. }
  1262. #endif // SDL_HAVE_BLIT_A