/*
  Simple DirectMedia Layer
  Copyright (C) 1997-2021 Sam Lantinga <slouken@libsdl.org>

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/
#include "../SDL_internal.h"
/* This is a stretch blit implementation based on ideas given to me by
   Tomasz Cejner - thanks! :)

   April 27, 2000 - Sam Lantinga
*/
- #include "SDL_video.h"
- #include "SDL_blit.h"
- #include "SDL_render.h"
- /* This isn't ready for general consumption yet - it should be folded
- into the general blitting mechanism.
- */
- #if ((defined(_MSC_VER) && defined(_M_IX86)) || \
- (defined(__WATCOMC__) && defined(__386__)) || \
- (defined(__GNUC__) && defined(__i386__))) && SDL_ASSEMBLY_ROUTINES
- /* There's a bug with gcc 4.4.1 and -O2 where srcp doesn't get the correct
- * value after the first scanline. FIXME? */
- /* #define USE_ASM_STRETCH */
- #endif
#ifdef USE_ASM_STRETCH

#ifdef HAVE_MPROTECT
#include <sys/types.h>
#include <sys/mman.h>
#endif
#ifdef __GNUC__
#define PAGE_ALIGNED __attribute__((__aligned__(4096)))
#else
#define PAGE_ALIGNED
#endif

#if defined(_M_IX86) || defined(__i386__) || defined(__386__)
#define PREFIX16    0x66
#define STORE_BYTE  0xAA
#define STORE_WORD  0xAB
#define LOAD_BYTE   0xAC
#define LOAD_WORD   0xAD
#define RETURN      0xC3
#else
#error Need assembly opcodes for this architecture
#endif
static unsigned char copy_row[4096] PAGE_ALIGNED;

static int
generate_rowbytes(int src_w, int dst_w, int bpp)
{
    static struct
    {
        int bpp;
        int src_w;
        int dst_w;
        int status;
    } last;

    int i;
    int pos, inc;
    unsigned char *eip, *fence;
    unsigned char load, store;

    /* See if we need to regenerate the copy buffer */
    if ((src_w == last.src_w) && (dst_w == last.dst_w) && (bpp == last.bpp)) {
        return (last.status);
    }
    last.bpp = bpp;
    last.src_w = src_w;
    last.dst_w = dst_w;
    last.status = -1;

    switch (bpp) {
    case 1:
        load = LOAD_BYTE;
        store = STORE_BYTE;
        break;
    case 2:
    case 4:
        load = LOAD_WORD;
        store = STORE_WORD;
        break;
    default:
        return SDL_SetError("ASM stretch of %d bytes isn't supported", bpp);
    }
#ifdef HAVE_MPROTECT
    /* Make the code writeable */
    if (mprotect(copy_row, sizeof(copy_row), PROT_READ | PROT_WRITE) < 0) {
        return SDL_SetError("Couldn't make copy buffer writeable");
    }
#endif
    pos = 0x10000;
    inc = (src_w << 16) / dst_w;
    eip = copy_row;
    fence = copy_row + sizeof(copy_row) - 2;
    for (i = 0; i < dst_w; ++i) {
        while (pos >= 0x10000L) {
            if (eip == fence) {
                return -1;
            }
            if (bpp == 2) {
                *eip++ = PREFIX16;
            }
            *eip++ = load;
            pos -= 0x10000L;
        }
        if (eip == fence) {
            return -1;
        }
        if (bpp == 2) {
            *eip++ = PREFIX16;
        }
        *eip++ = store;
        pos += inc;
    }
    *eip++ = RETURN;

#ifdef HAVE_MPROTECT
    /* Make the code executable but not writeable */
    if (mprotect(copy_row, sizeof(copy_row), PROT_READ | PROT_EXEC) < 0) {
        return SDL_SetError("Couldn't make copy buffer executable");
    }
#endif
    last.status = 0;
    return (0);
}

#endif /* USE_ASM_STRETCH */
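
/* Illustrative example of what generate_rowbytes() emits (not part of the
   build): stretching an 8-bpp row from src_w=2 to dst_w=3 gives
   inc = (2 << 16) / 3 = 0xAAAA, and the loop writes the x86 string-op
   stream AC AA AA AC AA C3, i.e.

       lodsb       ; fetch source pixel 0 from [esi]
       stosb       ; emit it to [edi]
       stosb       ; ... and again (pos still below 0x10000)
       lodsb       ; fetch source pixel 1
       stosb       ; emit it once
       ret

   so each source pixel is replicated roughly dst_w/src_w times. */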
#define DEFINE_COPY_ROW(name, type)                                 \
    static void name(type *src, int src_w, type *dst, int dst_w)    \
    {                                                               \
        int i;                                                      \
        int pos, inc;                                               \
        type pixel = 0;                                             \
                                                                    \
        pos = 0x10000;                                              \
        inc = (src_w << 16) / dst_w;                                \
        for (i = dst_w; i > 0; --i) {                               \
            while (pos >= 0x10000L) {                               \
                pixel = *src++;                                     \
                pos -= 0x10000L;                                    \
            }                                                       \
            *dst++ = pixel;                                         \
            pos += inc;                                             \
        }                                                           \
    }

/* *INDENT-OFF* */
DEFINE_COPY_ROW(copy_row1, Uint8)
DEFINE_COPY_ROW(copy_row2, Uint16)
DEFINE_COPY_ROW(copy_row4, Uint32)
/* *INDENT-ON* */
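
/* Worked example of the 16.16 fixed-point stepping above (illustrative):
   stretching src_w=2 to dst_w=5 gives inc = (2 << 16) / 5 = 0x6666.
   pos starts at 0x10000, so the first output fetches source pixel 0;
   pos then accumulates 0x6666 per output and crosses 0x10000 again only
   at the fourth output, where source pixel 1 is fetched. The row written
   is therefore { s0, s0, s0, s1, s1 }. */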
/* The ASM code doesn't handle 24-bpp stretch blits */
static void
copy_row3(Uint8 *src, int src_w, Uint8 *dst, int dst_w)
{
    int i;
    int pos, inc;
    Uint8 pixel[3] = { 0, 0, 0 };

    pos = 0x10000;
    inc = (src_w << 16) / dst_w;
    for (i = dst_w; i > 0; --i) {
        while (pos >= 0x10000L) {
            pixel[0] = *src++;
            pixel[1] = *src++;
            pixel[2] = *src++;
            pos -= 0x10000L;
        }
        *dst++ = pixel[0];
        *dst++ = pixel[1];
        *dst++ = pixel[2];
        pos += inc;
    }
}
static int SDL_LowerSoftStretchNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
static int SDL_LowerSoftStretchLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
static int SDL_UpperSoftStretch(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect, SDL_ScaleMode scaleMode);

/* Perform a stretch blit between two surfaces of the same format.
   NOTE: This function is not safe to call from multiple threads!
*/
int
SDL_SoftStretch(SDL_Surface *src, const SDL_Rect *srcrect,
                SDL_Surface *dst, const SDL_Rect *dstrect)
{
    return SDL_UpperSoftStretch(src, srcrect, dst, dstrect, SDL_ScaleModeNearest);
}

int
SDL_SoftStretchLinear(SDL_Surface *src, const SDL_Rect *srcrect,
                      SDL_Surface *dst, const SDL_Rect *dstrect)
{
    return SDL_UpperSoftStretch(src, srcrect, dst, dstrect, SDL_ScaleModeLinear);
}
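
#if 0
/* Illustrative usage sketch, not compiled: double a 32-bit surface with
   bilinear filtering. SDL_CreateRGBSurfaceWithFormat() and
   SDL_FreeSurface() are public SDL API; error handling is omitted. */
static void
example_stretch(void)
{
    SDL_Surface *src = SDL_CreateRGBSurfaceWithFormat(0, 100, 100, 32, SDL_PIXELFORMAT_ARGB8888);
    SDL_Surface *dst = SDL_CreateRGBSurfaceWithFormat(0, 200, 200, 32, SDL_PIXELFORMAT_ARGB8888);

    /* NULL rects stretch the full source over the full destination */
    SDL_SoftStretchLinear(src, NULL, dst, NULL);

    SDL_FreeSurface(dst);
    SDL_FreeSurface(src);
}
#endif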
static int
SDL_UpperSoftStretch(SDL_Surface *src, const SDL_Rect *srcrect,
                     SDL_Surface *dst, const SDL_Rect *dstrect, SDL_ScaleMode scaleMode)
{
    int ret;
    int src_locked;
    int dst_locked;
    SDL_Rect full_src;
    SDL_Rect full_dst;

    if (src->format->format != dst->format->format) {
        return SDL_SetError("Only works with same format surfaces");
    }

    if (scaleMode != SDL_ScaleModeNearest) {
        if (src->format->BytesPerPixel != 4 || src->format->format == SDL_PIXELFORMAT_ARGB2101010) {
            return SDL_SetError("Wrong format");
        }
    }

    /* Verify the blit rectangles */
    if (srcrect) {
        if ((srcrect->x < 0) || (srcrect->y < 0) ||
            ((srcrect->x + srcrect->w) > src->w) ||
            ((srcrect->y + srcrect->h) > src->h)) {
            return SDL_SetError("Invalid source blit rectangle");
        }
    } else {
        full_src.x = 0;
        full_src.y = 0;
        full_src.w = src->w;
        full_src.h = src->h;
        srcrect = &full_src;
    }
    if (dstrect) {
        if ((dstrect->x < 0) || (dstrect->y < 0) ||
            ((dstrect->x + dstrect->w) > dst->w) ||
            ((dstrect->y + dstrect->h) > dst->h)) {
            return SDL_SetError("Invalid destination blit rectangle");
        }
    } else {
        full_dst.x = 0;
        full_dst.y = 0;
        full_dst.w = dst->w;
        full_dst.h = dst->h;
        dstrect = &full_dst;
    }

    if (dstrect->w <= 0 || dstrect->h <= 0) {
        return 0;
    }

    /* Lock the destination if it's in hardware */
    dst_locked = 0;
    if (SDL_MUSTLOCK(dst)) {
        if (SDL_LockSurface(dst) < 0) {
            return SDL_SetError("Unable to lock destination surface");
        }
        dst_locked = 1;
    }
    /* Lock the source if it's in hardware */
    src_locked = 0;
    if (SDL_MUSTLOCK(src)) {
        if (SDL_LockSurface(src) < 0) {
            if (dst_locked) {
                SDL_UnlockSurface(dst);
            }
            return SDL_SetError("Unable to lock source surface");
        }
        src_locked = 1;
    }

    if (scaleMode == SDL_ScaleModeNearest) {
        ret = SDL_LowerSoftStretchNearest(src, srcrect, dst, dstrect);
    } else {
        ret = SDL_LowerSoftStretchLinear(src, srcrect, dst, dstrect);
    }

    /* We need to unlock the surfaces if they're locked */
    if (dst_locked) {
        SDL_UnlockSurface(dst);
    }
    if (src_locked) {
        SDL_UnlockSurface(src);
    }

    return ret;
}
int
SDL_LowerSoftStretchNearest(SDL_Surface *src, const SDL_Rect *srcrect,
                            SDL_Surface *dst, const SDL_Rect *dstrect)
{
    int pos, inc;
    int dst_maxrow;
    int src_row, dst_row;
    Uint8 *srcp = NULL;
    Uint8 *dstp;

#ifdef USE_ASM_STRETCH
    SDL_bool use_asm = SDL_TRUE;
#ifdef __GNUC__
    int u1, u2;
#endif
#endif /* USE_ASM_STRETCH */

    const int bpp = dst->format->BytesPerPixel;

    /* Set up the data... */
    pos = 0x10000;
    inc = (srcrect->h << 16) / dstrect->h;
    src_row = srcrect->y;
    dst_row = dstrect->y;

#ifdef USE_ASM_STRETCH
    /* Write the opcodes for this stretch */
    if ((bpp == 3) || (generate_rowbytes(srcrect->w, dstrect->w, bpp) < 0)) {
        use_asm = SDL_FALSE;
    }
#endif

    /* Perform the stretch blit */
    for (dst_maxrow = dst_row + dstrect->h; dst_row < dst_maxrow; ++dst_row) {
        dstp = (Uint8 *) dst->pixels + (dst_row * dst->pitch)
            + (dstrect->x * bpp);
        while (pos >= 0x10000L) {
            srcp = (Uint8 *) src->pixels + (src_row * src->pitch)
                + (srcrect->x * bpp);
            ++src_row;
            pos -= 0x10000L;
        }
#ifdef USE_ASM_STRETCH
        if (use_asm) {
#ifdef __GNUC__
            __asm__ __volatile__("call *%4":"=&D"(u1), "=&S"(u2)
                                 :"0"(dstp), "1"(srcp), "r"(copy_row)
                                 :"memory");
#elif defined(_MSC_VER) || defined(__WATCOMC__)
            /* *INDENT-OFF* */
            {
                void *code = copy_row;
                __asm {
                    push edi
                    push esi
                    mov edi, dstp
                    mov esi, srcp
                    call dword ptr code
                    pop esi
                    pop edi
                }
            }
            /* *INDENT-ON* */
#else
#error Need inline assembly for this compiler
#endif
        } else
#endif
            switch (bpp) {
            case 1:
                copy_row1(srcp, srcrect->w, dstp, dstrect->w);
                break;
            case 2:
                copy_row2((Uint16 *) srcp, srcrect->w,
                          (Uint16 *) dstp, dstrect->w);
                break;
            case 3:
                copy_row3(srcp, srcrect->w, dstp, dstrect->w);
                break;
            case 4:
                copy_row4((Uint32 *) srcp, srcrect->w,
                          (Uint32 *) dstp, dstrect->w);
                break;
            }
        pos += inc;
    }
    return 0;
}
/* The bilinear interpolation precision must be < 8, because the SSE
   multiply-add (_mm_madd_epi16) operates on signed 16-bit integers:
   with 8 bits of precision, intermediate values such as 0xb1.. would
   have the sign bit set and falsify the result.
   The same probably applies to NEON. */
#define PRECISION      7

#define FIXED_POINT(i) ((uint32_t)(i) << 16)
#define SRC_INDEX(fp)  ((uint32_t)(fp) >> 16)
#define INTEGER(fp)    ((uint32_t)(fp) >> PRECISION)
#define FRAC(fp)       ((uint32_t)((fp) >> (16 - PRECISION)) & ((1 << PRECISION) - 1))
#define FRAC_ZERO      0
#define FRAC_ONE       (1 << PRECISION)
#define FP_ONE         FIXED_POINT(1)
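
/* Worked example of the macros above (illustrative): the source coordinate
   2.25 in 16.16 fixed point is 0x24000. SRC_INDEX(0x24000) yields pixel
   index 2, and FRAC(0x24000) yields 32, i.e. 0.25 expressed in 1/128ths
   (PRECISION 7, FRAC_ONE == 128). */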
#define BILINEAR___START                                                            \
    int i;                                                                          \
    int fp_sum_h, fp_step_h, left_pad_h, right_pad_h;                               \
    int fp_sum_w, fp_step_w, left_pad_w, right_pad_w;                               \
    int fp_sum_w_init, left_pad_w_init, right_pad_w_init, dst_gap, middle_init;     \
    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h); \
    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w); \
    fp_sum_w_init = fp_sum_w + left_pad_w * fp_step_w;                              \
    left_pad_w_init = left_pad_w;                                                   \
    right_pad_w_init = right_pad_w;                                                 \
    dst_gap = dst_pitch - 4 * dst_w;                                                \
    middle_init = dst_w - left_pad_w - right_pad_w;

#define BILINEAR___HEIGHT                                               \
    int index_h, frac_h0, frac_h1, middle;                              \
    const Uint32 *src_h0, *src_h1;                                      \
    int no_padding, incr_h0, incr_h1;                                   \
                                                                        \
    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h);      \
    index_h = SRC_INDEX(fp_sum_h);                                      \
    frac_h0 = FRAC(fp_sum_h);                                           \
                                                                        \
    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1);  \
    frac_h0 = no_padding ? frac_h0 : 0;                                 \
    incr_h1 = no_padding ? src_pitch : 0;                               \
    incr_h0 = index_h * src_pitch;                                      \
                                                                        \
    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0);            \
    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1);         \
                                                                        \
    fp_sum_h += fp_step_h;                                              \
                                                                        \
    frac_h1 = FRAC_ONE - frac_h0;                                       \
    fp_sum_w = fp_sum_w_init;                                           \
    right_pad_w = right_pad_w_init;                                     \
    left_pad_w = left_pad_w_init;                                       \
    middle = middle_init;

#if defined(__clang__)
// Prevent inlining of this function:
// Compiler crash with clang 9.0.8 / android-ndk-r21d
// Compiler crash with clang 11.0.3 / Xcode
// OK with clang 11.0.5 / android-ndk-22
// OK with clang 12.0.0 / Xcode
__attribute__((noinline))
#endif
static void
get_scaler_datas(int src_nb, int dst_nb, int *fp_start, int *fp_step, int *left_pad, int *right_pad)
{
    int step = FIXED_POINT(src_nb) / (dst_nb);  /* source step in fixed point */
    int x0 = FP_ONE / 2;                        /* dst first pixel center at 0.5 in fixed point */
    int fp_sum;
    int i;
#if 0
    /* scale to source coordinates */
    x0 *= src_nb;
    x0 /= dst_nb; /* x0 == step / 2 */
#else
    /* Use this code for perfect match with pixman */
    Sint64 tmp[2];
    tmp[0] = (Sint64)step * (x0 >> 16);
    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
    x0 = (int)(tmp[0] + ((tmp[1] + 0x8000) >> 16)); /* x0 == (step + 1) / 2 */
#endif
    /* -= 0.5, to get back to the pixel origin, in source coordinates */
    x0 -= FP_ONE / 2;

    *fp_start = x0;
    *fp_step = step;
    *left_pad = 0;
    *right_pad = 0;

    fp_sum = x0;
    for (i = 0; i < dst_nb; i++) {
        if (fp_sum < 0) {
            *left_pad += 1;
        } else {
            int index = SRC_INDEX(fp_sum);
            if (index > src_nb - 2) {
                *right_pad += 1;
            }
        }
        fp_sum += step;
    }
    // SDL_Log("%d -> %d x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
}
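
/* Worked example (illustrative): for src_nb=2, dst_nb=4, step is 0x8000
   (0.5) and x0 ends up at -0x4000, i.e. the first destination pixel center
   maps 0.25 source pixels *left* of the first source pixel center. The
   loop then counts left_pad=1 and right_pad=1, so only the two middle
   destination pixels interpolate between a real pixel pair; the edge
   pixels are clamped by the pad loops in the scalers below. */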
typedef struct color_t
{
    Uint8 a;
    Uint8 b;
    Uint8 c;
    Uint8 d;
} color_t;

#if 0
static void
printf_64(const char *str, void *var)
{
    uint8_t *val = (uint8_t *) var;
    printf(" * %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif
/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
static SDL_INLINE void
INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
{
    const color_t *c0 = (const color_t *)src_x0;
    const color_t *c1 = (const color_t *)src_x1;
    color_t *cx = (color_t *)dst;
#if 0
    cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
    cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
    cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
    cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
#else
    cx->a = INTEGER(frac1 * c0->a + frac0 * c1->a);
    cx->b = INTEGER(frac1 * c0->b + frac0 * c1->b);
    cx->c = INTEGER(frac1 * c0->c + frac0 * c1->c);
    cx->d = INTEGER(frac1 * c0->d + frac0 * c1->d);
#endif
}
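
/* Numeric example (illustrative): blending channel values c0->a = 0 and
   c1->a = 255 halfway, i.e. frac0 = frac1 = 64 at PRECISION 7, gives
   INTEGER(64 * 0 + 64 * 255) = 16320 >> 7 = 127, the expected midpoint
   (rounded down). */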
static SDL_INLINE void
INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
{
    Uint32 tmp[2];
    unsigned int frac_w1 = FRAC_ONE - frac_w0;

    /* Vertical first, store to 'tmp' */
    INTERPOL(s0, s1, frac_h0, frac_h1, tmp);
    INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);

    /* Horizontal, store to 'dst' */
    INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
}

static int
scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch,
          Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {

        BILINEAR___HEIGHT

        while (left_pad_w--) {
            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
            dst += 1;
        }

        while (middle--) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00 ... x0_ ..... x01
                .       .         .
                .       x         .
                .       .         .
                .       .         .
               x10 ... x1_ ..... x11
             */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);

            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);

            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return 0;
}
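
/* Note on the pad loops above (this applies to the SSE and NEON variants
   below as well): left_pad_w/right_pad_w count destination columns whose
   sample point falls outside the valid source pixel-pair range
   [0, src_w - 2]. They are clamped to the first/last pair with weights
   FRAC_ZERO and FRAC_ONE respectively, mirroring the row clamping that
   BILINEAR___HEIGHT performs for the top and bottom edges. */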
#if defined(__SSE2__)
# define HAVE_SSE2_INTRINSICS 1
#endif

#if defined(__ARM_NEON)
# define HAVE_NEON_INTRINSICS 1
# define CAST_uint8x8_t  (uint8x8_t)
# define CAST_uint32x2_t (uint32x2_t)
#endif

#if defined(__WINRT__) || defined(_MSC_VER)
# if defined(HAVE_NEON_INTRINSICS)
#   undef CAST_uint8x8_t
#   undef CAST_uint32x2_t
#   define CAST_uint8x8_t
#   define CAST_uint32x2_t
# endif
#endif
#if defined(HAVE_SSE2_INTRINSICS)

#if 0
static void
printf_128(const char *str, __m128i var)
{
    uint16_t *val = (uint16_t *) &var;
    printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

static SDL_INLINE int
hasSSE2()
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasSSE2();
    return val;
}
static SDL_INLINE void
INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{
    __m128i x_00_01, x_10_11;   /* Pixels in 4*uint8 in row */
    __m128i v_frac_w0, k0, l0, d0, e0;

    int f, f2;
    f = frac_w;
    f2 = FRAC_ONE - frac_w;
    v_frac_w0 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);

    x_00_01 = _mm_loadl_epi64((const __m128i *)s0);     /* Load x00 and x01 */
    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

    /* Interpolation vertical */
    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
    k0 = _mm_add_epi16(k0, l0);

    /* For a perfect match with the scalar path, clear the fractional
       part if needed:
       k0 = _mm_srli_epi16(k0, PRECISION);
       k0 = _mm_slli_epi16(k0, PRECISION);
     */

    /* Interpolation horizontal */
    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);

    /* Store 1 pixel */
    d0 = _mm_srli_epi32(k0, PRECISION * 2);
    e0 = _mm_packs_epi32(d0, d0);
    e0 = _mm_packus_epi16(e0, e0);
    *dst = _mm_cvtsi128_si32(e0);
}
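
/* How the horizontal step above works (illustrative breakdown): after the
   vertical pass, k0 holds the two vertically interpolated pixels j0 and
   j1 as 8 x uint16 channels. _mm_unpackhi_epi16(l0, k0) interleaves them
   into per-channel pairs { j0.c, j1.c, ... }, and _mm_madd_epi16 with
   v_frac_w0 = { f2, f, f2, f, ... } then computes
   j0.c * (FRAC_ONE - frac_w) + j1.c * frac_w for each channel, i.e. the
   horizontal lerp at 2*PRECISION fixed point (hence the final shift). */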
static int
scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block2;
        __m128i v_frac_h0;
        __m128i v_frac_h1;
        __m128i zero;

        BILINEAR___HEIGHT

        nb_block2 = middle / 2;

        v_frac_h0 = _mm_set_epi16(frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0);
        v_frac_h1 = _mm_set_epi16(frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1);
        zero = _mm_setzero_si128();

        while (left_pad_w--) {
            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (nb_block2--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;

            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;

            __m128i x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            __m128i v_frac_w0, k0, l0, d0, e0;
            __m128i v_frac_w1, k1, l1, d1, e1;

            int f, f2;
            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            /*
               x00............ x01   x02...........x03
                .      .        .     .      .      .
               j0      f0      j1    j2      f1    j3
                .      .        .     .      .      .
                .      .        .     .      .      .
                .      .        .     .      .      .
               x10............ x11   x12...........x13
             */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            f = frac_w_0;
            f2 = FRAC_ONE - frac_w_0;
            v_frac_w0 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);
            f = frac_w_1;
            f2 = FRAC_ONE - frac_w_1;
            v_frac_w1 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);

            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); /* Load x00 and x01 */
            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);

            /* Interpolation vertical */
            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
            k0 = _mm_add_epi16(k0, l0);
            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
            k1 = _mm_add_epi16(k1, l1);

            /* Interpolation horizontal */
            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);

            /* Store 1 pixel */
            d0 = _mm_srli_epi32(k0, PRECISION * 2);
            e0 = _mm_packs_epi32(d0, d0);
            e0 = _mm_packus_epi16(e0, e0);
            *dst++ = _mm_cvtsi128_si32(e0);

            /* Store 1 pixel */
            d1 = _mm_srli_epi32(k1, PRECISION * 2);
            e1 = _mm_packs_epi32(d1, d1);
            e1 = _mm_packus_epi16(e1, e1);
            *dst++ = _mm_cvtsi128_si32(e1);
        }

        /* Last point */
        if (middle & 0x1) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return 0;
}
#endif
#if defined(HAVE_NEON_INTRINSICS)

static SDL_INLINE int
hasNEON()
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasNEON();
    return val;
}

static SDL_INLINE void
INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
{
    uint8x8_t x_00_01, x_10_11;     /* Pixels in 4*uint8 in row */
    uint16x8_t k0;
    uint32x4_t l0;
    uint16x8_t d0;
    uint8x8_t e0;

    x_00_01 = CAST_uint8x8_t vld1_u32(s0);  /* Load 2 pixels */
    x_10_11 = CAST_uint8x8_t vld1_u32(s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
    k0 = vmull_u8(x_00_01, v_frac_h1);      /* k0 := x0 * (1 - frac) */
    k0 = vmlal_u8(k0, x_10_11, v_frac_h0);  /* k0 += x1 * frac */

    /* k0 now contains 2 interpolated pixels { j0, j1 } */
    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);

    /* Shift and narrow */
    d0 = vcombine_u16(
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION)
    );

    /* Narrow again */
    e0 = vmovn_u16(d0);

    /* Store 1 pixel */
    *dst = vget_lane_u32(CAST_uint32x2_t e0, 0);
}
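
/* The three-instruction horizontal lerp above avoids materializing a
   separate (FRAC_ONE - frac_w) vector (illustrative breakdown):
     l0  = j0 << PRECISION        (vshll_n_u16, i.e. j0 * FRAC_ONE)
     l0 -= j0 * frac_w            (vmlsl_n_u16)
     l0 += j1 * frac_w            (vmlal_n_u16)
   so l0 == j0 * (FRAC_ONE - frac_w) + j1 * frac_w at 2*PRECISION. */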
static int
scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block4;
        uint8x8_t v_frac_h0, v_frac_h1;

        BILINEAR___HEIGHT

        nb_block4 = middle / 4;

        v_frac_h0 = vmov_n_u8(frac_h0);
        v_frac_h1 = vmov_n_u8(frac_h1);

        while (left_pad_w--) {
            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (nb_block4--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            int index_w_2, frac_w_2;
            int index_w_3, frac_w_3;

            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;

            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13;   /* Pixels in 4*uint8 in row */
            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;

            uint16x8_t k0, k1, k2, k3;
            uint32x4_t l0, l1, l2, l3;
            uint16x8_t d0, d1;
            uint8x8_t e0, e1;
            uint32x4_t f0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_2 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_3 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);

            /* Interpolation vertical */
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); /* Load 2 pixels */
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_04_05 = CAST_uint8x8_t vld1_u32(s_04_05);
            x_06_07 = CAST_uint8x8_t vld1_u32(s_06_07);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
            x_14_15 = CAST_uint8x8_t vld1_u32(s_14_15);
            x_16_17 = CAST_uint8x8_t vld1_u32(s_16_17);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1);      /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0);  /* k0 += x1 * frac */
            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);
            k2 = vmull_u8(x_04_05, v_frac_h1);
            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);
            k3 = vmull_u8(x_06_07, v_frac_h1);
            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);

            /* k0 now contains 2 interpolated pixels { j0, j1 } */
            /* k1 now contains 2 interpolated pixels { j2, j3 } */
            /* k2 now contains 2 interpolated pixels { j4, j5 } */
            /* k3 now contains 2 interpolated pixels { j6, j7 } */

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);

            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);

            /* Shift and narrow */
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION)
            );
            /* Narrow again */
            e0 = vmovn_u16(d0);

            /* Shift and narrow */
            d1 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION)
            );
            /* Narrow again */
            e1 = vmovn_u16(d1);

            f0 = vcombine_u32(CAST_uint32x2_t e0, CAST_uint32x2_t e1);

            /* Store 4 pixels */
            vst1q_u32(dst, f0);

            dst += 4;
        }

        if (middle & 0x2) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            const Uint32 *s_00_01, *s_02_03;
            const Uint32 *s_10_11, *s_12_13;
            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13;   /* Pixels in 4*uint8 in row */
            uint16x8_t k0, k1;
            uint32x4_t l0, l1;
            uint16x8_t d0;
            uint8x8_t e0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            /*
               x00............ x01   x02...........x03
                .      .        .     .      .      .
               j0    dest0     j1    j2    dest1   j3
                .      .        .     .      .      .
                .      .        .     .      .      .
                .      .        .     .      .      .
               x10............ x11   x12...........x13
             */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            /* Interpolation vertical */
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); /* Load 2 pixels */
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1);      /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0);  /* k0 += x1 * frac */
            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            /* k0 now contains 2 interpolated pixels { j0, j1 } */
            /* k1 now contains 2 interpolated pixels { j2, j3 } */

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            /* Shift and narrow */
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION)
            );
            /* Narrow again */
            e0 = vmovn_u16(d0);

            /* Store 2 pixels */
            vst1_u32(dst, CAST_uint32x2_t e0);
            dst += 2;
        }

        /* Last point */
        if (middle & 0x1) {
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return 0;
}
#endif
int
SDL_LowerSoftStretchLinear(SDL_Surface *s, const SDL_Rect *srcrect,
                           SDL_Surface *d, const SDL_Rect *dstrect)
{
    int ret = -1;
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    Uint32 *src = (Uint32 *) ((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *) ((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);

#if defined(HAVE_NEON_INTRINSICS)
    if (ret == -1 && hasNEON()) {
        ret = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

#if defined(HAVE_SSE2_INTRINSICS)
    if (ret == -1 && hasSSE2()) {
        ret = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

    if (ret == -1) {
        /* Fall back to the scalar path; capture its status so we don't
           report failure after a successful scale. */
        ret = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }

    return ret;
}
/* vi: set ts=4 sw=4 expandtab: */