#ifndef SkBitmapProcState_opts_DEFINED
#define SkBitmapProcState_opts_DEFINED

#include "src/core/SkBitmapProcState.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    #include <lasxintrin.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>
#endif

namespace SK_OPTS_NS {
// This same basic packing scheme is used throughout the file:
// 14 bits of first coordinate, 4 bits of filter weight, 14 bits of second coordinate.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    *v0 = (packed >> 18);       // First coordinate, e.g. x0 or y0.
    *v1 = (packed & 0x3fff);    // Second coordinate, e.g. x1 or y1.
    *w  = (packed >> 14) & 0xf; // Lerp weight for v1; the weight for v0 is 16-w.
}
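// For example (illustrative values, not from any real call site):
//     packed = (2 << 18) | (5 << 14) | 3  ==  0x94003
// decodes to v0 = 2, v1 = 3, w = 5, i.e. weights 16-5 = 11 for v0 and 5 for v1.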
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
        // interpolate_in_x() is the crux of this implementation, interpolating
        // in x for up to two output pixels (A and B) using _mm_maddubs_epi16().
        auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
                                   uint32_t B0, uint32_t B1,
                                   __m128i interlaced_x_weights) {
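            // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
            //
            // It takes two arguments interlaced byte-wise:
            //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ... ]
            //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ... ]
            // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
            //
            // The first argument is treated as unsigned bytes and the second as signed,
            // which is why the pixels go first and the [0,16] weights second.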
            // Interlace A0 with A1 and B0 with B1 byte-wise to match the
            // interlaced x-weights, then let maddubs do the multiply-and-add.
            __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                    interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

            return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                     interlaced_x_weights);
        };
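        // interpolate_in_x_and_y() routes {A0..A3} to output pixel A and
        // {B0..B3} to output pixel B: each quad is a 2x2 neighborhood, filtered
        // first in x, then lerped in y, leaving each channel in a 16-bit lane.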
        auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
                                          uint32_t A2, uint32_t A3,
                                          uint32_t B0, uint32_t B1,
                                          uint32_t B2, uint32_t B3,
                                          __m128i interlaced_x_weights,
                                          int wy) {
            __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                    bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
            __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
                                       _mm_mullo_epi16(_mm_sub_epi16(bot, top),
                                                       _mm_set1_epi16(wy)));

            // Scale down by the total max weight, 16*16 = 256.
            px = _mm_srli_epi16(px, 8);
            // Scale by alpha if needed.
            if (s.fAlphaScale < 256) {
                px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
            }
            return px;
        };
        // We're in _DX mode: y is fixed for the whole run, so the first packed
        // entry holds y0, y1, and wy; every remaining entry packs x0, x1, and wx.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
        while (count >= 4) {
            // Load four packed x-entries at once and decode them all:
            // this is decode_packed_coordinates_and_weight(), done 4x in SIMD.
            int x0[4],
                x1[4];
            __m128i wx;

            __m128i packed = _mm_loadu_si128((const __m128i*)xy);
            _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
            _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
            wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));   // Weights in [0,15].
            // Splat each x-weight out four times (once per channel) as wr, and
            // sixteen minus that as the weight for the left pixels, wl.
            __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
            // Interlace wl and wr in the byte order _mm_maddubs_epi16() expects.
            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

            enum { A,B,C,D };
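            // Each of the four output pixels A,B,C,D is bilerped from its own
            // 2x2 neighborhood: row0[x0..x1] on top, row1[x0..x1] below.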
            __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
                                                row1[x0[A]], row1[x1[A]],
                                                row0[x0[B]], row0[x1[B]],
                                                row1[x0[B]], row1[x1[B]],
                                                interlaced_x_weights_AB, wy);
            __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
                                                row1[x0[C]], row1[x1[C]],
                                                row0[x0[D]], row0[x1[D]],
                                                row1[x0[D]], row1[x1[D]],
                                                interlaced_x_weights_CD, wy);
            // Pack back down to 8-bit lanes and write out all four pixels.
            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));

            xy     += 4;
            colors += 4;
            count  -= 4;
        }
        while (count --> 0) {
            // Exactly the same flow as the 4-at-a-time loop, one pixel at a time.
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            __m128i wr = _mm_set1_epi8(wx),   // Splats wx out 16 times; only the low lanes matter.
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

            __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                               row1[x0], row1[x1],
                                               0, 0,   // The B pixel is unused here,
                                               0, 0,   // so feed it zeros.
                                               interlaced_x_weights, wy);
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
        }
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        // Put the y-weight wy in the low four 16-bit lanes and 16-wy in the high
        // four, so one multiply weights the bottom and top rows at once.
        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
                                                _mm_set1_epi16(16-wy));  // Top pixel goes here.
        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            // Load the four pixels we're interpolating, in this grid:
            //    | tl  tr |
            //    | bl  br |
            const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
                          bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
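            // We want a sum of the four pixels weighted in both directions:
            //
            //     sum = tl * (16-wy) * (16-wx)
            //         + bl * (   wy) * (16-wx)
            //         + tr * (16-wy) * (   wx)
            //         + br * (   wy) * (   wx)
            //
            // The weights sum to 16*16 = 256, so we'll divide by 256 at the end.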
            // Group the left column (bl under tl) as L and the right column as R,
            // unpacked from 8-bit to 16-bit lanes.
            __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
                    R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());

            // Lerp in x as 16*L + (R-L)*wx, saving a multiply over the
            // equivalent L*(16-wx) + R*wx.
            __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
                                          _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
            // Weight by y, then add the two 64-bit halves to finish the sum.
            __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
            __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));

            // Divide by the total weight 256 to get back to [0,255].
            sum = _mm_srli_epi16(sum, 8);
            if (s.fAlphaScale < 256) {
                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
                sum = _mm_srli_epi16(sum, 8);
            }
            // Pack back down to 8-bit lanes and store one pixel.
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
        }
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        // wy in the low lanes, 16-wy in the high lanes, mirroring the SSE2 allY.
        __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));
        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            const __m256i zeros = __lasx_xvldi(0);
            const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
                          tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
                          bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
                          br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);
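            // Same math as the SSE2 path above: bilerp the 2x2 grid with weights
            // 16-wx / wx in x and 16-wy / wy in y, for a total weight of 256.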
            __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
                    R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));

            __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
                                           __lasx_xvmul_h(__lasx_xvsub_h(R, L),
                                                          __lasx_xvreplgr2vr_h(wx)));
            __m256i sum_in_x = __lasx_xvmul_h(inner, allY);

            // Add the two 8-byte halves, then divide by the total weight 256.
            __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));
            sum = __lasx_xvsrli_h(sum, 8);
            if (s.fAlphaScale < 256) {
                sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
                sum = __lasx_xvsrli_h(sum, 8);
            }
            // Saturate to 8-bit lanes, pack, and store one pixel.
            *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
                                                               __lasx_xvsat_hu(sum, 8)), 0);
        }
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));
        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            const __m128i zeros = __lsx_vldi(0);
            const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
                          tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
                          bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
                          br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);
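            // The 128-bit LSX flavor of the same bilerp as the LASX path above.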
            __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
                    R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));

            __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
                                         __lsx_vmul_h(__lsx_vsub_h(R, L),
                                                      __lsx_vreplgr2vr_h(wx)));
            __m128i sum_in_x = __lsx_vmul_h(inner, allY);

            // Add the two 8-byte halves, then divide by the total weight 256.
            __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));
            sum = __lsx_vsrli_h(sum, 8);
            if (s.fAlphaScale < 256) {
                sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
                sum = __lsx_vsrli_h(sum, 8);
            }
            // Saturate to 8-bit lanes, pack, and store one pixel.
            *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
                                                           __lsx_vsat_hu(sum, 8)), 0);
        }
    }
#else

    // The NEON code only really differs from the portable fallback in the
    // filtering step, after the four pixels to bilerp have been loaded.
    #if defined(SK_ARM_HAS_NEON)
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor* dst,
                                              uint16_t scale) {
            uint8x8_t vy, vconst16_8, v16_y, vres;
            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
            uint32x2_t va0, va1;
            uint16x8_t tmp1, tmp2;
            vy = vdup_n_u8(y);                // duplicate y into vy
            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
            va0 = vdup_n_u32(a00);            // duplicate a00
            va1 = vdup_n_u32(a10);            // duplicate a10
            va0 = vset_lane_u32(a01, va0, 1); // set top lane to a01
            va1 = vset_lane_u32(a11, va1, 1); // set top lane to a11
            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
            vx = vdup_n_u16(x);                // duplicate x into vx
            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x * (16-y)
            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x * y
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) * (16-y)
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) * y
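            // tmp now holds one pixel, channel by channel in 16-bit lanes,
            // scaled up by the total weight 16*16 = 256.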
            if (scale < 256) {
                vscale = vdup_n_u16(scale);   // duplicate the alpha scale
                tmp = vshr_n_u16(tmp, 8);     // shift down by 8 (divide by 256)
                tmp = vmul_u16(tmp, vscale);  // scale the result by alpha
            }
            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down by 8 and narrow
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);                   // store one pixel
        }
    #else
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor* dstColor,
                                              unsigned alphaScale) {
            SkASSERT(alphaScale <= 256);

            // Work on channels 0 and 2 in lo and channels 1 and 3 in hi, leaving
            // 8 bits of headroom per channel for the 8-bit total weight.
            int xy = x * y;
            const uint32_t mask = 0xFF00FF;

            int scale = 256 - 16*y - 16*x + xy;   // (16-x) * (16-y)
            uint32_t lo = (a00 & mask) * scale;
            uint32_t hi = ((a00 >> 8) & mask) * scale;
            scale = 16*x - xy;                    // x * (16-y)
            lo += (a01 & mask) * scale;
            hi += ((a01 >> 8) & mask) * scale;
            scale = 16*y - xy;                    // (16-x) * y
            lo += (a10 & mask) * scale;
            hi += ((a10 >> 8) & mask) * scale;
            lo += (a11 & mask) * xy;
            hi += ((a11 >> 8) & mask) * xy;
            if (alphaScale < 256) {
                lo = ((lo >> 8) & mask) * alphaScale;
                hi = ((hi >> 8) & mask) * alphaScale;
            }

            *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
        }
    #endif
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }

#endif
#if defined(SK_ARM_HAS_NEON)
    /*not static*/ inline
    void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
                                   const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);
        auto src = (const char*)s.fPixmap.addr();
        size_t rb = s.fPixmap.rowBytes();
        while (count --> 0) {
            // In _DXDY mode each output pixel carries its own packed y-entry and x-entry.
            int y0, y1, wy,
                x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            auto row0 = (const uint32_t*)(src + y0*rb),
                 row1 = (const uint32_t*)(src + y1*rb);
            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }
#else
    // It's not yet clear that a specialized _DXDY path is worthwhile on other
    // platforms, so leave this null and let callers fall back on portable code.
    constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
                                                       const uint32_t*, int, SkPMColor*)
        = nullptr;
#endif

}  // namespace SK_OPTS_NS
namespace sktests {
    // Expose the coordinate decoder to unit tests.
    template <typename U32, typename Out>
    void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
        SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w);
    }
}  // namespace sktests

#endif  // SkBitmapProcState_opts_DEFINED