33#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
34 #include <emmintrin.h>
35 #include <xmmintrin.h>
// NOTE(review): this excerpt begins mid-signature; the symbol list at the end
// of the listing suggests this is SkPMLerp_SSE2(src, dst, src_scale) -- confirm
// against the full file.
// Per byte, the visible statements compute
//   dst + (((src - dst) * src_scale) >> 8)
// using wrapping 16-bit arithmetic, with the final add wrapping at 8 bits.
39 const unsigned src_scale) {
// Selects the low byte of every 16-bit lane.
41 const __m128i mask = _mm_set1_epi32(0x00FF00FF);
// Split each input into two vectors of 16-bit lanes: the *_rb vectors hold the
// low byte of each lane, the *_ag vectors hold the high byte shifted down.
44 __m128i src_rb = _mm_and_si128(mask, src);
45 __m128i src_ag = _mm_srli_epi16(src, 8);
46 __m128i dst_rb = _mm_and_si128(mask, dst);
47 __m128i dst_ag = _mm_srli_epi16(dst, 8);
// (src - dst) per 16-bit lane; may be negative, represented modulo 2^16.
50 __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
51 __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
// Broadcast the scale to all 16-bit lanes and multiply, keeping the low 16 bits.
52 __m128i
s = _mm_set1_epi16(src_scale);
53 diff_rb = _mm_mullo_epi16(diff_rb,
s);
54 diff_ag = _mm_mullo_epi16(diff_ag,
s);
// Take bits 8..15 of each product: the rb half is shifted down into the low
// byte, while the ag half is left in the high byte by clearing the low byte
// (andnot computes ~mask & diff_ag). OR-ing re-interleaves the two halves.
57 diff_rb = _mm_srli_epi16(diff_rb, 8);
58 diff_ag = _mm_andnot_si128(mask, diff_ag);
59 __m128i diff = _mm_or_si128(diff_rb, diff_ag);
// Byte-wise wrapping add of the scaled difference onto dst.
62 return _mm_add_epi8(dst, diff);
69 auto src4 = (
const __m128i*)src;
70 auto dst4 = ( __m128i*)dst;
74 _mm_loadu_si128(dst4),
// NOTE(review): function signature and the definition of `alpha` are not
// visible in this excerpt; the symbol list suggests SkBlendARGB32_SSE2 --
// confirm against the full file.
// Broadcast `alpha` to every 16-bit lane; used as the source scale.
95 __m128i src_scale = _mm_set1_epi16(alpha);
// Top byte of each 32-bit lane of src (presumably the pixel's alpha channel
// -- confirm the packed pixel layout).
97 __m128i dst_scale = _mm_srli_epi32(src, 24);
// Per 32-bit lane: d = 0xFFFF - top_byte * alpha; d = (d + (d >> 8)) >> 8.
99 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
100 dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
101 dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
102 dst_scale = _mm_srli_epi32(dst_scale, 8);
// Copy the low 16-bit word of each 32-bit lane into its high word so both
// 16-bit halves of a pixel see the same scale.
104 dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
105 dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
// Selects the low byte of every 16-bit lane.
107 const __m128i mask = _mm_set1_epi32(0x00FF00FF);
// Split each input into low-byte (*_rb) and high-byte (*_ag) 16-bit lanes.
110 __m128i src_rb = _mm_and_si128(mask, src);
111 __m128i src_ag = _mm_srli_epi16(src, 8);
112 __m128i dst_rb = _mm_and_si128(mask, dst);
113 __m128i dst_ag = _mm_srli_epi16(dst, 8);
// Scale source bytes by src_scale and destination bytes by dst_scale.
116 src_rb = _mm_mullo_epi16(src_rb, src_scale);
117 src_ag = _mm_mullo_epi16(src_ag, src_scale);
118 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
119 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
// Sum the two scaled contributions per 16-bit lane.
122 dst_rb = _mm_add_epi16(src_rb, dst_rb);
123 dst_ag = _mm_add_epi16(src_ag, dst_ag);
// Keep bits 8..15 of each sum: rb shifted down to the low byte, ag kept in
// the high byte (~mask & dst_ag), then merged back into packed bytes.
126 dst_rb = _mm_srli_epi16(dst_rb, 8);
127 dst_ag = _mm_andnot_si128(mask, dst_ag);
128 return _mm_or_si128(dst_rb, dst_ag);
134 auto src4 = (
const __m128i*)src;
135 auto dst4 = ( __m128i*)dst;
139 _mm_loadu_si128(dst4),
149 while (
count --> 0) {
156#elif defined(SK_ARM_HAS_NEON)
157 #include <arm_neon.h>
// NOTE(review): enclosing function header and loop are not visible in this
// excerpt; `src`, `dst` and `src_scale` are defined outside the shown lines.
// Destination weight is the complement of the source weight out of 256.
163 uint16_t dst_scale = 256 - src_scale;
166 uint8x8_t vsrc, vdst, vres;
167 uint16x8_t vsrc_wide, vdst_wide;
// Load two 32-bit values from each pointer and view them as eight bytes.
169 vsrc = vreinterpret_u8_u32(vld1_u32(src));
170 vdst = vreinterpret_u8_u32(vld1_u32(dst));
// Widen source bytes to 16 bits and multiply by src_scale.
172 vsrc_wide = vmovl_u8(vsrc);
173 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
// Widening 8x8->16 multiply of destination bytes by dst_scale.
// NOTE(review): vdup_n_u8 narrows dst_scale to 8 bits, so this relies on
// dst_scale <= 255 (i.e. src_scale >= 1) -- confirm with the caller.
175 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
// Sum both contributions, then narrow each 16-bit lane to its bits 8..15.
177 vdst_wide += vsrc_wide;
178 vres = vshrn_n_u16(vdst_wide, 8);
// Store the two blended 32-bit values back to dst.
180 vst1_u32(dst, vreinterpret_u32_u8(vres));
// NOTE(review): surrounding control flow is not visible in this excerpt.
// Same arithmetic as the two-value path, applied to a single 32-bit value.
// The vectors are zero-initialised so the unused upper lane has a defined value.
188 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
189 uint16x8_t vsrc_wide, vdst_wide;
// Load one 32-bit value into lane 0 of each vector.
191 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
192 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
// src bytes * src_scale (16-bit) plus dst bytes * dst_scale (8x8->16 widening).
194 vsrc_wide = vmovl_u8(vsrc);
195 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
196 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
197 vdst_wide += vsrc_wide;
// Narrow each 16-bit sum to its bits 8..15.
198 vres = vshrn_n_u16(vdst_wide, 8);
// Write back only lane 0 (one 32-bit value).
200 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
// NOTE(review): surrounding control flow and the declarations of `dst_scale`
// and `alpha256` are not visible in this excerpt.
// Vectors are zero-initialised so the unused upper lane has a defined value.
210 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
211 uint16x8_t vdst_wide, vsrc_wide;
// Load one 32-bit value into lane 0 of each vector.
214 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
215 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
// Byte 3 of the loaded source value (presumably its alpha channel -- confirm
// the packed pixel layout).
// NOTE(review): original listing lines between 217 and 220 are missing here;
// dst_scale may be transformed further before it is used below.
217 dst_scale = vget_lane_u8(vsrc, 3);
// Source bytes widened to 16 bits and multiplied by alpha256.
220 vsrc_wide = vmovl_u8(vsrc);
221 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
// Destination bytes widened to 16 bits and multiplied by dst_scale.
223 vdst_wide = vmovl_u8(vdst);
224 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
// Sum both contributions, then narrow each 16-bit lane to its bits 8..15.
226 vdst_wide += vsrc_wide;
227 vres = vshrn_n_u16(vdst_wide, 8);
// Write back only lane 0 (one 32-bit value).
229 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
// NOTE(review): enclosing function header and loop are not visible in this
// excerpt; `src`, `dst` and `alpha256` are defined outside the shown lines.
// Table-lookup indices: bytes 0-3 of the result take input byte 3, bytes 4-7
// take input byte 7 (one selected byte replicated across each 32-bit value).
235 uint8x8_t alpha_mask;
236 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
237 alpha_mask = vld1_u8(alpha_mask_setup);
241 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
242 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
// Prefetch hints for data 32 elements ahead of the current pointers.
244 __builtin_prefetch(src+32);
245 __builtin_prefetch(dst+32);
// Load two 32-bit values from each pointer and view them as eight bytes.
247 vsrc = vreinterpret_u8_u32(vld1_u32(src));
248 vdst = vreinterpret_u8_u32(vld1_u32(dst));
// Source scale is alpha256 in every 16-bit lane.
250 vsrc_scale = vdupq_n_u16(alpha256);
// Replicate byte 3 / byte 7 of the source across its 32-bit value and widen.
252 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
253 vdst_scale = vmovl_u8(vsrc_alphas);
// Per 16-bit lane:
//   d = 0xFF00 - a * alpha256     (vmlsq: acc - x*y)
//   d = d + (d >> 8)              (vsraq: acc + (x >> n))
//   d = 1 + (d >> 8)
259 vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
260 vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
261 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
// Source bytes widened and multiplied by the source scale.
263 vsrc_wide = vmovl_u8(vsrc);
264 vsrc_wide *= vsrc_scale;
// Destination bytes widened and multiplied by the per-value destination scale.
266 vdst_wide = vmovl_u8(vdst);
267 vdst_wide *= vdst_scale;
// Sum both contributions, then narrow each 16-bit lane to its bits 8..15.
269 vdst_wide += vsrc_wide;
270 vres = vshrn_n_u16(vdst_wide, 8);
// Store the two blended 32-bit values back to dst.
272 vst1_u32(dst, vreinterpret_u32_u8(vres));
280#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
281 #include <lasxintrin.h>
// 256-bit LASX lerp helper. Per byte, the visible statements compute
//   dst + (((src - dst) * src_scale) >> 8)
// using wrapping 16-bit arithmetic, with the final add wrapping at 8 bits.
// NOTE(review): the `dst` parameter line and the closing brace are missing
// from this excerpt.
283 static inline __m256i SkPMLerp_LASX(
const __m256i& src,
285 const unsigned src_scale) {
// Selects the low byte of every 16-bit lane.
287 const __m256i mask = __lasx_xvreplgr2vr_w(0x00FF00FF);
// Split each input into low-byte (*_rb) and high-byte (*_ag) 16-bit lanes.
290 __m256i src_rb = __lasx_xvand_v(mask, src);
291 __m256i src_ag = __lasx_xvsrli_h(src, 8);
292 __m256i dst_rb = __lasx_xvand_v(mask, dst);
293 __m256i dst_ag = __lasx_xvsrli_h(dst, 8);
// (src - dst) per 16-bit lane, modulo 2^16.
296 __m256i diff_rb = __lasx_xvsub_h(src_rb, dst_rb);
297 __m256i diff_ag = __lasx_xvsub_h(src_ag, dst_ag);
// Broadcast the scale to all 16-bit lanes and multiply (low 16 bits kept).
298 __m256i
s = __lasx_xvreplgr2vr_h(src_scale);
299 diff_rb = __lasx_xvmul_h(diff_rb,
s);
300 diff_ag = __lasx_xvmul_h(diff_ag,
s);
// Keep bits 8..15 of each product: rb shifted down to the low byte, ag kept
// in the high byte (xvandn computes ~mask & diff_ag), then merged.
303 diff_rb = __lasx_xvsrli_h(diff_rb, 8);
304 diff_ag = __lasx_xvandn_v(mask, diff_ag);
305 __m256i diff = __lasx_xvor_v(diff_rb, diff_ag);
// Byte-wise wrapping add of the scaled difference onto dst.
308 return __lasx_xvadd_b(dst, diff);
315 auto src8 = (
const __m256i*)src;
316 auto dst8 = ( __m256i*)dst;
319 __lasx_xvst(SkPMLerp_LASX(__lasx_xvld(src8, 0),
320 __lasx_xvld(dst8, 0),
330 while (
count --> 0) {
// 256-bit LASX blend helper: scales src bytes by `alpha` and dst bytes by a
// factor derived from the top byte of each 32-bit src lane, then sums them.
// NOTE(review): the rest of the signature, the definition of `alpha` and the
// closing brace are missing from this excerpt.
337 static inline __m256i SkBlendARGB32_LASX(
const __m256i& src,
// Broadcast `alpha` to every 16-bit lane; used as the source scale.
341 __m256i src_scale = __lasx_xvreplgr2vr_h(alpha);
// Top byte of each 32-bit lane of src (presumably the pixel's alpha channel
// -- confirm the packed pixel layout).
342 __m256i dst_scale = __lasx_xvsrli_w(src, 24);
// Per 32-bit lane: d = 0xFFFF - top_byte * alpha; d = (d + (d >> 8)) >> 8.
344 dst_scale = __lasx_xvmul_h(dst_scale, src_scale);
345 dst_scale = __lasx_xvsub_w(__lasx_xvreplgr2vr_w(0xFFFF), dst_scale);
346 dst_scale = __lasx_xvadd_w(dst_scale, __lasx_xvsrli_w(dst_scale, 8));
347 dst_scale = __lasx_xvsrli_w(dst_scale, 8);
// Selector 0xA0 picks 16-bit words (0,0,2,2) within each group of four, i.e.
// duplicates the low word of each 32-bit lane into its high word.
349 dst_scale = __lasx_xvshuf4i_h(dst_scale, 0xA0);
// Selects the low byte of every 16-bit lane.
351 const __m256i mask = __lasx_xvreplgr2vr_w(0x00FF00FF);
// Split each input into low-byte (*_rb) and high-byte (*_ag) 16-bit lanes.
354 __m256i src_rb = __lasx_xvand_v(mask, src);
355 __m256i src_ag = __lasx_xvsrli_h(src, 8);
356 __m256i dst_rb = __lasx_xvand_v(mask, dst);
357 __m256i dst_ag = __lasx_xvsrli_h(dst, 8);
// Scale source bytes by src_scale and destination bytes by dst_scale.
360 src_rb = __lasx_xvmul_h(src_rb, src_scale);
361 src_ag = __lasx_xvmul_h(src_ag, src_scale);
362 dst_rb = __lasx_xvmul_h(dst_rb, dst_scale);
363 dst_ag = __lasx_xvmul_h(dst_ag, dst_scale);
// Sum the two scaled contributions per 16-bit lane.
366 dst_rb = __lasx_xvadd_h(src_rb, dst_rb);
367 dst_ag = __lasx_xvadd_h(src_ag, dst_ag);
// Keep bits 8..15 of each sum and merge back into packed bytes.
370 dst_rb = __lasx_xvsrli_h(dst_rb, 8);
371 dst_ag = __lasx_xvandn_v(mask, dst_ag);
372 return __lasx_xvor_v(dst_rb, dst_ag);
378 auto src8 = (
const __m256i*)src;
379 auto dst8 = ( __m256i*)dst;
382 __lasx_xvst(SkBlendARGB32_LASX(__lasx_xvld(src8, 0),
383 __lasx_xvld(dst8, 0),
393 while (
count --> 0) {
400#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
401 #include <lsxintrin.h>
// 128-bit LSX lerp helper. Per byte, the visible statements compute
//   dst + (((src - dst) * src_scale) >> 8)
// using wrapping 16-bit arithmetic, with the final add wrapping at 8 bits.
// NOTE(review): the `dst` parameter line and the closing brace are missing
// from this excerpt.
403 static inline __m128i SkPMLerp_LSX(
const __m128i& src,
405 const unsigned src_scale) {
// Selects the low byte of every 16-bit lane.
407 const __m128i mask = __lsx_vreplgr2vr_w(0x00FF00FF);
// Split each input into low-byte (*_rb) and high-byte (*_ag) 16-bit lanes.
410 __m128i src_rb = __lsx_vand_v(mask, src);
411 __m128i src_ag = __lsx_vsrli_h(src, 8);
412 __m128i dst_rb = __lsx_vand_v(mask, dst);
413 __m128i dst_ag = __lsx_vsrli_h(dst, 8);
// (src - dst) per 16-bit lane, modulo 2^16.
416 __m128i diff_rb = __lsx_vsub_h(src_rb, dst_rb);
417 __m128i diff_ag = __lsx_vsub_h(src_ag, dst_ag);
// Broadcast the scale to all 16-bit lanes and multiply (low 16 bits kept).
418 __m128i
s = __lsx_vreplgr2vr_h(src_scale);
419 diff_rb = __lsx_vmul_h(diff_rb,
s);
420 diff_ag = __lsx_vmul_h(diff_ag,
s);
// Keep bits 8..15 of each product: rb shifted down to the low byte, ag kept
// in the high byte (vandn computes ~mask & diff_ag), then merged.
423 diff_rb = __lsx_vsrli_h(diff_rb, 8);
424 diff_ag = __lsx_vandn_v(mask, diff_ag);
425 __m128i diff = __lsx_vor_v(diff_rb, diff_ag);
// Byte-wise wrapping add of the scaled difference onto dst.
428 return __lsx_vadd_b(dst, diff);
435 auto src4 = (
const __m128i*)src;
436 auto dst4 = ( __m128i*)dst;
439 __lsx_vst(SkPMLerp_LSX(__lsx_vld(src4, 0),
450 while (
count --> 0) {
// 128-bit LSX blend helper: scales src bytes by `alpha` and dst bytes by a
// factor derived from the top byte of each 32-bit src lane, then sums them.
// NOTE(review): the rest of the signature, the definition of `alpha` and the
// closing brace are missing from this excerpt.
457 static inline __m128i SkBlendARGB32_LSX(
const __m128i& src,
// Broadcast `alpha` to every 16-bit lane; used as the source scale.
461 __m128i src_scale = __lsx_vreplgr2vr_h(alpha);
// Top byte of each 32-bit lane of src (presumably the pixel's alpha channel
// -- confirm the packed pixel layout).
462 __m128i dst_scale = __lsx_vsrli_w(src, 24);
// Per 32-bit lane: d = 0xFFFF - top_byte * alpha; d = (d + (d >> 8)) >> 8.
464 dst_scale = __lsx_vmul_h(dst_scale, src_scale);
465 dst_scale = __lsx_vsub_w(__lsx_vreplgr2vr_w(0xFFFF), dst_scale);
466 dst_scale = __lsx_vadd_w(dst_scale, __lsx_vsrli_w(dst_scale, 8));
467 dst_scale = __lsx_vsrli_w(dst_scale, 8);
// Selector 0xA0 picks 16-bit words (0,0,2,2) within each group of four, i.e.
// duplicates the low word of each 32-bit lane into its high word.
469 dst_scale = __lsx_vshuf4i_h(dst_scale, 0xA0);
// Selects the low byte of every 16-bit lane.
471 const __m128i mask = __lsx_vreplgr2vr_w(0x00FF00FF);
// Split each input into low-byte (*_rb) and high-byte (*_ag) 16-bit lanes.
474 __m128i src_rb = __lsx_vand_v(mask, src);
475 __m128i src_ag = __lsx_vsrli_h(src, 8);
476 __m128i dst_rb = __lsx_vand_v(mask, dst);
477 __m128i dst_ag = __lsx_vsrli_h(dst, 8);
// Scale source bytes by src_scale and destination bytes by dst_scale.
480 src_rb = __lsx_vmul_h(src_rb, src_scale);
481 src_ag = __lsx_vmul_h(src_ag, src_scale);
482 dst_rb = __lsx_vmul_h(dst_rb, dst_scale);
483 dst_ag = __lsx_vmul_h(dst_ag, dst_scale);
// Sum the two scaled contributions per 16-bit lane.
486 dst_rb = __lsx_vadd_h(src_rb, dst_rb);
487 dst_ag = __lsx_vadd_h(src_ag, dst_ag);
// Keep bits 8..15 of each sum and merge back into packed bytes.
490 dst_rb = __lsx_vsrli_h(dst_rb, 8);
491 dst_ag = __lsx_vandn_v(mask, dst_ag);
492 return __lsx_vor_v(dst_rb, dst_ag);
498 auto src4 = (
const __m128i*)src;
499 auto dst4 = ( __m128i*)dst;
502 __lsx_vst(SkBlendARGB32_LSX(__lsx_vld(src4, 0),
513 while (
count --> 0) {
523 while (
count --> 0) {
532 while (
count --> 0) {
549 flags &= std::size(kProcs) - 1;
static void blit_row_s32_opaque(SkPMColor *dst, const SkPMColor *src, int count, U8CPU alpha)
static void blit_row_s32_blend(SkPMColor *dst, const SkPMColor *src, int count, U8CPU alpha)
static __m128i SkPMLerp_SSE2(const __m128i &src, const __m128i &dst, const unsigned src_scale)
static void blit_row_s32a_blend(SkPMColor *dst, const SkPMColor *src, int count, U8CPU alpha)
static __m128i SkBlendARGB32_SSE2(const __m128i &src, const __m128i &dst, const unsigned aa)
static SkPMColor SkPMLerp(SkPMColor src, SkPMColor dst, unsigned scale)
static U16CPU SkAlphaMulInv256(U16CPU value, U16CPU alpha256)
static SkPMColor SkBlendARGB32(SkPMColor src, SkPMColor dst, U8CPU aa)
#define SkGetPackedA32(packed)
static unsigned SkAlpha255To256(U8CPU alpha)
static void Color32(SkPMColor dst[], int count, SkPMColor color)
static Proc32 Factory32(unsigned flags32)
void(* Proc32)(uint32_t dst[], const SkPMColor src[], int count, U8CPU alpha)
FlutterSemanticsFlag flags
void(* memset32)(uint32_t[], uint32_t, int)
void(* blit_row_s32a_opaque)(SkPMColor *dst, const SkPMColor *src, int count, U8CPU alpha)
void(* blit_row_color32)(SkPMColor *dst, int count, SkPMColor color)