#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <emmintrin.h>
    #include <xmmintrin.h>

    // Computes dst + (((src - dst) * src_scale) >> 8) per 8-bit channel,
    // with src_scale in [0, 256].
    static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                        const __m128i& dst,
                                        const unsigned src_scale) {
        const __m128i mask = _mm_set1_epi32(0x00FF00FF);

        // Unpack the 16x 8-bit pixels into splayed red/blue and
        // alpha/green 16-bit halves.
        __m128i src_rb = _mm_and_si128(mask, src);
        __m128i src_ag = _mm_srli_epi16(src, 8);
        __m128i dst_rb = _mm_and_si128(mask, dst);
        __m128i dst_ag = _mm_srli_epi16(dst, 8);

        // Compute scaled differences.
        __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
        __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
        __m128i s = _mm_set1_epi16(src_scale);
        diff_rb = _mm_mullo_epi16(diff_rb, s);
        diff_ag = _mm_mullo_epi16(diff_ag, s);

        // Pack the differences back together.
        diff_rb = _mm_srli_epi16(diff_rb, 8);
        diff_ag = _mm_andnot_si128(mask, diff_ag);
        __m128i diff = _mm_or_si128(diff_rb, diff_ag);

        // Add the scaled difference to the destination.
        return _mm_add_epi8(dst, diff);
    }

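    // Illustrative scalar sketch (a hypothetical helper, not part of this
    // file): what SkPMLerp_SSE2 computes for one 8-bit channel. The scalar
    // SkPMLerp applies the same math across all four channels at once.
    static inline uint8_t SkPMLerpChannel_Ref(uint8_t src, uint8_t dst,
                                              unsigned src_scale) {
        // src_scale is in [0, 256]; the product needs signed 16+ bit math.
        return (uint8_t)(dst + (((src - dst) * (int)src_scale) >> 8));
    }
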
    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src,
                                   int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        auto src4 = (const __m128i*)src;
        auto dst4 = (__m128i*)dst;

        while (count >= 4) {
            _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
                                                 _mm_loadu_si128(dst4),
                                                 SkAlpha255To256(alpha)));
            src4++;
            dst4++;
            count -= 4;
        }

        src = (const SkPMColor*)src4;
        dst = (SkPMColor*)dst4;

        while (count --> 0) {
            *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
            src++;
            dst++;
        }
    }

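    // Unaligned loads and stores are used above because blit rows carry no
    // 16-byte alignment guarantee.
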
    static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
                                             const __m128i& dst,
                                             const unsigned aa) {
        unsigned alpha = SkAlpha255To256(aa);
        __m128i src_scale = _mm_set1_epi16(alpha);
        // Compute SkAlphaMulInv256(srcA, src_scale) in each 32-bit lane;
        // the high words are 0, so 16-bit multiplies are safe.
        __m128i dst_scale = _mm_srli_epi32(src, 24);
        dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
        dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
        dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
        dst_scale = _mm_srli_epi32(dst_scale, 8);
        // Duplicate each pixel's scale into both of its 16-bit lanes.
        dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
        dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
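        // The block above is the vector form of the scalar helper
        // SkAlphaMulInv256(value, alpha256), roughly 256 - value*alpha256/255,
        // with the divide by 255 approximated as x -> (x + (x >> 8)) >> 8.
        // Assumed scalar shape, matching the vector steps:
        //   unsigned prod = 0xFFFF - value * alpha256;
        //   return (prod + (prod >> 8)) >> 8;
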
        const __m128i mask = _mm_set1_epi32(0x00FF00FF);

        // Unpack the source and destination into splayed rb/ag halves.
        __m128i src_rb = _mm_and_si128(mask, src);
        __m128i src_ag = _mm_srli_epi16(src, 8);
        __m128i dst_rb = _mm_and_si128(mask, dst);
        __m128i dst_ag = _mm_srli_epi16(dst, 8);

        // Scale them.
        src_rb = _mm_mullo_epi16(src_rb, src_scale);
        src_ag = _mm_mullo_epi16(src_ag, src_scale);
        dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
        dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);

        // Add the scaled source and destination.
        dst_rb = _mm_add_epi16(src_rb, dst_rb);
        dst_ag = _mm_add_epi16(src_ag, dst_ag);

        // Unsplay the halves back together.
        dst_rb = _mm_srli_epi16(dst_rb, 8);
        dst_ag = _mm_andnot_si128(mask, dst_ag);
        return _mm_or_si128(dst_rb, dst_ag);
    }

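    // Per channel, the function above computes
    //   (src * src_scale + dst * SkAlphaMulInv256(srcA, src_scale)) >> 8,
    // i.e. source-over blending weighted by the blit alpha.
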
    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src,
                                    int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        auto src4 = (const __m128i*)src;
        auto dst4 = (__m128i*)dst;

        while (count >= 4) {
            _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
                                                      _mm_loadu_si128(dst4),
                                                      alpha));
            src4++;
            dst4++;
            count -= 4;
        }

        src = (const SkPMColor*)src4;
        dst = (SkPMColor*)dst4;

        // "count --> 0" is just "count-- > 0": loop while count is positive.
        while (count --> 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
            src++;
            dst++;
        }
    }

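    // Note the distinction between the two rows: blit_row_s32_blend treats
    // the source as opaque and lerps dst toward src, while
    // blit_row_s32a_blend also scales dst by each source pixel's own alpha
    // (via SkAlphaMulInv256), so translucent source pixels preserve more of
    // the destination.
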
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>

    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src,
                                   int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

        while (count >= 2) {
            uint8x8_t vsrc, vdst, vres;
            uint16x8_t vsrc_wide, vdst_wide;

            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

            vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        }

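        // Why the asymmetry above: dst_scale = 256 - src_scale always fits in
        // 8 bits (src_scale >= 1), so vmull_u8 widens and multiplies in one
        // step, while src_scale itself can be 256, which does not fit a u8
        // lane, so the source is widened with vmovl_u8 first and multiplied
        // at 16 bits.
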
        if (count == 1) {
            uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
            uint16x8_t vsrc_wide, vdst_wide;

            // Load a single pixel into lane 0, leaving the rest zeroed.
            vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
            vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
            vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        }
    }

    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src,
                                    int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        unsigned alpha256 = SkAlpha255To256(alpha);

        // Handle an odd leading pixel so the main loop can work in pairs.
        if (count & 1) {
            uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
            uint16x8_t vdst_wide, vsrc_wide;
            unsigned dst_scale;

            vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
            vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

            // dst_scale = SkAlphaMulInv256(srcA, alpha256), with srcA taken
            // from the source pixel's alpha byte (lane 3).
            dst_scale = vget_lane_u8(vsrc, 3);
            dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

            vdst_wide = vmovl_u8(vdst);
            vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
            src++;
            dst++;
            count--;
        }

        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

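        // alpha_mask feeds vtbl1_u8 below: indices {3,3,3,3,7,7,7,7} broadcast
        // byte 3 (pixel 0's alpha) and byte 7 (pixel 1's alpha) across each
        // pixel's four channels.
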
        while (count >= 2) {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            vsrc_scale = vdupq_n_u16(alpha256);

            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
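            // The three steps above compute SkAlphaMulInv256 in 16-bit lanes:
            // 0xFFFF - srcA*scale would overflow the intermediate math, so
            // start from 0xFF00 and let the two accumulate-and-shift steps
            // (vsraq_n_u16) fold the off-by-0xFF correction and the final
            // >> 8 back in, re-centered on 1. This holds for alpha256 <= 255,
            // i.e. blit alpha < 255, which should be the case for this
            // blend path.
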
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        }
    }

#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    #include <lasxintrin.h>

    static inline __m256i SkPMLerp_LASX(const __m256i& src,
                                        const __m256i& dst,
                                        const unsigned src_scale) {
        const __m256i mask = __lasx_xvreplgr2vr_w(0x00FF00FF);

        // Unpack the source and destination into splayed rb/ag halves.
        __m256i src_rb = __lasx_xvand_v(mask, src);
        __m256i src_ag = __lasx_xvsrli_h(src, 8);
        __m256i dst_rb = __lasx_xvand_v(mask, dst);
        __m256i dst_ag = __lasx_xvsrli_h(dst, 8);

        // Compute scaled differences.
        __m256i diff_rb = __lasx_xvsub_h(src_rb, dst_rb);
        __m256i diff_ag = __lasx_xvsub_h(src_ag, dst_ag);
        __m256i s = __lasx_xvreplgr2vr_h(src_scale);
        diff_rb = __lasx_xvmul_h(diff_rb, s);
        diff_ag = __lasx_xvmul_h(diff_ag, s);

        // Pack the differences back together.
        diff_rb = __lasx_xvsrli_h(diff_rb, 8);
        diff_ag = __lasx_xvandn_v(mask, diff_ag);
        __m256i diff = __lasx_xvor_v(diff_rb, diff_ag);

        // Add the scaled difference to the destination.
        return __lasx_xvadd_b(dst, diff);
    }

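    // Identical in structure to SkPMLerp_SSE2; the 256-bit LASX vectors
    // simply process eight pixels per iteration instead of four.
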
    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src,
                                   int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        auto src8 = (const __m256i*)src;
        auto dst8 = (__m256i*)dst;

        while (count >= 8) {
            __lasx_xvst(SkPMLerp_LASX(__lasx_xvld(src8, 0),
                                      __lasx_xvld(dst8, 0),
                                      SkAlpha255To256(alpha)), dst8, 0);
            src8++;
            dst8++;
            count -= 8;
        }

        src = (const SkPMColor*)src8;
        dst = (SkPMColor*)dst8;

        while (count --> 0) {
            *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
            src++;
            dst++;
        }
    }

    static inline __m256i SkBlendARGB32_LASX(const __m256i& src,
                                             const __m256i& dst,
                                             const unsigned aa) {
        unsigned alpha = SkAlpha255To256(aa);
        __m256i src_scale = __lasx_xvreplgr2vr_h(alpha);
        // Compute SkAlphaMulInv256(srcA, src_scale) in each 32-bit lane.
        __m256i dst_scale = __lasx_xvsrli_w(src, 24);
        dst_scale = __lasx_xvmul_h(dst_scale, src_scale);
        dst_scale = __lasx_xvsub_w(__lasx_xvreplgr2vr_w(0xFFFF), dst_scale);
        dst_scale = __lasx_xvadd_w(dst_scale, __lasx_xvsrli_w(dst_scale, 8));
        dst_scale = __lasx_xvsrli_w(dst_scale, 8);
        dst_scale = __lasx_xvshuf4i_h(dst_scale, 0xA0);
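        // The 0xA0 shuffle above selects halfwords (2, 2, 0, 0) within each
        // group of four, duplicating every pixel's 16-bit scale into both of
        // its lanes; it matches the SSE2 shufflelo/shufflehi pair.
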
        const __m256i mask = __lasx_xvreplgr2vr_w(0x00FF00FF);

        // Unpack the source and destination into splayed rb/ag halves.
        __m256i src_rb = __lasx_xvand_v(mask, src);
        __m256i src_ag = __lasx_xvsrli_h(src, 8);
        __m256i dst_rb = __lasx_xvand_v(mask, dst);
        __m256i dst_ag = __lasx_xvsrli_h(dst, 8);

        // Scale them.
        src_rb = __lasx_xvmul_h(src_rb, src_scale);
        src_ag = __lasx_xvmul_h(src_ag, src_scale);
        dst_rb = __lasx_xvmul_h(dst_rb, dst_scale);
        dst_ag = __lasx_xvmul_h(dst_ag, dst_scale);

        // Add the scaled source and destination.
        dst_rb = __lasx_xvadd_h(src_rb, dst_rb);
        dst_ag = __lasx_xvadd_h(src_ag, dst_ag);

        // Unsplay the halves back together.
        dst_rb = __lasx_xvsrli_h(dst_rb, 8);
        dst_ag = __lasx_xvandn_v(mask, dst_ag);
        return __lasx_xvor_v(dst_rb, dst_ag);
    }

    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src,
                                    int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        auto src8 = (const __m256i*)src;
        auto dst8 = (__m256i*)dst;

        while (count >= 8) {
            __lasx_xvst(SkBlendARGB32_LASX(__lasx_xvld(src8, 0),
                                           __lasx_xvld(dst8, 0),
                                           alpha), dst8, 0);
            src8++;
            dst8++;
            count -= 8;
        }

        src = (const SkPMColor*)src8;
        dst = (SkPMColor*)dst8;

        while (count --> 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
            src++;
            dst++;
        }
    }

#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>

    // 128-bit LSX variant of the same lerp: four pixels per vector.
    static inline __m128i SkPMLerp_LSX(const __m128i& src,
                                       const __m128i& dst,
                                       const unsigned src_scale) {
        const __m128i mask = __lsx_vreplgr2vr_w(0x00FF00FF);

        // Unpack the source and destination into splayed rb/ag halves.
        __m128i src_rb = __lsx_vand_v(mask, src);
        __m128i src_ag = __lsx_vsrli_h(src, 8);
        __m128i dst_rb = __lsx_vand_v(mask, dst);
        __m128i dst_ag = __lsx_vsrli_h(dst, 8);

        // Compute scaled differences.
        __m128i diff_rb = __lsx_vsub_h(src_rb, dst_rb);
        __m128i diff_ag = __lsx_vsub_h(src_ag, dst_ag);
        __m128i s = __lsx_vreplgr2vr_h(src_scale);
        diff_rb = __lsx_vmul_h(diff_rb, s);
        diff_ag = __lsx_vmul_h(diff_ag, s);

        // Pack the differences back together.
        diff_rb = __lsx_vsrli_h(diff_rb, 8);
        diff_ag = __lsx_vandn_v(mask, diff_ag);
        __m128i diff = __lsx_vor_v(diff_rb, diff_ag);

        // Add the scaled difference to the destination.
        return __lsx_vadd_b(dst, diff);
    }

    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src,
                                   int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        auto src4 = (const __m128i*)src;
        auto dst4 = (__m128i*)dst;

        while (count >= 4) {
            __lsx_vst(SkPMLerp_LSX(__lsx_vld(src4, 0),
                                   __lsx_vld(dst4, 0),
                                   SkAlpha255To256(alpha)), dst4, 0);
            src4++;
            dst4++;
            count -= 4;
        }

        src = (const SkPMColor*)src4;
        dst = (SkPMColor*)dst4;

        while (count --> 0) {
            *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
            src++;
            dst++;
        }
    }

    static inline __m128i SkBlendARGB32_LSX(const __m128i& src,
                                            const __m128i& dst,
                                            const unsigned aa) {
        unsigned alpha = SkAlpha255To256(aa);
        __m128i src_scale = __lsx_vreplgr2vr_h(alpha);
        // Compute SkAlphaMulInv256(srcA, src_scale) in each 32-bit lane.
        __m128i dst_scale = __lsx_vsrli_w(src, 24);
        dst_scale = __lsx_vmul_h(dst_scale, src_scale);
        dst_scale = __lsx_vsub_w(__lsx_vreplgr2vr_w(0xFFFF), dst_scale);
        dst_scale = __lsx_vadd_w(dst_scale, __lsx_vsrli_w(dst_scale, 8));
        dst_scale = __lsx_vsrli_w(dst_scale, 8);
        // Duplicate each pixel's scale into both 16-bit lanes (see the note
        // on the 0xA0 shuffle in the LASX path above).
        dst_scale = __lsx_vshuf4i_h(dst_scale, 0xA0);
        const __m128i mask = __lsx_vreplgr2vr_w(0x00FF00FF);

        // Unpack the source and destination into splayed rb/ag halves.
        __m128i src_rb = __lsx_vand_v(mask, src);
        __m128i src_ag = __lsx_vsrli_h(src, 8);
        __m128i dst_rb = __lsx_vand_v(mask, dst);
        __m128i dst_ag = __lsx_vsrli_h(dst, 8);

        // Scale them.
        src_rb = __lsx_vmul_h(src_rb, src_scale);
        src_ag = __lsx_vmul_h(src_ag, src_scale);
        dst_rb = __lsx_vmul_h(dst_rb, dst_scale);
        dst_ag = __lsx_vmul_h(dst_ag, dst_scale);

        // Add the scaled source and destination.
        dst_rb = __lsx_vadd_h(src_rb, dst_rb);
        dst_ag = __lsx_vadd_h(src_ag, dst_ag);

        // Unsplay the halves back together.
        dst_rb = __lsx_vsrli_h(dst_rb, 8);
        dst_ag = __lsx_vandn_v(mask, dst_ag);
        return __lsx_vor_v(dst_rb, dst_ag);
    }

    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src,
                                    int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        auto src4 = (const __m128i*)src;
        auto dst4 = (__m128i*)dst;

        while (count >= 4) {
            __lsx_vst(SkBlendARGB32_LSX(__lsx_vld(src4, 0),
                                        __lsx_vld(dst4, 0),
                                        alpha), dst4, 0);
            src4++;
            dst4++;
            count -= 4;
        }

        src = (const SkPMColor*)src4;
        dst = (SkPMColor*)dst4;

        while (count --> 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
            src++;
            dst++;
        }
    }

#else
    // Portable fallbacks: one pixel at a time through the scalar helpers.
    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src,
                                   int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);
        while (count --> 0) {
            *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
            src++;
            dst++;
        }
    }

    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src,
                                    int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);
        while (count --> 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
            src++;
            dst++;
        }
    }
#endif

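// For context, a sketch of how these rows are typically dispatched.
// SkBlitRow::Proc32 and SkBlitRow::Factory32 are real declarations; the
// table contents and ordering below are an assumption, for illustration only.
//
//   static const SkBlitRow::Proc32 procs[] = {
//       blit_row_s32_opaque,  blit_row_s32_blend,   // assumed order
//       blit_row_s32a_opaque, blit_row_s32a_blend,
//   };
//   SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
//       return procs[flags];  // flags select opaque/blend x s32/s32a
//   }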