    maskR = maskR * srcA >> 8;
    maskG = maskG * srcA >> 8;
    maskB = maskB * srcA >> 8;
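
// For reference, the per-channel math that both the scalar and the SIMD paths
// below implement (the helper name in this sketch is made up for illustration):
//
//     // coverage5: 5-bit LCD coverage (0..31), srcA: source alpha in 0..256
//     static inline int lcd_blend_channel(int src, int dst, int coverage5, int srcA) {
//         int scale = coverage5 + (coverage5 >> 4);  // upscale 0..31 -> 0..32
//         scale = scale * srcA >> 8;                 // modulate by srcA
//         return dst + ((src - dst) * scale >> 5);   // lerp dst toward src
//     }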
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <emmintrin.h>

    // Shift that moves the top five bits of a 565 channel into the low bits of
    // the matching byte lane of a 32-bit pixel. Depending on the channel
    // orderings this may be a left shift, a right shift, or no shift at all,
    // hence the three cases per channel below.
    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

    #if SK_R16x5_R32x5_SHIFT == 0
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
    #elif SK_R16x5_R32x5_SHIFT > 0
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
    #else
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
    #endif

    #if SK_G16x5_G32x5_SHIFT == 0
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
    #elif SK_G16x5_G32x5_SHIFT > 0
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
    #else
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
    #endif

    #if SK_B16x5_B32x5_SHIFT == 0
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
    #elif SK_B16x5_B32x5_SHIFT > 0
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
    #else
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
    #endif
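
    // The SSE2 blend works on four 32-bit pixels per 128-bit register. For
    // each channel it computes dst + ((src - dst) * coverage >> 5), where
    // coverage is the 5-bit LCD mask value upscaled to 0..32 and modulated by
    // the source alpha.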
        __m128i a = _mm_cmplt_epi32(srcA,
        // Per-lane select: where the compare above is true take aMin, else aMax.
        a = _mm_or_si128(_mm_and_si128(a, aMin), _mm_andnot_si128(a, aMax));

        // Combine the 5-bit R, G, B coverages and the chosen alpha coverage
        // into one 32-bit lane per pixel.
        mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));

        // Expand the sixteen 8-bit coverage lanes into two registers of eight
        // 16-bit lanes each (low and high halves).
        __m128i maskLo, maskHi;
        maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
        maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

        // Upscale the coverages from 0..31 to 0..32 so the final divide can be
        // a simple >> 5.
        maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
        maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

        // Modulate the coverages by srcA (0..256): mask = mask * srcA >> 8.
        maskLo = _mm_mullo_epi16(maskLo, srcA);
        maskHi = _mm_mullo_epi16(maskHi, srcA);

        maskLo = _mm_srli_epi16(maskLo, 8);
        maskHi = _mm_srli_epi16(maskHi, 8);

        // Expand dst to 16-bit lanes as well.
        __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
        __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

        // result = dst + ((src - dst) * mask >> 5)
        maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
        maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

        maskLo = _mm_srai_epi16(maskLo, 5);
        maskHi = _mm_srai_epi16(maskHi, 5);

        __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
        __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

        // Repack the two halves into sixteen 8-bit lanes, saturating to 255.
        return _mm_packus_epi16(resultLo, resultHi);
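
    // The opaque variant below performs the same blend but skips the srcA
    // modulation step.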
        mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));

        __m128i maskLo, maskHi;
        maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
        maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

        maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
        maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

        __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
        __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

        maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
        maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

        maskLo = _mm_srai_epi16(maskLo, 5);
        maskHi = _mm_srai_epi16(maskHi, 5);

        __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
        __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

        return _mm_packus_epi16(resultLo, resultHi);
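
    // Row blitter: handle pixels individually until dst is 16-byte aligned,
    // then blend four pixels per iteration, skipping the vector blend entirely
    // when all four mask values are zero (pack_cmp == 0xFFFF).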
        while (((size_t)dst & 0x0F) != 0) {

        __m128i *d = reinterpret_cast<__m128i*>(dst);

        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

        __m128i srcA_sse = _mm_set1_epi16(srcA);

            __m128i dst_sse = _mm_load_si128(d);
            __m128i mask_sse = _mm_loadu_si64(mask);

            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            if (pack_cmp != 0xFFFF) {
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());
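
    // The opaque row blitter below follows the same structure, but does not
    // need the srcA modulation since the source color is fully opaque.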
        while (((size_t)dst & 0x0F) != 0) {

        __m128i *d = reinterpret_cast<__m128i*>(dst);

        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

            __m128i dst_sse = _mm_load_si128(d);
            __m128i mask_sse = _mm_loadu_si64(mask);

            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            if (pack_cmp != 0xFFFF) {
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>

    #define NEON_A (SK_A32_SHIFT / 8)
    #define NEON_R (SK_R32_SHIFT / 8)
    #define NEON_G (SK_G32_SHIFT / 8)
    #define NEON_B (SK_B32_SHIFT / 8)
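
    // NEON_A/R/G/B are the byte indices of each channel within a 32-bit
    // pixel, used to select planes from the de-interleaved vld4/vst4
    // registers. blend_32_neon() computes dst + ((src - dst) * scale >> 5)
    // with scale in 0..32.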
    static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
        int16x8_t src_wide, dst_wide;

        src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
        dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));

        src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);

        dst_wide += vshrq_n_s16(src_wide, 5);

        return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
    }
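
    // Opaque case: de-interleave eight destination pixels with vld4, blend
    // each channel against the opaque source color using its own LCD
    // coverage, and use the max of the three coverages for the alpha channel.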
        uint8x8_t vcolA = vdup_n_u8(0xFF);
        uint8x8_t vcolR = vdup_n_u8(colR);
        uint8x8_t vcolG = vdup_n_u8(colG);
        uint8x8_t vcolB = vdup_n_u8(colB);

            uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;

            vdst = vld4_u8((uint8_t*)dst);
            vmask = vld1q_u16(src);

            vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),

            vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
            vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
            vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

            vmaskA = vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB));

            vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
            vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
            vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
            vdst.val[NEON_A] = blend_32_neon(vcolA, vdst.val[NEON_A], vmaskA);

            vst4_u8((uint8_t*)dst, vdst);
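
    // Non-opaque case: the coverages are additionally scaled by the color's
    // alpha, and the alpha-channel coverage is the min of the three RGB
    // coverages where colA <= dstA, the max otherwise.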
        uint16x8_t vcolACmp = vdupq_n_u16(colA);

        uint16x8_t vcolA = vdupq_n_u16(colA);
        uint8x8_t vcolR = vdup_n_u8(colR);
        uint8x8_t vcolG = vdup_n_u8(colG);
        uint8x8_t vcolB = vdup_n_u8(colB);

            uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;

            vdst = vld4_u8((uint8_t*)dst);
            vmask = vld1q_u16(src);

            vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),

            vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
            vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
            vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

            vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
            vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
            vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

            vmaskA = vbslq_u16(vcleq_u16(vcolACmp, vmovl_u8(vdst.val[NEON_A])),
                               vminq_u16(vmaskR, vminq_u16(vmaskG, vmaskB)),
                               vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB)));

            vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
            vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
            vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);

            vdst.val[NEON_A] = blend_32_neon(vdup_n_u8(0xFF), vdst.val[NEON_A], vmaskA);

            vst4_u8((uint8_t*)dst, vdst);
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

    #if SK_R16x5_R32x5_SHIFT == 0
        #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (x)
    #elif SK_R16x5_R32x5_SHIFT > 0
        #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvslli_w(x, SK_R16x5_R32x5_SHIFT))
    #else
        #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_R16x5_R32x5_SHIFT))
    #endif

    #if SK_G16x5_G32x5_SHIFT == 0
        #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (x)
    #elif SK_G16x5_G32x5_SHIFT > 0
        #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvslli_w(x, SK_G16x5_G32x5_SHIFT))
    #else
        #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_G16x5_G32x5_SHIFT))
    #endif

    #if SK_B16x5_B32x5_SHIFT == 0
        #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (x)
    #elif SK_B16x5_B32x5_SHIFT > 0
        #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvslli_w(x, SK_B16x5_B32x5_SHIFT))
    #else
        #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_B16x5_B32x5_SHIFT))
    #endif
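
    // The LASX path mirrors the SSE2 blend above, but uses 256-bit registers,
    // i.e. eight pixels per register instead of four.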
    static __m256i blend_lcd16_lasx(__m256i &src, __m256i &dst, __m256i &mask, __m256i &srcA) {
        __m256i xv_zero = __lasx_xvldi(0);

        __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),

        __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),

        __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),

        __m256i a = __lasx_xvmskltz_w(srcA -

        a = __lasx_xvor_v(__lasx_xvand_v(a, aMin), __lasx_xvandn_v(a, aMax));

        mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));

        __m256i maskLo, maskHi;
        maskLo = __lasx_xvilvl_b(xv_zero, mask);
        maskHi = __lasx_xvilvh_b(xv_zero, mask);

        maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
        maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));

        maskLo = __lasx_xvmul_h(maskLo, srcA);
        maskHi = __lasx_xvmul_h(maskHi, srcA);

        maskLo = __lasx_xvsrli_h(maskLo, 8);
        maskHi = __lasx_xvsrli_h(maskHi, 8);

        __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
        __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);

        maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
        maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));

        maskLo = __lasx_xvsrai_h(maskLo, 5);
        maskHi = __lasx_xvsrai_h(maskHi, 5);

        __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
        __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);

        // Saturate the 16-bit lanes to 0..255, then pick the even bytes to
        // repack the result into 8-bit lanes.
        __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
        __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
        return __lasx_xvpickev_b(tmph, tmpl);
    static __m256i blend_lcd16_opaque_lasx(__m256i &src, __m256i &dst, __m256i &mask) {
        __m256i xv_zero = __lasx_xvldi(0);

        __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),

        __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),

        __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),

        mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));

        __m256i maskLo, maskHi;
        maskLo = __lasx_xvilvl_b(xv_zero, mask);
        maskHi = __lasx_xvilvh_b(xv_zero, mask);

        maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
        maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));

        __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
        __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);

        maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
        maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));

        maskLo = __lasx_xvsrai_h(maskLo, 5);
        maskHi = __lasx_xvsrai_h(maskHi, 5);

        __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
        __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);

        __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
        __m256i tmph = __lasx_xvsat_hu(resultHi, 7);

        return __lasx_xvpickev_b(tmph, tmpl);
        __m256i xv_zero = __lasx_xvldi(0);

        while (((size_t)dst & 0x0F) != 0) {

        __m256i *d = reinterpret_cast<__m256i*>(dst);

        unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
        __m256i src_lasx = __lasx_xvreplgr2vr_w(skpackargb32);

        src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);

        __m256i srcA_lasx = __lasx_xvreplgr2vr_h(srcA);

            __m256i dst_lasx = __lasx_xvld(d, 0);
            __m256i mask_lasx = __lasx_xvld(mask, 0);
            mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};

            int pack_cmp = __lasx_xbz_v(mask_lasx);

                mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);

                __m256i result = blend_lcd16_lasx(src_lasx, dst_lasx, mask_lasx, srcA_lasx);
        __m256i xv_zero = __lasx_xvldi(0);

        while (((size_t)dst & 0x0F) != 0) {

        __m256i *d = reinterpret_cast<__m256i*>(dst);

        unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
        __m256i src_lasx = __lasx_xvreplgr2vr_w(sk_pack_argb32);

        src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);

            __m256i dst_lasx = __lasx_xvld(d, 0);
            __m256i mask_lasx = __lasx_xvld(mask, 0);
            mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};

            int32_t pack_cmp = __lasx_xbz_v(mask_lasx);

                mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);

                __m256i result = blend_lcd16_opaque_lasx(src_lasx, dst_lasx, mask_lasx);
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

    #if SK_R16x5_R32x5_SHIFT == 0
        #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (x)
    #elif SK_R16x5_R32x5_SHIFT > 0
        #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vslli_w(x, SK_R16x5_R32x5_SHIFT))
    #else
        #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vsrli_w(x, -SK_R16x5_R32x5_SHIFT))
    #endif

    #if SK_G16x5_G32x5_SHIFT == 0
        #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (x)
    #elif SK_G16x5_G32x5_SHIFT > 0
        #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vslli_w(x, SK_G16x5_G32x5_SHIFT))
    #else
        #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vsrli_w(x, -SK_G16x5_G32x5_SHIFT))
    #endif

    #if SK_B16x5_B32x5_SHIFT == 0
        #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (x)
    #elif SK_B16x5_B32x5_SHIFT > 0
        #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vslli_w(x, SK_B16x5_B32x5_SHIFT))
    #else
        #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vsrli_w(x, -SK_B16x5_B32x5_SHIFT))
    #endif
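
    // The LSX path is the 128-bit counterpart of the LASX code above: the
    // same blend, four pixels per register.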
    static __m128i blend_lcd16_lsx(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
        __m128i v_zero = __lsx_vldi(0);

        __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),

        __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),

        __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),

        __m128i a = __lsx_vmskltz_w(srcA -

        a = __lsx_vor_v(__lsx_vand_v(a, aMin), __lsx_vandn_v(a, aMax));

        mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));

        __m128i maskLo, maskHi;
        maskLo = __lsx_vilvl_b(v_zero, mask);
        maskHi = __lsx_vilvh_b(v_zero, mask);

        maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
        maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));

        maskLo = __lsx_vmul_h(maskLo, srcA);
        maskHi = __lsx_vmul_h(maskHi, srcA);

        maskLo = __lsx_vsrli_h(maskLo, 8);
        maskHi = __lsx_vsrli_h(maskHi, 8);

        __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
        __m128i dstHi = __lsx_vilvh_b(v_zero, dst);

        maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
        maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));

        maskLo = __lsx_vsrai_h(maskLo, 5);
        maskHi = __lsx_vsrai_h(maskHi, 5);

        __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
        __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);

        __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
        __m128i tmph = __lsx_vsat_hu(resultHi, 7);
        return __lsx_vpickev_b(tmph, tmpl);
    static __m128i blend_lcd16_opaque_lsx(__m128i &src, __m128i &dst, __m128i &mask) {
        __m128i v_zero = __lsx_vldi(0);

        __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),

        __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),

        __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),

        mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));

        __m128i maskLo, maskHi;
        maskLo = __lsx_vilvl_b(v_zero, mask);
        maskHi = __lsx_vilvh_b(v_zero, mask);

        maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
        maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));

        __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
        __m128i dstHi = __lsx_vilvh_b(v_zero, dst);

        maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
        maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));

        maskLo = __lsx_vsrai_h(maskLo, 5);
        maskHi = __lsx_vsrai_h(maskHi, 5);

        __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
        __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);

        __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
        __m128i tmph = __lsx_vsat_hu(resultHi, 7);
        return __lsx_vpickev_b(tmph, tmpl);
        __m128i v_zero = __lsx_vldi(0);

        while (((size_t)dst & 0x0F) != 0) {

        __m128i *d = reinterpret_cast<__m128i*>(dst);

        unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
        __m128i src_lsx = __lsx_vreplgr2vr_w(skpackargb32);
        src_lsx = __lsx_vilvl_b(v_zero, src_lsx);

        __m128i srcA_lsx = __lsx_vreplgr2vr_h(srcA);

        while (width >= 4) {
            __m128i dst_lsx = __lsx_vld(d, 0);
            __m128i mask_lsx = __lsx_vldrepl_d((void *)mask, 0);
            mask_lsx = __lsx_vilvl_d(v_zero, mask_lsx);

            int pack_cmp = __lsx_bz_v(mask_lsx);

            if (pack_cmp != 1) {
                mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);

                __m128i result = blend_lcd16_lsx(src_lsx, dst_lsx, mask_lsx, srcA_lsx);
        __m128i v_zero = __lsx_vldi(0);

        while (((size_t)dst & 0x0F) != 0) {

        __m128i *d = reinterpret_cast<__m128i*>(dst);

        unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
        __m128i src_lsx = __lsx_vreplgr2vr_w(sk_pack_argb32);

        src_lsx = __lsx_vilvl_b(v_zero, src_lsx);

        while (width >= 4) {
            __m128i dst_lsx = __lsx_vld(d, 0);
            __m128i mask_lsx = __lsx_vldrepl_d((void *)(mask), 0);
            mask_lsx = __lsx_vilvl_d(v_zero, mask_lsx);

            int pack_cmp = __lsx_bz_v(mask_lsx);

            if (pack_cmp != 1) {
                mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);

                __m128i result = blend_lcd16_opaque_lsx(src_lsx, dst_lsx, mask_lsx);
    auto dstRow = device.writable_addr32(x, y);
    auto maskRow = (const uint16_t*)mask.getAddr(x, y);

        blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);

        maskRow = (const uint16_t*)((const char*)maskRow + mask.fRowBytes);

        proc(dstRow, srcRow, width, alpha);
#pragma warning ( push )
#pragma warning ( disable : 4701 )

                                     const int16_t runs[]) {
    unsigned opaqueMask = fSrcA;

        int count = runs[0];

            unsigned aa = antialias[0];

                if ((opaqueMask & aa) == 255) {
// Write `color` into each of the eight dst pixels whose bit is set in the
// 1-bit mask (bit 0x80 maps to dst[0], bit 0x01 to dst[7]).
#define solid_8_pixels(mask, dst, color)    \
    if (mask & 0x80) dst[0] = color;        \
    if (mask & 0x40) dst[1] = color;        \
    if (mask & 0x20) dst[2] = color;        \
    if (mask & 0x10) dst[3] = color;        \
    if (mask & 0x08) dst[4] = color;        \
    if (mask & 0x04) dst[5] = color;        \
    if (mask & 0x02) dst[6] = color;        \
    if (mask & 0x01) dst[7] = color;        \
#define SK_BLITBWMASK_NAME                  SkARGB32_BlitBW
#define SK_BLITBWMASK_ARGS                  , SkPMColor color
#define SK_BLITBWMASK_BLIT8(mask, dst)      solid_8_pixels(mask, dst, color)
#define SK_BLITBWMASK_GETADDR               writable_addr32
#define SK_BLITBWMASK_DEVTYPE               uint32_t
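
// Same template hooks, but blending: each set mask bit writes
// sc + SkAlphaMulQ(dst[i], dst_scale) instead of a solid color.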
#define blend_8_pixels(mask, dst, sc, dst_scale)                            \
    if (mask & 0x80) { dst[0] = sc + SkAlphaMulQ(dst[0], dst_scale); }      \
    if (mask & 0x40) { dst[1] = sc + SkAlphaMulQ(dst[1], dst_scale); }      \
    if (mask & 0x20) { dst[2] = sc + SkAlphaMulQ(dst[2], dst_scale); }      \
    if (mask & 0x10) { dst[3] = sc + SkAlphaMulQ(dst[3], dst_scale); }      \
    if (mask & 0x08) { dst[4] = sc + SkAlphaMulQ(dst[4], dst_scale); }      \
    if (mask & 0x04) { dst[5] = sc + SkAlphaMulQ(dst[5], dst_scale); }      \
    if (mask & 0x02) { dst[6] = sc + SkAlphaMulQ(dst[6], dst_scale); }      \
    if (mask & 0x01) { dst[7] = sc + SkAlphaMulQ(dst[7], dst_scale); }      \
#define SK_BLITBWMASK_NAME                  SkARGB32_BlendBW
#define SK_BLITBWMASK_ARGS                  , uint32_t sc, unsigned dst_scale
#define SK_BLITBWMASK_BLIT8(mask, dst)      blend_8_pixels(mask, dst, sc, dst_scale)
#define SK_BLITBWMASK_GETADDR               writable_addr32
#define SK_BLITBWMASK_DEVTYPE               uint32_t
            SK_ABORT("Mask format not handled.");

            SK_ABORT("Mask format not handled.");

    if (alpha == 0 || fSrcA == 0) {

#pragma warning ( pop )
                                       const int16_t runs[]) {
    int count = runs[0];

        unsigned aa = antialias[0];

            unsigned dst_scale = 256 - aa;

    fShadeDirectlyIntoDevice =

    if (fShadeDirectlyIntoDevice) {

    if (fShadeDirectlyIntoDevice) {
        shaderContext->shadeSpan(x, y, span, width);

                                          const int16_t runs[]) {
        int aa = *antialias;

            shaderContext->shadeSpan(x, y, span, count);

        int aa = *antialias;

            shaderContext->shadeSpan(x, y, span, count);
        U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
        return sk_bit_cast<U32>(kernel(sk_bit_cast<U8x4>(dst),
                                       sk_bit_cast<U8x4>(src),
    auto cov = (const uint8_t*)mask;

        alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);

    auto cov = (const uint8_t*)mask;

        return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>(c)
                           + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
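
    // For reference, a minimal scalar sketch of the coverage lerp that the
    // vector expression above computes per byte (the helper name here is made
    // up; the real code stays in skvx vectors and uses skvx::div255):
    //
    //     static inline uint8_t lerp_by_coverage(uint8_t s, uint8_t d, uint8_t c) {
    //         return (uint8_t)((s * c + d * (255 - c)) / 255);
    //     }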
    auto src_alpha_blend = [](int s, int d, int sa, int m) {

    auto upscale_31_to_255 = [](int v) {
        // Replicate the top bits to expand a 5-bit value (0..31) to 8 bits (0..255).
        return (v << 3) | (v >> 2);
    };

    auto mask = (const uint16_t*)vmask;

    for (int i = 0; i < n; ++i) {
        uint16_t m = mask[i];

        maskR = upscale_31_to_255(maskR);
        maskG = upscale_31_to_255(maskG);
        maskB = upscale_31_to_255(maskB);
    auto mask = (const uint16_t*)vmask;

    for (int i = 0; i < n; ++i) {
        uint16_t m = mask[i];

    const int x = clip.fLeft;

        const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);

        blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);

    if (fShadeDirectlyIntoDevice) {

        proc(device, span, 1, alpha);