#ifndef SkBlitMask_opts_DEFINED
#define SkBlitMask_opts_DEFINED
#include "include/private/SkColorData.h"
#include "src/core/Sk4px.h"

#if defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

#if defined(SK_ARM_HAS_NEON)
    // The Sk4px versions at the bottom of this file also build with NEON,
    // but this NEON-specific code has measured faster for these blits.
    #define NEON_A (SK_A32_SHIFT / 8)
    #define NEON_R (SK_R32_SHIFT / 8)
    #define NEON_G (SK_G32_SHIFT / 8)
    #define NEON_B (SK_B32_SHIFT / 8)
    static inline uint16x8_t SkAlpha255To256_neon8(uint8x8_t alpha) {
        return vaddw_u8(vdupq_n_u16(1), alpha);
    }
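    // Note: vaddw_u8(vdupq_n_u16(1), alpha) widens alpha and adds 1, mapping [0,255] to
    // [1,256].  With scale = alpha + 1, a later >>8 approximates division by 255 — the
    // same trick the scalar SkAlpha255To256() uses.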
    static inline uint8x8_t SkAlphaMul_neon8(uint8x8_t color, uint16x8_t scale) {
        return vshrn_n_u16(vmovl_u8(color) * scale, 8);
    }
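    // Per lane this is (color * scale) >> 8: vmovl_u8 widens each channel byte to 16 bits,
    // the product stays within 16 bits (255 * 256 = 65280), and vshrn_n_u16 narrows the
    // shifted result back to 8 bits.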
    static inline uint8x8x4_t SkAlphaMulQ_neon8(uint8x8x4_t color, uint16x8_t scale) {
        uint8x8x4_t ret;

        ret.val[0] = SkAlphaMul_neon8(color.val[0], scale);
        ret.val[1] = SkAlphaMul_neon8(color.val[1], scale);
        ret.val[2] = SkAlphaMul_neon8(color.val[2], scale);
        ret.val[3] = SkAlphaMul_neon8(color.val[3], scale);

        return ret;
    }
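    // Scales all four planes of a deinterleaved 8x4 pixel block by one common 9-bit scale;
    // the black blitter below uses this to apply (256 - mask) to every channel at once.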
    template <bool isColor>
    static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
                                         const void* SK_RESTRICT maskPtr, size_t maskRB,
                                         SkColor color, int width, int height) {
        SkPMColor pmc = SkPreMultiplyColor(color);
        SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
        const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
        uint8x8x4_t vpmc;

        ptrdiff_t mask_adjust = (ptrdiff_t)maskRB - width;
        dstRB -= (width << 2);

        if (width >= 8) {
            vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc));
            vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc));
            vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc));
            vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc));
        }
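        // Row bookkeeping: mask_adjust and the adjusted dstRB are the bytes left over in
        // each mask/dst row after `width` pixels; both are added back at the end of a row.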
        do {
            int w = width;
            while (w >= 8) {
                uint8x8_t vmask = vld1_u8(mask);
                uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask);
                if (isColor) {
                    vscale = vsubw_u8(vdupq_n_u16(256),
                                      SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256));
                } else {
                    vscale = vsubw_u8(vdupq_n_u16(256), vmask);
                }
                uint8x8x4_t vdev = vld4_u8((uint8_t*)device);

                vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)
                                 + SkAlphaMul_neon8(vdev.val[NEON_A], vscale);
                vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256)
                                 + SkAlphaMul_neon8(vdev.val[NEON_R], vscale);
                vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256)
                                 + SkAlphaMul_neon8(vdev.val[NEON_G], vscale);
                vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256)
                                 + SkAlphaMul_neon8(vdev.val[NEON_B], vscale);

                vst4_u8((uint8_t*)device, vdev);

                mask += 8; device += 8; w -= 8;
            }
            while (w-- > 0) {
                unsigned aa = *mask++;
                if (isColor) {
                    *device = SkBlendARGB32(pmc, *device, aa);
                } else {
                    *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa))
                            + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
                }
                device += 1;
            }
            device = (SkPMColor*)((char*)device + dstRB);
            mask += mask_adjust;
        } while (--height != 0);
    }
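    // Reference math for the loop above, with m = mask coverage in [0,255] and m' = m + 1:
    //   isColor:  dst' = pmc*m'/256 + dst*(256 - pmc.a*m'/256)/256
    //   !isColor: dst' = pmc*m'/256 + dst*(256 - m)/256
    // i.e. SRC-OVER with the source pre-scaled by the mask coverage; the scalar tail
    // computes the same thing one pixel at a time.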
    static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB,
                                         const SkAlpha* mask, size_t maskRB,
                                         SkColor color, int w, int h) {
        D32_A8_Opaque_Color_neon<true>(dst, dstRB, mask, maskRB, color, w, h);
    }
    // As above, but made slightly simpler by requiring that color is opaque.
    static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB,
                                        const SkAlpha* mask, size_t maskRB,
                                        SkColor color, int w, int h) {
        D32_A8_Opaque_Color_neon<false>(dst, dstRB, mask, maskRB, color, w, h);
    }
    // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case.
    static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB,
                                       const SkAlpha* maskPtr, size_t maskRB,
                                       int width, int height) {
        SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
        const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;

        ptrdiff_t mask_adjust = (ptrdiff_t)maskRB - width;
        dstRB -= (width << 2);
        do {
            int w = width;
            while (w >= 8) {
                uint8x8_t vmask = vld1_u8(mask);
                uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask);
                uint8x8x4_t vdevice = vld4_u8((uint8_t*)device);

                vdevice = SkAlphaMulQ_neon8(vdevice, vscale);
                vdevice.val[NEON_A] += vmask;

                vst4_u8((uint8_t*)device, vdevice);

                mask += 8; device += 8; w -= 8;
            }
            while (w-- > 0) {
                unsigned aa = *mask++;
                *device = (aa << SK_A32_SHIFT)
                        + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
                device += 1;
            }
            device = (SkPMColor*)((char*)device + dstRB);
            mask += mask_adjust;
        } while (--height != 0);
    }
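    // With premultiplied black (0,0,0,255) as the source, SRC-OVER collapses to
    // d*(1-aa) in the color channels plus aa in the alpha channel, which is exactly
    // SkAlphaMulQ_neon8 followed by the add into val[NEON_A] above.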
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>
    static __m128i SkAlphaMul_lsx(__m128i x, __m128i y) {
        __m128i tmp = __lsx_vmul_h(x, y);
        __m128i mask = __lsx_vreplgr2vr_h(0xff00);
        return __lsx_vsrlri_h(__lsx_vand_v(tmp, mask), 8);
    }
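    // Equivalent to (x * y) >> 8 per 16-bit lane: the product of an 8-bit value and a
    // 9-bit scale fits in 16 bits, the 0xff00 mask keeps its high byte, and the rounding
    // shift then moves that byte down (the rounding bit is zero after the mask).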
    template <bool isColor>
    static void D32_A8_Opaque_Color_lsx(void* SK_RESTRICT dst, size_t dstRB,
                                        const void* SK_RESTRICT maskPtr, size_t maskRB,
                                        SkColor color, int width, int height) {
        SkPMColor pmc = SkPreMultiplyColor(color);
        SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
        const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
        __m128i vpmc_b = __lsx_vldi(0);
        __m128i vpmc_g = __lsx_vldi(0);
        __m128i vpmc_r = __lsx_vldi(0);
        __m128i vpmc_a = __lsx_vldi(0);

        ptrdiff_t mask_adjust = (ptrdiff_t)maskRB - width;
        dstRB -= (width << 2);

        if (width >= 8) {
            vpmc_b = __lsx_vreplgr2vr_h(SkGetPackedB32(pmc));
            vpmc_g = __lsx_vreplgr2vr_h(SkGetPackedG32(pmc));
            vpmc_r = __lsx_vreplgr2vr_h(SkGetPackedR32(pmc));
            vpmc_a = __lsx_vreplgr2vr_h(SkGetPackedA32(pmc));
        }
        const __m128i zeros = __lsx_vldi(0);
        __m128i planar = __lsx_vldi(0);
        planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
        planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
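        // The planar shuffle control gathers byte 0 of each pixel, then byte 1, byte 2,
        // and byte 3 (indices 0,4,8,12 / 1,5,9,13 / 2,6,10,14 / 3,7,11,15), turning four
        // interleaved BGRA pixels per register into planar bbbb/gggg/rrrr/aaaa groups.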
        do {
            int w = width;
            while (w >= 8) {
                __m128i lo = __lsx_vld(device, 0);      // bgra bgra bgra bgra
                __m128i hi = __lsx_vld(device, 16);     // BGRA BGRA BGRA BGRA
                lo = __lsx_vshuf_b(zeros, lo, planar);  // bbbb gggg rrrr aaaa
                hi = __lsx_vshuf_b(zeros, hi, planar);  // BBBB GGGG RRRR AAAA
                __m128i bg = __lsx_vilvl_w(hi, lo),     // bbbb BBBB gggg GGGG
                        ra = __lsx_vilvh_w(hi, lo);     // rrrr RRRR aaaa AAAA

                __m128i b = __lsx_vilvl_b(zeros, bg),   // widen each channel to 16 bits
                        g = __lsx_vilvh_b(zeros, bg),
                        r = __lsx_vilvl_b(zeros, ra),
                        a = __lsx_vilvh_b(zeros, ra);

                __m128i vmask = __lsx_vld(mask, 0);
                vmask = __lsx_vilvl_b(zeros, vmask);
                __m128i vscale, vmask256 = __lsx_vadd_h(vmask, __lsx_vreplgr2vr_h(1));

                if (isColor) {
                    __m128i tmp = SkAlphaMul_lsx(vpmc_a, vmask256);
                    vscale = __lsx_vsub_h(__lsx_vreplgr2vr_h(256), tmp);
                } else {
                    vscale = __lsx_vsub_h(__lsx_vreplgr2vr_h(256), vmask);
                }

                b = __lsx_vadd_h(SkAlphaMul_lsx(vpmc_b, vmask256), SkAlphaMul_lsx(b, vscale));
                g = __lsx_vadd_h(SkAlphaMul_lsx(vpmc_g, vmask256), SkAlphaMul_lsx(g, vscale));
                r = __lsx_vadd_h(SkAlphaMul_lsx(vpmc_r, vmask256), SkAlphaMul_lsx(r, vscale));
                a = __lsx_vadd_h(SkAlphaMul_lsx(vpmc_a, vmask256), SkAlphaMul_lsx(a, vscale));

                bg = __lsx_vor_v(b, __lsx_vslli_h(g, 8));  // bgbg bgbg BGBG BGBG
                ra = __lsx_vor_v(r, __lsx_vslli_h(a, 8));  // rara rara RARA RARA
                lo = __lsx_vilvl_h(ra, bg);                // bgra bgra bgra bgra
                hi = __lsx_vilvh_h(ra, bg);                // BGRA BGRA BGRA BGRA

                __lsx_vst(lo, device, 0);
                __lsx_vst(hi, device, 16);

                mask += 8; device += 8; w -= 8;
            }
            while (w-- > 0) {
                unsigned aa = *mask++;
                if (isColor) {
                    *device = SkBlendARGB32(pmc, *device, aa);
                } else {
                    *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa))
                            + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
                }
                device += 1;
            }
            device = (SkPMColor*)((char*)device + dstRB);
            mask += mask_adjust;
        } while (--height != 0);
    }
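    // The vor/vslli/vilv sequence at the bottom of the loop is the inverse of the planar
    // shuffle at the top: it re-interleaves the widened b/g/r/a planes back into BGRA
    // byte order before storing.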
    static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB,
                                         const SkAlpha* mask, size_t maskRB,
                                         SkColor color, int w, int h) {
        D32_A8_Opaque_Color_lsx<true>(dst, dstRB, mask, maskRB, color, w, h);
    }
    // As above, but made slightly simpler by requiring that color is opaque.
    static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB,
                                        const SkAlpha* mask, size_t maskRB,
                                        SkColor color, int w, int h) {
        D32_A8_Opaque_Color_lsx<false>(dst, dstRB, mask, maskRB, color, w, h);
    }
    // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case.
    static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB,
                                       const SkAlpha* maskPtr, size_t maskRB,
                                       int width, int height) {
        SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
        const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;

        ptrdiff_t mask_adjust = (ptrdiff_t)maskRB - width;
        dstRB -= (width << 2);
        const __m128i zeros = __lsx_vldi(0);
        __m128i planar = __lsx_vldi(0);
        planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
        planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
        do {
            int w = width;
            while (w >= 8) {
                __m128i vmask = __lsx_vld(mask, 0);
                vmask = __lsx_vilvl_b(zeros, vmask);
                __m128i vscale = __lsx_vsub_h(__lsx_vreplgr2vr_h(256), vmask);
                __m128i lo = __lsx_vld(device, 0);      // bgra bgra bgra bgra
                __m128i hi = __lsx_vld(device, 16);     // BGRA BGRA BGRA BGRA
                lo = __lsx_vshuf_b(zeros, lo, planar);  // bbbb gggg rrrr aaaa
                hi = __lsx_vshuf_b(zeros, hi, planar);  // BBBB GGGG RRRR AAAA
                __m128i bg = __lsx_vilvl_w(hi, lo),
                        ra = __lsx_vilvh_w(hi, lo);

                __m128i b = __lsx_vilvl_b(zeros, bg),
                        g = __lsx_vilvh_b(zeros, bg),
                        r = __lsx_vilvl_b(zeros, ra),
                        a = __lsx_vilvh_b(zeros, ra);

                b = SkAlphaMul_lsx(b, vscale);
                g = SkAlphaMul_lsx(g, vscale);
                r = SkAlphaMul_lsx(r, vscale);
                a = __lsx_vadd_h(SkAlphaMul_lsx(a, vscale), vmask);

                bg = __lsx_vor_v(b, __lsx_vslli_h(g, 8));
                ra = __lsx_vor_v(r, __lsx_vslli_h(a, 8));
                lo = __lsx_vilvl_h(ra, bg);
                hi = __lsx_vilvh_h(ra, bg);

                __lsx_vst(lo, device, 0);
                __lsx_vst(hi, device, 16);

                mask += 8; device += 8; w -= 8;
            }
            while (w-- > 0) {
                unsigned aa = *mask++;
                *device = (aa << SK_A32_SHIFT)
                        + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
                device += 1;
            }
            device = (SkPMColor*)((char*)device + dstRB);
            mask += mask_adjust;
        } while (--height != 0);
    }
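    // This mirrors the NEON black path: b/g/r scale by (256 - aa) while alpha additionally
    // gains the mask, giving dst' = dst*(1-aa) with alpha' = aa + dst.a*(1-aa).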
#else
    static void blit_mask_d32_a8_general(SkPMColor* dst, size_t dstRB,
                                         const SkAlpha* mask, size_t maskRB,
                                         SkColor color, int w, int h) {
        auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color));
        auto fn = [&](const Sk4px& d, const Sk4px& aa) {
            //  = (s + d(1-sa))aa + d(1-aa)
            //  = s*aa + d(1-sa*aa)
            auto left  = s.approxMulDiv255(aa),
                 right = d.approxMulDiv255(left.alphas().inv());
            return left + right;
        };
        while (h --> 0) {
            Sk4px::MapDstAlpha(w, dst, mask, fn);
            dst  +=  dstRB / sizeof(*dst);
            mask += maskRB / sizeof(*mask);
        }
    }
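    // Sk4px::MapDstAlpha walks one row, applying fn to destination pixels and mask bytes
    // in vector-width groups with a scalar tail, so these lambdas only spell out the
    // per-pixel blend math.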
    // As above, but made slightly simpler by requiring that color is opaque.
    static void blit_mask_d32_a8_opaque(SkPMColor* dst, size_t dstRB,
                                        const SkAlpha* mask, size_t maskRB,
                                        SkColor color, int w, int h) {
        SkASSERT(SkColorGetA(color) == 0xFF);
        auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color));
        auto fn = [&](const Sk4px& d, const Sk4px& aa) {
            // Since the color is opaque, sa*aa == aa:
            //  = s*aa + d(1-aa)
            return s.approxMulDiv255(aa) + d.approxMulDiv255(aa.inv());
        };
        while (h --> 0) {
            Sk4px::MapDstAlpha(w, dst, mask, fn);
            dst  +=  dstRB / sizeof(*dst);
            mask += maskRB / sizeof(*mask);
        }
    }
    // Same as _opaque, but assumes color == SK_ColorBLACK, a very common and even simpler case.
    static void blit_mask_d32_a8_black(SkPMColor* dst, size_t dstRB,
                                       const SkAlpha* mask, size_t maskRB,
                                       int w, int h) {
        auto fn = [](const Sk4px& d, const Sk4px& aa) {
            // a = aa + d(1-aa)
            // c =      d(1-aa)
            return (aa & Sk4px(skvx::byte16{0,0,0,255, 0,0,0,255, 0,0,0,255, 0,0,0,255}))
                 + d.approxMulDiv255(aa.inv());
        };
        while (h --> 0) {
            Sk4px::MapDstAlpha(w, dst, mask, fn);
            dst  +=  dstRB / sizeof(*dst);
            mask += maskRB / sizeof(*mask);
        }
    }
#endif
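// The entry point below dispatches on color to the cheapest specialization.  A minimal
// usage sketch (buffer names and sizes here are hypothetical, not from this file):
//
//   SkPMColor pixels[64 * 64];       // BGRA8888 destination, rowBytes = 64*4
//   uint8_t   coverage[64 * 64];     // A8 mask, rowBytes = 64
//   SK_OPTS_NS::blit_mask_d32_a8(pixels, 64 * sizeof(SkPMColor),
//                                coverage, 64, SK_ColorRED, 64, 64);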
/*not static*/ inline void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB,
                                            const SkAlpha* mask, size_t maskRB,
                                            SkColor color, int w, int h) {
    if (color == SK_ColorBLACK) {
        blit_mask_d32_a8_black(dst, dstRB, mask, maskRB, w, h);
    } else if (SkColorGetA(color) == 0xFF) {
        blit_mask_d32_a8_opaque(dst, dstRB, mask, maskRB, color, w, h);
    } else {
        blit_mask_d32_a8_general(dst, dstRB, mask, maskRB, color, w, h);
    }
}

}  // namespace SK_OPTS_NS

#endif  // SkBlitMask_opts_DEFINED