#ifndef SkBlitRow_opts_DEFINED
#define SkBlitRow_opts_DEFINED

// Project includes for SkPMColor, SkASSERT, sk_msan_assert_initialized, skvx::Vec and mull()
// (paths per the current Skia tree layout).
#include "include/private/SkColorData.h"
#include "src/base/SkMSAN.h"
#include "src/base/SkVx.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    #include <immintrin.h>

    static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
        // Pull each pixel's src alpha into the low byte of both of that pixel's 16-bit lanes.
        // A shuffle index with its high bit set makes _mm256_shuffle_epi8 write a zero byte.
        const int _ = -1;
        __m256i srcA_x2 = _mm256_shuffle_epi8(src,
                _mm256_setr_epi8(3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_,
                                 3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_));
        __m256i scale_x2 = _mm256_sub_epi16(_mm256_set1_epi16(256), srcA_x2);

        // Scale red and blue, which sit in the low byte of each 16-bit lane.
        __m256i rb = _mm256_and_si256(_mm256_set1_epi32(0x00ff00ff), dst);
        rb = _mm256_mullo_epi16(rb, scale_x2);
        rb = _mm256_srli_epi16 (rb, 8);

        // Scale green and alpha, which sit in the high byte of each 16-bit lane.
        __m256i ga = _mm256_srli_epi16(dst, 8);
        ga = _mm256_mullo_epi16(ga, scale_x2);
        ga = _mm256_andnot_si256(_mm256_set1_epi32(0x00ff00ff), ga);

        return _mm256_adds_epu8(src, _mm256_or_si256(rb, ga));
    }
#endif
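// Illustration only (the helper name below is ours, not Skia API): the scalar form of the
// blend that the SIMD paths in this file vectorize. Exact srcover for premultiplied pixels
// is dst' = src + dst*(255 - srcA)/255 per channel; the vector paths use the cheaper scale
// (256 - srcA) with a >>8, split the R/B and G/A bytes into separate 16-bit lanes, and add
// back to src with per-byte saturation.
static inline uint32_t srcover_scalar_sketch(uint32_t src, uint32_t dst) {
    uint32_t scale = 256 - (src >> 24);                              // 256 - srcAlpha
    uint32_t rb = (((dst & 0x00ff00ff) * scale) >> 8) & 0x00ff00ff;  // red and blue
    uint32_t ga = (((dst >> 8) & 0x00ff00ff) * scale) & 0xff00ff00;  // green and alpha
    return src + (rb | ga);  // plain add; valid premultiplied inputs cannot overflow a byte
}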
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>

    static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
        // scale = 256 - srcAlpha, copied into both 16-bit halves of each pixel.
        __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256),
                                      _mm_srli_epi32(src, 24));
        __m128i scale_x2 = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);

        // Scale red and blue, which sit in the low byte of each 16-bit lane.
        __m128i rb = _mm_and_si128(_mm_set1_epi32(0x00ff00ff), dst);
        rb = _mm_mullo_epi16(rb, scale_x2);
        rb = _mm_srli_epi16(rb, 8);

        // Scale green and alpha, which sit in the high byte of each 16-bit lane.
        __m128i ga = _mm_srli_epi16(dst, 8);
        ga = _mm_mullo_epi16(ga, scale_x2);
        ga = _mm_andnot_si128(_mm_set1_epi32(0x00ff00ff), ga);

        return _mm_adds_epu8(src, _mm_or_si128(rb, ga));
    }
#endif
#if defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>

    // SkMulDiv255Round() applied to each 8-bit lane: the rounded product x*y/255.
    static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
        uint16x8_t prod = vmull_u8(x, y);
        return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
    }

    // Planar variant: each dst.val[i] / src.val[i] holds one color component of 8 pixels.
    static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
        uint8x8_t nalphas = vmvn_u8(src.val[3]);    // 255 - srcAlpha
        return {
            vqadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas, dst.val[0])),
            vqadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas, dst.val[1])),
            vqadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas, dst.val[2])),
            vqadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas, dst.val[3])),
        };
    }

    // Interleaved variant: dst and src each hold the color components of two consecutive pixels.
    static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
        const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
        uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
        return vqadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
    }
#endif
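// Illustration only (the function name is ours): the per-lane rounding SkMulDiv255Round_neon8
// computes. vraddhn_u16(prod, vrshrq_n_u16(prod, 8)) narrows
// (prod + ((prod + 128) >> 8) + 128) >> 8, which is the exact rounded division prod/255 for
// any prod in [0, 255*255].
static inline uint8_t mul_div_255_round_scalar(uint8_t x, uint8_t y) {
    unsigned prod = (unsigned)x * (unsigned)y + 128;
    return (uint8_t)((prod + (prod >> 8)) >> 8);
}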
#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    #include <lasxintrin.h>

    static inline __m256i SkPMSrcOver_LASX(const __m256i& src, const __m256i& dst) {
        // scale = 256 - srcAlpha, copied into both 16-bit halves of each pixel.
        __m256i val = __lasx_xvreplgr2vr_w(256);
        __m256i scale = __lasx_xvsub_w(val, __lasx_xvsrli_w(src, 24));
        __m256i scale_x2 = __lasx_xvor_v(__lasx_xvslli_w(scale, 16), scale);

        // Scale red and blue, which sit in the low byte of each 16-bit lane.
        val = __lasx_xvreplgr2vr_w(0x00ff00ff);
        __m256i rb = __lasx_xvand_v(val, dst);
        rb = __lasx_xvmul_h(rb, scale_x2);
        rb = __lasx_xvsrli_h(rb, 8);

        // Scale green and alpha, which sit in the high byte of each 16-bit lane.
        __m256i ga = __lasx_xvsrli_h(dst, 8);
        ga = __lasx_xvmul_h(ga, scale_x2);
        ga = __lasx_xvandn_v(val, ga);

        return __lasx_xvsadd_bu(src, __lasx_xvor_v(rb, ga));
    }
#endif
#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>

    static inline __m128i SkPMSrcOver_LSX(const __m128i& src, const __m128i& dst) {
        // scale = 256 - srcAlpha, copied into both 16-bit halves of each pixel.
        __m128i val = __lsx_vreplgr2vr_w(256);
        __m128i scale = __lsx_vsub_w(val, __lsx_vsrli_w(src, 24));
        __m128i scale_x2 = __lsx_vor_v(__lsx_vslli_w(scale, 16), scale);

        // Scale red and blue, which sit in the low byte of each 16-bit lane.
        val = __lsx_vreplgr2vr_w(0x00ff00ff);
        __m128i rb = __lsx_vand_v(val, dst);
        rb = __lsx_vmul_h(rb, scale_x2);
        rb = __lsx_vsrli_h(rb, 8);

        // Scale green and alpha, which sit in the high byte of each 16-bit lane.
        __m128i ga = __lsx_vsrli_h(dst, 8);
        ga = __lsx_vmul_h(ga, scale_x2);
        ga = __lsx_vandn_v(val, ga);

        return __lsx_vsadd_bu(src, __lsx_vor_v(rb, ga));
    }
#endif
namespace SK_OPTS_NS {

// Blend a row of premultiplied 8888 src pixels over dst. The blit itself is fully opaque
// (alpha must be 0xFF), so per-pixel coverage comes only from each src pixel's own alpha.
/*not static*/
inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);
    sk_msan_assert_initialized(src, src + len);

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    while (len >= 8) {
        _mm256_storeu_si256((__m256i*)dst,
                            SkPMSrcOver_AVX2(_mm256_loadu_si256((const __m256i*)src),
                                             _mm256_loadu_si256((const __m256i*)dst)));
        src += 8; dst += 8; len -= 8;
    }
#endif

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    while (len >= 4) {
        _mm_storeu_si128((__m128i*)dst,
                         SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src),
                                          _mm_loadu_si128((const __m128i*)dst)));
        src += 4; dst += 4; len -= 4;
    }
#endif

#if defined(SK_ARM_HAS_NEON)
    while (len >= 8) {
        vst4_u8((uint8_t*)dst, SkPMSrcOver_neon8(vld4_u8((const uint8_t*)dst),
                                                 vld4_u8((const uint8_t*)src)));
        src += 8; dst += 8; len -= 8;
    }

    while (len >= 2) {
        vst1_u8((uint8_t*)dst, SkPMSrcOver_neon2(vld1_u8((const uint8_t*)dst),
                                                 vld1_u8((const uint8_t*)src)));
        src += 2; dst += 2; len -= 2;
    }

    if (len != 0) {
        uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)*dst),
                                             vcreate_u8((uint64_t)*src));
        vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
    }
    return;
#endif

#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    while (len >= 8) {
        __lasx_xvst(SkPMSrcOver_LASX(__lasx_xvld(src, 0),
                                     __lasx_xvld(dst, 0)), (__m256i*)dst, 0);
        src += 8; dst += 8; len -= 8;
    }
#endif

#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    while (len >= 4) {
        __lsx_vst(SkPMSrcOver_LSX(__lsx_vld(src, 0),
                                  __lsx_vld(dst, 0)), (__m128i*)dst, 0);
        src += 4; dst += 4; len -= 4;
    }
#endif

    // Scalar tail for whatever pixels remain.
    while (len --> 0) {
        *dst = SkPMSrcOver(*src, *dst);
        src++;
        dst++;
    }
}
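// Usage sketch (illustration only; this helper is hypothetical, not part of Skia): blend one
// row of premultiplied 8888 source pixels over a destination row. The alpha argument must be
// 0xFF here; blit_row_s32a_opaque asserts it.
static inline void blit_one_opaque_row_example(SkPMColor* dstRow, const SkPMColor* srcRow, int width) {
    blit_row_s32a_opaque(dstRow, srcRow, width, 0xFF);
}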
// Blend a single color (with its own alpha) over count dst pixels.
/*not static*/
inline void blit_row_color32(SkPMColor* dst, int count, SkPMColor color) {
    constexpr int N = 4;  // 8, 16 are also reasonable choices
    using U32 = skvx::Vec<  N, uint32_t>;
    using U16 = skvx::Vec<4*N, uint16_t>;
    using U8  = skvx::Vec<4*N, uint8_t>;

    auto kernel = [color](U32 src) {
        unsigned invA = 255 - SkGetPackedA32(color);
        invA += invA >> 7;
        SkASSERT(0 < invA && invA < 256);  // 1 <= invA <= 255, depending on color's alpha

        // (src * invA + (color << 8) + 128) >> 8, per byte; everything fits in 16 bits.
        U8 s = sk_bit_cast<U8>(src),
           a = U8(invA);
        U16 c = skvx::cast<uint16_t>(sk_bit_cast<U8>(U32(color))),
            d = (mull(s,a) + (c << 8) + 128) >> 8;
        return sk_bit_cast<U32>(skvx::cast<uint8_t>(d));
    };

    while (count >= N) {
        kernel(U32::Load(dst)).store(dst);
        dst += N; count -= N;
    }
    while (count --> 0) {
        *dst = kernel(U32{*dst})[0];
        dst++;
    }
}

}  // namespace SK_OPTS_NS

#endif  // SkBlitRow_opts_DEFINED