        // srcA_x2: copy each pixel's alpha (byte 3 of every 4-byte pixel) into the
        // low byte of both 16-bit halves of that pixel; `_` shuffles in a zero byte.
        __m256i srcA_x2 = _mm256_shuffle_epi8(src,
                _mm256_setr_epi8(3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_,
                                 3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_));
        __m256i scale_x2 = _mm256_sub_epi16(_mm256_set1_epi16(256),
                                            srcA_x2);

        // Scale red and blue, leaving the results in the low byte of each 16-bit lane.
        __m256i rb = _mm256_and_si256(_mm256_set1_epi32(0x00ff00ff), dst);
        rb = _mm256_mullo_epi16(rb, scale_x2);
        rb = _mm256_srli_epi16(rb, 8);

        // Scale green and alpha, leaving the results in the high byte of each
        // 16-bit lane, masking off the low bytes.
        __m256i ga = _mm256_srli_epi16(dst, 8);
        ga = _mm256_mullo_epi16(ga, scale_x2);
        ga = _mm256_andnot_si256(_mm256_set1_epi32(0x00ff00ff), ga);

        return _mm256_adds_epu8(src, _mm256_or_si256(rb, ga));
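        // For reference, every SIMD helper in this file computes the same per-pixel
        // approximation of premultiplied srcover:
        //     dst' = src + (dst * (256 - srcA)) >> 8   (per 8-bit channel)
        // Worked example: src = 0x80204060 (srcA = 0x80, scale = 128), dst = 0xffffffff:
        // every dst channel becomes (0xff * 128) >> 8 = 0x7f, so dst' = 0xff9fbfdf.
        // A scalar sketch of that math; the name SkPMSrcOver_portable is
        // hypothetical, not something this file defines:
        static inline uint32_t SkPMSrcOver_portable(uint32_t src, uint32_t dst) {
            uint32_t scale = 256 - (src >> 24);  // srcA lives in the top byte
            // R and B stay in the low bytes of their 16-bit halves through the multiply.
            uint32_t rb = (((dst & 0x00ff00ff) * scale) >> 8) & 0x00ff00ff;
            // A and G are shifted down first, then scaled the same way.
            uint32_t ag = ((((dst >> 8) & 0x00ff00ff) * scale) >> 8) & 0x00ff00ff;
            // Valid premultiplied input can't overflow a channel here; the vector
            // versions still use saturating adds as cheap insurance.
            return src + (rb | (ag << 8));
        }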
        // SSE2 lacks pshufb, so build scale = 256 - srcA with 32-bit ops, then
        // duplicate it into both 16-bit halves of each pixel.
        __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256),
                                      _mm_srli_epi32(src, 24));
        __m128i scale_x2 = _mm_or_si128(_mm_slli_epi32(scale, 16),
                                        scale);

        // Scale red and blue, leaving the results in the low byte of each 16-bit lane.
        __m128i rb = _mm_and_si128(_mm_set1_epi32(0x00ff00ff), dst);
        rb = _mm_mullo_epi16(rb, scale_x2);
        rb = _mm_srli_epi16(rb, 8);

        // Scale green and alpha, leaving the results in the high byte of each
        // 16-bit lane, masking off the low bytes.
        __m128i ga = _mm_srli_epi16(dst, 8);
        ga = _mm_mullo_epi16(ga, scale_x2);
        ga = _mm_andnot_si128(_mm_set1_epi32(0x00ff00ff), ga);

        return _mm_adds_epu8(src, _mm_or_si128(rb, ga));
        // scale = 256 - srcA, duplicated into both 16-bit halves of each pixel.
        __m256i val = __lasx_xvreplgr2vr_w(256);
        __m256i scale = __lasx_xvsub_w(val, __lasx_xvsrli_w(src, 24));
        __m256i scale_x2 = __lasx_xvor_v(__lasx_xvslli_w(scale, 16),
                                         scale);

        // Reuse val as the red/blue channel mask.
        val = __lasx_xvreplgr2vr_w(0x00ff00ff);
        __m256i rb = __lasx_xvand_v(val, dst);
        rb = __lasx_xvmul_h(rb, scale_x2);
        rb = __lasx_xvsrli_h(rb, 8);

        // Scale green and alpha; xvandn_v keeps ~val & ga, i.e. only the high bytes.
        __m256i ga = __lasx_xvsrli_h(dst, 8);
        ga = __lasx_xvmul_h(ga, scale_x2);
        ga = __lasx_xvandn_v(val, ga);

        return __lasx_xvsadd_bu(src, __lasx_xvor_v(rb, ga));
        // 128-bit LSX version of the same blend, four pixels at a time.
        __m128i val = __lsx_vreplgr2vr_w(256);
        __m128i scale = __lsx_vsub_w(val, __lsx_vsrli_w(src, 24));
        __m128i scale_x2 = __lsx_vor_v(__lsx_vslli_w(scale, 16),
                                       scale);

        val = __lsx_vreplgr2vr_w(0x00ff00ff);
        __m128i rb = __lsx_vand_v(val, dst);
        rb = __lsx_vmul_h(rb, scale_x2);
        rb = __lsx_vsrli_h(rb, 8);

        __m128i ga = __lsx_vsrli_h(dst, 8);
        ga = __lsx_vmul_h(ga, scale_x2);
        ga = __lsx_vandn_v(val, ga);

        return __lsx_vsadd_bu(src, __lsx_vor_v(rb, ga));
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    while (len >= 8) {
        _mm256_storeu_si256((__m256i*)dst,
                            SkPMSrcOver_AVX2(_mm256_loadu_si256((const __m256i*)src),
                                             _mm256_loadu_si256((const __m256i*)dst)));
        src += 8; dst += 8; len -= 8;
    }
#endif
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    while (len >= 4) {
        _mm_storeu_si128((__m128i*)dst,
                         SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src),
                                          _mm_loadu_si128((const __m128i*)dst)));
        src += 4; dst += 4; len -= 4;
    }
#endif
#if defined(SK_ARM_HAS_NEON)
    // Blend 8 pixels at a time in planar form (vld4/vst4 deinterleave and
    // reinterleave the channels).
    while (len >= 8) {
        vst4_u8((uint8_t*)dst, SkPMSrcOver_neon8(vld4_u8((const uint8_t*)dst),
                                                 vld4_u8((const uint8_t*)src)));
        src += 8; dst += 8; len -= 8;
    }

    // Then 2 pixels at a time, interleaved.
    while (len >= 2) {
        vst1_u8((uint8_t*)dst, SkPMSrcOver_neon2(vld1_u8((const uint8_t*)dst),
                                                 vld1_u8((const uint8_t*)src)));
        src += 2; dst += 2; len -= 2;
    }

    // And finally one pixel, written back with a single 32-bit lane store so we
    // never write past the end of the row.
    if (len != 0) {
        uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)*dst),
                                             vcreate_u8((uint64_t)*src));
        vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
    }
    return;
#endif
#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    while (len >= 8) {
        __lasx_xvst(SkPMSrcOver_LASX(__lasx_xvld(src, 0),
                                     __lasx_xvld(dst, 0)), (__m256i*)dst, 0);
        src += 8; dst += 8; len -= 8;
    }
#endif
#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    while (len >= 4) {
        __lsx_vst(SkPMSrcOver_LSX(__lsx_vld(src, 0),
                                  __lsx_vld(dst, 0)), (__m128i*)dst, 0);
        src += 4; dst += 4; len -= 4;
    }
#endif
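    // Any pixels left over after the vector loops (or the whole row when no SIMD
    // path is compiled in) fall through to a scalar pass. A minimal sketch of that
    // tail, assuming the scalar SkPMSrcOver(src, dst) helper is in scope:
    while (len --> 0) {
        *dst = SkPMSrcOver(*src, *dst);
        src++;
        dst++;
    }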