SkBlitRow_opts.h
/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkBlitRow_opts_DEFINED
#define SkBlitRow_opts_DEFINED

#include "include/private/SkColorData.h"
#include "src/base/SkMSAN.h"
#include "src/base/SkVx.h"

// Helpers for blit_row_s32a_opaque(),
// then blit_row_s32a_opaque() itself,
// then unrelated blit_row_color32() at the bottom.
//
// To keep Skia resistant to timing attacks, it's important not to branch on pixel data.
// In particular, don't be tempted to [v]ptest, pmovmskb, etc. to branch on the source alpha.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    #include <immintrin.h>

    static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
        // Abstractly srcover is
        //     b = s + d*(1-srcA)
        //
        // In terms of unorm8 bytes, that works out to
        //     b = s + (d*(255-srcA) + 127) / 255
        //
        // But we approximate that to within a bit with
        //     b = s + (d*(255-srcA) + d) / 256
        // a.k.a
        //     b = s + (d*(256-srcA)) >> 8

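        // A quick sanity check of that approximation at the endpoints:
        //     srcA == 0x00: scale is 256, so (d*256) >> 8 == d, and b = s + d
        //                   (s is 0 for valid premultiplied input).
        //     srcA == 0xff: scale is 1, so (d*1) >> 8 == 0, and b = s.
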
        // The bottleneck of this math is the multiply, and we want to do it as
        // narrowly as possible, here getting inputs into 16-bit lanes and
        // using 16-bit multiplies.  We can do twice as many multiplies at once
        // as using naive 32-bit multiplies, and on top of that, the 16-bit multiplies
        // are themselves a couple cycles quicker.  Win-win.
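        // (Concretely, one _mm256_mullo_epi16 performs 16 multiplies per
        // instruction, where the 32-bit _mm256_mullo_epi32 performs only 8.)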

        // We'll get everything in 16-bit lanes for two multiplies, one
        // handling dst red and blue, the other green and alpha.  (They're
        // conveniently 16-bits apart, you see.)  We don't need the individual
        // src channels beyond alpha until the very end when we do the "s + "
        // add, and we don't even need to unpack them; the adds cannot overflow.

        // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel.
        const int _ = -1;   // fills a literal 0 byte.
        __m256i srcA_x2 = _mm256_shuffle_epi8(src,
                _mm256_setr_epi8(3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_,
                                 3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_));
        __m256i scale_x2 = _mm256_sub_epi16(_mm256_set1_epi16(256),
                                            srcA_x2);
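        // Each 16-bit lane of scale_x2 now holds (256 - srcA) for its pixel,
        // the same value in both the red/blue and green/alpha halves.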

        // Scale red and blue, leaving results in the low byte of each 16-bit lane.
        __m256i rb = _mm256_and_si256(_mm256_set1_epi32(0x00ff00ff), dst);
        rb = _mm256_mullo_epi16(rb, scale_x2);
        rb = _mm256_srli_epi16 (rb, 8);

        // Scale green and alpha, leaving results in the high byte, masking off the low bits.
        __m256i ga = _mm256_srli_epi16(dst, 8);
        ga = _mm256_mullo_epi16(ga, scale_x2);
        ga = _mm256_andnot_si256(_mm256_set1_epi32(0x00ff00ff), ga);

        return _mm256_adds_epu8(src, _mm256_or_si256(rb, ga));
    }
#endif

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>

    // The same math as SkPMSrcOver_AVX2() above, four pixels at a time.
    static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
        // scale = 256 - srcA in each 32-bit lane...
        __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256),
                                      _mm_srli_epi32(src, 24));
        // ...duplicated into both 16-bit halves of that lane.
        __m128i scale_x2 = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);

        // Scale red and blue, leaving results in the low byte of each 16-bit lane.
        __m128i rb = _mm_and_si128(_mm_set1_epi32(0x00ff00ff), dst);
        rb = _mm_mullo_epi16(rb, scale_x2);
        rb = _mm_srli_epi16(rb, 8);

        // Scale green and alpha, leaving results in the high byte, masking off the low bits.
        __m128i ga = _mm_srli_epi16(dst, 8);
        ga = _mm_mullo_epi16(ga, scale_x2);
        ga = _mm_andnot_si128(_mm_set1_epi32(0x00ff00ff), ga);

        return _mm_adds_epu8(src, _mm_or_si128(rb, ga));
    }
#endif

#if defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>

    // SkMulDiv255Round() applied to each lane,
    // i.e. ((x*y + 128) + ((x*y + 128) >> 8)) >> 8, a rounded divide by 255.
    static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
        uint16x8_t prod = vmull_u8(x, y);
        return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
    }

    static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
        uint8x8_t nalphas = vmvn_u8(src.val[3]);   // 255 - alpha
        return {
            vqadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas, dst.val[0])),
            vqadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas, dst.val[1])),
            vqadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas, dst.val[2])),
            vqadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas, dst.val[3])),
        };
    }

    // Variant assuming dst and src contain the color components of two consecutive pixels.
    static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
        // Broadcast each pixel's alpha (bytes 3 and 7) across its four bytes.
        const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
        uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
        return vqadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
    }

#endif

#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    #include <lasxintrin.h>

    // The same math as SkPMSrcOver_SSE2() above, eight pixels at a time.
    static inline __m256i SkPMSrcOver_LASX(const __m256i& src, const __m256i& dst) {
        __m256i val = __lasx_xvreplgr2vr_w(256);
        __m256i scale = __lasx_xvsub_w(val, __lasx_xvsrli_w(src, 24));
        __m256i scale_x2 = __lasx_xvor_v(__lasx_xvslli_w(scale, 16), scale);

        val = __lasx_xvreplgr2vr_w(0x00ff00ff);
        __m256i rb = __lasx_xvand_v(val, dst);
        rb = __lasx_xvmul_h(rb, scale_x2);
        rb = __lasx_xvsrli_h(rb, 8);

        __m256i ga = __lasx_xvsrli_h(dst, 8);
        ga = __lasx_xvmul_h(ga, scale_x2);
        ga = __lasx_xvandn_v(val, ga);

        return __lasx_xvsadd_bu(src, __lasx_xvor_v(rb, ga));
    }
#endif

#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>

    // The same math as SkPMSrcOver_SSE2() above, four pixels at a time.
    static inline __m128i SkPMSrcOver_LSX(const __m128i& src, const __m128i& dst) {
        __m128i val = __lsx_vreplgr2vr_w(256);
        __m128i scale = __lsx_vsub_w(val, __lsx_vsrli_w(src, 24));
        __m128i scale_x2 = __lsx_vor_v(__lsx_vslli_w(scale, 16), scale);

        val = __lsx_vreplgr2vr_w(0x00ff00ff);
        __m128i rb = __lsx_vand_v(val, dst);
        rb = __lsx_vmul_h(rb, scale_x2);
        rb = __lsx_vsrli_h(rb, 8);

        __m128i ga = __lsx_vsrli_h(dst, 8);
        ga = __lsx_vmul_h(ga, scale_x2);
        ga = __lsx_vandn_v(val, ga);

        return __lsx_vsadd_bu(src, __lsx_vor_v(rb, ga));
    }
#endif

namespace SK_OPTS_NS {

/*not static*/
inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);
    sk_msan_assert_initialized(src, src+len);

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    // Blend 8 pixels (32 bytes) at a time.
    while (len >= 8) {
        _mm256_storeu_si256((__m256i*)dst,
                            SkPMSrcOver_AVX2(_mm256_loadu_si256((const __m256i*)src),
                                             _mm256_loadu_si256((const __m256i*)dst)));
        src += 8;
        dst += 8;
        len -= 8;
    }
#endif

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    // Blend 4 pixels (16 bytes) at a time.
    while (len >= 4) {
        _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src),
                                                         _mm_loadu_si128((const __m128i*)dst)));
        src += 4;
        dst += 4;
        len -= 4;
    }
#endif

#if defined(SK_ARM_HAS_NEON)
    // Blend 8 pixels at a time, deinterleaving the four channels into planar registers.
    while (len >= 8) {
        vst4_u8((uint8_t*)dst, SkPMSrcOver_neon8(vld4_u8((const uint8_t*)dst),
                                                 vld4_u8((const uint8_t*)src)));
        src += 8;
        dst += 8;
        len -= 8;
    }

    // Then 2 interleaved pixels at a time.
    while (len >= 2) {
        vst1_u8((uint8_t*)dst, SkPMSrcOver_neon2(vld1_u8((const uint8_t*)dst),
                                                 vld1_u8((const uint8_t*)src)));
        src += 2;
        dst += 2;
        len -= 2;
    }

    // And finally the last odd pixel, if any.
    if (len != 0) {
        uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)*dst),
                                             vcreate_u8((uint64_t)*src));
        vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
    }
    return;   // The NEON path has handled the entire row; skip the scalar tail.
#endif

#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    // Blend 8 pixels (32 bytes) at a time.
    while (len >= 8) {
        __lasx_xvst(SkPMSrcOver_LASX(__lasx_xvld(src, 0),
                                     __lasx_xvld(dst, 0)), (__m256i*)dst, 0);
        src += 8;
        dst += 8;
        len -= 8;
    }
#endif

#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    // Blend 4 pixels (16 bytes) at a time.
    while (len >= 4) {
        __lsx_vst(SkPMSrcOver_LSX(__lsx_vld(src, 0),
                                  __lsx_vld(dst, 0)), (__m128i*)dst, 0);
        src += 4;
        dst += 4;
        len -= 4;
    }
#endif

    // Scalar tail for whatever the vector loops above left over.
    while (len --> 0) {
        *dst = SkPMSrcOver(*src, *dst);
        src++;
        dst++;
    }
}
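
// A minimal usage sketch (hypothetical buffers; both rows hold premultiplied
// 32-bit SkPMColor pixels, and alpha must be 0xFF per the assert above):
//     SkPMColor dst[64], src[64];
//     /* ...fill both rows... */
//     SK_OPTS_NS::blit_row_s32a_opaque(dst, src, 64, 0xFF);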

// Blend constant color over count dst pixels
/*not static*/
inline void blit_row_color32(SkPMColor* dst, int count, SkPMColor color) {
    constexpr int N = 4;  // 8, 16 also reasonable choices
    using U32 = skvx::Vec<  N, uint32_t>;
    using U16 = skvx::Vec<4*N, uint16_t>;
    using U8  = skvx::Vec<4*N, uint8_t>;

    auto kernel = [color](U32 src) {
        unsigned invA = 255 - SkGetPackedA32(color);
        invA += invA >> 7;   // Nudge invA towards invA*256/255, so >>8 below approximates /255.
        SkASSERT(0 < invA && invA < 256);   // We handle alpha == 0 or alpha == 255 specially.

        // (src * invA + (color << 8) + 128) >> 8
        // Should all fit in 16 bits.
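        // (Why it fits: premultiplication guarantees each channel of color is at
        // most its alpha, and 255*invA + (colorA << 8) + 128 maxes out at exactly
        // 65535, when colorA == 127.)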
        U8 s = sk_bit_cast<U8>(src),
           a = U8(invA);
        U16 c = skvx::cast<uint16_t>(sk_bit_cast<U8>(U32(color))),
            d = (mull(s,a) + (c << 8) + 128)>>8;
        return sk_bit_cast<U32>(skvx::cast<uint8_t>(d));
    };
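
    // Worked single-channel check, assuming color 0x80404040 (a 50%-alpha gray)
    // over dst 0xffffffff: invA = 127 + 0 = 127, and per color channel
    //     (255*127 + (0x40 << 8) + 128) >> 8 = (32385 + 16384 + 128) >> 8 = 191,
    // matching the exact 0x40 + round(255*127/255) = 64 + 127 = 191.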

    while (count >= N) {
        kernel(U32::Load(dst)).store(dst);
        dst += N;
        count -= N;
    }
    while (count --> 0) {
        *dst = kernel(U32{*dst})[0];
        dst++;
    }
}
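
// Hypothetical usage sketch (callers special-case alpha 0 and 255, per the
// assert in the kernel above):
//     SkPMColor row[32] = { /* ... */ };
//     SK_OPTS_NS::blit_row_color32(row, 32, SkPackARGB32(0x80, 0x40, 0x40, 0x40));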

}  // namespace SK_OPTS_NS

#endif//SkBlitRow_opts_DEFINED