// Flutter Engine
// The Flutter Engine
// SkBlitter_ARGB32.cpp
// (Listing taken from the generated documentation page for this file.)
/*
 * Copyright 2006 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
13#include "include/core/SkRect.h"
20#include "src/base/SkUtils.h"
21#include "src/base/SkVx.h"
22#include "src/core/SkBlitMask.h"
23#include "src/core/SkBlitRow.h"
25#include "src/core/SkMask.h"
26#include "src/core/SkMemset.h"
28
29#include <algorithm>
30#include <cstddef>
31#include <cstdint>
32
33static inline int upscale_31_to_32(int value) {
34 SkASSERT((unsigned)value <= 31);
35 return value + (value >> 4);
36}
37
38static inline int blend_32(int src, int dst, int scale) {
39 SkASSERT((unsigned)src <= 0xFF);
40 SkASSERT((unsigned)dst <= 0xFF);
41 SkASSERT((unsigned)scale <= 32);
42 return dst + ((src - dst) * scale >> 5);
43}
44
45static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
46 SkPMColor dst, uint16_t mask) {
47 if (mask == 0) {
48 return dst;
49 }
50
51 /* We want all of these in 5bits, hence the shifts in case one of them
52 * (green) is 6bits.
53 */
54 int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
55 int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
56 int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
57
58 // Now upscale them to 0..32, so we can use blend32
59 maskR = upscale_31_to_32(maskR);
60 maskG = upscale_31_to_32(maskG);
61 maskB = upscale_31_to_32(maskB);
62
63 // srcA has been upscaled to 256 before passed into this function
64 maskR = maskR * srcA >> 8;
65 maskG = maskG * srcA >> 8;
66 maskB = maskB * srcA >> 8;
67
68 int dstA = SkGetPackedA32(dst);
69 int dstR = SkGetPackedR32(dst);
70 int dstG = SkGetPackedG32(dst);
71 int dstB = SkGetPackedB32(dst);
72
73 // Subtract 1 from srcA to bring it back to [0-255] to compare against dstA, alpha needs to
74 // use either the min or the max of the LCD coverages. See https:/skbug.com/40037823
75 int maskA = (srcA-1) < dstA ? std::min(maskR, std::min(maskG, maskB))
76 : std::max(maskR, std::max(maskG, maskB));
77
78 return SkPackARGB32(blend_32(0xFF, dstA, maskA),
79 blend_32(srcR, dstR, maskR),
80 blend_32(srcG, dstG, maskG),
81 blend_32(srcB, dstB, maskB));
82}
83
84static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
85 SkPMColor dst, uint16_t mask,
86 SkPMColor opaqueDst) {
87 if (mask == 0) {
88 return dst;
89 }
90
91 if (0xFFFF == mask) {
92 return opaqueDst;
93 }
94
95 /* We want all of these in 5bits, hence the shifts in case one of them
96 * (green) is 6bits.
97 */
98 int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
99 int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
100 int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
101
102 // Now upscale them to 0..32, so we can use blend32
103 maskR = upscale_31_to_32(maskR);
104 maskG = upscale_31_to_32(maskG);
105 maskB = upscale_31_to_32(maskB);
106
107 int dstA = SkGetPackedA32(dst);
108 int dstR = SkGetPackedR32(dst);
109 int dstG = SkGetPackedG32(dst);
110 int dstB = SkGetPackedB32(dst);
111
112 // Opaque src alpha always uses the max of the LCD coverages.
113 int maskA = std::max(maskR, std::max(maskG, maskB));
114
115 // LCD blitting is only supported if the dst is known/required
116 // to be opaque
117 return SkPackARGB32(blend_32(0xFF, dstA, maskA),
118 blend_32(srcR, dstR, maskR),
119 blend_32(srcG, dstG, maskG),
120 blend_32(srcB, dstB, maskB));
121}
122
123
124// TODO: rewrite at least the SSE code here. It's miserable.
125
126#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
127 #include <emmintrin.h>
128
// Shift amounts that move the top five bits of each RGB16 mask channel onto the
// low five bits of the matching SkPMColor channel. A positive amount is a left
// shift, a negative amount a right shift. The RGB16 channel order need not match
// the SkPMColor channel order, so each channel gets its own amount.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// The helpers below apply the per-channel shift to every 32-bit lane of x.
// The result still contains the other channels' bits; callers mask it afterwards.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
159
160 static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
161 // In the following comments, the components of src, dst and mask are
162 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
163 // by an R, G, B, or A suffix. Components of one of the four pixels that
164 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
165 // example is the blue channel of the second destination pixel. Memory
166 // layout is shown for an ARGB byte order in a color value.
167
168 // src and srcA store 8-bit values interleaved with zeros.
169 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
170 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
171 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
172 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
173 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
174 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
175 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
176
177 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
178 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
179 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
180 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
181
182 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
183 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
184 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
185
186 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
187 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
188 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
189
190 // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
191 __m128i aMin = _mm_min_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
192 _mm_min_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
193 _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
194 __m128i aMax = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
195 _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
196 _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
197 // srcA has been biased to [0-256], so compare srcA against (dstA+1)
198 __m128i a = _mm_cmplt_epi32(srcA,
199 _mm_and_si128(
200 _mm_add_epi32(dst, _mm_set1_epi32(1 << SK_A32_SHIFT)),
201 _mm_set1_epi32(SK_A32_MASK)));
202 // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
203 a = _mm_or_si128(_mm_and_si128(a, aMin), _mm_andnot_si128(a, aMax));
204
205 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
206 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
207 // 8-bit position
208 // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
209 // m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
210 mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));
211
212 // Interleave R,G,B into the lower byte of word.
213 // i.e. split the sixteen 8-bit values from mask into two sets of eight
214 // 16-bit values, padded by zero.
215 __m128i maskLo, maskHi;
216 // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
217 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
218 // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
219 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
220
221 // Upscale from 0..31 to 0..32
222 // (allows to replace division by left-shift further down)
223 // Left-shift each component by 4 and add the result back to that component,
224 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
225 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
226 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
227
228 // Multiply each component of maskLo and maskHi by srcA
229 maskLo = _mm_mullo_epi16(maskLo, srcA);
230 maskHi = _mm_mullo_epi16(maskHi, srcA);
231
232 // Left shift mask components by 8 (divide by 256)
233 maskLo = _mm_srli_epi16(maskLo, 8);
234 maskHi = _mm_srli_epi16(maskHi, 8);
235
236 // Interleave R,G,B into the lower byte of the word
237 // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
238 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
239 // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
240 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
241
242 // mask = (src - dst) * mask
243 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
244 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
245
246 // mask = (src - dst) * mask >> 5
247 maskLo = _mm_srai_epi16(maskLo, 5);
248 maskHi = _mm_srai_epi16(maskHi, 5);
249
250 // Add two pixels into result.
251 // result = dst + ((src - dst) * mask >> 5)
252 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
253 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
254
255 // Pack into 4 32bit dst pixels.
256 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
257 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
258 // clamping to 255 if necessary.
259 return _mm_packus_epi16(resultLo, resultHi);
260 }
261
262 static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
263 // In the following comments, the components of src, dst and mask are
264 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
265 // by an R, G, B, or A suffix. Components of one of the four pixels that
266 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
267 // example is the blue channel of the second destination pixel. Memory
268 // layout is shown for an ARGB byte order in a color value.
269
270 // src and srcA store 8-bit values interleaved with zeros.
271 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
272 // mask stores 16-bit values (shown as high and low bytes) interleaved with
273 // zeros
274 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
275 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
276
277 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
278 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
279 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
280 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
281
282 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
283 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
284 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
285
286 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
287 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
288 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
289
290 // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
291 __m128i a = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
292 _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
293 _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
294
295 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
296 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
297 // 8-bit position
298 // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
299 // m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
300 mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));
301
302 // Interleave R,G,B into the lower byte of word.
303 // i.e. split the sixteen 8-bit values from mask into two sets of eight
304 // 16-bit values, padded by zero.
305 __m128i maskLo, maskHi;
306 // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
307 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
308 // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
309 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
310
311 // Upscale from 0..31 to 0..32
312 // (allows to replace division by left-shift further down)
313 // Left-shift each component by 4 and add the result back to that component,
314 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
315 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
316 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
317
318 // Interleave R,G,B into the lower byte of the word
319 // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
320 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
321 // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
322 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
323
324 // mask = (src - dst) * mask
325 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
326 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
327
328 // mask = (src - dst) * mask >> 5
329 maskLo = _mm_srai_epi16(maskLo, 5);
330 maskHi = _mm_srai_epi16(maskHi, 5);
331
332 // Add two pixels into result.
333 // result = dst + ((src - dst) * mask >> 5)
334 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
335 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
336
337 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
338 // clamping to 255 if necessary.
339 return _mm_packus_epi16(resultLo, resultHi);
340 }
341
342 void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
343 if (width <= 0) {
344 return;
345 }
346
347 int srcA = SkColorGetA(src);
348 int srcR = SkColorGetR(src);
349 int srcG = SkColorGetG(src);
350 int srcB = SkColorGetB(src);
351
352 srcA = SkAlpha255To256(srcA);
353
354 if (width >= 4) {
355 SkASSERT(((size_t)dst & 0x03) == 0);
356 while (((size_t)dst & 0x0F) != 0) {
357 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
358 mask++;
359 dst++;
360 width--;
361 }
362
363 __m128i *d = reinterpret_cast<__m128i*>(dst);
364 // Set alpha to 0xFF and replicate source four times in SSE register.
365 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
366 // Interleave with zeros to get two sets of four 16-bit values.
367 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
368 // Set srcA_sse to contain eight copies of srcA, padded with zero.
369 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
370 __m128i srcA_sse = _mm_set1_epi16(srcA);
371 while (width >= 4) {
372 // Load four destination pixels into dst_sse.
373 __m128i dst_sse = _mm_load_si128(d);
374 // Load four 16-bit masks into lower half of mask_sse.
375 __m128i mask_sse = _mm_loadu_si64(mask);
376
377 // Check whether masks are equal to 0 and get the highest bit
378 // of each byte of result, if masks are all zero, we will get
379 // pack_cmp to 0xFFFF
380 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
381 _mm_setzero_si128()));
382
383 // if mask pixels are not all zero, we will blend the dst pixels
384 if (pack_cmp != 0xFFFF) {
385 // Unpack 4 16bit mask pixels to
386 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
387 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
388 mask_sse = _mm_unpacklo_epi16(mask_sse,
389 _mm_setzero_si128());
390
391 // Process 4 32bit dst pixels
392 __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
393 _mm_store_si128(d, result);
394 }
395
396 d++;
397 mask += 4;
398 width -= 4;
399 }
400
401 dst = reinterpret_cast<SkPMColor*>(d);
402 }
403
404 while (width > 0) {
405 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
406 mask++;
407 dst++;
408 width--;
409 }
410 }
411
412 void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
413 SkColor src, int width, SkPMColor opaqueDst) {
414 if (width <= 0) {
415 return;
416 }
417
418 int srcR = SkColorGetR(src);
419 int srcG = SkColorGetG(src);
420 int srcB = SkColorGetB(src);
421
422 if (width >= 4) {
423 SkASSERT(((size_t)dst & 0x03) == 0);
424 while (((size_t)dst & 0x0F) != 0) {
425 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
426 mask++;
427 dst++;
428 width--;
429 }
430
431 __m128i *d = reinterpret_cast<__m128i*>(dst);
432 // Set alpha to 0xFF and replicate source four times in SSE register.
433 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
434 // Set srcA_sse to contain eight copies of srcA, padded with zero.
435 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
436 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
437 while (width >= 4) {
438 // Load four destination pixels into dst_sse.
439 __m128i dst_sse = _mm_load_si128(d);
440 // Load four 16-bit masks into lower half of mask_sse.
441 __m128i mask_sse = _mm_loadu_si64(mask);
442
443 // Check whether masks are equal to 0 and get the highest bit
444 // of each byte of result, if masks are all zero, we will get
445 // pack_cmp to 0xFFFF
446 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
447 _mm_setzero_si128()));
448
449 // if mask pixels are not all zero, we will blend the dst pixels
450 if (pack_cmp != 0xFFFF) {
451 // Unpack 4 16bit mask pixels to
452 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
453 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
454 mask_sse = _mm_unpacklo_epi16(mask_sse,
455 _mm_setzero_si128());
456
457 // Process 4 32bit dst pixels
458 __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
459 _mm_store_si128(d, result);
460 }
461
462 d++;
463 mask += 4;
464 width -= 4;
465 }
466
467 dst = reinterpret_cast<SkPMColor*>(d);
468 }
469
470 while (width > 0) {
471 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
472 mask++;
473 dst++;
474 width--;
475 }
476 }
477
478#elif defined(SK_ARM_HAS_NEON)
479 #include <arm_neon.h>
480
481 #define NEON_A (SK_A32_SHIFT / 8)
482 #define NEON_R (SK_R32_SHIFT / 8)
483 #define NEON_G (SK_G32_SHIFT / 8)
484 #define NEON_B (SK_B32_SHIFT / 8)
485
486 static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
487 int16x8_t src_wide, dst_wide;
488
489 src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
490 dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));
491
492 src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);
493
494 dst_wide += vshrq_n_s16(src_wide, 5);
495
496 return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
497 }
498
499 void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
500 SkColor color, int width,
501 SkPMColor opaqueDst) {
502 int colR = SkColorGetR(color);
503 int colG = SkColorGetG(color);
504 int colB = SkColorGetB(color);
505
506 uint8x8_t vcolA = vdup_n_u8(0xFF);
507 uint8x8_t vcolR = vdup_n_u8(colR);
508 uint8x8_t vcolG = vdup_n_u8(colG);
509 uint8x8_t vcolB = vdup_n_u8(colB);
510
511 while (width >= 8) {
512 uint8x8x4_t vdst;
513 uint16x8_t vmask;
514 uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;
515
516 vdst = vld4_u8((uint8_t*)dst);
517 vmask = vld1q_u16(src);
518
519 // Get all the color masks on 5 bits
520 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
521 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
523 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
524
525 // Upscale to 0..32
526 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
527 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
528 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
529 // Opaque srcAlpha always uses the max of the 3 LCD coverage values
530 vmaskA = vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB));
531
532 vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
533 vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
534 vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
535 vdst.val[NEON_A] = blend_32_neon(vcolA, vdst.val[NEON_A], vmaskA);
536
537 vst4_u8((uint8_t*)dst, vdst);
538
539 dst += 8;
540 src += 8;
541 width -= 8;
542 }
543
544 // Leftovers
545 for (int i = 0; i < width; i++) {
546 dst[i] = blend_lcd16_opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
547 }
548 }
549
550 void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
552 int colA = SkColorGetA(color);
553 int colR = SkColorGetR(color);
554 int colG = SkColorGetG(color);
555 int colB = SkColorGetB(color);
556
557 // srcA in [0-255] to compare vs dstA
558 uint16x8_t vcolACmp = vdupq_n_u16(colA);
559 colA = SkAlpha255To256(colA);
560
561 uint16x8_t vcolA = vdupq_n_u16(colA); // srcA in [0-256] to combine with coverage
562 uint8x8_t vcolR = vdup_n_u8(colR);
563 uint8x8_t vcolG = vdup_n_u8(colG);
564 uint8x8_t vcolB = vdup_n_u8(colB);
565
566 while (width >= 8) {
567 uint8x8x4_t vdst;
568 uint16x8_t vmask;
569 uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;
570
571 vdst = vld4_u8((uint8_t*)dst);
572 vmask = vld1q_u16(src);
573
574 // Get all the color masks on 5 bits
575 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
576 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
578 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
579
580 // Upscale to 0..32
581 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
582 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
583 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
584
585 vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
586 vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
587 vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
588
589 // Select either the min or the max of the RGB mask values, depending on if the src
590 // alpha is less than the dst alpha.
591 vmaskA = vbslq_u16(vcleq_u16(vcolACmp, vmovl_u8(vdst.val[NEON_A])), // srcA < dstA
592 vminq_u16(vmaskR, vminq_u16(vmaskG, vmaskB)), // ? min(r,g,b)
593 vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB))); // : max(r,g,b)
594
595 vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
596 vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
597 vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
598 // vmaskA already includes vcolA so blend against 0xFF
599 vdst.val[NEON_A] = blend_32_neon(vdup_n_u8(0xFF), vdst.val[NEON_A], vmaskA);
600 vst4_u8((uint8_t*)dst, vdst);
601
602 dst += 8;
603 src += 8;
604 width -= 8;
605 }
606
607 for (int i = 0; i < width; i++) {
608 dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
609 }
610 }
611
612#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
613
// Shift amounts that move the top five bits of each RGB16 mask channel onto the
// low five bits of the matching SkPMColor channel. A positive amount is a left
// shift, a negative amount a right shift. The RGB16 channel order need not match
// the SkPMColor channel order, so each channel gets its own amount.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// The helpers below apply the per-channel shift to every 32-bit lane of x.
// The result still contains the other channels' bits; callers mask it afterwards.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvslli_w(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvslli_w(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvslli_w(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (x)
#endif
644
645 static __m256i blend_lcd16_lasx(__m256i &src, __m256i &dst, __m256i &mask, __m256i &srcA) {
646 // In the following comments, the components of src, dst and mask are
647 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
648 // by an R, G, B, or A suffix. Components of one of the four pixels that
649 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
650 // example is the blue channel of the second destination pixel. Memory
651 // layout is shown for an ARGB byte order in a color value.
652
653 // src and srcA store 8-bit values interleaved with zeros.
654 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
655 // 0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
656 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
657 // srcA, 0, srcA, 0, srcA, 0, srcA, 0,
658 // srcA, 0, srcA, 0, srcA, 0, srcA, 0,
659 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
660 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
661 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
662 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
663 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
664 // m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
665 // m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
666
667 __m256i xv_zero = __lasx_xvldi(0);
668
669 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
670 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
671 // 0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
672 __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
673 __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));
674
675 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
676 // 0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7R, 0)
677 __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
678 __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));
679
680 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
681 // 0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
682 __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
683 __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));
684
685 // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
686 __m256i aMin = __lasx_xvmin_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
687 __lasx_xvmin_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
688 __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
689 __m256i aMax = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
690 __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
691 __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
692 // srcA has been biased to [0-256], so compare srcA against (dstA+1)
693 __m256i a = __lasx_xvmskltz_w(srcA -
694 __lasx_xvand_v(
695 __lasx_xvadd_w(dst,
696 __lasx_xvreplgr2vr_w(1 << SK_A32_SHIFT)),
697 __lasx_xvreplgr2vr_w(SK_A32_MASK)));
698 // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
699 a = __lasx_xvor_v(__lasx_xvand_v(a, aMin), __lasx_xvandn_v(a, aMax));
700
701 // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3)
702 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
703 // 8-bit position
704 // mask = (m0A, m0R, m0G, m0B, m1R, m1R, m1G, m1B,
705 // m2A, m2R, m2G, m2B, m3R, m3R, m3G, m3B,
706 // m4A, m4R, m4G, m4B, m5R, m5R, m5G, m5B,
707 // m6A, m6R, m6G, m6B, m7R, m7R, m7G, m7B)
708 mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));
709
710 // Interleave R,G,B into the lower byte of word.
711 // i.e. split the sixteen 8-bit values from mask into two sets of sixteen
712 // 16-bit values, padded by zero.
713 __m256i maskLo, maskHi;
714 // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
715 // m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
716 maskLo = __lasx_xvilvl_b(xv_zero, mask);
717 // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
718 // m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
719 maskHi = __lasx_xvilvh_b(xv_zero, mask);
720
721 // Upscale from 0..31 to 0..32
722 // (allows to replace division by left-shift further down)
723 // Left-shift each component by 4 and add the result back to that component,
724 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
725 maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
726 maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));
727
728 // Multiply each component of maskLo and maskHi by srcA
729 maskLo = __lasx_xvmul_h(maskLo, srcA);
730 maskHi = __lasx_xvmul_h(maskHi, srcA);
731
732 // Left shift mask components by 8 (divide by 256)
733 maskLo = __lasx_xvsrli_h(maskLo, 8);
734 maskHi = __lasx_xvsrli_h(maskHi, 8);
735
736 // Interleave R,G,B into the lower byte of the word
737 // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
738 // d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
739 __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
740 // dstLo = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0)
741 // d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
742 __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);
743
744 // mask = (src - dst) * mask
745 maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
746 maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));
747
748 // mask = (src - dst) * mask >> 5
749 maskLo = __lasx_xvsrai_h(maskLo, 5);
750 maskHi = __lasx_xvsrai_h(maskHi, 5);
751
752 // Add two pixels into result.
753 // result = dst + ((src - dst) * mask >> 5)
754 __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
755 __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);
756
757 // Pack into 8 32bit dst pixels.
758 // resultLo and resultHi contain sixteen 16-bit components (four pixels) each.
759 // Merge into one LASX regsiter with 32 8-bit values (eight pixels),
760 // clamping to 255 if necessary.
761 __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
762 __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
763 return __lasx_xvpickev_b(tmph, tmpl);
764 }
765
// Blends an opaque source color into eight destination pixels using per-channel
// LCD16 coverage, and returns the eight blended 32-bit pixels.
//   src  - source color widened to 16-bit lanes (layout shown below); only read.
//   dst  - eight packed 32-bit destination pixels; only read.
//   mask - eight 16-bit LCD masks, one per 32-bit lane; overwritten here as scratch,
//          so the caller's vector is clobbered.
766 static __m256i blend_lcd16_opaque_lasx(__m256i &src, __m256i &dst, __m256i &mask) {
767 // In the following comments, the components of src, dst and mask are
768 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
769 // by an R, G, B, or A suffix. Components of one of the eight pixels that
770 // are processed in parallel are marked with 0 through 7. "d1B", for
771 // example is the blue channel of the second destination pixel. Memory
772 // layout is shown for an ARGB byte order in a color value.
773
774 // src stores 8-bit values interleaved with zeros.
775 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
776 // 0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
777 // mask stores 16-bit values (shown as high and low bytes) interleaved with
778 // zeros
779 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
780 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
781 // m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
782 // m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
783
784 __m256i xv_zero = __lasx_xvldi(0);
785
786 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
787 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
788 // 0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
789 __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
790 __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));
791
792 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0,
793 // 0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7G, 0)
794 __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
795 __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));
796
797 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B,
798 // 0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
799 __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
800 __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));
801
802 // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
803 __m256i a = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
804 __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
805 __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
806
807 // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3,
808 // p4, p5, p6, p7)
809 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
810 // 8-bit position
811 // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
812 // m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B,
813 // m4A, m4R, m4G, m4B, m5A, m5R, m5G, m5B,
814 // m6A, m6R, m6G, m6B, m7A, m7R, m7G, m7B)
815 mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));
816
817 // Interleave R,G,B into the lower byte of word.
818 // i.e. split the 32 8-bit values from mask into two sets of sixteen
819 // 16-bit values, padded by zero.
820 __m256i maskLo, maskHi;
821 // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
822 // m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
823 maskLo = __lasx_xvilvl_b(xv_zero, mask);
824 // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
825 // m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
826 maskHi = __lasx_xvilvh_b(xv_zero, mask);
827
828 // Upscale from 0..31 to 0..32
829 // (allows to replace division by a shift further down)
830 // Right-shift each component by 4 and add the result back to that component,
831 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
832 maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
833 maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));
834
835 // Interleave R,G,B into the lower byte of the word
836 // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0,
837 // d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
838 __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
839 // dstHi = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0,
840 // d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
841 __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);
842
843 // mask = (src - dst) * mask
844 maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
845 maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));
846
847 // mask = (src - dst) * mask >> 5
848 // (arithmetic shift, because src - dst may be negative)
849 maskLo = __lasx_xvsrai_h(maskLo, 5);
850 maskHi = __lasx_xvsrai_h(maskHi, 5);
851
852 // Add the scaled difference back onto dst.
853 // result = dst + ((src - dst) * mask >> 5)
854 __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
855 __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);
856
857 // Merge into one LASX register with 32 8-bit values (eight pixels),
858 // clamping to 255 if necessary.
859 __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
860 __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
861
862 return __lasx_xvpickev_b(tmph, tmpl);
863 }
863
864 void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
865 if (width <= 0) {
866 return;
867 }
868
869 int srcA = SkColorGetA(src);
870 int srcR = SkColorGetR(src);
871 int srcG = SkColorGetG(src);
872 int srcB = SkColorGetB(src);
873 __m256i xv_zero = __lasx_xvldi(0);
874
875 srcA = SkAlpha255To256(srcA);
876 if (width >= 8) {
877 SkASSERT(((size_t)dst & 0x03) == 0);
878 while (((size_t)dst & 0x0F) != 0) {
879 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
880 mask++;
881 dst++;
882 width--;
883 }
884
885 __m256i *d = reinterpret_cast<__m256i*>(dst);
886 // Set alpha to 0xFF and replicate source eight times in LASX register.
887 unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
888 __m256i src_lasx = __lasx_xvreplgr2vr_w(skpackargb32);
889 // Interleave with zeros to get two sets of eight 16-bit values.
890 src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);
891 // Set srcA_lasx to contain sixteen copies of srcA, padded with zero.
892 // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
893 // 0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
894 __m256i srcA_lasx = __lasx_xvreplgr2vr_h(srcA);
895
896 while (width >= 8) {
897 // Load eight destination pixels into dst_lasx.
898 __m256i dst_lasx = __lasx_xvld(d, 0);
899 // Load eight 16-bit masks into lower half of mask_lasx.
900 __m256i mask_lasx = __lasx_xvld(mask, 0);
901 mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};
902
903 int pack_cmp = __lasx_xbz_v(mask_lasx);
904 // if mask pixels are not all zero, we will blend the dst pixels
905 if (pack_cmp != 1) {
906 // Unpack 8 16bit mask pixels to
907 // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
908 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
909 // m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
910 // m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
911 mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);
912
913 // Process 8 32bit dst pixels
914 __m256i result = blend_lcd16_lasx(src_lasx, dst_lasx, mask_lasx, srcA_lasx);
915 __lasx_xvst(result, d, 0);
916 }
917 d++;
918 mask += 8;
919 width -= 8;
920 }
921 dst = reinterpret_cast<SkPMColor*>(d);
922 }
923
924 while (width > 0) {
925 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
926 mask++;
927 dst++;
928 width--;
929 }
930 }
931
932 void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
933 SkColor src, int width, SkPMColor opaqueDst) {
934 if (width <= 0) {
935 return;
936 }
937
938 int srcR = SkColorGetR(src);
939 int srcG = SkColorGetG(src);
940 int srcB = SkColorGetB(src);
941 __m256i xv_zero = __lasx_xvldi(0);
942
943 if (width >= 8) {
944 SkASSERT(((size_t)dst & 0x03) == 0);
945 while (((size_t)dst & 0x0F) != 0) {
946 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
947 mask++;
948 dst++;
949 width--;
950 }
951
952 __m256i *d = reinterpret_cast<__m256i*>(dst);
953 // Set alpha to 0xFF and replicate source four times in LASX register.
954 unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
955 __m256i src_lasx = __lasx_xvreplgr2vr_w(sk_pack_argb32);
956 // Set srcA_lasx to contain sixteen copies of srcA, padded with zero.
957 // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
958 // 0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
959 src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);
960
961 while (width >= 8) {
962 // Load eight destination pixels into dst_lasx.
963 __m256i dst_lasx = __lasx_xvld(d, 0);
964 // Load eight 16-bit masks into lower half of mask_lasx.
965 __m256i mask_lasx = __lasx_xvld(mask, 0);
966 mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};
967
968 int32_t pack_cmp = __lasx_xbz_v(mask_lasx);
969 // if mask pixels are not all zero, we will blend the dst pixels
970 if (pack_cmp != 1) {
971 // Unpack 8 16bit mask pixels to
972 // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
973 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
974 // m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
975 // m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
976 mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);
977 // Process 8 32bit dst pixels
978 __m256i result = blend_lcd16_opaque_lasx(src_lasx, dst_lasx, mask_lasx);
979 __lasx_xvst(result, d, 0);
980 }
981 d++;
982 mask += 8;
983 width -= 8;
984 }
985
986 dst = reinterpret_cast<SkPMColor*>(d);
987 }
988
989 while (width > 0) {
990 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
991 mask++;
992 dst++;
993 width--;
994 }
995 }
996
997#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
998
999 // The following (left) shifts cause the top 5 bits of the mask components to
1000 // line up with the corresponding components in an SkPMColor.
1001 // Note that the mask's RGB16 order may differ from the SkPMColor order.
// A negative value means the component has to move right instead of left.
1002 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
1003 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
1004 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
1005
// Each SkPacked?16x5ToUnmasked?32x5_LSX(x) macro below applies the matching shift
// to every 32-bit lane of x: identity when the shift is zero, a left shift when it
// is positive, and a logical right shift by the negated amount when it is negative.
// The result is "unmasked": bits of other components are still present, so callers
// AND it with the 5-bit component mask afterwards.
1006 #if SK_R16x5_R32x5_SHIFT == 0
1007 #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (x)
1008 #elif SK_R16x5_R32x5_SHIFT > 0
1009 #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vslli_w(x, SK_R16x5_R32x5_SHIFT))
1010 #else
1011 #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vsrli_w(x, -SK_R16x5_R32x5_SHIFT))
1012 #endif
1013
1014 #if SK_G16x5_G32x5_SHIFT == 0
1015 #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (x)
1016 #elif SK_G16x5_G32x5_SHIFT > 0
1017 #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vslli_w(x, SK_G16x5_G32x5_SHIFT))
1018 #else
1019 #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vsrli_w(x, -SK_G16x5_G32x5_SHIFT))
1020 #endif
1021
1022 #if SK_B16x5_B32x5_SHIFT == 0
1023 #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (x)
1024 #elif SK_B16x5_B32x5_SHIFT > 0
1025 #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vslli_w(x, SK_B16x5_B32x5_SHIFT))
1026 #else
1027 #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vsrli_w(x, -SK_B16x5_B32x5_SHIFT))
1028 #endif
1029
// Blends a (possibly translucent) source color into four destination pixels using
// per-channel LCD16 coverage, and returns the four blended 32-bit pixels.
//   src  - source color widened to 16-bit lanes (layout shown below); only read.
//   dst  - four packed 32-bit destination pixels; only read.
//   mask - four 16-bit LCD masks, one per 32-bit lane; overwritten here as scratch,
//          so the caller's vector is clobbered.
//   srcA - source alpha replicated into every 16-bit lane (the caller passes the
//          value already biased to 0..256).
1030 static __m128i blend_lcd16_lsx(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
1031 // In the following comments, the components of src, dst and mask are
1032 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
1033 // by an R, G, B, or A suffix. Components of one of the four pixels that
1034 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
1035 // example is the blue channel of the second destination pixel. Memory
1036 // layout is shown for an ARGB byte order in a color value.
1037
1038 // src and srcA store 8-bit values interleaved with zeros.
1039 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1040 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
1041 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
1042 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
1043 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
1044 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1045 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1046
1047 __m128i v_zero = __lsx_vldi(0);
1048
1049 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
1050 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
1051 __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
1052 __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));
1053
1054 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
1055 __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
1056 __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));
1057
1058 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
1059 __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
1060 __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));
1061
1062 // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
1063 __m128i aMin = __lsx_vmin_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1064 __lsx_vmin_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1065 __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1066 __m128i aMax = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1067 __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1068 __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1069 // srcA has been biased to [0-256], so compare srcA against (dstA+1)
// NOTE(review): the select on the next statement uses `a` as a per-lane
// all-ones/all-zeros mask. Confirm that __lsx_vmskltz_w yields that form rather
// than a packed sign-bit mask; if it is the latter, a per-lane compare is needed.
// NOTE(review): SK_A32_MASK is ANDed in unshifted after adding 1 << SK_A32_SHIFT,
// and srcA holds 16-bit lanes while the subtraction is written on whole vectors —
// verify this really isolates dstA + 1 for the configured SK_A32_SHIFT.
1070 __m128i a = __lsx_vmskltz_w(srcA -
1071 __lsx_vand_v(
1072 __lsx_vadd_w(dst,
1073 __lsx_vreplgr2vr_w(1 << SK_A32_SHIFT)),
1074 __lsx_vreplgr2vr_w(SK_A32_MASK)));
1075 // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
1076 a = __lsx_vor_v(__lsx_vand_v(a, aMin), __lsx_vandn_v(a, aMax));
1077
1078 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
1079 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
1080 // 8-bit position
1081 // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
1082 // m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
1083 mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));
1084
1085 // Interleave R,G,B into the lower byte of word.
1086 // i.e. split the sixteen 8-bit values from mask into two sets of eight
1087 // 16-bit values, padded by zero.
1088 __m128i maskLo, maskHi;
1089 // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
1090 maskLo = __lsx_vilvl_b(v_zero, mask);
1091 // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
1092 maskHi = __lsx_vilvh_b(v_zero, mask);
1093
1094 // Upscale from 0..31 to 0..32
1095 // (allows to replace division by a shift further down)
1096 // Right-shift each component by 4 and add the result back to that component,
1097 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
1098 maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
1099 maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));
1100
1101 // Multiply each component of maskLo and maskHi by srcA
1102 maskLo = __lsx_vmul_h(maskLo, srcA);
1103 maskHi = __lsx_vmul_h(maskHi, srcA);
1104
1105 // Right shift mask components by 8 (divide by 256)
1106 maskLo = __lsx_vsrli_h(maskLo, 8);
1107 maskHi = __lsx_vsrli_h(maskHi, 8);
1108
1109 // Interleave R,G,B into the lower byte of the word
1110 // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
1111 __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
1112 // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
1113 __m128i dstHi = __lsx_vilvh_b(v_zero, dst);
1114
1115 // mask = (src - dst) * mask
1116 maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
1117 maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));
1118
1119 // mask = (src - dst) * mask >> 5
1120 // (arithmetic shift, because src - dst may be negative)
1121 maskLo = __lsx_vsrai_h(maskLo, 5);
1122 maskHi = __lsx_vsrai_h(maskHi, 5);
1123
1124 // Add the scaled difference back onto dst.
1125 // result = dst + ((src - dst) * mask >> 5)
1126 __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
1127 __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);
1128
1129 // Pack into 4 32bit dst pixels.
1130 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
1131 // Merge into one LSX register with sixteen 8-bit values (four pixels),
1132 // clamping to 255 if necessary.
1133 __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
1134 __m128i tmph = __lsx_vsat_hu(resultHi, 7);
1135 return __lsx_vpickev_b(tmph, tmpl);
1136 }
1136
// Blends an opaque source color into four destination pixels using per-channel
// LCD16 coverage, and returns the four blended 32-bit pixels.
//   src  - source color widened to 16-bit lanes (layout shown below); only read.
//   dst  - four packed 32-bit destination pixels; only read.
//   mask - four 16-bit LCD masks, one per 32-bit lane; overwritten here as scratch,
//          so the caller's vector is clobbered.
1137 static __m128i blend_lcd16_opaque_lsx(__m128i &src, __m128i &dst, __m128i &mask) {
1138 // In the following comments, the components of src, dst and mask are
1139 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
1140 // by an R, G, B, or A suffix. Components of one of the four pixels that
1141 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
1142 // example is the blue channel of the second destination pixel. Memory
1143 // layout is shown for an ARGB byte order in a color value.
1144
1145 // src stores 8-bit values interleaved with zeros.
1146 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1147 // mask stores 16-bit values (shown as high and low bytes) interleaved with
1148 // zeros
1149 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1150 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1151
1152 __m128i v_zero = __lsx_vldi(0);
1153
1154 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
1155 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
1156 __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
1157 __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));
1158
1159 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
1160 __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
1161 __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));
1162
1163 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
1164 __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
1165 __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));
1166
1167 // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
1168 __m128i a = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1169 __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1170 __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1171
1172 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
1173 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
1174 // 8-bit position
1175 // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
1176 // m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
1177 mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));
1178
1179 // Interleave R,G,B into the lower byte of word.
1180 // i.e. split the sixteen 8-bit values from mask into two sets of eight
1181 // 16-bit values, padded by zero.
1182 __m128i maskLo, maskHi;
1183 // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
1184 maskLo = __lsx_vilvl_b(v_zero, mask);
1185 // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
1186 maskHi = __lsx_vilvh_b(v_zero, mask);
1187
1188 // Upscale from 0..31 to 0..32
1189 // (allows to replace division by a shift further down)
1190 // Right-shift each component by 4 and add the result back to that component,
1191 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
1192 maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
1193 maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));
1194
1195 // Interleave R,G,B into the lower byte of the word
1196 // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
1197 __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
1198 // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
1199 __m128i dstHi = __lsx_vilvh_b(v_zero, dst);
1200
1201 // mask = (src - dst) * mask
1202 maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
1203 maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));
1204
1205 // mask = (src - dst) * mask >> 5
1206 // (arithmetic shift, because src - dst may be negative)
1207 maskLo = __lsx_vsrai_h(maskLo, 5);
1208 maskHi = __lsx_vsrai_h(maskHi, 5);
1209
1210 // Add the scaled difference back onto dst.
1211 // result = dst + ((src - dst) * mask >> 5)
1212 __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
1213 __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);
1214
1215 // Merge into one LSX register with sixteen 8-bit values (four pixels),
1216 // clamping to 255 if necessary.
1217 __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
1218 __m128i tmph = __lsx_vsat_hu(resultHi, 7);
1219 return __lsx_vpickev_b(tmph, tmpl);
1220 }
1220
1221 void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
1222 if (width <= 0) {
1223 return;
1224 }
1225
1226 int srcA = SkColorGetA(src);
1227 int srcR = SkColorGetR(src);
1228 int srcG = SkColorGetG(src);
1229 int srcB = SkColorGetB(src);
1230 __m128i v_zero = __lsx_vldi(0);
1231
1232 srcA = SkAlpha255To256(srcA);
1233 if (width >= 4) {
1234 SkASSERT(((size_t)dst & 0x03) == 0);
1235 while (((size_t)dst & 0x0F) != 0) {
1236 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
1237 mask++;
1238 dst++;
1239 width--;
1240 }
1241
1242 __m128i *d = reinterpret_cast<__m128i*>(dst);
1243 // Set alpha to 0xFF and replicate source eight times in LSX register.
1244 unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
1245 __m128i src_lsx = __lsx_vreplgr2vr_w(skpackargb32);
1246 // Interleave with zeros to get two sets of eight 16-bit values.
1247 src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
1248 // Set srcA_lsx to contain eight copies of srcA, padded with zero.
1249 // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1250 __m128i srcA_lsx = __lsx_vreplgr2vr_h(srcA);
1251
1252 while (width >= 4) {
1253 // Load eight destination pixels into dst_lsx.
1254 __m128i dst_lsx = __lsx_vld(d, 0);
1255 // Load four 16-bit masks into lower half of mask_lsx.
1256 __m128i mask_lsx = __lsx_vldrepl_d((void *)mask, 0);
1257 mask_lsx = __lsx_vilvl_d(v_zero, mask_lsx);
1258
1259 int pack_cmp = __lsx_bz_v(mask_lsx);
1260 // if mask pixels are not all zero, we will blend the dst pixels
1261 if (pack_cmp != 1) {
1262 // Unpack 4 16bit mask pixels to
1263 // mask_lsx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1264 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1265 mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);
1266
1267 // Process 8 32bit dst pixels
1268 __m128i result = blend_lcd16_lsx(src_lsx, dst_lsx, mask_lsx, srcA_lsx);
1269 __lsx_vst(result, d, 0);
1270 }
1271
1272 d++;
1273 mask += 4;
1274 width -= 4;
1275 }
1276
1277 dst = reinterpret_cast<SkPMColor*>(d);
1278 }
1279
1280 while (width > 0) {
1281 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
1282 mask++;
1283 dst++;
1284 width--;
1285 }
1286 }
1287
1288 void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
1289 SkColor src, int width, SkPMColor opaqueDst) {
1290 if (width <= 0) {
1291 return;
1292 }
1293
1294 int srcR = SkColorGetR(src);
1295 int srcG = SkColorGetG(src);
1296 int srcB = SkColorGetB(src);
1297 __m128i v_zero = __lsx_vldi(0);
1298
1299 if (width >= 4) {
1300 SkASSERT(((size_t)dst & 0x03) == 0);
1301 while (((size_t)dst & 0x0F) != 0) {
1302 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
1303 mask++;
1304 dst++;
1305 width--;
1306 }
1307
1308 __m128i *d = reinterpret_cast<__m128i*>(dst);
1309 // Set alpha to 0xFF and replicate source four times in LSX register.
1310 unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
1311 __m128i src_lsx = __lsx_vreplgr2vr_w(sk_pack_argb32);
1312 // Set srcA_lsx to contain eight copies of srcA, padded with zero.
1313 // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1314 src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
1315
1316 while (width >= 4) {
1317 // Load four destination pixels into dst_lsx.
1318 __m128i dst_lsx = __lsx_vld(d, 0);
1319 // Load four 16-bit masks into lower half of mask_lsx.
1320 __m128i mask_lsx = __lsx_vldrepl_d((void *)(mask), 0);
1321 mask_lsx = __lsx_vilvl_d(v_zero, mask_lsx);
1322
1323 int pack_cmp = __lsx_bz_v(mask_lsx);
1324 // if mask pixels are not all zero, we will blend the dst pixels
1325 if (pack_cmp != 1) {
1326 // Unpack 4 16bit mask pixels to
1327 mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);
1328
1329 // Process 8 32bit dst pixels
1330 __m128i result = blend_lcd16_opaque_lsx(src_lsx, dst_lsx, mask_lsx);
1331 __lsx_vst(result, d, 0);
1332 }
1333 d++;
1334 mask += 4;
1335 width -= 4;
1336 }
1337
1338 dst = reinterpret_cast<SkPMColor*>(d);
1339 }
1340
1341 while (width > 0) {
1342 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
1343 mask++;
1344 dst++;
1345 width--;
1346 }
1347 }
1348
1349#else
1350
1351 static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
1352 SkColor src, int width, SkPMColor) {
1353 int srcA = SkColorGetA(src);
1354 int srcR = SkColorGetR(src);
1355 int srcG = SkColorGetG(src);
1356 int srcB = SkColorGetB(src);
1357
1358 srcA = SkAlpha255To256(srcA);
1359
1360 for (int i = 0; i < width; i++) {
1361 dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
1362 }
1363 }
1364
1365 static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
1366 SkColor src, int width,
1367 SkPMColor opaqueDst) {
1368 int srcR = SkColorGetR(src);
1369 int srcG = SkColorGetG(src);
1370 int srcB = SkColorGetB(src);
1371
1372 for (int i = 0; i < width; i++) {
1373 dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst);
1374 }
1375 }
1376
1377#endif
1378
1379static bool blit_color(const SkPixmap& device,
1380 const SkMask& mask,
1381 const SkIRect& clip,
1382 SkColor color) {
1383 int x = clip.fLeft,
1384 y = clip.fTop;
1385
1386 if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
1387 SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
1388 (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
1389 color, clip.width(), clip.height());
1390 return true;
1391 }
1392
1393 if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
1394 auto dstRow = device.writable_addr32(x,y);
1395 auto maskRow = (const uint16_t*)mask.getAddr(x,y);
1396
1397 auto blit_row = blit_row_lcd16;
1398 SkPMColor opaqueDst = 0; // ignored unless opaque
1399
1400 if (0xff == SkColorGetA(color)) {
1401 blit_row = blit_row_lcd16_opaque;
1402 opaqueDst = SkPreMultiplyColor(color);
1403 }
1404
1405 for (int height = clip.height(); height --> 0; ) {
1406 blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);
1407
1408 dstRow = (SkPMColor*) (( char*) dstRow + device.rowBytes());
1409 maskRow = (const uint16_t*)((const char*)maskRow + mask.fRowBytes);
1410 }
1411 return true;
1412 }
1413
1414 return false;
1415}
1416
1417///////////////////////////////////////////////////////////////////////////////
1418
// Blits a mask whose pixels are read as 32-bit SkPMColor values into `device`,
// one row at a time over `clip`, passing the alpha of `srcColor` to the row
// procedure.
// NOTE(review): this listing skips embedded lines 1422, 1424 and 1426, so the
// declaration of `proc` and the body of the `alpha != 255` branch are not visible
// here — restore them from the original file before building.
1419 static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
1420 const SkIRect& clip, SkPMColor srcColor) {
1421 U8CPU alpha = SkGetPackedA32(srcColor);
1423 if (alpha != 255) {
1425 }
1427
1428 int x = clip.fLeft;
1429 int y = clip.fTop;
1430 int width = clip.width();
1431 int height = clip.height();
1432
1433 SkPMColor* dstRow = device.writable_addr32(x, y);
1434 const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y));
1435
// The loop body runs before the height test, so a clip of height 0 is not handled;
// presumably callers only pass non-empty clips — TODO confirm.
1436 do {
1437 proc(dstRow, srcRow, width, alpha);
// Row strides are in bytes, hence the char* arithmetic.
1438 dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes());
1439 srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes);
1440 } while (--height != 0);
1441 }
1442
1443//////////////////////////////////////////////////////////////////////////////////////
1444
1446 : INHERITED(device) {
1447 SkColor color = paint.getColor();
1448 fColor = color;
1449
1450 fSrcA = SkColorGetA(color);
1451 unsigned scale = SkAlpha255To256(fSrcA);
1452 fSrcR = SkAlphaMul(SkColorGetR(color), scale);
1453 fSrcG = SkAlphaMul(SkColorGetG(color), scale);
1454 fSrcB = SkAlphaMul(SkColorGetB(color), scale);
1455
1456 fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
1457}
1458
1459#if defined _WIN32 // disable warning : local variable used without having been initialized
1460#pragma warning ( push )
1461#pragma warning ( disable : 4701 )
1462#endif
1463
// Blits a horizontal span of `width` pixels starting at (x, y).
// NOTE(review): embedded line 1468, which uses `device`, is missing from this
// listing — restore it from the original file.
1464 void SkARGB32_Blitter::blitH(int x, int y, int width) {
1465 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1466
1467 uint32_t* device = fDevice.writable_addr32(x, y);
1469 }
1470
// Blits a run-length encoded antialiased span starting at (x, y).
// `runs[0]` gives the length of the current run and `antialias[0]` its coverage;
// both arrays are then advanced by that length. A run length <= 0 ends the span.
// NOTE(review): embedded lines 1490 and 1493 (the statements that actually write
// the opaque and the blended run) are missing from this listing.
1471 void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1472 const int16_t runs[]) {
// A fully transparent paint color draws nothing.
1473 if (fSrcA == 0) {
1474 return;
1475 }
1476
1477 uint32_t color = fPMColor;
1478 uint32_t* device = fDevice.writable_addr32(x, y);
1479 unsigned opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case
1480
1481 for (;;) {
1482 int count = runs[0];
1483 SkASSERT(count >= 0);
1484 if (count <= 0) {
1485 return;
1486 }
1487 unsigned aa = antialias[0];
// Zero coverage: skip the run without touching the device.
1488 if (aa) {
// True only when both the paint alpha and the coverage are 255.
1489 if ((opaqueMask & aa) == 255) {
1491 } else {
// Scale the premultiplied color by the coverage (biased to 0..256).
1492 uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
1494 }
1495 }
1496 runs += count;
1497 antialias += count;
1498 device += count;
1499 }
1500 }
1501
1503 uint32_t* device = fDevice.writable_addr32(x, y);
1504 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1505
1506 device[0] = SkBlendARGB32(fPMColor, device[0], a0);
1507 device[1] = SkBlendARGB32(fPMColor, device[1], a1);
1508}
1509
1511 uint32_t* device = fDevice.writable_addr32(x, y);
1512 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1513
1514 device[0] = SkBlendARGB32(fPMColor, device[0], a0);
1515 device = (uint32_t*)((char*)device + fDevice.rowBytes());
1516 device[0] = SkBlendARGB32(fPMColor, device[0], a1);
1517}
1518
1519//////////////////////////////////////////////////////////////////////////////////////
1520
// Writes `color` to each of the eight pixels dst[0..7] whose bit is set in `mask`:
// bit 0x80 selects dst[0], bit 0x01 selects dst[7]. Pixels whose bit is clear are
// left untouched.
// Every argument is parenthesized so that expressions such as `a | b` or `p + n`
// expand correctly. Arguments are still evaluated several times, so they must not
// have side effects.
#define solid_8_pixels(mask, dst, color)            \
    do {                                            \
        if ((mask) & 0x80) (dst)[0] = (color);      \
        if ((mask) & 0x40) (dst)[1] = (color);      \
        if ((mask) & 0x20) (dst)[2] = (color);      \
        if ((mask) & 0x10) (dst)[3] = (color);      \
        if ((mask) & 0x08) (dst)[4] = (color);      \
        if ((mask) & 0x04) (dst)[5] = (color);      \
        if ((mask) & 0x02) (dst)[6] = (color);      \
        if ((mask) & 0x01) (dst)[7] = (color);      \
    } while (0)
1532
// Parameter macros that configure a 1-bit-mask blitter named SkARGB32_BlitBW: an
// extra SkPMColor argument, solid_8_pixels as the per-byte operation, and 32-bit
// device pixels obtained through writable_addr32.
// NOTE(review): presumably these are consumed by a template header included right
// after them; that include (embedded line 1538) is not visible in this listing.
1533 #define SK_BLITBWMASK_NAME SkARGB32_BlitBW
1534 #define SK_BLITBWMASK_ARGS , SkPMColor color
1535 #define SK_BLITBWMASK_BLIT8(mask, dst) solid_8_pixels(mask, dst, color)
1536 #define SK_BLITBWMASK_GETADDR writable_addr32
1537 #define SK_BLITBWMASK_DEVTYPE uint32_t
1539
// For each of the eight pixels dst[0..7] whose bit is set in `mask` (bit 0x80
// selects dst[0], bit 0x01 selects dst[7]), replaces the pixel with
// sc + SkAlphaMulQ(pixel, dst_scale). Pixels whose bit is clear are left untouched.
// Every argument is parenthesized so that compound expressions expand correctly.
// Arguments are still evaluated several times, so they must not have side effects.
#define blend_8_pixels(mask, dst, sc, dst_scale)                                        \
    do {                                                                                \
        if ((mask) & 0x80) { (dst)[0] = (sc) + SkAlphaMulQ((dst)[0], (dst_scale)); }    \
        if ((mask) & 0x40) { (dst)[1] = (sc) + SkAlphaMulQ((dst)[1], (dst_scale)); }    \
        if ((mask) & 0x20) { (dst)[2] = (sc) + SkAlphaMulQ((dst)[2], (dst_scale)); }    \
        if ((mask) & 0x10) { (dst)[3] = (sc) + SkAlphaMulQ((dst)[3], (dst_scale)); }    \
        if ((mask) & 0x08) { (dst)[4] = (sc) + SkAlphaMulQ((dst)[4], (dst_scale)); }    \
        if ((mask) & 0x04) { (dst)[5] = (sc) + SkAlphaMulQ((dst)[5], (dst_scale)); }    \
        if ((mask) & 0x02) { (dst)[6] = (sc) + SkAlphaMulQ((dst)[6], (dst_scale)); }    \
        if ((mask) & 0x01) { (dst)[7] = (sc) + SkAlphaMulQ((dst)[7], (dst_scale)); }    \
    } while (0)
1551
// Parameter macros that configure a 1-bit-mask blitter named SkARGB32_BlendBW: two
// extra arguments (`sc` and `dst_scale`), blend_8_pixels as the per-byte
// operation, and 32-bit device pixels obtained through writable_addr32.
// NOTE(review): presumably these are consumed by a template header included right
// after them; that include (embedded line 1557) is not visible in this listing.
1552 #define SK_BLITBWMASK_NAME SkARGB32_BlendBW
1553 #define SK_BLITBWMASK_ARGS , uint32_t sc, unsigned dst_scale
1554 #define SK_BLITBWMASK_BLIT8(mask, dst) blend_8_pixels(mask, dst, sc, dst_scale)
1555 #define SK_BLITBWMASK_GETADDR writable_addr32
1556 #define SK_BLITBWMASK_DEVTYPE uint32_t
1558
1561 SkASSERT(fSrcA != 0xFF);
1562
1563 if (fSrcA == 0) {
1564 return;
1565 }
1566
1567 if (blit_color(fDevice, mask, clip, fColor)) {
1568 return;
1569 }
1570
1571 switch (mask.fFormat) {
1572 case SkMask::kBW_Format:
1573 SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA));
1574 break;
1577 break;
1578 default:
1579 SK_ABORT("Mask format not handled.");
1580 }
1581}
1582
1584 const SkIRect& clip) {
1586
1587 if (blit_color(fDevice, mask, clip, fColor)) {
1588 return;
1589 }
1590
1591 switch (mask.fFormat) {
1592 case SkMask::kBW_Format:
1593 SkARGB32_BlitBW(fDevice, mask, clip, fPMColor);
1594 break;
1597 break;
1598 default:
1599 SK_ABORT("Mask format not handled.");
1600 }
1601}
1602
1604 uint32_t* device = fDevice.writable_addr32(x, y);
1605 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1606
1609}
1610
1612 uint32_t* device = fDevice.writable_addr32(x, y);
1613 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1614
1616 device = (uint32_t*)((char*)device + fDevice.rowBytes());
1618}
1619
1620///////////////////////////////////////////////////////////////////////////////
1621
// Blits a vertical run of `height` pixels starting at (x, y), with the paint color
// modulated by the coverage `alpha`. Nothing is drawn when either the coverage or
// the paint alpha is zero.
// NOTE(review): embedded line 1631 (the body of the `alpha != 255` branch, which
// presumably scales `color` by the coverage) is missing from this listing.
1622 void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
1623 if (alpha == 0 || fSrcA == 0) {
1624 return;
1625 }
1626
1627 uint32_t* device = fDevice.writable_addr32(x, y);
1628 uint32_t color = fPMColor;
1629
1630 if (alpha != 255) {
1632 }
1633
// Destination weight for the blend below: 256 - alpha(color).
1634 unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
1635 size_t rowBytes = fDevice.rowBytes();
1636 while (--height >= 0) {
1637 device[0] = color + SkAlphaMulQ(device[0], dst_scale);
// rowBytes is a byte stride, hence the char* arithmetic.
1638 device = (uint32_t*)((char*)device + rowBytes);
1639 }
1640 }
1641
// Fills the rectangle (x, y, width, height) with the paint color. Nothing is
// drawn when the paint alpha is zero.
// NOTE(review): embedded lines 1654 and 1657 (the opaque fill and the per-row
// blend, which would use `color`) are missing from this listing.
1642 void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
1643 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height());
1644
1645 if (fSrcA == 0) {
1646 return;
1647 }
1648
1649 uint32_t* device = fDevice.writable_addr32(x, y);
1650 uint32_t color = fPMColor;
1651 size_t rowBytes = fDevice.rowBytes();
1652
// Opaque and translucent colors take separate branches.
1653 if (SkGetPackedA32(fPMColor) == 0xFF) {
1655 } else {
1656 while (height --> 0) {
// rowBytes is a byte stride, hence the char* arithmetic.
1658 device = (uint32_t*)((char*)device + rowBytes);
1659 }
1660 }
1661 }
1662
1663#if defined _WIN32
1664#pragma warning ( pop )
1665#endif
1666
1667///////////////////////////////////////////////////////////////////////
1668
1669void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1670 const int16_t runs[]) {
1671 uint32_t* device = fDevice.writable_addr32(x, y);
1673
1674 for (;;) {
1675 int count = runs[0];
1676 SkASSERT(count >= 0);
1677 if (count <= 0) {
1678 return;
1679 }
1680 unsigned aa = antialias[0];
1681 if (aa) {
1682 if (aa == 255) {
1683 SkOpts::memset32(device, black, count);
1684 } else {
1685 SkPMColor src = aa << SK_A32_SHIFT;
1686 unsigned dst_scale = 256 - aa;
1687 int n = count;
1688 do {
1689 --n;
1690 device[n] = src + SkAlphaMulQ(device[n], dst_scale);
1691 } while (n > 0);
1692 }
1693 }
1694 runs += count;
1695 antialias += count;
1696 device += count;
1697 }
1698}
1699
1701 uint32_t* device = fDevice.writable_addr32(x, y);
1702 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1703
1704 device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
1705 device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1);
1706}
1707
1709 uint32_t* device = fDevice.writable_addr32(x, y);
1710 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1711
1712 device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
1713 device = (uint32_t*)((char*)device + fDevice.rowBytes());
1714 device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1);
1715}
1716
1717///////////////////////////////////////////////////////////////////////////////
1718
1720 const SkPaint& paint, SkShaderBase::Context* shaderContext)
1721 : INHERITED(device, paint, shaderContext)
1722{
1723 fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor)));
1724
1725 SkASSERT(paint.isSrcOver());
1726
1727 int flags = 0;
1728 if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1730 }
1731 // we call this on the output from the shader
1732 fProc32 = SkBlitRow::Factory32(flags);
1733 // we call this on the output from the shader + alpha from the aa buffer
1735
1736 fShadeDirectlyIntoDevice =
1738}
1739
1741 sk_free(fBuffer);
1742}
1743
1745 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1746
1747 uint32_t* device = fDevice.writable_addr32(x, y);
1748
1749 if (fShadeDirectlyIntoDevice) {
1751 } else {
1752 SkPMColor* span = fBuffer;
1753 fShaderContext->shadeSpan(x, y, span, width);
1754 fProc32(device, span, width, 255);
1755 }
1756}
1757
1759 SkASSERT(x >= 0 && y >= 0 &&
1760 x + width <= fDevice.width() && y + height <= fDevice.height());
1761
1762 uint32_t* device = fDevice.writable_addr32(x, y);
1763 size_t deviceRB = fDevice.rowBytes();
1764 auto* shaderContext = fShaderContext;
1765 SkPMColor* span = fBuffer;
1766
1767 if (fShadeDirectlyIntoDevice) {
1768 do {
1769 shaderContext->shadeSpan(x, y, device, width);
1770 y += 1;
1771 device = (uint32_t*)((char*)device + deviceRB);
1772 } while (--height > 0);
1773 } else {
1774 SkBlitRow::Proc32 proc = fProc32;
1775 do {
1776 shaderContext->shadeSpan(x, y, span, width);
1777 proc(device, span, width, 255);
1778 y += 1;
1779 device = (uint32_t*)((char*)device + deviceRB);
1780 } while (--height > 0);
1781 }
1782}
1783
1784void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1785 const int16_t runs[]) {
1786 SkPMColor* span = fBuffer;
1787 uint32_t* device = fDevice.writable_addr32(x, y);
1788 auto* shaderContext = fShaderContext;
1789
1790 if (fShadeDirectlyIntoDevice || (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1791 for (;;) {
1792 int count = *runs;
1793 if (count <= 0) {
1794 break;
1795 }
1796 int aa = *antialias;
1797 if (aa) {
1798 if (aa == 255) {
1799 // cool, have the shader draw right into the device
1800 shaderContext->shadeSpan(x, y, device, count);
1801 } else {
1802 shaderContext->shadeSpan(x, y, span, count);
1803 fProc32Blend(device, span, count, aa);
1804 }
1805 }
1806 device += count;
1807 runs += count;
1808 antialias += count;
1809 x += count;
1810 }
1811 } else {
1812 for (;;) {
1813 int count = *runs;
1814 if (count <= 0) {
1815 break;
1816 }
1817 int aa = *antialias;
1818 if (aa) {
1819 shaderContext->shadeSpan(x, y, span, count);
1820 if (aa == 255) {
1821 fProc32(device, span, count, 255);
1822 } else {
1823 fProc32Blend(device, span, count, aa);
1824 }
1825 }
1826 device += count;
1827 runs += count;
1828 antialias += count;
1829 x += count;
1830 }
1831 }
1832}
1833
1837
1838static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n,
1839 U8x4 (*kernel)(U8x4,U8x4,U8x4)) {
1840
1841 auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 {
1842 U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
1843 return sk_bit_cast<U32>(kernel(sk_bit_cast<U8x4>(dst),
1844 sk_bit_cast<U8x4>(src),
1845 cov_splat));
1846 };
1847 while (n >= 4) {
1848 apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst);
1849 dst += 4;
1850 src += 4;
1851 cov += 4;
1852 n -= 4;
1853 }
1854 while (n --> 0) {
1855 *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0];
1856 dst++;
1857 src++;
1858 cov++;
1859 }
1860}
1861
1862static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1863 auto cov = (const uint8_t*)mask;
1864 drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1865 U8x4 s_aa = skvx::approx_scale(s, c),
1866 alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);
1867 return s_aa + skvx::approx_scale(d, 255 - alpha);
1868 });
1869}
1870
1871static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1872 auto cov = (const uint8_t*)mask;
1873 drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1874 return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>( c )
1875 + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
1876 });
1877}
1878
1879static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1880 auto src_alpha_blend = [](int s, int d, int sa, int m) {
1881 return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
1882 };
1883
1884 auto upscale_31_to_255 = [](int v) {
1885 return (v << 3) | (v >> 2);
1886 };
1887
1888 auto mask = (const uint16_t*)vmask;
1889 for (int i = 0; i < n; ++i) {
1890 uint16_t m = mask[i];
1891 if (0 == m) {
1892 continue;
1893 }
1894
1895 SkPMColor s = src[i];
1896 SkPMColor d = dst[i];
1897
1898 int srcA = SkGetPackedA32(s);
1899 int srcR = SkGetPackedR32(s);
1900 int srcG = SkGetPackedG32(s);
1901 int srcB = SkGetPackedB32(s);
1902
1903 srcA += srcA >> 7;
1904
1905 // We're ignoring the least significant bit of the green coverage channel here.
1906 int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1907 int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1908 int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1909
1910 // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend().
1911 maskR = upscale_31_to_255(maskR);
1912 maskG = upscale_31_to_255(maskG);
1913 maskB = upscale_31_to_255(maskB);
1914
1915 // This LCD blit routine only works if the destination is opaque.
1916 dst[i] = SkPackARGB32(0xFF,
1917 src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR),
1918 src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG),
1919 src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB));
1920 }
1921}
1922
1923static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1924 auto mask = (const uint16_t*)vmask;
1925
1926 for (int i = 0; i < n; ++i) {
1927 uint16_t m = mask[i];
1928 if (0 == m) {
1929 continue;
1930 }
1931
1932 SkPMColor s = src[i];
1933 SkPMColor d = dst[i];
1934
1935 int srcR = SkGetPackedR32(s);
1936 int srcG = SkGetPackedG32(s);
1937 int srcB = SkGetPackedB32(s);
1938
1939 // We're ignoring the least significant bit of the green coverage channel here.
1940 int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1941 int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1942 int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1943
1944 // Now upscale them to 0..32, so we can use blend_32.
1945 maskR = upscale_31_to_32(maskR);
1946 maskG = upscale_31_to_32(maskG);
1947 maskB = upscale_31_to_32(maskB);
1948
1949 // This LCD blit routine only works if the destination is opaque.
1950 dst[i] = SkPackARGB32(0xFF,
1951 blend_32(srcR, SkGetPackedR32(d), maskR),
1952 blend_32(srcG, SkGetPackedG32(d), maskG),
1953 blend_32(srcB, SkGetPackedB32(d), maskB));
1954 }
1955}
1956
1959
1960 void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr;
1961
1963
1964 if (mask.fFormat == SkMask::kA8_Format && opaque) {
1965 blend_row = blend_row_A8_opaque;
1966 } else if (mask.fFormat == SkMask::kA8_Format) {
1967 blend_row = blend_row_A8;
1968 } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
1969 blend_row = blend_row_LCD16_opaque;
1970 } else if (mask.fFormat == SkMask::kLCD16_Format) {
1971 blend_row = blend_row_lcd16;
1972 } else {
1973 this->INHERITED::blitMask(mask, clip);
1974 return;
1975 }
1976
1977 const int x = clip.fLeft;
1978 const int width = clip.width();
1979 int y = clip.fTop;
1980 int height = clip.height();
1981
1982 char* dstRow = (char*)fDevice.writable_addr32(x, y);
1983 const size_t dstRB = fDevice.rowBytes();
1984 const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);
1985 const size_t maskRB = mask.fRowBytes;
1986
1987 SkPMColor* span = fBuffer;
1988 SkASSERT(blend_row);
1989 do {
1990 fShaderContext->shadeSpan(x, y, span, width);
1991 blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);
1992 dstRow += dstRB;
1993 maskRow += maskRB;
1994 y += 1;
1995 } while (--height > 0);
1996}
1997
1999 SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height());
2000
2001 uint32_t* device = fDevice.writable_addr32(x, y);
2002 size_t deviceRB = fDevice.rowBytes();
2003
2004 if (fShadeDirectlyIntoDevice) {
2005 if (255 == alpha) {
2006 do {
2008 y += 1;
2009 device = (uint32_t*)((char*)device + deviceRB);
2010 } while (--height > 0);
2011 } else {
2012 do {
2013 SkPMColor c;
2014 fShaderContext->shadeSpan(x, y, &c, 1);
2015 *device = SkFourByteInterp(c, *device, alpha);
2016 y += 1;
2017 device = (uint32_t*)((char*)device + deviceRB);
2018 } while (--height > 0);
2019 }
2020 } else {
2021 SkPMColor* span = fBuffer;
2022 SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
2023 do {
2024 fShaderContext->shadeSpan(x, y, span, 1);
2025 proc(device, span, 1, alpha);
2026 y += 1;
2027 device = (uint32_t*)((char*)device + deviceRB);
2028 } while (--height > 0);
2029 }
2030}
int count
Definition: FontMgrTest.cpp:50
#define SK_ABORT(message,...)
Definition: SkAssert.h:70
#define SkASSERT(cond)
Definition: SkAssert.h:116
static void SkARGB32_Blit32(const SkPixmap &device, const SkMask &mask, const SkIRect &clip, SkPMColor srcColor)
static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA)
static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask)
static SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB, SkPMColor dst, uint16_t mask, SkPMColor opaqueDst)
static int blend_32(int src, int dst, int scale)
static int upscale_31_to_32(int value)
static bool blit_color(const SkPixmap &device, const SkMask &mask, const SkIRect &clip, SkColor color)
#define SkPackedR16x5ToUnmaskedR32x5_SSE2(x)
#define SkPackedB16x5ToUnmaskedB32x5_SSE2(x)
static SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB, SkPMColor dst, uint16_t mask)
static void blend_row_A8_opaque(SkPMColor *dst, const void *mask, const SkPMColor *src, int n)
void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor)
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor opaqueDst)
#define SkPackedG16x5ToUnmaskedG32x5_SSE2(x)
static void drive(SkPMColor *dst, const SkPMColor *src, const uint8_t *cov, int n, U8x4(*kernel)(U8x4, U8x4, U8x4))
static void blend_row_lcd16(SkPMColor *dst, const void *vmask, const SkPMColor *src, int n)
static void blend_row_LCD16_opaque(SkPMColor *dst, const void *vmask, const SkPMColor *src, int n)
static void blend_row_A8(SkPMColor *dst, const void *mask, const SkPMColor *src, int n)
unsigned U8CPU
Definition: SkCPUTypes.h:18
#define SkGetPackedB16(color)
Definition: SkColorData.h:32
#define SkGetPackedG16(color)
Definition: SkColorData.h:31
#define SK_G16_BITS
Definition: SkColorData.h:19
static SkPMColor SkFastFourByteInterp(SkPMColor src, SkPMColor dst, U8CPU srcWeight)
Definition: SkColorData.h:260
#define SK_B16_MASK
Definition: SkColorData.h:28
static SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst, U8CPU srcWeight)
Definition: SkColorData.h:177
#define SkGetPackedR16(color)
Definition: SkColorData.h:30
#define SK_R16_BITS
Definition: SkColorData.h:18
#define SK_R16_SHIFT
Definition: SkColorData.h:22
static SkPMColor SkBlendARGB32(SkPMColor src, SkPMColor dst, U8CPU aa)
Definition: SkColorData.h:274
#define SK_B16_BITS
Definition: SkColorData.h:20
#define SkGetPackedB32(packed)
Definition: SkColorPriv.h:95
#define SkGetPackedR32(packed)
Definition: SkColorPriv.h:93
static SK_ALWAYS_INLINE uint32_t SkAlphaMulQ(uint32_t c, unsigned scale)
Definition: SkColorPriv.h:142
#define SK_A32_MASK
Definition: SkColorPriv.h:45
#define SkAlphaMul(value, alpha256)
Definition: SkColorPriv.h:34
#define SkGetPackedA32(packed)
Definition: SkColorPriv.h:92
#define SkGetPackedG32(packed)
Definition: SkColorPriv.h:94
static unsigned SkAlpha255To256(U8CPU alpha)
Definition: SkColorPriv.h:24
static SkPMColor SkPackARGB32(U8CPU a, U8CPU r, U8CPU g, U8CPU b)
Definition: SkColorPriv.h:106
#define SkColorGetR(color)
Definition: SkColor.h:65
#define SkColorGetG(color)
Definition: SkColor.h:69
SK_API SkPMColor SkPreMultiplyColor(SkColor c)
Definition: SkColor.cpp:21
uint32_t SkColor
Definition: SkColor.h:37
uint8_t SkAlpha
Definition: SkColor.h:26
uint32_t SkPMColor
Definition: SkColor.h:205
#define SkColorGetA(color)
Definition: SkColor.h:61
#define SkColorGetB(color)
Definition: SkColor.h:73
SK_API void sk_free(void *)
static void * sk_malloc_throw(size_t size)
Definition: SkMalloc.h:67
static SkPath clip(const SkPath &path, const SkHalfPlane &plane)
Definition: SkPath.cpp:3892
static bool apply(Pass *pass, SkRecord *record)
SkDEBUGCODE(SK_SPI) SkThreadID SkGetThreadID()
static constexpr bool SkToBool(const T &x)
Definition: SkTo.h:35
#define SK_R32_SHIFT
Definition: SkTypes.h:44
#define SK_A32_SHIFT
Definition: SkTypes.h:54
#define SK_B32_SHIFT
Definition: SkTypes.h:50
#define SK_G32_SHIFT
Definition: SkTypes.h:53
V< uint8_t > U8
Definition: Transform_inl.h:19
V< uint32_t > U32
Definition: Transform_inl.h:17
void blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) override
void blitAntiH(int x, int y, const SkAlpha antialias[], const int16_t runs[]) override
void blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) override
void blitMask(const SkMask &, const SkIRect &) override
SkPMColor fPMColor
void blitRect(int x, int y, int width, int height) override
Blit a solid rectangle one or more pixels wide.
SkARGB32_Blitter(const SkPixmap &device, const SkPaint &paint)
void blitH(int x, int y, int width) override
Blit a horizontal run of one or more pixels.
void blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) override
void blitV(int x, int y, int height, SkAlpha alpha) override
Blit a vertical run of pixels with a constant alpha value.
void blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) override
void blitAntiH(int x, int y, const SkAlpha antialias[], const int16_t runs[]) override
void blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) override
void blitMask(const SkMask &, const SkIRect &) override
void blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) override
void blitRect(int x, int y, int width, int height) override
Blit a solid rectangle one or more pixels wide.
SkARGB32_Shader_Blitter(const SkPixmap &device, const SkPaint &paint, SkShaderBase::Context *shaderContext)
void blitMask(const SkMask &, const SkIRect &) override
void blitH(int x, int y, int width) override
Blit a horizontal run of one or more pixels.
void blitAntiH(int x, int y, const SkAlpha[], const int16_t[]) override
void blitV(int x, int y, int height, SkAlpha alpha) override
Blit a vertical run of pixels with a constant alpha value.
static void Color32(SkPMColor dst[], int count, SkPMColor color)
static Proc32 Factory32(unsigned flags32)
void(* Proc32)(uint32_t dst[], const SkPMColor src[], int count, U8CPU alpha)
Definition: SkBlitRow.h:27
@ kSrcPixelAlpha_Flag32
Definition: SkBlitRow.h:18
@ kGlobalAlpha_Flag32
Definition: SkBlitRow.h:17
virtual void blitMask(const SkMask &, const SkIRect &clip)
Definition: SkBlitter.cpp:201
size_t rowBytes() const
Definition: SkPixmap.h:145
int width() const
Definition: SkPixmap.h:160
uint32_t * writable_addr32(int x, int y) const
Definition: SkPixmap.h:537
int height() const
Definition: SkPixmap.h:166
const SkPixmap fDevice
virtual void shadeSpan(int x, int y, SkPMColor[], int count)=0
virtual uint32_t getFlags() const
Definition: SkShaderBase.h:310
@ kOpaqueAlpha_Flag
set if all of the colors will be opaque
Definition: SkShaderBase.h:262
SkShaderBase::Context * fShaderContext
const Paint & paint
Definition: color_source.cc:38
DlColor color
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE auto & d
Definition: main.cc:19
VkDevice device
Definition: main.cc:53
static bool b
struct MyStruct s
struct MyStruct a[10]
FlutterSemanticsFlag flags
uint8_t value
GAsyncResult * result
static float max(float r, float g, float b)
Definition: hsl.cpp:49
static float min(float r, float g, float b)
Definition: hsl.cpp:48
double y
double x
void(* rect_memset32)(uint32_t[], uint32_t, int, size_t, int)
void(* memset32)(uint32_t[], uint32_t, int)
void(* blit_mask_d32_a8)(SkPMColor *dst, size_t dstRB, const SkAlpha *mask, size_t maskRB, SkColor color, int w, int h)
dst
Definition: cp.py:12
SIN Vec< N, uint8_t > div255(const Vec< N, uint16_t > &x)
Definition: SkVx.h:818
SIN Vec< N, uint8_t > approx_scale(const Vec< N, uint8_t > &x, const Vec< N, uint8_t > &y)
Definition: SkVx.h:824
int32_t height
int32_t width
const Scalar scale
Definition: SkRect.h:32
bool contains(int32_t x, int32_t y) const
Definition: SkRect.h:463
Definition: SkMask.h:25
const uint32_t fRowBytes
Definition: SkMask.h:43
@ kA8_Format
8bits per pixel mask (e.g. antialiasing)
Definition: SkMask.h:28
@ kLCD16_Format
565 alpha for r/g/b
Definition: SkMask.h:31
@ kARGB32_Format
SkPMColor.
Definition: SkMask.h:30
@ kBW_Format
1bit per pixel mask (e.g. monochrome)
Definition: SkMask.h:27
const uint8_t * getAddr8(int x, int y) const
Definition: SkMask.h:79
const void * getAddr(int x, int y) const
Definition: SkMask.cpp:112
const SkIRect fBounds
Definition: SkMask.h:42
const Format fFormat
Definition: SkMask.h:44
Definition: SkVx.h:83