SkSwizzler_opts.inc
1/*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
9#include "src/base/SkUtils.h"
10#include "src/base/SkVx.h"
12
13#include <algorithm>
14#include <cmath>
15#include <utility>
16
17#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
18 #include <immintrin.h>
19#elif defined(SK_ARM_HAS_NEON)
20 #include <arm_neon.h>
21#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
22 #include <lasxintrin.h>
23#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
24 #include <lsxintrin.h>
25#endif
26
27// This file is included in multiple translation units with different #defines set, enabling
28// the use of different instructions on different CPU architectures.
29//
30// A pair of files controls what #defines are defined: SkOpts_SetTarget.h sets the flags, and
31// SkOpts_RestoreTarget.h restores them. SkOpts_SetTarget is controlled by setting the
32// SK_OPTS_TARGET define before including it.
33//
34// SkOpts_SetTarget also sets the #define SK_OPTS_NS to the unique namespace for this code.
35
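// Illustrative sketch (not part of this file): a consuming translation unit is assembled
// roughly along these lines. The target name and header paths below are recalled from the
// Skia tree and may not match this checkout exactly.
//
//   #define SK_OPTS_TARGET SK_OPTS_TARGET_HSW      // choose the ISA flavor to compile for
//   #include "src/opts/SkOpts_SetTarget.h"         // defines SK_OPTS_NS and the ISA #defines
//   #include "src/opts/SkSwizzler_opts.inc"        // this file, compiled for that ISA
//   #include "src/opts/SkOpts_RestoreTarget.h"     // restores the #defines for the next target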
36#if defined(__clang__) || defined(__GNUC__)
37#define SI __attribute__((always_inline)) static inline
38#else
39#define SI static inline
40#endif
41
42namespace SK_OPTS_NS {
43
44#if defined(SK_USE_FAST_UNPREMUL_324099025)
45constexpr bool kFastUnpremul = true;
46#else
47constexpr bool kFastUnpremul = false;
48#endif
49
50SI float reciprocal_alpha_times_255_portable(float a) {
51 return a != 0 ? 255.0f / a : 0.0f;
52}
53
54SI float reciprocal_alpha_portable(float a) {
55 return a != 0 ? 1.0f / a : 0.0f;
56}
57
58#if defined(SK_ARM_HAS_NEON)
59// -- NEON -- Harden against timing attacks
60// For neon, the portable versions create branchless code.
61SI float reciprocal_alpha_times_255(float a) {
62 return reciprocal_alpha_times_255_portable(a);
63}
64
65SI float reciprocal_alpha(float a) {
66 return reciprocal_alpha_portable(a);
67}
68#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 && (defined(__clang__) || !defined(_MSC_VER))
69// -- SSE -- Harden against timing attacks -- MSVC is not supported.
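// How the code below stays branchless (descriptive note): the divide is always executed,
// even when a == 0 (hence SK_NO_SANITIZE("float-divide-by-zero")), and a compare mask plus
// a bitwise AND selects 0.0f for the a == 0 case, so there is no data-dependent branch whose
// timing could reveal whether alpha was zero.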
70using F4 = __m128;
71
72SK_NO_SANITIZE("float-divide-by-zero")
73SI float reciprocal_alpha_times_255(float a) {
74 SkASSERT(0 <= a && a <= 255);
75 F4 vA{a, a, a, a};
76 auto q = F4{255.0f} / vA;
77 return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
78}
79
80SK_NO_SANITIZE("float-divide-by-zero")
81SI float reciprocal_alpha(float a) {
82 SkASSERT(0 <= a && a <= 1);
83 F4 vA{a, a, a, a};
84 auto q = F4{1.0f} / vA;
85 return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
86}
87#else
88// -- Portable -- *Not* hardened against timing attacks
89SI float reciprocal_alpha_times_255(float a) {
90 return reciprocal_alpha_times_255_portable(a);
91}
92
93SI float reciprocal_alpha(float a) {
94 return reciprocal_alpha_portable(a);
95}
96#endif
97
98static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
99 for (int i = 0; i < count; i++) {
100 uint8_t a = (src[i] >> 24) & 0xFF,
101 b = (src[i] >> 16) & 0xFF,
102 g = (src[i] >> 8) & 0xFF,
103 r = (src[i] >> 0) & 0xFF;
104 b = (b*a+127)/255;
105 g = (g*a+127)/255;
106 r = (r*a+127)/255;
107 dst[i] = (uint32_t)a << 24
108 | (uint32_t)b << 16
109 | (uint32_t)g << 8
110 | (uint32_t)r << 0;
111 }
112}
113
114// RP uses the following rounding routines in store_8888. There are three different
115// styles of rounding:
116// 1) +0.5 and floor - used by scalar and ARMv7
117// 2) round to even for sure - ARMv8
118// 3) round to even maybe - Intel. The rounding on Intel depends on MXCSR, which
119// defaults to round to even.
120//
121// Note that vrndns_f32 is the single-float version of vcvtnq_u32_f32.
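// For example (illustrative): 2.5 rounds to 3 under style (1) (+0.5 and floor) but to 2
// under round-to-even, which is why the unpremul path below simulates the Raster Pipeline
// rounding rather than always adding 0.5.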
122
123SI uint32_t pixel_round_as_RP(float n) {
124#if defined(SK_ARM_HAS_NEON) && defined(SK_CPU_ARM64)
125 return vrndns_f32(n);
126#elif defined(SK_ARM_HAS_NEON) && !defined(SK_CPU_ARM64)
127 float32x4_t vN{n + 0.5f};
128 return vcvtq_u32_f32(vN)[0];
129#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 && (defined(__clang__) || !defined(_MSC_VER))
130 return _mm_cvtps_epi32(__m128{n})[0];
131#else
132 return (uint32_t)(n + 0.5f);
133#endif
134}
135
136// Doing the math for an original color b resulting in a premul color x,
137// x = ⌊(b * a + 127) / 255⌋,
138// x ≤ (b * a + 127) / 255 < x + 1,
139// 255 * x ≤ b * a + 127 < 255 * (x + 1),
140// 255 * x - 127 ≤ b * a < 255 * (x + 1) - 127,
141// 255 * x - 127 ≤ b * a < 255 * x + 128,
142// (255 * x - 127) / a ≤ b < (255 * x + 128) / a.
143// So, given a premul value x < a, the original color b can be in the above range.
144// We can pick the middle of that range as
145// b = 255 * x / a
146// b = x * (255 / a)
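// Worked example (illustrative): with a = 128 and original b = 200, the premul value is
// x = ⌊(200 * 128 + 127) / 255⌋ = 100. The bounds above give 198.2 ≤ b < 200.2, so the
// original was 199 or 200; picking the middle of the range, 255 * 100 / 128 ≈ 199.2,
// recovers 199, which is close to (but not always exactly) the original value.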
147SI uint32_t unpremul_quick(float reciprocalA, float c) {
148 return (uint32_t)std::min(255.0f, (c * reciprocalA + 0.5f));
149}
150
151// Similar to unpremul but simulates Raster Pipeline by normalizing the pixel on the interval
152// [0, 1] and uses round-to-even in most cases instead of round-up.
153SI uint32_t unpremul_simulating_RP(float reciprocalA, float c) {
154 const float normalizedC = c * (1.0f / 255.0f);
155 const float answer = std::min(255.0f, normalizedC * reciprocalA * 255.0f);
156 return pixel_round_as_RP(answer);
157}
158
159SI uint32_t rgbA_to_CCCA(float c00, float c08, float c16, float a) {
160 if constexpr (kFastUnpremul) {
161 const float reciprocalA = reciprocal_alpha_times_255(a);
162 auto unpremul = [reciprocalA](float c) {
163 return unpremul_quick(reciprocalA, c);
164 };
165 return (uint32_t) a << 24
166 | unpremul(c16) << 16
167 | unpremul(c08) << 8
168 | unpremul(c00) << 0;
169 } else {
170 const float normalizedA = a * (1.0f / 255.0f);
171 const float reciprocalA = reciprocal_alpha(normalizedA);
172 auto unpremul = [reciprocalA](float c) {
173 return unpremul_simulating_RP(reciprocalA, c);
174 };
175 return (uint32_t) a << 24
176 | unpremul(c16) << 16
177 | unpremul(c08) << 8
178 | unpremul(c00) << 0;
179 }
180}
181
182static void rgbA_to_RGBA_portable(uint32_t* dst, const uint32_t* src, int count) {
183 for (int i = 0; i < count; i++) {
184 const uint32_t p = src[i];
185
186 const float a = (p >> 24) & 0xFF,
187 b = (p >> 16) & 0xFF,
188 g = (p >> 8) & 0xFF,
189 r = (p >> 0) & 0xFF;
190
191 dst[i] = rgbA_to_CCCA(r, g, b, a);
192 }
193}
194
195static void rgbA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
196 for (int i = 0; i < count; i++) {
197 const uint32_t p = src[i];
198
199 const uint32_t a = (p >> 24) & 0xFF,
200 b = (p >> 16) & 0xFF,
201 g = (p >> 8) & 0xFF,
202 r = (p >> 0) & 0xFF;
203
204 dst[i] = rgbA_to_CCCA(b, g, r, a);
205 }
206}
207
208static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
209 for (int i = 0; i < count; i++) {
210 uint8_t a = (src[i] >> 24) & 0xFF,
211 b = (src[i] >> 16) & 0xFF,
212 g = (src[i] >> 8) & 0xFF,
213 r = (src[i] >> 0) & 0xFF;
214 b = (b*a+127)/255;
215 g = (g*a+127)/255;
216 r = (r*a+127)/255;
217 dst[i] = (uint32_t)a << 24
218 | (uint32_t)r << 16
219 | (uint32_t)g << 8
220 | (uint32_t)b << 0;
221 }
222}
223
224static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
225 for (int i = 0; i < count; i++) {
226 uint8_t a = (src[i] >> 24) & 0xFF,
227 b = (src[i] >> 16) & 0xFF,
228 g = (src[i] >> 8) & 0xFF,
229 r = (src[i] >> 0) & 0xFF;
230 dst[i] = (uint32_t)a << 24
231 | (uint32_t)r << 16
232 | (uint32_t)g << 8
233 | (uint32_t)b << 0;
234 }
235}
236
237static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
238 for (int i = 0; i < count; i++) {
239 uint8_t g = src[0],
240 a = src[1];
241 src += 2;
242 dst[i] = (uint32_t)a << 24
243 | (uint32_t)g << 16
244 | (uint32_t)g << 8
245 | (uint32_t)g << 0;
246 }
247}
248
249static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
250 for (int i = 0; i < count; i++) {
251 uint8_t g = src[0],
252 a = src[1];
253 src += 2;
254 g = (g*a+127)/255;
255 dst[i] = (uint32_t)a << 24
256 | (uint32_t)g << 16
257 | (uint32_t)g << 8
258 | (uint32_t)g << 0;
259 }
260}
261
262static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
263 for (int i = 0; i < count; i++) {
264 uint8_t k = (src[i] >> 24) & 0xFF,
265 y = (src[i] >> 16) & 0xFF,
266 m = (src[i] >> 8) & 0xFF,
267 c = (src[i] >> 0) & 0xFF;
268 // See comments in SkSwizzler.cpp for details on the conversion formula.
269 uint8_t b = (y*k+127)/255,
270 g = (m*k+127)/255,
271 r = (c*k+127)/255;
272 dst[i] = (uint32_t)0xFF << 24
273 | (uint32_t) b << 16
274 | (uint32_t) g << 8
275 | (uint32_t) r << 0;
276 }
277}
278
279static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
280 for (int i = 0; i < count; i++) {
281 uint8_t k = (src[i] >> 24) & 0xFF,
282 y = (src[i] >> 16) & 0xFF,
283 m = (src[i] >> 8) & 0xFF,
284 c = (src[i] >> 0) & 0xFF;
285 uint8_t b = (y*k+127)/255,
286 g = (m*k+127)/255,
287 r = (c*k+127)/255;
288 dst[i] = (uint32_t)0xFF << 24
289 | (uint32_t) r << 16
290 | (uint32_t) g << 8
291 | (uint32_t) b << 0;
292 }
293}
294
295#if defined(SK_ARM_HAS_NEON)
296// -- NEON -----------------------------------------------------------------------------------------
297// Rounded divide by 255, (x + 127) / 255
298SI uint8x8_t div255_round(uint16x8_t x) {
299 // result = (x + 127) / 255
300 // result = (x + 127) / 256 + error1
301 //
302 // error1 = (x + 127) / (255 * 256)
303 // error1 = (x + 127) / (256 * 256) + error2
304 //
305 // error2 = (x + 127) / (255 * 256 * 256)
306 //
307 // The maximum value of error2 is too small to matter. Thus:
308 // result = (x + 127) / 256 + (x + 127) / (256 * 256)
309 // result = ((x + 127) / 256 + x + 127) / 256
310 // result = ((x + 127) >> 8 + x + 127) >> 8
311 //
312 // Use >>> to represent "rounded right shift" which, conveniently,
313 // NEON supports in one instruction.
314 // result = ((x >>> 8) + x) >>> 8
315 //
316 // Note that the second right shift is actually performed as an
317 // "add, round, and narrow back to 8-bits" instruction.
318 return vraddhn_u16(x, vrshrq_n_u16(x, 8));
319}
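// Quick check of the approximation above (illustrative): at the top of the range,
// x = 255*255 = 65025, the exact rounded divide is (65025 + 127) / 255 = 255, and the two
// rounded shifts give (65025 + 128) >> 8 = 254, then (254 + 65025 + 128) >> 8 = 255, matching.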
320
321// Scale a byte by another, (x * y + 127) / 255
322SI uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
323 return div255_round(vmull_u8(x, y));
324}
325
326static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
327 while (count >= 8) {
328 // Load 8 pixels.
329 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
330
331 uint8x8_t a = rgba.val[3],
332 b = rgba.val[2],
333 g = rgba.val[1],
334 r = rgba.val[0];
335
336 // Premultiply.
337 b = scale(b, a);
338 g = scale(g, a);
339 r = scale(r, a);
340
341 // Store 8 premultiplied pixels.
342 if (kSwapRB) {
343 rgba.val[2] = r;
344 rgba.val[1] = g;
345 rgba.val[0] = b;
346 } else {
347 rgba.val[2] = b;
348 rgba.val[1] = g;
349 rgba.val[0] = r;
350 }
351 vst4_u8((uint8_t*) dst, rgba);
352 src += 8;
353 dst += 8;
354 count -= 8;
355 }
356
357 // Call portable code to finish up the tail of [0,8) pixels.
358 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
359 proc(dst, src, count);
360}
361
362void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
363 premul_should_swapRB(false, dst, src, count);
364}
365
366void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
367 premul_should_swapRB(true, dst, src, count);
368}
369
370void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
371 using std::swap;
372 while (count >= 16) {
373 // Load 16 pixels.
374 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
375
376 // Swap r and b.
377 swap(rgba.val[0], rgba.val[2]);
378
379 // Store 16 pixels.
380 vst4q_u8((uint8_t*) dst, rgba);
381 src += 16;
382 dst += 16;
383 count -= 16;
384 }
385
386 if (count >= 8) {
387 // Load 8 pixels.
388 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
389
390 // Swap r and b.
391 swap(rgba.val[0], rgba.val[2]);
392
393 // Store 8 pixels.
394 vst4_u8((uint8_t*) dst, rgba);
395 src += 8;
396 dst += 8;
397 count -= 8;
398 }
399
400 RGBA_to_BGRA_portable(dst, src, count);
401}
402
403static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
404 while (count >= 16) {
405 // Load 16 pixels.
406 uint8x16x2_t ga = vld2q_u8(src);
407
408 // Premultiply if requested.
409 if (kPremul) {
410 ga.val[0] = vcombine_u8(
411 scale(vget_low_u8(ga.val[0]), vget_low_u8(ga.val[1])),
412 scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
413 }
414
415 // Set each of the color channels.
416 uint8x16x4_t rgba;
417 rgba.val[0] = ga.val[0];
418 rgba.val[1] = ga.val[0];
419 rgba.val[2] = ga.val[0];
420 rgba.val[3] = ga.val[1];
421
422 // Store 16 pixels.
423 vst4q_u8((uint8_t*) dst, rgba);
424 src += 16*2;
425 dst += 16;
426 count -= 16;
427 }
428
429 if (count >= 8) {
430 // Load 8 pixels.
431 uint8x8x2_t ga = vld2_u8(src);
432
433 // Premultiply if requested.
434 if (kPremul) {
435 ga.val[0] = scale(ga.val[0], ga.val[1]);
436 }
437
438 // Set each of the color channels.
439 uint8x8x4_t rgba;
440 rgba.val[0] = ga.val[0];
441 rgba.val[1] = ga.val[0];
442 rgba.val[2] = ga.val[0];
443 rgba.val[3] = ga.val[1];
444
445 // Store 8 pixels.
446 vst4_u8((uint8_t*) dst, rgba);
447 src += 8*2;
448 dst += 8;
449 count -= 8;
450 }
451
452 auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
453 proc(dst, src, count);
454}
455
456void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
457 expand_grayA(false, dst, src, count);
458}
459
460void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
461 expand_grayA(true, dst, src, count);
462}
463
464enum Format { kRGB1, kBGR1 };
465static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
466 while (count >= 8) {
467 // Load 8 cmyk pixels.
468 uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
469
470 uint8x8_t k = pixels.val[3],
471 y = pixels.val[2],
472 m = pixels.val[1],
473 c = pixels.val[0];
474
475 // Scale to r, g, b.
476 uint8x8_t b = scale(y, k);
477 uint8x8_t g = scale(m, k);
478 uint8x8_t r = scale(c, k);
479
480 // Store 8 rgba pixels.
481 if (kBGR1 == format) {
482 pixels.val[3] = vdup_n_u8(0xFF);
483 pixels.val[2] = r;
484 pixels.val[1] = g;
485 pixels.val[0] = b;
486 } else {
487 pixels.val[3] = vdup_n_u8(0xFF);
488 pixels.val[2] = b;
489 pixels.val[1] = g;
490 pixels.val[0] = r;
491 }
492 vst4_u8((uint8_t*) dst, pixels);
493 src += 8;
494 dst += 8;
495 count -= 8;
496 }
497
498 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
499 proc(dst, src, count);
500}
501
502void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
503 inverted_cmyk_to(kRGB1, dst, src, count);
504}
505
506void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
507 inverted_cmyk_to(kBGR1, dst, src, count);
508}
509
510template <bool swapRB>
511static void common_rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
512
513 // Only use the SIMD code when simulating RP; otherwise the quick code auto-vectorizes well
514 // enough on ARM that a SIMD implementation is not needed.
515 if constexpr (!kFastUnpremul) {
516 while (count >= 8) {
517 const uint8x8x4_t in = vld4_u8((const uint8_t*)src);
518
519 auto round = [](float32x4_t v) -> uint32x4_t {
520 #if defined(SK_CPU_ARM64)
521 return vcvtnq_u32_f32(v);
522 #else
523 return vcvtq_u32_f32(v + 0.5f);
524 #endif
525 };
526
527 static constexpr float kN = 1.0f / 255.0f;
528 auto toNormalized = [](uint16x4_t v) -> float32x4_t {
529 return vcvtq_f32_u32(vmovl_u16(v)) * kN;
530 };
531
532 auto unpremulHalf =
533 [toNormalized, round](float32x4_t invA, uint16x4_t v) -> uint16x4_t {
534 const float32x4_t normalizedV = toNormalized(v);
535 const float32x4_t divided = invA * normalizedV;
536 const float32x4_t denormalized = divided * 255.0f;
537 const uint32x4_t rounded = round(denormalized);
538 return vqmovn_u32(rounded);
539 };
540
541 auto reciprocal = [](float32x4_t a) -> float32x4_t {
542 uint32x4_t mask = sk_bit_cast<uint32x4_t>(a != float32x4_t{0, 0, 0, 0});
543 auto recip = 1.0f / a;
544 return sk_bit_cast<float32x4_t>(mask & sk_bit_cast<uint32x4_t>(recip));
545 };
546
547 const uint8x8_t a = in.val[3];
548 const uint16x8_t intA = vmovl_u8(a);
549 const float32x4_t invALow = reciprocal(toNormalized(vget_low_u16(intA)));
550 const float32x4_t invAHigh = reciprocal(toNormalized(vget_high_u16(intA)));
551
552 auto unpremul = [unpremulHalf, invALow, invAHigh](uint8x8_t v) -> uint8x8_t {
553 const uint16x8_t to16 = vmovl_u8(v);
554
555 const uint16x4_t low = unpremulHalf(invALow, vget_low_u16(to16));
556 const uint16x4_t high = unpremulHalf(invAHigh, vget_high_u16(to16));
557
558 const uint16x8_t combined = vcombine_u16(low, high);
559 return vqmovn_u16(combined);
560 };
561
562 const uint8x8_t b = unpremul(in.val[2]);
563 const uint8x8_t g = unpremul(in.val[1]);
564 const uint8x8_t r = unpremul(in.val[0]);
565
566 if constexpr (swapRB) {
567 const uint8x8x4_t out{b, g, r, a};
568 vst4_u8((uint8_t*)dst, out);
569 } else {
570 const uint8x8x4_t out{r, g, b, a};
571 vst4_u8((uint8_t*)dst, out);
572 }
573
574 src += 8;
575 dst += 8;
576 count -= 8;
577 }
578 }
579
580 // Handle the tail. Count will be < 8.
581 if constexpr (swapRB) {
582 rgbA_to_BGRA_portable(dst, src, count);
583 } else {
584 rgbA_to_RGBA_portable(dst, src, count);
585 }
586}
587
588void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
589 common_rgbA_to_RGBA</*swapRB=*/false>(dst, src, count);
590}
591
592void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
593 common_rgbA_to_RGBA</*swapRB=*/true>(dst, src, count);
594}
595
596#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
597// -- AVX2 -----------------------------------------------------------------------------------------
598
599// Scale a byte by another.
600// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
601static __m256i scale(__m256i x, __m256i y) {
602 const __m256i _128 = _mm256_set1_epi16(128);
603 const __m256i _257 = _mm256_set1_epi16(257);
604
605 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
606 return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
607}
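// Spot check of the identity in scale() above (illustrative): x = 255*255 = 65025 gives
// ((65025 + 128) * 257) >> 16 = 16744321 >> 16 = 255, matching (65025 + 127) / 255 = 255.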
608
609static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
610
611 auto premul8 = [=](__m256i* lo, __m256i* hi) {
612 const __m256i zeros = _mm256_setzero_si256();
613 __m256i planar;
614 if (kSwapRB) {
615 planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
616 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
617 } else {
618 planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
619 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
620 }
621
622 // Swizzle the pixels to 8-bit planar.
623 *lo = _mm256_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
624 *hi = _mm256_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
625 __m256i rg = _mm256_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
626 ba = _mm256_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA
627
628 // Unpack to 16-bit planar.
629 __m256i r = _mm256_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
630 g = _mm256_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
631 b = _mm256_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
632 a = _mm256_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_
633
634 // Premultiply!
635 r = scale(r, a);
636 g = scale(g, a);
637 b = scale(b, a);
638
639 // Repack into interlaced pixels.
640 rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
641 ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8)); // babababa BABABABA babababa BABABABA
642 *lo = _mm256_unpacklo_epi16(rg, ba); // rgbargba rgbargba rgbargba rgbargba
643 *hi = _mm256_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
644 };
645
646 while (count >= 16) {
647 __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
648 hi = _mm256_loadu_si256((const __m256i*) (src + 8));
649
650 premul8(&lo, &hi);
651
652 _mm256_storeu_si256((__m256i*) (dst + 0), lo);
653 _mm256_storeu_si256((__m256i*) (dst + 8), hi);
654
655 src += 16;
656 dst += 16;
657 count -= 16;
658 }
659
660 if (count >= 8) {
661 __m256i lo = _mm256_loadu_si256((const __m256i*) src),
662 hi = _mm256_setzero_si256();
663
664 premul8(&lo, &hi);
665
666 _mm256_storeu_si256((__m256i*) dst, lo);
667
668 src += 8;
669 dst += 8;
670 count -= 8;
671 }
672
673 // Call portable code to finish up the tail of [0,8) pixels.
674 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
675 proc(dst, src, count);
676}
677
678void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
679 premul_should_swapRB(false, dst, src, count);
680}
681
682void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
683 premul_should_swapRB(true, dst, src, count);
684}
685
686void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
687 const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
688 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
689
690 while (count >= 8) {
691 __m256i rgba = _mm256_loadu_si256((const __m256i*) src);
692 __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB);
693 _mm256_storeu_si256((__m256i*) dst, bgra);
694
695 src += 8;
696 dst += 8;
697 count -= 8;
698 }
699
700 RGBA_to_BGRA_portable(dst, src, count);
701}
702
703void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
704 while (count >= 16) {
705 __m256i ga = _mm256_loadu_si256((const __m256i*) src);
706
707 __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)),
708 _mm256_slli_epi16(ga, 8));
709
710 __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
711 __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);
712
713 // Shuffle for pixel reorder
714 // Note: 'p' stands for 'ggga'
715 // Before shuffle:
716 // ggga_lo = p0 p1 p2 p3 | p8 p9 p10 p11
717 // ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15
718 //
719 // After shuffle:
720 // ggga_lo_shuffle = p0 p1 p2 p3 | p4 p5 p6 p7
721 // ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
722 __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
723 ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);
724
725 _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle);
726 _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle);
727
728 src += 16*2;
729 dst += 16;
730 count -= 16;
731 }
732
733 grayA_to_RGBA_portable(dst, src, count);
734}
735
736void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
737 while (count >= 16) {
738 __m256i grayA = _mm256_loadu_si256((const __m256i*) src);
739
740 __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF));
741 __m256i a0 = _mm256_srli_epi16(grayA, 8);
742
743 // Premultiply
744 g0 = scale(g0, a0);
745
746 __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8));
747 __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8));
748
749 __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
750 __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);
751
752 // Shuffle for pixel reorder, similar as grayA_to_RGBA
753 __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
754 ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);
755
756 _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle);
757 _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle);
758
759 src += 16*2;
760 dst += 16;
761 count -= 16;
762 }
763
764 grayA_to_rgbA_portable(dst, src, count);
765}
766
767enum Format { kRGB1, kBGR1 };
768static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
769 auto convert8 = [=](__m256i* lo, __m256i* hi) {
770 const __m256i zeros = _mm256_setzero_si256();
771 __m256i planar;
772 if (kBGR1 == format) {
773 planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
774 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
775 } else {
776 planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
777 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
778 }
779
780 // Swizzle the pixels to 8-bit planar.
781 *lo = _mm256_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
782 *hi = _mm256_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
783 __m256i cm = _mm256_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
784 yk = _mm256_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK
785
786 // Unpack to 16-bit planar.
787 __m256i c = _mm256_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
788 m = _mm256_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
789 y = _mm256_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
790 k = _mm256_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_
791
792 // Scale to r, g, b.
793 __m256i r = scale(c, k),
794 g = scale(m, k),
795 b = scale(y, k);
796
797 // Repack into interlaced pixels:
798 // rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
799 // ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
800 __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)),
801 ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00));
802 *lo = _mm256_unpacklo_epi16(rg, ba); // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
803 *hi = _mm256_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
804 };
805
806 while (count >= 16) {
807 __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
808 hi = _mm256_loadu_si256((const __m256i*) (src + 8));
809
810 convert8(&lo, &hi);
811
812 _mm256_storeu_si256((__m256i*) (dst + 0), lo);
813 _mm256_storeu_si256((__m256i*) (dst + 8), hi);
814
815 src += 16;
816 dst += 16;
817 count -= 16;
818 }
819
820 if (count >= 8) {
821 __m256i lo = _mm256_loadu_si256((const __m256i*) src),
822 hi = _mm256_setzero_si256();
823
824 convert8(&lo, &hi);
825
826 _mm256_storeu_si256((__m256i*) dst, lo);
827
828 src += 8;
829 dst += 8;
830 count -= 8;
831 }
832
833 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
834 proc(dst, src, count);
835}
836
837void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
838 inverted_cmyk_to(kRGB1, dst, src, count);
839}
840
841void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
842 inverted_cmyk_to(kBGR1, dst, src, count);
843}
844
845void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
846 rgbA_to_RGBA_portable(dst, src, count);
847}
848
849void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
850 rgbA_to_BGRA_portable(dst, src, count);
851}
852
853#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
854// -- SSSE3 ----------------------------------------------------------------------------------------
855
856// Scale a byte by another.
857// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
858static __m128i scale(__m128i x, __m128i y) {
859 const __m128i _128 = _mm_set1_epi16(128);
860 const __m128i _257 = _mm_set1_epi16(257);
861
862 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
863 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
864}
865
866static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
867
868 auto premul8 = [=](__m128i* lo, __m128i* hi) {
869 const __m128i zeros = _mm_setzero_si128();
870 __m128i planar;
871 if (kSwapRB) {
872 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
873 } else {
874 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
875 }
876
877 // Swizzle the pixels to 8-bit planar.
878 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa
879 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA
880 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG
881 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA
882
883 // Unpack to 16-bit planar.
884 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_
885 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_
886 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_
887 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_
888
889 // Premultiply!
890 r = scale(r, a);
891 g = scale(g, a);
892 b = scale(b, a);
893
894 // Repack into interlaced pixels.
895 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
896 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
897 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
898 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA
899 };
900
901 while (count >= 8) {
902 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
903 hi = _mm_loadu_si128((const __m128i*) (src + 4));
904
905 premul8(&lo, &hi);
906
907 _mm_storeu_si128((__m128i*) (dst + 0), lo);
908 _mm_storeu_si128((__m128i*) (dst + 4), hi);
909
910 src += 8;
911 dst += 8;
912 count -= 8;
913 }
914
915 if (count >= 4) {
916 __m128i lo = _mm_loadu_si128((const __m128i*) src),
917 hi = _mm_setzero_si128();
918
919 premul8(&lo, &hi);
920
921 _mm_storeu_si128((__m128i*) dst, lo);
922
923 src += 4;
924 dst += 4;
925 count -= 4;
926 }
927
928 // Call portable code to finish up the tail of [0,4) pixels.
929 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
930 proc(dst, src, count);
931}
932
933void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
934 premul_should_swapRB(false, dst, src, count);
935}
936
937void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
938 premul_should_swapRB(true, dst, src, count);
939}
940
941void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
942 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
943
944 while (count >= 4) {
945 __m128i rgba = _mm_loadu_si128((const __m128i*) src);
946 __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
947 _mm_storeu_si128((__m128i*) dst, bgra);
948
949 src += 4;
950 dst += 4;
951 count -= 4;
952 }
953
954 RGBA_to_BGRA_portable(dst, src, count);
955}
956
957void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
958 while (count >= 8) {
959 __m128i ga = _mm_loadu_si128((const __m128i*) src);
960
961 __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
962 _mm_slli_epi16(ga, 8));
963
964 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
965 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
966
967 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
968 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
969
970 src += 8*2;
971 dst += 8;
972 count -= 8;
973 }
974
975 grayA_to_RGBA_portable(dst, src, count);
976}
977
978void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
979 while (count >= 8) {
980 __m128i grayA = _mm_loadu_si128((const __m128i*) src);
981
982 __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
983 __m128i a0 = _mm_srli_epi16(grayA, 8);
984
985 // Premultiply
986 g0 = scale(g0, a0);
987
988 __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
989 __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
990
991
992 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
993 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
994
995 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
996 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
997
998 src += 8*2;
999 dst += 8;
1000 count -= 8;
1001 }
1002
1003 grayA_to_rgbA_portable(dst, src, count);
1004}
1005
1006enum Format { kRGB1, kBGR1 };
1007static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1008 auto convert8 = [=](__m128i* lo, __m128i* hi) {
1009 const __m128i zeros = _mm_setzero_si128();
1010 __m128i planar;
1011 if (kBGR1 == format) {
1012 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
1013 } else {
1014 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
1015 }
1016
1017 // Swizzle the pixels to 8-bit planar.
1018 *lo = _mm_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk
1019 *hi = _mm_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK
1020 __m128i cm = _mm_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM
1021 yk = _mm_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK
1022
1023 // Unpack to 16-bit planar.
1024 __m128i c = _mm_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_
1025 m = _mm_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_
1026 y = _mm_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_
1027 k = _mm_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_
1028
1029 // Scale to r, g, b.
1030 __m128i r = scale(c, k),
1031 g = scale(m, k),
1032 b = scale(y, k);
1033
1034 // Repack into interlaced pixels.
1035 __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)), // rgrgrgrg RGRGRGRG
1036 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
1037 *lo = _mm_unpacklo_epi16(rg, ba); // rgb1rgb1 rgb1rgb1
1038 *hi = _mm_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1
1039 };
1040
1041 while (count >= 8) {
1042 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
1043 hi = _mm_loadu_si128((const __m128i*) (src + 4));
1044
1045 convert8(&lo, &hi);
1046
1047 _mm_storeu_si128((__m128i*) (dst + 0), lo);
1048 _mm_storeu_si128((__m128i*) (dst + 4), hi);
1049
1050 src += 8;
1051 dst += 8;
1052 count -= 8;
1053 }
1054
1055 if (count >= 4) {
1056 __m128i lo = _mm_loadu_si128((const __m128i*) src),
1057 hi = _mm_setzero_si128();
1058
1059 convert8(&lo, &hi);
1060
1061 _mm_storeu_si128((__m128i*) dst, lo);
1062
1063 src += 4;
1064 dst += 4;
1065 count -= 4;
1066 }
1067
1068 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1069 proc(dst, src, count);
1070}
1071
1072void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1073 inverted_cmyk_to(kRGB1, dst, src, count);
1074}
1075
1076void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1077 inverted_cmyk_to(kBGR1, dst, src, count);
1078}
1079
1080void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1081 rgbA_to_RGBA_portable(dst, src, count);
1082}
1083
1084void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1085 rgbA_to_BGRA_portable(dst, src, count);
1086}
1087
1088#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
1089// -- LASX ----------------------------------------------------------------------------------------
1090
1091// Scale a byte by another.
1092// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
1093// (x+127)/255 == ((x+128)*257)>>16
1094SI __m256i scale(__m256i x, __m256i y) {
1095 const __m256i _128 = __lasx_xvreplgr2vr_h(128);
1096 const __m256i _257 = __lasx_xvreplgr2vr_h(257);
1097
1098 // (x+127)/255 == ((x+128)*257)>>16
1099 return __lasx_xvmuh_hu(__lasx_xvadd_h(__lasx_xvmul_h(x, y), _128), _257);
1100}
1101
1102static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
1103 auto premul8 = [=](__m256i* lo, __m256i* hi) {
1104 const __m256i zeros = __lasx_xvldi(0);
1105 __m256i planar = __lasx_xvldi(0);
1106 if (kSwapRB) {
1107 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0);
1108 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1);
1109 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2);
1110 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3);
1111 } else {
1112 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0);
1113 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1);
1114 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2);
1115 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3);
1116 }
1117
1118 // Swizzle the pixels to 8-bit planar.
1119 *lo = __lasx_xvshuf_b(zeros, *lo, planar); // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
1120 *hi = __lasx_xvshuf_b(zeros, *hi, planar); // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
1121 __m256i rg = __lasx_xvilvl_w(*hi, *lo), // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
1122 ba = __lasx_xvilvh_w(*hi, *lo); // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA
1123
1124 // Unpack to 16-bit planar.
1125 __m256i r = __lasx_xvilvl_b(zeros, rg), // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
1126 g = __lasx_xvilvh_b(zeros, rg), // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
1127 b = __lasx_xvilvl_b(zeros, ba), // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
1128 a = __lasx_xvilvh_b(zeros, ba); // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_
1129
1130 // Premultiply!
1131 r = scale(r, a);
1132 g = scale(g, a);
1133 b = scale(b, a);
1134
1135 // Repack into interlaced pixels.
1136 rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)); // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
1137 ba = __lasx_xvor_v(b, __lasx_xvslli_h(a, 8)); // babababa BABABABA babababa BABABABA
1138 *lo = __lasx_xvilvl_h(ba, rg); // rgbargba rgbargba rgbargba rgbargba
1139 *hi = __lasx_xvilvh_h(ba, rg); // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
1140 };
1141
1142 while (count >= 16) {
1143 __m256i lo = __lasx_xvld(src, 0),
1144 hi = __lasx_xvld(src, 32);
1145
1146 premul8(&lo, &hi);
1147
1148 __lasx_xvst(lo, dst, 0);
1149 __lasx_xvst(hi, dst, 32);
1150
1151 src += 16;
1152 dst += 16;
1153 count -= 16;
1154 }
1155
1156 if (count >= 8) {
1157 __m256i lo = __lasx_xvld(src, 0),
1158 hi = __lasx_xvldi(0);
1159
1160 premul8(&lo, &hi);
1161
1162 __lasx_xvst(lo, dst, 0);
1163
1164 src += 8;
1165 dst += 8;
1166 count -= 8;
1167 }
1168
1169 // Call portable code to finish up the tail of [0,8) pixels.
1170 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
1171 proc(dst, src, count);
1172}
1173
1174/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1175 premul_should_swapRB(false, dst, src, count);
1176}
1177
1178/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1179 premul_should_swapRB(true, dst, src, count);
1180}
1181
1182/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1183 while (count >= 8) {
1184 __m256i rgba = __lasx_xvld(src, 0);
1185 __m256i bgra = __lasx_xvshuf4i_b(rgba, 0xC6);
1186 __lasx_xvst(bgra, dst, 0);
1187
1188 src += 8;
1189 dst += 8;
1190 count -= 8;
1191 }
1192
1193 RGBA_to_BGRA_portable(dst, src, count);
1194}
1195
1196/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1197 while (count >= 16) {
1198 __m256i ga = __lasx_xvld(src, 0);
1199
1200 __m256i gg = __lasx_xvor_v(__lasx_xvand_v(ga, __lasx_xvreplgr2vr_h(0x00FF)),
1201 __lasx_xvslli_h(ga, 8));
1202
1203 __m256i ggga_lo = __lasx_xvilvl_h(ga, gg);
1204 __m256i ggga_hi = __lasx_xvilvh_h(ga, gg);
1205
1206 __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02), dst, 0);
1207 __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13), dst, 32);
1208
1209 src += 16*2;
1210 dst += 16;
1211 count -= 16;
1212 }
1213
1214 grayA_to_RGBA_portable(dst, src, count);
1215}
1216
1217/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1218 while (count >= 16) {
1219 __m256i grayA = __lasx_xvld(src, 0);
1220
1221 __m256i val = __lasx_xvreplgr2vr_h(0x00FF);
1222
1223 __m256i g0 = __lasx_xvand_v(grayA, val);
1224 __m256i a0 = __lasx_xvsrli_h(grayA, 8);
1225
1226 // Premultiply
1227 g0 = scale(g0, a0);
1228
1229 __m256i gg = __lasx_xvor_v(g0, __lasx_xvslli_h(g0, 8));
1230 __m256i ga = __lasx_xvor_v(g0, __lasx_xvslli_h(a0, 8));
1231
1232 __m256i ggga_lo = __lasx_xvilvl_h(ga, gg);
1233 __m256i ggga_hi = __lasx_xvilvh_h(ga, gg);
1234
1235 val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02);
1236 __lasx_xvst(val, dst, 0);
1237
1238 val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13);
1239 __lasx_xvst(val, dst, 32);
1240
1241 src += 16*2;
1242 dst += 16;
1243 count -= 16;
1244 }
1245
1246 grayA_to_rgbA_portable(dst, src, count);
1247}
1248
1249enum Format { kRGB1, kBGR1 };
1250static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1251 auto convert8 = [=](__m256i *lo, __m256i* hi) {
1252 const __m256i zeros = __lasx_xvldi(0);
1253 __m256i planar = __lasx_xvldi(0);
1254 if (kBGR1 == format) {
1255 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0);
1256 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1);
1257 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2);
1258 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3);
1259 } else {
1260 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0);
1261 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1);
1262 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2);
1263 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3);
1264 }
1265
1266 // Swizzle the pixels to 8-bit planar.
1267 *lo = __lasx_xvshuf_b(zeros, *lo, planar); // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
1268 *hi = __lasx_xvshuf_b(zeros, *hi, planar); // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
1269 __m256i cm = __lasx_xvilvl_w(*hi, *lo), // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
1270 yk = __lasx_xvilvh_w(*hi, *lo); // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK
1271
1272 // Unpack to 16-bit planar.
1273 __m256i c = __lasx_xvilvl_b(zeros, cm), // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
1274 m = __lasx_xvilvh_b(zeros, cm), // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
1275 y = __lasx_xvilvl_b(zeros, yk), // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
1276 k = __lasx_xvilvh_b(zeros, yk); // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_
1277
1278 // Scale to r, g, b.
1279 __m256i r = scale(c, k),
1280 g = scale(m, k),
1281 b = scale(y, k);
1282
1283 // Repack into interlaced pixels:
1284 // rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
1285 // ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
1286 __m256i rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)),
1287 ba = __lasx_xvor_v(b, __lasx_xvreplgr2vr_h(0xff00));
1288 *lo = __lasx_xvilvl_h(ba, rg); // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
1289 *hi = __lasx_xvilvh_h(ba, rg); // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
1290 };
1291
1292 while (count >= 16) {
1293 __m256i lo = __lasx_xvld(src, 0),
1294 hi = __lasx_xvld(src, 32);
1295
1296 convert8(&lo, &hi);
1297
1298 __lasx_xvst(lo, dst, 0);
1299 __lasx_xvst(hi, dst, 32);
1300
1301 src += 16;
1302 dst += 16;
1303 count -= 16;
1304 }
1305
1306 while (count >= 8) {
1307 __m256i lo = __lasx_xvld(src, 0),
1308 hi = __lasx_xvldi(0);
1309
1310 convert8(&lo, &hi);
1311
1312 __lasx_xvst(lo, dst, 0);
1313
1314 src += 8;
1315 dst += 8;
1316 count -= 8;
1317 }
1318
1319 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1320 proc(dst, src, count);
1321}
1322
1323/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1324 inverted_cmyk_to(kRGB1, dst, src, count);
1325}
1326
1327/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1328 inverted_cmyk_to(kBGR1, dst, src, count);
1329}
1330
1331/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1332 rgbA_to_RGBA_portable(dst, src, count);
1333}
1334
1335/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1336 rgbA_to_BGRA_portable(dst, src, count);
1337}
1338
1339#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1340// -- LSX -----------------------------------------------------------------------------------------
1341
1342// Scale a byte by another.
1343// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
1344SI __m128i scale(__m128i x, __m128i y) {
1345 const __m128i _128 = __lsx_vreplgr2vr_h(128);
1346 const __m128i _257 = __lsx_vreplgr2vr_h(257);
1347
1348 // (x+127)/255 == ((x+128)*257)>>16
1349 return __lsx_vmuh_hu(__lsx_vadd_h(__lsx_vmul_h(x, y), _128), _257);
1350}
1351
1352static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
1353
1354 auto premul8 = [=](__m128i *lo, __m128i *hi){
1355 const __m128i zeros = __lsx_vldi(0);
1356 __m128i planar = __lsx_vldi(0);
1357 if (kSwapRB) {
1358 planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
1359 planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
1360 } else {
1361 planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
1362 planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
1363 }
1364
1365 // Swizzle the pixels to 8-bit planar.
1366 *lo = __lsx_vshuf_b(zeros, *lo, planar); // rrrrgggg bbbbaaaa
1367 *hi = __lsx_vshuf_b(zeros, *hi, planar); // RRRRGGGG BBBBAAAA
1368 __m128i rg = __lsx_vilvl_w(*hi, *lo), // rrrrRRRR ggggGGGG
1369 ba = __lsx_vilvh_w(*hi, *lo); // bbbbBBBB aaaaAAAA
1370
1371 // Unpack to 16-bit planar.
1372 __m128i r = __lsx_vilvl_b(zeros, rg), // r_r_r_r_ R_R_R_R_
1373 g = __lsx_vilvh_b(zeros, rg), // g_g_g_g_ G_G_G_G_
1374 b = __lsx_vilvl_b(zeros, ba), // b_b_b_b_ B_B_B_B_
1375 a = __lsx_vilvh_b(zeros, ba); // a_a_a_a_ A_A_A_A_
1376
1377 // Premultiply!
1378 r = scale(r, a);
1379 g = scale(g, a);
1380 b = scale(b, a);
1381
1382 // Repack into interlaced pixels.
1383 rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)); // rgrgrgrg RGRGRGRG
1384 ba = __lsx_vor_v(b, __lsx_vslli_h(a, 8)); // babababa BABABABA
1385 *lo = __lsx_vilvl_h(ba, rg); // rgbargba rgbargba
1386 *hi = __lsx_vilvh_h(ba, rg); // RGBARGBA RGBARGBA
1387 };
1388 while (count >= 8) {
1389 __m128i lo = __lsx_vld(src ,0),
1390 hi = __lsx_vld(src ,16);
1391
1392 premul8(&lo, &hi);
1393
1394 __lsx_vst(lo, dst, 0);
1395 __lsx_vst(hi, dst, 16);
1396
1397 src += 8;
1398 dst += 8;
1399 count -= 8;
1400 }
1401
1402 if (count >= 4) {
1403 __m128i lo = __lsx_vld(src, 0),
1404 hi = __lsx_vldi(0);
1405
1406 premul8(&lo, &hi);
1407
1408 __lsx_vst(lo, dst, 0);
1409
1410 src += 4;
1411 dst += 4;
1412 count -= 4;
1413 }
1414
1415 // Call portable code to finish up the tail of [0,4) pixels.
1416 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
1417 proc(dst, src, count);
1418}
1419
1420/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1421 premul_should_swapRB(false, dst, src, count);
1422}
1423
1424/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1425 premul_should_swapRB(true, dst, src, count);
1426}
1427
1428/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1429 __m128i swapRB = __lsx_vldi(0);
1430 swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0704050603000102, 0);
1431 swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0f0c0d0e0b08090a, 1);
1432
1433 while (count >= 4) {
1434 __m128i rgba = __lsx_vld(src, 0);
1435 __m128i bgra = __lsx_vshuf4i_b(rgba, 0xC6);
1436 __lsx_vst(bgra, dst, 0);
1437
1438 src += 4;
1439 dst += 4;
1440 count -= 4;
1441 }
1442
1443 RGBA_to_BGRA_portable(dst, src, count);
1444}
1445
1446/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1447 while (count >= 8) {
1448 __m128i ga = __lsx_vld(src, 0);
1449
1450 __m128i gg = __lsx_vor_v(__lsx_vand_v(ga, __lsx_vreplgr2vr_h(0x00FF)),
1451 __lsx_vslli_h(ga, 8));
1452
1453 __m128i ggga_lo = __lsx_vilvl_h(ga, gg);
1454 __m128i ggga_hi = __lsx_vilvh_h(ga, gg);
1455
1456 __lsx_vst(ggga_lo, dst, 0);
1457 __lsx_vst(ggga_hi, dst, 16);
1458
1459 src += 8*2;
1460 dst += 8;
1461 count -= 8;
1462 }
1463
1464 grayA_to_RGBA_portable(dst, src, count);
1465}
1466
1467/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1468 while (count >= 8) {
1469 __m128i grayA = __lsx_vld(src, 0);
1470
1471 __m128i g0 = __lsx_vand_v(grayA, __lsx_vreplgr2vr_h(0x00FF));
1472 __m128i a0 = __lsx_vsrli_h(grayA, 8);
1473
1474 // Premultiply
1475 g0 = scale(g0, a0);
1476
1477 __m128i gg = __lsx_vor_v(g0, __lsx_vslli_h(g0, 8));
1478 __m128i ga = __lsx_vor_v(g0, __lsx_vslli_h(a0, 8));
1479
1480 __m128i ggga_lo = __lsx_vilvl_h(ga, gg);
1481 __m128i ggga_hi = __lsx_vilvh_h(ga, gg);
1482
1483 __lsx_vst(ggga_lo, dst, 0);
1484 __lsx_vst(ggga_hi, dst, 16);
1485
1486 src += 8*2;
1487 dst += 8;
1488 count -= 8;
1489 }
1490
1491 grayA_to_rgbA_portable(dst, src, count);
1492}
1493
1494enum Format { kRGB1, kBGR1 };
1495static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1496 auto convert8 = [=](__m128i *lo, __m128i* hi) {
1497 const __m128i zeros = __lsx_vldi(0);
1498 __m128i planar = __lsx_vldi(0);
1499 if (kBGR1 == format) {
1500 planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
1501 planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
1502 } else {
1503 planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
1504 planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
1505 }
1506
1507 // Swizzle the pixels to 8-bit planar.
1508 *lo = __lsx_vshuf_b(zeros, *lo, planar); // ccccmmmm yyyykkkk
1509 *hi = __lsx_vshuf_b(zeros, *hi, planar); // CCCCMMMM YYYYKKKK
1510 __m128i cm = __lsx_vilvl_w(*hi, *lo), // ccccCCCC mmmmMMMM
1511 yk = __lsx_vilvh_w(*hi, *lo); // yyyyYYYY kkkkKKKK
1512
1513 // Unpack to 16-bit planar.
1514 __m128i c = __lsx_vilvl_b(zeros, cm), // c_c_c_c_ C_C_C_C_
1515 m = __lsx_vilvh_b(zeros, cm), // m_m_m_m_ M_M_M_M_
1516 y = __lsx_vilvl_b(zeros, yk), // y_y_y_y_ Y_Y_Y_Y_
1517 k = __lsx_vilvh_b(zeros, yk); // k_k_k_k_ K_K_K_K_
1518
1519 // Scale to r, g, b.
1520 __m128i r = scale(c, k),
1521 g = scale(m, k),
1522 b = scale(y, k);
1523
1524 // Repack into interlaced pixels.
1525 // rgrgrgrg RGRGRGRG
1526 // b1b1b1b1 B1B1B1B1
1527 __m128i rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)),
1528 ba = __lsx_vor_v(b, __lsx_vreplgr2vr_h(0xff00));
1529 *lo = __lsx_vilvl_h(ba, rg); // rgb1rgb1 rgb1rgb1
1530 *hi = __lsx_vilvh_h(ba, rg); // RGB1RGB1 RGB1RGB1
1531 };
1532
1533 while (count >= 8) {
1534 __m128i lo = __lsx_vld(src, 0),
1535 hi = __lsx_vld(src, 16);
1536
1537 convert8(&lo, &hi);
1538
1539 __lsx_vst(lo, dst, 0);
1540 __lsx_vst(hi, dst, 16);
1541
1542 src += 8;
1543 dst += 8;
1544 count -= 8;
1545 }
1546
1547 if (count >= 4) {
1548 __m128i lo = __lsx_vld(src, 0),
1549 hi = __lsx_vldi(0);
1550
1551 convert8(&lo, &hi);
1552
1553 __lsx_vst(lo, dst, 0);
1554
1555 src += 4;
1556 dst += 4;
1557 count -= 4;
1558 }
1559
1560 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1561 proc(dst, src, count);
1562}
1563
1564/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1565 inverted_cmyk_to(kRGB1, dst, src, count);
1566}
1567
1568/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1569 inverted_cmyk_to(kBGR1, dst, src, count);
1570}
1571
1572/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1573 rgbA_to_RGBA_portable(dst, src, count);
1574}
1575
1576/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1577 rgbA_to_BGRA_portable(dst, src, count);
1578}
1579
1580#else
1581// -- No Opts --------------------------------------------------------------------------------------
1582
1583void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1584 rgbA_to_RGBA_portable(dst, src, count);
1585}
1586
1587void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1588 rgbA_to_BGRA_portable(dst, src, count);
1589}
1590
1591void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1592 RGBA_to_rgbA_portable(dst, src, count);
1593}
1594
1595void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1596 RGBA_to_bgrA_portable(dst, src, count);
1597}
1598
1599void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1600 RGBA_to_BGRA_portable(dst, src, count);
1601}
1602
1603void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1604 grayA_to_RGBA_portable(dst, src, count);
1605}
1606
1607void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1608 grayA_to_rgbA_portable(dst, src, count);
1609}
1610
1611void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1612 inverted_CMYK_to_RGB1_portable(dst, src, count);
1613}
1614
1615void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1616 inverted_CMYK_to_BGR1_portable(dst, src, count);
1617}
1618#endif
1619
1620// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1.
1621static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
1622 for (int i = 0; i < count; i++) {
1623 dst[i] = (uint32_t)0xFF << 24
1624 | (uint32_t)src[i] << 16
1625 | (uint32_t)src[i] << 8
1626 | (uint32_t)src[i] << 0;
1627 }
1628}
1629#if defined(SK_ARM_HAS_NEON)
1630 void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1631 while (count >= 16) {
1632 // Load 16 pixels.
1633 uint8x16_t gray = vld1q_u8(src);
1634
1635 // Set each of the color channels.
1636 uint8x16x4_t rgba;
1637 rgba.val[0] = gray;
1638 rgba.val[1] = gray;
1639 rgba.val[2] = gray;
1640 rgba.val[3] = vdupq_n_u8(0xFF);
1641
1642 // Store 16 pixels.
1643 vst4q_u8((uint8_t*) dst, rgba);
1644 src += 16;
1645 dst += 16;
1646 count -= 16;
1647 }
1648 if (count >= 8) {
1649 // Load 8 pixels.
1650 uint8x8_t gray = vld1_u8(src);
1651
1652 // Set each of the color channels.
1653 uint8x8x4_t rgba;
1654 rgba.val[0] = gray;
1655 rgba.val[1] = gray;
1656 rgba.val[2] = gray;
1657 rgba.val[3] = vdup_n_u8(0xFF);
1658
1659 // Store 8 pixels.
1660 vst4_u8((uint8_t*) dst, rgba);
1661 src += 8;
1662 dst += 8;
1663 count -= 8;
1664 }
1665 gray_to_RGB1_portable(dst, src, count);
1666 }
1667#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
1668 void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1669 const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF);
1670 while (count >= 32) {
1671 __m256i grays = _mm256_loadu_si256((const __m256i*) src);
1672
1673 __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays);
1674 __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays);
1675 __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas);
1676 __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas);
1677
1678 __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo);
1679 __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo);
1680 __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi);
1681 __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi);
1682
1683 // Shuffle for pixel reorder.
1684 // Note: 'p' stands for 'ggga'
1685 // Before shuffle:
1686 // ggga0 = p0 p1 p2 p3 | p16 p17 p18 p19
1687 // ggga1 = p4 p5 p6 p7 | p20 p21 p22 p23
1688 // ggga2 = p8 p9 p10 p11 | p24 p25 p26 p27
1689 // ggga3 = p12 p13 p14 p15 | p28 p29 p30 p31
1690 //
1691 // After shuffle:
1692 // ggga0_shuffle = p0 p1 p2 p3 | p4 p5 p6 p7
1693 // ggga1_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
1694 // ggga2_shuffle = p16 p17 p18 p19 | p20 p21 p22 p23
1695 // ggga3_shuffle = p24 p25 p26 p27 | p28 p29 p30 p31
1696 __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20),
1697 ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20),
1698 ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31),
1699 ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31);
1700
1701 _mm256_storeu_si256((__m256i*) (dst + 0), ggga0_shuffle);
1702 _mm256_storeu_si256((__m256i*) (dst + 8), ggga1_shuffle);
1703 _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle);
1704 _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle);
1705
1706 src += 32;
1707 dst += 32;
1708 count -= 32;
1709 }
1710 gray_to_RGB1_portable(dst, src, count);
1711 }
1712#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 // TODO: just check >= SSE2?
1713 void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1714 const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
1715 while (count >= 16) {
1716 __m128i grays = _mm_loadu_si128((const __m128i*) src);
1717
1718 __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
1719 __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
1720 __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
1721 __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
1722
1723 __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
1724 __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
1725 __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
1726 __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
1727
1728 _mm_storeu_si128((__m128i*) (dst + 0), ggga0);
1729 _mm_storeu_si128((__m128i*) (dst + 4), ggga1);
1730 _mm_storeu_si128((__m128i*) (dst + 8), ggga2);
1731 _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
1732
1733 src += 16;
1734 dst += 16;
1735 count -= 16;
1736 }
1737 gray_to_RGB1_portable(dst, src, count);
1738 }
1739#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
1740 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1741 const __m256i alphas = __lasx_xvreplgr2vr_b(0xFF);
1742 while (count >= 32) {
1743 __m256i grays = __lasx_xvld(src, 0);
1744
1745 __m256i gg_lo = __lasx_xvilvl_b(grays, grays);
1746 __m256i gg_hi = __lasx_xvilvh_b(grays, grays);
1747 __m256i ga_lo = __lasx_xvilvl_b(alphas, grays);
1748 __m256i ga_hi = __lasx_xvilvh_b(alphas, grays);
1749
1750 __m256i ggga0 = __lasx_xvilvl_h(ga_lo, gg_lo);
1751 __m256i ggga1 = __lasx_xvilvh_h(ga_lo, gg_lo);
1752 __m256i ggga2 = __lasx_xvilvl_h(ga_hi, gg_hi);
1753 __m256i ggga3 = __lasx_xvilvh_h(ga_hi, gg_hi);
1754
1755 __m256i ggga_0 = __lasx_xvpermi_q(ggga0, ggga1, 0x02);
1756 __m256i ggga_1 = __lasx_xvpermi_q(ggga2, ggga3, 0x02);
1757 __m256i ggga_2 = __lasx_xvpermi_q(ggga0, ggga1, 0x13);
1758 __m256i ggga_3 = __lasx_xvpermi_q(ggga2, ggga3, 0x13);
1759
1760 __lasx_xvst(ggga_0, dst, 0);
1761 __lasx_xvst(ggga_1, dst, 32);
1762 __lasx_xvst(ggga_2, dst, 64);
1763 __lasx_xvst(ggga_3, dst, 96);
1764
1765 src += 32;
1766 dst += 32;
1767 count -= 32;
1768 }
1769 gray_to_RGB1_portable(dst, src, count);
1770 }
1771#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1772 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1773 const __m128i alphas = __lsx_vreplgr2vr_b(0xFF);
1774 while (count >= 16) {
1775 __m128i grays = __lsx_vld(src, 0);
1776
1777 __m128i gg_lo = __lsx_vilvl_b(grays, grays);
1778 __m128i gg_hi = __lsx_vilvh_b(grays, grays);
1779 __m128i ga_lo = __lsx_vilvl_b(alphas, grays);
1780 __m128i ga_hi = __lsx_vilvh_b(alphas, grays);
1781
1782 __m128i ggga0 = __lsx_vilvl_h(ga_lo, gg_lo);
1783 __m128i ggga1 = __lsx_vilvh_h(ga_lo, gg_lo);
1784 __m128i ggga2 = __lsx_vilvl_h(ga_hi, gg_hi);
1785 __m128i ggga3 = __lsx_vilvh_h(ga_hi, gg_hi);
1786
1787 __lsx_vst(ggga0, dst, 0);
1788 __lsx_vst(ggga1, dst, 16);
1789 __lsx_vst(ggga2, dst, 32);
1790 __lsx_vst(ggga3, dst, 48);
1791
1792 src += 16;
1793 dst += 16;
1794 count -= 16;
1795 }
1796 gray_to_RGB1_portable(dst, src, count);
1797 }
1798#else
1799 void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1800 gray_to_RGB1_portable(dst, src, count);
1801 }
1802#endif
1803
1804// Again as above, but this time not even AVX2 showed a benefit for RGB_to_{RGB,BGR}1.
1805static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
1806 for (int i = 0; i < count; i++) {
1807 uint8_t r = src[0],
1808 g = src[1],
1809 b = src[2];
1810 src += 3;
1811 dst[i] = (uint32_t)0xFF << 24
1812 | (uint32_t)b << 16
1813 | (uint32_t)g << 8
1814 | (uint32_t)r << 0;
1815 }
1816}
1817static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
1818 for (int i = 0; i < count; i++) {
1819 uint8_t r = src[0],
1820 g = src[1],
1821 b = src[2];
1822 src += 3;
1823 dst[i] = (uint32_t)0xFF << 24
1824 | (uint32_t)r << 16
1825 | (uint32_t)g << 8
1826 | (uint32_t)b << 0;
1827 }
1828}
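// A tiny usage sketch (illustration only, not part of the upstream file) showing the
// packed little-endian words the two portable converters above produce for one RGB
// triple: the exact layouts every SIMD branch below must reproduce.
#if 0  // illustration only
static void rgb_to_32bit_example() {
    const uint8_t rgb[3] = {0x11, 0x22, 0x33};   // R, G, B
    uint32_t rgb1, bgr1;
    RGB_to_RGB1_portable(&rgb1, rgb, 1);
    RGB_to_BGR1_portable(&bgr1, rgb, 1);
    SkASSERT(rgb1 == 0xFF332211u);   // A=0xFF, B=0x33, G=0x22, R=0x11 (R in the low byte)
    SkASSERT(bgr1 == 0xFF112233u);   // A=0xFF, R=0x11, G=0x22, B=0x33 (B in the low byte)
}
#endif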
1829#if defined(SK_ARM_HAS_NEON)
1830 static void insert_alpha_should_swaprb(bool kSwapRB,
1831 uint32_t dst[], const uint8_t* src, int count) {
1832 while (count >= 16) {
1833 // Load 16 pixels.
1834 uint8x16x3_t rgb = vld3q_u8(src);
1835
1836 // Insert an opaque alpha channel and swap if needed.
1837 uint8x16x4_t rgba;
1838 if (kSwapRB) {
1839 rgba.val[0] = rgb.val[2];
1840 rgba.val[2] = rgb.val[0];
1841 } else {
1842 rgba.val[0] = rgb.val[0];
1843 rgba.val[2] = rgb.val[2];
1844 }
1845 rgba.val[1] = rgb.val[1];
1846 rgba.val[3] = vdupq_n_u8(0xFF);
1847
1848 // Store 16 pixels.
1849 vst4q_u8((uint8_t*) dst, rgba);
1850 src += 16*3;
1851 dst += 16;
1852 count -= 16;
1853 }
1854
1855 if (count >= 8) {
1856 // Load 8 pixels.
1857 uint8x8x3_t rgb = vld3_u8(src);
1858
1859 // Insert an opaque alpha channel and swap if needed.
1860 uint8x8x4_t rgba;
1861 if (kSwapRB) {
1862 rgba.val[0] = rgb.val[2];
1863 rgba.val[2] = rgb.val[0];
1864 } else {
1865 rgba.val[0] = rgb.val[0];
1866 rgba.val[2] = rgb.val[2];
1867 }
1868 rgba.val[1] = rgb.val[1];
1869 rgba.val[3] = vdup_n_u8(0xFF);
1870
1871 // Store 8 pixels.
1872 vst4_u8((uint8_t*) dst, rgba);
1873 src += 8*3;
1874 dst += 8;
1875 count -= 8;
1876 }
1877
1878 // Call portable code to finish up the tail of [0,8) pixels.
1879 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
1880 proc(dst, src, count);
1881 }
1882
1883 void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1884 insert_alpha_should_swaprb(false, dst, src, count);
1885 }
1886 void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1887 insert_alpha_should_swaprb(true, dst, src, count);
1888 }
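// A scalar sketch (illustration only, not part of the upstream file) of what the
// vld3q/vst4q pairs above do per pixel: de-interleave the three source planes,
// optionally swap R and B, append an opaque alpha, and re-interleave four planes.
// The function name is hypothetical.
#if 0  // illustration only
static void insert_alpha_scalar_model(bool kSwapRB,
                                      uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[3*i + 0], g = src[3*i + 1], b = src[3*i + 2];
        if (kSwapRB) {
            std::swap(r, b);
        }
        uint8_t* out = (uint8_t*)(dst + i);
        out[0] = r; out[1] = g; out[2] = b; out[3] = 0xFF;  // byte order vst4q_u8 writes
    }
}
#endif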
1889#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
1890 static void insert_alpha_should_swaprb(bool kSwapRB,
1891 uint32_t dst[], const uint8_t* src, int count) {
1892 const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
1893 __m128i expand;
1894 const uint8_t X = 0xFF; // Used as a placeholder. The value of X is irrelevant.
1895 if (kSwapRB) {
1896 expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
1897 } else {
1898 expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
1899 }
1900
1901 while (count >= 6) {
1902 // Load a vector. While this actually contains 5 pixels plus an
1903 // extra component, we will discard all but the first four pixels on
1904 // this iteration.
1905 __m128i rgb = _mm_loadu_si128((const __m128i*) src);
1906
1907 // Expand the first four pixels to RGBX and then mask to RGB(FF).
1908 __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
1909
1910 // Store 4 pixels.
1911 _mm_storeu_si128((__m128i*) dst, rgba);
1912
1913 src += 4*3;
1914 dst += 4;
1915 count -= 4;
1916 }
1917
1918 // Call portable code to finish up the tail of [0,6) pixels.
1919 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
1920 proc(dst, src, count);
1921 }
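// A small worked bound (illustration only, not part of the upstream file): each
// unaligned 16-byte load above spans ceil(16/3) = 6 packed RGB pixels (5 whole
// pixels plus one byte of a sixth), which is why the loop requires count >= 6
// even though it only consumes 4 pixels per iteration.
static_assert((16 + 3 - 1) / 3 == 6,
              "a 16-byte load touches up to 6 packed RGB pixels");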
1922
1923 void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1924 insert_alpha_should_swaprb(false, dst, src, count);
1925 }
1926 void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1927 insert_alpha_should_swaprb(true, dst, src, count);
1928 }
1929#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
1930 static void insert_alpha_should_swaprb(bool kSwapRB,
1931 uint32_t dst[], const uint8_t* src, int count) {
1932 const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xFF000000);
1933
1934 __m256i expand = __lasx_xvldi(0);
1935 if (kSwapRB) {
1936 expand = __lasx_xvinsgr2vr_d(expand, 0x0503040502000102, 0);
1937 expand = __lasx_xvinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
1938 expand = __lasx_xvinsgr2vr_d(expand, 0x110f10110e0c0d0e, 2);
1939 expand = __lasx_xvinsgr2vr_d(expand, 0x1715161714121314, 3);
1940 } else {
1941 expand = __lasx_xvinsgr2vr_d(expand, 0x0505040302020100, 0);
1942 expand = __lasx_xvinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
1943 expand = __lasx_xvinsgr2vr_d(expand, 0x1111100f0e0e0d0c, 2);
1944 expand = __lasx_xvinsgr2vr_d(expand, 0x1717161514141312, 3);
1945 }
1946
1947 while (count >= 8) {
1948 // Load a vector. While this actually contains 10 pixels plus two
1949 // extra components, we will discard all but the first eight pixels on
1950 // this iteration.
1951 __m256i rgb = __lasx_xvld(src, 0);
1952 __m256i rgb_l = __lasx_xvpermi_d(rgb, 0x44);
1953 __m256i rgb_h = __lasx_xvpermi_d(rgb, 0xEE);
1954
1955 // Expand the first eight pixels to RGBX and then mask to RGB(FF).
1956 __m256i rgba = __lasx_xvor_v(__lasx_xvshuf_b(rgb_h, rgb_l, expand), alphaMask);
1957
1958 // Store 8 pixels.
1959 __lasx_xvst(rgba, dst, 0);
1960
1961 src += 4*6;
1962 dst += 8;
1963 count -= 8;
1964 }
1965
1966 // Call portable code to finish up the tail of [0,8) pixels.
1967 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
1968 proc(dst, src, count);
1969 }
1970 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1971 insert_alpha_should_swaprb(false, dst, src, count);
1972 }
1973 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1974 insert_alpha_should_swaprb(true, dst, src, count);
1975 }
1976#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1977 static void insert_alpha_should_swaprb(bool kSwapRB,
1978 uint32_t dst[], const uint8_t* src, int count) {
1979 const __m128i alphaMask = __lsx_vreplgr2vr_w(0xFF000000);
1980
1981 __m128i expand = __lsx_vldi(0);
1982 if (kSwapRB) {
1983 expand = __lsx_vinsgr2vr_d(expand, 0x0503040502000102, 0);
1984 expand = __lsx_vinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
1985 } else {
1986 expand = __lsx_vinsgr2vr_d(expand, 0x0505040302020100, 0);
1987 expand = __lsx_vinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
1988 }
1989
1990 while (count >= 6) {
1991 // Load a vector. While this actually contains 5 pixels plus an
1992 // extra component, we will discard all but the first four pixels on
1993 // this iteration.
1994 __m128i rgb = __lsx_vld(src, 0);
1995
1996 // Expand the first four pixels to RGBX and then mask to RGB(FF).
1997 __m128i rgba = __lsx_vor_v(__lsx_vshuf_b(rgb, rgb, expand), alphaMask);
1998
1999 // Store 4 pixels.
2000 __lsx_vst(rgba, dst, 0);
2001
2002 src += 4*3;
2003 dst += 4;
2004 count -= 4;
2005 }
2006
2007 // Call portable code to finish up the tail of [0,6) pixels.
2008 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
2009 proc(dst, src, count);
2010 }
2011 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
2012 insert_alpha_should_swaprb(false, dst, src, count);
2013 }
2014 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
2015 insert_alpha_should_swaprb(true, dst, src, count);
2016 }
2017#else
2018 void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
2019 RGB_to_RGB1_portable(dst, src, count);
2020 }
2021 void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
2022 RGB_to_BGR1_portable(dst, src, count);
2023 }
2024#endif
2025
2026} // namespace SK_OPTS_NS
2027
2028#undef SI