#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    #include <lasxintrin.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>
#endif
#if defined(__clang__) || defined(__GNUC__)
    #define SI __attribute__((always_inline)) static inline
#else
    #define SI static inline
#endif
#if defined(SK_USE_FAST_UNPREMUL_324099025)
constexpr bool kFastUnpremul = true;
#else
constexpr bool kFastUnpremul = false;
#endif
SI float reciprocal_alpha_times_255_portable(float a) {
    return a != 0 ? 255.0f / a : 0.0f;
}
SI float reciprocal_alpha_portable(float a) {
    return a != 0 ? 1.0f / a : 0.0f;
}
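// For example (portable path): a premultiplied channel c = 100 with a = 200 unpremultiplies
// as c * (255.0f / 200) = 127.5, while a == 0 yields a reciprocal of 0, so every channel of
// a fully transparent pixel stays 0 instead of dividing by zero.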
#if defined(SK_ARM_HAS_NEON)
SI float reciprocal_alpha_times_255(float a) {
    return reciprocal_alpha_times_255_portable(a);
}

SI float reciprocal_alpha(float a) {
    return reciprocal_alpha_portable(a);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 && (defined(__clang__) || !defined(_MSC_VER))
SI float reciprocal_alpha_times_255(float a) {
    // F4 and vA are not shown in this excerpt; a 4-lane float vector (Clang/GCC vector
    // extension) seeded with `a` is assumed.
    using F4 = float __attribute__((vector_size(16)));
    F4 vA{a, a, a, a};
    auto q = F4{255.0f} / vA;
    return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
}

SI float reciprocal_alpha(float a) {
    using F4 = float __attribute__((vector_size(16)));
    F4 vA{a, a, a, a};
    auto q = F4{1.0f} / vA;
    return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
}
#else
SI float reciprocal_alpha_times_255(float a) {
    return reciprocal_alpha_times_255_portable(a);
}

SI float reciprocal_alpha(float a) {
    return reciprocal_alpha_portable(a);
}
#endif
// RGBA -> rgbA: premultiply each color channel by alpha (lower-case letters denote
// premultiplied channels).
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
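// The (x*a + 127) / 255 form rounds the premultiply to the nearest integer; e.g. with
// b = 128 and a = 128, (128*128 + 127) / 255 = 64, matching 128 * 128/255 = 64.25 rounded.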
// Round a float the way the Raster Pipeline would: to nearest on hardware that supports
// it, otherwise by adding 0.5 and truncating.
SI uint32_t pixel_round_as_RP(float n) {
#if defined(SK_ARM_HAS_NEON) && defined(SK_CPU_ARM64)
    return vrndns_f32(n);
#elif defined(SK_ARM_HAS_NEON) && !defined(SK_CPU_ARM64)
    float32x4_t vN{n + 0.5f};
    return vcvtq_u32_f32(vN)[0];
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 && (defined(__clang__) || !defined(_MSC_VER))
    return _mm_cvtps_epi32(__m128{n})[0];
#else
    return (uint32_t)(n + 0.5f);
#endif
}
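// Note the paths differ on exact ties: vrndns_f32 and _mm_cvtps_epi32 (default rounding
// mode) round to nearest even, e.g. 2.5f -> 2, while the +0.5f fallbacks give 3.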
SI uint32_t unpremul_quick(float reciprocalA, float c) {
    return (uint32_t)std::min(255.0f, (c * reciprocalA + 0.5f));
}
// Unpremultiply c while simulating the normalize / scale round trip the Raster Pipeline
// performs on float colors.
SI uint32_t unpremul_simulating_RP(float reciprocalA, float c) {
    const float normalizedC = c * (1.0f / 255.0f);
    const float answer = std::min(255.0f, normalizedC * reciprocalA * 255.0f);
    return pixel_round_as_RP(answer);
}
// Unpremultiply three channels (given as floats) and repack them with alpha.
SI uint32_t rgbA_to_CCCA(float c00, float c08, float c16, float a) {
    if constexpr (kFastUnpremul) {
        const float reciprocalA = reciprocal_alpha_times_255(a);
        auto unpremul = [reciprocalA](float c) {
            return unpremul_quick(reciprocalA, c);
        };
        return (uint32_t) a << 24
               | unpremul(c16) << 16
               | unpremul(c08) <<  8
               | unpremul(c00) <<  0;
    } else {
        const float normalizedA = a * (1.0f / 255.0f);
        const float reciprocalA = reciprocal_alpha(normalizedA);
        auto unpremul = [reciprocalA](float c) {
            return unpremul_simulating_RP(reciprocalA, c);
        };
        return (uint32_t) a << 24
               | unpremul(c16) << 16
               | unpremul(c08) <<  8
               | unpremul(c00) <<  0;
    }
}
static void rgbA_to_RGBA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t p = src[i];
        const float a = (p >> 24) & 0xFF,
                    b = (p >> 16) & 0xFF,
                    g = (p >>  8) & 0xFF,
                    r = (p >>  0) & 0xFF;
        dst[i] = rgbA_to_CCCA(r, g, b, a);
    }
}
static void rgbA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t p = src[i];
        const uint32_t a = (p >> 24) & 0xFF,
                       b = (p >> 16) & 0xFF,
                       g = (p >>  8) & 0xFF,
                       r = (p >>  0) & 0xFF;
        dst[i] = rgbA_to_CCCA(b, g, r, a);
    }
}
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}
static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        // Inverted CMYK stores 255 - C (etc.), so each channel is just scaled by K.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) b  << 16
               | (uint32_t) g  <<  8
               | (uint32_t) r  <<  0;
    }
}
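// Sanity check of the inverted-CMYK math: stored 255 means "no ink", so c = k = 255 gives
// r = (255*255 + 127) / 255 = 255 (white), while k = 0 (solid black) forces every channel
// to (c*0 + 127) / 255 = 0.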
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) r  << 16
               | (uint32_t) g  <<  8
               | (uint32_t) b  <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)

// Divide a 16-bit product by 255 with rounding to nearest.
SI uint8x8_t div255_round(uint16x8_t x) {
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

// scale(x, y) = x * y / 255, rounded.
SI uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}
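// vrshrq_n_u16(x, 8) is (x + 128) >> 8, and vraddhn_u16 adds with rounding and keeps the
// high byte, so div255_round(x) computes ((x + 128) + ((x + 128) >> 8)) >> 8, which equals
// (x + 127) / 255 -- i.e. x / 255 rounded to nearest -- for any product of two bytes.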
static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels in planar form and premultiply each color channel by alpha.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
        uint8x8_t a = rgba.val[3],
                  b = scale(rgba.val[2], a),
                  g = scale(rgba.val[1], a),
                  r = scale(rgba.val[0], a);
        // Repack, swapping R and B when requested, and store 8 pixels.
        rgba.val[0] = kSwapRB ? b : r;
        rgba.val[1] = g;
        rgba.val[2] = kSwapRB ? r : b;
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }
    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
    // RGBA_to_BGRA (NEON): swap R and B, 16 pixels at a time and then 8,
    // with RGBA_to_BGRA_portable finishing the tail.
    while (count >= 16) {
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
        uint8x16_t tmp = rgba.val[0];
        rgba.val[0] = rgba.val[2];
        rgba.val[2] = tmp;
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }
    if (count >= 8) {
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
        // ...
    }
static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 gray/alpha pairs.
        uint8x16x2_t ga = vld2q_u8(src);
        // Premultiply the gray channel by alpha when requested.
        if (kPremul) {
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }
        // Replicate gray into R, G and B, keeping alpha.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }
    while (count >= 8) {
        uint8x8x2_t ga = vld2_u8(src);
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }
    // Portable code for the remaining pixels.
    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}
    // inverted_cmyk_to (NEON): convert 8 inverted-CMYK pixels at a time; `format`
    // selects BGR1 versus RGB1 output, and the portable code finishes the tail.
    while (count >= 8) {
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];
        // Scale each channel by K; alpha becomes opaque.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);
        if (kBGR1 == format) {
            pixels.val[0] = b;
            pixels.val[1] = g;
            pixels.val[2] = r;
            pixels.val[3] = vdup_n_u8(0xFF);
        } else {
            pixels.val[0] = r;
            pixels.val[1] = g;
            pixels.val[2] = b;
            pixels.val[3] = vdup_n_u8(0xFF);
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }
    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
template <bool swapRB>
static void common_rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
    // When kFastUnpremul is set, the portable code below handles everything.
    if constexpr (!kFastUnpremul) {
        while (count >= 8) {
            const uint8x8x4_t in = vld4_u8((const uint8_t*)src);

            auto round = [](float32x4_t v) -> uint32x4_t {
            #if defined(SK_CPU_ARM64)
                return vcvtnq_u32_f32(v);
            #else
                return vcvtq_u32_f32(v + 0.5f);
            #endif
            };

            static constexpr float kN = 1.0f / 255.0f;
            auto toNormalized = [](uint16x4_t v) -> float32x4_t {
                return vcvtq_f32_u32(vmovl_u16(v)) * kN;
            };

            auto unpremulHalf =
                    [toNormalized, round](float32x4_t invA, uint16x4_t v) -> uint16x4_t {
                const float32x4_t normalizedV  = toNormalized(v);
                const float32x4_t divided      = invA * normalizedV;
                const float32x4_t denormalized = divided * 255.0f;
                const uint32x4_t  rounded      = round(denormalized);
                return vqmovn_u32(rounded);
            };

            // 1.0 / alpha, with 0 alpha mapped to 0 (matching reciprocal_alpha).
            auto reciprocal = [](float32x4_t a) -> float32x4_t {
                uint32x4_t mask = sk_bit_cast<uint32x4_t>(a != float32x4_t{0, 0, 0, 0});
                auto recip = 1.0f / a;
                return sk_bit_cast<float32x4_t>(mask & sk_bit_cast<uint32x4_t>(recip));
            };

            const uint8x8_t a = in.val[3];
            const uint16x8_t intA = vmovl_u8(a);
            const float32x4_t invALow  = reciprocal(toNormalized(vget_low_u16(intA)));
            const float32x4_t invAHigh = reciprocal(toNormalized(vget_high_u16(intA)));

            auto unpremul = [unpremulHalf, invALow, invAHigh](uint8x8_t v) -> uint8x8_t {
                const uint16x8_t to16 = vmovl_u8(v);
                const uint16x4_t low  = unpremulHalf(invALow,  vget_low_u16(to16));
                const uint16x4_t high = unpremulHalf(invAHigh, vget_high_u16(to16));
                const uint16x8_t combined = vcombine_u16(low, high);
                return vqmovn_u16(combined);
            };

            const uint8x8_t b = unpremul(in.val[2]);
            const uint8x8_t g = unpremul(in.val[1]);
            const uint8x8_t r = unpremul(in.val[0]);

            if constexpr (swapRB) {
                const uint8x8x4_t out{b, g, r, a};
                vst4_u8((uint8_t*)dst, out);
            } else {
                const uint8x8x4_t out{r, g, b, a};
                vst4_u8((uint8_t*)dst, out);
            }

            src += 8;
            dst += 8;
            count -= 8;
        }
    }

    // Finish the tail (or do everything, when kFastUnpremul) with the portable code.
    if constexpr (swapRB) {
        rgbA_to_BGRA_portable(dst, src, count);
    } else {
        rgbA_to_RGBA_portable(dst, src, count);
    }
}
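// The round / toNormalized / reciprocal lambdas above are the four-lane counterparts of
// pixel_round_as_RP, the 1/255 normalization in unpremul_simulating_RP, and
// reciprocal_alpha, so the NEON path should produce the same bytes as the portable code.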
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

// Scale a byte by another byte: x * y / 255 rounded, in 16-bit lanes.
static __m256i scale(__m256i x, __m256i y) {
    const __m256i _128 = _mm256_set1_epi16(128);
    const __m256i _257 = _mm256_set1_epi16(257);
    return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
}
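// Why this works: with v = x*y + 128, mulhi(v, 257) is (v * 257) >> 16 = (v*256 + v) >> 16,
// which for these v equals ((v + (v >> 8)) >> 8) -- the usual rounded divide by 255.
// For x, y in [0, 255] the result is exactly (x*y + 127) / 255.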
static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    // Premultiply 8 pixels in each of *lo and *hi.
    auto premul8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kSwapRB) {
            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar (rrrr gggg bbbb aaaa per lane).
        *lo = _mm256_shuffle_epi8(*lo, planar);
        *hi = _mm256_shuffle_epi8(*hi, planar);
        __m256i rg = _mm256_unpacklo_epi32(*lo, *hi),
                ba = _mm256_unpackhi_epi32(*lo, *hi);

        // Unpack to 16-bit planar.
        __m256i r = _mm256_unpacklo_epi8(rg, zeros),
                g = _mm256_unpackhi_epi8(rg, zeros),
                b = _mm256_unpacklo_epi8(ba, zeros),
                a = _mm256_unpackhi_epi8(ba, zeros);

        // Premultiply.
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8));
        ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8));
        *lo = _mm256_unpacklo_epi16(rg, ba);
        *hi = _mm256_unpackhi_epi16(rg, ba);
    };
    while (count >= 16) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
                hi = _mm256_loadu_si256((const __m256i*) (src + 8));
        premul8(&lo, &hi);
        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
        _mm256_storeu_si256((__m256i*) (dst + 8), hi);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
                hi = _mm256_setzero_si256();
        premul8(&lo, &hi);
        _mm256_storeu_si256((__m256i*) dst, lo);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
    // RGBA_to_BGRA (AVX2): swap R and B with a byte shuffle, 8 pixels per vector,
    // leaving the tail to RGBA_to_BGRA_portable.
    const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                                            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
    while (count >= 8) {
        __m256i rgba = _mm256_loadu_si256((const __m256i*) src);
        __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB);
        _mm256_storeu_si256((__m256i*) dst, bgra);
        src += 8;
        dst += 8;
        count -= 8;
    }
    // ...
    // grayA_to_RGBA (AVX2): expand 16 gray/alpha pairs per iteration.
    while (count >= 16) {
        __m256i ga = _mm256_loadu_si256((const __m256i*) src);

        // Duplicate the gray byte into both bytes of each 16-bit lane.
        __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)),
                                     _mm256_slli_epi16(ga, 8));

        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);

        // Fix the 128-bit lane ordering introduced by the unpacks.
        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);

        _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle);
        _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle);

        src += 16*2;
        dst += 16;
        count -= 16;
    }
    // ...
    // grayA_to_rgbA (AVX2): same expansion, but premultiply gray by alpha first.
    while (count >= 16) {
        __m256i grayA = _mm256_loadu_si256((const __m256i*) src);

        __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF));
        __m256i a0 = _mm256_srli_epi16(grayA, 8);

        g0 = scale(g0, a0);

        __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8));
        __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8));

        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);

        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);

        _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle);
        _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle);

        src += 16*2;
        dst += 16;
        count -= 16;
    }
    // ...
    // inverted_cmyk_to (AVX2): `format` selects RGB1 vs BGR1.
    auto convert8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kBGR1 == format) {
            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle to 8-bit planar, then unpack to 16-bit planar.
        *lo = _mm256_shuffle_epi8(*lo, planar);
        *hi = _mm256_shuffle_epi8(*hi, planar);
        __m256i cm = _mm256_unpacklo_epi32(*lo, *hi),
                yk = _mm256_unpackhi_epi32(*lo, *hi);

        __m256i c = _mm256_unpacklo_epi8(cm, zeros),
                m = _mm256_unpackhi_epi8(cm, zeros),
                y = _mm256_unpacklo_epi8(yk, zeros),
                k = _mm256_unpackhi_epi8(yk, zeros);

        // Scale by K; alpha becomes opaque.
        __m256i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)),
                ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00));
        *lo = _mm256_unpacklo_epi16(rg, ba);
        *hi = _mm256_unpackhi_epi16(rg, ba);
    };

    while (count >= 16) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
                hi = _mm256_loadu_si256((const __m256i*) (src + 8));
        convert8(&lo, &hi);
        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
        _mm256_storeu_si256((__m256i*) (dst + 8), hi);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
                hi = _mm256_setzero_si256();
        convert8(&lo, &hi);
        _mm256_storeu_si256((__m256i*) dst, lo);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another byte: x * y / 255 rounded, in 16-bit lanes.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}
static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    // Premultiply 4 pixels in each of *lo and *hi (8 total).
    auto premul8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar, then unpack to 16-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);
        *hi = _mm_shuffle_epi8(*hi, planar);
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),
                ba = _mm_unpackhi_epi32(*lo, *hi);

        __m128i r = _mm_unpacklo_epi8(rg, zeros),
                g = _mm_unpackhi_epi8(rg, zeros),
                b = _mm_unpacklo_epi8(ba, zeros),
                a = _mm_unpackhi_epi8(ba, zeros);

        // Premultiply.
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));
        *lo = _mm_unpacklo_epi16(rg, ba);
        *hi = _mm_unpackhi_epi16(rg, ba);
    };
    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));
        premul8(&lo, &hi);
        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);
        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();
        premul8(&lo, &hi);
        _mm_storeu_si128((__m128i*) dst, lo);
        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
    // RGBA_to_BGRA (SSSE3): swap R and B with a byte shuffle, 4 pixels per vector.
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);
        src += 4;
        dst += 4;
        count -= 4;
    }
    // ...
    // grayA_to_RGBA (SSSE3): expand 8 gray/alpha pairs per iteration.
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }
    // ...
    // grayA_to_rgbA (SSSE3): same expansion, premultiplying gray by alpha first.
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }
    // ...
    // inverted_cmyk_to (SSSE3): `format` selects RGB1 vs BGR1.
    auto convert8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle to 8-bit planar, then unpack to 16-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);
        *hi = _mm_shuffle_epi8(*hi, planar);
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),
                yk = _mm_unpackhi_epi32(*lo, *hi);

        __m128i c = _mm_unpacklo_epi8(cm, zeros),
                m = _mm_unpackhi_epi8(cm, zeros),
                y = _mm_unpacklo_epi8(yk, zeros),
                k = _mm_unpackhi_epi8(yk, zeros);

        // Scale by K; alpha becomes opaque.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));
        *lo = _mm_unpacklo_epi16(rg, ba);
        *hi = _mm_unpackhi_epi16(rg, ba);
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));
        convert8(&lo, &hi);
        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);
        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();
        convert8(&lo, &hi);
        _mm_storeu_si128((__m128i*) dst, lo);
        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

// Scale a byte by another byte: x * y / 255 rounded, in 16-bit lanes.
static __m256i scale(__m256i x, __m256i y) {
    const __m256i _128 = __lasx_xvreplgr2vr_h(128);
    const __m256i _257 = __lasx_xvreplgr2vr_h(257);
    return __lasx_xvmuh_hu(__lasx_xvadd_h(__lasx_xvmul_h(x, y), _128), _257);
}
static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    auto premul8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = __lasx_xvldi(0);
        __m256i planar = __lasx_xvldi(0);
        if (kSwapRB) {
            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602, 2);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400, 3);
        } else {
            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400, 0);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400, 2);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602, 3);
        }

        // Swizzle the pixels to 8-bit planar, then unpack to 16-bit planar.
        *lo = __lasx_xvshuf_b(zeros, *lo, planar);
        *hi = __lasx_xvshuf_b(zeros, *hi, planar);
        __m256i rg = __lasx_xvilvl_w(*hi, *lo),
                ba = __lasx_xvilvh_w(*hi, *lo);

        __m256i r = __lasx_xvilvl_b(zeros, rg),
                g = __lasx_xvilvh_b(zeros, rg),
                b = __lasx_xvilvl_b(zeros, ba),
                a = __lasx_xvilvh_b(zeros, ba);

        // Premultiply.
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8));
        ba = __lasx_xvor_v(b, __lasx_xvslli_h(a, 8));
        *lo = __lasx_xvilvl_h(ba, rg);
        *hi = __lasx_xvilvh_h(ba, rg);
    };
    while (count >= 16) {
        __m256i lo = __lasx_xvld(src, 0),
                hi = __lasx_xvld(src, 32);
        premul8(&lo, &hi);
        __lasx_xvst(lo, dst, 0);
        __lasx_xvst(hi, dst, 32);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = __lasx_xvld(src, 0),
                hi = __lasx_xvldi(0);
        premul8(&lo, &hi);
        __lasx_xvst(lo, dst, 0);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
    // RGBA_to_BGRA (LASX): swap R and B within each pixel, 8 pixels per vector.
    while (count >= 8) {
        __m256i rgba = __lasx_xvld(src, 0);
        __m256i bgra = __lasx_xvshuf4i_b(rgba, 0xC6);
        __lasx_xvst(bgra, dst, 0);
        src += 8;
        dst += 8;
        count -= 8;
    }
    // ...
    // grayA_to_RGBA (LASX): expand 16 gray/alpha pairs per iteration.
    while (count >= 16) {
        __m256i ga = __lasx_xvld(src, 0);

        __m256i gg = __lasx_xvor_v(__lasx_xvand_v(ga, __lasx_xvreplgr2vr_h(0x00FF)),
                                   __lasx_xvslli_h(ga, 8));

        __m256i ggga_lo = __lasx_xvilvl_h(ga, gg);
        __m256i ggga_hi = __lasx_xvilvh_h(ga, gg);

        __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02), dst, 0);
        __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13), dst, 32);

        src += 16*2;
        dst += 16;
        count -= 16;
    }
    // ...
    // grayA_to_rgbA (LASX): same expansion, premultiplying gray by alpha first.
    while (count >= 16) {
        __m256i grayA = __lasx_xvld(src, 0);

        __m256i val = __lasx_xvreplgr2vr_h(0x00FF);
        __m256i g0 = __lasx_xvand_v(grayA, val);
        __m256i a0 = __lasx_xvsrli_h(grayA, 8);

        g0 = scale(g0, a0);

        __m256i gg = __lasx_xvor_v(g0, __lasx_xvslli_h(g0, 8));
        __m256i ga = __lasx_xvor_v(g0, __lasx_xvslli_h(a0, 8));

        __m256i ggga_lo = __lasx_xvilvl_h(ga, gg);
        __m256i ggga_hi = __lasx_xvilvh_h(ga, gg);

        val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02);
        __lasx_xvst(val, dst, 0);

        val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13);
        __lasx_xvst(val, dst, 32);

        src += 16*2;
        dst += 16;
        count -= 16;
    }
    // ...
    // inverted_cmyk_to (LASX): `format` selects RGB1 vs BGR1.
    auto convert8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = __lasx_xvldi(0);
        __m256i planar = __lasx_xvldi(0);
        if (kBGR1 == format) {
            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602, 2);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400, 3);
        } else {
            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400, 0);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400, 2);
            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602, 3);
        }

        // Swizzle to 8-bit planar, then unpack to 16-bit planar.
        *lo = __lasx_xvshuf_b(zeros, *lo, planar);
        *hi = __lasx_xvshuf_b(zeros, *hi, planar);
        __m256i cm = __lasx_xvilvl_w(*hi, *lo),
                yk = __lasx_xvilvh_w(*hi, *lo);

        __m256i c = __lasx_xvilvl_b(zeros, cm),
                m = __lasx_xvilvh_b(zeros, cm),
                y = __lasx_xvilvl_b(zeros, yk),
                k = __lasx_xvilvh_b(zeros, yk);

        // Scale by K; alpha becomes opaque.
        __m256i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        __m256i rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)),
                ba = __lasx_xvor_v(b, __lasx_xvreplgr2vr_h(0xff00));
        *lo = __lasx_xvilvl_h(ba, rg);
        *hi = __lasx_xvilvh_h(ba, rg);
    };

    while (count >= 16) {
        __m256i lo = __lasx_xvld(src, 0),
                hi = __lasx_xvld(src, 32);
        convert8(&lo, &hi);
        __lasx_xvst(lo, dst, 0);
        __lasx_xvst(hi, dst, 32);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = __lasx_xvld(src, 0),
                hi = __lasx_xvldi(0);
        convert8(&lo, &hi);
        __lasx_xvst(lo, dst, 0);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

// Scale a byte by another byte: x * y / 255 rounded, in 16-bit lanes.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = __lsx_vreplgr2vr_h(128);
    const __m128i _257 = __lsx_vreplgr2vr_h(257);
    return __lsx_vmuh_hu(__lsx_vadd_h(__lsx_vmul_h(x, y), _128), _257);
}
static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    auto premul8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = __lsx_vldi(0);
        __m128i planar = __lsx_vldi(0);
        if (kSwapRB) {
            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
        } else {
            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
        }

        // Swizzle the pixels to 8-bit planar, then unpack to 16-bit planar.
        *lo = __lsx_vshuf_b(zeros, *lo, planar);
        *hi = __lsx_vshuf_b(zeros, *hi, planar);
        __m128i rg = __lsx_vilvl_w(*hi, *lo),
                ba = __lsx_vilvh_w(*hi, *lo);

        __m128i r = __lsx_vilvl_b(zeros, rg),
                g = __lsx_vilvh_b(zeros, rg),
                b = __lsx_vilvl_b(zeros, ba),
                a = __lsx_vilvh_b(zeros, ba);

        // Premultiply.
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8));
        ba = __lsx_vor_v(b, __lsx_vslli_h(a, 8));
        *lo = __lsx_vilvl_h(ba, rg);
        *hi = __lsx_vilvh_h(ba, rg);
    };
    while (count >= 8) {
        __m128i lo = __lsx_vld(src, 0),
                hi = __lsx_vld(src, 16);
        premul8(&lo, &hi);
        __lsx_vst(lo, dst, 0);
        __lsx_vst(hi, dst, 16);
        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = __lsx_vld(src, 0),
                hi = __lsx_vldi(0);
        premul8(&lo, &hi);
        __lsx_vst(lo, dst, 0);
        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
    // RGBA_to_BGRA (LSX): swap R and B within each pixel, 4 pixels per vector.
    __m128i swapRB = __lsx_vldi(0);
    swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0704050603000102, 0);
    swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0f0c0d0e0b08090a, 1);

    while (count >= 4) {
        __m128i rgba = __lsx_vld(src, 0);
        __m128i bgra = __lsx_vshuf4i_b(rgba, 0xC6);
        __lsx_vst(bgra, dst, 0);
        src += 4;
        dst += 4;
        count -= 4;
    }
    // ...
    // grayA_to_RGBA (LSX): expand 8 gray/alpha pairs per iteration.
    while (count >= 8) {
        __m128i ga = __lsx_vld(src, 0);

        __m128i gg = __lsx_vor_v(__lsx_vand_v(ga, __lsx_vreplgr2vr_h(0x00FF)),
                                 __lsx_vslli_h(ga, 8));

        __m128i ggga_lo = __lsx_vilvl_h(ga, gg);
        __m128i ggga_hi = __lsx_vilvh_h(ga, gg);

        __lsx_vst(ggga_lo, dst, 0);
        __lsx_vst(ggga_hi, dst, 16);

        src += 8*2;
        dst += 8;
        count -= 8;
    }
    // ...
    // grayA_to_rgbA (LSX): same expansion, premultiplying gray by alpha first.
    while (count >= 8) {
        __m128i grayA = __lsx_vld(src, 0);

        __m128i g0 = __lsx_vand_v(grayA, __lsx_vreplgr2vr_h(0x00FF));
        __m128i a0 = __lsx_vsrli_h(grayA, 8);

        g0 = scale(g0, a0);

        __m128i gg = __lsx_vor_v(g0, __lsx_vslli_h(g0, 8));
        __m128i ga = __lsx_vor_v(g0, __lsx_vslli_h(a0, 8));

        __m128i ggga_lo = __lsx_vilvl_h(ga, gg);
        __m128i ggga_hi = __lsx_vilvh_h(ga, gg);

        __lsx_vst(ggga_lo, dst, 0);
        __lsx_vst(ggga_hi, dst, 16);

        src += 8*2;
        dst += 8;
        count -= 8;
    }
    // ...
    // inverted_cmyk_to (LSX): `format` selects RGB1 vs BGR1.
    auto convert8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = __lsx_vldi(0);
        __m128i planar = __lsx_vldi(0);
        if (kBGR1 == format) {
            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
        } else {
            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
        }

        // Swizzle to 8-bit planar, then unpack to 16-bit planar.
        *lo = __lsx_vshuf_b(zeros, *lo, planar);
        *hi = __lsx_vshuf_b(zeros, *hi, planar);
        __m128i cm = __lsx_vilvl_w(*hi, *lo),
                yk = __lsx_vilvh_w(*hi, *lo);

        __m128i c = __lsx_vilvl_b(zeros, cm),
                m = __lsx_vilvh_b(zeros, cm),
                y = __lsx_vilvl_b(zeros, yk),
                k = __lsx_vilvh_b(zeros, yk);

        // Scale by K; alpha becomes opaque.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        __m128i rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)),
                ba = __lsx_vor_v(b, __lsx_vreplgr2vr_h(0xff00));
        *lo = __lsx_vilvl_h(ba, rg);
        *hi = __lsx_vilvh_h(ba, rg);
    };

    while (count >= 8) {
        __m128i lo = __lsx_vld(src, 0),
                hi = __lsx_vld(src, 16);
        convert8(&lo, &hi);
        __lsx_vst(lo, dst, 0);
        __lsx_vst(hi, dst, 16);
        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = __lsx_vld(src, 0),
                hi = __lsx_vldi(0);
        convert8(&lo, &hi);
        __lsx_vst(lo, dst, 0);
        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    // gray_to_RGB1 (NEON): replicate each gray byte into R, G and B with opaque alpha.
    while (count >= 16) {
        uint8x16_t gray = vld1q_u8(src);
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }
    if (count >= 8) {
        uint8x8_t gray = vld1_u8(src);
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }
    gray_to_RGB1_portable(dst, src, count);
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    // gray_to_RGB1 (AVX2): expand 32 gray bytes per iteration.
    const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF);
    while (count >= 32) {
        __m256i grays = _mm256_loadu_si256((const __m256i*) src);

        __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays);
        __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays);
        __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas);
        __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas);

        __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo);
        __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo);
        __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi);
        __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi);

        // Fix the 128-bit lane ordering introduced by the unpacks.
        __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20),
                ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20),
                ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31),
                ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31);

        _mm256_storeu_si256((__m256i*) (dst +  0), ggga0_shuffle);
        _mm256_storeu_si256((__m256i*) (dst +  8), ggga1_shuffle);
        _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle);
        _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle);

        src += 32;
        dst += 32;
        count -= 32;
    }
    gray_to_RGB1_portable(dst, src, count);
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // gray_to_RGB1 (SSSE3): expand 16 gray bytes per iteration.
    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }
    gray_to_RGB1_portable(dst, src, count);
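    // The unpack dance, per gray byte G: unpack8(grays, grays) yields the 16-bit value
    // G|G<<8 and unpack8(grays, alphas) yields G|0xFF<<8; interleaving those 16-bit halves
    // produces the bytes G,G,G,0xFF in memory -- the same pixel gray_to_RGB1_portable writes.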
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    // gray_to_RGB1 (LASX): expand 32 gray bytes per iteration.
    const __m256i alphas = __lasx_xvreplgr2vr_b(0xFF);
    while (count >= 32) {
        __m256i grays = __lasx_xvld(src, 0);

        __m256i gg_lo = __lasx_xvilvl_b(grays, grays);
        __m256i gg_hi = __lasx_xvilvh_b(grays, grays);
        __m256i ga_lo = __lasx_xvilvl_b(alphas, grays);
        __m256i ga_hi = __lasx_xvilvh_b(alphas, grays);

        __m256i ggga0 = __lasx_xvilvl_h(ga_lo, gg_lo);
        __m256i ggga1 = __lasx_xvilvh_h(ga_lo, gg_lo);
        __m256i ggga2 = __lasx_xvilvl_h(ga_hi, gg_hi);
        __m256i ggga3 = __lasx_xvilvh_h(ga_hi, gg_hi);

        __m256i ggga_0 = __lasx_xvpermi_q(ggga0, ggga1, 0x02);
        __m256i ggga_1 = __lasx_xvpermi_q(ggga2, ggga3, 0x02);
        __m256i ggga_2 = __lasx_xvpermi_q(ggga0, ggga1, 0x13);
        __m256i ggga_3 = __lasx_xvpermi_q(ggga2, ggga3, 0x13);

        __lasx_xvst(ggga_0, dst, 0);
        __lasx_xvst(ggga_1, dst, 32);
        __lasx_xvst(ggga_2, dst, 64);
        __lasx_xvst(ggga_3, dst, 96);

        src += 32;
        dst += 32;
        count -= 32;
    }
    gray_to_RGB1_portable(dst, src, count);
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    // gray_to_RGB1 (LSX): expand 16 gray bytes per iteration.
    const __m128i alphas = __lsx_vreplgr2vr_b(0xFF);
    while (count >= 16) {
        __m128i grays = __lsx_vld(src, 0);

        __m128i gg_lo = __lsx_vilvl_b(grays, grays);
        __m128i gg_hi = __lsx_vilvh_b(grays, grays);
        __m128i ga_lo = __lsx_vilvl_b(alphas, grays);
        __m128i ga_hi = __lsx_vilvh_b(alphas, grays);

        __m128i ggga0 = __lsx_vilvl_h(ga_lo, gg_lo);
        __m128i ggga1 = __lsx_vilvh_h(ga_lo, gg_lo);
        __m128i ggga2 = __lsx_vilvl_h(ga_hi, gg_hi);
        __m128i ggga3 = __lsx_vilvh_h(ga_hi, gg_hi);

        __lsx_vst(ggga0, dst, 0);
        __lsx_vst(ggga1, dst, 16);
        __lsx_vst(ggga2, dst, 32);
        __lsx_vst(ggga3, dst, 48);

        src += 16;
        dst += 16;
        count -= 16;
    }
    gray_to_RGB1_portable(dst, src, count);
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b   << 16
               | (uint32_t)g   <<  8
               | (uint32_t)r   <<  0;
    }
}
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r   << 16
               | (uint32_t)g   <<  8
               | (uint32_t)b   <<  0;
    }
}
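// Reminder on layout: the destination is RGBA byte order, so on a little-endian machine the
// packed word reads 0xAABBGGRR. That is why RGB_to_RGB1 puts b at bit 16 and r at bit 0,
// and RGB_to_BGR1 simply swaps which of r and b takes each slot.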
#if defined(SK_ARM_HAS_NEON)
static void insert_alpha_should_swaprb(bool kSwapRB,
                                       uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 RGB pixels (3 planes) and emit them as RGBA/BGRA with opaque alpha.
        uint8x16x3_t rgb = vld3q_u8(src);
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);
        vst4q_u8((uint8_t*) dst, rgba);

        src += 16*3;
        dst += 16;
        count -= 16;
    }
    if (count >= 8) {
        uint8x8x3_t rgb = vld3_u8(src);
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);
        vst4_u8((uint8_t*) dst, rgba);

        src += 8*3;
        dst += 8;
        count -= 8;
    }
    // Portable code for the tail.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}
void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(false, dst, src, count);
}
void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(true, dst, src, count);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
static void insert_alpha_should_swaprb(bool kSwapRB,
                                       uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);

    __m128i expand;
    const uint8_t X = 0xFF;  // Placeholder for the alpha byte; overwritten by alphaMask.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    while (count >= 6) {
        // Load 16 bytes; only the first 4 pixels (12 bytes) are used this iteration,
        // so at least 6 pixels of input are needed to read safely.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and force the X bytes to 0xFF.
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }
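    // Illustration of the expand shuffle above: for source bytes r0 g0 b0 r1 g1 b1 r2 g2 b2
    // r3 g3 b3, the non-swapping mask emits r0 g0 b0 X r1 g1 b1 X ... and the OR with
    // alphaMask turns every X into 0xFF.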
    // Portable code for the remaining [0,6) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(false, dst, src, count);
}
void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(true, dst, src, count);
}
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
static void insert_alpha_should_swaprb(bool kSwapRB,
                                       uint32_t dst[], const uint8_t* src, int count) {
    const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xFF000000);

    __m256i expand = __lasx_xvldi(0);
    if (kSwapRB) {
        expand = __lasx_xvinsgr2vr_d(expand, 0x0503040502000102, 0);
        expand = __lasx_xvinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
        expand = __lasx_xvinsgr2vr_d(expand, 0x110f10110e0c0d0e, 2);
        expand = __lasx_xvinsgr2vr_d(expand, 0x1715161714121314, 3);
    } else {
        expand = __lasx_xvinsgr2vr_d(expand, 0x0505040302020100, 0);
        expand = __lasx_xvinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
        expand = __lasx_xvinsgr2vr_d(expand, 0x1111100f0e0e0d0c, 2);
        expand = __lasx_xvinsgr2vr_d(expand, 0x1717161514141312, 3);
    }

    while (count >= 8) {
        // Expand 8 RGB pixels (24 bytes) to 8 RGBA pixels with opaque alpha.
        __m256i rgb   = __lasx_xvld(src, 0);
        __m256i rgb_l = __lasx_xvpermi_d(rgb, 0x44);
        __m256i rgb_h = __lasx_xvpermi_d(rgb, 0xEE);

        __m256i rgba = __lasx_xvor_v(__lasx_xvshuf_b(rgb_h, rgb_l, expand), alphaMask);
        __lasx_xvst(rgba, dst, 0);

        src += 8*3;
        dst += 8;
        count -= 8;
    }
    // Portable code for the remaining pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(false, dst, src, count);
}
void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(true, dst, src, count);
}
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
static void insert_alpha_should_swaprb(bool kSwapRB,
                                       uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xFF000000);

    __m128i expand = __lsx_vldi(0);
    if (kSwapRB) {
        expand = __lsx_vinsgr2vr_d(expand, 0x0503040502000102, 0);
        expand = __lsx_vinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
    } else {
        expand = __lsx_vinsgr2vr_d(expand, 0x0505040302020100, 0);
        expand = __lsx_vinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
    }

    while (count >= 6) {
        // Load 16 bytes; only the first 4 pixels (12 bytes) are used this iteration.
        __m128i rgb = __lsx_vld(src, 0);

        // Expand the first four pixels to RGBX and force alpha to 0xFF.
        __m128i rgba = __lsx_vor_v(__lsx_vshuf_b(rgb, rgb, expand), alphaMask);
        __lsx_vst(rgba, dst, 0);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Portable code for the remaining [0,6) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}
void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(false, dst, src, count);
}
void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(true, dst, src, count);
}