#include <initializer_list>
#if !defined(SKNX_NO_SIMD)
    #define SKVX_USE_SIMD 1
#else
    #define SKVX_USE_SIMD 0
#endif

#if SKVX_USE_SIMD
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
        #include <immintrin.h>
    #elif defined(SK_ARM_HAS_NEON)
        #include <arm_neon.h>
    #elif defined(__wasm_simd128__)
        #include <wasm_simd128.h>
    #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
        #include <lasxintrin.h>
        #include <lsxintrin.h>
    #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
        #include <lsxintrin.h>
    #endif
#endif
#if defined(_MSC_VER)
    #define SKVX_ALWAYS_INLINE __forceinline
#else
    #define SKVX_ALWAYS_INLINE __attribute__((always_inline))
#endif
#define SI           static inline
#define SIT   template <       typename T> SI
#define SIN   template <int N            > SI
#define SINT  template <int N, typename T> SI
#define SINTU template <int N, typename T, typename U, \
                        typename=std::enable_if_t<std::is_convertible<U,T>::value>> SI
namespace skvx {

template <int N, typename T>
struct alignas(N*sizeof(T)) Vec;

template <int... Ix, int N, typename T>
SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>&);
template <int N, typename T>
struct alignas(N*sizeof(T)) Vec {
    static_assert((N & (N-1)) == 0,        "N must be a power of 2.");
    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");

    SKVX_ALWAYS_INLINE Vec() = default;
    SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
        T vals[N] = {0};
        assert(xs.size() <= (size_t)N);
        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)N)*sizeof(T));

        this->lo = Vec<N/2,T>::Load(vals +   0);
        this->hi = Vec<N/2,T>::Load(vals + N/2);
    }

    static SKVX_ALWAYS_INLINE Vec Load(const void* ptr) {
        return sk_unaligned_load<Vec>(ptr);
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }

    Vec<N/2,T> lo, hi;
};
template <typename T>
struct alignas(4*sizeof(T)) Vec<4,T> {
    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");

    SKVX_ALWAYS_INLINE Vec() = default;
    SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}
    SKVX_ALWAYS_INLINE Vec(T x, T y, T z, T w) : lo(x,y), hi(z,w) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
        T vals[4] = {0};
        assert(xs.size() <= (size_t)4);
        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)4)*sizeof(T));

        this->lo = Vec<2,T>::Load(vals + 0);
        this->hi = Vec<2,T>::Load(vals + 2);
    }

    static SKVX_ALWAYS_INLINE Vec Load(const void* ptr) {
        return sk_unaligned_load<Vec>(ptr);
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }

    Vec<2,T> lo, hi;
};
template <typename T>
struct alignas(2*sizeof(T)) Vec<2,T> {
    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");

    SKVX_ALWAYS_INLINE Vec() = default;
    SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}
    SKVX_ALWAYS_INLINE Vec(T x, T y) : lo(x), hi(y) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
        T vals[2] = {0};
        assert(xs.size() <= (size_t)2);
        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)2)*sizeof(T));

        this->lo = Vec<1,T>::Load(vals + 0);
        this->hi = Vec<1,T>::Load(vals + 1);
    }

    static SKVX_ALWAYS_INLINE Vec Load(const void* ptr) {
        return sk_unaligned_load<Vec>(ptr);
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }

    Vec<1,T> lo, hi;
};
template <typename T>
struct Vec<1,T> {
    SKVX_ALWAYS_INLINE Vec() = default;
    SKVX_ALWAYS_INLINE Vec(T s) : val(s) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) : val(xs.size() ? *xs.begin() : 0) {
        assert(xs.size() <= (size_t)1);
    }

    static SKVX_ALWAYS_INLINE Vec Load(const void* ptr) {
        return sk_unaligned_load<Vec>(ptr);
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }

    T val;
};
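// Usage sketch (illustrative, using only the API above): Vec is a plain value type, e.g.
//
//     float buf[] = {1,2,3,4,5,6,7,8};
//     Vec<4,float> v = Vec<4,float>::Load(buf);   // {1,2,3,4}, no alignment required
//     Vec<4,float> s(2.0f);                       // splat: {2,2,2,2}
//     Vec<4,float> p = {1.0f};                    // braced init zero-fills: {1,0,0,0}
//     (v + s).store(buf);                         // buf now starts {3,4,5,6}
//
// Note that Vec{x} goes through the initializer_list constructor and produces
// {x,0,0,...}, while Vec(x) splats x across all lanes.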
template <typename T> struct Mask { using type = T; };
template <> struct Mask<float > { using type = int32_t; };
template <> struct Mask<double> { using type = int64_t; };
template <typename T> using M = typename Mask<T>::type;

// Join two Vec<N,T> into one Vec<2N,T>.
SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {
    Vec<2*N,T> v;
    v.lo = lo;
    v.hi = hi;
    return v;
}
#if SKVX_USE_SIMD && (defined(__clang__) || defined(__GNUC__))

    // We have SIMD: lower Vec operations onto Clang/GCC vector extensions.
    #if defined(__clang__)
        template <int N, typename T>
        using VExt = T __attribute__((ext_vector_type(N)));

    #elif defined(__GNUC__)
        template <int N, typename T>
        struct VExtHelper {
            typedef T __attribute__((vector_size(N*sizeof(T)))) type;
        };

        template <int N, typename T>
        using VExt = typename VExtHelper<N,T>::type;

        // Some versions of GCC cannot deduce N in the generic to_vec<N,T>() below
        // for N=4 and T=float.  This overload works around that.
        SI Vec<4,float> to_vec(VExt<4,float> v) { return sk_bit_cast<Vec<4,float>>(v); }
    #endif

    SINT VExt<N,T> to_vext(const Vec<N,T>& v) { return sk_bit_cast<VExt<N,T>>(v); }
    SINT Vec <N,T> to_vec(const VExt<N,T>& v) { return sk_bit_cast<Vec <N,T>>(v); }
    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) + to_vext(y));
    }
    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) - to_vext(y));
    }
    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) * to_vext(y));
    }
    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) / to_vext(y));
    }

    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) ^ to_vext(y));
    }
    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) & to_vext(y));
    }
    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) | to_vext(y));
    }
    SINT Vec<N,T> operator!(const Vec<N,T>& x) { return to_vec<N,T>(!to_vext(x)); }
    SINT Vec<N,T> operator-(const Vec<N,T>& x) { return to_vec<N,T>(-to_vext(x)); }
    SINT Vec<N,T> operator~(const Vec<N,T>& x) { return to_vec<N,T>(~to_vext(x)); }

    SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) << k); }
    SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) >> k); }
    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) == to_vext(y));
    }
    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) != to_vext(y));
    }
    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) <= to_vext(y));
    }
    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) >= to_vext(y));
    }
    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) < to_vext(y));
    }
    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) > to_vext(y));
    }
#else

    // Either SKNX_NO_SIMD is defined, or we have no vector extensions: implement
    // Vec<1,T> directly, and everything larger recursively on lo/hi halves.

    SIT Vec<1,T> operator+(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val + y.val; }
    SIT Vec<1,T> operator-(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val - y.val; }
    SIT Vec<1,T> operator*(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val * y.val; }
    SIT Vec<1,T> operator/(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val / y.val; }

    SIT Vec<1,T> operator^(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val ^ y.val; }
    SIT Vec<1,T> operator&(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val & y.val; }
    SIT Vec<1,T> operator|(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val | y.val; }

    SIT Vec<1,M<T>> operator==(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val == y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator!=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val != y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator<=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val <= y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator>=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val >= y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator< (const Vec<1,T>& x, const Vec<1,T>& y) { return x.val <  y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator> (const Vec<1,T>& x, const Vec<1,T>& y) { return x.val >  y.val ? ~0 : 0; }
    // All default N != 1 implementations just recurse on lo and hi halves.
    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo + y.lo, x.hi + y.hi); }
    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo - y.lo, x.hi - y.hi); }
    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo * y.lo, x.hi * y.hi); }
    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo / y.lo, x.hi / y.hi); }

    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo ^ y.lo, x.hi ^ y.hi); }
    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo & y.lo, x.hi & y.hi); }
    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo | y.lo, x.hi | y.hi); }

    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo == y.lo, x.hi == y.hi); }
    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo != y.lo, x.hi != y.hi); }
    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo <= y.lo, x.hi <= y.hi); }
    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo >= y.lo, x.hi >= y.hi); }
    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo <  y.lo, x.hi <  y.hi); }
    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo >  y.lo, x.hi >  y.hi); }

#endif
SINT Vec<N,T> naive_if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
    return sk_bit_cast<Vec<N,T>>(( cond & sk_bit_cast<Vec<N, M<T>>>(t)) |
                                 (~cond & sk_bit_cast<Vec<N, M<T>>>(e)) );
}

SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<1,T>& e) {
    return sk_bit_cast<Vec<1,T>>(( cond & sk_bit_cast<Vec<1, M<T>>>(t)) |
                                 (~cond & sk_bit_cast<Vec<1, M<T>>>(e)) );
}
SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    if constexpr (N*sizeof(T) == 32) {
        return sk_bit_cast<Vec<N,T>>(_mm256_blendv_epi8(sk_bit_cast<__m256i>(e),
                                                        sk_bit_cast<__m256i>(t),
                                                        sk_bit_cast<__m256i>(cond)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    if constexpr (N*sizeof(T) == 16) {
        return sk_bit_cast<Vec<N,T>>(_mm_blendv_epi8(sk_bit_cast<__m128i>(e),
                                                     sk_bit_cast<__m128i>(t),
                                                     sk_bit_cast<__m128i>(cond)));
    }
#endif
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
    if constexpr (N*sizeof(T) == 16) {
        return sk_bit_cast<Vec<N,T>>(vbslq_u8(sk_bit_cast<uint8x16_t>(cond),
                                              sk_bit_cast<uint8x16_t>(t),
                                              sk_bit_cast<uint8x16_t>(e)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    if constexpr (N*sizeof(T) == 32) {
        return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
                                                       sk_bit_cast<__m256i>(t),
                                                       sk_bit_cast<__m256i>(cond)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N*sizeof(T) == 16) {
        return sk_bit_cast<Vec<N,T>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
                                                     sk_bit_cast<__m128i>(t),
                                                     sk_bit_cast<__m128i>(cond)));
    }
#endif
    // Recurse for large vectors to try to hit the specializations above.
    if constexpr (N*sizeof(T) > 16) {
        return join(if_then_else(cond.lo, t.lo, e.lo),
                    if_then_else(cond.hi, t.hi, e.hi));
    }
    // This default can lead to better code than recursing onto scalars.
    return naive_if_then_else(cond, t, e);
}
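// How the pieces compose, e.g.: comparisons produce all-bits masks typed via M<T>,
// and if_then_else() blends with them lane-wise:
//
//     Vec<4,float> x = {1,2,3,4},
//                  y = {4,3,2,1};
//     Vec<4,int32_t> m = (x < y);               // {~0,~0, 0, 0}
//     Vec<4,float> lo = if_then_else(m, x, y);  // {1,2,2,1}, a lane-wise min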
SIT  bool any(const Vec<1,T>& x) { return x.val != 0; }
SINT bool any(const Vec<N,T>& x) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    if constexpr (N*sizeof(T) == 32) {
        return !_mm256_testz_si256(sk_bit_cast<__m256i>(x), _mm256_set1_epi32(-1));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    if constexpr (N*sizeof(T) == 16) {
        return !_mm_testz_si128(sk_bit_cast<__m128i>(x), _mm_set1_epi32(-1));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    if constexpr (N*sizeof(T) == 16) {
        // Compare each lane against zero and collect one bit per lane.
        return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) != 0b0000;
    }
#endif
#if SKVX_USE_SIMD && defined(__aarch64__)
    // On 64-bit ARM, take the horizontal max across lanes.
    if constexpr (N*sizeof(T) == 8 ) { return vmaxv_u8 (sk_bit_cast<uint8x8_t> (x)) > 0; }
    if constexpr (N*sizeof(T) == 16) { return vmaxvq_u8(sk_bit_cast<uint8x16_t>(x)) > 0; }
#endif
#if SKVX_USE_SIMD && defined(__wasm_simd128__)
    if constexpr (N == 4 && sizeof(T) == 4) {
        return wasm_i32x4_any_true(sk_bit_cast<VExt<4,int>>(x));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    if constexpr (N*sizeof(T) == 32) {
        v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
                                                              sk_bit_cast<__m256i>(x)));
        return (retv[0] | retv[4]) != 0b0000;
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N*sizeof(T) == 16) {
        v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
                                                          sk_bit_cast<__m128i>(x)));
        return retv[0] != 0b0000;
    }
#endif
    return any(x.lo) || any(x.hi);
}
SIT  bool all(const Vec<1,T>& x) { return x.val != 0; }
SINT bool all(const Vec<N,T>& x) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    // Unlike any(), we can't avoid the comparison to 0 here, so use plain SSE.
    if constexpr (N == 4 && sizeof(T) == 4) {
        return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) == 0b1111;
    }
#endif
#if SKVX_USE_SIMD && defined(__aarch64__)
    // On 64-bit ARM, take the horizontal min across lanes.
    if constexpr (sizeof(T)==1 && N==8)  { return vminv_u8  (sk_bit_cast<uint8x8_t> (x)) > 0; }
    if constexpr (sizeof(T)==1 && N==16) { return vminvq_u8 (sk_bit_cast<uint8x16_t>(x)) > 0; }
    if constexpr (sizeof(T)==2 && N==4)  { return vminv_u16 (sk_bit_cast<uint16x4_t>(x)) > 0; }
    if constexpr (sizeof(T)==2 && N==8)  { return vminvq_u16(sk_bit_cast<uint16x8_t>(x)) > 0; }
    if constexpr (sizeof(T)==4 && N==2)  { return vminv_u32 (sk_bit_cast<uint32x2_t>(x)) > 0; }
    if constexpr (sizeof(T)==4 && N==4)  { return vminvq_u32(sk_bit_cast<uint32x4_t>(x)) > 0; }
#endif
#if SKVX_USE_SIMD && defined(__wasm_simd128__)
    if constexpr (N == 4 && sizeof(T) == 4) {
        return wasm_i32x4_all_true(sk_bit_cast<VExt<4,int>>(x));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    if constexpr (N == 8 && sizeof(T) == 4) {
        v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
                                                              sk_bit_cast<__m256i>(x)));
        return (retv[0] & retv[4]) == 0b1111;
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N == 4 && sizeof(T) == 4) {
        v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
                                                          sk_bit_cast<__m128i>(x)));
        return retv[0] == 0b1111;
    }
#endif
    return all(x.lo) && all(x.hi);
}
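// any() asks whether at least one lane is true; all() asks whether every lane is, e.g.
//
//     Vec<4,int> v = {0,1,2,3};
//     any(v > 2);    // true: lane 3 passes
//     all(v >= 0);   // true: every lane passes
//     all(v > 0);    // false: lane 0 fails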
template <typename D, typename S>
SI Vec<1,D> cast(const Vec<1,S>& src) { return (D)src.val; }

template <typename D, int N, typename S>
SI Vec<N,D> cast(const Vec<N,S>& src) {
#if SKVX_USE_SIMD && defined(__clang__)
    return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));
#else
    return join(cast<D>(src.lo), cast<D>(src.hi));
#endif
}
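// cast<D>() converts lane types while preserving lane count, truncating like a
// C-style cast in each lane, e.g.
//
//     Vec<4,float> f = {0.5f, 1.5f, 2.5f, 3.5f};
//     Vec<4,int>   i = cast<int>(f);   // {0,1,2,3}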
template <int... Ix, int N, typename T>
SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {
#if SKVX_USE_SIMD && defined(__clang__)
    return to_vec<sizeof...(Ix),T>(__builtin_shufflevector(to_vext(x), to_vext(x), Ix...));
#else
    return { x[Ix]... };
#endif
}
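// shuffle's template arguments index lanes of the source vector, and the result
// width is the number of indices given, so it can permute, narrow, or widen, e.g.
//
//     Vec<4,float> v = {1,2,3,4};
//     shuffle<2,1,0,3>        (v);   // {3,2,1,4}
//     shuffle<3,1>            (v);   // Vec<2,float> {4,2}
//     shuffle<0,0,1,1,2,2,3,3>(v);   // Vec<8,float> {1,1,2,2,3,3,4,4}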
template <typename Fn, typename... Args, size_t... I>
SI auto map(std::index_sequence<I...>,
            Fn&& fn, const Args&... args) -> skvx::Vec<sizeof...(I), decltype(fn(args[0]...))> {
    auto lane = [&](size_t i)
#if defined(__clang__)
    // -fsanitize=cfi-icall can flag a false positive on this indirect call,
    // so suppress that check here.
    __attribute__((no_sanitize("cfi")))
#endif
    { return fn(args[static_cast<int>(i)]...); };

    return { lane(I)... };
}
template <typename Fn, int N, typename T, typename... Rest>
auto map(Fn&& fn, const Vec<N,T>& first, const Rest&... rest) {
    // Derive an {0...N-1} index_sequence from the size of the first arg: N lanes in, N lanes out.
    return map(std::make_index_sequence<N>{}, fn, first,rest...);
}
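// map() lifts a scalar function to a lane-wise vector function, deducing the result
// width from the first vector argument, e.g.
//
//     Vec<4,float> x = {1,4,9,16};
//     map(sqrtf, x);   // Vec<4,float> {1,2,3,4}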
SIN Vec<N,float> fma(const Vec<N,float>& x, const Vec<N,float>& y, const Vec<N,float>& z) {
    // Delegate to the platform's fmaf() lane-wise via map().
    auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
    return map(fn, x,y,z);
}
SI Vec<1,int> lrint(const Vec<1,float>& x) {
    return (int)lrintf(x.val);
}
SIN Vec<N,int> lrint(const Vec<N,float>& x) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
    if constexpr (N == 8) {
        return sk_bit_cast<Vec<N,int>>(_mm256_cvtps_epi32(sk_bit_cast<__m256>(x)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    if constexpr (N == 4) {
        return sk_bit_cast<Vec<N,int>>(_mm_cvtps_epi32(sk_bit_cast<__m128>(x)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    if constexpr (N == 8) {
        return sk_bit_cast<Vec<N,int>>(__lasx_xvftint_w_s(sk_bit_cast<__m256>(x)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N == 4) {
        return sk_bit_cast<Vec<N,int>>(__lsx_vftint_w_s(sk_bit_cast<__m128>(x)));
    }
#endif
    return join(lrint(x.lo), lrint(x.hi));
}
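// lrint() rounds float lanes to the nearest integer, ties to even, matching both
// lrintf() and the cvtps intrinsics under the default rounding mode, e.g.
//
//     lrint(Vec<4,float>{0.5f, 1.5f, 2.4f, 2.6f});   // {0, 2, 2, 3}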
SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {
    if constexpr (N > 4) {
        return join(to_half(x.lo), to_half(x.hi));
    }

#if SKVX_USE_SIMD && defined(__aarch64__)
    if constexpr (N == 4) {
        return sk_bit_cast<Vec<N,uint16_t>>(vcvt_f16_f32(sk_bit_cast<float32x4_t>(x)));
    }
#endif

#define I(x) sk_bit_cast<Vec<N,int32_t>>(x)
#define F(x) sk_bit_cast<Vec<N,float>>(x)
    Vec<N,int32_t> sem = I(x),
                   s   = sem & 0x8000'0000,
                   em  = min(sem ^ s, 0x4780'0000),   // |x| clamped to f16 infinity.
                   // F(em)*8192 increases the exponent by 13, which when added back to em
                   // shifts the mantissa bits 13 to the right, rounding to nearest even.
                   magic   = I(max(F(em) * 8192.f, 0.5f)) & (255 << 23),
                   rounded = I((F(em) + F(magic))),
                   // Drop the magic and rebias the exponent from f32 to f16.
                   exp = ((magic >> 13) - ((127-15+13+1)<<10)),
                   f16 = rounded + exp;
    return cast<uint16_t>((s>>16) | f16);
#undef I
#undef F
}
SIN Vec<N,float> from_half(const Vec<N,uint16_t>& x) {
    if constexpr (N > 4) {
        return join(from_half(x.lo), from_half(x.hi));
    }

#if SKVX_USE_SIMD && defined(__aarch64__)
    if constexpr (N == 4) {
        return sk_bit_cast<Vec<N,float>>(vcvt_f32_f16(sk_bit_cast<float16x4_t>(x)));
    }
#endif

    Vec<N,int32_t> wide = cast<int32_t>(x),
                   s    = wide & 0x8000,
                   em   = wide ^ s,
                   inf_or_nan = (em >= (31 << 10)) & (255 << 23),  // Promote to the f32 exponent.
                   is_norm    = em > 0x3ff,
                   // Subnormal f16s are em * 2^-24; compute that in float and take its bits.
                   sub  = sk_bit_cast<Vec<N,int32_t>>((cast<float>(em) * (1.f/(1<<24)))),
                   norm = ((em<<13) + ((127-15)<<23)),             // Shift mantissa, rebias exponent.
                   finite = (is_norm & norm) | (~is_norm & sub);

    return sk_bit_cast<Vec<N,float>>((s<<16) | finite | inf_or_nan);
}
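// Round trip: values exactly representable in f16 survive to_half()/from_half()
// unchanged, e.g.
//
//     Vec<4,float>    f = {0.25f, 1.0f, 2.0f, 65504.0f};  // 65504 is the max finite f16
//     Vec<4,uint16_t> h = to_half(f);                     // 1.0f encodes as 0x3C00
//     from_half(h);                                       // recovers f exactly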
SIN Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
    return cast<uint8_t>( (x+127)/255 );
}
// Approximates (x*y+127)/255.
SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {
    auto X = cast<uint16_t>(x),
         Y = cast<uint16_t>(y);
    return cast<uint8_t>( (X*Y+X)/256 );
}
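// Worked through at the extremes: approx_scale() treats bytes as fractions of 255,
// and (x*y+x)/256 stays within one of the exact (x*y+127)/255, e.g.
//     x=255, y=255: (255*255 + 255)/256 = 65280/256 = 255
//     x=128, y=255: (128*255 + 128)/256 = 32768/256 = 128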
SINT std::enable_if_t<std::is_unsigned_v<T>, Vec<N,T>> saturated_add(const Vec<N,T>& x,
                                                                     const Vec<N,T>& y) {
#if SKVX_USE_SIMD && (SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 || defined(SK_ARM_HAS_NEON) || \
                      SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX)
    // SSE, NEON, and LSX all have 16-lane byte saturated adds, so use those
    // directly and recurse down or join up to reach them.
    if constexpr (N == 16 && sizeof(T) == 1) {
        #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
        return sk_bit_cast<Vec<N,T>>(_mm_adds_epu8(sk_bit_cast<__m128i>(x),
                                                   sk_bit_cast<__m128i>(y)));
        #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
        return sk_bit_cast<Vec<N,T>>(__lsx_vsadd_bu(sk_bit_cast<__m128i>(x),
                                                    sk_bit_cast<__m128i>(y)));
        #else  // SK_ARM_HAS_NEON
        return sk_bit_cast<Vec<N,T>>(vqaddq_u8(sk_bit_cast<uint8x16_t>(x),
                                               sk_bit_cast<uint8x16_t>(y)));
        #endif
    } else if constexpr (N < 16 && sizeof(T) == 1) {
        return saturated_add(join(x,x), join(y,y)).lo;
    } else if constexpr (sizeof(T) == 1) {
        return join(saturated_add(x.lo, y.lo), saturated_add(x.hi, y.hi));
    }
#endif
    // Otherwise saturate manually.
    auto sum = x + y;
    return if_then_else(sum < x, Vec<N,T>(std::numeric_limits<T>::max()), sum);
}
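// saturated_add() clamps at the top of the type's range instead of wrapping, e.g.
// for uint8_t lanes:
//
//     Vec<4,uint8_t> a = {200, 100, 255, 0},
//                    b = {100, 100,   1, 0};
//     saturated_add(a, b);   // {255, 200, 255, 0}: 300 and 256 both clamp to 255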
// ScaledDividerU32 turns repeated division by a fixed uint32_t divisor (which must be
// greater than 1) into a multiply and a shift. For a correctly rounded quotient, add
// half() to the numerator before calling divide().
class ScaledDividerU32 {
public:
    explicit ScaledDividerU32(uint32_t divisor)
            : fDivisorFactor{(uint32_t)(std::round((1.0 / divisor) * (1ull << 32)))}
            , fHalf{(divisor + 1) >> 1} {}

    Vec<4, uint32_t> divide(const Vec<4, uint32_t>& numerator) const {
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
        uint64x2_t hi = vmull_n_u32(vget_high_u32(to_vext(numerator)), fDivisorFactor);
        uint64x2_t lo = vmull_n_u32(vget_low_u32(to_vext(numerator)),  fDivisorFactor);

        return to_vec<4, uint32_t>(vcombine_u32(vshrn_n_u64(lo, 32), vshrn_n_u64(hi, 32)));
#else
        return cast<uint32_t>((cast<uint64_t>(numerator) * fDivisorFactor) >> 32);
#endif
    }

    uint32_t half() const { return fHalf; }

private:
    const uint32_t fDivisorFactor;
    const uint32_t fHalf;
};
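// Usage sketch: dividing four lanes by a constant 3 costs one widening multiply
// and a shift per call instead of four integer divisions:
//
//     ScaledDividerU32 div3(3);
//     Vec<4,uint32_t> q = div3.divide(n + div3.half());   // rounded n/3 in each lane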
SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
    // NEON can do eight u8*u8 -> u16 in one instruction, vmull_u8.
    if constexpr (N == 8) {
        return to_vec<8,uint16_t>(vmull_u8(to_vext(x), to_vext(y)));
    } else if constexpr (N < 8) {
        return mull(join(x,x), join(y,y)).lo;
    } else { // N > 8
        return join(mull(x.lo, y.lo), mull(x.hi, y.hi));
    }
#else
    // Without NEON, cast up to 16-bit and multiply.
    return cast<uint16_t>(x) * cast<uint16_t>(y);
#endif
}
SIN Vec<N,uint32_t> mull(const Vec<N,uint16_t>& x, const Vec<N,uint16_t>& y) {
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
    // NEON can do four u16*u16 -> u32 in one instruction, vmull_u16.
    if constexpr (N == 4) {
        return to_vec<4,uint32_t>(vmull_u16(to_vext(x), to_vext(y)));
    } else if constexpr (N < 4) {
        return mull(join(x,x), join(y,y)).lo;
    } else { // N > 4
        return join(mull(x.lo, y.lo), mull(x.hi, y.hi));
    }
#else
    // Without NEON, cast up to 32-bit and multiply.
    return cast<uint32_t>(x) * cast<uint32_t>(y);
#endif
}
SIN Vec<N,uint16_t> mulhi(const Vec<N,uint16_t>& x, const Vec<N,uint16_t>& y) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    // Use _mm_mulhi_epu16 for 8 x uint16_t, and join or split to get there.
    if constexpr (N == 8) {
        return sk_bit_cast<Vec<8,uint16_t>>(_mm_mulhi_epu16(sk_bit_cast<__m128i>(x),
                                                            sk_bit_cast<__m128i>(y)));
    } else if constexpr (N < 8) {
        return mulhi(join(x,x), join(y,y)).lo;
    } else { // N > 8
        return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
    }
#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N == 8) {
        return sk_bit_cast<Vec<8,uint16_t>>(__lsx_vmuh_hu(sk_bit_cast<__m128i>(x),
                                                          sk_bit_cast<__m128i>(y)));
    } else if constexpr (N < 8) {
        return mulhi(join(x,x), join(y,y)).lo;
    } else { // N > 8
        return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
    }
#else
    return skvx::cast<uint16_t>(mull(x, y) >> 16);
#endif
}
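// mull() keeps the full double-width product; mulhi() keeps only its top 16 bits, e.g.
//
//     mulhi(Vec<4,uint16_t>(0x8000), Vec<4,uint16_t>(0x8000));
//     // each lane: (32768 * 32768) >> 16 == 0x4000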
SINT T dot(const Vec<N,T>& a, const Vec<N,T>& b) {
    // While dot is a "horizontal" operation like any() or all(), there aren't really
    // any SIMD instructions that make it faster. The constexpr cases remove the for
    // loop in the only cases we realistically call.
    auto ab = a*b;
    if constexpr (N == 2) {
        return ab[0] + ab[1];
    } else if constexpr (N == 4) {
        return ab[0] + ab[1] + ab[2] + ab[3];
    } else {
        T sum = ab[0];
        for (int i = 1; i < N; ++i) {
            sum += ab[i];
        }
        return sum;
    }
}
SIT T cross(const Vec<2,T>& a, const Vec<2,T>& b) {
    auto x = a * shuffle<1,0>(b);
    return x[0] - x[1];
}
SIN float length(const Vec<N,float>& v) {
    return std::sqrt(dot(v, v));
}

SIN double length(const Vec<N,double>& v) {
    return std::sqrt(dot(v, v));
}
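// The geometry helpers at work, e.g.
//
//     Vec<2,float> a = {3,4},
//                  b = {1,0};
//     dot(a, a);     // 25
//     length(a);     // 5
//     cross(a, b);   // 3*0 - 4*1 == -4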
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
#define IMPL_LOAD4_TRANSPOSED(N, T, VLD)                                    \
SI void strided_load4(const T* v,                                           \
                      Vec<N,T>& a, Vec<N,T>& b, Vec<N,T>& c, Vec<N,T>& d) { \
    auto mat = VLD(v);                                                      \
    a = sk_bit_cast<Vec<N,T>>(mat.val[0]);                                  \
    b = sk_bit_cast<Vec<N,T>>(mat.val[1]);                                  \
    c = sk_bit_cast<Vec<N,T>>(mat.val[2]);                                  \
    d = sk_bit_cast<Vec<N,T>>(mat.val[3]);                                  \
}
IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32)
IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16)
IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8)
IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32)
IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16)
IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8)
IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32)
IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32)
IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16)
IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8)
IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32)
IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16)
IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8)
IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32)
#undef IMPL_LOAD4_TRANSPOSED
#elif SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
SI void strided_load4(const float* v,
                      Vec<4,float>& a, Vec<4,float>& b, Vec<4,float>& c, Vec<4,float>& d) {
    __m128 a_ = _mm_loadu_ps(v);
    __m128 b_ = _mm_loadu_ps(v+4);
    __m128 c_ = _mm_loadu_ps(v+8);
    __m128 d_ = _mm_loadu_ps(v+12);
    _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
    a = sk_bit_cast<Vec<4,float>>(a_);
    b = sk_bit_cast<Vec<4,float>>(b_);
    c = sk_bit_cast<Vec<4,float>>(c_);
    d = sk_bit_cast<Vec<4,float>>(d_);
}
#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
#define _LSX_TRANSPOSE4(row0, row1, row2, row3)  \
do {                                             \
    __m128i __t0 = __lsx_vilvl_w (row1, row0);   \
    __m128i __t1 = __lsx_vilvl_w (row3, row2);   \
    __m128i __t2 = __lsx_vilvh_w (row1, row0);   \
    __m128i __t3 = __lsx_vilvh_w (row3, row2);   \
    (row0) = __lsx_vilvl_d (__t1, __t0);         \
    (row1) = __lsx_vilvh_d (__t1, __t0);         \
    (row2) = __lsx_vilvl_d (__t3, __t2);         \
    (row3) = __lsx_vilvh_d (__t3, __t2);         \
} while (0)

SI void strided_load4(const int* v,
                      Vec<4,int>& a, Vec<4,int>& b, Vec<4,int>& c, Vec<4,int>& d) {
    __m128i a_ = __lsx_vld(v, 0);
    __m128i b_ = __lsx_vld(v, 16);
    __m128i c_ = __lsx_vld(v, 32);
    __m128i d_ = __lsx_vld(v, 48);
    _LSX_TRANSPOSE4(a_, b_, c_, d_);
    a = sk_bit_cast<Vec<4,int>>(a_);
    b = sk_bit_cast<Vec<4,int>>(b_);
    c = sk_bit_cast<Vec<4,int>>(c_);
    d = sk_bit_cast<Vec<4,int>>(d_);
}
#endif
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
#define IMPL_LOAD2_TRANSPOSED(N, T, VLD)                      \
SI void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) { \
    auto mat = VLD(v);                                        \
    a = sk_bit_cast<Vec<N,T>>(mat.val[0]);                    \
    b = sk_bit_cast<Vec<N,T>>(mat.val[1]);                    \
}
IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32)
IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16)
IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8)
IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32)
IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16)
IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8)
IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32)
IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32)
IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16)
IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8)
IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32)
IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16)
IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8)
IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32)
#undef IMPL_LOAD2_TRANSPOSED
#endif
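// strided_load2()/strided_load4() de-interleave array-of-structs data into
// structure-of-arrays vectors in one step, e.g. splitting interleaved RGBA bytes:
//
//     const uint8_t* px = ...;        // r0 g0 b0 a0 r1 g1 b1 a1 ...
//     Vec<8,uint8_t> r, g, b, a;
//     strided_load4(px, r, g, b, a);  // r = {r0,...,r7}, g = {g0,...,g7}, etc.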
}  // namespace skvx

#undef SKVX_ALWAYS_INLINE