#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <limits>
#include <type_traits>
#include <utility>

#if !defined(SKNX_NO_SIMD)
    #define SKVX_USE_SIMD 1
#else
    #define SKVX_USE_SIMD 0
#endif

#if SKVX_USE_SIMD
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
        #include <immintrin.h>
    #elif defined(SK_ARM_HAS_NEON)
        #include <arm_neon.h>
    #elif defined(__wasm_simd128__)
        #include <wasm_simd128.h>
    #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
        #include <lasxintrin.h>
        #include <lsxintrin.h>
    #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
        #include <lsxintrin.h>
    #endif
#endif

#if !defined(__clang__) && defined(_MSC_VER)
    #define SKVX_ALWAYS_INLINE __forceinline
#else
    #define SKVX_ALWAYS_INLINE __attribute__((always_inline))
#endif
#define SI    static inline
#define SIT   template <           typename T> SI
#define SIN   template <int N                > SI
#define SINT  template <int N,     typename T> SI
#define SINTU template <int N,     typename T, typename U, \
                        typename=std::enable_if_t<std::is_convertible<U,T>::value>> SI
template <int N, typename T>
struct alignas(N*sizeof(T)) Vec;

template <int... Ix, int N, typename T>
SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>&);
template <int N, typename T>
struct alignas(N*sizeof(T)) Vec {
    static_assert((N & (N-1)) == 0,        "N must be a power of 2.");
    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");

    // Vec is recursive: an N-lane vector is stored as two (N/2)-lane halves.
    Vec<N/2,T> lo, hi;

    SKVX_ALWAYS_INLINE Vec() = default;
    SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
        T vals[N] = {0};
        assert(xs.size() <= (size_t)N);
        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)N)*sizeof(T));

        this->lo = Vec<N/2,T>::Load(vals +   0);
        this->hi = Vec<N/2,T>::Load(vals + N/2);
    }

    SKVX_ALWAYS_INLINE T  operator[](int i) const { return i < N/2 ? this->lo[i] : this->hi[i-N/2]; }
    SKVX_ALWAYS_INLINE T& operator[](int i)       { return i < N/2 ? this->lo[i] : this->hi[i-N/2]; }

    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
        return sk_unaligned_load<Vec>(ptr);
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }
};
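
// Editorial usage sketch (not from the original header), assuming the operators defined later
// in this file:
//
//     float buf[4] = {1, 2, 3, 4};
//     skvx::Vec<4,float> v = skvx::Vec<4,float>::Load(buf);   // unaligned load of four floats
//     v = v + skvx::Vec<4,float>(0.5f);                       // the Vec(T s) ctor broadcasts 0.5f
//     v.store(buf);                                           // buf is now {1.5, 2.5, 3.5, 4.5}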
template <typename T>
struct alignas(4*sizeof(T)) Vec<4,T> {
    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");

    Vec<2,T> lo, hi;

    SKVX_ALWAYS_INLINE Vec() = default;
    SKVX_ALWAYS_INLINE Vec(T s)                      : lo(s),   hi(s)   {}
    SKVX_ALWAYS_INLINE Vec(T x, T y, T z, T w)       : lo(x,y), hi(z,w) {}
    SKVX_ALWAYS_INLINE Vec(Vec<2,T> xy, T z, T w)    : lo(xy),  hi(z,w) {}
    SKVX_ALWAYS_INLINE Vec(T x, T y, Vec<2,T> zw)    : lo(x,y), hi(zw)  {}
    SKVX_ALWAYS_INLINE Vec(Vec<2,T> xy, Vec<2,T> zw) : lo(xy),  hi(zw)  {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
        T vals[4] = {0};
        assert(xs.size() <= (size_t)4);
        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)4)*sizeof(T));

        this->lo = Vec<2,T>::Load(vals + 0);
        this->hi = Vec<2,T>::Load(vals + 2);
    }

    SKVX_ALWAYS_INLINE T  operator[](int i) const { return i < 2 ? this->lo[i] : this->hi[i-2]; }
    SKVX_ALWAYS_INLINE T& operator[](int i)       { return i < 2 ? this->lo[i] : this->hi[i-2]; }

    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) { return sk_unaligned_load<Vec>(ptr); }
    SKVX_ALWAYS_INLINE void store(void* ptr) const      { memcpy(ptr, this, sizeof(Vec)); }

    // The named accessors (x(), y(), z(), w(), xy(), zw()) and the yxwz()/zwxy()/xyxy() swizzles
    // declared by this specialization are omitted from this excerpt.
};
template <typename T>
struct alignas(2*sizeof(T)) Vec<2,T> {
    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");

    Vec<1,T> lo, hi;

    SKVX_ALWAYS_INLINE Vec() = default;
    SKVX_ALWAYS_INLINE Vec(T s)      : lo(s), hi(s) {}
    SKVX_ALWAYS_INLINE Vec(T x, T y) : lo(x), hi(y) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
        T vals[2] = {0};
        assert(xs.size() <= (size_t)2);
        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)2)*sizeof(T));

        this->lo = Vec<1,T>::Load(vals + 0);
        this->hi = Vec<1,T>::Load(vals + 1);
    }

    SKVX_ALWAYS_INLINE T  operator[](int i) const { return i < 1 ? this->lo[i] : this->hi[i-1]; }
    SKVX_ALWAYS_INLINE T& operator[](int i)       { return i < 1 ? this->lo[i] : this->hi[i-1]; }

    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) { return sk_unaligned_load<Vec>(ptr); }
    SKVX_ALWAYS_INLINE void store(void* ptr) const      { memcpy(ptr, this, sizeof(Vec)); }

    // The x()/y() accessors and the yx()/xyxy() swizzles are omitted from this excerpt.
};
template <typename T>
struct Vec<1,T> {
    T val = {};

    SKVX_ALWAYS_INLINE Vec() = default;
    SKVX_ALWAYS_INLINE Vec(T s) : val(s) {}
    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) : val(xs.size() ? *xs.begin() : T()) {
        assert(xs.size() <= (size_t)1);
    }

    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) { return sk_unaligned_load<Vec>(ptr); }
    SKVX_ALWAYS_INLINE void store(void* ptr) const      { memcpy(ptr, this, sizeof(Vec)); }
};
template <typename T> struct Mask         { using type = T;       };
template <>           struct Mask<float > { using type = int32_t; };
template <>           struct Mask<double> { using type = int64_t; };
template <typename T> using M = typename Mask<T>::type;
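
// Editorial note: M<T> is the integer lane type used for comparison masks, sized to match T
// (int32_t for float, int64_t for double). With the comparison operators defined below,
//
//     skvx::Vec<4,float>   a = {1, 2, 3, 4},
//                          b = {4, 3, 2, 1};
//     skvx::Vec<4,int32_t> m = (a < b);   // each lane is ~0 (all bits set) where true, 0 where false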
SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {
    Vec<2*N,T> v;
    v.lo = lo;
    v.hi = hi;
    return v;
}

#if SKVX_USE_SIMD && (defined(__clang__) || defined(__GNUC__))

    // Clang and GCC vector extensions do most of the work for us.
    #if defined(__clang__)
        template <int N, typename T>
        using VExt = T __attribute__((ext_vector_type(N)));

    #elif defined(__GNUC__)
        template <int N, typename T>
        struct VExtHelper {
            typedef T __attribute__((vector_size(N*sizeof(T)))) type;
        };

        template <int N, typename T>
        using VExt = typename VExtHelper<N,T>::type;

        // Some versions of GCC cannot deduce N in the generic to_vec<N,T>() below for N=4 and
        // T=float, so this overload helps.
        SI Vec<4,float> to_vec(VExt<4,float> v) { return sk_bit_cast<Vec<4,float>>(v); }
    #endif

    SINT VExt<N,T> to_vext(const Vec<N,T>& v) { return sk_bit_cast<VExt<N,T>>(v); }
    SINT Vec <N,T> to_vec(const VExt<N,T>& v) { return sk_bit_cast<Vec <N,T>>(v); }
    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) + to_vext(y));
    }
    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) - to_vext(y));
    }
    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) * to_vext(y));
    }
    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) / to_vext(y));
    }

    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) ^ to_vext(y));
    }
    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) & to_vext(y));
    }
    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) | to_vext(y));
    }

    SINT Vec<N,T> operator!(const Vec<N,T>& x) { return to_vec<N,T>(!to_vext(x)); }
    SINT Vec<N,T> operator-(const Vec<N,T>& x) { return to_vec<N,T>(-to_vext(x)); }
    SINT Vec<N,T> operator~(const Vec<N,T>& x) { return to_vec<N,T>(~to_vext(x)); }

    SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) << k); }
    SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) >> k); }
    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) == to_vext(y));
    }
    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) != to_vext(y));
    }
    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) <= to_vext(y));
    }
    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) >= to_vext(y));
    }
    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) <  to_vext(y));
    }
    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) >  to_vext(y));
    }
#else
    // Either SKNX_NO_SIMD is defined, or Clang/GCC vector extensions are not available.
    // Everything is implemented portably: N == 1 scalar cases plus recursion onto lo/hi halves.
    // (The scalar arithmetic, bitwise, shift, and unary operators on Vec<1,T>, which work
    // directly on .val, are omitted from this excerpt.)

    // N == 1 scalar comparisons return an all-ones or all-zeros mask.
    SIT Vec<1,M<T>> operator==(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val == y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator!=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val != y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator<=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val <= y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator>=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val >= y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator< (const Vec<1,T>& x, const Vec<1,T>& y) { return x.val <  y.val ? ~0 : 0; }
    SIT Vec<1,M<T>> operator> (const Vec<1,T>& x, const Vec<1,T>& y) { return x.val >  y.val ? ~0 : 0; }
    // All N != 1 implementations recurse onto the lo and hi halves.
    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo + y.lo, x.hi + y.hi); }
    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo - y.lo, x.hi - y.hi); }
    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo * y.lo, x.hi * y.hi); }
    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo / y.lo, x.hi / y.hi); }

    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo ^ y.lo, x.hi ^ y.hi); }
    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo & y.lo, x.hi & y.hi); }
    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo | y.lo, x.hi | y.hi); }

    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo == y.lo, x.hi == y.hi); }
    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo != y.lo, x.hi != y.hi); }
    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo <= y.lo, x.hi <= y.hi); }
    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo >= y.lo, x.hi >= y.hi); }
    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo <  y.lo, x.hi <  y.hi); }
    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo >  y.lo, x.hi >  y.hi); }
#endif

// The compound-assignment operators (+=, -=, *=, /=, ^=, &=, |=, <<=, >>=) and the mixed
// scalar/vector overloads that forward to the operators above are omitted from this excerpt.
SINT Vec<N,T> naive_if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
    return sk_bit_cast<Vec<N,T>>(( cond & sk_bit_cast<Vec<N, M<T>>>(t)) |
                                 (~cond & sk_bit_cast<Vec<N, M<T>>>(e)) );
}

SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<1,T>& e) {
    return sk_bit_cast<Vec<1,T>>(( cond & sk_bit_cast<Vec<1, M<T>>>(t)) |
                                 (~cond & sk_bit_cast<Vec<1, M<T>>>(e)) );
}
SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    if constexpr (N*sizeof(T) == 32) {
        return sk_bit_cast<Vec<N,T>>(_mm256_blendv_epi8(sk_bit_cast<__m256i>(e),
                                                        sk_bit_cast<__m256i>(t),
                                                        sk_bit_cast<__m256i>(cond)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    if constexpr (N*sizeof(T) == 16) {
        return sk_bit_cast<Vec<N,T>>(_mm_blendv_epi8(sk_bit_cast<__m128i>(e),
                                                     sk_bit_cast<__m128i>(t),
                                                     sk_bit_cast<__m128i>(cond)));
    }
#endif
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
    if constexpr (N*sizeof(T) == 16) {
        return sk_bit_cast<Vec<N,T>>(vbslq_u8(sk_bit_cast<uint8x16_t>(cond),
                                              sk_bit_cast<uint8x16_t>(t),
                                              sk_bit_cast<uint8x16_t>(e)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    if constexpr (N*sizeof(T) == 32) {
        return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
                                                       sk_bit_cast<__m256i>(t),
                                                       sk_bit_cast<__m256i>(cond)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N*sizeof(T) == 16) {
        return sk_bit_cast<Vec<N,T>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
                                                     sk_bit_cast<__m128i>(t),
                                                     sk_bit_cast<__m128i>(cond)));
    }
#endif
    // Recurse on large vectors to try to hit the specializations above.
    if constexpr (N*sizeof(T) > 16) {
        return join(if_then_else(cond.lo, t.lo, e.lo),
                    if_then_else(cond.hi, t.hi, e.hi));
    }
    // This default often generates better code than recursing down to scalars.
    return naive_if_then_else(cond, t, e);
}
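
// Editorial usage sketch (illustrative only): if_then_else() picks each lane from t or e based on
// the mask, e.g. a per-lane absolute value:
//
//     skvx::Vec<4,float> x   = {-1.f, 2.f, -3.f, 4.f};
//     skvx::Vec<4,float> abs = if_then_else(x < skvx::Vec<4,float>(0.f), -x, x);   // {1, 2, 3, 4}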
SIT  bool any(const Vec<1,T>& x) { return x.val != 0; }
SINT bool any(const Vec<N,T>& x) {
    // For any(), the _mm_testz intrinsics are correct and don't require comparing 'x' to 0 first.
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    if constexpr (N*sizeof(T) == 32) {
        return !_mm256_testz_si256(sk_bit_cast<__m256i>(x), _mm256_set1_epi32(-1));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    if constexpr (N*sizeof(T) == 16) {
        return !_mm_testz_si128(sk_bit_cast<__m128i>(x), _mm_set1_epi32(-1));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    if constexpr (N*sizeof(T) == 16) {
        // movemask checks only the MSB of each lane, which is fine if the lanes were set by a
        // comparison op, but skvx treats any non-zero value as true, so compare to 0 first.
        return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) != 0b0000;
    }
#endif
#if SKVX_USE_SIMD && defined(__aarch64__)
    // On 64-bit ARM, take the max across lanes, which will be non-zero if any lane is true.
    if constexpr (N*sizeof(T) == 8 ) { return vmaxv_u8 (sk_bit_cast<uint8x8_t> (x)) > 0; }
    if constexpr (N*sizeof(T) == 16) { return vmaxvq_u8(sk_bit_cast<uint8x16_t>(x)) > 0; }
#endif
#if SKVX_USE_SIMD && defined(__wasm_simd128__)
    if constexpr (N == 4 && sizeof(T) == 4) {
        return wasm_i32x4_any_true(sk_bit_cast<VExt<4,int>>(x));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    if constexpr (N*sizeof(T) == 32) {
        v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
                                                              sk_bit_cast<__m256i>(x)));
        return (retv[0] | retv[4]) != 0b0000;
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N*sizeof(T) == 16) {
        v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
                                                          sk_bit_cast<__m128i>(x)));
        return retv[0] != 0b0000;
    }
#endif
    return any(x.lo) || any(x.hi);
}
SIT  bool all(const Vec<1,T>& x) { return x.val != 0; }
SINT bool all(const Vec<N,T>& x) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    // Unlike any(), the _mm_testc intrinsics don't let us skip the comparison to 0 and stay
    // correct, so just use the plain SSE version.
    if constexpr (N == 4 && sizeof(T) == 4) {
        return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) == 0b1111;
    }
#endif
#if SKVX_USE_SIMD && defined(__aarch64__)
    // On 64-bit ARM, take the min across lanes, which will be non-zero if all lanes are true.
    if constexpr (sizeof(T)==1 && N==8)  { return vminv_u8  (sk_bit_cast<uint8x8_t> (x)) > 0; }
    if constexpr (sizeof(T)==1 && N==16) { return vminvq_u8 (sk_bit_cast<uint8x16_t>(x)) > 0; }
    if constexpr (sizeof(T)==2 && N==4)  { return vminv_u16 (sk_bit_cast<uint16x4_t>(x)) > 0; }
    if constexpr (sizeof(T)==2 && N==8)  { return vminvq_u16(sk_bit_cast<uint16x8_t>(x)) > 0; }
    if constexpr (sizeof(T)==4 && N==2)  { return vminv_u32 (sk_bit_cast<uint32x2_t>(x)) > 0; }
    if constexpr (sizeof(T)==4 && N==4)  { return vminvq_u32(sk_bit_cast<uint32x4_t>(x)) > 0; }
#endif
#if SKVX_USE_SIMD && defined(__wasm_simd128__)
    if constexpr (N == 4 && sizeof(T) == 4) {
        return wasm_i32x4_all_true(sk_bit_cast<VExt<4,int>>(x));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    if constexpr (N == 8 && sizeof(T) == 4) {
        v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
                                                              sk_bit_cast<__m256i>(x)));
        return (retv[0] & retv[4]) == 0b1111;
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N == 4 && sizeof(T) == 4) {
        v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
                                                          sk_bit_cast<__m128i>(x)));
        return retv[0] == 0b1111;
    }
#endif
    return all(x.lo) && all(x.hi);
}
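
// Editorial usage sketch (illustrative only): any()/all() reduce a mask to a single bool:
//
//     skvx::Vec<4,int> v = {0, 1, 2, 3};
//     any(v > skvx::Vec<4,int>(0));   // true  -- three lanes are greater than zero
//     all(v > skvx::Vec<4,int>(0));   // false -- lane 0 is not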
template <typename D, typename S>
SI Vec<1,D> cast(const Vec<1,S>& src) { return (D)src.val; }

template <typename D, int N, typename S>
SI Vec<N,D> cast(const Vec<N,S>& src) {
#if SKVX_USE_SIMD && defined(__clang__)
    return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));
#else
    return join(cast<D>(src.lo), cast<D>(src.hi));
#endif
}
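
// Editorial usage sketch (illustrative only): cast<>() converts lane by lane with ordinary C
// conversion rules:
//
//     skvx::Vec<4,float> f = {1.25f, 2.5f, -3.75f, 4.f};
//     skvx::Vec<4,int>   i = skvx::cast<int>(f);     // {1, 2, -3, 4}, truncated toward zero
//     skvx::Vec<4,float> g = skvx::cast<float>(i);   // {1.f, 2.f, -3.f, 4.f}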
// shuffle() reorders (and may repeat or drop) lanes of a vector.
template <int... Ix, int N, typename T>
SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {
#if SKVX_USE_SIMD && defined(__clang__)
    return to_vec<sizeof...(Ix),T>(__builtin_shufflevector(to_vext(x), to_vext(x), Ix...));
#else
    return { x[Ix]... };
#endif
}
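
// Editorial usage sketch (illustrative only): the shuffle indices select source lanes, and the
// result width is the number of indices given:
//
//     skvx::Vec<4,float> rgba = {R, G, B, A};
//     shuffle<2,1,0,3>(rgba);           // ~> {B, G, R, A}
//     shuffle<3,3,3,3>(rgba);           // ~> {A, A, A, A}
//     shuffle<0,1,2,3,0,1,2,3>(rgba);   // ~> {R, G, B, A, R, G, B, A}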
// map(fn, x) returns a vector with fn() applied to each lane of x: { fn(x[0]), fn(x[1]), ... };
// map(fn, x, y) applies fn lane-wise across both arguments, and so on.
template <typename Fn, typename... Args, size_t... I>
SI auto map(std::index_sequence<I...>,
            Fn&& fn, const Args&... args) -> skvx::Vec<sizeof...(I), decltype(fn(args[0]...))> {
    auto lane = [&](size_t i)
#if defined(__clang__)
    // CFI (-fsanitize=cfi-icall) reports a false positive on this indirect call even though fn's
    // type is fully inferred, so stifle it here.
    __attribute__((no_sanitize("cfi")))
#endif
    { return fn(args[static_cast<int>(i)]...); };

    return { lane(I)... };
}

template <typename Fn, int N, typename T, typename... Rest>
auto map(Fn&& fn, const Vec<N,T>& first, const Rest&... rest) {
    // Derive a {0, 1, 2, ..., N-1} index sequence from the first argument: N lanes in, N lanes out.
    return map(std::make_index_sequence<N>{}, fn, first,rest...);
}
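
// Editorial usage sketch (illustrative only): map() lifts a scalar function over lanes, deducing
// the result lane type from the function's return type:
//
//     skvx::Vec<4,float> xs    = {1, 4, 9, 16};
//     auto               roots = map(sqrtf, xs);                                          // {1, 2, 3, 4}
//     auto               sums  = map([](float a, float b) { return a + b; }, xs, roots);  // {2, 6, 12, 20}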
SIN Vec<N,float> fma(const Vec<N,float>& x, const Vec<N,float>& y, const Vec<N,float>& z) {
    // fma() delegates to fmaf() lane by lane via map().
    auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
    return map(fn, x,y,z);
}

SI Vec<1,int> lrint(const Vec<1,float>& x) {
    return (int)lrintf(x.val);
}
SIN Vec<N,int> lrint(const Vec<N,float>& x) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
    if constexpr (N == 8) {
        return sk_bit_cast<Vec<N,int>>(_mm256_cvtps_epi32(sk_bit_cast<__m256>(x)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    if constexpr (N == 4) {
        return sk_bit_cast<Vec<N,int>>(_mm_cvtps_epi32(sk_bit_cast<__m128>(x)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    if constexpr (N == 8) {
        return sk_bit_cast<Vec<N,int>>(__lasx_xvftint_w_s(sk_bit_cast<__m256>(x)));
    }
#endif
#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N == 4) {
        return sk_bit_cast<Vec<N,int>>(__lsx_vftint_w_s(sk_bit_cast<__m128>(x)));
    }
#endif
    return join(lrint(x.lo), lrint(x.hi));
}
SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {
    assert(all(x == x));  // No NaNs should reach this function.

    // Intrinsics for float -> half tend to operate on four lanes, so split/join around them.
    if constexpr (N > 4) {
        return join(to_half(x.lo),
                    to_half(x.hi));
    }

#if SKVX_USE_SIMD && defined(__aarch64__)
    if constexpr (N == 4) {
        return sk_bit_cast<Vec<N,uint16_t>>(vcvt_f16_f32(sk_bit_cast<float32x4_t>(x)));
    }
#endif

#define I(x) sk_bit_cast<Vec<N,int32_t>>(x)
#define F(x) sk_bit_cast<Vec<N,float>>(x)
    // Float is 1-8-23 (sign-exponent-mantissa); half is 1-5-10.
    Vec<N,int32_t> sem = I(x),
                     s = sem & 0x8000'0000,
                    em = min(sem ^ s, 0x4780'0000),                   // |x| clamped to a finite half
                 magic = I(max(F(em) * 8192.f, 0.5f)) & (255 << 23),  // rounding magic
               rounded = I((F(em) + F(magic))),                       // round the mantissa
                   exp = ((magic >> 13) - ((127-15+13+1)<<10)),       // shift and re-bias the exponent
                   f16 = rounded + exp;
    return cast<uint16_t>((s>>16) | f16);
#undef I
#undef F
}
SIN Vec<N,float> from_half(const Vec<N,uint16_t>& x) {
    if constexpr (N > 4) {
        return join(from_half(x.lo),
                    from_half(x.hi));
    }

#if SKVX_USE_SIMD && defined(__aarch64__)
    if constexpr (N == 4) {
        return sk_bit_cast<Vec<N,float>>(vcvt_f32_f16(sk_bit_cast<float16x4_t>(x)));
    }
#endif

    // Half is 1-5-10 (sign-exponent-mantissa); expand it into a 1-8-23 float.
    Vec<N,int32_t> wide = cast<int32_t>(x),
                      s = wide & 0x8000,
                     em = wide ^ s,
             inf_or_nan = (em >= (31 << 10)) & (255 << 23),   // fill the float exponent for inf/NaN
                is_norm =  em > 0x3ff,
                    sub = sk_bit_cast<Vec<N,int32_t>>(cast<float>(em) * (1.f/(1<<24))),  // subnormals
                   norm = ((em<<13) + ((127-15)<<23)),         // shift mantissa, re-bias exponent
                 finite = (is_norm & norm) | (~is_norm & sub);
    return sk_bit_cast<Vec<N,float>>((s<<16) | finite | inf_or_nan);
}
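
// Editorial round-trip sketch (illustrative only); halves are 1-5-10 (sign-exponent-mantissa):
//
//     skvx::Vec<4,float>    f = {0.5f, 1.0f, 2.0f, 65504.0f};   // 65504 is the largest finite half
//     skvx::Vec<4,uint16_t> h = to_half(f);                     // {0x3800, 0x3C00, 0x4000, 0x7BFF}
//     skvx::Vec<4,float>    g = from_half(h);                   // recovers f exactly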
// div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.
SIN Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
    return cast<uint8_t>( (x+127)/255 );
}

// approx_scale(x,y) approximates (x*y+127)/255 within a maximum error of 1.
SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {
    // (x*y+x)/256 meets that bound and is cheaper to compute.
    auto X = cast<uint16_t>(x),
         Y = cast<uint16_t>(y);
    return cast<uint8_t>( (X*Y+X)/256 );
}
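
// Editorial worked example: div255(x) == (x+127)/255 rounds x/255 to the nearest integer, while
// approx_scale(x,y) == (x*y + x)/256 stays within one of the exactly rounded (x*y + 127)/255:
//
//     div255(200*255)        == 200   // scaling by 255/255 is the identity
//     approx_scale(200, 255) == 200   // (200*255 + 200)/256 == 200, exact here
//     approx_scale(1, 128)   == 0     // exact rounding gives 1, so this one is off by one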
// saturated_add(x,y) sums values and clamps to the maximum instead of overflowing.
SINT std::enable_if_t<std::is_unsigned_v<T>, Vec<N,T>> saturated_add(const Vec<N,T>& x,
                                                                     const Vec<N,T>& y) {
#if SKVX_USE_SIMD && (SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 || defined(SK_ARM_HAS_NEON) || \
                      SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX)
    // Both SSE and NEON have 16-lane saturated adds, so use intrinsics for those and
    // recurse down or join up to take advantage of them.
    if constexpr (N == 16 && sizeof(T) == 1) {
        #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
        return sk_bit_cast<Vec<N,T>>(_mm_adds_epu8(sk_bit_cast<__m128i>(x),
                                                   sk_bit_cast<__m128i>(y)));
        #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
        return sk_bit_cast<Vec<N,T>>(__lsx_vsadd_bu(sk_bit_cast<__m128i>(x),
                                                    sk_bit_cast<__m128i>(y)));
        #else  // SK_ARM_HAS_NEON
        return sk_bit_cast<Vec<N,T>>(vqaddq_u8(sk_bit_cast<uint8x16_t>(x),
                                               sk_bit_cast<uint8x16_t>(y)));
        #endif
    } else if constexpr (N < 16 && sizeof(T) == 1) {
        return saturated_add(join(x,x), join(y,y)).lo;
    } else if constexpr (sizeof(T) == 1) {
        return join(saturated_add(x.lo, y.lo), saturated_add(x.hi, y.hi));
    }
#endif
    // Otherwise saturate manually.
    auto sum = x + y;
    return if_then_else(sum < x, Vec<N,T>(std::numeric_limits<T>::max()), sum);
}
// ScaledDividerU32 takes a divisor > 1 and produces divide(numerator) ~= numerator / divisor.
// Adding half() to the numerator first gives a result within +/- 1 of the properly rounded
// quotient for numerator + half() < 2^32.
class ScaledDividerU32 {
public:
    explicit ScaledDividerU32(uint32_t divisor)
            : fDivisorFactor{(uint32_t)(std::round((1.0 / divisor) * (1ull << 32)))}
            , fHalf{(divisor + 1) >> 1} {
        assert(divisor > 1);
    }

    Vec<4, uint32_t> divide(const Vec<4, uint32_t>& numerator) const {
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
        uint64x2_t hi = vmull_n_u32(vget_high_u32(to_vext(numerator)), fDivisorFactor);
        uint64x2_t lo = vmull_n_u32(vget_low_u32(to_vext(numerator)),  fDivisorFactor);

        return to_vec<4, uint32_t>(vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)));
#else
        return cast<uint32_t>((cast<uint64_t>(numerator) * fDivisorFactor) >> 32);
#endif
    }

    uint32_t half() const { return fHalf; }

private:
    const uint32_t fDivisorFactor;
    const uint32_t fHalf;
};
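
// Editorial usage sketch (illustrative only): divide by a constant via the precomputed 32.32
// reciprocal, adding half() to the numerator to get round-to-nearest within +/- 1:
//
//     ScaledDividerU32 div3(3);                            // fDivisorFactor ~= 2^32/3, half() == 2
//     skvx::Vec<4,uint32_t> n = {10, 11, 12, 13};
//     auto q = div3.divide(n + skvx::Vec<4,uint32_t>(div3.half()));   // {3, 4, 4, 4} ~= round(n/3)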
// mull() computes 8-bit x 8-bit -> 16-bit products without overflow.
SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,
                         const Vec<N,uint8_t>& y) {
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
    // NEON does eight u8*u8 -> u16 in one instruction (vmull_u8); split or join to reach N == 8.
    if constexpr (N == 8) {
        return to_vec<8,uint16_t>(vmull_u8(to_vext(x), to_vext(y)));
    } else if constexpr (N < 8) {
        return mull(join(x,x), join(y,y)).lo;
    } else { // N > 8
        return join(mull(x.lo, y.lo), mull(x.hi, y.hi));
    }
#else
    return cast<uint16_t>(x) * cast<uint16_t>(y);
#endif
}
// mull() also widens 16-bit x 16-bit -> 32-bit products.
SIN Vec<N,uint32_t> mull(const Vec<N,uint16_t>& x,
                         const Vec<N,uint16_t>& y) {
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
    // NEON does four u16*u16 -> u32 in one instruction (vmull_u16).
    if constexpr (N == 4) {
        return to_vec<4,uint32_t>(vmull_u16(to_vext(x), to_vext(y)));
    } else if constexpr (N < 4) {
        return mull(join(x,x), join(y,y)).lo;
    } else { // N > 4
        return join(mull(x.lo, y.lo), mull(x.hi, y.hi));
    }
#else
    return cast<uint32_t>(x) * cast<uint32_t>(y);
#endif
}
// mulhi() keeps the high 16 bits of a 16-bit x 16-bit product.
SIN Vec<N,uint16_t> mulhi(const Vec<N,uint16_t>& x,
                          const Vec<N,uint16_t>& y) {
#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    // Use _mm_mulhi_epu16 for 8 x uint16_t, splitting or joining to get there.
    if constexpr (N == 8) {
        return sk_bit_cast<Vec<8,uint16_t>>(_mm_mulhi_epu16(sk_bit_cast<__m128i>(x),
                                                            sk_bit_cast<__m128i>(y)));
    } else if constexpr (N < 8) {
        return mulhi(join(x,x), join(y,y)).lo;
    } else { // N > 8
        return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
    }
#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    if constexpr (N == 8) {
        return sk_bit_cast<Vec<8,uint16_t>>(__lsx_vmuh_hu(sk_bit_cast<__m128i>(x),
                                                          sk_bit_cast<__m128i>(y)));
    } else if constexpr (N < 8) {
        return mulhi(join(x,x), join(y,y)).lo;
    } else { // N > 8
        return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
    }
#else
    return skvx::cast<uint16_t>(mull(x, y) >> 16);
#endif
}
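
// Editorial worked example: mull() widens before multiplying so the product cannot overflow,
// while mulhi() keeps only the high 16 bits of a 16x16-bit product:
//
//     mull (Vec<8,uint8_t> (200),    Vec<8,uint8_t> (200))      // every uint16_t lane == 40000
//     mulhi(Vec<8,uint16_t>(0x8000), Vec<8,uint16_t>(0x8000))   // every lane == (0x8000*0x8000) >> 16 == 0x4000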
SINT T dot(const Vec<N,T>& a, const Vec<N,T>& b) {
    // dot() is a horizontal operation like any()/all(), but no SIMD instruction clearly wins,
    // so just add the lanes. The constexpr cases unroll the sizes we actually use.
    auto ab = a*b;
    if constexpr (N == 2) {
        return ab[0] + ab[1];
    } else if constexpr (N == 4) {
        return ab[0] + ab[1] + ab[2] + ab[3];
    } else {
        T sum = ab[0];
        for (int i = 1; i < N; ++i) {
            sum += ab[i];
        }
        return sum;
    }
}

SIT T cross(const Vec<2,T>& a, const Vec<2,T>& b) {
    auto x = a * shuffle<1,0>(b);
    return x[0] - x[1];
}
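
// Editorial worked example:
//
//     dot  (Vec<2,float>{1,2}, Vec<2,float>{3,4}) == 1*3 + 2*4 == 11
//     cross(Vec<2,float>{1,2}, Vec<2,float>{3,4}) == 1*4 - 2*3 == -2   // a*shuffle<1,0>(b) is {4,6}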
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
#define IMPL_LOAD4_TRANSPOSED(N, T, VLD)                                     \
SI void strided_load4(const T* v,                                            \
                      Vec<N,T>& a, Vec<N,T>& b, Vec<N,T>& c, Vec<N,T>& d) {  \
    auto mat = VLD(v);                                                       \
    a = sk_bit_cast<Vec<N,T>>(mat.val[0]);                                   \
    b = sk_bit_cast<Vec<N,T>>(mat.val[1]);                                   \
    c = sk_bit_cast<Vec<N,T>>(mat.val[2]);                                   \
    d = sk_bit_cast<Vec<N,T>>(mat.val[3]);                                   \
}
IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32)
IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16)
IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8)
IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32)
IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16)
IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8)
IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32)
IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32)
IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16)
IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8)
IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32)
IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16)
IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8)
IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32)
#undef IMPL_LOAD4_TRANSPOSED
#elif SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
SI void strided_load4(const float* v,
                      Vec<4,float>& a, Vec<4,float>& b, Vec<4,float>& c, Vec<4,float>& d) {
    __m128 a_ = _mm_loadu_ps(v);
    __m128 b_ = _mm_loadu_ps(v+4);
    __m128 c_ = _mm_loadu_ps(v+8);
    __m128 d_ = _mm_loadu_ps(v+12);
    _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
    a = sk_bit_cast<Vec<4,float>>(a_);
    b = sk_bit_cast<Vec<4,float>>(b_);
    c = sk_bit_cast<Vec<4,float>>(c_);
    d = sk_bit_cast<Vec<4,float>>(d_);
}
#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
#define _LSX_TRANSPOSE4(row0, row1, row2, row3)    \
do {                                               \
    __m128i __t0 = __lsx_vilvl_w (row1, row0);     \
    __m128i __t1 = __lsx_vilvl_w (row3, row2);     \
    __m128i __t2 = __lsx_vilvh_w (row1, row0);     \
    __m128i __t3 = __lsx_vilvh_w (row3, row2);     \
    (row0) = __lsx_vilvl_d (__t1, __t0);           \
    (row1) = __lsx_vilvh_d (__t1, __t0);           \
    (row2) = __lsx_vilvl_d (__t3, __t2);           \
    (row3) = __lsx_vilvh_d (__t3, __t2);           \
} while (0)

SI void strided_load4(const int* v,
                      Vec<4,int>& a, Vec<4,int>& b, Vec<4,int>& c, Vec<4,int>& d) {
    __m128i a_ = __lsx_vld(v, 0);
    __m128i b_ = __lsx_vld(v, 16);
    __m128i c_ = __lsx_vld(v, 32);
    __m128i d_ = __lsx_vld(v, 48);
    _LSX_TRANSPOSE4(a_, b_, c_, d_);
    a = sk_bit_cast<Vec<4,int>>(a_);
    b = sk_bit_cast<Vec<4,int>>(b_);
    c = sk_bit_cast<Vec<4,int>>(c_);
    d = sk_bit_cast<Vec<4,int>>(d_);
}
#endif
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
#define IMPL_LOAD2_TRANSPOSED(N, T, VLD)                       \
SI void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) {  \
    auto mat = VLD(v);                                         \
    a = sk_bit_cast<Vec<N,T>>(mat.val[0]);                     \
    b = sk_bit_cast<Vec<N,T>>(mat.val[1]);                     \
}
IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32)
IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16)
IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8)
IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32)
IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16)
IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8)
IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32)
IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32)
IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16)
IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8)
IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32)
IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16)
IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8)
IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32)
#undef IMPL_LOAD2_TRANSPOSED
#endif
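
// Editorial usage sketch (illustrative only): the strided loads de-interleave packed data, e.g.
// eight interleaved RGBA pixels (r0,g0,b0,a0, r1,g1,b1,a1, ...) into per-channel vectors:
//
//     const uint8_t* px = pixels;               // hypothetical pointer to 32 interleaved bytes
//     skvx::Vec<8,uint8_t> r, g, b, a;
//     strided_load4(px, r, g, b, a);            // r == {r0..r7}, g == {g0..g7}, and so on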
#undef SKVX_ALWAYS_INLINE