72 {
73
// Forward declaration: a Vec is aligned to the size of the whole vector,
// i.e. N lanes of T.
template <int N, typename T>
struct alignas(N*sizeof(T)) Vec;
76
77template <
int... Ix,
int N,
typename T>
78SI Vec<
sizeof...(Ix),
T>
shuffle(
const Vec<N,T>&);
79
80
81template <int N, typename T>
82struct alignas(N*sizeof(T)) Vec {
83 static_assert((
N & (
N-1)) == 0,
"N must be a power of 2.");
84 static_assert(
sizeof(
T) >=
alignof(
T),
"What kind of unusual T is this?");
85
86
87
88
89
90
93
94
95
98 assert(xs.size() <= (
size_t)
N);
99 memcpy(vals, xs.begin(), std::min(xs.size(), (
size_t)
N)*
sizeof(
T));
100
101 this->lo = Vec<
N/2,
T>::Load(vals + 0);
102 this->hi = Vec<
N/2,
T>::Load(vals +
N/2);
103 }
104
107
109 return sk_unaligned_load<Vec>(ptr);
110 }
112
113 memcpy(ptr, this, sizeof(Vec));
114 }
115
117};
118
119
120
121template <typename T>
122struct alignas(4*sizeof(T)) Vec<4,
T> {
123 static_assert(
sizeof(
T) >=
alignof(
T),
"What kind of unusual T is this?");
124
131
134 assert(xs.size() <= (size_t)4);
135 memcpy(vals, xs.begin(), std::min(xs.size(), (
size_t)4)*
sizeof(
T));
136
137 this->lo = Vec<2,T>::Load(vals + 0);
138 this->hi = Vec<2,T>::Load(vals + 2);
139 }
140
143
145 return sk_unaligned_load<Vec>(ptr);
146 }
148 memcpy(ptr, this, sizeof(Vec));
149 }
150
157
164
165
168
169 Vec<2,T> lo, hi;
170};
171
172template <typename T>
173struct alignas(2*sizeof(T)) Vec<2,
T> {
174 static_assert(
sizeof(
T) >=
alignof(
T),
"What kind of unusual T is this?");
175
179
182 assert(xs.size() <= (size_t)2);
183 memcpy(vals, xs.begin(), std::min(xs.size(), (
size_t)2)*
sizeof(
T));
184
185 this->lo = Vec<1,T>::Load(vals + 0);
186 this->hi = Vec<1,T>::Load(vals + 1);
187 }
188
191
193 return sk_unaligned_load<Vec>(ptr);
194 }
196 memcpy(ptr, this, sizeof(Vec));
197 }
198
201
204
205
208
209 Vec<1,T> lo, hi;
210};
211
212template <typename T>
215
218
220 assert(xs.size() <= (size_t)1);
221 }
222
225
227 return sk_unaligned_load<Vec>(ptr);
228 }
230 memcpy(ptr, this, sizeof(Vec));
231 }
232};
233
234
// Mask<T>::type is the integer type of the same width as T, used as the lane
// type of comparison results (lanes are all-1s for true, all-0s for false).
template <typename T> struct Mask { using type = T; };
template <> struct Mask<float > { using type = int32_t; };
template <> struct Mask<double> { using type = int64_t; };

// Shorthand: M<T> is the mask lane type matching T.
template <typename T> using M = typename Mask<T>::type;
239
240
241SINT Vec<2*N,T>
join(
const Vec<N,T>& lo,
const Vec<N,T>& hi) {
242 Vec<2*N,T> v;
243 v.lo = lo;
244 v.hi = hi;
245 return v;
246}
247
248
249
250
251
252
253
254
255#if SKVX_USE_SIMD && (defined(__clang__) || defined(__GNUC__))
256
257
258 #if defined(__clang__)
259 template <int N, typename T>
261
262 #elif defined(__GNUC__)
263 template <int N, typename T>
264 struct VExtHelper {
266 };
267
268 template <int N, typename T>
269 using VExt = typename VExtHelper<N,T>::type;
270
271
272
273 SI Vec<4,float> to_vec(VExt<4,float> v) {
return sk_bit_cast<Vec<4,float>>(v); }
274 #endif
275
276 SINT VExt<N,T> to_vext(
const Vec<N,T>& v) {
return sk_bit_cast<VExt<N,T>>(v); }
277 SINT Vec <N,T> to_vec(
const VExt<N,T>& v) {
return sk_bit_cast<Vec <N,T>>(v); }
278
280 return to_vec<N,T>(to_vext(
x) + to_vext(
y));
281 }
283 return to_vec<N,T>(to_vext(
x) - to_vext(
y));
284 }
286 return to_vec<N,T>(to_vext(
x) * to_vext(
y));
287 }
289 return to_vec<N,T>(to_vext(
x) / to_vext(
y));
290 }
291
293 return to_vec<N,T>(to_vext(
x) ^ to_vext(
y));
294 }
296 return to_vec<N,T>(to_vext(
x) & to_vext(
y));
297 }
299 return to_vec<N,T>(to_vext(
x) | to_vext(
y));
300 }
301
302 SINT Vec<N,T>
operator!(
const Vec<N,T>&
x) {
return to_vec<N,T>(!to_vext(
x)); }
303 SINT Vec<N,T>
operator-(
const Vec<N,T>&
x) {
return to_vec<N,T>(-to_vext(
x)); }
304 SINT Vec<N,T>
operator~(
const Vec<N,T>&
x) {
return to_vec<N,T>(~to_vext(
x)); }
305
306 SINT Vec<N,T>
operator<<(
const Vec<N,T>&
x,
int k) {
return to_vec<N,T>(to_vext(
x) << k); }
307 SINT Vec<N,T>
operator>>(
const Vec<N,T>&
x,
int k) {
return to_vec<N,T>(to_vext(
x) >> k); }
308
310 return sk_bit_cast<Vec<N,M<T>>>(to_vext(
x) == to_vext(
y));
311 }
313 return sk_bit_cast<Vec<N,M<T>>>(to_vext(
x) != to_vext(
y));
314 }
316 return sk_bit_cast<Vec<N,M<T>>>(to_vext(
x) <= to_vext(
y));
317 }
319 return sk_bit_cast<Vec<N,M<T>>>(to_vext(
x) >= to_vext(
y));
320 }
322 return sk_bit_cast<Vec<N,M<T>>>(to_vext(
x) < to_vext(
y));
323 }
324 SINT Vec<N,M<T>> operator> (
const Vec<N,T>&
x,
const Vec<N,T>&
y) {
325 return sk_bit_cast<Vec<N,M<T>>>(to_vext(
x) > to_vext(
y));
326 }
327
328#else
329
330
331
332
333
334 SIT Vec<1,T>
operator+(
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
return x.val +
y.val; }
335 SIT Vec<1,T>
operator-(
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
return x.val -
y.val; }
336 SIT Vec<1,T>
operator*(
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
return x.val *
y.val; }
337 SIT Vec<1,T>
operator/(
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
return x.val /
y.val; }
338
339 SIT Vec<1,T>
operator^(
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
return x.val ^
y.val; }
340 SIT Vec<1,T>
operator&(
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
return x.val &
y.val; }
341 SIT Vec<1,T>
operator|(
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
return x.val |
y.val; }
342
346
347 SIT Vec<1,T>
operator<<(
const Vec<1,T>&
x,
int k) {
return x.val << k; }
349
351 return x.val ==
y.val ? ~0 : 0;
352 }
354 return x.val !=
y.val ? ~0 : 0;
355 }
357 return x.val <=
y.val ? ~0 : 0;
358 }
360 return x.val >=
y.val ? ~0 : 0;
361 }
362 SIT Vec<1,M<T>>
operator< (
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
363 return x.val <
y.val ? ~0 : 0;
364 }
365 SIT Vec<1,M<T>> operator> (
const Vec<1,T>&
x,
const Vec<1,T>&
y) {
366 return x.val >
y.val ? ~0 : 0;
367 }
368
369
371 return join(
x.lo +
y.lo,
x.hi +
y.hi);
372 }
374 return join(
x.lo -
y.lo,
x.hi -
y.hi);
375 }
377 return join(
x.lo *
y.lo,
x.hi *
y.hi);
378 }
380 return join(
x.lo /
y.lo,
x.hi /
y.hi);
381 }
382
384 return join(
x.lo ^
y.lo,
x.hi ^
y.hi);
385 }
387 return join(
x.lo &
y.lo,
x.hi &
y.hi);
388 }
390 return join(
x.lo |
y.lo,
x.hi |
y.hi);
391 }
392
396
399
401 return join(
x.lo ==
y.lo,
x.hi ==
y.hi);
402 }
404 return join(
x.lo !=
y.lo,
x.hi !=
y.hi);
405 }
407 return join(
x.lo <=
y.lo,
x.hi <=
y.hi);
408 }
410 return join(
x.lo >=
y.lo,
x.hi >=
y.hi);
411 }
413 return join(
x.lo <
y.lo,
x.hi <
y.hi);
414 }
415 SINT Vec<N,M<T>> operator> (
const Vec<N,T>&
x,
const Vec<N,T>&
y) {
416 return join(
x.lo >
y.lo,
x.hi >
y.hi);
417 }
418#endif
419
420
421SINTU Vec<N,T> operator+ (U
x,
const Vec<N,T>&
y) {
return Vec<N,T>(
x) +
y; }
422SINTU Vec<N,T> operator- (U
x,
const Vec<N,T>&
y) {
return Vec<N,T>(
x) -
y; }
424SINTU Vec<N,T> operator/ (U
x,
const Vec<N,T>&
y) {
return Vec<N,T>(
x) /
y; }
433SINTU Vec<N,M<T>> operator> (U
x,
const Vec<N,T>&
y) {
return Vec<N,T>(
x) >
y; }
434
435SINTU Vec<N,T> operator+ (
const Vec<N,T>&
x, U
y) {
return x + Vec<N,T>(
y); }
436SINTU Vec<N,T> operator- (
const Vec<N,T>&
x, U
y) {
return x - Vec<N,T>(
y); }
438SINTU Vec<N,T> operator/ (
const Vec<N,T>&
x, U
y) {
return x / Vec<N,T>(
y); }
447SINTU Vec<N,M<T>> operator> (
const Vec<N,T>&
x, U
y) {
return x > Vec<N,T>(
y); }
448
456
464
467
468
469
470
471
472
474 return sk_bit_cast<Vec<N,T>>(( cond & sk_bit_cast<Vec<N, M<T>>>(t)) |
475 (~cond & sk_bit_cast<Vec<N, M<T>>>(
e)) );
476}
477
478SIT Vec<1,T>
if_then_else(
const Vec<1,M<T>>& cond,
const Vec<1,T>& t,
const Vec<1,T>& e) {
479
480 return sk_bit_cast<Vec<1,T>>(( cond & sk_bit_cast<Vec<1, M<T>>>(t)) |
481 (~cond & sk_bit_cast<Vec<1, M<T>>>(
e)) );
482}
483SINT Vec<N,T>
if_then_else(
const Vec<
N,M<T>>& cond,
const Vec<N,T>& t,
const Vec<N,T>& e) {
484
485#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
486 if constexpr (
N*
sizeof(
T) == 32) {
487 return sk_bit_cast<Vec<N,T>>(_mm256_blendv_epi8(sk_bit_cast<__m256i>(e),
488 sk_bit_cast<__m256i>(t),
489 sk_bit_cast<__m256i>(cond)));
490 }
491#endif
492#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
493 if constexpr (
N*
sizeof(
T) == 16) {
494 return sk_bit_cast<Vec<N,T>>(_mm_blendv_epi8(sk_bit_cast<__m128i>(e),
495 sk_bit_cast<__m128i>(t),
496 sk_bit_cast<__m128i>(cond)));
497 }
498#endif
499#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
500 if constexpr (
N*
sizeof(
T) == 16) {
501 return sk_bit_cast<Vec<N,T>>(vbslq_u8(sk_bit_cast<uint8x16_t>(cond),
502 sk_bit_cast<uint8x16_t>(t),
503 sk_bit_cast<uint8x16_t>(e)));
504 }
505#endif
506#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
507 if constexpr (
N*
sizeof(
T) == 32) {
508 return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
509 sk_bit_cast<__m256i>(t),
510 sk_bit_cast<__m256i>(cond)));
511 }
512#endif
513#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
514 if constexpr (
N*
sizeof(
T) == 16) {
515 return sk_bit_cast<Vec<N,T>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
516 sk_bit_cast<__m128i>(t),
517 sk_bit_cast<__m128i>(cond)));
518 }
519#endif
520
521 if constexpr (
N*
sizeof(
T) > 16) {
524 }
525
527}
528
529SIT bool any(
const Vec<1,T>&
x) {
return x.val != 0; }
531
532
533#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
534 if constexpr (
N*
sizeof(
T) == 32) {
535 return !_mm256_testz_si256(sk_bit_cast<__m256i>(
x), _mm256_set1_epi32(-1));
536 }
537#endif
538#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
539 if constexpr (
N*
sizeof(
T) == 16) {
540 return !_mm_testz_si128(sk_bit_cast<__m128i>(
x), _mm_set1_epi32(-1));
541 }
542#endif
543#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
544 if constexpr (
N*
sizeof(
T) == 16) {
545
546
547
548 return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(
x), _mm_set1_ps(0))) != 0b0000;
549 }
550#endif
551#if SKVX_USE_SIMD && defined(__aarch64__)
552
553
554
555 if constexpr (
N*
sizeof(
T) == 8 ) {
return vmaxv_u8 (sk_bit_cast<uint8x8_t> (
x)) > 0; }
556 if constexpr (
N*
sizeof(
T) == 16) {
return vmaxvq_u8(sk_bit_cast<uint8x16_t>(
x)) > 0; }
557#endif
558#if SKVX_USE_SIMD && defined(__wasm_simd128__)
559 if constexpr (
N == 4 &&
sizeof(
T) == 4) {
560 return wasm_i32x4_any_true(
sk_bit_cast<VExt<4,int>>(
x));
561 }
562#endif
563#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
564 if constexpr (
N*
sizeof(
T) == 32) {
565 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
566 sk_bit_cast<__m256i>(
x)));
567 return (retv[0] | retv[4]) != 0b0000;
568 }
569#endif
570#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
571 if constexpr (
N*
sizeof(
T) == 16) {
572 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
573 sk_bit_cast<__m128i>(
x)));
574 return retv[0] != 0b0000;
575 }
576#endif
579}
580
581SIT bool all(
const Vec<1,T>&
x) {
return x.val != 0; }
583
584
585#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
586
587
588 if constexpr (
N == 4 &&
sizeof(
T) == 4) {
589 return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(
x), _mm_set1_ps(0))) == 0b1111;
590 }
591#endif
592#if SKVX_USE_SIMD && defined(__aarch64__)
593
594 if constexpr (
sizeof(
T)==1 &&
N==8) {
return vminv_u8 (sk_bit_cast<uint8x8_t> (
x)) > 0;}
595 if constexpr (
sizeof(
T)==1 &&
N==16) {
return vminvq_u8 (sk_bit_cast<uint8x16_t>(
x)) > 0;}
596 if constexpr (
sizeof(
T)==2 &&
N==4) {
return vminv_u16 (sk_bit_cast<uint16x4_t>(
x)) > 0;}
597 if constexpr (
sizeof(
T)==2 &&
N==8) {
return vminvq_u16(sk_bit_cast<uint16x8_t>(
x)) > 0;}
598 if constexpr (
sizeof(
T)==4 &&
N==2) {
return vminv_u32 (sk_bit_cast<uint32x2_t>(
x)) > 0;}
599 if constexpr (
sizeof(
T)==4 &&
N==4) {
return vminvq_u32(sk_bit_cast<uint32x4_t>(
x)) > 0;}
600#endif
601#if SKVX_USE_SIMD && defined(__wasm_simd128__)
602 if constexpr (
N == 4 &&
sizeof(
T) == 4) {
603 return wasm_i32x4_all_true(
sk_bit_cast<VExt<4,int>>(
x));
604 }
605#endif
606#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
607 if constexpr (
N == 8 &&
sizeof(
T) == 4) {
608 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
609 sk_bit_cast<__m256i>(
x)));
610 return (retv[0] & retv[4]) == 0b1111;
611 }
612#endif
613#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
614 if constexpr (
N == 4 &&
sizeof(
T) == 4) {
615 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
616 sk_bit_cast<__m128i>(
x)));
617 return retv[0] == 0b1111;
618 }
619#endif
622}
623
624
625
626template <typename D, typename S>
627SI Vec<1,D>
cast(
const Vec<1,S>& src) {
return (
D)
src.val; }
628
629template <typename D, int N, typename S>
630SI Vec<N,D>
cast(
const Vec<N,S>& src) {
631#if SKVX_USE_SIMD && defined(__clang__)
632 return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));
633#else
634 return join(cast<D>(
src.lo), cast<D>(
src.hi));
635#endif
636}
637
638
639SIT T min(
const Vec<1,T>&
x) {
return x.val; }
640SIT T max(
const Vec<1,T>&
x) {
return x.val; }
643
646
647SINTU Vec<N,T>
min(
const Vec<N,T>&
x, U
y) {
return min(
x, Vec<N,T>(
y)); }
648SINTU Vec<N,T>
max(
const Vec<N,T>&
x, U
y) {
return max(
x, Vec<N,T>(
y)); }
649SINTU Vec<N,T>
min(U
x,
const Vec<N,T>&
y) {
return min(Vec<N,T>(
x),
y); }
650SINTU Vec<N,T>
max(U
x,
const Vec<N,T>&
y) {
return max(Vec<N,T>(
x),
y); }
651
652
653
654SINT Vec<N,T>
pin(
const Vec<N,T>&
x,
const Vec<N,T>& lo,
const Vec<N,T>& hi) {
656}
657
658
659
660
661
662
663
664
665template <
int... Ix,
int N,
typename T>
666SI Vec<
sizeof...(Ix),
T>
shuffle(
const Vec<N,T>&
x) {
667#if SKVX_USE_SIMD && defined(__clang__)
668
669 return to_vec<
sizeof...(Ix),
T>(__builtin_shufflevector(to_vext(
x), to_vext(
x), Ix...));
670#else
672#endif
673}
674
675
676
677
678template <
typename Fn,
typename... Args,
size_t...
I>
679SI auto map(std::index_sequence<I...>,
680 Fn&& fn,
const Args&...
args) ->
skvx::Vec<
sizeof...(I),
decltype(fn(
args[0]...))> {
681 auto lane = [&](size_t i)
682#if defined(__clang__)
683
684
685
686
687
689#endif
690 {
return fn(
args[
static_cast<int>(i)]...); };
691
692 return { lane(
I)... };
693}
694
695template <
typename Fn,
int N,
typename T,
typename... Rest>
696auto map(Fn&& fn,
const Vec<N,T>& first,
const Rest&... rest) {
697
698 return map(std::make_index_sequence<N>{}, fn, first,rest...);
699}
700
701SIN Vec<N,float>
ceil(
const Vec<N,float>&
x) {
return map( ceilf,
x); }
702SIN Vec<N,float>
floor(
const Vec<N,float>&
x) {
return map(floorf,
x); }
703SIN Vec<N,float>
trunc(
const Vec<N,float>&
x) {
return map(truncf,
x); }
704SIN Vec<N,float>
round(
const Vec<N,float>&
x) {
return map(roundf,
x); }
705SIN Vec<N,float>
sqrt(
const Vec<N,float>&
x) {
return map( sqrtf,
x); }
706SIN Vec<N,float>
abs(
const Vec<N,float>&
x) {
return map( fabsf,
x); }
707SIN Vec<N,float>
fma(
const Vec<N,float>&
x,
708 const Vec<N,float>&
y,
709 const Vec<N,float>& z) {
710
711 auto fn = [](
float x,
float y,
float z) {
return fmaf(
x,
y,z); };
712 return map(fn,
x,
y,z);
713}
714
715SI Vec<1,int>
lrint(
const Vec<1,float>&
x) {
716 return (
int)lrintf(
x.val);
717}
718SIN Vec<N,int>
lrint(
const Vec<N,float>&
x) {
719#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
720 if constexpr (
N == 8) {
721 return sk_bit_cast<Vec<N,int>>(_mm256_cvtps_epi32(sk_bit_cast<__m256>(
x)));
722 }
723#endif
724#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
725 if constexpr (
N == 4) {
726 return sk_bit_cast<Vec<N,int>>(_mm_cvtps_epi32(sk_bit_cast<__m128>(
x)));
727 }
728#endif
729#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
730 if constexpr (
N == 8) {
731 return sk_bit_cast<Vec<N,int>>(__lasx_xvftint_w_s(sk_bit_cast<__m256>(
x)));
732 }
733#endif
734#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
735 if constexpr (
N == 4) {
736 return sk_bit_cast<Vec<N,int>>(__lsx_vftint_w_s(sk_bit_cast<__m128>(
x)));
737 }
738#endif
741}
742
744
745
746
747
748
749SIN Vec<N,uint16_t>
to_half(
const Vec<N,float>&
x) {
751
752
753
754
755 if constexpr (
N > 4) {
758 }
759
760#if SKVX_USE_SIMD && defined(__aarch64__)
761 if constexpr (
N == 4) {
762 return sk_bit_cast<Vec<N,uint16_t>>(vcvt_f16_f32(sk_bit_cast<float32x4_t>(
x)));
763
764 }
765#endif
766
767#define I(x) sk_bit_cast<Vec<N,int32_t>>(x)
768#define F(x) sk_bit_cast<Vec<N,float>>(x)
769 Vec<N,int32_t> sem =
I(
x),
770 s = sem & 0x8000
'0000,
771 em = min(sem ^ s, 0x4780'0000),
772
773
774
775 magic =
I(
max(
F(em) * 8192.f, 0.5f)) & (255 << 23),
776 rounded =
I((
F(em) +
F(magic))),
777
778
779 exp = ((
magic >> 13) - ((127-15+13+1)<<10)),
781 return cast<uint16_t>((
s>>16) |
f16);
782#undef I
783#undef F
784}
785
786
787
788
790 if constexpr (
N > 4) {
793 }
794
795#if SKVX_USE_SIMD && defined(__aarch64__)
796 if constexpr (
N == 4) {
797 return sk_bit_cast<Vec<N,float>>(vcvt_f32_f16(sk_bit_cast<float16x4_t>(
x)));
798 }
799#endif
800
801 Vec<N,int32_t> wide = cast<int32_t>(
x),
804 inf_or_nan = (em >= (31 << 10)) & (255 << 23),
805 is_norm = em > 0x3ff,
806
807 sub =
sk_bit_cast<Vec<N,int32_t>>((cast<float>(em) * (1.f/(1<<24)))),
808 norm = ((em<<13) + ((127-15)<<23)),
809 finite = (is_norm & norm) | (~is_norm & sub);
810
811
812
813 return sk_bit_cast<Vec<N,float>>((
s<<16) | finite | inf_or_nan);
814}
815
816
817SIN Vec<N,uint8_t>
div255(
const Vec<N,uint16_t>&
x) {
818 return cast<uint8_t>( (
x+127)/255 );
819}
820
821
822
823SIN Vec<N,uint8_t>
approx_scale(
const Vec<N,uint8_t>&
x,
const Vec<N,uint8_t>&
y) {
824
825
826 auto X = cast<uint16_t>(
x),
827 Y = cast<uint16_t>(
y);
828 return cast<uint8_t>( (
X*
Y+
X)/256 );
829}
830
831
832SINT std::enable_if_t<std::is_unsigned_v<T>, Vec<N,T>>
saturated_add(
const Vec<N,T>&
x,
834#if SKVX_USE_SIMD && (SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 || defined(SK_ARM_HAS_NEON) || \
835 SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX)
836
837
838 if constexpr (
N == 16 &&
sizeof(
T) == 1) {
839 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
840 return sk_bit_cast<Vec<N,T>>(_mm_adds_epu8(sk_bit_cast<__m128i>(
x),
841 sk_bit_cast<__m128i>(
y)));
842 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
843 return sk_bit_cast<Vec<N,T>>(__lsx_vsadd_bu(sk_bit_cast<__m128i>(
x),
844 sk_bit_cast<__m128i>(
y)));
845 #else
846 return sk_bit_cast<Vec<N,T>>(vqaddq_u8(sk_bit_cast<uint8x16_t>(
x),
847 sk_bit_cast<uint8x16_t>(
y)));
848 #endif
849 }
else if constexpr (
N < 16 &&
sizeof(
T) == 1) {
851 }
else if constexpr (
sizeof(
T) == 1) {
853 }
854#endif
855
857 return if_then_else(sum <
x, Vec<N,T>(std::numeric_limits<T>::max()), sum);
858}
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878class ScaledDividerU32 {
879public:
880 explicit ScaledDividerU32(uint32_t divisor)
881 : fDivisorFactor{(uint32_t)(
std::
round((1.0 / divisor) * (1ull << 32)))}
882 , fHalf{(divisor + 1) >> 1} {
883 assert(divisor > 1);
884 }
885
886 Vec<4, uint32_t> divide(const Vec<4, uint32_t>& numerator) const {
887#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
888 uint64x2_t hi = vmull_n_u32(vget_high_u32(to_vext(numerator)), fDivisorFactor);
889 uint64x2_t lo = vmull_n_u32(vget_low_u32(to_vext(numerator)), fDivisorFactor);
890
891 return to_vec<4, uint32_t>(vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)));
892#else
893 return cast<uint32_t>((cast<uint64_t>(numerator) * fDivisorFactor) >> 32);
894#endif
895 }
896
897 uint32_t half() const { return fHalf; }
898
899private:
900 const uint32_t fDivisorFactor;
901 const uint32_t fHalf;
902};
903
904
905SIN Vec<N,uint16_t>
mull(
const Vec<N,uint8_t>&
x,
906 const Vec<N,uint8_t>&
y) {
907#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
908
909 if constexpr (
N == 8) {
910 return to_vec<8,uint16_t>(vmull_u8(to_vext(
x), to_vext(
y)));
911 }
else if constexpr (
N < 8) {
913 } else {
915 }
916#else
917 return cast<uint16_t>(
x) * cast<uint16_t>(
y);
918#endif
919}
920
921SIN Vec<N,uint32_t>
mull(
const Vec<N,uint16_t>&
x,
922 const Vec<N,uint16_t>&
y) {
923#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
924
925 if constexpr (
N == 4) {
926 return to_vec<4,uint32_t>(vmull_u16(to_vext(
x), to_vext(
y)));
927 }
else if constexpr (
N < 4) {
929 } else {
931 }
932#else
933 return cast<uint32_t>(
x) * cast<uint32_t>(
y);
934#endif
935}
936
937SIN Vec<N,uint16_t>
mulhi(
const Vec<N,uint16_t>&
x,
938 const Vec<N,uint16_t>&
y) {
939#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
940
941 if constexpr (
N == 8) {
942 return sk_bit_cast<Vec<8,uint16_t>>(_mm_mulhi_epu16(sk_bit_cast<__m128i>(
x),
943 sk_bit_cast<__m128i>(
y)));
944 }
else if constexpr (
N < 8) {
946 } else {
948 }
949#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
950 if constexpr (
N == 8) {
951 return sk_bit_cast<Vec<8,uint16_t>>(__lsx_vmuh_hu(sk_bit_cast<__m128i>(
x),
952 sk_bit_cast<__m128i>(
y)));
953 }
else if constexpr (
N < 8) {
955 } else {
957 }
958#else
959 return skvx::cast<uint16_t>(
mull(
x,
y) >> 16);
960#endif
961}
962
963SINT T dot(
const Vec<N, T>&
a,
const Vec<N, T>&
b) {
964
965
966
968 if constexpr (
N == 2) {
969 return ab[0] +
ab[1];
970 }
else if constexpr (
N == 4) {
971 return ab[0] +
ab[1] +
ab[2] +
ab[3];
972 } else {
974 for (
int i = 1; i <
N; ++i) {
976 }
977 return sum;
978 }
979}
980
981SIT T cross(
const Vec<2, T>&
a,
const Vec<2, T>&
b) {
982 auto x =
a * shuffle<1,0>(
b);
984}
985
986SIN float length(
const Vec<N, float>& v) {
987 return std::sqrt(
dot(v, v));
988}
989
990SIN double length(
const Vec<N, double>& v) {
991 return std::sqrt(
dot(v, v));
992}
993
996}
997
1000}
1001
1003
1004
1006}
1007
1008
1009
1010
1011
1015 Vec<1,T>& c,
1019 c.val = v[2];
1021}
1025 Vec<N,T>& c,
1029}
1030#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
1031#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
1032SI void strided_load4(const T* v, \
1033 Vec<N,T>& a, \
1034 Vec<N,T>& b, \
1035 Vec<N,T>& c, \
1036 Vec<N,T>& d) { \
1037 auto mat = VLD(v); \
1038 a = sk_bit_cast<Vec<N,T>>(mat.val[0]); \
1039 b = sk_bit_cast<Vec<N,T>>(mat.val[1]); \
1040 c = sk_bit_cast<Vec<N,T>>(mat.val[2]); \
1041 d = sk_bit_cast<Vec<N,T>>(mat.val[3]); \
1042}
1043IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32)
1044IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16)
1045IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8)
1046IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32)
1047IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16)
1048IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8)
1049IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32)
1050IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32)
1051IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16)
1052IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8)
1053IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32)
1054IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16)
1055IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8)
1056IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32)
1057#undef IMPL_LOAD4_TRANSPOSED
1058
1059#elif SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
1060
1064 Vec<4,float>& c,
1066 __m128 a_ = _mm_loadu_ps(v);
1067 __m128 b_ = _mm_loadu_ps(v+4);
1068 __m128 c_ = _mm_loadu_ps(v+8);
1069 __m128 d_ = _mm_loadu_ps(v+12);
1070 _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
1071 a = sk_bit_cast<Vec<4,float>>(a_);
1072 b = sk_bit_cast<Vec<4,float>>(b_);
1073 c = sk_bit_cast<Vec<4,float>>(c_);
1074 d = sk_bit_cast<Vec<4,float>>(d_);
1075}
1076
1077#elif SKVX_USE_SIMD && SKVX_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
// 4x4 transpose of 32-bit lanes across four LSX registers, analogous to
// x86's _MM_TRANSPOSE4_PS, built from interleave-low/high primitives.
#define _LSX_TRANSPOSE4(row0, row1, row2, row3)   \
do {                                              \
    __m128i __t0 = __lsx_vilvl_w (row1, row0);    \
    __m128i __t1 = __lsx_vilvl_w (row3, row2);    \
    __m128i __t2 = __lsx_vilvh_w (row1, row0);    \
    __m128i __t3 = __lsx_vilvh_w (row3, row2);    \
    (row0) = __lsx_vilvl_d (__t1, __t0);          \
    (row1) = __lsx_vilvh_d (__t1, __t0);          \
    (row2) = __lsx_vilvl_d (__t3, __t2);          \
    (row3) = __lsx_vilvh_d (__t3, __t2);          \
} while (0)
1089
1093 Vec<4,int>& c,
1095 __m128i a_ = __lsx_vld(v, 0);
1096 __m128i b_ = __lsx_vld(v, 16);
1097 __m128i c_ = __lsx_vld(v, 32);
1098 __m128i d_ = __lsx_vld(v, 48);
1099 _LSX_TRANSPOSE4(a_, b_, c_, d_);
1100 a = sk_bit_cast<Vec<4,int>>(a_);
1101 b = sk_bit_cast<Vec<4,int>>(b_);
1102 c = sk_bit_cast<Vec<4,int>>(c_);
1103 d = sk_bit_cast<Vec<4,int>>(d_);
1104}
1105#endif
1106
1107
1108
1109
1110
1114}
1118}
1119#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
1120#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \
1121SI void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) { \
1122 auto mat = VLD(v); \
1123 a = sk_bit_cast<Vec<N,T>>(mat.val[0]); \
1124 b = sk_bit_cast<Vec<N,T>>(mat.val[1]); \
1125}
1126IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32)
1127IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16)
1128IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8)
1129IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32)
1130IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16)
1131IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8)
1132IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32)
1133IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32)
1134IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16)
1135IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8)
1136IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32)
1137IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16)
1138IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8)
1139IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32)
1140#undef IMPL_LOAD2_TRANSPOSED
1141#endif
1142
1143
1144using float2 = Vec< 2, float>;
1145using float4 = Vec< 4, float>;
1146using float8 = Vec< 8, float>;
1147
1148using double2 = Vec< 2, double>;
1149using double4 = Vec< 4, double>;
1150using double8 = Vec< 8, double>;
1151
1152using byte2 = Vec< 2, uint8_t>;
1153using byte4 = Vec< 4, uint8_t>;
1154using byte8 = Vec< 8, uint8_t>;
1155using byte16 = Vec<16, uint8_t>;
1156
1157using int2 = Vec< 2, int32_t>;
1158using int4 = Vec< 4, int32_t>;
1159using int8 = Vec< 8, int32_t>;
1160
1161using ushort2 = Vec< 2, uint16_t>;
1162using ushort4 = Vec< 4, uint16_t>;
1163using ushort8 = Vec< 8, uint16_t>;
1164
1165using uint2 = Vec< 2, uint32_t>;
1166using uint4 = Vec< 4, uint32_t>;
1167using uint8 = Vec< 8, uint32_t>;
1168
1169using long2 = Vec< 2, int64_t>;
1170using long4 = Vec< 4, int64_t>;
1171using long8 = Vec< 8, int64_t>;
1172
1173
1174using half2 = Vec< 2, uint16_t>;
1175using half4 = Vec< 4, uint16_t>;
1176using half8 = Vec< 8, uint16_t>;
1177
1178}
1179
1180#undef SINTU
1181#undef SINT
1182#undef SIN
1183#undef SIT
1184#undef SI
1185#undef SKVX_ALWAYS_INLINE
1186#undef SKVX_USE_SIMD
1187
1188#endif
static void round(SkPoint *p)
static const uint64_t f16[kNumPixels]
std::enable_if_t< sknonstd::is_bitmask_enum< E >::value, E > constexpr operator&(E l, E r)
std::enable_if_t< sknonstd::is_bitmask_enum< E >::value, E & > constexpr operator&=(E &l, E r)
std::enable_if_t< sknonstd::is_bitmask_enum< E >::value, E & > constexpr operator^=(E &l, E r)
std::enable_if_t< sknonstd::is_bitmask_enum< E >::value, E > constexpr operator~(E e)
std::enable_if_t< sknonstd::is_bitmask_enum< E >::value, E > constexpr operator|(E l, E r)
std::enable_if_t< sknonstd::is_bitmask_enum< E >::value, E & > constexpr operator|=(E &l, E r)
std::enable_if_t< sknonstd::is_bitmask_enum< E >::value, E > constexpr operator^(E l, E r)
static uint8_t div255(unsigned prod)
static bool SkIsFinite(T x, Pack... values)
static void normalize(int n, double *gauss)
static skvx::float4 fma(const skvx::float4 &f, float m, const skvx::float4 &a)
static SkSize operator*(SkISize u, SkScalar s)
bool operator!=(const sk_sp< T > &a, const sk_sp< U > &b)
static SK_ALWAYS_INLINE Dst SK_FP_SAFE_ABI sk_bit_cast(const Src &src)
#define SKVX_ALWAYS_INLINE
static const char * begin(const StringSlice &s)
static bool operator<(const SkPlainTextEditor::Editor::TextPosition &u, const SkPlainTextEditor::Editor::TextPosition &v)
bool operator==(const FlutterPoint &a, const FlutterPoint &b)
std::ostream & operator<<(std::ostream &out, const FlutterPoint &point)
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE auto & d
G_BEGIN_DECLS G_MODULE_EXPORT FlValue * args
static float max(float r, float g, float b)
static float min(float r, float g, float b)
__attribute__((visibility("default"))) int RunBenchmarks(int argc
it will be possible to load the file into Perfetto s trace viewer disable asset Prevents usage of any non test fonts unless they were explicitly Loaded via prefetched default font Indicates whether the embedding started a prefetch of the default font manager before creating the engine run In non interactive keep the shell running after the Dart script has completed enable serial On low power devices with low core running concurrent GC tasks on threads can cause them to contend with the UI thread which could potentially lead to jank This option turns off all concurrent GC activities domain network JSON encoded network policy per domain This overrides the DisallowInsecureConnections switch Embedder can specify whether to allow or disallow insecure connections at a domain level old gen heap size
constexpr Color operator-(T value, const Color &c)
constexpr Color operator/(T value, const Color &c)
constexpr bool operator>=(const EnumType &lhs, const Mask< EnumType > &rhs)
constexpr Color operator+(T value, const Color &c)
constexpr bool operator<=(const EnumType &lhs, const Mask< EnumType > &rhs)
int64_t cross(Point d0, Point d1)
SINT bool isfinite(const Vec< N, T > &v)
SIN Vec< N, float > trunc(const Vec< N, float > &x)
Vec< 8, uint16_t > ushort8
SINT T dot(const Vec< N, T > &a, const Vec< N, T > &b)
SINT Vec< N, T > & operator-=(Vec< N, T > &x, const Vec< N, T > &y)
SI Vec< 1, int > lrint(const Vec< 1, float > &x)
Vec< 16, uint8_t > byte16
SINT Vec< N, T > naive_if_then_else(const Vec< N, M< T > > &cond, const Vec< N, T > &t, const Vec< N, T > &e)
SIT void strided_load4(const T *v, Vec< 1, T > &a, Vec< 1, T > &b, Vec< 1, T > &c, Vec< 1, T > &d)
SIN Vec< N, uint16_t > mulhi(const Vec< N, uint16_t > &x, const Vec< N, uint16_t > &y)
SIN Vec< N, float > abs(const Vec< N, float > &x)
SINT Vec< N, T > & operator*=(Vec< N, T > &x, const Vec< N, T > &y)
SIT void strided_load2(const T *v, Vec< 1, T > &a, Vec< 1, T > &b)
SIN Vec< N, float > sqrt(const Vec< N, float > &x)
SINT Vec< 2 *N, T > join(const Vec< N, T > &lo, const Vec< N, T > &hi)
SIN Vec< N, uint16_t > mull(const Vec< N, uint8_t > &x, const Vec< N, uint8_t > &y)
SINT Vec< N, T > & operator>>=(Vec< N, T > &x, int bits)
SIN Vec< N, float > from_half(const Vec< N, uint16_t > &x)
SIN Vec< N, uint16_t > to_half(const Vec< N, float > &x)
SIT bool all(const Vec< 1, T > &x)
SINT std::enable_if_t< std::is_unsigned_v< T >, Vec< N, T > > saturated_add(const Vec< N, T > &x, const Vec< N, T > &y)
SI auto map(std::index_sequence< I... >, Fn &&fn, const Args &... args) -> skvx::Vec< sizeof...(I), decltype(fn(args[0]...))>
SIT Vec< 1, T > operator>>(const Vec< 1, T > &x, int k)
Vec< 2, uint16_t > ushort2
SIT Vec< 1, T > operator!(const Vec< 1, T > &x)
SINT Vec< N, T > & operator/=(Vec< N, T > &x, const Vec< N, T > &y)
SI Vec< sizeof...(Ix), T > shuffle(const Vec< N, T > &)
Vec< 4, uint16_t > ushort4
SINT Vec< N, T > & operator+=(Vec< N, T > &x, const Vec< N, T > &y)
SIN Vec< N, float > fract(const Vec< N, float > &x)
SIN Vec< N, float > floor(const Vec< N, float > &x)
SIT bool any(const Vec< 1, T > &x)
SIN Vec< N, float > ceil(const Vec< N, float > &x)
SINT Vec< N, T > & operator<<=(Vec< N, T > &x, int bits)
SIN Vec< N, uint8_t > approx_scale(const Vec< N, uint8_t > &x, const Vec< N, uint8_t > &y)
SINT Vec< N, T > pin(const Vec< N, T > &x, const Vec< N, T > &lo, const Vec< N, T > &hi)