SkVx.h
1/*
2 * Copyright 2019 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef SKVX_DEFINED
9#define SKVX_DEFINED
10
11// skvx::Vec<N,T> are SIMD vectors of N T's, a v1.5 successor to SkNx<N,T>.
12//
13// This time we're leaning a bit less on platform-specific intrinsics and a bit
14// more on Clang/GCC vector extensions, but still keeping the option open to
15// drop in platform-specific intrinsics, actually more easily than before.
16//
17// We've also fixed a few of the caveats that used to make SkNx awkward to work
18// with across translation units. skvx::Vec<N,T> always has N*sizeof(T) size
19// and alignment and is safe to use across translation units freely.
20// (Ideally we'd only align to T, but that tanks ARMv7 NEON codegen.)
21
22#include "include/private/base/SkFeatures.h"
23#include "src/base/SkUtils.h"
24#include <algorithm> // std::min, std::max
25#include <cassert> // assert()
26#include <cmath> // ceilf, floorf, truncf, roundf, sqrtf, etc.
27#include <cstdint> // intXX_t
28#include <cstring> // memcpy()
29#include <initializer_list> // std::initializer_list
30#include <type_traits>
31#include <utility> // std::index_sequence
32
33// Users may disable SIMD with SKNX_NO_SIMD, which may be set via compiler flags.
34// The gn build has no option which sets SKNX_NO_SIMD.
35// Use SKVX_USE_SIMD internally to avoid confusing double negation.
36// Do not use 'defined' in a macro expansion.
37#if !defined(SKNX_NO_SIMD)
38 #define SKVX_USE_SIMD 1
39#else
40 #define SKVX_USE_SIMD 0
41#endif
42
43#if SKVX_USE_SIMD
44 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
45 #include <immintrin.h>
46 #elif defined(SK_ARM_HAS_NEON)
47 #include <arm_neon.h>
48 #elif defined(__wasm_simd128__)
49 #include <wasm_simd128.h>
50 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
51 #include <lasxintrin.h>
52 #include <lsxintrin.h>
53 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
54 #include <lsxintrin.h>
55 #endif
56#endif
57
58// To avoid ODR violations, all methods must be force-inlined...
59#if defined(_MSC_VER)
60 #define SKVX_ALWAYS_INLINE __forceinline
61#else
62 #define SKVX_ALWAYS_INLINE __attribute__((always_inline))
63#endif
64
65// ... and all standalone functions must be static. Please use these helpers:
66#define SI static inline
67#define SIT template < typename T> SI
68#define SIN template <int N > SI
69#define SINT template <int N, typename T> SI
70#define SINTU template <int N, typename T, typename U, \
71 typename=std::enable_if_t<std::is_convertible<U,T>::value>> SI
72
73namespace skvx {
74
75template <int N, typename T>
76struct alignas(N*sizeof(T)) Vec;
77
78template <int... Ix, int N, typename T>
79SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>&);
80
81// All Vec have the same simple memory layout, the same as `T vec[N]`.
82template <int N, typename T>
83struct alignas(N*sizeof(T)) Vec {
84 static_assert((N & (N-1)) == 0, "N must be a power of 2.");
85 static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");
86
87 // Methods belong here in the class declaration of Vec only if:
88 // - they must be here, like constructors or operator[];
89 // - they'll definitely never want a specialized implementation.
90 // Other operations on Vec should be defined outside the type.
91
92 SKVX_ALWAYS_INLINE Vec() = default;
93 SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}
94
95 // NOTE: Vec{x} produces x000..., whereas Vec(x) produces xxxx.... since this constructor fills
96 // unspecified lanes with 0s, whereas the single T constructor fills all lanes with the value.
97 SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
98 T vals[N] = {0};
99 assert(xs.size() <= (size_t)N);
100 memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)N)*sizeof(T));
101
102 this->lo = Vec<N/2,T>::Load(vals + 0);
103 this->hi = Vec<N/2,T>::Load(vals + N/2);
104 }
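    // For illustration, a small sketch of the distinction called out above (values are arbitrary):
    //     skvx::Vec<4,float> a{1.0f};                   // ~> {1,0,0,0}: unspecified lanes fill with 0
    //     skvx::Vec<4,float> b(1.0f);                   // ~> {1,1,1,1}: the single-T constructor splats
    //     skvx::Vec<4,float> c = {1.f, 2.f, 3.f, 4.f};  // ~> {1,2,3,4}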
105
106 SKVX_ALWAYS_INLINE T operator[](int i) const { return i<N/2 ? this->lo[i] : this->hi[i-N/2]; }
107 SKVX_ALWAYS_INLINE T& operator[](int i) { return i<N/2 ? this->lo[i] : this->hi[i-N/2]; }
108
109 SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
110 return sk_unaligned_load<Vec>(ptr);
111 }
112 SKVX_ALWAYS_INLINE void store(void* ptr) const {
113 // Note: Calling sk_unaligned_store produces slightly worse code here, for some reason
114 memcpy(ptr, this, sizeof(Vec));
115 }
116
117 Vec<N/2,T> lo, hi;
118};
119
120// We have specializations for N == 1 (the base-case), as well as 2 and 4, where we add helpful
121// constructors and swizzle accessors.
122template <typename T>
123struct alignas(4*sizeof(T)) Vec<4,T> {
124 static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");
125
126 SKVX_ALWAYS_INLINE Vec() = default;
127 SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}
128 SKVX_ALWAYS_INLINE Vec(T x, T y, T z, T w) : lo(x,y), hi(z,w) {}
129 SKVX_ALWAYS_INLINE Vec(Vec<2,T> xy, T z, T w) : lo(xy), hi(z,w) {}
130 SKVX_ALWAYS_INLINE Vec(T x, T y, Vec<2,T> zw) : lo(x,y), hi(zw) {}
131 SKVX_ALWAYS_INLINE Vec(Vec<2,T> xy, Vec<2,T> zw) : lo(xy), hi(zw) {}
132
133 SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
134 T vals[4] = {0};
135 assert(xs.size() <= (size_t)4);
136 memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)4)*sizeof(T));
137
138 this->lo = Vec<2,T>::Load(vals + 0);
139 this->hi = Vec<2,T>::Load(vals + 2);
140 }
141
142 SKVX_ALWAYS_INLINE T operator[](int i) const { return i<2 ? this->lo[i] : this->hi[i-2]; }
143 SKVX_ALWAYS_INLINE T& operator[](int i) { return i<2 ? this->lo[i] : this->hi[i-2]; }
144
145 SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
146 return sk_unaligned_load<Vec>(ptr);
147 }
148 SKVX_ALWAYS_INLINE void store(void* ptr) const {
149 memcpy(ptr, this, sizeof(Vec));
150 }
151
152 SKVX_ALWAYS_INLINE Vec<2,T>& xy() { return lo; }
153 SKVX_ALWAYS_INLINE Vec<2,T>& zw() { return hi; }
154 SKVX_ALWAYS_INLINE T& x() { return lo.lo.val; }
155 SKVX_ALWAYS_INLINE T& y() { return lo.hi.val; }
156 SKVX_ALWAYS_INLINE T& z() { return hi.lo.val; }
157 SKVX_ALWAYS_INLINE T& w() { return hi.hi.val; }
158
159 SKVX_ALWAYS_INLINE Vec<2,T> xy() const { return lo; }
160 SKVX_ALWAYS_INLINE Vec<2,T> zw() const { return hi; }
161 SKVX_ALWAYS_INLINE T x() const { return lo.lo.val; }
162 SKVX_ALWAYS_INLINE T y() const { return lo.hi.val; }
163 SKVX_ALWAYS_INLINE T z() const { return hi.lo.val; }
164 SKVX_ALWAYS_INLINE T w() const { return hi.hi.val; }
165
166 // Exchange-based swizzles. These should take 1 cycle on NEON and 3 (pipelined) cycles on SSE.
167 SKVX_ALWAYS_INLINE Vec<4,T> yxwz() const { return shuffle<1,0,3,2>(*this); }
168 SKVX_ALWAYS_INLINE Vec<4,T> zwxy() const { return shuffle<2,3,0,1>(*this); }
169
170 Vec<2,T> lo, hi;
171};
172
173template <typename T>
174struct alignas(2*sizeof(T)) Vec<2,T> {
175 static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");
176
177 SKVX_ALWAYS_INLINE Vec() = default;
178 SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}
179 SKVX_ALWAYS_INLINE Vec(T x, T y) : lo(x), hi(y) {}
180
181 SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
182 T vals[2] = {0};
183 assert(xs.size() <= (size_t)2);
184 memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)2)*sizeof(T));
185
186 this->lo = Vec<1,T>::Load(vals + 0);
187 this->hi = Vec<1,T>::Load(vals + 1);
188 }
189
190 SKVX_ALWAYS_INLINE T operator[](int i) const { return i<1 ? this->lo[i] : this->hi[i-1]; }
191 SKVX_ALWAYS_INLINE T& operator[](int i) { return i<1 ? this->lo[i] : this->hi[i-1]; }
192
193 SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
194 return sk_unaligned_load<Vec>(ptr);
195 }
196 SKVX_ALWAYS_INLINE void store(void* ptr) const {
197 memcpy(ptr, this, sizeof(Vec));
198 }
199
200 SKVX_ALWAYS_INLINE T& x() { return lo.val; }
201 SKVX_ALWAYS_INLINE T& y() { return hi.val; }
202
203 SKVX_ALWAYS_INLINE T x() const { return lo.val; }
204 SKVX_ALWAYS_INLINE T y() const { return hi.val; }
205
206 // This exchange-based swizzle should take 1 cycle on NEON and 3 (pipelined) cycles on SSE.
207 SKVX_ALWAYS_INLINE Vec<2,T> yx() const { return shuffle<1,0>(*this); }
208 SKVX_ALWAYS_INLINE Vec<4,T> xyxy() const { return Vec<4,T>(*this, *this); }
209
210 Vec<1,T> lo, hi;
211};
212
213template <typename T>
214struct Vec<1,T> {
215 T val = {};
216
217 SKVX_ALWAYS_INLINE Vec() = default;
218 SKVX_ALWAYS_INLINE Vec(T s) : val(s) {}
219
220 SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) : val(xs.size() ? *xs.begin() : 0) {
221 assert(xs.size() <= (size_t)1);
222 }
223
224 SKVX_ALWAYS_INLINE T operator[](int i) const { assert(i == 0); return val; }
225 SKVX_ALWAYS_INLINE T& operator[](int i) { assert(i == 0); return val; }
226
227 SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
228 return sk_unaligned_load<Vec>(ptr);
229 }
230 SKVX_ALWAYS_INLINE void store(void* ptr) const {
231 memcpy(ptr, this, sizeof(Vec));
232 }
233};
234
235// Translate from a value type T to its corresponding Mask, the result of a comparison.
236template <typename T> struct Mask { using type = T; };
237template <> struct Mask<float > { using type = int32_t; };
238template <> struct Mask<double> { using type = int64_t; };
239template <typename T> using M = typename Mask<T>::type;
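// For illustration (hypothetical values), a comparison produces a Vec of lane masks:
//     skvx::Vec<4,float>   v = {1,2,3,4};
//     skvx::Vec<4,int32_t> m = (v > 2.0f);   // true lanes are ~0 (all bits set), false lanes are 0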
240
241// Join two Vec<N,T> into one Vec<2N,T>.
242SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {
243 Vec<2*N,T> v;
244 v.lo = lo;
245 v.hi = hi;
246 return v;
247}
248
249// We have three strategies for implementing Vec operations:
250// 1) lean on Clang/GCC vector extensions when available;
251// 2) use map() to apply a scalar function lane-wise;
252// 3) recurse on lo/hi to scalar portable implementations.
253// We can slot in platform-specific implementations as overloads for particular Vec<N,T>,
254// or often integrate them directly into the recursion of style 3), allowing fine control.
255
256#if SKVX_USE_SIMD && (defined(__clang__) || defined(__GNUC__))
257
258 // VExt<N,T> types have the same size as Vec<N,T> and support most operations directly.
259 #if defined(__clang__)
260 template <int N, typename T>
261 using VExt = T __attribute__((ext_vector_type(N)));
262
263 #elif defined(__GNUC__)
264 template <int N, typename T>
265 struct VExtHelper {
266 typedef T __attribute__((vector_size(N*sizeof(T)))) type;
267 };
268
269 template <int N, typename T>
270 using VExt = typename VExtHelper<N,T>::type;
271
272 // For some reason some (new!) versions of GCC cannot seem to deduce N in the generic
273 // to_vec<N,T>() below for N=4 and T=float. This workaround seems to help...
274 SI Vec<4,float> to_vec(VExt<4,float> v) { return sk_bit_cast<Vec<4,float>>(v); }
275 #endif
276
277 SINT VExt<N,T> to_vext(const Vec<N,T>& v) { return sk_bit_cast<VExt<N,T>>(v); }
278 SINT Vec <N,T> to_vec(const VExt<N,T>& v) { return sk_bit_cast<Vec <N,T>>(v); }
279
280 SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
281 return to_vec<N,T>(to_vext(x) + to_vext(y));
282 }
283 SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
284 return to_vec<N,T>(to_vext(x) - to_vext(y));
285 }
286 SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
287 return to_vec<N,T>(to_vext(x) * to_vext(y));
288 }
289 SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
290 return to_vec<N,T>(to_vext(x) / to_vext(y));
291 }
292
293 SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
294 return to_vec<N,T>(to_vext(x) ^ to_vext(y));
295 }
296 SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
297 return to_vec<N,T>(to_vext(x) & to_vext(y));
298 }
299 SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
300 return to_vec<N,T>(to_vext(x) | to_vext(y));
301 }
302
303 SINT Vec<N,T> operator!(const Vec<N,T>& x) { return to_vec<N,T>(!to_vext(x)); }
304 SINT Vec<N,T> operator-(const Vec<N,T>& x) { return to_vec<N,T>(-to_vext(x)); }
305 SINT Vec<N,T> operator~(const Vec<N,T>& x) { return to_vec<N,T>(~to_vext(x)); }
306
307 SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) << k); }
308 SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) >> k); }
309
310 SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
311 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) == to_vext(y));
312 }
313 SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
314 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) != to_vext(y));
315 }
316 SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
317 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) <= to_vext(y));
318 }
319 SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
320 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) >= to_vext(y));
321 }
322 SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
323 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) < to_vext(y));
324 }
325 SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
326 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) > to_vext(y));
327 }
328
329#else
330
331 // Either SKNX_NO_SIMD is defined, or Clang/GCC vector extensions are not available.
332 // We'll implement things portably with N==1 scalar implementations and recursion onto them.
333
334 // N == 1 scalar implementations.
335 SIT Vec<1,T> operator+(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val + y.val; }
336 SIT Vec<1,T> operator-(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val - y.val; }
337 SIT Vec<1,T> operator*(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val * y.val; }
338 SIT Vec<1,T> operator/(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val / y.val; }
339
340 SIT Vec<1,T> operator^(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val ^ y.val; }
341 SIT Vec<1,T> operator&(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val & y.val; }
342 SIT Vec<1,T> operator|(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val | y.val; }
343
344 SIT Vec<1,T> operator!(const Vec<1,T>& x) { return !x.val; }
345 SIT Vec<1,T> operator-(const Vec<1,T>& x) { return -x.val; }
346 SIT Vec<1,T> operator~(const Vec<1,T>& x) { return ~x.val; }
347
348 SIT Vec<1,T> operator<<(const Vec<1,T>& x, int k) { return x.val << k; }
349 SIT Vec<1,T> operator>>(const Vec<1,T>& x, int k) { return x.val >> k; }
350
351 SIT Vec<1,M<T>> operator==(const Vec<1,T>& x, const Vec<1,T>& y) {
352 return x.val == y.val ? ~0 : 0;
353 }
354 SIT Vec<1,M<T>> operator!=(const Vec<1,T>& x, const Vec<1,T>& y) {
355 return x.val != y.val ? ~0 : 0;
356 }
357 SIT Vec<1,M<T>> operator<=(const Vec<1,T>& x, const Vec<1,T>& y) {
358 return x.val <= y.val ? ~0 : 0;
359 }
360 SIT Vec<1,M<T>> operator>=(const Vec<1,T>& x, const Vec<1,T>& y) {
361 return x.val >= y.val ? ~0 : 0;
362 }
363 SIT Vec<1,M<T>> operator< (const Vec<1,T>& x, const Vec<1,T>& y) {
364 return x.val < y.val ? ~0 : 0;
365 }
366 SIT Vec<1,M<T>> operator> (const Vec<1,T>& x, const Vec<1,T>& y) {
367 return x.val > y.val ? ~0 : 0;
368 }
369
370 // Recurse on lo/hi down to N==1 scalar implementations.
371 SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
372 return join(x.lo + y.lo, x.hi + y.hi);
373 }
374 SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
375 return join(x.lo - y.lo, x.hi - y.hi);
376 }
377 SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
378 return join(x.lo * y.lo, x.hi * y.hi);
379 }
380 SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
381 return join(x.lo / y.lo, x.hi / y.hi);
382 }
383
384 SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
385 return join(x.lo ^ y.lo, x.hi ^ y.hi);
386 }
387 SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
388 return join(x.lo & y.lo, x.hi & y.hi);
389 }
390 SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
391 return join(x.lo | y.lo, x.hi | y.hi);
392 }
393
394 SINT Vec<N,T> operator!(const Vec<N,T>& x) { return join(!x.lo, !x.hi); }
395 SINT Vec<N,T> operator-(const Vec<N,T>& x) { return join(-x.lo, -x.hi); }
396 SINT Vec<N,T> operator~(const Vec<N,T>& x) { return join(~x.lo, ~x.hi); }
397
398 SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return join(x.lo << k, x.hi << k); }
399 SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return join(x.lo >> k, x.hi >> k); }
400
401 SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
402 return join(x.lo == y.lo, x.hi == y.hi);
403 }
404 SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
405 return join(x.lo != y.lo, x.hi != y.hi);
406 }
407 SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
408 return join(x.lo <= y.lo, x.hi <= y.hi);
409 }
410 SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
411 return join(x.lo >= y.lo, x.hi >= y.hi);
412 }
413 SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
414 return join(x.lo < y.lo, x.hi < y.hi);
415 }
416 SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
417 return join(x.lo > y.lo, x.hi > y.hi);
418 }
419#endif
420
421// Scalar/vector operations splat the scalar to a vector.
422SINTU Vec<N,T> operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) + y; }
423SINTU Vec<N,T> operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) - y; }
424SINTU Vec<N,T> operator* (U x, const Vec<N,T>& y) { return Vec<N,T>(x) * y; }
425SINTU Vec<N,T> operator/ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) / y; }
426SINTU Vec<N,T> operator^ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) ^ y; }
427SINTU Vec<N,T> operator& (U x, const Vec<N,T>& y) { return Vec<N,T>(x) & y; }
428SINTU Vec<N,T> operator| (U x, const Vec<N,T>& y) { return Vec<N,T>(x) | y; }
429SINTU Vec<N,M<T>> operator==(U x, const Vec<N,T>& y) { return Vec<N,T>(x) == y; }
430SINTU Vec<N,M<T>> operator!=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) != y; }
431SINTU Vec<N,M<T>> operator<=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) <= y; }
432SINTU Vec<N,M<T>> operator>=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) >= y; }
433SINTU Vec<N,M<T>> operator< (U x, const Vec<N,T>& y) { return Vec<N,T>(x) < y; }
434SINTU Vec<N,M<T>> operator> (U x, const Vec<N,T>& y) { return Vec<N,T>(x) > y; }
435
436SINTU Vec<N,T> operator+ (const Vec<N,T>& x, U y) { return x + Vec<N,T>(y); }
437SINTU Vec<N,T> operator- (const Vec<N,T>& x, U y) { return x - Vec<N,T>(y); }
438SINTU Vec<N,T> operator* (const Vec<N,T>& x, U y) { return x * Vec<N,T>(y); }
439SINTU Vec<N,T> operator/ (const Vec<N,T>& x, U y) { return x / Vec<N,T>(y); }
440SINTU Vec<N,T> operator^ (const Vec<N,T>& x, U y) { return x ^ Vec<N,T>(y); }
441SINTU Vec<N,T> operator& (const Vec<N,T>& x, U y) { return x & Vec<N,T>(y); }
442SINTU Vec<N,T> operator| (const Vec<N,T>& x, U y) { return x | Vec<N,T>(y); }
443SINTU Vec<N,M<T>> operator==(const Vec<N,T>& x, U y) { return x == Vec<N,T>(y); }
444SINTU Vec<N,M<T>> operator!=(const Vec<N,T>& x, U y) { return x != Vec<N,T>(y); }
445SINTU Vec<N,M<T>> operator<=(const Vec<N,T>& x, U y) { return x <= Vec<N,T>(y); }
446SINTU Vec<N,M<T>> operator>=(const Vec<N,T>& x, U y) { return x >= Vec<N,T>(y); }
447SINTU Vec<N,M<T>> operator< (const Vec<N,T>& x, U y) { return x < Vec<N,T>(y); }
448SINTU Vec<N,M<T>> operator> (const Vec<N,T>& x, U y) { return x > Vec<N,T>(y); }
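// For illustration (values arbitrary), a scalar on either side splats to every lane:
//     skvx::Vec<4,float> v = {1,2,3,4};
//     auto w = 2.0f * v + 1.0f;   // ~> {3,5,7,9}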
449
450SINT Vec<N,T>& operator+=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x + y); }
451SINT Vec<N,T>& operator-=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x - y); }
452SINT Vec<N,T>& operator*=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x * y); }
453SINT Vec<N,T>& operator/=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x / y); }
454SINT Vec<N,T>& operator^=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x ^ y); }
455SINT Vec<N,T>& operator&=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x & y); }
456SINT Vec<N,T>& operator|=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x | y); }
457
458SINTU Vec<N,T>& operator+=(Vec<N,T>& x, U y) { return (x = x + Vec<N,T>(y)); }
459SINTU Vec<N,T>& operator-=(Vec<N,T>& x, U y) { return (x = x - Vec<N,T>(y)); }
460SINTU Vec<N,T>& operator*=(Vec<N,T>& x, U y) { return (x = x * Vec<N,T>(y)); }
461SINTU Vec<N,T>& operator/=(Vec<N,T>& x, U y) { return (x = x / Vec<N,T>(y)); }
462SINTU Vec<N,T>& operator^=(Vec<N,T>& x, U y) { return (x = x ^ Vec<N,T>(y)); }
463SINTU Vec<N,T>& operator&=(Vec<N,T>& x, U y) { return (x = x & Vec<N,T>(y)); }
464SINTU Vec<N,T>& operator|=(Vec<N,T>& x, U y) { return (x = x | Vec<N,T>(y)); }
465
466SINT Vec<N,T>& operator<<=(Vec<N,T>& x, int bits) { return (x = x << bits); }
467SINT Vec<N,T>& operator>>=(Vec<N,T>& x, int bits) { return (x = x >> bits); }
468
469// Some operations we want are not expressible with Clang/GCC vector extensions.
470
471// Clang can reason about naive_if_then_else() and optimize through it better
472// than if_then_else(), so it's sometimes useful to call it directly when we
473// think an entire expression should optimize away, e.g. min()/max().
474SINT Vec<N,T> naive_if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
475 return sk_bit_cast<Vec<N,T>>(( cond & sk_bit_cast<Vec<N, M<T>>>(t)) |
476 (~cond & sk_bit_cast<Vec<N, M<T>>>(e)) );
477}
478
479SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<1,T>& e) {
480 // In practice this scalar implementation is unlikely to be used. See next if_then_else().
481 return sk_bit_cast<Vec<1,T>>(( cond & sk_bit_cast<Vec<1, M<T>>>(t)) |
482 (~cond & sk_bit_cast<Vec<1, M<T>>>(e)) );
483}
484SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
485 // Specializations inline here so they can generalize what types they apply to.
486#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
487 if constexpr (N*sizeof(T) == 32) {
488 return sk_bit_cast<Vec<N,T>>(_mm256_blendv_epi8(sk_bit_cast<__m256i>(e),
489 sk_bit_cast<__m256i>(t),
490 sk_bit_cast<__m256i>(cond)));
491 }
492#endif
493#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
494 if constexpr (N*sizeof(T) == 16) {
495 return sk_bit_cast<Vec<N,T>>(_mm_blendv_epi8(sk_bit_cast<__m128i>(e),
496 sk_bit_cast<__m128i>(t),
497 sk_bit_cast<__m128i>(cond)));
498 }
499#endif
500#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
501 if constexpr (N*sizeof(T) == 16) {
502 return sk_bit_cast<Vec<N,T>>(vbslq_u8(sk_bit_cast<uint8x16_t>(cond),
503 sk_bit_cast<uint8x16_t>(t),
504 sk_bit_cast<uint8x16_t>(e)));
505 }
506#endif
507#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
508 if constexpr (N*sizeof(T) == 32) {
509 return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
510 sk_bit_cast<__m256i>(t),
511 sk_bit_cast<__m256i>(cond)));
512 }
513#endif
514#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
515 if constexpr (N*sizeof(T) == 16) {
516 return sk_bit_cast<Vec<N,T>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
517 sk_bit_cast<__m128i>(t),
518 sk_bit_cast<__m128i>(cond)));
519 }
520#endif
521 // Recurse for large vectors to try to hit the specializations above.
522 if constexpr (N*sizeof(T) > 16) {
523 return join(if_then_else(cond.lo, t.lo, e.lo),
524 if_then_else(cond.hi, t.hi, e.hi));
525 }
526 // This default can lead to better code than recursing onto scalars.
527 return naive_if_then_else(cond, t, e);
528}
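// Illustrative use of if_then_else (values arbitrary): select per lane with a comparison mask.
//     skvx::Vec<4,float> x = {-2,-1,1,2};
//     auto ax = if_then_else(x < 0.0f, -x, x);   // ~> {2,1,1,2}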
529
530SIT bool any(const Vec<1,T>& x) { return x.val != 0; }
531SINT bool any(const Vec<N,T>& x) {
532 // For any(), the _mm_testz intrinsics are correct and don't require comparing 'x' to 0, so it's
533 // lower latency compared to _mm_movemask + _mm_compneq on plain SSE.
534#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
535 if constexpr (N*sizeof(T) == 32) {
536 return !_mm256_testz_si256(sk_bit_cast<__m256i>(x), _mm256_set1_epi32(-1));
537 }
538#endif
539#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
540 if constexpr (N*sizeof(T) == 16) {
541 return !_mm_testz_si128(sk_bit_cast<__m128i>(x), _mm_set1_epi32(-1));
542 }
543#endif
544#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
545 if constexpr (N*sizeof(T) == 16) {
546 // On SSE, movemask checks only the MSB in each lane, which is fine if the lanes were set
547 // directly from a comparison op (which sets all bits to 1 when true), but skvx::Vec<>
548 // treats any non-zero value as true, so we have to compare 'x' to 0 before calling movemask
549 return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) != 0b0000;
550 }
551#endif
552#if SKVX_USE_SIMD && defined(__aarch64__)
553 // On 64-bit NEON, take the max across lanes, which will be non-zero if any lane was true.
554 // The specific lane-size doesn't really matter in this case since it's really any set bit
555 // that we're looking for.
556 if constexpr (N*sizeof(T) == 8 ) { return vmaxv_u8 (sk_bit_cast<uint8x8_t> (x)) > 0; }
557 if constexpr (N*sizeof(T) == 16) { return vmaxvq_u8(sk_bit_cast<uint8x16_t>(x)) > 0; }
558#endif
559#if SKVX_USE_SIMD && defined(__wasm_simd128__)
560 if constexpr (N == 4 && sizeof(T) == 4) {
561 return wasm_i32x4_any_true(sk_bit_cast<VExt<4,int>>(x));
562 }
563#endif
564#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
565 if constexpr (N*sizeof(T) == 32) {
566 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
567 sk_bit_cast<__m256i>(x)));
568 return (retv[0] | retv[4]) != 0b0000;
569 }
570#endif
571#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
572 if constexpr (N*sizeof(T) == 16) {
573 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
574 sk_bit_cast<__m128i>(x)));
575 return retv[0] != 0b0000;
576 }
577#endif
578 return any(x.lo)
579 || any(x.hi);
580}
581
582SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
583SINT bool all(const Vec<N,T>& x) {
584// Unlike any(), we have to respect the lane layout, or we'll miss cases where a
585// true lane has a mix of 0 and 1 bits.
586#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
587 // Unfortunately, the _mm_testc intrinsics don't let us avoid the comparison to 0 for all()'s
588 // correctness, so always just use the plain SSE version.
589 if constexpr (N == 4 && sizeof(T) == 4) {
590 return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) == 0b1111;
591 }
592#endif
593#if SKVX_USE_SIMD && defined(__aarch64__)
594 // On 64-bit NEON, take the min across the lanes, which will be non-zero if all lanes are != 0.
595 if constexpr (sizeof(T)==1 && N==8) {return vminv_u8 (sk_bit_cast<uint8x8_t> (x)) > 0;}
596 if constexpr (sizeof(T)==1 && N==16) {return vminvq_u8 (sk_bit_cast<uint8x16_t>(x)) > 0;}
597 if constexpr (sizeof(T)==2 && N==4) {return vminv_u16 (sk_bit_cast<uint16x4_t>(x)) > 0;}
598 if constexpr (sizeof(T)==2 && N==8) {return vminvq_u16(sk_bit_cast<uint16x8_t>(x)) > 0;}
599 if constexpr (sizeof(T)==4 && N==2) {return vminv_u32 (sk_bit_cast<uint32x2_t>(x)) > 0;}
600 if constexpr (sizeof(T)==4 && N==4) {return vminvq_u32(sk_bit_cast<uint32x4_t>(x)) > 0;}
601#endif
602#if SKVX_USE_SIMD && defined(__wasm_simd128__)
603 if constexpr (N == 4 && sizeof(T) == 4) {
604 return wasm_i32x4_all_true(sk_bit_cast<VExt<4,int>>(x));
605 }
606#endif
607#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
608 if constexpr (N == 8 && sizeof(T) == 4) {
609 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
610 sk_bit_cast<__m256i>(x)));
611 return (retv[0] & retv[4]) == 0b1111;
612 }
613#endif
614#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
615 if constexpr (N == 4 && sizeof(T) == 4) {
616 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
617 sk_bit_cast<__m128i>(x)));
618 return retv[0] == 0b1111;
619 }
620#endif
621 return all(x.lo)
622 && all(x.hi);
623}
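// Small sketch of any()/all() (values arbitrary):
//     skvx::Vec<4,int> v = {0,1,2,3};
//     bool some  = any(v > 0);   // true: at least one lane is non-zero
//     bool every = all(v > 0);   // false: lane 0 is zero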
624
625// cast() Vec<N,S> to Vec<N,D>, as if applying a C-cast to each lane.
626// TODO: implement with map()?
627template <typename D, typename S>
628SI Vec<1,D> cast(const Vec<1,S>& src) { return (D)src.val; }
629
630template <typename D, int N, typename S>
631SI Vec<N,D> cast(const Vec<N,S>& src) {
632#if SKVX_USE_SIMD && defined(__clang__)
633 return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));
634#else
635 return join(cast<D>(src.lo), cast<D>(src.hi));
636#endif
637}
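// For example (values arbitrary), each lane converts like a C-style cast:
//     skvx::Vec<4,float> f = {0.5f, 1.5f, 2.5f, 3.5f};
//     auto i = skvx::cast<int>(f);   // ~> {0,1,2,3}, truncating like (int)lane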
638
639// min/max match logic of std::min/std::max, which is important when NaN is involved.
640SIT T min(const Vec<1,T>& x) { return x.val; }
641SIT T max(const Vec<1,T>& x) { return x.val; }
642SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }
643SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }
644
645SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(y < x, y, x); }
646SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(x < y, y, x); }
647
648SINTU Vec<N,T> min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }
649SINTU Vec<N,T> max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }
650SINTU Vec<N,T> min(U x, const Vec<N,T>& y) { return min(Vec<N,T>(x), y); }
651SINTU Vec<N,T> max(U x, const Vec<N,T>& y) { return max(Vec<N,T>(x), y); }
652
653// pin matches the logic of SkTPin, which is important when NaN is involved. It always returns
654// values in the range lo..hi, and if x is NaN, it returns lo.
655SINT Vec<N,T> pin(const Vec<N,T>& x, const Vec<N,T>& lo, const Vec<N,T>& hi) {
656 return max(lo, min(x, hi));
657}
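// A sketch of the NaN behavior described above (values arbitrary):
//     skvx::Vec<4,float> v = {0.5f, -1.0f, 9.0f, NAN};
//     auto p = pin(v, skvx::Vec<4,float>(0.0f), skvx::Vec<4,float>(1.0f));   // ~> {0.5, 0, 1, 0}
// The NaN lane lands on lo because both comparisons against NaN are false, so min() keeps x
// and then max() keeps lo.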
658
659// Shuffle values from a vector pretty arbitrarily:
660// skvx::Vec<4,float> rgba = {R,G,B,A};
661// shuffle<2,1,0,3> (rgba) ~> {B,G,R,A}
662// shuffle<2,1> (rgba) ~> {B,G}
663// shuffle<2,1,2,1,2,1,2,1>(rgba) ~> {B,G,B,G,B,G,B,G}
664// shuffle<3,3,3,3> (rgba) ~> {A,A,A,A}
665// The only real restriction is that the output also be a legal N=power-of-two skvx::Vec.
666template <int... Ix, int N, typename T>
667SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {
668#if SKVX_USE_SIMD && defined(__clang__)
669 // TODO: can we just always use { x[Ix]... }?
670 return to_vec<sizeof...(Ix),T>(__builtin_shufflevector(to_vext(x), to_vext(x), Ix...));
671#else
672 return { x[Ix]... };
673#endif
674}
675
676// Call map(fn, x) for a vector with fn() applied to each lane of x, { fn(x[0]), fn(x[1]), ... },
677// or map(fn, x,y) for a vector of fn(x[i], y[i]), etc.
678
679template <typename Fn, typename... Args, size_t... I>
680SI auto map(std::index_sequence<I...>,
681 Fn&& fn, const Args&... args) -> skvx::Vec<sizeof...(I), decltype(fn(args[0]...))> {
682 auto lane = [&](size_t i)
683#if defined(__clang__)
684 // CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,
685 // with errors like "control flow integrity check for type 'float (float)
686 // noexcept' failed during indirect function call... note: sqrtf.cfi_jt defined
687 // here". But we can be quite sure fn is the right type: it's all inferred!
688 // So, stifle CFI in this function.
689 __attribute__((no_sanitize("cfi")))
690#endif
691 { return fn(args[static_cast<int>(i)]...); };
692
693 return { lane(I)... };
694}
695
696template <typename Fn, int N, typename T, typename... Rest>
697auto map(Fn&& fn, const Vec<N,T>& first, const Rest&... rest) {
698 // Derive an {0...N-1} index_sequence from the size of the first arg: N lanes in, N lanes out.
699 return map(std::make_index_sequence<N>{}, fn, first,rest...);
700}
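// Illustrative use with a caller-supplied lambda (values arbitrary):
//     skvx::Vec<4,float> v = {1,2,4,8};
//     auto r = map([](float x) { return 1.0f / x; }, v);   // ~> {1, 0.5, 0.25, 0.125}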
701
702SIN Vec<N,float> ceil(const Vec<N,float>& x) { return map( ceilf, x); }
703SIN Vec<N,float> floor(const Vec<N,float>& x) { return map(floorf, x); }
704SIN Vec<N,float> trunc(const Vec<N,float>& x) { return map(truncf, x); }
705SIN Vec<N,float> round(const Vec<N,float>& x) { return map(roundf, x); }
706SIN Vec<N,float> sqrt(const Vec<N,float>& x) { return map( sqrtf, x); }
707SIN Vec<N,float> abs(const Vec<N,float>& x) { return map( fabsf, x); }
708SIN Vec<N,float> fma(const Vec<N,float>& x,
709 const Vec<N,float>& y,
710 const Vec<N,float>& z) {
711 // I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.
712 auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
713 return map(fn, x,y,z);
714}
715
716SI Vec<1,int> lrint(const Vec<1,float>& x) {
717 return (int)lrintf(x.val);
718}
719SIN Vec<N,int> lrint(const Vec<N,float>& x) {
720#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
721 if constexpr (N == 8) {
722 return sk_bit_cast<Vec<N,int>>(_mm256_cvtps_epi32(sk_bit_cast<__m256>(x)));
723 }
724#endif
725#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
726 if constexpr (N == 4) {
727 return sk_bit_cast<Vec<N,int>>(_mm_cvtps_epi32(sk_bit_cast<__m128>(x)));
728 }
729#endif
730#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
731 if constexpr (N == 8) {
732 return sk_bit_cast<Vec<N,int>>(__lasx_xvftint_w_s(sk_bit_cast<__m256>(x)));
733 }
734#endif
735#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
736 if constexpr (N == 4) {
737 return sk_bit_cast<Vec<N,int>>(__lsx_vftint_w_s(sk_bit_cast<__m128>(x)));
738 }
739#endif
740 return join(lrint(x.lo),
741 lrint(x.hi));
742}
743
744SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }
745
746// Converts float to half, rounding to nearest even, and supporting de-normal f16 conversion,
747// and overflow to f16 infinity. Should not be called with NaNs, since it can convert NaN->inf.
748// KEEP IN SYNC with skcms' Half_from_F to ensure that f16 colors are computed consistently in both
749// skcms and skvx.
750SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {
751 assert(all(x == x)); // No NaNs should reach this function
752
753 // Intrinsics for float->half tend to operate on 4 lanes, and the default implementation has
754 // enough instructions that it's better to split and join on 128 bits groups vs.
755 // recursing for each min/max/shift/etc.
756 if constexpr (N > 4) {
757 return join(to_half(x.lo),
758 to_half(x.hi));
759 }
760
761#if SKVX_USE_SIMD && defined(__aarch64__)
762 if constexpr (N == 4) {
763 return sk_bit_cast<Vec<N,uint16_t>>(vcvt_f16_f32(sk_bit_cast<float32x4_t>(x)));
764
765 }
766#endif
767
768#define I(x) sk_bit_cast<Vec<N,int32_t>>(x)
769#define F(x) sk_bit_cast<Vec<N,float>>(x)
770 Vec<N,int32_t> sem = I(x),
771 s = sem & 0x8000'0000,
772 em = min(sem ^ s, 0x4780'0000), // |x| clamped to f16 infinity
773 // F(em)*8192 increases the exponent by 13, which when added back to em will shift
774 // the mantissa bits 13 to the right. We clamp to 1/2 for subnormal values, which
775 // automatically shifts the mantissa to match 2^-14 expected for a subnorm f16.
776 magic = I(max(F(em) * 8192.f, 0.5f)) & (255 << 23),
777 rounded = I((F(em) + F(magic))), // shift mantissa with automatic round-to-even
778 // Subtract 127 for f32 bias, subtract 13 to undo the *8192, subtract 1 to remove
779 // the implicit leading 1., and add 15 to get the f16 biased exponent.
780 exp = ((magic >> 13) - ((127-15+13+1)<<10)), // shift and re-bias exponent
781 f16 = rounded + exp; // use + if 'rounded' rolled over into first exponent bit
782 return cast<uint16_t>((s>>16) | f16);
783#undef I
784#undef F
785}
786
787// Converts from half to float, preserving NaN and +/- infinity.
788// KEEP IN SYNC with skcms' F_from_Half to ensure that f16 colors are computed consistently in both
789// skcms and skvx.
790SIN Vec<N,float> from_half(const Vec<N,uint16_t>& x) {
791 if constexpr (N > 4) {
792 return join(from_half(x.lo),
793 from_half(x.hi));
794 }
795
796#if SKVX_USE_SIMD && defined(__aarch64__)
797 if constexpr (N == 4) {
798 return sk_bit_cast<Vec<N,float>>(vcvt_f32_f16(sk_bit_cast<float16x4_t>(x)));
799 }
800#endif
801
802 Vec<N,int32_t> wide = cast<int32_t>(x),
803 s = wide & 0x8000,
804 em = wide ^ s,
805 inf_or_nan = (em >= (31 << 10)) & (255 << 23), // Expands exponent to fill 8 bits
806 is_norm = em > 0x3ff,
807 // subnormal f16's are 2^-14*0.[m0:9] == 2^-24*[m0:9].0
808 sub = sk_bit_cast<Vec<N,int32_t>>((cast<float>(em) * (1.f/(1<<24)))),
809 norm = ((em<<13) + ((127-15)<<23)), // Shifts mantissa, shifts + re-biases exp
810 finite = (is_norm & norm) | (~is_norm & sub);
811 // If 'x' is f16 +/- infinity, inf_or_nan will be the filled 8-bit exponent but 'norm' will be
812 // all 0s since 'x's mantissa is 0. Thus norm | inf_or_nan becomes f32 infinity. However, if
813 // 'x' is an f16 NaN, some bits of 'norm' will be non-zero, so it stays an f32 NaN after the OR.
814 return sk_bit_cast<Vec<N,float>>((s<<16) | finite | inf_or_nan);
815}
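// Round-trip sketch (values arbitrary; all are exactly representable in f16):
//     skvx::Vec<4,float>    f  = {0.25f, 1.0f, 2.0f, 65504.0f};   // 65504 is the largest finite f16
//     skvx::Vec<4,uint16_t> h  = to_half(f);
//     skvx::Vec<4,float>    f2 = from_half(h);                    // round-trips exactly for these values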
816
817// div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.
818SIN Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
819 return cast<uint8_t>( (x+127)/255 );
820}
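// For illustration, a worked instance of the rounding: (382 + 127)/255 == 1 since 382/255 ~ 1.498
// rounds down, while (383 + 127)/255 == 2 since 383/255 ~ 1.502 rounds up.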
821
822// approx_scale(x,y) approximates div255(cast<uint16_t>(x)*cast<uint16_t>(y)) within a bit,
823// and is always perfect when x or y is 0 or 255.
824SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {
825 // All of (x*y+x)/256, (x*y+y)/256, and (x*y+255)/256 meet the criteria above.
826 // We happen to have historically picked (x*y+x)/256.
827 auto X = cast<uint16_t>(x),
828 Y = cast<uint16_t>(y);
829 return cast<uint8_t>( (X*Y+X)/256 );
830}
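// For illustration, the endpoint cases: approx_scale(255,255) is (255*255 + 255)/256 == 255 exactly,
// and approx_scale(x,0) is (x*0 + x)/256 == 0 for any x <= 255.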
831
832// saturated_add(x,y) sums values and clamps to the maximum value instead of overflowing.
833SINT std::enable_if_t<std::is_unsigned_v<T>, Vec<N,T>> saturated_add(const Vec<N,T>& x,
834 const Vec<N,T>& y) {
835#if SKVX_USE_SIMD && (SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 || defined(SK_ARM_HAS_NEON) || \
836 SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX)
837 // Both SSE and ARM have 16-lane saturated adds, so use intrinsics for those and recurse down
838 // or join up to take advantage.
839 if constexpr (N == 16 && sizeof(T) == 1) {
840 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
841 return sk_bit_cast<Vec<N,T>>(_mm_adds_epu8(sk_bit_cast<__m128i>(x),
842 sk_bit_cast<__m128i>(y)));
843 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
844 return sk_bit_cast<Vec<N,T>>(__lsx_vsadd_bu(sk_bit_cast<__m128i>(x),
845 sk_bit_cast<__m128i>(y)));
846 #else // SK_ARM_HAS_NEON
847 return sk_bit_cast<Vec<N,T>>(vqaddq_u8(sk_bit_cast<uint8x16_t>(x),
848 sk_bit_cast<uint8x16_t>(y)));
849 #endif
850 } else if constexpr (N < 16 && sizeof(T) == 1) {
851 return saturated_add(join(x,x), join(y,y)).lo;
852 } else if constexpr (sizeof(T) == 1) {
853 return join(saturated_add(x.lo, y.lo), saturated_add(x.hi, y.hi));
854 }
855#endif
856 // Otherwise saturate manually
857 auto sum = x + y;
858 return if_then_else(sum < x, Vec<N,T>(std::numeric_limits<T>::max()), sum);
859}
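// Illustrative sketch (values arbitrary):
//     skvx::Vec<4,uint8_t> a = {250, 10, 0, 255},
//                          b = { 10, 10, 0, 255};
//     auto s = saturated_add(a, b);   // ~> {255, 20, 0, 255}: overflowing lanes clamp to 255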
860
861// The ScaledDividerU32 takes a divisor > 1, and creates a function divide(numerator) that
862// calculates numerator / denominator. For this to be rounded properly, numerator should have
863// half added in:
864// divide(numerator + half) == floor(numerator/denominator + 1/2).
865//
866// This gives an answer within +/- 1 from the true value.
867//
868// Derivation of half:
869// numerator/denominator + 1/2 = (numerator + half) / denominator
870// numerator + denominator / 2 = numerator + half
871// half = denominator / 2.
872//
873// Because half is divided by 2, that division must also be rounded.
874// half == denominator / 2 = (denominator + 1) / 2.
875//
876// The divisorFactor is just a scaled value:
877// divisorFactor = (1 / divisor) * 2 ^ 32.
878// The maximum that can be divided and rounded is UINT_MAX - half.
879class ScaledDividerU32 {
880public:
881 explicit ScaledDividerU32(uint32_t divisor)
882 : fDivisorFactor{(uint32_t)(std::round((1.0 / divisor) * (1ull << 32)))}
883 , fHalf{(divisor + 1) >> 1} {
884 assert(divisor > 1);
885 }
886
887 Vec<4, uint32_t> divide(const Vec<4, uint32_t>& numerator) const {
888#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
889 uint64x2_t hi = vmull_n_u32(vget_high_u32(to_vext(numerator)), fDivisorFactor);
890 uint64x2_t lo = vmull_n_u32(vget_low_u32(to_vext(numerator)), fDivisorFactor);
891
892 return to_vec<4, uint32_t>(vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)));
893#else
894 return cast<uint32_t>((cast<uint64_t>(numerator) * fDivisorFactor) >> 32);
895#endif
896 }
897
898 uint32_t half() const { return fHalf; }
899
900private:
901 const uint32_t fDivisorFactor;
902 const uint32_t fHalf;
903};
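// Illustrative use (numbers are hypothetical), dividing by 3 with rounding:
//     ScaledDividerU32 div3(3);                   // half() == (3 + 1) / 2 == 2
//     skvx::Vec<4,uint32_t> n = {0, 1, 2, 300};
//     auto q = div3.divide(n + div3.half());      // ~> {0, 0, 1, 100}, within +/- 1 of the true value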
904
905
906SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,
907 const Vec<N,uint8_t>& y) {
908#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
909 // With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).
910 if constexpr (N == 8) {
911 return to_vec<8,uint16_t>(vmull_u8(to_vext(x), to_vext(y)));
912 } else if constexpr (N < 8) {
913 return mull(join(x,x), join(y,y)).lo;
914 } else { // N > 8
915 return join(mull(x.lo, y.lo), mull(x.hi, y.hi));
916 }
917#else
918 return cast<uint16_t>(x) * cast<uint16_t>(y);
919#endif
920}
921
922SIN Vec<N,uint32_t> mull(const Vec<N,uint16_t>& x,
923 const Vec<N,uint16_t>& y) {
924#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
925 // NEON can do four u16*u16 -> u32 in one instruction, vmull_u16
926 if constexpr (N == 4) {
927 return to_vec<4,uint32_t>(vmull_u16(to_vext(x), to_vext(y)));
928 } else if constexpr (N < 4) {
929 return mull(join(x,x), join(y,y)).lo;
930 } else { // N > 4
931 return join(mull(x.lo, y.lo), mull(x.hi, y.hi));
932 }
933#else
934 return cast<uint32_t>(x) * cast<uint32_t>(y);
935#endif
936}
937
938SIN Vec<N,uint16_t> mulhi(const Vec<N,uint16_t>& x,
939 const Vec<N,uint16_t>& y) {
940#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
941 // Use _mm_mulhi_epu16 for 8xuint16_t and join or split to get there.
942 if constexpr (N == 8) {
943 return sk_bit_cast<Vec<8,uint16_t>>(_mm_mulhi_epu16(sk_bit_cast<__m128i>(x),
944 sk_bit_cast<__m128i>(y)));
945 } else if constexpr (N < 8) {
946 return mulhi(join(x,x), join(y,y)).lo;
947 } else { // N > 8
948 return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
949 }
950#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
951 if constexpr (N == 8) {
952 return sk_bit_cast<Vec<8,uint16_t>>(__lsx_vmuh_hu(sk_bit_cast<__m128i>(x),
953 sk_bit_cast<__m128i>(y)));
954 } else if constexpr (N < 8) {
955 return mulhi(join(x,x), join(y,y)).lo;
956 } else { // N > 8
957 return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
958 }
959#else
960 return skvx::cast<uint16_t>(mull(x, y) >> 16);
961#endif
962}
963
964SINT T dot(const Vec<N, T>& a, const Vec<N, T>& b) {
965 // While dot is a "horizontal" operation like any or all, it needs to remain
966 // in floating point and there aren't really any good SIMD instructions that make it faster.
967 // The constexpr cases remove the for loop in the only cases we realistically call.
968 auto ab = a*b;
969 if constexpr (N == 2) {
970 return ab[0] + ab[1];
971 } else if constexpr (N == 4) {
972 return ab[0] + ab[1] + ab[2] + ab[3];
973 } else {
974 T sum = ab[0];
975 for (int i = 1; i < N; ++i) {
976 sum += ab[i];
977 }
978 return sum;
979 }
980}
981
982SIT T cross(const Vec<2, T>& a, const Vec<2, T>& b) {
983 auto x = a * shuffle<1,0>(b);
984 return x[0] - x[1];
985}
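// Small sketch (values arbitrary):
//     skvx::Vec<2,float> a = {1, 0}, b = {0, 1};
//     float d = dot(a, b);     // 0
//     float c = cross(a, b);   // 1: the signed area of the parallelogram spanned by a and b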
986
987SIN float length(const Vec<N, float>& v) {
988 return std::sqrt(dot(v, v));
989}
990
991SIN double length(const Vec<N, double>& v) {
992 return std::sqrt(dot(v, v));
993}
994
995SIN Vec<N,float> normalize(const Vec<N,float>& v) {
996 return v / length(v);
997}
998
999SIN Vec<N,double> normalize(const Vec<N,double>& v) {
1000 return v / length(v);
1001}
1002
1003SINT bool isfinite(const Vec<N, T>& v) {
1004 // Multiply all values together with 0. If they were all finite, the output is
1005 // 0 (also finite). If any were not, we'll get nan.
1006 return SkIsFinite(dot(v, Vec<N, T>(0)));
1007}
1008
1009// De-interleaving load of 4 vectors.
1010//
1011// WARNING: These are really only supported well on NEON. Consider restructuring your data before
1012// resorting to these methods.
1013SIT void strided_load4(const T* v,
1014 Vec<1,T>& a,
1015 Vec<1,T>& b,
1016 Vec<1,T>& c,
1017 Vec<1,T>& d) {
1018 a.val = v[0];
1019 b.val = v[1];
1020 c.val = v[2];
1021 d.val = v[3];
1022}
1023SINT void strided_load4(const T* v,
1024 Vec<N,T>& a,
1025 Vec<N,T>& b,
1026 Vec<N,T>& c,
1027 Vec<N,T>& d) {
1028 strided_load4(v, a.lo, b.lo, c.lo, d.lo);
1029 strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);
1030}
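// Illustrative sketch (buffer contents are hypothetical): de-interleave packed RGBA bytes
// into per-channel vectors.
//     const uint8_t rgba[16] = { /* r0,g0,b0,a0, r1,g1,b1,a1, ... */ };
//     skvx::Vec<4,uint8_t> r, g, b, a;
//     strided_load4(rgba, r, g, b, a);   // r ~> {r0,r1,r2,r3}, g ~> {g0,g1,g2,g3}, etc.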
1031#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
1032#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
1033SI void strided_load4(const T* v, \
1034 Vec<N,T>& a, \
1035 Vec<N,T>& b, \
1036 Vec<N,T>& c, \
1037 Vec<N,T>& d) { \
1038 auto mat = VLD(v); \
1039 a = sk_bit_cast<Vec<N,T>>(mat.val[0]); \
1040 b = sk_bit_cast<Vec<N,T>>(mat.val[1]); \
1041 c = sk_bit_cast<Vec<N,T>>(mat.val[2]); \
1042 d = sk_bit_cast<Vec<N,T>>(mat.val[3]); \
1043}
1044IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32)
1045IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16)
1046IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8)
1047IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32)
1048IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16)
1049IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8)
1050IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32)
1051IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32)
1052IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16)
1053IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8)
1054IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32)
1055IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16)
1056IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8)
1057IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32)
1058#undef IMPL_LOAD4_TRANSPOSED
1059
1060#elif SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
1061
1062SI void strided_load4(const float* v,
1063 Vec<4,float>& a,
1064 Vec<4,float>& b,
1065 Vec<4,float>& c,
1066 Vec<4,float>& d) {
1067 __m128 a_ = _mm_loadu_ps(v);
1068 __m128 b_ = _mm_loadu_ps(v+4);
1069 __m128 c_ = _mm_loadu_ps(v+8);
1070 __m128 d_ = _mm_loadu_ps(v+12);
1071 _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
1072 a = sk_bit_cast<Vec<4,float>>(a_);
1073 b = sk_bit_cast<Vec<4,float>>(b_);
1074 c = sk_bit_cast<Vec<4,float>>(c_);
1075 d = sk_bit_cast<Vec<4,float>>(d_);
1076}
1077
1078#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1079#define _LSX_TRANSPOSE4(row0, row1, row2, row3) \
1080do { \
1081 __m128i __t0 = __lsx_vilvl_w (row1, row0); \
1082 __m128i __t1 = __lsx_vilvl_w (row3, row2); \
1083 __m128i __t2 = __lsx_vilvh_w (row1, row0); \
1084 __m128i __t3 = __lsx_vilvh_w (row3, row2); \
1085 (row0) = __lsx_vilvl_d (__t1, __t0); \
1086 (row1) = __lsx_vilvh_d (__t1, __t0); \
1087 (row2) = __lsx_vilvl_d (__t3, __t2); \
1088 (row3) = __lsx_vilvh_d (__t3, __t2); \
1089} while (0)
1090
1091SI void strided_load4(const int* v,
1092 Vec<4,int>& a,
1093 Vec<4,int>& b,
1094 Vec<4,int>& c,
1095 Vec<4,int>& d) {
1096 __m128i a_ = __lsx_vld(v, 0);
1097 __m128i b_ = __lsx_vld(v, 16);
1098 __m128i c_ = __lsx_vld(v, 32);
1099 __m128i d_ = __lsx_vld(v, 48);
1100 _LSX_TRANSPOSE4(a_, b_, c_, d_);
1101 a = sk_bit_cast<Vec<4,int>>(a_);
1102 b = sk_bit_cast<Vec<4,int>>(b_);
1103 c = sk_bit_cast<Vec<4,int>>(c_);
1104 d = sk_bit_cast<Vec<4,int>>(d_);
1105}
1106#endif
1107
1108// De-interleaving load of 2 vectors.
1109//
1110// WARNING: These are really only supported well on NEON. Consider restructuring your data before
1111// resorting to these methods.
1112SIT void strided_load2(const T* v, Vec<1,T>& a, Vec<1,T>& b) {
1113 a.val = v[0];
1114 b.val = v[1];
1115}
1116SINT void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) {
1117 strided_load2(v, a.lo, b.lo);
1118 strided_load2(v + 2*(N/2), a.hi, b.hi);
1119}
1120#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
1121#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \
1122SI void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) { \
1123 auto mat = VLD(v); \
1124 a = sk_bit_cast<Vec<N,T>>(mat.val[0]); \
1125 b = sk_bit_cast<Vec<N,T>>(mat.val[1]); \
1126}
1127IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32)
1128IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16)
1129IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8)
1130IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32)
1131IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16)
1132IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8)
1133IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32)
1134IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32)
1135IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16)
1136IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8)
1137IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32)
1138IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16)
1139IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8)
1140IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32)
1141#undef IMPL_LOAD2_TRANSPOSED
1142#endif
1143
1144// Define commonly used aliases
1145using float2 = Vec< 2, float>;
1146using float4 = Vec< 4, float>;
1147using float8 = Vec< 8, float>;
1148
1149using double2 = Vec< 2, double>;
1150using double4 = Vec< 4, double>;
1151using double8 = Vec< 8, double>;
1152
1153using byte2 = Vec< 2, uint8_t>;
1154using byte4 = Vec< 4, uint8_t>;
1155using byte8 = Vec< 8, uint8_t>;
1156using byte16 = Vec<16, uint8_t>;
1157
1158using int2 = Vec< 2, int32_t>;
1159using int4 = Vec< 4, int32_t>;
1160using int8 = Vec< 8, int32_t>;
1161
1162using ushort2 = Vec< 2, uint16_t>;
1163using ushort4 = Vec< 4, uint16_t>;
1164using ushort8 = Vec< 8, uint16_t>;
1165
1166using uint2 = Vec< 2, uint32_t>;
1167using uint4 = Vec< 4, uint32_t>;
1168using uint8 = Vec< 8, uint32_t>;
1169
1170using long2 = Vec< 2, int64_t>;
1171using long4 = Vec< 4, int64_t>;
1172using long8 = Vec< 8, int64_t>;
1173
1174// Use with from_half and to_half to convert between floatX, and use these for storage.
1175using half2 = Vec< 2, uint16_t>;
1176using half4 = Vec< 4, uint16_t>;
1177using half8 = Vec< 8, uint16_t>;
1178
1179} // namespace skvx
1180
1181#undef SINTU
1182#undef SINT
1183#undef SIN
1184#undef SIT
1185#undef SI
1186#undef SKVX_ALWAYS_INLINE
1187#undef SKVX_USE_SIMD
1188
1189#endif//SKVX_DEFINED