Flutter Engine
The Flutter Engine
Classes | Namespaces | Macros | Typedefs | Functions
SkVx.h File Reference
#include "include/private/base/SkFeatures.h"
#include "src/base/SkUtils.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <type_traits>
#include <utility>
#include <immintrin.h>

Go to the source code of this file.

Classes

struct  skvx::Vec< N, T >
 
struct  skvx::Vec< 4, T >
 
struct  skvx::Vec< 2, T >
 
struct  skvx::Vec< 1, T >
 
struct  skvx::Mask< T >
 
struct  skvx::Mask< float >
 
struct  skvx::Mask< double >
 
class  skvx::ScaledDividerU32
 

Namespaces

namespace  skvx
 

Macros

#define SKVX_USE_SIMD   1
 
#define SKVX_ALWAYS_INLINE   __attribute__((always_inline))
 
#define SI   static inline
 
#define SIT   template < typename T> SI
 
#define SIN   template <int N > SI
 
#define SINT   template <int N, typename T> SI
 
#define SINTU
 
#define I(x)   sk_bit_cast<Vec<N,int32_t>>(x)
 
#define F(x)   sk_bit_cast<Vec<N,float>>(x)
 

Typedefs

template<typename T >
using skvx::M = typename Mask< T >::type
 
using skvx::float2 = Vec< 2, float >
 
using skvx::float4 = Vec< 4, float >
 
using skvx::float8 = Vec< 8, float >
 
using skvx::double2 = Vec< 2, double >
 
using skvx::double4 = Vec< 4, double >
 
using skvx::double8 = Vec< 8, double >
 
using skvx::byte2 = Vec< 2, uint8_t >
 
using skvx::byte4 = Vec< 4, uint8_t >
 
using skvx::byte8 = Vec< 8, uint8_t >
 
using skvx::byte16 = Vec< 16, uint8_t >
 
using skvx::int2 = Vec< 2, int32_t >
 
using skvx::int4 = Vec< 4, int32_t >
 
using skvx::int8 = Vec< 8, int32_t >
 
using skvx::ushort2 = Vec< 2, uint16_t >
 
using skvx::ushort4 = Vec< 4, uint16_t >
 
using skvx::ushort8 = Vec< 8, uint16_t >
 
using skvx::uint2 = Vec< 2, uint32_t >
 
using skvx::uint4 = Vec< 4, uint32_t >
 
using skvx::uint8 = Vec< 8, uint32_t >
 
using skvx::long2 = Vec< 2, int64_t >
 
using skvx::long4 = Vec< 4, int64_t >
 
using skvx::long8 = Vec< 8, int64_t >
 
using skvx::half2 = Vec< 2, uint16_t >
 
using skvx::half4 = Vec< 4, uint16_t >
 
using skvx::half8 = Vec< 8, uint16_t >
 

Functions

template<int... Ix, int N, typename T >
SI Vec< sizeof...(Ix), T > skvx::shuffle (const Vec< N, T > &)
 
SINT Vec< 2*N, T > skvx::join (const Vec< N, T > &lo, const Vec< N, T > &hi)
 
SIT Vec< 1, T > skvx::operator+ (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, T > skvx::operator- (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, T > skvx::operator* (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, T > skvx::operator/ (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, T > skvx::operator^ (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, T > skvx::operator& (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, T > skvx::operator| (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, T > skvx::operator! (const Vec< 1, T > &x)
 
SIT Vec< 1, T > skvx::operator- (const Vec< 1, T > &x)
 
SIT Vec< 1, T > skvx::operator~ (const Vec< 1, T > &x)
 
SIT Vec< 1, T > skvx::operator<< (const Vec< 1, T > &x, int k)
 
SIT Vec< 1, T > skvx::operator>> (const Vec< 1, T > &x, int k)
 
SIT Vec< 1, M< T > > skvx::operator== (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, M< T > > skvx::operator!= (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, M< T > > skvx::operator<= (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, M< T > > skvx::operator>= (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, M< T > > skvx::operator< (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SIT Vec< 1, M< T > > skvx::operator> (const Vec< 1, T > &x, const Vec< 1, T > &y)
 
SINT Vec< N, T > skvx::operator+ (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::operator- (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::operator* (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::operator/ (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::operator^ (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::operator& (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::operator| (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::operator! (const Vec< N, T > &x)
 
SINT Vec< N, T > skvx::operator- (const Vec< N, T > &x)
 
SINT Vec< N, T > skvx::operator~ (const Vec< N, T > &x)
 
SINT Vec< N, T > skvx::operator<< (const Vec< N, T > &x, int k)
 
SINT Vec< N, T > skvx::operator>> (const Vec< N, T > &x, int k)
 
SINT Vec< N, M< T > > skvx::operator== (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, M< T > > skvx::operator!= (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, M< T > > skvx::operator<= (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, M< T > > skvx::operator>= (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, M< T > > skvx::operator< (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, M< T > > skvx::operator> (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::operator+ (U x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::operator- (U x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::operator* (U x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::operator/ (U x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::operator^ (U x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::operator& (U x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::operator| (U x, const Vec< N, T > &y)
 
SINTU Vec< N, M< T > > skvx::operator== (U x, const Vec< N, T > &y)
 
SINTU Vec< N, M< T > > skvx::operator!= (U x, const Vec< N, T > &y)
 
SINTU Vec< N, M< T > > skvx::operator<= (U x, const Vec< N, T > &y)
 
SINTU Vec< N, M< T > > skvx::operator>= (U x, const Vec< N, T > &y)
 
SINTU Vec< N, M< T > > skvx::operator< (U x, const Vec< N, T > &y)
 
SINTU Vec< N, M< T > > skvx::operator> (U x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::operator+ (const Vec< N, T > &x, U y)
 
SINTU Vec< N, T > skvx::operator- (const Vec< N, T > &x, U y)
 
SINTU Vec< N, T > skvx::operator* (const Vec< N, T > &x, U y)
 
SINTU Vec< N, T > skvx::operator/ (const Vec< N, T > &x, U y)
 
SINTU Vec< N, T > skvx::operator^ (const Vec< N, T > &x, U y)
 
SINTU Vec< N, T > skvx::operator& (const Vec< N, T > &x, U y)
 
SINTU Vec< N, T > skvx::operator| (const Vec< N, T > &x, U y)
 
SINTU Vec< N, M< T > > skvx::operator== (const Vec< N, T > &x, U y)
 
SINTU Vec< N, M< T > > skvx::operator!= (const Vec< N, T > &x, U y)
 
SINTU Vec< N, M< T > > skvx::operator<= (const Vec< N, T > &x, U y)
 
SINTU Vec< N, M< T > > skvx::operator>= (const Vec< N, T > &x, U y)
 
SINTU Vec< N, M< T > > skvx::operator< (const Vec< N, T > &x, U y)
 
SINTU Vec< N, M< T > > skvx::operator> (const Vec< N, T > &x, U y)
 
SINT Vec< N, T > & skvx::operator+= (Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > & skvx::operator-= (Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > & skvx::operator*= (Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > & skvx::operator/= (Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > & skvx::operator^= (Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > & skvx::operator&= (Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > & skvx::operator|= (Vec< N, T > &x, const Vec< N, T > &y)
 
SINTU Vec< N, T > & skvx::operator+= (Vec< N, T > &x, U y)
 
SINTU Vec< N, T > & skvx::operator-= (Vec< N, T > &x, U y)
 
SINTU Vec< N, T > & skvx::operator*= (Vec< N, T > &x, U y)
 
SINTU Vec< N, T > & skvx::operator/= (Vec< N, T > &x, U y)
 
SINTU Vec< N, T > & skvx::operator^= (Vec< N, T > &x, U y)
 
SINTU Vec< N, T > & skvx::operator&= (Vec< N, T > &x, U y)
 
SINTU Vec< N, T > & skvx::operator|= (Vec< N, T > &x, U y)
 
SINT Vec< N, T > & skvx::operator<<= (Vec< N, T > &x, int bits)
 
SINT Vec< N, T > & skvx::operator>>= (Vec< N, T > &x, int bits)
 
SINT Vec< N, T > skvx::naive_if_then_else (const Vec< N, M< T > > &cond, const Vec< N, T > &t, const Vec< N, T > &e)
 
SIT Vec< 1, T > skvx::if_then_else (const Vec< 1, M< T > > &cond, const Vec< 1, T > &t, const Vec< 1, T > &e)
 
SINT Vec< N, T > skvx::if_then_else (const Vec< N, M< T > > &cond, const Vec< N, T > &t, const Vec< N, T > &e)
 
SIT bool skvx::any (const Vec< 1, T > &x)
 
SINT bool skvx::any (const Vec< N, T > &x)
 
SIT bool skvx::all (const Vec< 1, T > &x)
 
SINT bool skvx::all (const Vec< N, T > &x)
 
template<typename D , typename S >
SI Vec< 1, D > skvx::cast (const Vec< 1, S > &src)
 
template<typename D , int N, typename S >
SI Vec< N, D > skvx::cast (const Vec< N, S > &src)
 
SIT T skvx::min (const Vec< 1, T > &x)
 
SIT T skvx::max (const Vec< 1, T > &x)
 
SINT T skvx::min (const Vec< N, T > &x)
 
SINT T skvx::max (const Vec< N, T > &x)
 
SINT Vec< N, T > skvx::min (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::max (const Vec< N, T > &x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::min (const Vec< N, T > &x, U y)
 
SINTU Vec< N, T > skvx::max (const Vec< N, T > &x, U y)
 
SINTU Vec< N, T > skvx::min (U x, const Vec< N, T > &y)
 
SINTU Vec< N, T > skvx::max (U x, const Vec< N, T > &y)
 
SINT Vec< N, T > skvx::pin (const Vec< N, T > &x, const Vec< N, T > &lo, const Vec< N, T > &hi)
 
template<typename Fn , typename... Args, size_t... I>
SI auto skvx::map (std::index_sequence< I... >, Fn &&fn, const Args &... args) -> skvx::Vec< sizeof...(I), decltype(fn(args[0]...))>
 
template<typename Fn , int N, typename T , typename... Rest>
auto skvx::map (Fn &&fn, const Vec< N, T > &first, const Rest &... rest)
 
SIN Vec< N, float > skvx::ceil (const Vec< N, float > &x)
 
SIN Vec< N, float > skvx::floor (const Vec< N, float > &x)
 
SIN Vec< N, float > skvx::trunc (const Vec< N, float > &x)
 
SIN Vec< N, float > skvx::round (const Vec< N, float > &x)
 
SIN Vec< N, float > skvx::sqrt (const Vec< N, float > &x)
 
SIN Vec< N, float > skvx::abs (const Vec< N, float > &x)
 
SIN Vec< N, float > skvx::fma (const Vec< N, float > &x, const Vec< N, float > &y, const Vec< N, float > &z)
 
SI Vec< 1, int > skvx::lrint (const Vec< 1, float > &x)
 
SIN Vec< N, int > skvx::lrint (const Vec< N, float > &x)
 
SIN Vec< N, float > skvx::fract (const Vec< N, float > &x)
 
SIN Vec< N, uint16_t > skvx::to_half (const Vec< N, float > &x)
 
SIN Vec< N, float > skvx::from_half (const Vec< N, uint16_t > &x)
 
SIN Vec< N, uint8_t > skvx::div255 (const Vec< N, uint16_t > &x)
 
SIN Vec< N, uint8_t > skvx::approx_scale (const Vec< N, uint8_t > &x, const Vec< N, uint8_t > &y)
 
SINT std::enable_if_t< std::is_unsigned_v< T >, Vec< N, T > > skvx::saturated_add (const Vec< N, T > &x, const Vec< N, T > &y)
 
SIN Vec< N, uint16_t > skvx::mull (const Vec< N, uint8_t > &x, const Vec< N, uint8_t > &y)
 
SIN Vec< N, uint32_t > skvx::mull (const Vec< N, uint16_t > &x, const Vec< N, uint16_t > &y)
 
SIN Vec< N, uint16_t > skvx::mulhi (const Vec< N, uint16_t > &x, const Vec< N, uint16_t > &y)
 
SINT T skvx::dot (const Vec< N, T > &a, const Vec< N, T > &b)
 
SIT T skvx::cross (const Vec< 2, T > &a, const Vec< 2, T > &b)
 
SIN float skvx::length (const Vec< N, float > &v)
 
SIN double skvx::length (const Vec< N, double > &v)
 
SIN Vec< N, float > skvx::normalize (const Vec< N, float > &v)
 
SIN Vec< N, double > skvx::normalize (const Vec< N, double > &v)
 
SINT bool skvx::isfinite (const Vec< N, T > &v)
 
SIT void skvx::strided_load4 (const T *v, Vec< 1, T > &a, Vec< 1, T > &b, Vec< 1, T > &c, Vec< 1, T > &d)
 
SINT void skvx::strided_load4 (const T *v, Vec< N, T > &a, Vec< N, T > &b, Vec< N, T > &c, Vec< N, T > &d)
 
SI void skvx::strided_load4 (const float *v, Vec< 4, float > &a, Vec< 4, float > &b, Vec< 4, float > &c, Vec< 4, float > &d)
 
SIT void skvx::strided_load2 (const T *v, Vec< 1, T > &a, Vec< 1, T > &b)
 
SINT void skvx::strided_load2 (const T *v, Vec< N, T > &a, Vec< N, T > &b)
 

Macro Definition Documentation

◆ F

#define F(x)   sk_bit_cast<Vec<N,float>>(x)

◆ I

#define I(x)   sk_bit_cast<Vec<N,int32_t>>(x)

◆ SI

#define SI   static inline

Definition at line 66 of file SkVx.h.

◆ SIN

#define SIN   template <int N > SI

Definition at line 68 of file SkVx.h.

◆ SINT

#define SINT   template <int N, typename T> SI

Definition at line 69 of file SkVx.h.

◆ SINTU

#define SINTU
Value:
template <int N, typename T, typename U, \
          typename=std::enable_if_t<std::is_convertible<U,T>::value>> SI

Definition at line 70 of file SkVx.h.
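Putting the helper macros together: a declaration written with SINTU expands to an ordinary constrained function template. The expansion below is illustrative, reconstructed from the macro values above as it would appear inside namespace skvx:

// What `SINTU Vec<N,T> operator+(U x, const Vec<N,T>& y)` expands to:
template <int N, typename T, typename U,
          typename=std::enable_if_t<std::is_convertible<U,T>::value>>
static inline Vec<N,T> operator+(U x, const Vec<N,T>& y);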

72 {
73
74template <int N, typename T>
75struct alignas(N*sizeof(T)) Vec;
76
77template <int... Ix, int N, typename T>
78SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>&);
79
80// All Vec have the same simple memory layout, the same as `T vec[N]`.
81template <int N, typename T>
82struct alignas(N*sizeof(T)) Vec {
83 static_assert((N & (N-1)) == 0, "N must be a power of 2.");
84 static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");
85
86 // Methods belong here in the class declaration of Vec only if:
87 // - they must be here, like constructors or operator[];
88 // - they'll definitely never want a specialized implementation.
89 // Other operations on Vec should be defined outside the type.
90
91 SKVX_ALWAYS_INLINE Vec() = default;
92 SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}
93
94 // NOTE: Vec{x} produces x000..., whereas Vec(x) produces xxxx.... since this constructor fills
95 // unspecified lanes with 0s, whereas the single T constructor fills all lanes with the value.
96 SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
97 T vals[N] = {0};
98 assert(xs.size() <= (size_t)N);
99 memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)N)*sizeof(T));
100
101 this->lo = Vec<N/2,T>::Load(vals + 0);
102 this->hi = Vec<N/2,T>::Load(vals + N/2);
103 }
104
105 SKVX_ALWAYS_INLINE T operator[](int i) const { return i<N/2 ? this->lo[i] : this->hi[i-N/2]; }
106 SKVX_ALWAYS_INLINE T& operator[](int i) { return i<N/2 ? this->lo[i] : this->hi[i-N/2]; }
107
108 SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
109 return sk_unaligned_load<Vec>(ptr);
110 }
111 SKVX_ALWAYS_INLINE void store(void* ptr) const {
112 // Note: Calling sk_unaligned_store produces slightly worse code here, for some reason
113 memcpy(ptr, this, sizeof(Vec));
114 }
115
116 Vec<N/2,T> lo, hi;
117};
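A minimal usage sketch of the generic Vec. The include path assumes Skia's in-tree layout, and the demo function name is illustrative; later sketches on this page make the same assumptions.

#include "src/base/SkVx.h"

void construction_demo() {
    skvx::Vec<4,float> splat(2.0f);    // Vec(x): all lanes get the value -> {2,2,2,2}
    skvx::Vec<4,float> partial{2.0f};  // Vec{x}: unspecified lanes are zeroed -> {2,0,0,0}

    float buf[4] = {1, 2, 3, 4};
    auto v = skvx::Vec<4,float>::Load(buf);  // Load() tolerates unaligned pointers
    v[0] = 5.0f;                             // operator[] recurses through lo/hi
    float out[4];
    v.store(out);                            // out is now {5, 2, 3, 4}
}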
118
119// We have specializations for N == 1 (the base-case), as well as 2 and 4, where we add helpful
120// constructors and swizzle accessors.
121template <typename T>
122struct alignas(4*sizeof(T)) Vec<4,T> {
123 static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");
124
125 SKVX_ALWAYS_INLINE Vec() = default;
126 SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}
127 SKVX_ALWAYS_INLINE Vec(T x, T y, T z, T w) : lo(x,y), hi(z,w) {}
128 SKVX_ALWAYS_INLINE Vec(Vec<2,T> xy, T z, T w) : lo(xy), hi(z,w) {}
129 SKVX_ALWAYS_INLINE Vec(T x, T y, Vec<2,T> zw) : lo(x,y), hi(zw) {}
130 SKVX_ALWAYS_INLINE Vec(Vec<2,T> xy, Vec<2,T> zw) : lo(xy), hi(zw) {}
131
132 SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
133 T vals[4] = {0};
134 assert(xs.size() <= (size_t)4);
135 memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)4)*sizeof(T));
136
137 this->lo = Vec<2,T>::Load(vals + 0);
138 this->hi = Vec<2,T>::Load(vals + 2);
139 }
140
141 SKVX_ALWAYS_INLINE T operator[](int i) const { return i<2 ? this->lo[i] : this->hi[i-2]; }
142 SKVX_ALWAYS_INLINE T& operator[](int i) { return i<2 ? this->lo[i] : this->hi[i-2]; }
143
144 SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
145 return sk_unaligned_load<Vec>(ptr);
146 }
147 SKVX_ALWAYS_INLINE void store(void* ptr) const {
148 memcpy(ptr, this, sizeof(Vec));
149 }
150
151 SKVX_ALWAYS_INLINE Vec<2,T>& xy() { return lo; }
152 SKVX_ALWAYS_INLINE Vec<2,T>& zw() { return hi; }
153 SKVX_ALWAYS_INLINE T& x() { return lo.lo.val; }
154 SKVX_ALWAYS_INLINE T& y() { return lo.hi.val; }
155 SKVX_ALWAYS_INLINE T& z() { return hi.lo.val; }
156 SKVX_ALWAYS_INLINE T& w() { return hi.hi.val; }
157
158 SKVX_ALWAYS_INLINE Vec<2,T> xy() const { return lo; }
159 SKVX_ALWAYS_INLINE Vec<2,T> zw() const { return hi; }
160 SKVX_ALWAYS_INLINE T x() const { return lo.lo.val; }
161 SKVX_ALWAYS_INLINE T y() const { return lo.hi.val; }
162 SKVX_ALWAYS_INLINE T z() const { return hi.lo.val; }
163 SKVX_ALWAYS_INLINE T w() const { return hi.hi.val; }
164
165 // Exchange-based swizzles. These should take 1 cycle on NEON and 3 (pipelined) cycles on SSE.
166 SKVX_ALWAYS_INLINE Vec<4,T> yxwz() const { return shuffle<1,0,3,2>(*this); }
167 SKVX_ALWAYS_INLINE Vec<4,T> zwxy() const { return shuffle<2,3,0,1>(*this); }
168
169 Vec<2,T> lo, hi;
170};
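A short sketch of the Vec<4,T> accessors (function name illustrative):

#include "src/base/SkVx.h"

void swizzle_demo() {
    skvx::Vec<4,float> rgba = {0.25f, 0.5f, 0.75f, 1.0f};
    float r = rgba.x();                     // named lane reads
    rgba.w() = 0.5f;                        // mutable accessors return references
    skvx::Vec<2,float> xy   = rgba.xy();    // low half as a Vec<2,T>
    skvx::Vec<4,float> zwxy = rgba.zwxy();  // half-swap via shuffle<2,3,0,1> -> {0.75, 0.5, 0.25, 0.5}
}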
171
172template <typename T>
173struct alignas(2*sizeof(T)) Vec<2,T> {
174 static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");
175
176 SKVX_ALWAYS_INLINE Vec() = default;
177 SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}
178 SKVX_ALWAYS_INLINE Vec(T x, T y) : lo(x), hi(y) {}
179
180 SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
181 T vals[2] = {0};
182 assert(xs.size() <= (size_t)2);
183 memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)2)*sizeof(T));
184
185 this->lo = Vec<1,T>::Load(vals + 0);
186 this->hi = Vec<1,T>::Load(vals + 1);
187 }
188
189 SKVX_ALWAYS_INLINE T operator[](int i) const { return i<1 ? this->lo[i] : this->hi[i-1]; }
190 SKVX_ALWAYS_INLINE T& operator[](int i) { return i<1 ? this->lo[i] : this->hi[i-1]; }
191
192 SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
193 return sk_unaligned_load<Vec>(ptr);
194 }
195 SKVX_ALWAYS_INLINE void store(void* ptr) const {
196 memcpy(ptr, this, sizeof(Vec));
197 }
198
199 SKVX_ALWAYS_INLINE T& x() { return lo.val; }
200 SKVX_ALWAYS_INLINE T& y() { return hi.val; }
201
202 SKVX_ALWAYS_INLINE T x() const { return lo.val; }
203 SKVX_ALWAYS_INLINE T y() const { return hi.val; }
204
205 // This exchange-based swizzle should take 1 cycle on NEON and 3 (pipelined) cycles on SSE.
206 SKVX_ALWAYS_INLINE Vec<2,T> yx() const { return shuffle<1,0>(*this); }
207 SKVX_ALWAYS_INLINE Vec<4,T> xyxy() const { return Vec<4,T>(*this, *this); }
208
209 Vec<1,T> lo, hi;
210};
211
212template <typename T>
213struct Vec<1,T> {
214 T val = {};
215
216 SKVX_ALWAYS_INLINE Vec() = default;
217 SKVX_ALWAYS_INLINE Vec(T s) : val(s) {}
218
219 SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) : val(xs.size() ? *xs.begin() : 0) {
220 assert(xs.size() <= (size_t)1);
221 }
222
223 SKVX_ALWAYS_INLINE T operator[](int i) const { assert(i == 0); return val; }
224 SKVX_ALWAYS_INLINE T& operator[](int i) { assert(i == 0); return val; }
225
226 SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
227 return sk_unaligned_load<Vec>(ptr);
228 }
229 SKVX_ALWAYS_INLINE void store(void* ptr) const {
230 memcpy(ptr, this, sizeof(Vec));
231 }
232};
233
234// Translate from a value type T to its corresponding Mask, the result of a comparison.
235template <typename T> struct Mask { using type = T; };
236template <> struct Mask<float > { using type = int32_t; };
237template <> struct Mask<double> { using type = int64_t; };
238template <typename T> using M = typename Mask<T>::type;
239
240// Join two Vec<N,T> into one Vec<2N,T>.
241SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {
242 Vec<2*N,T> v;
243 v.lo = lo;
244 v.hi = hi;
245 return v;
246}
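join() is the building block for the lo/hi recursion used throughout the header; a tiny sketch:

#include "src/base/SkVx.h"

void join_demo() {
    skvx::int2 lo = {1, 2},
               hi = {3, 4};
    skvx::int4 v = skvx::join(lo, hi);   // {1, 2, 3, 4}
}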
247
248// We have three strategies for implementing Vec operations:
249// 1) lean on Clang/GCC vector extensions when available;
250// 2) use map() to apply a scalar function lane-wise;
251// 3) recurse on lo/hi to scalar portable implementations.
252// We can slot in platform-specific implementations as overloads for particular Vec<N,T>,
253// or often integrate them directly into the recursion of style 3), allowing fine control.
254
255#if SKVX_USE_SIMD && (defined(__clang__) || defined(__GNUC__))
256
257 // VExt<N,T> types have the same size as Vec<N,T> and support most operations directly.
258 #if defined(__clang__)
259 template <int N, typename T>
260 using VExt = T __attribute__((ext_vector_type(N)));
261
262 #elif defined(__GNUC__)
263 template <int N, typename T>
264 struct VExtHelper {
265 typedef T __attribute__((vector_size(N*sizeof(T)))) type;
266 };
267
268 template <int N, typename T>
269 using VExt = typename VExtHelper<N,T>::type;
270
271 // For some reason some (new!) versions of GCC cannot seem to deduce N in the generic
272 // to_vec<N,T>() below for N=4 and T=float. This workaround seems to help...
273 SI Vec<4,float> to_vec(VExt<4,float> v) { return sk_bit_cast<Vec<4,float>>(v); }
274 #endif
275
276 SINT VExt<N,T> to_vext(const Vec<N,T>& v) { return sk_bit_cast<VExt<N,T>>(v); }
277 SINT Vec <N,T> to_vec(const VExt<N,T>& v) { return sk_bit_cast<Vec <N,T>>(v); }
278
279 SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
280 return to_vec<N,T>(to_vext(x) + to_vext(y));
281 }
282 SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
283 return to_vec<N,T>(to_vext(x) - to_vext(y));
284 }
285 SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
286 return to_vec<N,T>(to_vext(x) * to_vext(y));
287 }
288 SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
289 return to_vec<N,T>(to_vext(x) / to_vext(y));
290 }
291
292 SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
293 return to_vec<N,T>(to_vext(x) ^ to_vext(y));
294 }
295 SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
296 return to_vec<N,T>(to_vext(x) & to_vext(y));
297 }
298 SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
299 return to_vec<N,T>(to_vext(x) | to_vext(y));
300 }
301
302 SINT Vec<N,T> operator!(const Vec<N,T>& x) { return to_vec<N,T>(!to_vext(x)); }
303 SINT Vec<N,T> operator-(const Vec<N,T>& x) { return to_vec<N,T>(-to_vext(x)); }
304 SINT Vec<N,T> operator~(const Vec<N,T>& x) { return to_vec<N,T>(~to_vext(x)); }
305
306 SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) << k); }
307 SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) >> k); }
308
309 SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
310 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) == to_vext(y));
311 }
312 SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
313 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) != to_vext(y));
314 }
315 SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
316 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) <= to_vext(y));
317 }
318 SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
319 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) >= to_vext(y));
320 }
321 SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
322 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) < to_vext(y));
323 }
324 SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
325 return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) > to_vext(y));
326 }
327
328#else
329
330 // Either SKNX_NO_SIMD is defined, or Clang/GCC vector extensions are not available.
331 // We'll implement things portably with N==1 scalar implementations and recursion onto them.
332
333 // N == 1 scalar implementations.
334 SIT Vec<1,T> operator+(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val + y.val; }
335 SIT Vec<1,T> operator-(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val - y.val; }
336 SIT Vec<1,T> operator*(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val * y.val; }
337 SIT Vec<1,T> operator/(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val / y.val; }
338
339 SIT Vec<1,T> operator^(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val ^ y.val; }
340 SIT Vec<1,T> operator&(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val & y.val; }
341 SIT Vec<1,T> operator|(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val | y.val; }
342
343 SIT Vec<1,T> operator!(const Vec<1,T>& x) { return !x.val; }
344 SIT Vec<1,T> operator-(const Vec<1,T>& x) { return -x.val; }
345 SIT Vec<1,T> operator~(const Vec<1,T>& x) { return ~x.val; }
346
347 SIT Vec<1,T> operator<<(const Vec<1,T>& x, int k) { return x.val << k; }
348 SIT Vec<1,T> operator>>(const Vec<1,T>& x, int k) { return x.val >> k; }
349
350 SIT Vec<1,M<T>> operator==(const Vec<1,T>& x, const Vec<1,T>& y) {
351 return x.val == y.val ? ~0 : 0;
352 }
353 SIT Vec<1,M<T>> operator!=(const Vec<1,T>& x, const Vec<1,T>& y) {
354 return x.val != y.val ? ~0 : 0;
355 }
356 SIT Vec<1,M<T>> operator<=(const Vec<1,T>& x, const Vec<1,T>& y) {
357 return x.val <= y.val ? ~0 : 0;
358 }
359 SIT Vec<1,M<T>> operator>=(const Vec<1,T>& x, const Vec<1,T>& y) {
360 return x.val >= y.val ? ~0 : 0;
361 }
362 SIT Vec<1,M<T>> operator< (const Vec<1,T>& x, const Vec<1,T>& y) {
363 return x.val < y.val ? ~0 : 0;
364 }
365 SIT Vec<1,M<T>> operator> (const Vec<1,T>& x, const Vec<1,T>& y) {
366 return x.val > y.val ? ~0 : 0;
367 }
368
369 // Recurse on lo/hi down to N==1 scalar implementations.
370 SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
371 return join(x.lo + y.lo, x.hi + y.hi);
372 }
373 SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
374 return join(x.lo - y.lo, x.hi - y.hi);
375 }
376 SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
377 return join(x.lo * y.lo, x.hi * y.hi);
378 }
379 SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
380 return join(x.lo / y.lo, x.hi / y.hi);
381 }
382
383 SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
384 return join(x.lo ^ y.lo, x.hi ^ y.hi);
385 }
386 SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
387 return join(x.lo & y.lo, x.hi & y.hi);
388 }
389 SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
390 return join(x.lo | y.lo, x.hi | y.hi);
391 }
392
393 SINT Vec<N,T> operator!(const Vec<N,T>& x) { return join(!x.lo, !x.hi); }
394 SINT Vec<N,T> operator-(const Vec<N,T>& x) { return join(-x.lo, -x.hi); }
395 SINT Vec<N,T> operator~(const Vec<N,T>& x) { return join(~x.lo, ~x.hi); }
396
397 SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return join(x.lo << k, x.hi << k); }
398 SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return join(x.lo >> k, x.hi >> k); }
399
400 SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
401 return join(x.lo == y.lo, x.hi == y.hi);
402 }
403 SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
404 return join(x.lo != y.lo, x.hi != y.hi);
405 }
406 SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
407 return join(x.lo <= y.lo, x.hi <= y.hi);
408 }
409 SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
410 return join(x.lo >= y.lo, x.hi >= y.hi);
411 }
412 SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
413 return join(x.lo < y.lo, x.hi < y.hi);
414 }
415 SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
416 return join(x.lo > y.lo, x.hi > y.hi);
417 }
418#endif
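Whichever strategy is compiled in, comparisons return lane masks of the matching Mask type rather than bools. A sketch (function name illustrative):

#include "src/base/SkVx.h"

void mask_demo() {
    skvx::float4 a = {1, 2, 3, 4},
                 b = {4, 3, 2, 1};
    skvx::int4 m = (a < b);   // Mask<float>::type is int32_t; lanes are all-1s or all-0s
    // m is {-1, -1, 0, 0}
}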
419
420// Scalar/vector operations splat the scalar to a vector.
421SINTU Vec<N,T> operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) + y; }
422SINTU Vec<N,T> operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) - y; }
423SINTU Vec<N,T> operator* (U x, const Vec<N,T>& y) { return Vec<N,T>(x) * y; }
424SINTU Vec<N,T> operator/ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) / y; }
425SINTU Vec<N,T> operator^ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) ^ y; }
426SINTU Vec<N,T> operator& (U x, const Vec<N,T>& y) { return Vec<N,T>(x) & y; }
427SINTU Vec<N,T> operator| (U x, const Vec<N,T>& y) { return Vec<N,T>(x) | y; }
428SINTU Vec<N,M<T>> operator==(U x, const Vec<N,T>& y) { return Vec<N,T>(x) == y; }
429SINTU Vec<N,M<T>> operator!=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) != y; }
430SINTU Vec<N,M<T>> operator<=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) <= y; }
431SINTU Vec<N,M<T>> operator>=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) >= y; }
432SINTU Vec<N,M<T>> operator< (U x, const Vec<N,T>& y) { return Vec<N,T>(x) < y; }
433SINTU Vec<N,M<T>> operator> (U x, const Vec<N,T>& y) { return Vec<N,T>(x) > y; }
434
435SINTU Vec<N,T> operator+ (const Vec<N,T>& x, U y) { return x + Vec<N,T>(y); }
436SINTU Vec<N,T> operator- (const Vec<N,T>& x, U y) { return x - Vec<N,T>(y); }
437SINTU Vec<N,T> operator* (const Vec<N,T>& x, U y) { return x * Vec<N,T>(y); }
438SINTU Vec<N,T> operator/ (const Vec<N,T>& x, U y) { return x / Vec<N,T>(y); }
439SINTU Vec<N,T> operator^ (const Vec<N,T>& x, U y) { return x ^ Vec<N,T>(y); }
440SINTU Vec<N,T> operator& (const Vec<N,T>& x, U y) { return x & Vec<N,T>(y); }
441SINTU Vec<N,T> operator| (const Vec<N,T>& x, U y) { return x | Vec<N,T>(y); }
442SINTU Vec<N,M<T>> operator==(const Vec<N,T>& x, U y) { return x == Vec<N,T>(y); }
443SINTU Vec<N,M<T>> operator!=(const Vec<N,T>& x, U y) { return x != Vec<N,T>(y); }
444SINTU Vec<N,M<T>> operator<=(const Vec<N,T>& x, U y) { return x <= Vec<N,T>(y); }
445SINTU Vec<N,M<T>> operator>=(const Vec<N,T>& x, U y) { return x >= Vec<N,T>(y); }
446SINTU Vec<N,M<T>> operator< (const Vec<N,T>& x, U y) { return x < Vec<N,T>(y); }
447SINTU Vec<N,M<T>> operator> (const Vec<N,T>& x, U y) { return x > Vec<N,T>(y); }
448
449SINT Vec<N,T>& operator+=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x + y); }
450SINT Vec<N,T>& operator-=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x - y); }
451SINT Vec<N,T>& operator*=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x * y); }
452SINT Vec<N,T>& operator/=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x / y); }
453SINT Vec<N,T>& operator^=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x ^ y); }
454SINT Vec<N,T>& operator&=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x & y); }
455SINT Vec<N,T>& operator|=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x | y); }
456
457SINTU Vec<N,T>& operator+=(Vec<N,T>& x, U y) { return (x = x + Vec<N,T>(y)); }
458SINTU Vec<N,T>& operator-=(Vec<N,T>& x, U y) { return (x = x - Vec<N,T>(y)); }
459SINTU Vec<N,T>& operator*=(Vec<N,T>& x, U y) { return (x = x * Vec<N,T>(y)); }
460SINTU Vec<N,T>& operator/=(Vec<N,T>& x, U y) { return (x = x / Vec<N,T>(y)); }
461SINTU Vec<N,T>& operator^=(Vec<N,T>& x, U y) { return (x = x ^ Vec<N,T>(y)); }
462SINTU Vec<N,T>& operator&=(Vec<N,T>& x, U y) { return (x = x & Vec<N,T>(y)); }
463SINTU Vec<N,T>& operator|=(Vec<N,T>& x, U y) { return (x = x | Vec<N,T>(y)); }
464
465SINT Vec<N,T>& operator<<=(Vec<N,T>& x, int bits) { return (x = x << bits); }
466SINT Vec<N,T>& operator>>=(Vec<N,T>& x, int bits) { return (x = x >> bits); }
467
468// Some operations we want are not expressible with Clang/GCC vector extensions.
469
470// Clang can reason about naive_if_then_else() and optimize through it better
471// than if_then_else(), so it's sometimes useful to call it directly when we
472// think an entire expression should optimize away, e.g. min()/max().
473SINT Vec<N,T> naive_if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
474 return sk_bit_cast<Vec<N,T>>(( cond & sk_bit_cast<Vec<N, M<T>>>(t)) |
475 (~cond & sk_bit_cast<Vec<N, M<T>>>(e)) );
476}
477
478SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<1,T>& e) {
479 // In practice this scalar implementation is unlikely to be used. See next if_then_else().
480 return sk_bit_cast<Vec<1,T>>(( cond & sk_bit_cast<Vec<1, M<T>>>(t)) |
481 (~cond & sk_bit_cast<Vec<1, M<T>>>(e)) );
482}
483SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
484 // Specializations inline here so they can generalize what types they apply to.
485#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
486 if constexpr (N*sizeof(T) == 32) {
487 return sk_bit_cast<Vec<N,T>>(_mm256_blendv_epi8(sk_bit_cast<__m256i>(e),
488 sk_bit_cast<__m256i>(t),
489 sk_bit_cast<__m256i>(cond)));
490 }
491#endif
492#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
493 if constexpr (N*sizeof(T) == 16) {
494 return sk_bit_cast<Vec<N,T>>(_mm_blendv_epi8(sk_bit_cast<__m128i>(e),
495 sk_bit_cast<__m128i>(t),
496 sk_bit_cast<__m128i>(cond)));
497 }
498#endif
499#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
500 if constexpr (N*sizeof(T) == 16) {
501 return sk_bit_cast<Vec<N,T>>(vbslq_u8(sk_bit_cast<uint8x16_t>(cond),
502 sk_bit_cast<uint8x16_t>(t),
503 sk_bit_cast<uint8x16_t>(e)));
504 }
505#endif
506#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
507 if constexpr (N*sizeof(T) == 32) {
508 return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
509 sk_bit_cast<__m256i>(t),
510 sk_bit_cast<__m256i>(cond)));
511 }
512#endif
513#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
514 if constexpr (N*sizeof(T) == 16) {
515 return sk_bit_cast<Vec<N,T>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
516 sk_bit_cast<__m128i>(t),
517 sk_bit_cast<__m128i>(cond)));
518 }
519#endif
520 // Recurse for large vectors to try to hit the specializations above.
521 if constexpr (N*sizeof(T) > 16) {
522 return join(if_then_else(cond.lo, t.lo, e.lo),
523 if_then_else(cond.hi, t.hi, e.hi));
524 }
525 // This default can lead to better code than recursing onto scalars.
526 return naive_if_then_else(cond, t, e);
527}
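A lane-select sketch using if_then_else() (names illustrative):

#include "src/base/SkVx.h"

void select_demo() {
    skvx::float4 x = {-1.0f, 2.0f, -3.0f, 4.0f};
    // Zero out negative lanes: where (x < 0) is true, take the splatted 0, else x.
    skvx::float4 r = skvx::if_then_else(x < 0.0f, skvx::float4(0.0f), x);
    // r is {0, 2, 0, 4}
}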
528
529SIT bool any(const Vec<1,T>& x) { return x.val != 0; }
530SINT bool any(const Vec<N,T>& x) {
531 // For any(), the _mm_testz intrinsics are correct and don't require comparing 'x' to 0, so it's
532 // lower latency compared to _mm_movemask + _mm_cmpneq on plain SSE.
533#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
534 if constexpr (N*sizeof(T) == 32) {
535 return !_mm256_testz_si256(sk_bit_cast<__m256i>(x), _mm256_set1_epi32(-1));
536 }
537#endif
538#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
539 if constexpr (N*sizeof(T) == 16) {
540 return !_mm_testz_si128(sk_bit_cast<__m128i>(x), _mm_set1_epi32(-1));
541 }
542#endif
543#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
544 if constexpr (N*sizeof(T) == 16) {
545 // On SSE, movemask checks only the MSB in each lane, which is fine if the lanes were set
546 // directly from a comparison op (which sets all bits to 1 when true), but skvx::Vec<>
547 // treats any non-zero value as true, so we have to compare 'x' to 0 before calling movemask
548 return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) != 0b0000;
549 }
550#endif
551#if SKVX_USE_SIMD && defined(__aarch64__)
552 // On 64-bit NEON, take the max across lanes, which will be non-zero if any lane was true.
553 // The specific lane-size doesn't really matter in this case since it's really any set bit
554 // that we're looking for.
555 if constexpr (N*sizeof(T) == 8 ) { return vmaxv_u8 (sk_bit_cast<uint8x8_t> (x)) > 0; }
556 if constexpr (N*sizeof(T) == 16) { return vmaxvq_u8(sk_bit_cast<uint8x16_t>(x)) > 0; }
557#endif
558#if SKVX_USE_SIMD && defined(__wasm_simd128__)
559 if constexpr (N == 4 && sizeof(T) == 4) {
560 return wasm_i32x4_any_true(sk_bit_cast<VExt<4,int>>(x));
561 }
562#endif
563#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
564 if constexpr (N*sizeof(T) == 32) {
565 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
566 sk_bit_cast<__m256i>(x)));
567 return (retv[0] | retv[4]) != 0b0000;
568 }
569#endif
570#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
571 if constexpr (N*sizeof(T) == 16) {
572 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
573 sk_bit_cast<__m128i>(x)));
574 return retv[0] != 0b0000;
575 }
576#endif
577 return any(x.lo)
578 || any(x.hi);
579}
580
581SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
582SINT bool all(const Vec<N,T>& x) {
583// Unlike any(), we have to respect the lane layout, or we'll miss cases where a
584// true lane has a mix of 0 and 1 bits.
585#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
586 // Unfortunately, the _mm_testc intrinsics don't let us avoid the comparison to 0 for all()'s
587 // correctness, so always just use the plain SSE version.
588 if constexpr (N == 4 && sizeof(T) == 4) {
589 return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) == 0b1111;
590 }
591#endif
592#if SKVX_USE_SIMD && defined(__aarch64__)
593 // On 64-bit NEON, take the min across the lanes, which will be non-zero if all lanes are != 0.
594 if constexpr (sizeof(T)==1 && N==8) {return vminv_u8 (sk_bit_cast<uint8x8_t> (x)) > 0;}
595 if constexpr (sizeof(T)==1 && N==16) {return vminvq_u8 (sk_bit_cast<uint8x16_t>(x)) > 0;}
596 if constexpr (sizeof(T)==2 && N==4) {return vminv_u16 (sk_bit_cast<uint16x4_t>(x)) > 0;}
597 if constexpr (sizeof(T)==2 && N==8) {return vminvq_u16(sk_bit_cast<uint16x8_t>(x)) > 0;}
598 if constexpr (sizeof(T)==4 && N==2) {return vminv_u32 (sk_bit_cast<uint32x2_t>(x)) > 0;}
599 if constexpr (sizeof(T)==4 && N==4) {return vminvq_u32(sk_bit_cast<uint32x4_t>(x)) > 0;}
600#endif
601#if SKVX_USE_SIMD && defined(__wasm_simd128__)
602 if constexpr (N == 4 && sizeof(T) == 4) {
603 return wasm_i32x4_all_true(sk_bit_cast<VExt<4,int>>(x));
604 }
605#endif
606#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
607 if constexpr (N == 8 && sizeof(T) == 4) {
608 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),
609 sk_bit_cast<__m256i>(x)));
610 return (retv[0] & retv[4]) == 0b1111;
611 }
612#endif
613#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
614 if constexpr (N == 4 && sizeof(T) == 4) {
615 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),
616 sk_bit_cast<__m128i>(x)));
617 return retv[0] == 0b1111;
618 }
619#endif
620 return all(x.lo)
621 && all(x.hi);
622}
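any() and all() reduce a mask (or any vector, treating non-zero as true) to a single bool; a sketch:

#include "src/base/SkVx.h"

void reduce_demo() {
    skvx::int4 v = {0, 1, 2, 3};
    bool some  = skvx::any(v != 0);   // true: three lanes are non-zero
    bool every = skvx::all(v != 0);   // false: lane 0 is zero
}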
623
624// cast() Vec<N,S> to Vec<N,D>, as if applying a C-cast to each lane.
625// TODO: implement with map()?
626template <typename D, typename S>
627SI Vec<1,D> cast(const Vec<1,S>& src) { return (D)src.val; }
628
629template <typename D, int N, typename S>
630SI Vec<N,D> cast(const Vec<N,S>& src) {
631#if SKVX_USE_SIMD && defined(__clang__)
632 return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));
633#else
634 return join(cast<D>(src.lo), cast<D>(src.hi));
635#endif
636}
637
638// min/max match logic of std::min/std::max, which is important when NaN is involved.
639SIT T min(const Vec<1,T>& x) { return x.val; }
640SIT T max(const Vec<1,T>& x) { return x.val; }
641SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }
642SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }
643
644SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(y < x, y, x); }
645SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(x < y, y, x); }
646
647SINTU Vec<N,T> min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }
648SINTU Vec<N,T> max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }
649SINTU Vec<N,T> min(U x, const Vec<N,T>& y) { return min(Vec<N,T>(x), y); }
650SINTU Vec<N,T> max(U x, const Vec<N,T>& y) { return max(Vec<N,T>(x), y); }
651
652// pin matches the logic of SkTPin, which is important when NaN is involved. It always returns
653// values in the range lo..hi, and if x is NaN, it returns lo.
654SINT Vec<N,T> pin(const Vec<N,T>& x, const Vec<N,T>& lo, const Vec<N,T>& hi) {
655 return max(lo, min(x, hi));
656}
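The NaN behavior is the reason pin() exists alongside min()/max(); a sketch (NAN comes from <cmath>, names illustrative):

#include <cmath>
#include "src/base/SkVx.h"

void pin_demo() {
    skvx::float4 x = {-2.0f, 0.5f, 9.0f, NAN};
    skvx::float4 r = skvx::pin(x, skvx::float4(0.0f), skvx::float4(1.0f));
    // r is {0, 0.5, 1, 0}: out-of-range lanes clamp, and like SkTPin, a NaN lane pins to lo.
}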
657
658// Shuffle values from a vector pretty arbitrarily:
659// skvx::Vec<4,float> rgba = {R,G,B,A};
660// shuffle<2,1,0,3> (rgba) ~> {B,G,R,A}
661// shuffle<2,1> (rgba) ~> {B,G}
662// shuffle<2,1,2,1,2,1,2,1>(rgba) ~> {B,G,B,G,B,G,B,G}
663// shuffle<3,3,3,3> (rgba) ~> {A,A,A,A}
664// The only real restriction is that the output also be a legal N=power-of-two skvx::Vec.
665template <int... Ix, int N, typename T>
666SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {
667#if SKVX_USE_SIMD && defined(__clang__)
668 // TODO: can we just always use { x[Ix]... }?
669 return to_vec<sizeof...(Ix),T>(__builtin_shufflevector(to_vext(x), to_vext(x), Ix...));
670#else
671 return { x[Ix]... };
672#endif
673}
674
675// Call map(fn, x) for a vector with fn() applied to each lane of x, { fn(x[0]), fn(x[1]), ... },
676// or map(fn, x,y) for a vector of fn(x[i], y[i]), etc.
677
678template <typename Fn, typename... Args, size_t... I>
679SI auto map(std::index_sequence<I...>,
680 Fn&& fn, const Args&... args) -> skvx::Vec<sizeof...(I), decltype(fn(args[0]...))> {
681 auto lane = [&](size_t i)
682#if defined(__clang__)
683 // CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,
684 // with errors like "control flow integrity check for type 'float (float)
685 // noexcept' failed during indirect function call... note: sqrtf.cfi_jt defined
686 // here". But we can be quite sure fn is the right type: it's all inferred!
687 // So, stifle CFI in this function.
688 __attribute__((no_sanitize("cfi")))
689#endif
690 { return fn(args[static_cast<int>(i)]...); };
691
692 return { lane(I)... };
693}
694
695template <typename Fn, int N, typename T, typename... Rest>
696auto map(Fn&& fn, const Vec<N,T>& first, const Rest&... rest) {
697 // Derive an {0...N-1} index_sequence from the size of the first arg: N lanes in, N lanes out.
698 return map(std::make_index_sequence<N>{}, fn, first,rest...);
699}
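map() works with any callable whose result type can seed a Vec; a sketch applying a lambda lane-wise:

#include <cmath>
#include "src/base/SkVx.h"

void map_demo() {
    skvx::float4 x = {1.0f, 4.0f, 9.0f, 16.0f};
    // N is deduced from the first vector argument, the result type from the callable.
    skvx::float4 roots = skvx::map([](float f) { return std::sqrt(f); }, x);  // {1, 2, 3, 4}
}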
700
701SIN Vec<N,float> ceil(const Vec<N,float>& x) { return map( ceilf, x); }
702SIN Vec<N,float> floor(const Vec<N,float>& x) { return map(floorf, x); }
703SIN Vec<N,float> trunc(const Vec<N,float>& x) { return map(truncf, x); }
704SIN Vec<N,float> round(const Vec<N,float>& x) { return map(roundf, x); }
705SIN Vec<N,float> sqrt(const Vec<N,float>& x) { return map( sqrtf, x); }
706SIN Vec<N,float> abs(const Vec<N,float>& x) { return map( fabsf, x); }
707SIN Vec<N,float> fma(const Vec<N,float>& x,
708 const Vec<N,float>& y,
709 const Vec<N,float>& z) {
710 // I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.
711 auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
712 return map(fn, x,y,z);
713}
714
715SI Vec<1,int> lrint(const Vec<1,float>& x) {
716 return (int)lrintf(x.val);
717}
718SIN Vec<N,int> lrint(const Vec<N,float>& x) {
719#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
720 if constexpr (N == 8) {
721 return sk_bit_cast<Vec<N,int>>(_mm256_cvtps_epi32(sk_bit_cast<__m256>(x)));
722 }
723#endif
724#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
725 if constexpr (N == 4) {
726 return sk_bit_cast<Vec<N,int>>(_mm_cvtps_epi32(sk_bit_cast<__m128>(x)));
727 }
728#endif
729#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
730 if constexpr (N == 8) {
731 return sk_bit_cast<Vec<N,int>>(__lasx_xvftint_w_s(sk_bit_cast<__m256>(x)));
732 }
733#endif
734#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
735 if constexpr (N == 4) {
736 return sk_bit_cast<Vec<N,int>>(__lsx_vftint_w_s(sk_bit_cast<__m128>(x)));
737 }
738#endif
739 return join(lrint(x.lo),
740 lrint(x.hi));
741}
742
743SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }
744
745// Converts float to half, rounding to nearest even, and supporting de-normal f16 conversion,
746// and overflow to f16 infinity. Should not be called with NaNs, since it can convert NaN->inf.
747// KEEP IN SYNC with skcms' Half_from_F to ensure that f16 colors are computed consistently in both
748// skcms and skvx.
749SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {
750 assert(all(x == x)); // No NaNs should reach this function
751
752 // Intrinsics for float->half tend to operate on 4 lanes, and the default implementation has
753 // enough instructions that it's better to split and join on 128 bits groups vs.
754 // recursing for each min/max/shift/etc.
755 if constexpr (N > 4) {
756 return join(to_half(x.lo),
757 to_half(x.hi));
758 }
759
760#if SKVX_USE_SIMD && defined(__aarch64__)
761 if constexpr (N == 4) {
762 return sk_bit_cast<Vec<N,uint16_t>>(vcvt_f16_f32(sk_bit_cast<float32x4_t>(x)));
763
764 }
765#endif
766
767#define I(x) sk_bit_cast<Vec<N,int32_t>>(x)
768#define F(x) sk_bit_cast<Vec<N,float>>(x)
769 Vec<N,int32_t> sem = I(x),
770 s = sem & 0x8000'0000,
771 em = min(sem ^ s, 0x4780'0000), // |x| clamped to f16 infinity
772 // F(em)*8192 increases the exponent by 13, which when added back to em will shift
773 // the mantissa bits 13 to the right. We clamp to 1/2 for subnormal values, which
774 // automatically shifts the mantissa to match 2^-14 expected for a subnorm f16.
775 magic = I(max(F(em) * 8192.f, 0.5f)) & (255 << 23),
776 rounded = I((F(em) + F(magic))), // shift mantissa with automatic round-to-even
777 // Subtract 127 for f32 bias, subtract 13 to undo the *8192, subtract 1 to remove
778 // the implicit leading 1., and add 15 to get the f16 biased exponent.
779 exp = ((magic >> 13) - ((127-15+13+1)<<10)), // shift and re-bias exponent
780 f16 = rounded + exp; // use + if 'rounded' rolled over into first exponent bit
781 return cast<uint16_t>((s>>16) | f16);
782#undef I
783#undef F
784}
785
786// Converts from half to float, preserving NaN and +/- infinity.
787// KEEP IN SYNC with skcms' F_from_Half to ensure that f16 colors are computed consistently in both
788// skcms and skvx.
789SIN Vec<N,float> from_half(const Vec<N,uint16_t>& x) {
790 if constexpr (N > 4) {
791 return join(from_half(x.lo),
792 from_half(x.hi));
793 }
794
795#if SKVX_USE_SIMD && defined(__aarch64__)
796 if constexpr (N == 4) {
797 return sk_bit_cast<Vec<N,float>>(vcvt_f32_f16(sk_bit_cast<float16x4_t>(x)));
798 }
799#endif
800
801 Vec<N,int32_t> wide = cast<int32_t>(x),
802 s = wide & 0x8000,
803 em = wide ^ s,
804 inf_or_nan = (em >= (31 << 10)) & (255 << 23), // Expands exponent to fill 8 bits
805 is_norm = em > 0x3ff,
806 // subnormal f16's are 2^-14*0.[m0:9] == 2^-24*[m0:9].0
807 sub = sk_bit_cast<Vec<N,int32_t>>((cast<float>(em) * (1.f/(1<<24)))),
808 norm = ((em<<13) + ((127-15)<<23)), // Shifts mantissa, shifts + re-biases exp
809 finite = (is_norm & norm) | (~is_norm & sub);
810 // If 'x' is f16 +/- infinity, inf_or_nan will be the filled 8-bit exponent but 'norm' will be
811 // all 0s since 'x's mantissa is 0. Thus norm | inf_or_nan becomes f32 infinity. However, if
812 // 'x' is an f16 NaN, some bits of 'norm' will be non-zero, so it stays an f32 NaN after the OR.
813 return sk_bit_cast<Vec<N,float>>((s<<16) | finite | inf_or_nan);
814}
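A round-trip sketch for the half conversions (to_half() must not see NaNs, per the note above):

#include "src/base/SkVx.h"

void half_demo() {
    skvx::float4 f = {0.0f, 0.5f, 1.0f, 65504.0f};  // 65504 is the largest finite f16
    skvx::half4  h = skvx::to_half(f);              // half4 is storage-only uint16_t lanes
    skvx::float4 back = skvx::from_half(h);         // exact for these values
}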
815
816// div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.
817SIN Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
818 return cast<uint8_t>( (x+127)/255 );
819}
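div255() behaves like rounding x/255 to the nearest integer; a quick worked sketch:

#include "src/base/SkVx.h"

void div255_demo() {
    skvx::ushort4 prod = {0, 127, 128, 65025};  // 65025 == 255*255
    skvx::byte4 r = skvx::div255(prod);         // {0, 0, 1, 255}: (x+127)/255 rounds to nearest
}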
820
821// approx_scale(x,y) approximates div255(cast<uint16_t>(x)*cast<uint16_t>(y)) within a bit,
822// and is always perfect when x or y is 0 or 255.
823SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {
824 // All of (x*y+x)/256, (x*y+y)/256, and (x*y+255)/256 meet the criteria above.
825 // We happen to have historically picked (x*y+x)/256.
826 auto X = cast<uint16_t>(x),
827 Y = cast<uint16_t>(y);
828 return cast<uint8_t>( (X*Y+X)/256 );
829}
830
831// saturated_add(x,y) sums values and clamps to the maximum value instead of overflowing.
832SINT std::enable_if_t<std::is_unsigned_v<T>, Vec<N,T>> saturated_add(const Vec<N,T>& x,
833 const Vec<N,T>& y) {
834#if SKVX_USE_SIMD && (SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 || defined(SK_ARM_HAS_NEON) || \
835 SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX)
836 // Both SSE and ARM have 16-lane saturated adds, so use intrinsics for those and recurse down
837 // or join up to take advantage.
838 if constexpr (N == 16 && sizeof(T) == 1) {
839 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
840 return sk_bit_cast<Vec<N,T>>(_mm_adds_epu8(sk_bit_cast<__m128i>(x),
841 sk_bit_cast<__m128i>(y)));
842 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
843 return sk_bit_cast<Vec<N,T>>(__lsx_vsadd_bu(sk_bit_cast<__m128i>(x),
844 sk_bit_cast<__m128i>(y)));
845 #else // SK_ARM_HAS_NEON
846 return sk_bit_cast<Vec<N,T>>(vqaddq_u8(sk_bit_cast<uint8x16_t>(x),
847 sk_bit_cast<uint8x16_t>(y)));
848 #endif
849 } else if constexpr (N < 16 && sizeof(T) == 1) {
850 return saturated_add(join(x,x), join(y,y)).lo;
851 } else if constexpr (sizeof(T) == 1) {
852 return join(saturated_add(x.lo, y.lo), saturated_add(x.hi, y.hi));
853 }
854#endif
855 // Otherwise saturate manually
856 auto sum = x + y;
857 return if_then_else(sum < x, Vec<N,T>(std::numeric_limits<T>::max()), sum);
858}
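A saturation sketch with 8-bit lanes (names illustrative):

#include "src/base/SkVx.h"

void saturate_demo() {
    skvx::byte4 x = {200, 10, 255, 0},
                y = {100, 10, 1, 0};
    skvx::byte4 s = skvx::saturated_add(x, y);  // {255, 20, 255, 0}: sums clamp at 255
}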
859
860// The ScaledDividerU32 takes a divisor > 1, and creates a function divide(numerator) that
861// calculates a numerator / denominator. For this to be rounded properly, numerator should have
862// half added in:
863// divide(numerator + half) == floor(numerator/denominator + 1/2).
864//
865// This gives an answer within +/- 1 from the true value.
866//
867// Derivation of half:
868// numerator/denominator + 1/2 = (numerator + half) / d
869// numerator + denominator / 2 = numerator + half
870// half = denominator / 2.
871//
872// Because half is divided by 2, that division must also be rounded.
873// half == denominator / 2 = (denominator + 1) / 2.
874//
875// The divisorFactor is just a scaled value:
876// divisorFactor = (1 / divisor) * 2 ^ 32.
877// The maximum that can be divided and rounded is UINT_MAX - half.
878class ScaledDividerU32 {
879public:
880 explicit ScaledDividerU32(uint32_t divisor)
881 : fDivisorFactor{(uint32_t)(std::round((1.0 / divisor) * (1ull << 32)))}
882 , fHalf{(divisor + 1) >> 1} {
883 assert(divisor > 1);
884 }
885
886 Vec<4, uint32_t> divide(const Vec<4, uint32_t>& numerator) const {
887#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
888 uint64x2_t hi = vmull_n_u32(vget_high_u32(to_vext(numerator)), fDivisorFactor);
889 uint64x2_t lo = vmull_n_u32(vget_low_u32(to_vext(numerator)), fDivisorFactor);
890
891 return to_vec<4, uint32_t>(vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)));
892#else
893 return cast<uint32_t>((cast<uint64_t>(numerator) * fDivisorFactor) >> 32);
894#endif
895 }
896
897 uint32_t half() const { return fHalf; }
898
899private:
900 const uint32_t fDivisorFactor;
901 const uint32_t fHalf;
902};
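A worked sketch of ScaledDividerU32 dividing by 3, with values computed per the comment above (function name illustrative):

#include "src/base/SkVx.h"

void divider_demo() {
    skvx::ScaledDividerU32 div3(3);   // divisorFactor is about 2^32/3, half() == 2
    skvx::uint4 n = {0, 1, 5, 100};
    // Add half() first so divide() rounds to nearest (within +/-1 of the true value).
    skvx::uint4 q = div3.divide(n + div3.half());  // {0, 0, 2, 33}
}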
903
904
905SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,
906 const Vec<N,uint8_t>& y) {
907#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
908 // With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).
909 if constexpr (N == 8) {
910 return to_vec<8,uint16_t>(vmull_u8(to_vext(x), to_vext(y)));
911 } else if constexpr (N < 8) {
912 return mull(join(x,x), join(y,y)).lo;
913 } else { // N > 8
914 return join(mull(x.lo, y.lo), mull(x.hi, y.hi));
915 }
916#else
917 return cast<uint16_t>(x) * cast<uint16_t>(y);
918#endif
919}
920
921SIN Vec<N,uint32_t> mull(const Vec<N,uint16_t>& x,
922 const Vec<N,uint16_t>& y) {
923#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
924 // NEON can do four u16*u16 -> u32 in one instruction, vmull_u16
925 if constexpr (N == 4) {
926 return to_vec<4,uint32_t>(vmull_u16(to_vext(x), to_vext(y)));
927 } else if constexpr (N < 4) {
928 return mull(join(x,x), join(y,y)).lo;
929 } else { // N > 4
930 return join(mull(x.lo, y.lo), mull(x.hi, y.hi));
931 }
932#else
933 return cast<uint32_t>(x) * cast<uint32_t>(y);
934#endif
935}
936
937SIN Vec<N,uint16_t> mulhi(const Vec<N,uint16_t>& x,
938 const Vec<N,uint16_t>& y) {
939#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
940 // Use _mm_mulhi_epu16 for 8xuint16_t and join or split to get there.
941 if constexpr (N == 8) {
942 return sk_bit_cast<Vec<8,uint16_t>>(_mm_mulhi_epu16(sk_bit_cast<__m128i>(x),
943 sk_bit_cast<__m128i>(y)));
944 } else if constexpr (N < 8) {
945 return mulhi(join(x,x), join(y,y)).lo;
946 } else { // N > 8
947 return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
948 }
949#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
950 if constexpr (N == 8) {
951 return sk_bit_cast<Vec<8,uint16_t>>(__lsx_vmuh_hu(sk_bit_cast<__m128i>(x),
952 sk_bit_cast<__m128i>(y)));
953 } else if constexpr (N < 8) {
954 return mulhi(join(x,x), join(y,y)).lo;
955 } else { // N > 8
956 return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
957 }
958#else
959 return skvx::cast<uint16_t>(mull(x, y) >> 16);
960#endif
961}
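A sketch of the widening multiplies and approx_scale(), defined a little earlier (names illustrative):

#include "src/base/SkVx.h"

void widening_mul_demo() {
    skvx::byte4 x(200), y(2);
    skvx::ushort4 wide = skvx::mull(x, y);     // u8*u8 -> u16 without overflow: {400, ...}
    skvx::ushort4 top  = skvx::mulhi(wide, skvx::ushort4(0x8000));  // high 16 bits of u16*u16
    skvx::byte4 scaled = skvx::approx_scale(x, skvx::byte4(255));   // perfect at 255: {200, ...}
}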
962
963SINT T dot(const Vec<N, T>& a, const Vec<N, T>& b) {
964 // While dot is a "horizontal" operation like any or all, it needs to remain
965 // in floating point and there aren't really any good SIMD instructions that make it faster.
966 // The constexpr cases remove the for loop in the only cases we realistically call.
967 auto ab = a*b;
968 if constexpr (N == 2) {
969 return ab[0] + ab[1];
970 } else if constexpr (N == 4) {
971 return ab[0] + ab[1] + ab[2] + ab[3];
972 } else {
973 T sum = ab[0];
974 for (int i = 1; i < N; ++i) {
975 sum += ab[i];
976 }
977 return sum;
978 }
979}
980
981SIT T cross(const Vec<2, T>& a, const Vec<2, T>& b) {
982 auto x = a * shuffle<1,0>(b);
983 return x[0] - x[1];
984}
985
986SIN float length(const Vec<N, float>& v) {
987 return std::sqrt(dot(v, v));
988}
989
990SIN double length(const Vec<N, double>& v) {
991 return std::sqrt(dot(v, v));
992}
993
994SIN Vec<N, float> normalize(const Vec<N, float>& v) {
995 return v / length(v);
996}
997
998SIN Vec<N, double> normalize(const Vec<N, double>& v) {
999 return v / length(v);
1000}
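A 2-D geometry sketch of the horizontal helpers:

#include "src/base/SkVx.h"

void geometry_demo() {
    skvx::float2 a = {3.0f, 4.0f},
                 b = {1.0f, 0.0f};
    float d = skvx::dot(a, b);               // 3*1 + 4*0 == 3
    float c = skvx::cross(a, b);             // 3*0 - 4*1 == -4
    float len = skvx::length(a);             // 5
    skvx::float2 unit = skvx::normalize(a);  // {0.6, 0.8}
}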
1001
1002SINT bool isfinite(const Vec<N, T>& v) {
1003 // Multiply all values together with 0. If they were all finite, the output is
1004 // 0 (also finite). If any were not, we'll get nan.
1005 return SkIsFinite(dot(v, Vec<N, T>(0)));
1006}
1007
1008// De-interleaving load of 4 vectors.
1009//
1010// WARNING: These are really only supported well on NEON. Consider restructuring your data before
1011// resorting to these methods.
1012SIT void strided_load4(const T* v,
1013 Vec<1,T>& a,
1014 Vec<1,T>& b,
1015 Vec<1,T>& c,
1016 Vec<1,T>& d) {
1017 a.val = v[0];
1018 b.val = v[1];
1019 c.val = v[2];
1020 d.val = v[3];
1021}
1022SINT void strided_load4(const T* v,
1023 Vec<N,T>& a,
1024 Vec<N,T>& b,
1025 Vec<N,T>& c,
1026 Vec<N,T>& d) {
1027 strided_load4(v, a.lo, b.lo, c.lo, d.lo);
1028 strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);
1029}
1030#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
1031#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
1032SI void strided_load4(const T* v, \
1033 Vec<N,T>& a, \
1034 Vec<N,T>& b, \
1035 Vec<N,T>& c, \
1036 Vec<N,T>& d) { \
1037 auto mat = VLD(v); \
1038 a = sk_bit_cast<Vec<N,T>>(mat.val[0]); \
1039 b = sk_bit_cast<Vec<N,T>>(mat.val[1]); \
1040 c = sk_bit_cast<Vec<N,T>>(mat.val[2]); \
1041 d = sk_bit_cast<Vec<N,T>>(mat.val[3]); \
1042}
1043IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32)
1044IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16)
1045IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8)
1046IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32)
1047IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16)
1048IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8)
1049IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32)
1050IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32)
1051IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16)
1052IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8)
1053IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32)
1054IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16)
1055IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8)
1056IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32)
1057#undef IMPL_LOAD4_TRANSPOSED
1058
1059#elif SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
1060
1061SI void strided_load4(const float* v,
1062 Vec<4,float>& a,
1063 Vec<4,float>& b,
1064 Vec<4,float>& c,
1065 Vec<4,float>& d) {
1066 __m128 a_ = _mm_loadu_ps(v);
1067 __m128 b_ = _mm_loadu_ps(v+4);
1068 __m128 c_ = _mm_loadu_ps(v+8);
1069 __m128 d_ = _mm_loadu_ps(v+12);
1070 _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
1071 a = sk_bit_cast<Vec<4,float>>(a_);
1072 b = sk_bit_cast<Vec<4,float>>(b_);
1073 c = sk_bit_cast<Vec<4,float>>(c_);
1074 d = sk_bit_cast<Vec<4,float>>(d_);
1075}
1076
1077#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1078#define _LSX_TRANSPOSE4(row0, row1, row2, row3) \
1079do { \
1080 __m128i __t0 = __lsx_vilvl_w (row1, row0); \
1081 __m128i __t1 = __lsx_vilvl_w (row3, row2); \
1082 __m128i __t2 = __lsx_vilvh_w (row1, row0); \
1083 __m128i __t3 = __lsx_vilvh_w (row3, row2); \
1084 (row0) = __lsx_vilvl_d (__t1, __t0); \
1085 (row1) = __lsx_vilvh_d (__t1, __t0); \
1086 (row2) = __lsx_vilvl_d (__t3, __t2); \
1087 (row3) = __lsx_vilvh_d (__t3, __t2); \
1088} while (0)
1089
1090SI void strided_load4(const int* v,
1091 Vec<4,int>& a,
1092 Vec<4,int>& b,
1093 Vec<4,int>& c,
1094 Vec<4,int>& d) {
1095 __m128i a_ = __lsx_vld(v, 0);
1096 __m128i b_ = __lsx_vld(v, 16);
1097 __m128i c_ = __lsx_vld(v, 32);
1098 __m128i d_ = __lsx_vld(v, 48);
1099 _LSX_TRANSPOSE4(a_, b_, c_, d_);
1100 a = sk_bit_cast<Vec<4,int>>(a_);
1101 b = sk_bit_cast<Vec<4,int>>(b_);
1102 c = sk_bit_cast<Vec<4,int>>(c_);
1103 d = sk_bit_cast<Vec<4,int>>(d_);
1104}
1105#endif

// De-interleaving load of 2 vectors.
//
// WARNING: These are really only supported well on NEON. Consider restructuring your data before
// resorting to these methods.
SIT void strided_load2(const T* v, Vec<1,T>& a, Vec<1,T>& b) {
    a.val = v[0];
    b.val = v[1];
}
SINT void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) {
    strided_load2(v, a.lo, b.lo);
    strided_load2(v + 2*(N/2), a.hi, b.hi);
}
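// A hedged usage sketch (illustrative names, not part of the original
// source): strided_load2 splits an interleaved x,y point stream into
// separate coordinate vectors.
//
//     const float xy[8] = {x0,y0, x1,y1, x2,y2, x3,y3};
//     skvx::Vec<4,float> xs, ys;
//     skvx::strided_load2(xy, xs, ys);  // xs = {x0,x1,x2,x3}, ys = {y0,...}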
#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)
#define IMPL_LOAD2_TRANSPOSED(N, T, VLD)                      \
SI void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) { \
    auto mat = VLD(v);                                        \
    a = sk_bit_cast<Vec<N,T>>(mat.val[0]);                    \
    b = sk_bit_cast<Vec<N,T>>(mat.val[1]);                    \
}
IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32)
IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16)
IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8)
IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32)
IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16)
IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8)
IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32)
IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32)
IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16)
IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8)
IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32)
IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16)
IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8)
IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32)
#undef IMPL_LOAD2_TRANSPOSED
#endif
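// As with the 4-way case, NEON's vld2/vld2q performs the 2-way de-interleave
// as a single structured load; that is why the WARNING above singles out NEON
// as the only well-supported target.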

// Define commonly used aliases
using float2  = Vec< 2, float>;
using float4  = Vec< 4, float>;
using float8  = Vec< 8, float>;

using double2 = Vec< 2, double>;
using double4 = Vec< 4, double>;
using double8 = Vec< 8, double>;

using byte2  = Vec< 2, uint8_t>;
using byte4  = Vec< 4, uint8_t>;
using byte8  = Vec< 8, uint8_t>;
using byte16 = Vec<16, uint8_t>;

using int2 = Vec< 2, int32_t>;
using int4 = Vec< 4, int32_t>;
using int8 = Vec< 8, int32_t>;

using ushort2 = Vec< 2, uint16_t>;
using ushort4 = Vec< 4, uint16_t>;
using ushort8 = Vec< 8, uint16_t>;

using uint2 = Vec< 2, uint32_t>;
using uint4 = Vec< 4, uint32_t>;
using uint8 = Vec< 8, uint32_t>;

using long2 = Vec< 2, int64_t>;
using long4 = Vec< 4, int64_t>;
using long8 = Vec< 8, int64_t>;

// Use with from_half and to_half to convert to and from the corresponding
// floatX types; use these aliases for storage of half-float data.
using half2 = Vec< 2, uint16_t>;
using half4 = Vec< 4, uint16_t>;
using half8 = Vec< 8, uint16_t>;
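// A hedged usage sketch (illustrative values): half4 holds IEEE half-float
// bit patterns, so arithmetic goes through from_half/to_half.
//
//     skvx::half4  h = skvx::to_half(skvx::float4{1.0f, 0.5f, 0.25f, 0.0f});
//     skvx::float4 f = skvx::from_half(h);  // round-trips the four values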

}  // namespace skvx

#undef SINTU
#undef SINT
#undef SIN
#undef SIT
#undef SI
#undef SKVX_ALWAYS_INLINE
#undef SKVX_USE_SIMD

#endif//SKVX_DEFINED

Macro Definition Documentation

◆ SIT

#define SIT   template < typename T> SI

Definition at line 67 of file SkVx.h.

◆ SKVX_ALWAYS_INLINE

#define SKVX_ALWAYS_INLINE   __attribute__((always_inline))

Definition at line 62 of file SkVx.h.

◆ SKVX_USE_SIMD

#define SKVX_USE_SIMD   1

Definition at line 38 of file SkVx.h.