d8/d7a/SkVx_8h_source.html

/*

 * Copyright 2019 Google Inc.

 *

 * Use of this source code is governed by a BSD-style license that can be

 * found in the LICENSE file.

 */


#ifndef SKVX_DEFINED

#define SKVX_DEFINED


// skvx::Vec<N,T> are SIMD vectors of N T's, a v1.5 successor to SkNx<N,T>.

//

// This time we're leaning a bit less on platform-specific intrinsics and a bit

// more on Clang/GCC vector extensions, but still keeping the option open to

// drop in platform-specific intrinsics, actually more easily than before.

//

// We've also fixed a few of the caveats that used to make SkNx awkward to work

// with across translation units.  skvx::Vec<N,T> always has N*sizeof(T) size

// and alignment and is safe to use across translation units freely.

// (Ideally we'd only align to T, but that tanks ARMv7 NEON codegen.)


#include "include/private/base/SkFeatures.h"

#include "src/base/SkUtils.h"

#include <algorithm>         // std::min, std::max

#include <cassert>           // assert()

#include <cmath>             // ceilf, floorf, truncf, roundf, sqrtf, etc.

#include <cstdint>           // intXX_t

#include <cstring>           // memcpy()

#include <initializer_list>  // std::initializer_list

#include <type_traits>

#include <utility>           // std::index_sequence


// Users may disable SIMD with SKNX_NO_SIMD, which may be set via compiler flags.

// The gn build has no option which sets SKNX_NO_SIMD.

// Use SKVX_USE_SIMD internally to avoid confusing double negation.

// Do not use 'defined' in a macro expansion.

#if !defined(SKNX_NO_SIMD)

    #define SKVX_USE_SIMD 1

#else

    #define SKVX_USE_SIMD 0

#endif


#if SKVX_USE_SIMD

    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1

        #include <immintrin.h>

    #elif defined(SK_ARM_HAS_NEON)

        #include <arm_neon.h>

    #elif defined(__wasm_simd128__)

        #include <wasm_simd128.h>

    #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

        #include <lasxintrin.h>

        #include <lsxintrin.h>

    #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

        #include <lsxintrin.h>

    #endif

#endif


// To avoid ODR violations, all methods must be force-inlined...

#if defined(_MSC_VER)

    #define SKVX_ALWAYS_INLINE __forceinline

#else

    #define SKVX_ALWAYS_INLINE __attribute__((always_inline))

#endif


// ... and all standalone functions must be static.  Please use these helpers:

#define SI    static inline

#define SIT   template <       typename T> SI

#define SIN   template <int N            > SI

#define SINT  template <int N, typename T> SI

#define SINTU template <int N, typename T, typename U, \

                        typename=std::enable_if_t<std::is_convertible<U,T>::value>> SI


namespace skvx {


template <int N, typename T>

struct alignas(N*sizeof(T)) Vec;


template <int... Ix, int N, typename T>

SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>&);


// All Vec have the same simple memory layout, the same as `T vec[N]`.

template <int N, typename T>

struct alignas(N*sizeof(T)) Vec {

    static_assert((N & (N-1)) == 0,        "N must be a power of 2.");

    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");


    // Methods belong here in the class declaration of Vec only if:

    //   - they must be here, like constructors or operator[];

    //   - they'll definitely never want a specialized implementation.

    // Other operations on Vec should be defined outside the type.


    SKVX_ALWAYS_INLINE Vec() = default;

    SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}


    // NOTE: Vec{x} produces x000..., whereas Vec(x) produces xxxx.... since this constructor fills

    // unspecified lanes with 0s, whereas the single T constructor fills all lanes with the value.

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {

        T vals[N] = {0};

        assert(xs.size() <= (size_t)N);

        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)N)*sizeof(T));


        this->lo = Vec<N/2,T>::Load(vals +   0);

        this->hi = Vec<N/2,T>::Load(vals + N/2);

    }


    SKVX_ALWAYS_INLINE T  operator[](int i) const { return i<N/2 ? this->lo[i] : this->hi[i-N/2]; }

    SKVX_ALWAYS_INLINE T& operator[](int i)       { return i<N/2 ? this->lo[i] : this->hi[i-N/2]; }


    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {

        return sk_unaligned_load<Vec>(ptr);

    }

    SKVX_ALWAYS_INLINE void store(void* ptr) const {

        // Note: Calling sk_unaligned_store produces slightly worse code here, for some reason

        memcpy(ptr, this, sizeof(Vec));

    }


    Vec<N/2,T> lo, hi;

};


// We have specializations for N == 1 (the base-case), as well as 2 and 4, where we add helpful

// constructors and swizzle accessors.

template <typename T>

struct alignas(4*sizeof(T)) Vec<4,T> {

    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");


    SKVX_ALWAYS_INLINE Vec() = default;

    SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}

    SKVX_ALWAYS_INLINE Vec(T x, T y, T z, T w) : lo(x,y), hi(z,w) {}

    SKVX_ALWAYS_INLINE Vec(Vec<2,T> xy, T z, T w) : lo(xy), hi(z,w) {}

    SKVX_ALWAYS_INLINE Vec(T x, T y, Vec<2,T> zw) : lo(x,y), hi(zw) {}

    SKVX_ALWAYS_INLINE Vec(Vec<2,T> xy, Vec<2,T> zw) : lo(xy), hi(zw) {}


    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {

        T vals[4] = {0};

        assert(xs.size() <= (size_t)4);

        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)4)*sizeof(T));


        this->lo = Vec<2,T>::Load(vals + 0);

        this->hi = Vec<2,T>::Load(vals + 2);

    }


    SKVX_ALWAYS_INLINE T  operator[](int i) const { return i<2 ? this->lo[i] : this->hi[i-2]; }

    SKVX_ALWAYS_INLINE T& operator[](int i)       { return i<2 ? this->lo[i] : this->hi[i-2]; }


    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {

        return sk_unaligned_load<Vec>(ptr);

    }

    SKVX_ALWAYS_INLINE void store(void* ptr) const {

        memcpy(ptr, this, sizeof(Vec));

    }


    SKVX_ALWAYS_INLINE Vec<2,T>& xy() { return lo; }

    SKVX_ALWAYS_INLINE Vec<2,T>& zw() { return hi; }

    SKVX_ALWAYS_INLINE T& x() { return lo.lo.val; }

    SKVX_ALWAYS_INLINE T& y() { return lo.hi.val; }

    SKVX_ALWAYS_INLINE T& z() { return hi.lo.val; }

    SKVX_ALWAYS_INLINE T& w() { return hi.hi.val; }


    SKVX_ALWAYS_INLINE Vec<2,T> xy() const { return lo; }

    SKVX_ALWAYS_INLINE Vec<2,T> zw() const { return hi; }

    SKVX_ALWAYS_INLINE T x() const { return lo.lo.val; }

    SKVX_ALWAYS_INLINE T y() const { return lo.hi.val; }

    SKVX_ALWAYS_INLINE T z() const { return hi.lo.val; }

    SKVX_ALWAYS_INLINE T w() const { return hi.hi.val; }


    // Exchange-based swizzles. These should take 1 cycle on NEON and 3 (pipelined) cycles on SSE.

    SKVX_ALWAYS_INLINE Vec<4,T> yxwz() const { return shuffle<1,0,3,2>(*this); }

    SKVX_ALWAYS_INLINE Vec<4,T> zwxy() const { return shuffle<2,3,0,1>(*this); }


    Vec<2,T> lo, hi;

};


template <typename T>

struct alignas(2*sizeof(T)) Vec<2,T> {

    static_assert(sizeof(T) >= alignof(T), "What kind of unusual T is this?");


    SKVX_ALWAYS_INLINE Vec() = default;

    SKVX_ALWAYS_INLINE Vec(T s) : lo(s), hi(s) {}

    SKVX_ALWAYS_INLINE Vec(T x, T y) : lo(x), hi(y) {}


    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {

        T vals[2] = {0};

        assert(xs.size() <= (size_t)2);

        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)2)*sizeof(T));


        this->lo = Vec<1,T>::Load(vals + 0);

        this->hi = Vec<1,T>::Load(vals + 1);

    }


    SKVX_ALWAYS_INLINE T  operator[](int i) const { return i<1 ? this->lo[i] : this->hi[i-1]; }

    SKVX_ALWAYS_INLINE T& operator[](int i)       { return i<1 ? this->lo[i] : this->hi[i-1]; }


    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {

        return sk_unaligned_load<Vec>(ptr);

    }

    SKVX_ALWAYS_INLINE void store(void* ptr) const {

        memcpy(ptr, this, sizeof(Vec));

    }


    SKVX_ALWAYS_INLINE T& x() { return lo.val; }

    SKVX_ALWAYS_INLINE T& y() { return hi.val; }


    SKVX_ALWAYS_INLINE T x() const { return lo.val; }

    SKVX_ALWAYS_INLINE T y() const { return hi.val; }


    // This exchange-based swizzle should take 1 cycle on NEON and 3 (pipelined) cycles on SSE.

    SKVX_ALWAYS_INLINE Vec<2,T> yx() const { return shuffle<1,0>(*this); }

    SKVX_ALWAYS_INLINE Vec<4,T> xyxy() const { return Vec<4,T>(*this, *this); }


    Vec<1,T> lo, hi;

};


template <typename T>

struct Vec<1,T> {

    T val = {};


    SKVX_ALWAYS_INLINE Vec() = default;

    SKVX_ALWAYS_INLINE Vec(T s) : val(s) {}


    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) : val(xs.size() ? *xs.begin() : 0) {

        assert(xs.size() <= (size_t)1);

    }


    SKVX_ALWAYS_INLINE T  operator[](int i) const { assert(i == 0); return val; }

    SKVX_ALWAYS_INLINE T& operator[](int i)       { assert(i == 0); return val; }


    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {

        return sk_unaligned_load<Vec>(ptr);

    }

    SKVX_ALWAYS_INLINE void store(void* ptr) const {

        memcpy(ptr, this, sizeof(Vec));

    }

};


// Translate from a value type T to its corresponding Mask, the result of a comparison.

template <typename T> struct Mask { using type = T; };

template <> struct Mask<float > { using type = int32_t; };

template <> struct Mask<double> { using type = int64_t; };

template <typename T> using M = typename Mask<T>::type;


// Join two Vec<N,T> into one Vec<2N,T>.

SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {

    Vec<2*N,T> v;

    v.lo = lo;

    v.hi = hi;

    return v;

}


// We have three strategies for implementing Vec operations:

//    1) lean on Clang/GCC vector extensions when available;

//    2) use map() to apply a scalar function lane-wise;

//    3) recurse on lo/hi to scalar portable implementations.

// We can slot in platform-specific implementations as overloads for particular Vec<N,T>,

// or often integrate them directly into the recursion of style 3), allowing fine control.


#if SKVX_USE_SIMD && (defined(__clang__) || defined(__GNUC__))


    // VExt<N,T> types have the same size as Vec<N,T> and support most operations directly.

    #if defined(__clang__)

        template <int N, typename T>

        using VExt = T __attribute__((ext_vector_type(N)));


    #elif defined(__GNUC__)

        template <int N, typename T>

        struct VExtHelper {

            typedef T __attribute__((vector_size(N*sizeof(T)))) type;

        };


        template <int N, typename T>

        using VExt = typename VExtHelper<N,T>::type;


        // For some reason some (new!) versions of GCC cannot seem to deduce N in the generic

        // to_vec<N,T>() below for N=4 and T=float.  This workaround seems to help...

        SI Vec<4,float> to_vec(VExt<4,float> v) { return sk_bit_cast<Vec<4,float>>(v); }

    #endif


    SINT VExt<N,T> to_vext(const Vec<N,T>& v) { return sk_bit_cast<VExt<N,T>>(v); }

    SINT Vec <N,T> to_vec(const VExt<N,T>& v) { return sk_bit_cast<Vec <N,T>>(v); }


    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {

        return to_vec<N,T>(to_vext(x) + to_vext(y));

    }

    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {

        return to_vec<N,T>(to_vext(x) - to_vext(y));

    }

    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {

        return to_vec<N,T>(to_vext(x) * to_vext(y));

    }

    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {

        return to_vec<N,T>(to_vext(x) / to_vext(y));

    }


    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {

        return to_vec<N,T>(to_vext(x) ^ to_vext(y));

    }

    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {

        return to_vec<N,T>(to_vext(x) & to_vext(y));

    }

    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {

        return to_vec<N,T>(to_vext(x) | to_vext(y));

    }


    SINT Vec<N,T> operator!(const Vec<N,T>& x) { return to_vec<N,T>(!to_vext(x)); }

    SINT Vec<N,T> operator-(const Vec<N,T>& x) { return to_vec<N,T>(-to_vext(x)); }

    SINT Vec<N,T> operator~(const Vec<N,T>& x) { return to_vec<N,T>(~to_vext(x)); }


    SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) << k); }

    SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) >> k); }


    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {

        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) == to_vext(y));

    }

    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {

        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) != to_vext(y));

    }

    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {

        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) <= to_vext(y));

    }

    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {

        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) >= to_vext(y));

    }

    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {

        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) <  to_vext(y));

    }

    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {

        return sk_bit_cast<Vec<N,M<T>>>(to_vext(x) >  to_vext(y));

    }


#else


    // Either SKNX_NO_SIMD is defined, or Clang/GCC vector extensions are not available.

    // We'll implement things portably with N==1 scalar implementations and recursion onto them.


    // N == 1 scalar implementations.

    SIT Vec<1,T> operator+(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val + y.val; }

    SIT Vec<1,T> operator-(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val - y.val; }

    SIT Vec<1,T> operator*(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val * y.val; }

    SIT Vec<1,T> operator/(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val / y.val; }


    SIT Vec<1,T> operator^(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val ^ y.val; }

    SIT Vec<1,T> operator&(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val & y.val; }

    SIT Vec<1,T> operator|(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val | y.val; }


    SIT Vec<1,T> operator!(const Vec<1,T>& x) { return !x.val; }

    SIT Vec<1,T> operator-(const Vec<1,T>& x) { return -x.val; }

    SIT Vec<1,T> operator~(const Vec<1,T>& x) { return ~x.val; }


    SIT Vec<1,T> operator<<(const Vec<1,T>& x, int k) { return x.val << k; }

    SIT Vec<1,T> operator>>(const Vec<1,T>& x, int k) { return x.val >> k; }


    SIT Vec<1,M<T>> operator==(const Vec<1,T>& x, const Vec<1,T>& y) {

        return x.val == y.val ? ~0 : 0;

    }

    SIT Vec<1,M<T>> operator!=(const Vec<1,T>& x, const Vec<1,T>& y) {

        return x.val != y.val ? ~0 : 0;

    }

    SIT Vec<1,M<T>> operator<=(const Vec<1,T>& x, const Vec<1,T>& y) {

        return x.val <= y.val ? ~0 : 0;

    }

    SIT Vec<1,M<T>> operator>=(const Vec<1,T>& x, const Vec<1,T>& y) {

        return x.val >= y.val ? ~0 : 0;

    }

    SIT Vec<1,M<T>> operator< (const Vec<1,T>& x, const Vec<1,T>& y) {

        return x.val <  y.val ? ~0 : 0;

    }

    SIT Vec<1,M<T>> operator> (const Vec<1,T>& x, const Vec<1,T>& y) {

        return x.val >  y.val ? ~0 : 0;

    }


    // Recurse on lo/hi down to N==1 scalar implementations.

    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo + y.lo, x.hi + y.hi);

    }

    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo - y.lo, x.hi - y.hi);

    }

    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo * y.lo, x.hi * y.hi);

    }

    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo / y.lo, x.hi / y.hi);

    }


    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo ^ y.lo, x.hi ^ y.hi);

    }

    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo & y.lo, x.hi & y.hi);

    }

    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo | y.lo, x.hi | y.hi);

    }


    SINT Vec<N,T> operator!(const Vec<N,T>& x) { return join(!x.lo, !x.hi); }

    SINT Vec<N,T> operator-(const Vec<N,T>& x) { return join(-x.lo, -x.hi); }

    SINT Vec<N,T> operator~(const Vec<N,T>& x) { return join(~x.lo, ~x.hi); }


    SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return join(x.lo << k, x.hi << k); }

    SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return join(x.lo >> k, x.hi >> k); }


    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo == y.lo, x.hi == y.hi);

    }

    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo != y.lo, x.hi != y.hi);

    }

    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo <= y.lo, x.hi <= y.hi);

    }

    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo >= y.lo, x.hi >= y.hi);

    }

    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo <  y.lo, x.hi <  y.hi);

    }

    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {

        return join(x.lo >  y.lo, x.hi >  y.hi);

    }

#endif


// Scalar/vector operations splat the scalar to a vector.

SINTU Vec<N,T>    operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) +  y; }

SINTU Vec<N,T>    operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) -  y; }

SINTU Vec<N,T>    operator* (U x, const Vec<N,T>& y) { return Vec<N,T>(x) *  y; }

SINTU Vec<N,T>    operator/ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) /  y; }

SINTU Vec<N,T>    operator^ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) ^  y; }

SINTU Vec<N,T>    operator& (U x, const Vec<N,T>& y) { return Vec<N,T>(x) &  y; }

SINTU Vec<N,T>    operator| (U x, const Vec<N,T>& y) { return Vec<N,T>(x) |  y; }

SINTU Vec<N,M<T>> operator==(U x, const Vec<N,T>& y) { return Vec<N,T>(x) == y; }

SINTU Vec<N,M<T>> operator!=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) != y; }

SINTU Vec<N,M<T>> operator<=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) <= y; }

SINTU Vec<N,M<T>> operator>=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) >= y; }

SINTU Vec<N,M<T>> operator< (U x, const Vec<N,T>& y) { return Vec<N,T>(x) <  y; }

SINTU Vec<N,M<T>> operator> (U x, const Vec<N,T>& y) { return Vec<N,T>(x) >  y; }


SINTU Vec<N,T>    operator+ (const Vec<N,T>& x, U y) { return x +  Vec<N,T>(y); }

SINTU Vec<N,T>    operator- (const Vec<N,T>& x, U y) { return x -  Vec<N,T>(y); }

SINTU Vec<N,T>    operator* (const Vec<N,T>& x, U y) { return x *  Vec<N,T>(y); }

SINTU Vec<N,T>    operator/ (const Vec<N,T>& x, U y) { return x /  Vec<N,T>(y); }

SINTU Vec<N,T>    operator^ (const Vec<N,T>& x, U y) { return x ^  Vec<N,T>(y); }

SINTU Vec<N,T>    operator& (const Vec<N,T>& x, U y) { return x &  Vec<N,T>(y); }

SINTU Vec<N,T>    operator| (const Vec<N,T>& x, U y) { return x |  Vec<N,T>(y); }

SINTU Vec<N,M<T>> operator==(const Vec<N,T>& x, U y) { return x == Vec<N,T>(y); }

SINTU Vec<N,M<T>> operator!=(const Vec<N,T>& x, U y) { return x != Vec<N,T>(y); }

SINTU Vec<N,M<T>> operator<=(const Vec<N,T>& x, U y) { return x <= Vec<N,T>(y); }

SINTU Vec<N,M<T>> operator>=(const Vec<N,T>& x, U y) { return x >= Vec<N,T>(y); }

SINTU Vec<N,M<T>> operator< (const Vec<N,T>& x, U y) { return x <  Vec<N,T>(y); }

SINTU Vec<N,M<T>> operator> (const Vec<N,T>& x, U y) { return x >  Vec<N,T>(y); }


SINT Vec<N,T>& operator+=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x + y); }

SINT Vec<N,T>& operator-=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x - y); }

SINT Vec<N,T>& operator*=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x * y); }

SINT Vec<N,T>& operator/=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x / y); }

SINT Vec<N,T>& operator^=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x ^ y); }

SINT Vec<N,T>& operator&=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x & y); }

SINT Vec<N,T>& operator|=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x | y); }


SINTU Vec<N,T>& operator+=(Vec<N,T>& x, U y) { return (x = x + Vec<N,T>(y)); }

SINTU Vec<N,T>& operator-=(Vec<N,T>& x, U y) { return (x = x - Vec<N,T>(y)); }

SINTU Vec<N,T>& operator*=(Vec<N,T>& x, U y) { return (x = x * Vec<N,T>(y)); }

SINTU Vec<N,T>& operator/=(Vec<N,T>& x, U y) { return (x = x / Vec<N,T>(y)); }

SINTU Vec<N,T>& operator^=(Vec<N,T>& x, U y) { return (x = x ^ Vec<N,T>(y)); }

SINTU Vec<N,T>& operator&=(Vec<N,T>& x, U y) { return (x = x & Vec<N,T>(y)); }

SINTU Vec<N,T>& operator|=(Vec<N,T>& x, U y) { return (x = x | Vec<N,T>(y)); }


SINT Vec<N,T>& operator<<=(Vec<N,T>& x, int bits) { return (x = x << bits); }

SINT Vec<N,T>& operator>>=(Vec<N,T>& x, int bits) { return (x = x >> bits); }


// Some operations we want are not expressible with Clang/GCC vector extensions.


// Clang can reason about naive_if_then_else() and optimize through it better

// than if_then_else(), so it's sometimes useful to call it directly when we

// think an entire expression should optimize away, e.g. min()/max().

SINT Vec<N,T> naive_if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {

    return sk_bit_cast<Vec<N,T>>(( cond & sk_bit_cast<Vec<N, M<T>>>(t)) |

                                 (~cond & sk_bit_cast<Vec<N, M<T>>>(e)) );

}


SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<1,T>& e) {

    // In practice this scalar implementation is unlikely to be used.  See next if_then_else().

    return sk_bit_cast<Vec<1,T>>(( cond & sk_bit_cast<Vec<1, M<T>>>(t)) |

                                 (~cond & sk_bit_cast<Vec<1, M<T>>>(e)) );

}

SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {

    // Specializations inline here so they can generalize what types the apply to.

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

    if constexpr (N*sizeof(T) == 32) {

        return sk_bit_cast<Vec<N,T>>(_mm256_blendv_epi8(sk_bit_cast<__m256i>(e),

                                                        sk_bit_cast<__m256i>(t),

                                                        sk_bit_cast<__m256i>(cond)));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

    if constexpr (N*sizeof(T) == 16) {

        return sk_bit_cast<Vec<N,T>>(_mm_blendv_epi8(sk_bit_cast<__m128i>(e),

                                                     sk_bit_cast<__m128i>(t),

                                                     sk_bit_cast<__m128i>(cond)));

    }

#endif

#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)

    if constexpr (N*sizeof(T) == 16) {

        return sk_bit_cast<Vec<N,T>>(vbslq_u8(sk_bit_cast<uint8x16_t>(cond),

                                              sk_bit_cast<uint8x16_t>(t),

                                              sk_bit_cast<uint8x16_t>(e)));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

    if constexpr (N*sizeof(T) == 32) {

        return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),

                                                       sk_bit_cast<__m256i>(t),

                                                       sk_bit_cast<__m256i>(cond)));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    if constexpr (N*sizeof(T) == 16) {

        return sk_bit_cast<Vec<N,T>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),

                                                     sk_bit_cast<__m128i>(t),

                                                     sk_bit_cast<__m128i>(cond)));

    }

#endif

    // Recurse for large vectors to try to hit the specializations above.

    if constexpr (N*sizeof(T) > 16) {

        return join(if_then_else(cond.lo, t.lo, e.lo),

                    if_then_else(cond.hi, t.hi, e.hi));

    }

    // This default can lead to better code than the recursing onto scalars.

    return naive_if_then_else(cond, t, e);

}


SIT  bool any(const Vec<1,T>& x) { return x.val != 0; }

SINT bool any(const Vec<N,T>& x) {

    // For any(), the _mm_testz intrinsics are correct and don't require comparing 'x' to 0, so it's

    // lower latency compared to _mm_movemask + _mm_compneq on plain SSE.

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

    if constexpr (N*sizeof(T) == 32) {

        return !_mm256_testz_si256(sk_bit_cast<__m256i>(x), _mm256_set1_epi32(-1));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

    if constexpr (N*sizeof(T) == 16) {

        return !_mm_testz_si128(sk_bit_cast<__m128i>(x), _mm_set1_epi32(-1));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1

    if constexpr (N*sizeof(T) == 16) {

        // On SSE, movemask checks only the MSB in each lane, which is fine if the lanes were set

        // directly from a comparison op (which sets all bits to 1 when true), but skvx::Vec<>

        // treats any non-zero value as true, so we have to compare 'x' to 0 before calling movemask

        return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) != 0b0000;

    }

#endif

#if SKVX_USE_SIMD && defined(__aarch64__)

    // On 64-bit NEON, take the max across lanes, which will be non-zero if any lane was true.

    // The specific lane-size doesn't really matter in this case since it's really any set bit

    // that we're looking for.

    if constexpr (N*sizeof(T) == 8 ) { return vmaxv_u8 (sk_bit_cast<uint8x8_t> (x)) > 0; }

    if constexpr (N*sizeof(T) == 16) { return vmaxvq_u8(sk_bit_cast<uint8x16_t>(x)) > 0; }

#endif

#if SKVX_USE_SIMD && defined(__wasm_simd128__)

    if constexpr (N == 4 && sizeof(T) == 4) {

        return wasm_i32x4_any_true(sk_bit_cast<VExt<4,int>>(x));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

    if constexpr (N*sizeof(T) == 32) {

        v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),

                                                              sk_bit_cast<__m256i>(x)));

        return (retv[0] | retv[4]) != 0b0000;

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    if constexpr (N*sizeof(T) == 16) {

        v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),

                                                          sk_bit_cast<__m128i>(x)));

        return retv[0] != 0b0000;

    }

#endif

    return any(x.lo)

        || any(x.hi);

}


SIT  bool all(const Vec<1,T>& x) { return x.val != 0; }

SINT bool all(const Vec<N,T>& x) {

// Unlike any(), we have to respect the lane layout, or we'll miss cases where a

// true lane has a mix of 0 and 1 bits.

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1

    // Unfortunately, the _mm_testc intrinsics don't let us avoid the comparison to 0 for all()'s

    // correctness, so always just use the plain SSE version.

    if constexpr (N == 4 && sizeof(T) == 4) {

        return _mm_movemask_ps(_mm_cmpneq_ps(sk_bit_cast<__m128>(x), _mm_set1_ps(0))) == 0b1111;

    }

#endif

#if SKVX_USE_SIMD && defined(__aarch64__)

    // On 64-bit NEON, take the min across the lanes, which will be non-zero if all lanes are != 0.

    if constexpr (sizeof(T)==1 && N==8)  {return vminv_u8  (sk_bit_cast<uint8x8_t> (x)) > 0;}

    if constexpr (sizeof(T)==1 && N==16) {return vminvq_u8 (sk_bit_cast<uint8x16_t>(x)) > 0;}

    if constexpr (sizeof(T)==2 && N==4)  {return vminv_u16 (sk_bit_cast<uint16x4_t>(x)) > 0;}

    if constexpr (sizeof(T)==2 && N==8)  {return vminvq_u16(sk_bit_cast<uint16x8_t>(x)) > 0;}

    if constexpr (sizeof(T)==4 && N==2)  {return vminv_u32 (sk_bit_cast<uint32x2_t>(x)) > 0;}

    if constexpr (sizeof(T)==4 && N==4)  {return vminvq_u32(sk_bit_cast<uint32x4_t>(x)) > 0;}

#endif

#if SKVX_USE_SIMD && defined(__wasm_simd128__)

    if constexpr (N == 4 && sizeof(T) == 4) {

        return wasm_i32x4_all_true(sk_bit_cast<VExt<4,int>>(x));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

    if constexpr (N == 8 && sizeof(T) == 4) {

        v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0),

                                                              sk_bit_cast<__m256i>(x)));

        return (retv[0] & retv[4]) == 0b1111;

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    if constexpr (N == 4 && sizeof(T) == 4) {

        v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0),

                                                          sk_bit_cast<__m128i>(x)));

        return retv[0] == 0b1111;

    }

#endif

    return all(x.lo)

        && all(x.hi);

}


// cast() Vec<N,S> to Vec<N,D>, as if applying a C-cast to each lane.

// TODO: implement with map()?

template <typename D, typename S>

SI Vec<1,D> cast(const Vec<1,S>& src) { return (D)src.val; }


template <typename D, int N, typename S>

SI Vec<N,D> cast(const Vec<N,S>& src) {

#if SKVX_USE_SIMD && defined(__clang__)

    return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));

#else

    return join(cast<D>(src.lo), cast<D>(src.hi));

#endif

}


// min/max match logic of std::min/std::max, which is important when NaN is involved.

SIT  T min(const Vec<1,T>& x) { return x.val; }

SIT  T max(const Vec<1,T>& x) { return x.val; }

SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }

SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }


SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(y < x, y, x); }

SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(x < y, y, x); }


SINTU Vec<N,T> min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }

SINTU Vec<N,T> max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }

SINTU Vec<N,T> min(U x, const Vec<N,T>& y) { return min(Vec<N,T>(x), y); }

SINTU Vec<N,T> max(U x, const Vec<N,T>& y) { return max(Vec<N,T>(x), y); }


// pin matches the logic of SkTPin, which is important when NaN is involved. It always returns

// values in the range lo..hi, and if x is NaN, it returns lo.

SINT Vec<N,T> pin(const Vec<N,T>& x, const Vec<N,T>& lo, const Vec<N,T>& hi) {

    return max(lo, min(x, hi));

}


// Shuffle values from a vector pretty arbitrarily:

//    skvx::Vec<4,float> rgba = {R,G,B,A};

//    shuffle<2,1,0,3>        (rgba) ~> {B,G,R,A}

//    shuffle<2,1>            (rgba) ~> {B,G}

//    shuffle<2,1,2,1,2,1,2,1>(rgba) ~> {B,G,B,G,B,G,B,G}

//    shuffle<3,3,3,3>        (rgba) ~> {A,A,A,A}

// The only real restriction is that the output also be a legal N=power-of-two sknx::Vec.

template <int... Ix, int N, typename T>

SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {

#if SKVX_USE_SIMD && defined(__clang__)

    // TODO: can we just always use { x[Ix]... }?

    return to_vec<sizeof...(Ix),T>(__builtin_shufflevector(to_vext(x), to_vext(x), Ix...));

#else

    return { x[Ix]... };

#endif

}


// Call map(fn, x) for a vector with fn() applied to each lane of x, { fn(x[0]), fn(x[1]), ... },

// or map(fn, x,y) for a vector of fn(x[i], y[i]), etc.


template <typename Fn, typename... Args, size_t... I>

SI auto map(std::index_sequence<I...>,

            Fn&& fn, const Args&... args) -> skvx::Vec<sizeof...(I), decltype(fn(args[0]...))> {

    auto lane = [&](size_t i)

#if defined(__clang__)

    // CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,

    // with errors like "control flow integrity check for type 'float (float)

    // noexcept' failed during indirect function call... note: sqrtf.cfi_jt defined

    // here".  But we can be quite sure fn is the right type: it's all inferred!

    // So, stifle CFI in this function.

    __attribute__((no_sanitize("cfi")))

#endif

    { return fn(args[static_cast<int>(i)]...); };


    return { lane(I)... };

}


template <typename Fn, int N, typename T, typename... Rest>

auto map(Fn&& fn, const Vec<N,T>& first, const Rest&... rest) {

    // Derive an {0...N-1} index_sequence from the size of the first arg: N lanes in, N lanes out.

    return map(std::make_index_sequence<N>{}, fn, first,rest...);

}


SIN Vec<N,float>  ceil(const Vec<N,float>& x) { return map( ceilf, x); }

SIN Vec<N,float> floor(const Vec<N,float>& x) { return map(floorf, x); }

SIN Vec<N,float> trunc(const Vec<N,float>& x) { return map(truncf, x); }

SIN Vec<N,float> round(const Vec<N,float>& x) { return map(roundf, x); }

SIN Vec<N,float>  sqrt(const Vec<N,float>& x) { return map( sqrtf, x); }

SIN Vec<N,float>   abs(const Vec<N,float>& x) { return map( fabsf, x); }

SIN Vec<N,float>   fma(const Vec<N,float>& x,

                       const Vec<N,float>& y,

                       const Vec<N,float>& z) {

    // I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.

    auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };

    return map(fn, x,y,z);

}


SI Vec<1,int> lrint(const Vec<1,float>& x) {

    return (int)lrintf(x.val);

}

SIN Vec<N,int> lrint(const Vec<N,float>& x) {

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX

    if constexpr (N == 8) {

        return sk_bit_cast<Vec<N,int>>(_mm256_cvtps_epi32(sk_bit_cast<__m256>(x)));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1

    if constexpr (N == 4) {

        return sk_bit_cast<Vec<N,int>>(_mm_cvtps_epi32(sk_bit_cast<__m128>(x)));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

    if constexpr (N == 8) {

        return sk_bit_cast<Vec<N,int>>(__lasx_xvftint_w_s(sk_bit_cast<__m256>(x)));

    }

#endif

#if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    if constexpr (N == 4) {

        return sk_bit_cast<Vec<N,int>>(__lsx_vftint_w_s(sk_bit_cast<__m128>(x)));

    }

#endif

    return join(lrint(x.lo),

                lrint(x.hi));

}


SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }


// Converts float to half, rounding to nearest even, and supporting de-normal f16 conversion,

// and overflow to f16 infinity. Should not be called with NaNs, since it can convert NaN->inf.

// KEEP IN SYNC with skcms' Half_from_F to ensure that f16 colors are computed consistently in both

// skcms and skvx.

SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {

    assert(all(x == x)); // No NaNs should reach this function


    // Intrinsics for float->half tend to operate on 4 lanes, and the default implementation has

    // enough instructions that it's better to split and join on 128 bits groups vs.

    // recursing for each min/max/shift/etc.

    if constexpr (N > 4) {

        return join(to_half(x.lo),

                    to_half(x.hi));

    }


#if SKVX_USE_SIMD && defined(__aarch64__)

    if constexpr (N == 4) {

        return sk_bit_cast<Vec<N,uint16_t>>(vcvt_f16_f32(sk_bit_cast<float32x4_t>(x)));


    }

#endif


#define I(x) sk_bit_cast<Vec<N,int32_t>>(x)

#define F(x) sk_bit_cast<Vec<N,float>>(x)

    Vec<N,int32_t> sem = I(x),

                   s   = sem & 0x8000'0000,

                    em = min(sem ^ s, 0x4780'0000), // |x| clamped to f16 infinity

                 // F(em)*8192 increases the exponent by 13, which when added back to em will shift

                 // the mantissa bits 13 to the right. We clamp to 1/2 for subnormal values, which

                 // automatically shifts the mantissa to match 2^-14 expected for a subnorm f16.

                 magic = I(max(F(em) * 8192.f, 0.5f)) & (255 << 23),

               rounded = I((F(em) + F(magic))), // shift mantissa with automatic round-to-even

                   // Subtract 127 for f32 bias, subtract 13 to undo the *8192, subtract 1 to remove

                   // the implicit leading 1., and add 15 to get the f16 biased exponent.

                   exp = ((magic >> 13) - ((127-15+13+1)<<10)), // shift and re-bias exponent

                   f16 = rounded + exp; // use + if 'rounded' rolled over into first exponent bit

    return cast<uint16_t>((s>>16) | f16);

#undef I

#undef F

}


// Converts from half to float, preserving NaN and +/- infinity.

// KEEP IN SYNC with skcms' F_from_Half to ensure that f16 colors are computed consistently in both

// skcms and skvx.

SIN Vec<N,float> from_half(const Vec<N,uint16_t>& x) {

    if constexpr (N > 4) {

        return join(from_half(x.lo),

                    from_half(x.hi));

    }


#if SKVX_USE_SIMD && defined(__aarch64__)

    if constexpr (N == 4) {

        return sk_bit_cast<Vec<N,float>>(vcvt_f32_f16(sk_bit_cast<float16x4_t>(x)));

    }

#endif


    Vec<N,int32_t> wide = cast<int32_t>(x),

                      s  = wide & 0x8000,

                      em = wide ^ s,

              inf_or_nan =  (em >= (31 << 10)) & (255 << 23),  // Expands exponent to fill 8 bits

                 is_norm =   em > 0x3ff,

                     // subnormal f16's are 2^-14*0.[m0:9] == 2^-24*[m0:9].0

                     sub = sk_bit_cast<Vec<N,int32_t>>((cast<float>(em) * (1.f/(1<<24)))),

                    norm = ((em<<13) + ((127-15)<<23)), // Shifts mantissa, shifts + re-biases exp

                  finite = (is_norm & norm) | (~is_norm & sub);

    // If 'x' is f16 +/- infinity, inf_or_nan will be the filled 8-bit exponent but 'norm' will be

    // all 0s since 'x's mantissa is 0. Thus norm | inf_or_nan becomes f32 infinity. However, if

    // 'x' is an f16 NaN, some bits of 'norm' will be non-zero, so it stays an f32 NaN after the OR.

    return sk_bit_cast<Vec<N,float>>((s<<16) | finite | inf_or_nan);

}


// div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.

SIN Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {

    return cast<uint8_t>( (x+127)/255 );

}


// approx_scale(x,y) approximates div255(cast<uint16_t>(x)*cast<uint16_t>(y)) within a bit,

// and is always perfect when x or y is 0 or 255.

SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {

    // All of (x*y+x)/256, (x*y+y)/256, and (x*y+255)/256 meet the criteria above.

    // We happen to have historically picked (x*y+x)/256.

    auto X = cast<uint16_t>(x),

         Y = cast<uint16_t>(y);

    return cast<uint8_t>( (X*Y+X)/256 );

}


// saturated_add(x,y) sums values and clamps to the maximum value instead of overflowing.

SINT std::enable_if_t<std::is_unsigned_v<T>, Vec<N,T>> saturated_add(const Vec<N,T>& x,

                                                                     const Vec<N,T>& y) {

#if SKVX_USE_SIMD && (SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 || defined(SK_ARM_HAS_NEON) || \

        SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX)

    // Both SSE and ARM have 16-lane saturated adds, so use intrinsics for those and recurse down

    // or join up to take advantage.

    if constexpr (N == 16 && sizeof(T) == 1) {

        #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1

        return sk_bit_cast<Vec<N,T>>(_mm_adds_epu8(sk_bit_cast<__m128i>(x),

                                                   sk_bit_cast<__m128i>(y)));

        #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

        return sk_bit_cast<Vec<N,T>>(__lsx_vsadd_bu(sk_bit_cast<__m128i>(x),

                                                    sk_bit_cast<__m128i>(y)));

        #else  // SK_ARM_HAS_NEON

        return sk_bit_cast<Vec<N,T>>(vqaddq_u8(sk_bit_cast<uint8x16_t>(x),

                                               sk_bit_cast<uint8x16_t>(y)));

        #endif

    } else if constexpr (N < 16 && sizeof(T) == 1) {

        return saturated_add(join(x,x), join(y,y)).lo;

    } else if constexpr (sizeof(T) == 1) {

        return join(saturated_add(x.lo, y.lo), saturated_add(x.hi, y.hi));

    }

#endif

    // Otherwise saturate manually

    auto sum = x + y;

    return if_then_else(sum < x, Vec<N,T>(std::numeric_limits<T>::max()), sum);

}


// The ScaledDividerU32 takes a divisor > 1, and creates a function divide(numerator) that

// calculates a numerator / denominator. For this to be rounded properly, numerator should have

// half added in:

// divide(numerator + half) == floor(numerator/denominator + 1/2).

//

// This gives an answer within +/- 1 from the true value.

//

// Derivation of half:

//    numerator/denominator + 1/2 = (numerator + half) / d

//    numerator + denominator / 2 = numerator + half

//    half = denominator / 2.

//

// Because half is divided by 2, that division must also be rounded.

//    half == denominator / 2 = (denominator + 1) / 2.

//

// The divisorFactor is just a scaled value:

//    divisorFactor = (1 / divisor) * 2 ^ 32.

// The maximum that can be divided and rounded is UINT_MAX - half.

class ScaledDividerU32 {

public:

    explicit ScaledDividerU32(uint32_t divisor)

            : fDivisorFactor{(uint32_t)(std::round((1.0 / divisor) * (1ull << 32)))}

            , fHalf{(divisor + 1) >> 1} {

        assert(divisor > 1);

    }


    Vec<4, uint32_t> divide(const Vec<4, uint32_t>& numerator) const {

#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)

        uint64x2_t hi = vmull_n_u32(vget_high_u32(to_vext(numerator)), fDivisorFactor);

        uint64x2_t lo = vmull_n_u32(vget_low_u32(to_vext(numerator)),  fDivisorFactor);


        return to_vec<4, uint32_t>(vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)));

#else

        return cast<uint32_t>((cast<uint64_t>(numerator) * fDivisorFactor) >> 32);

#endif

    }


    uint32_t half() const { return fHalf; }


private:

    const uint32_t fDivisorFactor;

    const uint32_t fHalf;

};


SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,

                         const Vec<N,uint8_t>& y) {

#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)

    // With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).

    if constexpr (N == 8) {

        return to_vec<8,uint16_t>(vmull_u8(to_vext(x), to_vext(y)));

    } else if constexpr (N < 8) {

        return mull(join(x,x), join(y,y)).lo;

    } else { // N > 8

        return join(mull(x.lo, y.lo), mull(x.hi, y.hi));

    }

#else

    return cast<uint16_t>(x) * cast<uint16_t>(y);

#endif

}


SIN Vec<N,uint32_t> mull(const Vec<N,uint16_t>& x,

                         const Vec<N,uint16_t>& y) {

#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)

    // NEON can do four u16*u16 -> u32 in one instruction, vmull_u16

    if constexpr (N == 4) {

        return to_vec<4,uint32_t>(vmull_u16(to_vext(x), to_vext(y)));

    } else if constexpr (N < 4) {

        return mull(join(x,x), join(y,y)).lo;

    } else { // N > 4

        return join(mull(x.lo, y.lo), mull(x.hi, y.hi));

    }

#else

    return cast<uint32_t>(x) * cast<uint32_t>(y);

#endif

}


SIN Vec<N,uint16_t> mulhi(const Vec<N,uint16_t>& x,

                          const Vec<N,uint16_t>& y) {

#if SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1

    // Use _mm_mulhi_epu16 for 8xuint16_t and join or split to get there.

    if constexpr (N == 8) {

        return sk_bit_cast<Vec<8,uint16_t>>(_mm_mulhi_epu16(sk_bit_cast<__m128i>(x),

                                                            sk_bit_cast<__m128i>(y)));

    } else if constexpr (N < 8) {

        return mulhi(join(x,x), join(y,y)).lo;

    } else { // N > 8

        return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));

    }

#elif SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    if constexpr (N == 8) {

        return sk_bit_cast<Vec<8,uint16_t>>(__lsx_vmuh_hu(sk_bit_cast<__m128i>(x),

                                                          sk_bit_cast<__m128i>(y)));

    } else if constexpr (N < 8) {

        return mulhi(join(x,x), join(y,y)).lo;

    } else { // N > 8

        return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));

    }

#else

    return skvx::cast<uint16_t>(mull(x, y) >> 16);

#endif

}


SINT T dot(const Vec<N, T>& a, const Vec<N, T>& b) {

    // While dot is a "horizontal" operation like any or all, it needs to remain

    // in floating point and there aren't really any good SIMD instructions that make it faster.

    // The constexpr cases remove the for loop in the only cases we realistically call.

    auto ab = a*b;

    if constexpr (N == 2) {

        return ab[0] + ab[1];

    } else if constexpr (N == 4) {

        return ab[0] + ab[1] + ab[2] + ab[3];

    } else {

        T sum = ab[0];

        for (int i = 1; i < N; ++i) {

            sum += ab[i];

        }

        return sum;

    }

}


SIT T cross(const Vec<2, T>& a, const Vec<2, T>& b) {

    auto x = a * shuffle<1,0>(b);

    return x[0] - x[1];

}


SIN float length(const Vec<N, float>& v) {

    return std::sqrt(dot(v, v));

}


SIN double length(const Vec<N, double>& v) {

    return std::sqrt(dot(v, v));

}


SIN Vec<N, float> normalize(const Vec<N, float>& v) {

    return v / length(v);

}


SIN Vec<N, double> normalize(const Vec<N, double>& v) {

    return v / length(v);

}


SINT bool isfinite(const Vec<N, T>& v) {

    // Multiply all values together with 0. If they were all finite, the output is

    // 0 (also finite). If any were not, we'll get nan.

    return SkIsFinite(dot(v, Vec<N, T>(0)));

}


// De-interleaving load of 4 vectors.

//

// WARNING: These are really only supported well on NEON. Consider restructuring your data before

// resorting to these methods.

SIT void strided_load4(const T* v,

                       Vec<1,T>& a,

                       Vec<1,T>& b,

                       Vec<1,T>& c,

                       Vec<1,T>& d) {

    a.val = v[0];

    b.val = v[1];

    c.val = v[2];

    d.val = v[3];

}

SINT void strided_load4(const T* v,

                        Vec<N,T>& a,

                        Vec<N,T>& b,

                        Vec<N,T>& c,

                        Vec<N,T>& d) {

    strided_load4(v, a.lo, b.lo, c.lo, d.lo);

    strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);

}

#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)

#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \

SI void strided_load4(const T* v, \

                      Vec<N,T>& a, \

                      Vec<N,T>& b, \

                      Vec<N,T>& c, \

                      Vec<N,T>& d) { \

    auto mat = VLD(v); \

    a = sk_bit_cast<Vec<N,T>>(mat.val[0]); \

    b = sk_bit_cast<Vec<N,T>>(mat.val[1]); \

    c = sk_bit_cast<Vec<N,T>>(mat.val[2]); \

    d = sk_bit_cast<Vec<N,T>>(mat.val[3]); \

}

IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32)

IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16)

IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8)

IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32)

IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16)

IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8)

IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32)

IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32)

IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16)

IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8)

IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32)

IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16)

IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8)

IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32)

#undef IMPL_LOAD4_TRANSPOSED


#elif SKVX_USE_SIMD && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1


SI void strided_load4(const float* v,

                      Vec<4,float>& a,

                      Vec<4,float>& b,

                      Vec<4,float>& c,

                      Vec<4,float>& d) {

    __m128 a_ = _mm_loadu_ps(v);

    __m128 b_ = _mm_loadu_ps(v+4);

    __m128 c_ = _mm_loadu_ps(v+8);

    __m128 d_ = _mm_loadu_ps(v+12);

    _MM_TRANSPOSE4_PS(a_, b_, c_, d_);

    a = sk_bit_cast<Vec<4,float>>(a_);

    b = sk_bit_cast<Vec<4,float>>(b_);

    c = sk_bit_cast<Vec<4,float>>(c_);

    d = sk_bit_cast<Vec<4,float>>(d_);

}


#elif SKVX_USE_SIMD && SKVX_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

#define _LSX_TRANSPOSE4(row0, row1, row2, row3) \

do {                                            \

    __m128i __t0 = __lsx_vilvl_w (row1, row0);  \

    __m128i __t1 = __lsx_vilvl_w (row3, row2);  \

    __m128i __t2 = __lsx_vilvh_w (row1, row0);  \

    __m128i __t3 = __lsx_vilvh_w (row3, row2);  \

    (row0) = __lsx_vilvl_d (__t1, __t0);        \

    (row1) = __lsx_vilvh_d (__t1, __t0);        \

    (row2) = __lsx_vilvl_d (__t3, __t2);        \

    (row3) = __lsx_vilvh_d (__t3, __t2);        \

} while (0)


SI void strided_load4(const int* v,

                      Vec<4,int>& a,

                      Vec<4,int>& b,

                      Vec<4,int>& c,

                      Vec<4,int>& d) {

    __m128i a_ = __lsx_vld(v, 0);

    __m128i b_ = __lsx_vld(v, 16);

    __m128i c_ = __lsx_vld(v, 32);

    __m128i d_ = __lsx_vld(v, 48);

    _LSX_TRANSPOSE4(a_, b_, c_, d_);

    a = sk_bit_cast<Vec<4,int>>(a_);

    b = sk_bit_cast<Vec<4,int>>(b_);

    c = sk_bit_cast<Vec<4,int>>(c_);

    d = sk_bit_cast<Vec<4,int>>(d_);

}

#endif


// De-interleaving load of 2 vectors.

//

// WARNING: These are really only supported well on NEON. Consider restructuring your data before

// resorting to these methods.

SIT void strided_load2(const T* v, Vec<1,T>& a, Vec<1,T>& b) {

    a.val = v[0];

    b.val = v[1];

}

SINT void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) {

    strided_load2(v, a.lo, b.lo);

    strided_load2(v + 2*(N/2), a.hi, b.hi);

}

#if SKVX_USE_SIMD && defined(SK_ARM_HAS_NEON)

#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \

SI void strided_load2(const T* v, Vec<N,T>& a, Vec<N,T>& b) { \

    auto mat = VLD(v); \

    a = sk_bit_cast<Vec<N,T>>(mat.val[0]); \

    b = sk_bit_cast<Vec<N,T>>(mat.val[1]); \

}

IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32)

IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16)

IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8)

IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32)

IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16)

IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8)

IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32)

IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32)

IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16)

IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8)

IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32)

IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16)

IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8)

IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32)

#undef IMPL_LOAD2_TRANSPOSED

#endif


// Define commonly used aliases

using float2  = Vec< 2, float>;

using float4  = Vec< 4, float>;

using float8  = Vec< 8, float>;


using double2 = Vec< 2, double>;

using double4 = Vec< 4, double>;

using double8 = Vec< 8, double>;


using byte2   = Vec< 2, uint8_t>;

using byte4   = Vec< 4, uint8_t>;

using byte8   = Vec< 8, uint8_t>;

using byte16  = Vec<16, uint8_t>;


using int2    = Vec< 2, int32_t>;

using int4    = Vec< 4, int32_t>;

using int8    = Vec< 8, int32_t>;


using ushort2 = Vec< 2, uint16_t>;

using ushort4 = Vec< 4, uint16_t>;

using ushort8 = Vec< 8, uint16_t>;


using uint2   = Vec< 2, uint32_t>;

using uint4   = Vec< 4, uint32_t>;

using uint8   = Vec< 8, uint32_t>;


using long2   = Vec< 2, int64_t>;

using long4   = Vec< 4, int64_t>;

using long8   = Vec< 8, int64_t>;


// Use with from_half and to_half to convert between floatX, and use these for storage.

using half2   = Vec< 2, uint16_t>;

using half4   = Vec< 4, uint16_t>;

using half8   = Vec< 8, uint16_t>;


}  // namespace skvx


#undef SINTU

#undef SINT

#undef SIN

#undef SIT

#undef SI

#undef SKVX_ALWAYS_INLINE

#undef SKVX_USE_SIMD


#endif//SKVX_DEFINED

f16
static const uint64_t f16[kNumPixels]
Definition: ReadPixelsTest.cpp:424

SkFeatures.h

SkIsFinite
static bool SkIsFinite(T x, Pack... values)
Definition: SkFloatingPoint.h:50

SkUtils.h

sk_bit_cast
static SK_ALWAYS_INLINE Dst SK_FP_SAFE_ABI sk_bit_cast(const Src &src)
Definition: SkUtils.h:68

SKVX_ALWAYS_INLINE
#define SKVX_ALWAYS_INLINE
Definition: SkVx.h:62

SIT
#define SIT
Definition: SkVx.h:67

F
#define F(x)

I
#define I(x)

SI
#define SI
Definition: SkVx.h:66

SINTU
#define SINTU
Definition: SkVx.h:70

SIN
#define SIN
Definition: SkVx.h:68

SINT
#define SINT
Definition: SkVx.h:69

Y
static const SkScalar Y
Definition: StrokeBench.cpp:55

X
static const SkScalar X
Definition: StrokeBench.cpp:54

N
#define N
Definition: beziers.cpp:19

type
GLenum type
Definition: blit_command_gles.cc:126

D
Definition: GrMemoryPoolTest.cpp:109

skvx::ScaledDividerU32
Definition: SkVx.h:879

skvx::ScaledDividerU32::divide
Vec< 4, uint32_t > divide(const Vec< 4, uint32_t > &numerator) const
Definition: SkVx.h:887

skvx::ScaledDividerU32::ScaledDividerU32
ScaledDividerU32(uint32_t divisor)
Definition: SkVx.h:881

skvx::ScaledDividerU32::half
uint32_t half() const
Definition: SkVx.h:898

begin
static const char * begin(const StringSlice &s)
Definition: editor.cpp:252

d
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE auto & d
Definition: main.cc:19

b
static bool b
Definition: ffi_native_test_module.c:74

s
struct MyStruct s

a
struct MyStruct a[10]

args
G_BEGIN_DECLS G_MODULE_EXPORT FlValue * args
Definition: fl_event_channel.h:89

i
int i
Definition: fl_socket_accessible.cc:18

max
static float max(float r, float g, float b)
Definition: hsl.cpp:49

min
static float min(float r, float g, float b)
Definition: hsl.cpp:48

__attribute__
__attribute__((visibility("default"))) int RunBenchmarks(int argc

y
double y
Definition: mouse-input-test.cc:83

x
double x
Definition: mouse-input-test.cc:82

ab
Definition: ab.py:1

dart::compiler::U
@ U
Definition: assembler_arm.h:71

flutter::size
it will be possible to load the file into Perfetto s trace viewer disable asset Prevents usage of any non test fonts unless they were explicitly Loaded via prefetched default font Indicates whether the embedding started a prefetch of the default font manager before creating the engine run In non interactive keep the shell running after the Dart script has completed enable serial On low power devices with low core running concurrent GC tasks on threads can cause them to contend with the UI thread which could potentially lead to jank This option turns off all concurrent GC activities domain network JSON encoded network policy per domain This overrides the DisallowInsecureConnections switch Embedder can specify whether to allow or disallow insecure connections at a domain level old gen heap size
Definition: switches.h:259

mskp_parser.magic
magic
Definition: mskp_parser.py:25

mskp_parser.src
src
Definition: mskp_parser.py:22

protoc_wrapper.e
e
Definition: protoc_wrapper.py:226

skcms_private::bits
uint32_t bits
Definition: skcms_Transform.h:120

skvx
Definition: SkVx.h:73

skvx::isfinite
SINT bool isfinite(const Vec< N, T > &v)
Definition: SkVx.h:1003

skvx::trunc
SIN Vec< N, float > trunc(const Vec< N, float > &x)
Definition: SkVx.h:704

skvx::dot
SINT T dot(const Vec< N, T > &a, const Vec< N, T > &b)
Definition: SkVx.h:964

skvx::operator-=
SINT Vec< N, T > & operator-=(Vec< N, T > &x, const Vec< N, T > &y)
Definition: SkVx.h:451

skvx::if_then_else
SIT Vec< 1, T > if_then_else(const Vec< 1, M< T > > &cond, const Vec< 1, T > &t, const Vec< 1, T > &e)
Definition: SkVx.h:479

skvx::lrint
SI Vec< 1, int > lrint(const Vec< 1, float > &x)
Definition: SkVx.h:716

skvx::operator^
SIT Vec< 1, T > operator^(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:340

skvx::fma
SIN Vec< N, float > fma(const Vec< N, float > &x, const Vec< N, float > &y, const Vec< N, float > &z)
Definition: SkVx.h:708

skvx::operator+
SIT Vec< 1, T > operator+(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:335

skvx::operator^=
SINT Vec< N, T > & operator^=(Vec< N, T > &x, const Vec< N, T > &y)
Definition: SkVx.h:454

skvx::operator<=
SIT Vec< 1, M< T > > operator<=(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:357

skvx::operator|
SIT Vec< 1, T > operator|(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:342

skvx::operator*
SIT Vec< 1, T > operator*(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:337

skvx::naive_if_then_else
SINT Vec< N, T > naive_if_then_else(const Vec< N, M< T > > &cond, const Vec< N, T > &t, const Vec< N, T > &e)
Definition: SkVx.h:474

skvx::operator==
SIT Vec< 1, M< T > > operator==(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:351

skvx::round
SIN Vec< N, float > round(const Vec< N, float > &x)
Definition: SkVx.h:705

skvx::cast
SI Vec< 1, D > cast(const Vec< 1, S > &src)
Definition: SkVx.h:628

skvx::strided_load4
SIT void strided_load4(const T *v, Vec< 1, T > &a, Vec< 1, T > &b, Vec< 1, T > &c, Vec< 1, T > &d)
Definition: SkVx.h:1013

skvx::operator|=
SINT Vec< N, T > & operator|=(Vec< N, T > &x, const Vec< N, T > &y)
Definition: SkVx.h:456

skvx::mulhi
SIN Vec< N, uint16_t > mulhi(const Vec< N, uint16_t > &x, const Vec< N, uint16_t > &y)
Definition: SkVx.h:938

skvx::abs
SIN Vec< N, float > abs(const Vec< N, float > &x)
Definition: SkVx.h:707

skvx::operator*=
SINT Vec< N, T > & operator*=(Vec< N, T > &x, const Vec< N, T > &y)
Definition: SkVx.h:452

skvx::strided_load2
SIT void strided_load2(const T *v, Vec< 1, T > &a, Vec< 1, T > &b)
Definition: SkVx.h:1112

skvx::operator>=
SIT Vec< 1, M< T > > operator>=(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:360

skvx::sqrt
SIN Vec< N, float > sqrt(const Vec< N, float > &x)
Definition: SkVx.h:706

skvx::normalize
SIN Vec< N, float > normalize(const Vec< N, float > &v)
Definition: SkVx.h:995

skvx::M
typename Mask< T >::type M
Definition: SkVx.h:239

skvx::join
SINT Vec< 2 *N, T > join(const Vec< N, T > &lo, const Vec< N, T > &hi)
Definition: SkVx.h:242

skvx::operator~
SIT Vec< 1, T > operator~(const Vec< 1, T > &x)
Definition: SkVx.h:346

skvx::mull
SIN Vec< N, uint16_t > mull(const Vec< N, uint8_t > &x, const Vec< N, uint8_t > &y)
Definition: SkVx.h:906

skvx::operator>>=
SINT Vec< N, T > & operator>>=(Vec< N, T > &x, int bits)
Definition: SkVx.h:467

skvx::operator-
SIT Vec< 1, T > operator-(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:336

skvx::operator!=
SIT Vec< 1, M< T > > operator!=(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:354

skvx::from_half
SIN Vec< N, float > from_half(const Vec< N, uint16_t > &x)
Definition: SkVx.h:790

skvx::div255
SIN Vec< N, uint8_t > div255(const Vec< N, uint16_t > &x)
Definition: SkVx.h:818

skvx::operator>
SIT Vec< 1, M< T > > operator>(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:366

skvx::to_half
SIN Vec< N, uint16_t > to_half(const Vec< N, float > &x)
Definition: SkVx.h:750

skvx::all
SIT bool all(const Vec< 1, T > &x)
Definition: SkVx.h:582

skvx::operator<
SIT Vec< 1, M< T > > operator<(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:363

skvx::saturated_add
SINT std::enable_if_t< std::is_unsigned_v< T >, Vec< N, T > > saturated_add(const Vec< N, T > &x, const Vec< N, T > &y)
Definition: SkVx.h:833

skvx::map
SI auto map(std::index_sequence< I... >, Fn &&fn, const Args &... args) -> skvx::Vec< sizeof...(I), decltype(fn(args[0]...))>
Definition: SkVx.h:680

skvx::max
SIT T max(const Vec< 1, T > &x)
Definition: SkVx.h:641

skvx::operator>>
SIT Vec< 1, T > operator>>(const Vec< 1, T > &x, int k)
Definition: SkVx.h:349

skvx::operator!
SIT Vec< 1, T > operator!(const Vec< 1, T > &x)
Definition: SkVx.h:344

skvx::operator/=
SINT Vec< N, T > & operator/=(Vec< N, T > &x, const Vec< N, T > &y)
Definition: SkVx.h:453

skvx::shuffle
SI Vec< sizeof...(Ix), T > shuffle(const Vec< N, T > &)
Definition: SkVx.h:667

skvx::operator&=
SINT Vec< N, T > & operator&=(Vec< N, T > &x, const Vec< N, T > &y)
Definition: SkVx.h:455

skvx::operator&
SIT Vec< 1, T > operator&(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:341

skvx::min
SIT T min(const Vec< 1, T > &x)
Definition: SkVx.h:640

skvx::operator+=
SINT Vec< N, T > & operator+=(Vec< N, T > &x, const Vec< N, T > &y)
Definition: SkVx.h:450

skvx::fract
SIN Vec< N, float > fract(const Vec< N, float > &x)
Definition: SkVx.h:744

skvx::floor
SIN Vec< N, float > floor(const Vec< N, float > &x)
Definition: SkVx.h:703

skvx::operator<<
SIT Vec< 1, T > operator<<(const Vec< 1, T > &x, int k)
Definition: SkVx.h:348

skvx::any
SIT bool any(const Vec< 1, T > &x)
Definition: SkVx.h:530

skvx::ceil
SIN Vec< N, float > ceil(const Vec< N, float > &x)
Definition: SkVx.h:702

skvx::operator<<=
SINT Vec< N, T > & operator<<=(Vec< N, T > &x, int bits)
Definition: SkVx.h:466

skvx::length
SIN float length(const Vec< N, float > &v)
Definition: SkVx.h:987

skvx::approx_scale
SIN Vec< N, uint8_t > approx_scale(const Vec< N, uint8_t > &x, const Vec< N, uint8_t > &y)
Definition: SkVx.h:824

skvx::cross
SIT T cross(const Vec< 2, T > &a, const Vec< 2, T > &b)
Definition: SkVx.h:982

skvx::operator/
SIT Vec< 1, T > operator/(const Vec< 1, T > &x, const Vec< 1, T > &y)
Definition: SkVx.h:338

skvx::pin
SINT Vec< N, T > pin(const Vec< N, T > &x, const Vec< N, T > &lo, const Vec< N, T > &hi)
Definition: SkVx.h:655

std
Definition: ref_ptr.h:256

tools.skpbench.skpbench.float
float
Definition: skpbench.py:42

w
SkScalar w
Definition: pictureshadertile.cpp:30

T
#define T
Definition: precompiler.cc:65

I
Definition: SkMD5.cpp:134

skvx::Mask< double >::type
int64_t type
Definition: SkVx.h:238

skvx::Mask< float >::type
int32_t type
Definition: SkVx.h:237

skvx::Mask
Definition: SkVx.h:236

skvx::Mask::type
T type
Definition: SkVx.h:236

skvx::Vec< 1, T >
Definition: SkVx.h:214

skvx::Vec< 1, T >::operator[]
SKVX_ALWAYS_INLINE T operator[](int i) const
Definition: SkVx.h:224

skvx::Vec< 1, T >::Vec
SKVX_ALWAYS_INLINE Vec(std::initializer_list< T > xs)
Definition: SkVx.h:220

skvx::Vec< 1, T >::Load
static SKVX_ALWAYS_INLINE Vec Load(const void *ptr)
Definition: SkVx.h:227

skvx::Vec< 1, T >::operator[]
SKVX_ALWAYS_INLINE T & operator[](int i)
Definition: SkVx.h:225

skvx::Vec< 1, T >::Vec
SKVX_ALWAYS_INLINE Vec(T s)
Definition: SkVx.h:218

skvx::Vec< 1, T >::val
T val
Definition: SkVx.h:215

skvx::Vec< 1, T >::Vec
SKVX_ALWAYS_INLINE Vec()=default

skvx::Vec< 1, T >::store
SKVX_ALWAYS_INLINE void store(void *ptr) const
Definition: SkVx.h:230

skvx::Vec< 2, T >
Definition: SkVx.h:174

skvx::Vec< 2, T >::x
SKVX_ALWAYS_INLINE T x() const
Definition: SkVx.h:203

skvx::Vec< 2, T >::operator[]
SKVX_ALWAYS_INLINE T & operator[](int i)
Definition: SkVx.h:191

skvx::Vec< 2, T >::yx
SKVX_ALWAYS_INLINE Vec< 2, T > yx() const
Definition: SkVx.h:207

skvx::Vec< 2, T >::y
SKVX_ALWAYS_INLINE T y() const
Definition: SkVx.h:204

skvx::Vec< 2, T >::x
SKVX_ALWAYS_INLINE T & x()
Definition: SkVx.h:200

skvx::Vec< 2, T >::store
SKVX_ALWAYS_INLINE void store(void *ptr) const
Definition: SkVx.h:196

skvx::Vec< 2, T >::Vec
SKVX_ALWAYS_INLINE Vec()=default

skvx::Vec< 2, T >::y
SKVX_ALWAYS_INLINE T & y()
Definition: SkVx.h:201

skvx::Vec< 2, T >::Vec
SKVX_ALWAYS_INLINE Vec(T x, T y)
Definition: SkVx.h:179

skvx::Vec< 2, T >::hi
Vec< 1, T > hi
Definition: SkVx.h:210

skvx::Vec< 2, T >::Vec
SKVX_ALWAYS_INLINE Vec(T s)
Definition: SkVx.h:178

skvx::Vec< 2, T >::xyxy
SKVX_ALWAYS_INLINE Vec< 4, T > xyxy() const
Definition: SkVx.h:208

skvx::Vec< 2, T >::Load
static SKVX_ALWAYS_INLINE Vec Load(const void *ptr)
Definition: SkVx.h:193

skvx::Vec< 2, T >::operator[]
SKVX_ALWAYS_INLINE T operator[](int i) const
Definition: SkVx.h:190

skvx::Vec< 2, T >::Vec
SKVX_ALWAYS_INLINE Vec(std::initializer_list< T > xs)
Definition: SkVx.h:181

skvx::Vec< 4, T >
Definition: SkVx.h:123

skvx::Vec< 4, T >::w
SKVX_ALWAYS_INLINE T w() const
Definition: SkVx.h:164

skvx::Vec< 4, T >::x
SKVX_ALWAYS_INLINE T & x()
Definition: SkVx.h:154

skvx::Vec< 4, T >::store
SKVX_ALWAYS_INLINE void store(void *ptr) const
Definition: SkVx.h:148

skvx::Vec< 4, T >::Vec
SKVX_ALWAYS_INLINE Vec()=default

skvx::Vec< 4, T >::zwxy
SKVX_ALWAYS_INLINE Vec< 4, T > zwxy() const
Definition: SkVx.h:168

skvx::Vec< 4, T >::y
SKVX_ALWAYS_INLINE T & y()
Definition: SkVx.h:155

skvx::Vec< 4, T >::zw
SKVX_ALWAYS_INLINE Vec< 2, T > zw() const
Definition: SkVx.h:160

skvx::Vec< 4, T >::zw
SKVX_ALWAYS_INLINE Vec< 2, T > & zw()
Definition: SkVx.h:153

skvx::Vec< 4, T >::Vec
SKVX_ALWAYS_INLINE Vec(T x, T y, Vec< 2, T > zw)
Definition: SkVx.h:130

skvx::Vec< 4, T >::Vec
SKVX_ALWAYS_INLINE Vec(T s)
Definition: SkVx.h:127

skvx::Vec< 4, T >::z
SKVX_ALWAYS_INLINE T & z()
Definition: SkVx.h:156

skvx::Vec< 4, T >::Vec
SKVX_ALWAYS_INLINE Vec(Vec< 2, T > xy, Vec< 2, T > zw)
Definition: SkVx.h:131

skvx::Vec< 4, T >::Vec
SKVX_ALWAYS_INLINE Vec(std::initializer_list< T > xs)
Definition: SkVx.h:133

skvx::Vec< 4, T >::x
SKVX_ALWAYS_INLINE T x() const
Definition: SkVx.h:161

skvx::Vec< 4, T >::hi
Vec< 2, T > hi
Definition: SkVx.h:170

skvx::Vec< 4, T >::w
SKVX_ALWAYS_INLINE T & w()
Definition: SkVx.h:157

skvx::Vec< 4, T >::Vec
SKVX_ALWAYS_INLINE Vec(T x, T y, T z, T w)
Definition: SkVx.h:128

skvx::Vec< 4, T >::yxwz
SKVX_ALWAYS_INLINE Vec< 4, T > yxwz() const
Definition: SkVx.h:167

skvx::Vec< 4, T >::operator[]
SKVX_ALWAYS_INLINE T & operator[](int i)
Definition: SkVx.h:143

skvx::Vec< 4, T >::z
SKVX_ALWAYS_INLINE T z() const
Definition: SkVx.h:163

skvx::Vec< 4, T >::y
SKVX_ALWAYS_INLINE T y() const
Definition: SkVx.h:162

skvx::Vec< 4, T >::operator[]
SKVX_ALWAYS_INLINE T operator[](int i) const
Definition: SkVx.h:142

skvx::Vec< 4, T >::Load
static SKVX_ALWAYS_INLINE Vec Load(const void *ptr)
Definition: SkVx.h:145

skvx::Vec< 4, T >::xy
SKVX_ALWAYS_INLINE Vec< 2, T > & xy()
Definition: SkVx.h:152

skvx::Vec< 4, T >::xy
SKVX_ALWAYS_INLINE Vec< 2, T > xy() const
Definition: SkVx.h:159

skvx::Vec< 4, T >::Vec
SKVX_ALWAYS_INLINE Vec(Vec< 2, T > xy, T z, T w)
Definition: SkVx.h:129

skvx::Vec
Definition: SkVx.h:83

skvx::Vec::Vec
SKVX_ALWAYS_INLINE Vec(std::initializer_list< T > xs)
Definition: SkVx.h:97

skvx::Vec::Vec
SKVX_ALWAYS_INLINE Vec()=default

skvx::Vec::Load
static SKVX_ALWAYS_INLINE Vec Load(const void *ptr)
Definition: SkVx.h:109

skvx::Vec::store
SKVX_ALWAYS_INLINE void store(void *ptr) const
Definition: SkVx.h:112

skvx::Vec::hi
Vec< N/2, T > hi
Definition: SkVx.h:117

skvx::Vec::Vec
SKVX_ALWAYS_INLINE Vec(T s)
Definition: SkVx.h:93

skvx::Vec::operator[]
SKVX_ALWAYS_INLINE T operator[](int i) const
Definition: SkVx.h:106

skvx::Vec::operator[]
SKVX_ALWAYS_INLINE T & operator[](int i)
Definition: SkVx.h:107

skvx::Vec::lo
Vec< N/2, T > lo
Definition: SkVx.h:117