8#ifndef SkRasterPipeline_opts_DEFINED
9#define SkRasterPipeline_opts_DEFINED
26#if defined(__clang__) || defined(__GNUC__)
27 #define SI __attribute__((always_inline)) static inline
29 #define SI static inline
33 #define SK_UNROLL _Pragma("unroll")
39 template <
int N,
typename T>
using Vec =
T __attribute__((ext_vector_type(
N)));
40#elif defined(__GNUC__)
43 template <
int N,
typename T>
struct VecHelper {
46 template <
int N,
typename T>
using Vec =
typename VecHelper<N, T>::V;
49template <
typename Dst,
typename Src>
51 static_assert(
sizeof(Dst) >
sizeof(Src));
52 static_assert(std::is_trivially_copyable<Dst>::value);
53 static_assert(std::is_trivially_copyable<Src>::value);
55 memcpy(&dst, &src,
sizeof(Src));
70#if defined(JUMPER_IS_SCALAR) || defined(JUMPER_IS_NEON) || defined(JUMPER_IS_HSW) || \
71 defined(JUMPER_IS_SKX) || defined(JUMPER_IS_AVX) || defined(JUMPER_IS_SSE41) || \
72 defined(JUMPER_IS_SSE2)
74#elif !defined(__clang__) && !defined(__GNUC__)
75 #define JUMPER_IS_SCALAR
76#elif defined(SK_ARM_HAS_NEON)
77 #define JUMPER_IS_NEON
78#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
80#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
82#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
84#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
85 #define JUMPER_IS_SSE41
86#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
87 #define JUMPER_IS_SSE2
88#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
89 #define JUMPER_IS_LASX
90#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
93 #define JUMPER_IS_SCALAR
97#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
99 #if defined(__apple_build_version__) && __clang_major__ < 9
100 #define JUMPER_IS_SCALAR
101 #elif __clang_major__ < 5
102 #define JUMPER_IS_SCALAR
105 #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
106 #undef JUMPER_IS_NEON
110#if defined(JUMPER_IS_SCALAR)
112#elif defined(JUMPER_IS_NEON)
113 #include <arm_neon.h>
114#elif defined(JUMPER_IS_LASX)
115 #include <lasxintrin.h>
116 #include <lsxintrin.h>
117#elif defined(JUMPER_IS_LSX)
118 #include <lsxintrin.h>
120 #include <immintrin.h>
132#if defined(JUMPER_IS_SCALAR)
168 SI bool any(
I32 c) {
return c != 0; }
169 SI bool all(
I32 c) {
return c != 0; }
171 template <
typename T>
175 dst[ix] = mask ? src : dst[ix];
212#elif defined(JUMPER_IS_NEON)
213 template <
typename T>
using V = Vec<4, T>;
229 SI F abs_ (
F v) {
return vabsq_f32(v); }
231 SI F rcp_approx(
F v) {
auto e = vrecpeq_f32(v);
return vrecpsq_f32 (v,e ) *
e; }
241 #if defined(SK_CPU_ARM64)
242 SI bool any(
I32 c) {
return vmaxvq_u32((
U32)c) != 0; }
243 SI bool all(
I32 c) {
return vminvq_u32((
U32)c) != 0; }
245 SI F mad(
F f,
F m,
F a) {
return vfmaq_f32(
a,f,m); }
248 SI F ceil_(
F v) {
return vrndpq_f32(v); }
249 SI F sqrt_(
F v) {
return vsqrtq_f32(v); }
254 SI bool any(
I32 c) {
return c[0] | c[1] | c[2] | c[3]; }
255 SI bool all(
I32 c) {
return c[0] & c[1] & c[2] & c[3]; }
257 SI F mad(
F f,
F m,
F a) {
return vmlaq_f32(
a,f,m); }
261 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
266 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
271 auto e = vrsqrteq_f32(v);
272 e *= vrsqrtsq_f32(v,e*e);
273 e *= vrsqrtsq_f32(v,e*e);
278 return vcvtq_s32_f32(v + 0.5f);
282 return vcvtq_u32_f32(v + 0.5f);
286 return vcvtq_u32_f32(
mad(v,
scale,
F() + 0.5f));
290 template <
typename T>
292 return V<T>{
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]]};
297 dst[ix[0]] = after[0];
298 dst[ix[1]] = after[1];
299 dst[ix[2]] = after[2];
300 dst[ix[3]] = after[3];
303 uint16x4x2_t rg = vld2_u16(ptr);
308 vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
311 uint16x4x4_t
rgba = vld4_u16(ptr);
319 vst4_u16(ptr, (uint16x4x4_t{{r,g,
b,
a}}));
322 float32x4x4_t
rgba = vld4q_f32(ptr);
329 vst4q_f32(ptr, (float32x4x4_t{{r,g,
b,
a}}));
332#elif defined(JUMPER_IS_SKX)
333 template <
typename T>
using V = Vec<16, T>;
341 SI F mad(
F f,
F m,
F a) {
return _mm512_fmadd_ps(f, m,
a); }
342 SI F nmad(
F f,
F m,
F a) {
return _mm512_fnmadd_ps(f, m,
a); }
349 SI F abs_ (
F v) {
return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
351 SI F floor_(
F v) {
return _mm512_floor_ps(v); }
352 SI F ceil_(
F v) {
return _mm512_ceil_ps(v); }
355 SI F sqrt_ (
F v) {
return _mm512_sqrt_ps (v); }
358 return _mm512_fnmadd_ps(v, e, _mm512_set1_ps(2.0f)) *
e;
364 __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
365 _mm512_extracti64x4_epi64((__m512i)v, 1));
366 return (
U16)_mm256_permutex_epi64(rst, 216);
369 __m256i rst = _mm256_packus_epi16((__m256i)v, (__m256i)v);
370 return (
U8)_mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
373 __m512i mask = _mm512_set1_epi32(0x80000000);
374 __m512i aa = _mm512_and_si512((__m512i)c, mask);
375 return _mm512_mask_blend_ps(_mm512_test_epi32_mask(aa, aa),e,t);
378 __m512i mask = _mm512_set1_epi32(0x80000000);
379 __m512i aa = _mm512_and_si512((__m512i)c, mask);
380 return (
I32)_mm512_mask_blend_epi32(_mm512_test_epi32_mask(aa, aa),(__m512i)e,(__m512i)t);
383 __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
387 __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
388 return mask32 == 0xffff;
390 template <
typename T>
392 return V<T>{
p[ix[ 0]],
p[ix[ 1]],
p[ix[ 2]],
p[ix[ 3]],
393 p[ix[ 4]],
p[ix[ 5]],
p[ix[ 6]],
p[ix[ 7]],
394 p[ix[ 8]],
p[ix[ 9]],
p[ix[10]],
p[ix[11]],
395 p[ix[12]],
p[ix[13]],
p[ix[14]],
p[ix[15]] };
397 SI F gather(
const float* p,
U32 ix) {
return _mm512_i32gather_ps((__m512i)ix, p, 4); }
399 return (
U32)_mm512_i32gather_epi32((__m512i)ix, p, 4); }
402 _mm512_i32gather_epi64(_mm512_castsi512_si256((__m512i)ix), p, 8),
403 _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)ix, 1), p, 8),
405 return sk_bit_cast<U64>(parts);
407 template <
typename V,
typename S>
411 dst[ix[0]] = after[0];
412 dst[ix[1]] = after[1];
413 dst[ix[2]] = after[2];
414 dst[ix[3]] = after[3];
415 dst[ix[4]] = after[4];
416 dst[ix[5]] = after[5];
417 dst[ix[6]] = after[6];
418 dst[ix[7]] = after[7];
419 dst[ix[8]] = after[8];
420 dst[ix[9]] = after[9];
421 dst[ix[10]] = after[10];
422 dst[ix[11]] = after[11];
423 dst[ix[12]] = after[12];
424 dst[ix[13]] = after[13];
425 dst[ix[14]] = after[14];
426 dst[ix[15]] = after[15];
430 __m256i _01234567 = _mm256_loadu_si256(((
const __m256i*)ptr) + 0);
431 __m256i _89abcdef = _mm256_loadu_si256(((
const __m256i*)ptr) + 1);
433 *r = (
U16)_mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_mm256_slli_epi32
434 (_01234567, 16), 16), _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
435 *g = (
U16)_mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_01234567, 16),
436 _mm256_srai_epi32(_89abcdef, 16)), 216);
439 __m256i _01234567 = _mm256_unpacklo_epi16((__m256i)r, (__m256i)g);
440 __m256i _89abcdef = _mm256_unpackhi_epi16((__m256i)r, (__m256i)g);
441 __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
443 __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
444 _01234567 = _mm512_castsi512_si256(aa);
445 _89abcdef = _mm512_extracti64x4_epi64(aa, 1);
447 _mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
448 _mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
452 __m256i _0123 = _mm256_loadu_si256((
const __m256i*)ptr),
453 _4567 = _mm256_loadu_si256(((
const __m256i*)ptr) + 1),
454 _89ab = _mm256_loadu_si256(((
const __m256i*)ptr) + 2),
455 _cdef = _mm256_loadu_si256(((
const __m256i*)ptr) + 3);
457 auto a0 = _mm256_unpacklo_epi16(_0123, _4567),
458 a1 = _mm256_unpackhi_epi16(_0123, _4567),
459 b0 = _mm256_unpacklo_epi16(a0, a1),
460 b1 = _mm256_unpackhi_epi16(a0, a1),
461 a2 = _mm256_unpacklo_epi16(_89ab, _cdef),
462 a3 = _mm256_unpackhi_epi16(_89ab, _cdef),
463 b2 = _mm256_unpacklo_epi16(a2, a3),
464 b3 = _mm256_unpackhi_epi16(a2, a3),
465 rr = _mm256_unpacklo_epi64(b0, b2),
466 gg = _mm256_unpackhi_epi64(b0, b2),
467 bb = _mm256_unpacklo_epi64(b1, b3),
468 aa = _mm256_unpackhi_epi64(b1, b3);
470 *r = (
U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), rr);
471 *g = (
U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), gg);
472 *
b = (
U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), bb);
473 *
a = (
U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), aa);
476 auto rg012389ab = _mm256_unpacklo_epi16((__m256i)r, (__m256i)g),
477 rg4567cdef = _mm256_unpackhi_epi16((__m256i)r, (__m256i)g),
478 ba012389ab = _mm256_unpacklo_epi16((__m256i)
b, (__m256i)
a),
479 ba4567cdef = _mm256_unpackhi_epi16((__m256i)
b, (__m256i)
a);
481 auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
482 _23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
483 _45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
484 _67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);
486 auto _ab23 = _mm256_permutex_epi64(_23ab, 78),
487 _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0),
488 _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78),
489 _ef67 = _mm256_permutex_epi64(_67ef, 78),
490 _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0),
491 _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);
493 _mm256_storeu_si256((__m256i*)ptr, _0123);
494 _mm256_storeu_si256((__m256i*)ptr + 1, _4567);
495 _mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
496 _mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
500 F _048c, _159d, _26ae, _37bf;
502 _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr) );
503 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
504 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
505 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
506 _159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+4) );
507 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
508 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
509 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
510 _26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+8) );
511 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
512 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
513 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
514 _37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12) );
515 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
516 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
517 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);
519 F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
520 ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
521 rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
522 ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);
524 *r = (
F)_mm512_unpacklo_ps(rg02468acf, rg13579bde);
525 *g = (
F)_mm512_unpackhi_ps(rg02468acf, rg13579bde);
526 *
b = (
F)_mm512_unpacklo_ps(ba02468acf, ba13579bde);
527 *
a = (
F)_mm512_unpackhi_ps(ba02468acf, ba13579bde);
531 F rg014589cd = _mm512_unpacklo_ps(r, g),
532 rg2367abef = _mm512_unpackhi_ps(r, g),
533 ba014589cd = _mm512_unpacklo_ps(
b,
a),
534 ba2367abef = _mm512_unpackhi_ps(
b,
a);
536 F _048c = (
F)_mm512_unpacklo_pd((__m512d)rg014589cd, (__m512d)ba014589cd),
537 _26ae = (
F)_mm512_unpacklo_pd((__m512d)rg2367abef, (__m512d)ba2367abef),
538 _159d = (
F)_mm512_unpackhi_pd((__m512d)rg014589cd, (__m512d)ba014589cd),
539 _37bf = (
F)_mm512_unpackhi_pd((__m512d)rg2367abef, (__m512d)ba2367abef);
541 F _ae26 = (
F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_26ae),
542 _bf37 = (
F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_37bf),
543 _8c04 = (
F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_048c),
544 _9d15 = (
F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_159d);
546 __m512i index = _mm512_setr_epi32(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11);
547 F _0426 = (
F)_mm512_permutex2var_pd((__m512d)_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
549 _1537 = (
F)_mm512_permutex2var_pd((__m512d)_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
551 _5173 = _mm512_permutexvar_ps(index, _1537),
552 _0123 = (
F)_mm512_permutex2var_pd((__m512d)_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
555 F _5476 = (
F)_mm512_permutex2var_pd((__m512d)_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
557 _4567 = _mm512_permutexvar_ps(index, _5476),
558 _8cae = (
F)_mm512_permutex2var_pd((__m512d)_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
560 _9dbf = (
F)_mm512_permutex2var_pd((__m512d)_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
562 _d9fb = _mm512_permutexvar_ps(index, _9dbf),
563 _89ab = (
F)_mm512_permutex2var_pd((__m512d)_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
565 _dcfe = (
F)_mm512_permutex2var_pd((__m512d)_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
567 _cdef = _mm512_permutexvar_ps(index, _dcfe);
569 _mm512_storeu_ps(ptr+0, _0123);
570 _mm512_storeu_ps(ptr+16, _4567);
571 _mm512_storeu_ps(ptr+32, _89ab);
572 _mm512_storeu_ps(ptr+48, _cdef);
575#elif defined(JUMPER_IS_HSW)
577 template <
typename T>
using V = Vec<8, T>;
585 SI F mad(
F f,
F m,
F a) {
return _mm256_fmadd_ps(f, m,
a); }
586 SI F nmad(
F f,
F m,
F a) {
return _mm256_fnmadd_ps(f, m,
a); }
595 SI F abs_ (
F v) {
return _mm256_and_ps(v, 0-v); }
597 SI F floor_(
F v) {
return _mm256_floor_ps(v); }
598 SI F ceil_(
F v) {
return _mm256_ceil_ps(v); }
601 SI F sqrt_ (
F v) {
return _mm256_sqrt_ps (v); }
604 return _mm256_fnmadd_ps(v, e, _mm256_set1_ps(2.0f)) *
e;
611 return (
U16)_mm_packus_epi32(_mm256_extractf128_si256((__m256i)v, 0),
612 _mm256_extractf128_si256((__m256i)v, 1));
615 auto r = _mm_packus_epi16((__m128i)v,(__m128i)v);
616 return sk_unaligned_load<U8>(&r);
621 return (
I32)_mm256_blendv_ps((__m256)e, (__m256)t, (__m256)c);
625 SI bool any(
I32 c) {
return !_mm256_testz_si256((__m256i)c, _mm256_set1_epi32(-1)); }
626 SI bool all(
I32 c) {
return _mm256_testc_si256((__m256i)c, _mm256_set1_epi32(-1)); }
628 template <
typename T>
630 return V<T>{
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]],
631 p[ix[4]],
p[ix[5]],
p[ix[6]],
p[ix[7]], };
633 SI F gather(
const float* p,
U32 ix) {
return _mm256_i32gather_ps(p, (__m256i)ix, 4); }
635 return (
U32)_mm256_i32gather_epi32((
const int*)p, (__m256i)ix, 4);
639 _mm256_i32gather_epi64(
640 (
const long long int*)p, _mm256_extracti128_si256((__m256i)ix, 0), 8),
641 _mm256_i32gather_epi64(
642 (
const long long int*)p, _mm256_extracti128_si256((__m256i)ix, 1), 8),
644 return sk_bit_cast<U64>(parts);
649 dst[ix[0]] = after[0];
650 dst[ix[1]] = after[1];
651 dst[ix[2]] = after[2];
652 dst[ix[3]] = after[3];
653 dst[ix[4]] = after[4];
654 dst[ix[5]] = after[5];
655 dst[ix[6]] = after[6];
656 dst[ix[7]] = after[7];
660 __m128i _0123 = _mm_loadu_si128(((
const __m128i*)ptr) + 0),
661 _4567 = _mm_loadu_si128(((
const __m128i*)ptr) + 1);
662 *r = (
U16)_mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
663 _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
664 *g = (
U16)_mm_packs_epi32(_mm_srai_epi32(_0123, 16),
665 _mm_srai_epi32(_4567, 16));
668 auto _0123 = _mm_unpacklo_epi16((__m128i)r, (__m128i)g),
669 _4567 = _mm_unpackhi_epi16((__m128i)r, (__m128i)g);
670 _mm_storeu_si128((__m128i*)ptr + 0, _0123);
671 _mm_storeu_si128((__m128i*)ptr + 1, _4567);
675 __m128i _01 = _mm_loadu_si128(((
const __m128i*)ptr) + 0),
676 _23 = _mm_loadu_si128(((
const __m128i*)ptr) + 1),
677 _45 = _mm_loadu_si128(((
const __m128i*)ptr) + 2),
678 _67 = _mm_loadu_si128(((
const __m128i*)ptr) + 3);
680 auto _02 = _mm_unpacklo_epi16(_01, _23),
681 _13 = _mm_unpackhi_epi16(_01, _23),
682 _46 = _mm_unpacklo_epi16(_45, _67),
683 _57 = _mm_unpackhi_epi16(_45, _67);
685 auto rg0123 = _mm_unpacklo_epi16(_02, _13),
686 ba0123 = _mm_unpackhi_epi16(_02, _13),
687 rg4567 = _mm_unpacklo_epi16(_46, _57),
688 ba4567 = _mm_unpackhi_epi16(_46, _57);
690 *r = (
U16)_mm_unpacklo_epi64(rg0123, rg4567);
691 *g = (
U16)_mm_unpackhi_epi64(rg0123, rg4567);
692 *
b = (
U16)_mm_unpacklo_epi64(ba0123, ba4567);
693 *
a = (
U16)_mm_unpackhi_epi64(ba0123, ba4567);
696 auto rg0123 = _mm_unpacklo_epi16((__m128i)r, (__m128i)g),
697 rg4567 = _mm_unpackhi_epi16((__m128i)r, (__m128i)g),
698 ba0123 = _mm_unpacklo_epi16((__m128i)
b, (__m128i)
a),
699 ba4567 = _mm_unpackhi_epi16((__m128i)
b, (__m128i)
a);
701 auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
702 _23 = _mm_unpackhi_epi32(rg0123, ba0123),
703 _45 = _mm_unpacklo_epi32(rg4567, ba4567),
704 _67 = _mm_unpackhi_epi32(rg4567, ba4567);
706 _mm_storeu_si128((__m128i*)ptr + 0, _01);
707 _mm_storeu_si128((__m128i*)ptr + 1, _23);
708 _mm_storeu_si128((__m128i*)ptr + 2, _45);
709 _mm_storeu_si128((__m128i*)ptr + 3, _67);
713 F _04 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 0)),
714 _15 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 4)),
715 _26 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 8)),
716 _37 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+12));
717 _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
718 _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
719 _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
720 _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
722 F rg0145 = _mm256_unpacklo_ps(_04,_15),
723 ba0145 = _mm256_unpackhi_ps(_04,_15),
724 rg2367 = _mm256_unpacklo_ps(_26,_37),
725 ba2367 = _mm256_unpackhi_ps(_26,_37);
727 *r = (
F)_mm256_unpacklo_pd((__m256d)rg0145, (__m256d)rg2367);
728 *g = (
F)_mm256_unpackhi_pd((__m256d)rg0145, (__m256d)rg2367);
729 *
b = (
F)_mm256_unpacklo_pd((__m256d)ba0145, (__m256d)ba2367);
730 *
a = (
F)_mm256_unpackhi_pd((__m256d)ba0145, (__m256d)ba2367);
733 F rg0145 = _mm256_unpacklo_ps(r, g),
734 rg2367 = _mm256_unpackhi_ps(r, g),
735 ba0145 = _mm256_unpacklo_ps(
b,
a),
736 ba2367 = _mm256_unpackhi_ps(
b,
a);
738 F _04 = (
F)_mm256_unpacklo_pd((__m256d)rg0145, (__m256d)ba0145),
739 _15 = (
F)_mm256_unpackhi_pd((__m256d)rg0145, (__m256d)ba0145),
740 _26 = (
F)_mm256_unpacklo_pd((__m256d)rg2367, (__m256d)ba2367),
741 _37 = (
F)_mm256_unpackhi_pd((__m256d)rg2367, (__m256d)ba2367);
743 F _01 = _mm256_permute2f128_ps(_04, _15, 32),
744 _23 = _mm256_permute2f128_ps(_26, _37, 32),
745 _45 = _mm256_permute2f128_ps(_04, _15, 49),
746 _67 = _mm256_permute2f128_ps(_26, _37, 49);
747 _mm256_storeu_ps(ptr+ 0, _01);
748 _mm256_storeu_ps(ptr+ 8, _23);
749 _mm256_storeu_ps(ptr+16, _45);
750 _mm256_storeu_ps(ptr+24, _67);
753#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
754 template <
typename T>
using V = Vec<4, T>;
763 return _mm_or_ps(_mm_and_ps((__m128)c, t), _mm_andnot_ps((__m128)c, e));
766 return (
I32)_mm_or_ps(_mm_and_ps((__m128)c, (__m128)t),
767 _mm_andnot_ps((__m128)c, (__m128)e));
772#if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
781 return sk_bit_cast<U32>(
if_then_else(
a <
b, sk_bit_cast<I32>(
a), sk_bit_cast<I32>(
b)));
784 return sk_bit_cast<U32>(
if_then_else(
a >
b, sk_bit_cast<I32>(
a), sk_bit_cast<I32>(
b)));
790 SI F abs_(
F v) {
return _mm_and_ps(v, 0-v); }
791#if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
799 SI F sqrt_(
F v) {
return _mm_sqrt_ps (v); }
806 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
807 auto p = _mm_packus_epi32((__m128i)v,(__m128i)v);
810 auto p = _mm_srai_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
811 p = _mm_packs_epi32(p,p);
813 return sk_unaligned_load<U16>(&p);
816 auto r = widen_cast<__m128i>(v);
817 r = _mm_packus_epi16(r,r);
818 return sk_unaligned_load<U8>(&r);
822 SI bool any(
I32 c) {
return _mm_movemask_ps(sk_bit_cast<F>(c)) != 0b0000; }
823 SI bool all(
I32 c) {
return _mm_movemask_ps(sk_bit_cast<F>(c)) == 0b1111; }
826 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
827 return _mm_floor_ps(v);
829 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
835 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
836 return _mm_ceil_ps(v);
838 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
843 template <
typename T>
845 return V<T>{
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]]};
850 dst[ix[0]] = after[0];
851 dst[ix[1]] = after[1];
852 dst[ix[2]] = after[2];
853 dst[ix[3]] = after[3];
856 __m128i _01 = _mm_loadu_si128(((
const __m128i*)ptr) + 0);
857 auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8);
858 auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8);
860 auto R = _mm_shuffle_epi32(rg, 0x88);
861 auto G = _mm_shuffle_epi32(rg, 0xDD);
862 *r = sk_unaligned_load<U16>(&
R);
863 *g = sk_unaligned_load<U16>(&
G);
866 __m128i rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
867 _mm_storeu_si128((__m128i*)ptr + 0, rg);
871 __m128i _01 = _mm_loadu_si128(((
const __m128i*)ptr) + 0),
872 _23 = _mm_loadu_si128(((
const __m128i*)ptr) + 1);
874 auto _02 = _mm_unpacklo_epi16(_01, _23),
875 _13 = _mm_unpackhi_epi16(_01, _23);
877 auto rg = _mm_unpacklo_epi16(_02, _13),
878 ba = _mm_unpackhi_epi16(_02, _13);
880 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
881 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
882 *
b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
883 *
a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
887 auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
888 ba = _mm_unpacklo_epi16(widen_cast<__m128i>(
b), widen_cast<__m128i>(
a));
890 _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
891 _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
895 F _0 = _mm_loadu_ps(ptr + 0),
896 _1 = _mm_loadu_ps(ptr + 4),
897 _2 = _mm_loadu_ps(ptr + 8),
898 _3 = _mm_loadu_ps(ptr +12);
899 _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
907 _MM_TRANSPOSE4_PS(r,g,
b,
a);
908 _mm_storeu_ps(ptr + 0, r);
909 _mm_storeu_ps(ptr + 4, g);
910 _mm_storeu_ps(ptr + 8,
b);
911 _mm_storeu_ps(ptr +12,
a);
914#elif defined(JUMPER_IS_LASX)
916 template <
typename T>
using V = Vec<8, T>;
924 SI __m128i emulate_lasx_d_xr2vr_l(__m256i
a) {
926 v2i64 al = {tmp[0], tmp[1]};
930 SI __m128i emulate_lasx_d_xr2vr_h(__m256i
a) {
932 v2i64 ah = {tmp[2], tmp[3]};
937 return sk_bit_cast<Vec<8,float>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
938 sk_bit_cast<__m256i>(t),
939 sk_bit_cast<__m256i>(c)));
943 return sk_bit_cast<Vec<8,int32_t>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
944 sk_bit_cast<__m256i>(t),
945 sk_bit_cast<__m256i>(c)));
955 SI F mad(
F f,
F m,
F a) {
return __lasx_xvfmadd_s(f, m,
a); }
956 SI F nmad(
F f,
F m,
F a) {
return __lasx_xvfmadd_s(-f, m,
a); }
962 SI F sqrt_(
F v) {
return __lasx_xvfsqrt_s(v); }
966 return __lasx_xvftintrz_w_s(v + t);
971 return __lasx_xvftintrz_w_s(v + t);
976 return __lasx_xvftintrz_w_s(
mad(v,
scale, t));
980 return __lsx_vpickev_h(__lsx_vsat_wu(emulate_lasx_d_xr2vr_h(v), 15),
981 __lsx_vsat_wu(emulate_lasx_d_xr2vr_l(v), 15));
985 __m128i tmp = __lsx_vsat_hu(v, 7);
986 auto r = __lsx_vpickev_b(tmp, tmp);
987 return sk_unaligned_load<U8>(&r);
991 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c));
992 return (retv[0] | retv[4]) != 0b0000;
996 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c));
997 return (retv[0] & retv[4]) == 0b1111;
1001 return __lasx_xvfrintrm_s(v);
1005 return __lasx_xvfrintrp_s(v);
1008 template <
typename T>
1010 return {
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]],
1011 p[ix[4]],
p[ix[5]],
p[ix[6]],
p[ix[7]], };
1014 template <
typename V,
typename S>
1018 dst[ix[0]] = after[0];
1019 dst[ix[1]] = after[1];
1020 dst[ix[2]] = after[2];
1021 dst[ix[3]] = after[3];
1022 dst[ix[4]] = after[4];
1023 dst[ix[5]] = after[5];
1024 dst[ix[6]] = after[6];
1025 dst[ix[7]] = after[7];
1029 U16 _0123 = __lsx_vld(ptr, 0),
1030 _4567 = __lsx_vld(ptr, 16);
1031 *r = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_4567, 16), 16), 15),
1032 __lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_0123, 16), 16), 15));
1033 *g = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(_4567, 16), 15),
1034 __lsx_vsat_w(__lsx_vsrai_w(_0123, 16), 15));
1037 auto _0123 = __lsx_vilvl_h(g, r),
1038 _4567 = __lsx_vilvh_h(g, r);
1039 __lsx_vst(_0123, ptr, 0);
1040 __lsx_vst(_4567, ptr, 16);
1044 __m128i _01 = __lsx_vld(ptr, 0),
1045 _23 = __lsx_vld(ptr, 16),
1046 _45 = __lsx_vld(ptr, 32),
1047 _67 = __lsx_vld(ptr, 48);
1049 auto _02 = __lsx_vilvl_h(_23, _01),
1050 _13 = __lsx_vilvh_h(_23, _01),
1051 _46 = __lsx_vilvl_h(_67, _45),
1052 _57 = __lsx_vilvh_h(_67, _45);
1054 auto rg0123 = __lsx_vilvl_h(_13, _02),
1055 ba0123 = __lsx_vilvh_h(_13, _02),
1056 rg4567 = __lsx_vilvl_h(_57, _46),
1057 ba4567 = __lsx_vilvh_h(_57, _46);
1059 *r = __lsx_vilvl_d(rg4567, rg0123);
1060 *g = __lsx_vilvh_d(rg4567, rg0123);
1061 *
b = __lsx_vilvl_d(ba4567, ba0123);
1062 *
a = __lsx_vilvh_d(ba4567, ba0123);
1066 auto rg0123 = __lsx_vilvl_h(g, r),
1067 rg4567 = __lsx_vilvh_h(g, r),
1068 ba0123 = __lsx_vilvl_h(
a,
b),
1069 ba4567 = __lsx_vilvh_h(
a,
b);
1071 auto _01 =__lsx_vilvl_w(ba0123, rg0123),
1072 _23 =__lsx_vilvh_w(ba0123, rg0123),
1073 _45 =__lsx_vilvl_w(ba4567, rg4567),
1074 _67 =__lsx_vilvh_w(ba4567, rg4567);
1076 __lsx_vst(_01, ptr, 0);
1077 __lsx_vst(_23, ptr, 16);
1078 __lsx_vst(_45, ptr, 32);
1079 __lsx_vst(_67, ptr, 48);
1083 F _04 = (
F)__lasx_xvpermi_q(__lasx_xvld(ptr, 0), __lasx_xvld(ptr, 64), 0x02);
1084 F _15 = (
F)__lasx_xvpermi_q(__lasx_xvld(ptr, 16), __lasx_xvld(ptr, 80), 0x02);
1085 F _26 = (
F)__lasx_xvpermi_q(__lasx_xvld(ptr, 32), __lasx_xvld(ptr, 96), 0x02);
1086 F _37 = (
F)__lasx_xvpermi_q(__lasx_xvld(ptr, 48), __lasx_xvld(ptr, 112), 0x02);
1088 F rg0145 = (
F)__lasx_xvilvl_w((__m256i)_15, (__m256i)_04),
1089 ba0145 = (
F)__lasx_xvilvh_w((__m256i)_15, (__m256i)_04),
1090 rg2367 = (
F)__lasx_xvilvl_w((__m256i)_37, (__m256i)_26),
1091 ba2367 = (
F)__lasx_xvilvh_w((__m256i)_37, (__m256i)_26);
1093 *r = (
F)__lasx_xvilvl_d((__m256i)rg2367, (__m256i)rg0145);
1094 *g = (
F)__lasx_xvilvh_d((__m256i)rg2367, (__m256i)rg0145);
1095 *
b = (
F)__lasx_xvilvl_d((__m256i)ba2367, (__m256i)ba0145);
1096 *
a = (
F)__lasx_xvilvh_d((__m256i)ba2367, (__m256i)ba0145);
1099 F rg0145 = (
F)__lasx_xvilvl_w((__m256i)g, (__m256i)r),
1100 rg2367 = (
F)__lasx_xvilvh_w((__m256i)g, (__m256i)r),
1101 ba0145 = (
F)__lasx_xvilvl_w((__m256i)
a, (__m256i)
b),
1102 ba2367 = (
F)__lasx_xvilvh_w((__m256i)
a, (__m256i)
b);
1104 F _04 = (
F)__lasx_xvilvl_d((__m256i)ba0145, (__m256i)rg0145),
1105 _15 = (
F)__lasx_xvilvh_d((__m256i)ba0145, (__m256i)rg0145),
1106 _26 = (
F)__lasx_xvilvl_d((__m256i)ba2367, (__m256i)rg2367),
1107 _37 = (
F)__lasx_xvilvh_d((__m256i)ba2367, (__m256i)rg2367);
1109 F _01 = (
F)__lasx_xvpermi_q((__m256i)_04, (__m256i)_15, 0x02),
1110 _23 = (
F)__lasx_xvpermi_q((__m256i)_26, (__m256i)_37, 0x02),
1111 _45 = (
F)__lasx_xvpermi_q((__m256i)_04, (__m256i)_15, 0x13),
1112 _67 = (
F)__lasx_xvpermi_q((__m256i)_26, (__m256i)_37, 0x13);
1113 __lasx_xvst(_01, ptr, 0);
1114 __lasx_xvst(_23, ptr, 32);
1115 __lasx_xvst(_45, ptr, 64);
1116 __lasx_xvst(_67, ptr, 96);
1119#elif defined(JUMPER_IS_LSX)
1120 template <
typename T>
using V = Vec<4, T>;
1128 #define _LSX_TRANSPOSE4_S(row0, row1, row2, row3) \
1130 __m128 __t0 = (__m128)__lsx_vilvl_w ((__m128i)row1, (__m128i)row0); \
1131 __m128 __t1 = (__m128)__lsx_vilvl_w ((__m128i)row3, (__m128i)row2); \
1132 __m128 __t2 = (__m128)__lsx_vilvh_w ((__m128i)row1, (__m128i)row0); \
1133 __m128 __t3 = (__m128)__lsx_vilvh_w ((__m128i)row3, (__m128i)row2); \
1134 (row0) = (__m128)__lsx_vilvl_d ((__m128i)__t1, (__m128i)__t0); \
1135 (row1) = (__m128)__lsx_vilvh_d ((__m128i)__t1, (__m128i)__t0); \
1136 (row2) = (__m128)__lsx_vilvl_d ((__m128i)__t3, (__m128i)__t2); \
1137 (row3) = (__m128)__lsx_vilvh_d ((__m128i)__t3, (__m128i)__t2); \
1141 return sk_bit_cast<Vec<4,float>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
1142 sk_bit_cast<__m128i>(t),
1143 sk_bit_cast<__m128i>(c)));
1147 return sk_bit_cast<Vec<4,int32_t>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
1148 sk_bit_cast<__m128i>(t),
1149 sk_bit_cast<__m128i>(c)));
1159 SI F mad(
F f,
F m,
F a) {
return __lsx_vfmadd_s(f, m,
a); }
1160 SI F nmad(
F f,
F m,
F a) {
return __lsx_vfmadd_s(-f, m,
a); }
1166 SI F sqrt_(
F v) {
return __lsx_vfsqrt_s (v); }
1170 return __lsx_vftintrz_w_s(v + t); }
1174 return __lsx_vftintrz_w_s(v + t); }
1178 return __lsx_vftintrz_w_s(
mad(v,
scale, t)); }
1181 __m128i tmp = __lsx_vsat_wu(v, 15);
1182 auto p = __lsx_vpickev_h(tmp, tmp);
1183 return sk_unaligned_load<U16>(&p);
1187 auto r = widen_cast<__m128i>(v);
1188 __m128i tmp = __lsx_vsat_hu(r, 7);
1189 r = __lsx_vpickev_b(tmp, tmp);
1190 return sk_unaligned_load<U8>(&r);
1194 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c));
1195 return retv[0] != 0b0000;
1199 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c));
1200 return retv[0] == 0b1111;
1204 return __lsx_vfrintrm_s(v);
1208 return __lsx_vfrintrp_s(v);
1211 template <
typename T>
1213 return {
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]]};
1216 template <
typename V,
typename S>
1220 dst[ix[0]] = after[0];
1221 dst[ix[1]] = after[1];
1222 dst[ix[2]] = after[2];
1223 dst[ix[3]] = after[3];
1227 __m128i _01 = __lsx_vld(ptr, 0);
1228 auto rg = __lsx_vshuf4i_h(_01, 0xD8);
1230 auto R = __lsx_vshuf4i_w(rg, 0x88);
1231 auto G = __lsx_vshuf4i_w(rg, 0xDD);
1232 *r = sk_unaligned_load<U16>(&
R);
1233 *g = sk_unaligned_load<U16>(&
G);
1237 U32 rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r));
1238 __lsx_vst(rg, ptr, 0);
1242 __m128i _01 = __lsx_vld(ptr, 0),
1243 _23 = __lsx_vld(ptr, 16);
1245 auto _02 = __lsx_vilvl_h(_23, _01),
1246 _13 = __lsx_vilvh_h(_23, _01);
1248 auto rg = __lsx_vilvl_h(_13, _02),
1249 ba = __lsx_vilvh_h(_13, _02);
1251 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
1252 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
1253 *
b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
1254 *
a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
1258 auto rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r)),
1259 ba = __lsx_vilvl_h(widen_cast<__m128i>(
a), widen_cast<__m128i>(
b));
1261 __lsx_vst(__lsx_vilvl_w(ba, rg), ptr, 0);
1262 __lsx_vst(__lsx_vilvh_w(ba, rg), ptr, 16);
1266 F _0 = (
F)__lsx_vld(ptr, 0),
1267 _1 = (
F)__lsx_vld(ptr, 16),
1268 _2 = (
F)__lsx_vld(ptr, 32),
1269 _3 = (
F)__lsx_vld(ptr, 48);
1270 _LSX_TRANSPOSE4_S(_0,_1,_2,_3);
1278 _LSX_TRANSPOSE4_S(r,g,
b,
a);
1279 __lsx_vst(r, ptr, 0);
1280 __lsx_vst(g, ptr, 16);
1281 __lsx_vst(
b, ptr, 32);
1282 __lsx_vst(
a, ptr, 48);
1298#if defined(__clang__) || defined(JUMPER_IS_SCALAR)
1303SI constexpr F F_(
float x) {
return x -
F(); }
1312#if !defined(JUMPER_IS_SCALAR)
1336#if defined(JUMPER_IS_SCALAR)
1344 SI F cast64(
U64 v) {
return __builtin_convertvector( v,
F); }
1350#if !defined(JUMPER_IS_SCALAR)
1361 F e =
cast(sk_bit_cast<U32>(
x)) * (1.0f / (1<<23));
1364 F m = sk_bit_cast<F>((sk_bit_cast<U32>(
x) & 0x007fffff) | 0x3f000000);
1366 return nmad(m, 1.498030302f, e - 124.225514990f) - 1.725879990f / (0.3520887068f + m);
1370 const float ln2 = 0.69314718f;
1375 constexpr float kInfinityBits = 0x7f800000;
1378 F approx =
nmad(f, 1.490129070f,
x + 121.274057500f);
1379 approx += 27.728023300f / (4.84252568f - f);
1380 approx *= 1.0f * (1<<23);
1381 approx =
min(
max(approx,
F0),
F_(kInfinityBits));
1383 return sk_bit_cast<F>(
round(approx));
1387 const float log2_e = 1.4426950408889634074f;
1395#if !defined(JUMPER_IS_SCALAR)
1396SI F approx_powf(
F x,
float y) {
return approx_powf(
x,
F_(
y)); }
1400#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64)
1401 return vcvt_f32_f16((float16x4_t)
h);
1403#elif defined(JUMPER_IS_SKX)
1404 return _mm512_cvtph_ps((__m256i)
h);
1406#elif defined(JUMPER_IS_HSW)
1407 return _mm256_cvtph_ps((__m128i)
h);
1416 auto denorm = (
I32)em < 0x0400;
1418 , sk_bit_cast<F>( (
s<<16) + (em<<13) + ((127-15)<<23) ));
1423#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64)
1424 return (
U16)vcvt_f16_f32(f);
1426#elif defined(JUMPER_IS_SKX)
1427 return (
U16)_mm512_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1429#elif defined(JUMPER_IS_HSW)
1430 return (
U16)_mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1434 U32 sem = sk_bit_cast<U32>(f),
1435 s = sem & 0x80000000,
1439 auto denorm = (
I32)em < 0x38800000;
1441 , (
I32)((
s>>16) + (em>>13) - ((127-15)<<10))));
1446 size_t dx,
size_t dy,
size_t tail) {
1450 const ptrdiff_t
offset = patch.info.bytesPerPixel * (dy * ctx->
stride + dx);
1451 if (patch.info.load) {
1452 void* ctxData = SkTAddOffset<void>(ctx->
pixels,
offset);
1453 memcpy(patch.scratch, ctxData, patch.info.bytesPerPixel * tail);
1457 void* scratchFakeBase = SkTAddOffset<void>(patch.scratch, -
offset);
1458 patch.backup = ctx->
pixels;
1459 ctx->
pixels = scratchFakeBase;
1464 size_t dx,
size_t dy,
size_t tail) {
1469 ctx->
pixels = patch.backup;
1470 patch.backup =
nullptr;
1472 const ptrdiff_t
offset = patch.info.bytesPerPixel * (dy * ctx->
stride + dx);
1473 if (patch.info.store) {
1474 void* ctxData = SkTAddOffset<void>(ctx->
pixels,
offset);
1475 memcpy(ctxData, patch.scratch, patch.info.bytesPerPixel * tail);
1480#if defined(JUMPER_IS_SCALAR) || defined(JUMPER_IS_SSE2)
1486 SI F rcp_fast(
F v) {
return rcp_approx(v); }
1491static constexpr size_t N =
sizeof(
F) /
sizeof(
float);
1497#if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
1500 #define ABI __attribute__((pcs("aapcs-vfp")))
1501 #define JUMPER_NARROW_STAGES 1
1502#elif defined(_MSC_VER)
1505 #define ABI __vectorcall
1506 #define JUMPER_NARROW_STAGES 1
1507#elif defined(__x86_64__) || defined(SK_CPU_ARM64) || defined(SK_CPU_LOONGARCH)
1510 #define JUMPER_NARROW_STAGES 0
1515 #define JUMPER_NARROW_STAGES 1
1518#if JUMPER_NARROW_STAGES
1531 size_t xlimit,
size_t ylimit,
1534 uint8_t* tailPointer) {
1535 uint8_t unreferencedTail;
1537 tailPointer = &unreferencedTail;
1540 const size_t x0 = dx;
1541 std::byte*
const base =
nullptr;
1542 for (; dy < ylimit; dy++) {
1543 #if JUMPER_NARROW_STAGES
1545 while (
params.dx +
N <= xlimit) {
1549 if (
size_t tail = xlimit -
params.dx) {
1550 *tailPointer = tail;
1554 *tailPointer = 0xFF;
1558 while (dx +
N <= xlimit) {
1559 start(program,dx,dy,
base,
F0,
F0,
F0,
F0,
F0,
F0,
F0,
F0);
1562 if (
size_t tail = xlimit - dx) {
1563 *tailPointer = tail;
1565 start(program,dx,dy,
base,
F0,
F0,
F0,
F0,
F0,
F0,
F0,
F0);
1567 *tailPointer = 0xFF;
1574 #define JUMPER_MUSTTAIL [[clang::musttail]]
1576 #define JUMPER_MUSTTAIL
1579#if JUMPER_NARROW_STAGES
1580 #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL) \
1581 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1582 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1583 static void ABI name(Params* params, SkRasterPipelineStage* program, \
1584 F r, F g, F b, F a) { \
1585 OFFSET name##_k(Ctx{program}, params->dx,params->dy,params->base, \
1586 r,g,b,a, params->dr, params->dg, params->db, params->da); \
1588 auto fn = (Stage)program->fn; \
1589 MUSTTAIL return fn(params, program, r,g,b,a); \
1591 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1592 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1594 #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL) \
1595 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1596 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1597 static void ABI name(SkRasterPipelineStage* program, size_t dx, size_t dy, \
1598 std::byte* base, F r, F g, F b, F a, F dr, F dg, F db, F da) { \
1599 OFFSET name##_k(Ctx{program}, dx,dy,base, r,g,b,a, dr,dg,db,da); \
1601 auto fn = (Stage)program->fn; \
1602 MUSTTAIL return fn(program, dx,dy,base, r,g,b,a, dr,dg,db,da); \
1604 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1605 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1610#define STAGE(name, arg) \
1611 DECLARE_STAGE(name, arg, void, ++program, , )
1616#define STAGE_TAIL(name, arg) \
1617 DECLARE_STAGE(name, arg, void, ++program, , JUMPER_MUSTTAIL)
1620#define STAGE_BRANCH(name, arg) \
1621 DECLARE_STAGE(name, arg, int, , program +=, JUMPER_MUSTTAIL)
1625#if JUMPER_NARROW_STAGES
1657#if JUMPER_NARROW_STAGES
1664 ctx->
stage =
nullptr;
1666 program = ctx->
stage;
1669 r = sk_unaligned_load<F>(ctx->
r );
1670 g = sk_unaligned_load<F>(ctx->
g );
1671 b = sk_unaligned_load<F>(ctx->
b );
1672 a = sk_unaligned_load<F>(ctx->
a );
1673 params->dr = sk_unaligned_load<F>(ctx->
dr);
1674 params->dg = sk_unaligned_load<F>(ctx->
dg);
1675 params->db = sk_unaligned_load<F>(ctx->
db);
1676 params->da = sk_unaligned_load<F>(ctx->
da);
1693 ctx->
stage = program;
1697 size_t dx,
size_t dy, std::byte*
base,
1703 ctx->
stage =
nullptr;
1704 next(program, dx, dy,
base, r, g,
b,
a,
dr,
dg, db,
da);
1705 program = ctx->
stage;
1708 r = sk_unaligned_load<F>(ctx->
r );
1709 g = sk_unaligned_load<F>(ctx->
g );
1710 b = sk_unaligned_load<F>(ctx->
b );
1711 a = sk_unaligned_load<F>(ctx->
a );
1712 dr = sk_unaligned_load<F>(ctx->
dr);
1713 dg = sk_unaligned_load<F>(ctx->
dg);
1714 db = sk_unaligned_load<F>(ctx->
db);
1715 da = sk_unaligned_load<F>(ctx->
da);
1721 size_t dx,
size_t dy, std::byte*
base,
1733 ctx->
stage = program;
1740template <
typename V,
typename T>
1742 return sk_unaligned_load<V>(src);
1745template <
typename V,
typename T>
1758 *r =
cast(wide & (31<<11)) * (1.0f / (31<<11));
1759 *g =
cast(wide & (63<< 5)) * (1.0f / (63<< 5));
1760 *
b =
cast(wide & (31<< 0)) * (1.0f / (31<< 0));
1764 *r =
cast(wide & (15<<12)) * (1.0f / (15<<12));
1765 *g =
cast(wide & (15<< 8)) * (1.0f / (15<< 8));
1766 *
b =
cast(wide & (15<< 4)) * (1.0f / (15<< 4));
1767 *
a =
cast(wide & (15<< 0)) * (1.0f / (15<< 0));
1770 *r =
cast((_8888 ) & 0xff) * (1/255.0f);
1771 *g =
cast((_8888 >> 8) & 0xff) * (1/255.0f);
1772 *
b =
cast((_8888 >> 16) & 0xff) * (1/255.0f);
1773 *
a =
cast((_8888 >> 24) ) * (1/255.0f);
1777 *r =
cast((wide ) & 0xff) * (1/255.0f);
1778 *g =
cast((wide >> 8) & 0xff) * (1/255.0f);
1781 *r =
cast((
rgba ) & 0x3ff) * (1/1023.0f);
1782 *g =
cast((
rgba >> 10) & 0x3ff) * (1/1023.0f);
1783 *
b =
cast((
rgba >> 20) & 0x3ff) * (1/1023.0f);
1787 static constexpr float min = -0.752941f;
1788 static constexpr float max = 1.25098f;
1789 static constexpr float range =
max -
min;
1790 *r =
cast((
rgba ) & 0x3ff) * (1/1023.0f) * range +
min;
1791 *g =
cast((
rgba >> 10) & 0x3ff) * (1/1023.0f) * range +
min;
1792 *
b =
cast((
rgba >> 20) & 0x3ff) * (1/1023.0f) * range +
min;
1796 *r = (
cast64((_10x6 >> 6) & 0x3ff) - 384.f) / 510.f;
1797 *g = (
cast64((_10x6 >> 22) & 0x3ff) - 384.f) / 510.f;
1798 *
b = (
cast64((_10x6 >> 38) & 0x3ff) - 384.f) / 510.f;
1799 *
a = (
cast64((_10x6 >> 54) & 0x3ff) - 384.f) / 510.f;
1802 *r =
cast64((_10x6 >> 6) & 0x3ff) * (1/1023.0f);
1803 *g =
cast64((_10x6 >> 22) & 0x3ff) * (1/1023.0f);
1804 *
b =
cast64((_10x6 >> 38) & 0x3ff) * (1/1023.0f);
1805 *
a =
cast64((_10x6 >> 54) & 0x3ff) * (1/1023.0f);
1808 *r =
cast((_1616 ) & 0xffff) * (1/65535.0f);
1809 *g =
cast((_1616 >> 16) & 0xffff) * (1/65535.0f);
1812 *r =
cast64((_16161616 ) & 0xffff) * (1/65535.0f);
1813 *g =
cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f);
1814 *
b =
cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f);
1815 *
a =
cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f);
1819template <
typename T>
1826 F inclusive = sk_bit_cast<F>(sk_bit_cast<U32>(limit) - 1);
1827 return min(
max(0.0f, v), inclusive);
1832 const F inclusiveZ =
F_(std::numeric_limits<float>::min()),
1833 inclusiveL = sk_bit_cast<F>( sk_bit_cast<U32>(
F_(limit)) - 1 );
1834 return min(
max(inclusiveZ, v), inclusiveL);
1843 constexpr float A = 6.28230858f;
1844 constexpr float B = -41.1693687f;
1845 constexpr float C = 74.4388885f;
1851 constexpr float one_over_pi2 = 1 / (2 *
SK_FloatPI);
1852 x =
mad(
x, -one_over_pi2, 0.25f);
1858 constexpr float one_over_pi2 = 1 / (2 *
SK_FloatPI);
1889 x =
mad(fract(
mad(
x, 1/Pi, 0.5f)), Pi, -Pi/2);
1891 I32 neg = (
x < 0.0f);
1895 I32 use_quotient = (
x > (Pi/8));
1899 const float c4 = 62 / 2835.0f;
1900 const float c3 = 17 / 315.0f;
1901 const float c2 = 2 / 15.0f;
1902 const float c1 = 1 / 3.0f;
1903 const float c0 = 1.0f;
1905 x *=
mad(x2,
mad(x2,
mad(x2,
mad(x2, c4, c3), c2), c1), c0);
1921 const float c4 = 0.14130025741326729f;
1922 const float c3 = -0.34312835980675116f;
1923 const float c2 = -0.016172900528248768f;
1924 const float c1 = 1.0037696976200385f;
1925 const float c0 = -0.00014758242182738969f;
1931 I32 neg = (
x < 0.0f);
1933 I32 flip = (
x > 1.0f);
1945 I32 neg = (
x < 0.0f);
1947 const float c3 = -0.0187293f;
1948 const float c2 = 0.0742610f;
1949 const float c1 = -0.2121144f;
1950 const float c0 = 1.5707288f;
1971 I32 neg = (arg < 0.0f);
1986template <
typename T>
2011#if defined(JUMPER_IS_SCALAR)
2021#if defined(JUMPER_IS_SCALAR)
2027SI uint32_t select_lane(
U32 data,
int lane) {
return data[lane]; }
2034 static constexpr float iota[] = {
2035 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
2036 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
2043 r =
cast(
U32_(dx)) + sk_unaligned_load<F>(iota);
2051 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
2054 U32 X =
U32_(dx) + sk_unaligned_load<U32>(iota),
2065 U32 M = (
Y & 1) << 5 | (
X & 1) << 4
2066 | (
Y & 2) << 2 | (
X & 2) << 1
2067 | (
Y & 4) >> 1 | (
X & 4) >> 2;
2072 F dither =
cast(
M) * (2/128.0f) - (63/128.0f);
2116 r = sk_unaligned_load<F>(ptr + 0*
N);
2117 g = sk_unaligned_load<F>(ptr + 1*
N);
2118 b = sk_unaligned_load<F>(ptr + 2*
N);
2119 a = sk_unaligned_load<F>(ptr + 3*
N);
2136 r = sk_unaligned_load<F>(ptr + 0*
N);
2137 g = sk_unaligned_load<F>(ptr + 1*
N);
2146 dr = sk_unaligned_load<F>(ptr + 0*
N);
2147 dg = sk_unaligned_load<F>(ptr + 1*
N);
2148 db = sk_unaligned_load<F>(ptr + 2*
N);
2149 da = sk_unaligned_load<F>(ptr + 3*
N);
2161#define BLEND_MODE(name) \
2162 SI F name##_channel(F s, F d, F sa, F da); \
2163 STAGE(name, NoCtx) { \
2164 r = name##_channel(r,dr,a,da); \
2165 g = name##_channel(g,dg,a,da); \
2166 b = name##_channel(b,db,a,da); \
2167 a = name##_channel(a,da,a,da); \
2169 SI F name##_channel(F s, F d, F sa, F da)
2193#define BLEND_MODE(name) \
2194 SI F name##_channel(F s, F d, F sa, F da); \
2195 STAGE(name, NoCtx) { \
2196 r = name##_channel(r,dr,a,da); \
2197 g = name##_channel(g,dg,a,da); \
2198 b = name##_channel(b,db,a,da); \
2199 a = mad(da, inv(a), a); \
2201 SI F name##_channel(F s, F d, F sa, F da)
2236 F darkSrc =
d*(sa + (s2 - sa)*(1.0f - m)),
2237 darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,
2238 liteDst =
sqrt_(m) - m,
2267 F diff = l -
lum(*r, *g, *
b);
2281 l =
lum(*r, *g, *
b),
2282 mn_scale = ( l) *
rcp_fast(l - mn),
2287 *r =
clip_channel(*r, l, clip_low, clip_high, mn_scale, mx_scale);
2288 *g =
clip_channel(*g, l, clip_low, clip_high, mn_scale, mx_scale);
2289 *
b =
clip_channel(*
b, l, clip_low, clip_high, mn_scale, mx_scale);
2348 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2350 U32 dst = load<U32>(ptr);
2351 dr =
cast((dst ) & 0xff);
2352 dg =
cast((dst >> 8) & 0xff);
2353 db =
cast((dst >> 16) & 0xff);
2394STAGE(unbounded_set_rgb,
const float* rgb) {
2441 float inf = sk_bit_cast<float>(0x7f800000);
2448 float inf = sk_bit_cast<float>(0x7f800000);
2467 (r-g)*d_rcp + 4.0f)));
2469 F l = (mx + mn) * 0.5f;
2483 c = (1.0f -
abs_(2.0f * l - 1)) *
s;
2485 auto hue_to_rgb = [&](
F hue) {
2487 return (q - 0.5f) * c + l;
2490 r = hue_to_rgb(
h + 0.0f/3.0f);
2491 g = hue_to_rgb(
h + 2.0f/3.0f);
2492 b = hue_to_rgb(
h + 1.0f/3.0f);
2498 constexpr float k = 24389 / 27.0f;
2499 constexpr float e = 216 / 24389.0f;
2502 f[1] = (r + 16) * (1 / 116.0f);
2503 f[0] = (g * (1 / 500.0f)) + f[1];
2504 f[2] = f[1] - (
b * (1 / 200.0f));
2506 F f_cubed[3] = { f[0]*f[0]*f[0], f[1]*f[1]*f[1], f[2]*f[2]*f[2] };
2509 if_then_else(f_cubed[0] > e, f_cubed[0], (116 * f[0] - 16) * (1 / k)),
2511 if_then_else(f_cubed[2] > e, f_cubed[2], (116 * f[2] - 16) * (1 / k))
2514 constexpr float D50[3] = { 0.3457f / 0.3585f, 1.0f, (1.0f - 0.3457f - 0.3585f) / 0.3585f };
2521 F l_ = r + 0.3963377774f * g + 0.2158037573f *
b,
2522 m_ = r - 0.1055613458f * g - 0.0638541728f *
b,
2523 s_ = r - 0.0894841775f * g - 1.2914855480f *
b;
2529 r = +4.0767416621f * l - 3.3077115913f * m + 0.2309699292f *
s;
2530 g = -1.2684380046f * l + 2.6097574011f * m - 0.3413193965f *
s;
2531 b = -0.0041960863f * l - 0.7034186147f * m + 1.7076147010f *
s;
2545 r = +4.0767416621f * l - 3.3077115913f * m + 0.2309699292f *
s;
2546 g = -1.2684380046f * l + 2.6097574011f * m - 0.3413193965f *
s;
2547 b = -0.0041960863f * l - 0.7034186147f * m + 1.7076147010f *
s;
2561 g =
C *
cos_(hueRadians);
2562 b =
C *
sin_(hueRadians);
2578 mod_(0 +
h * (1 / 30.0f), 12),
2579 mod_(8 +
h * (1 / 30.0f), 12),
2580 mod_(4 +
h * (1 / 30.0f), 12)
2584 l -
a *
max(-1.0f,
min(
min(k[0] - 3.0f, 9.0f - k[0]), 1.0f)),
2585 l -
a *
max(-1.0f,
min(
min(k[1] - 3.0f, 9.0f - k[1]), 1.0f)),
2586 l -
a *
max(-1.0f,
min(
min(k[2] - 3.0f, 9.0f - k[2]), 1.0f))
2601 F gray = g / (g +
b);
2604 rgb.
r = rgb.
r * (1 - g -
b) + g;
2605 rgb.
g = rgb.
g * (1 - g -
b) + g;
2606 rgb.
b = rgb.
b * (1 - g -
b) + g;
2608 auto isGray = (g +
b) >= 1;
2628 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2630 auto scales = load<U8>(ptr);
2639 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2642 from_565(load<U16>(ptr), &cr, &cg, &cb);
2653 return mad(to-from, t, from);
2662STAGE(scale_native,
const float scales[]) {
2663 auto c = sk_unaligned_load<F>(scales);
2669STAGE(lerp_native,
const float scales[]) {
2670 auto c = sk_unaligned_load<F>(scales);
2677 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2679 auto scales = load<U8>(ptr);
2688 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2691 from_565(load<U16>(ptr), &cr, &cg, &cb);
2702 auto mptr = ptr_at_xy<const uint8_t>(&ctx->
mul, dx,dy),
2703 aptr = ptr_at_xy<const uint8_t>(&ctx->
add, dx,dy);
2708 r =
mad(r, mul, add);
2709 g =
mad(g, mul, add);
2710 b =
mad(
b, mul, add);
2721 U32 bits = sk_bit_cast<U32>(
x);
2722 *
sign = bits & 0x80000000;
2723 return sk_bit_cast<F>(bits ^ *
sign);
2727 return sk_bit_cast<F>(
sign | sk_bit_cast<U32>(
x));
2731 auto fn = [&](
F v) {
2745 auto fn = [&](
F v) {
2756 auto fn = [&](
F v) {
2772 auto fn = [&](
F v) {
2776 const float R = ctx->
a,
G = ctx->
b,
2777 a = ctx->
c,
b = ctx->
d, c = ctx->
e,
2791 auto fn = [&](
F v) {
2795 const float R = ctx->
a,
G = ctx->
b,
2796 a = ctx->
c,
b = ctx->
d, c = ctx->
e,
2811 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2817 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2829 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
2835 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
2842 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2848 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2854 const uint16_t* ptr;
2860 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2869 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2873 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2877 const uint16_t* ptr;
2882 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2891 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2895 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2899 const uint32_t* ptr;
2904 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2914 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2915 from_88(load<U16>(ptr), &r, &g);
2920 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2926 const uint16_t* ptr;
2933 auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy);
2939 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2944 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2949 const uint16_t* ptr;
2955 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2962 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2968 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2974 const uint32_t* ptr;
2981 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2989 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2993 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2997 const uint64_t* ptr;
3002 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3013 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3017 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3021 const uint64_t* ptr;
3026 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3038 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3042 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3046 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3050 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3054 const uint32_t* ptr;
3059 const uint32_t* ptr;
3064 const uint64_t* ptr;
3069 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3073 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3077 static constexpr float min = -0.752941f;
3078 static constexpr float max = 1.25098f;
3079 static constexpr float range =
max -
min;
3080 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3090 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
3099 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
3100 static constexpr float min = -0.752941f;
3101 static constexpr float max = 1.25098f;
3102 static constexpr float range =
max -
min;
3111 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
3114 load4((
const uint16_t*)ptr, &
R,&
G,&
B,&
A);
3121 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
3124 load4((
const uint16_t*)ptr, &
R,&
G,&
B,&
A);
3131 const uint64_t* ptr;
3133 auto px =
gather(ptr, ix);
3136 load4((
const uint16_t*)&px, &
R,&
G,&
B,&
A);
3143 auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
3144 store4((uint16_t*)ptr, to_half(r)
3151 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
3153 U16 A = load<U16>((
const uint16_t*)ptr);
3160 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
3162 U16 A = load<U16>((
const uint16_t*)ptr);
3167 const uint16_t* ptr;
3170 a = from_half(
gather(ptr, ix));
3173 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
3178 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
3181 load2((
const uint16_t*)ptr, &
R, &
G);
3188 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
3191 load2((
const uint16_t*)ptr, &
R, &
G);
3198 const uint32_t* ptr;
3200 auto px =
gather(ptr, ix);
3203 load2((
const uint16_t*)&px, &
R, &
G);
3210 auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy);
3211 store2((uint16_t*)ptr, to_half(r)
3216 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
3220 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
3226 r =
gather(ptr, 4*ix + 0);
3227 g =
gather(ptr, 4*ix + 1);
3232 auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy);
3240 auto limit = ctx->
scale;
3244 auto u = v -
floor_(v*invLimit*0.5f)*2*limit;
3249 auto m = u - 2*
s*(u - limit);
3254 return sk_bit_cast<F>(sk_bit_cast<U32>(m) + ctx->
mirrorBiasDir*biasInUlps);
3282 auto cond = ((0 < r) & (r <
w)) | (r == e);
3288 auto cond = ((0 < g) & (g <
h)) | (g == e);
3296 auto cond = (((0 < r) & (r <
w)) | (r == ex))
3297 & (((0 < g) & (g <
h)) | (g == ey));
3301 auto mask = sk_unaligned_load<U32>(ctx->
mask);
3302 r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask);
3303 g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask);
3304 b = sk_bit_cast<F>(sk_bit_cast<U32>(
b) & mask);
3305 a = sk_bit_cast<F>(sk_bit_cast<U32>(
a) & mask);
3326 a = r*0.2126f + g*0.7152f +
b*0.0722f;
3330 r = g =
b = r*0.2126f + g*0.7152f +
b*0.0722f;
3333STAGE(matrix_translate,
const float* m) {
3337STAGE(matrix_scale_translate,
const float* m) {
3338 r =
mad(r,m[0], m[2]);
3339 g =
mad(g,m[1], m[3]);
3342 auto R =
mad(r,m[0],
mad(g,m[1], m[2])),
3343 G =
mad(r,m[3],
mad(g,m[4], m[5]));
3348 auto R =
mad(r,m[0],
mad(g,m[3],
b*m[6])),
3349 G =
mad(r,m[1],
mad(g,m[4],
b*m[7])),
3350 B =
mad(r,m[2],
mad(g,m[5],
b*m[8]));
3356 auto R =
mad(r,m[0],
mad(g,m[3],
mad(
b,m[6], m[ 9]))),
3364 auto R =
mad(r,m[ 0],
mad(g,m[ 1],
mad(
b,m[ 2],
mad(
a,m[ 3], m[ 4])))),
3377 r =
mad(
X, m[0],
mad(
Y, m[4], m[ 8]));
3378 g =
mad(
X, m[1],
mad(
Y, m[5], m[ 9]));
3382STAGE(matrix_perspective,
const float* m) {
3384 auto R =
mad(r,m[0],
mad(g,m[1], m[2])),
3385 G =
mad(r,m[3],
mad(g,m[4], m[5])),
3386 Z =
mad(r,m[6],
mad(g,m[7], m[8]));
3392 F* r,
F* g,
F*
b,
F*
a) {
3393 F fr, br, fg, bg, fb, bb, fa, ba;
3394#if defined(JUMPER_IS_HSW)
3396 fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
fs[0]), (__m256i)idx);
3397 br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
bs[0]), (__m256i)idx);
3398 fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
fs[1]), (__m256i)idx);
3399 bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
bs[1]), (__m256i)idx);
3400 fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
fs[2]), (__m256i)idx);
3401 bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
bs[2]), (__m256i)idx);
3402 fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
fs[3]), (__m256i)idx);
3403 ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
bs[3]), (__m256i)idx);
3405#elif defined(JUMPER_IS_LASX)
3407 fr = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
fs[0], 0), idx);
3408 br = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
bs[0], 0), idx);
3409 fg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
fs[1], 0), idx);
3410 bg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
bs[1], 0), idx);
3411 fb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
fs[2], 0), idx);
3412 bb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
bs[2], 0), idx);
3413 fa = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
fs[3], 0), idx);
3414 ba = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
bs[3], 0), idx);
3416#elif defined(JUMPER_IS_LSX)
3418 __m128i zero = __lsx_vldi(0);
3419 fr = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
fs[0], 0));
3420 br = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
bs[0], 0));
3421 fg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
fs[1], 0));
3422 bg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
bs[1], 0));
3423 fb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
fs[2], 0));
3424 bb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
bs[2], 0));
3425 fa = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
fs[3], 0));
3426 ba = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
bs[3], 0));
3440 *r =
mad(t, fr, br);
3441 *g =
mad(t, fg, bg);
3442 *
b =
mad(t, fb, bb);
3443 *
a =
mad(t, fa, ba);
3457 for (
size_t i = 1; i < c->
stopCount; i++) {
3466 r =
mad(t, c->
f[0], c->
b[0]);
3467 g =
mad(t, c->
f[1], c->
b[1]);
3468 b =
mad(t, c->
f[2], c->
b[2]);
3469 a =
mad(t, c->
f[3], c->
b[3]);
3478 F slope =
min(xabs, yabs)/
max(xabs, yabs);
3479 F s = slope * slope;
3486 * (0.15912117063999176025390625f +
s
3487 * (-5.185396969318389892578125e-2f +
s
3488 * (2.476101927459239959716796875e-2f +
s
3489 * (-7.0547382347285747528076171875e-3f))));
3509 F x = r,
y = g, &t = r;
3514 F x = r,
y = g, &t = r;
3519 F x = r,
y = g, &t = r;
3524 F x = r,
y = g, &t = r;
3529 F x = r,
y = g, &t = r;
3557STAGE(apply_vector_mask,
const uint32_t* ctx) {
3558 const U32 mask = sk_unaligned_load<U32>(ctx);
3559 r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask);
3560 g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask);
3561 b = sk_bit_cast<F>(sk_bit_cast<U32>(
b) & mask);
3562 a = sk_bit_cast<F>(sk_bit_cast<U32>(
a) & mask);
3569 F fx = fract(*r + 0.5f),
3570 fy = fract(*g + 0.5f);
3583 * sk_unaligned_load<F>(c->
scaley);
3595template <
int kScale>
3597 *
x = sk_unaligned_load<F>(ctx->
x) + (
kScale * 0.5f);
3598 F fx = sk_unaligned_load<F>(ctx->
fx);
3601 if (
kScale == -1) { scalex = 1.0f - fx; }
3602 if (
kScale == +1) { scalex = fx; }
3605template <
int kScale>
3607 *
y = sk_unaligned_load<F>(ctx->
y) + (
kScale * 0.5f);
3608 F fy = sk_unaligned_load<F>(ctx->
fy);
3611 if (
kScale == -1) { scaley = 1.0f - fy; }
3612 if (
kScale == +1) { scaley = fy; }
3638template <
int kScale>
3640 *
x = sk_unaligned_load<F>(ctx->
x) + (
kScale * 0.5f);
3643 if (
kScale == -3) { scalex = sk_unaligned_load<F>(ctx->
wx[0]); }
3644 if (
kScale == -1) { scalex = sk_unaligned_load<F>(ctx->
wx[1]); }
3645 if (
kScale == +1) { scalex = sk_unaligned_load<F>(ctx->
wx[2]); }
3646 if (
kScale == +3) { scalex = sk_unaligned_load<F>(ctx->
wx[3]); }
3649template <
int kScale>
3651 *
y = sk_unaligned_load<F>(ctx->
y) + (
kScale * 0.5f);
3654 if (
kScale == -3) { scaley = sk_unaligned_load<F>(ctx->
wy[0]); }
3655 if (
kScale == -1) { scaley = sk_unaligned_load<F>(ctx->
wy[1]); }
3656 if (
kScale == +1) { scaley = sk_unaligned_load<F>(ctx->
wy[2]); }
3657 if (
kScale == +3) { scaley = sk_unaligned_load<F>(ctx->
wy[3]); }
3666 F fx = sk_unaligned_load<F>(ctx->
fx);
3672 F fy = sk_unaligned_load<F>(ctx->
fy);
3694#ifdef SK_CPU_BENDIAN
3695 U32 sampleLo = sample >> 16;
3696 U32 sampleHi = sample & 0xFFFF;
3698 U32 sampleLo = sample & 0xFFFF;
3699 U32 sampleHi = sample >> 16;
3703 F vecX =
mad(
cast(sampleLo), 2.0f / 65535.0f, -1.0f);
3704 F vecY =
mad(
cast(sampleHi), 2.0f / 65535.0f, -1.0f);
3719 for (
int octave = 0; octave < ctx->
numOctaves; ++octave) {
3721 F floorValX =
floor_(noiseVecX);
3722 F floorValY =
floor_(noiseVecY);
3723 F ceilValX = floorValX + 1.0f;
3724 F ceilValY = floorValY + 1.0f;
3725 F fractValX = noiseVecX - floorValX;
3726 F fractValY = noiseVecY - floorValY;
3730 floorValX -= sk_bit_cast<F>(
cond_to_mask(floorValX >= stitchDataX) &
3731 sk_bit_cast<I32>(stitchDataX));
3732 floorValY -= sk_bit_cast<F>(
cond_to_mask(floorValY >= stitchDataY) &
3733 sk_bit_cast<I32>(stitchDataY));
3734 ceilValX -= sk_bit_cast<F>(
cond_to_mask(ceilValX >= stitchDataX) &
3735 sk_bit_cast<I32>(stitchDataX));
3736 ceilValY -= sk_bit_cast<F>(
cond_to_mask(ceilValY >= stitchDataY) &
3737 sk_bit_cast<I32>(stitchDataY));
        latticeLookup = (U32)(iround(ceilValX)) & 0xFF;

        U32 b00 = (U32)(iround(latticeIdxX + floorValY)) & 0xFF;
        U32 b10 = (U32)(iround(latticeIdxY + floorValY)) & 0xFF;
        U32 b01 = (U32)(iround(latticeIdxX + ceilValY)) & 0xFF;
        U32 b11 = (U32)(iround(latticeIdxY + ceilValY)) & 0xFF;

        F smoothX = fractValX * fractValX * (3.0f - 2.0f * fractValX);
        F smoothY = fractValY * fractValY * (3.0f - 2.0f * fractValY);
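        // t*t*(3 - 2t) is the classic smoothstep (Hermite fade): it agrees with t at 0
        // and 1 but has zero slope at both ends, which hides the lattice seams.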
        const uint32_t* channelNoiseData = reinterpret_cast<const uint32_t*>(ctx->noiseData);
        for (int channel = 0; channel < 4; ++channel) {
            U32 sample00 = gather(channelNoiseData, b00);
            U32 sample10 = gather(channelNoiseData, b10);
            U32 sample01 = gather(channelNoiseData, b01);
            U32 sample11 = gather(channelNoiseData, b11);
            channelNoiseData += 256;

            F A = lerp(u, v, smoothX);

            F B = lerp(u, v, smoothX);
        stitchDataX *= 2.0f;
        stitchDataY *= 2.0f;
    }

    r = mad(r, 0.5f, 0.5f);
    g = mad(g, 0.5f, 0.5f);
    b = mad(b, 0.5f, 0.5f);
    a = mad(a, 0.5f, 0.5f);
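    // Perlin noise values come back in [-1,1]; mad(v, 0.5f, 0.5f) remaps them to [0,1].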
    r = sk_unaligned_load<F>(ctx->x) * ctx->scaleX;
    g = sk_unaligned_load<F>(ctx->y) * ctx->scaleY;
#define execution_mask()        sk_bit_cast<I32>(a)
#define update_execution_mask() a = sk_bit_cast<F>(sk_bit_cast<I32>(r) & \
                                                   sk_bit_cast<I32>(g) & \
                                                   sk_bit_cast<I32>(b))
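// The SkSL interpreter keeps its control-flow state in the color registers: r, g, and b
// hold the loop/condition/return masks, and `a` caches their intersection (the set of
// lanes still executing) whenever update_execution_mask() runs.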
    uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};

    r = g = b = a = sk_bit_cast<F>(mask);

    static constexpr float iota[] = {
        0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
        8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
    };

    dst[0] = cast(U32_(dx)) + sk_unaligned_load<F>(iota);

    F temp[4] = {r, g, b, a};
    r = sk_unaligned_load<F>(ctx);

    r = sk_bit_cast<F>(ptr[0] & ptr[1]);

    r = sk_bit_cast<F>(ptr[0] & ~ptr[1]);

    g = sk_unaligned_load<F>(ctx);

    g = sk_bit_cast<F>(sk_bit_cast<I32>(g) | ptr[0]);

    g = sk_bit_cast<F>(sk_bit_cast<I32>(g) & ptr[0]);
    I32* actualValue = (I32*)(base + ctx.offset);

    g = sk_bit_cast<F>(sk_bit_cast<I32>(g) | caseMatches);

    I32* defaultMask = actualValue + 1;
    *defaultMask &= ~caseMatches;
    b = sk_unaligned_load<F>(ctx);

    uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};

    if (any(*traceMask)) {
    for (size_t lane = 0; lane < N; ++lane) {

    indirectOffset = std::min<uint32_t>(indirectOffset, ctx->indirectLimit);
    data += indirectOffset;
    slotIdx += indirectOffset;

    while (numSlots--) {
    const int* src = ctx->src;
    dst[0] = I32_(src[0]);

    const int* src = ctx->src;
    dst[0] = I32_(src[0]);
    dst[1] = I32_(src[1]);

    const int* src = ctx->src;
    dst[0] = I32_(src[0]);
    dst[1] = I32_(src[1]);
    dst[2] = I32_(src[2]);

    const int* src = ctx->src;
    dst[0] = I32_(src[0]);
    dst[1] = I32_(src[1]);
    dst[2] = I32_(src[2]);
    dst[3] = I32_(src[3]);

    dst[0] = dst[1] = value;

    dst[0] = dst[1] = dst[2] = value;

    dst[0] = dst[1] = dst[2] = dst[3] = value;
template <int NumSlots>
SI void copy_n_slots_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(packed);
    F* dst = (F*)(base + ctx.dst);
    F* src = (F*)(base + ctx.src);
    memcpy(dst, src, sizeof(F) * NumSlots);
}

    copy_n_slots_unmasked_fn<1>(packed, base);

    copy_n_slots_unmasked_fn<2>(packed, base);

    copy_n_slots_unmasked_fn<3>(packed, base);

    copy_n_slots_unmasked_fn<4>(packed, base);
template <int NumSlots>
SI void copy_n_immutable_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(packed);

    // Load the scalar values.
    float* src = (float*)(base + ctx.src);
    float values[NumSlots];
    SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
        values[index] = src[index];
    }
    // Broadcast each scalar across every lane of its destination slot.
    F* dst = (F*)(base + ctx.dst);
    SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
        dst[index] = F_(values[index]);
    }
}

    copy_n_immutable_unmasked_fn<1>(packed, base);

    copy_n_immutable_unmasked_fn<2>(packed, base);

    copy_n_immutable_unmasked_fn<3>(packed, base);

    copy_n_immutable_unmasked_fn<4>(packed, base);
template <int NumSlots>

template <int LoopCount, typename OffsetType>
SI void shuffle_fn(std::byte* ptr, OffsetType* offsets, int numSlots) {
    F scratch[16];
    SK_UNROLL for (int count = 0; count < LoopCount; ++count) {
        scratch[count] = *(F*)(ptr + offsets[count]);
    }
    F* dst = (F*)ptr;
    switch (numSlots) {
        case 16: dst[15] = scratch[15]; [[fallthrough]];
        case 15: dst[14] = scratch[14]; [[fallthrough]];
        case 14: dst[13] = scratch[13]; [[fallthrough]];
        case 13: dst[12] = scratch[12]; [[fallthrough]];
        case 12: dst[11] = scratch[11]; [[fallthrough]];
        case 11: dst[10] = scratch[10]; [[fallthrough]];
        case 10: dst[ 9] = scratch[ 9]; [[fallthrough]];
        case  9: dst[ 8] = scratch[ 8]; [[fallthrough]];
        case  8: dst[ 7] = scratch[ 7]; [[fallthrough]];
        case  7: dst[ 6] = scratch[ 6]; [[fallthrough]];
        case  6: dst[ 5] = scratch[ 5]; [[fallthrough]];
        case  5: dst[ 4] = scratch[ 4]; [[fallthrough]];
        case  4: dst[ 3] = scratch[ 3]; [[fallthrough]];
        case  3: dst[ 2] = scratch[ 2]; [[fallthrough]];
        case  2: dst[ 1] = scratch[ 1]; [[fallthrough]];
        case  1: dst[ 0] = scratch[ 0];
    }
}

    shuffle_fn<N>(base + ctx.dst, ctx.offsets, N);
    small_swizzle_fn<1>(packed, base);

    small_swizzle_fn<2>(packed, base);

    small_swizzle_fn<3>(packed, base);

    small_swizzle_fn<4>(packed, base);
template <int NumSlots>
SI void swizzle_copy_masked_fn(I32* dst, const I32* src, uint16_t* offsets, I32 mask) {
    std::byte* dstB = (std::byte*)dst;

    I32* dstS = (I32*)(dstB + *offsets);
    static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    offsets += sk_unaligned_load<U32>(iota);

    const int* src = ctx->src;
        *dst = gather(src, offsets);
    } while (dst != end);

    const int* src = ctx->src;
        *dst = gather(src, offsets);
    } while (dst != end);
    static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    offsets += sk_unaligned_load<U32>(iota);

    int* dst = ctx->dst;
    } while (src != end);

    static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    offsets += sk_unaligned_load<U32>(iota);

    std::byte* dstB = (std::byte*)ctx->dst;
    const uint16_t* swizzle = ctx->offsets;
        int* dst = (int*)(dstB + *swizzle);
    } while (src != end);
template <typename T, void (*ApplyFn)(T*)>
SI void apply_adjacent_unary(T* dst, T* end) {
    do {
        ApplyFn(dst);
        dst += 1;
    } while (dst != end);
}
#if defined(JUMPER_IS_SCALAR)
template <typename T>
SI void cast_to_float_from_fn(T* dst) {
    *dst = sk_bit_cast<T>((F)*dst);
}
SI void cast_to_int_from_fn(F* dst) {
    *dst = sk_bit_cast<F>((I32)*dst);
}
SI void cast_to_uint_from_fn(F* dst) {
    *dst = sk_bit_cast<F>((U32)*dst);
}
#else
template <typename T>
SI void cast_to_float_from_fn(T* dst) {
    *dst = sk_bit_cast<T>(__builtin_convertvector(*dst, F));
}
SI void cast_to_int_from_fn(F* dst) {
    *dst = sk_bit_cast<F>(__builtin_convertvector(*dst, I32));
}
SI void cast_to_uint_from_fn(F* dst) {
    *dst = sk_bit_cast<F>(__builtin_convertvector(*dst, U32));
}
#endif
#define DECLARE_UNARY_FLOAT(name)                                                              \
    STAGE_TAIL(name##_float, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 4); }

#define DECLARE_UNARY_INT(name)                                                                  \
    STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 4); }

#define DECLARE_UNARY_UINT(name)                                                                  \
    STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 4); }

#undef DECLARE_UNARY_FLOAT
#undef DECLARE_UNARY_INT
#undef DECLARE_UNARY_UINT
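// As an illustration, DECLARE_UNARY_FLOAT(ceil) would emit the stages ceil_float,
// ceil_2_floats, ceil_3_floats, and ceil_4_floats, each applying ceil_fn across one to
// four adjacent value slots.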
    F a00 = dst[0], a01 = dst[1],
      a10 = dst[2], a11 = dst[3];
    F det = nmad(a01, a10, a00 * a11),
      invdet = rcp_precise(det);
    dst[0] =  invdet * a11;
    dst[1] = -invdet * a01;
    dst[2] = -invdet * a10;
    dst[3] =  invdet * a00;
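    // Standard 2x2 adjugate-over-determinant inverse: since nmad(a,b,c) computes c - a*b,
    // det = a00*a11 - a01*a10 and the inverse is (1/det) * [ a11 -a01 ; -a10 a00 ].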
    F a00 = dst[0], a01 = dst[1], a02 = dst[2],
      a10 = dst[3], a11 = dst[4], a12 = dst[5],
      a20 = dst[6], a21 = dst[7], a22 = dst[8];
    F b01 = nmad(a12, a21, a22 * a11),
      b11 = nmad(a22, a10, a12 * a20),
      b21 = nmad(a11, a20, a21 * a10);
    F det = mad(a00, b01, mad(a01, b11, a02 * b21)),
      invdet = rcp_precise(det);
    dst[0] = invdet * b01;
    dst[1] = invdet * nmad(a22, a01, a02 * a21);
    dst[2] = invdet * nmad(a02, a11, a12 * a01);
    dst[3] = invdet * b11;
    dst[4] = invdet * nmad(a02, a20, a22 * a00);
    dst[5] = invdet * nmad(a12, a00, a02 * a10);
    dst[6] = invdet * b21;
    dst[7] = invdet * nmad(a21, a00, a01 * a20);
    dst[8] = invdet * nmad(a01, a10, a11 * a00);
    F a00 = dst[0],  a01 = dst[1],  a02 = dst[2],  a03 = dst[3],
      a10 = dst[4],  a11 = dst[5],  a12 = dst[6],  a13 = dst[7],
      a20 = dst[8],  a21 = dst[9],  a22 = dst[10], a23 = dst[11],
      a30 = dst[12], a31 = dst[13], a32 = dst[14], a33 = dst[15];
    F b00 = nmad(a01, a10, a00 * a11),
      b01 = nmad(a02, a10, a00 * a12),
      b02 = nmad(a03, a10, a00 * a13),
      b03 = nmad(a02, a11, a01 * a12),
      b04 = nmad(a03, a11, a01 * a13),
      b05 = nmad(a03, a12, a02 * a13),
      b06 = nmad(a21, a30, a20 * a31),
      b07 = nmad(a22, a30, a20 * a32),
      b08 = nmad(a23, a30, a20 * a33),
      b09 = nmad(a22, a31, a21 * a32),
      b10 = nmad(a23, a31, a21 * a33),
      b11 = nmad(a23, a32, a22 * a33),
      det = mad(b00, b11, b05 * b06) +
            mad(b02, b09, b03 * b08) -
            mad(b01, b10, b04 * b07),
      invdet = rcp_precise(det);
    b00 *= invdet;
    b01 *= invdet;
    b02 *= invdet;
    b03 *= invdet;
    b04 *= invdet;
    b05 *= invdet;
    b06 *= invdet;
    b07 *= invdet;
    b08 *= invdet;
    b09 *= invdet;
    b10 *= invdet;
    b11 *= invdet;
    dst[0]  =  mad(a13, b09, nmad(a12, b10, a11*b11));
    dst[1]  = nmad(a03, b09, nmad(a01, b11, a02*b10));
    dst[2]  =  mad(a33, b03, nmad(a32, b04, a31*b05));
    dst[3]  = nmad(a23, b03, nmad(a21, b05, a22*b04));
    dst[4]  = nmad(a13, b07, nmad(a10, b11, a12*b08));
    dst[5]  =  mad(a03, b07, nmad(a02, b08, a00*b11));
    dst[6]  = nmad(a33, b01, nmad(a30, b05, a32*b02));
    dst[7]  =  mad(a23, b01, nmad(a22, b02, a20*b05));
    dst[8]  =  mad(a13, b06, nmad(a11, b08, a10*b10));
    dst[9]  = nmad(a03, b06, nmad(a00, b10, a01*b08));
    dst[10] =  mad(a33, b00, nmad(a31, b02, a30*b04));
    dst[11] = nmad(a23, b00, nmad(a20, b04, a21*b02));
    dst[12] = nmad(a12, b06, nmad(a10, b09, a11*b07));
    dst[13] =  mad(a02, b06, nmad(a01, b07, a00*b09));
    dst[14] = nmad(a32, b00, nmad(a30, b03, a31*b01));
    dst[15] =  mad(a22, b00, nmad(a21, b01, a20*b03));
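    // This is the usual cofactor-expansion inverse: b00..b11 are 2x2 sub-determinants,
    // det pairs complementary cofactors, and each output entry is a 3x3 cofactor built
    // from the invdet-scaled b-terms.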
template <typename T, void (*ApplyFn)(T*, T*)>
SI void apply_adjacent_binary(T* dst, T* src) {
    T* end = src;
    do {
        ApplyFn(dst, src);
        dst += 1;
        src += 1;
    } while (dst != end);
}
template <typename T, void (*ApplyFn)(T*, T*)>
SI void apply_adjacent_binary_packed(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(packed);
    std::byte* dst = base + ctx.dst;
    std::byte* src = base + ctx.src;
    apply_adjacent_binary<T, ApplyFn>((T*)dst, (T*)src);
}
template <int N, typename V, typename S, void (*ApplyFn)(V*, V*)>
SI void apply_binary_immediate(SkRasterPipeline_ConstantCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(packed);
    V* dst = (V*)(base + ctx.dst);
    S scalar = sk_bit_cast<S>(ctx.value);
    V src = scalar - V();  // broadcast the scalar across every lane
    SK_UNROLL for (int index = 0; index < N; ++index) {
        ApplyFn(dst, &src);
        dst += 1;
    }
}
template <typename T>
SI void add_fn(T* dst, T* src) { *dst += *src; }

template <typename T>
SI void sub_fn(T* dst, T* src) { *dst -= *src; }

template <typename T>
SI void mul_fn(T* dst, T* src) { *dst *= *src; }

template <typename T>
SI void div_fn(T* dst, T* src) {
    T divisor = *src;
    if constexpr (!std::is_same_v<T, F>) {
        // Integer division by zero is undefined behavior; convert 0 to ~0 to avoid it.
        divisor |= (T)cond_to_mask(divisor == 0);
    }
    *dst /= divisor;
}

template <typename T>
SI void max_fn(T* dst, T* src) {
    *dst = max(*dst, *src);
}

template <typename T>
SI void min_fn(T* dst, T* src) {
    *dst = min(*dst, *src);
}

template <typename T>
SI void cmplt_fn(T* dst, T* src) {
    static_assert(sizeof(T) == sizeof(I32));
    I32 result = cond_to_mask(*dst < *src);
    memcpy(dst, &result, sizeof(I32));
}

template <typename T>
SI void cmple_fn(T* dst, T* src) {
    static_assert(sizeof(T) == sizeof(I32));
    I32 result = cond_to_mask(*dst <= *src);
    memcpy(dst, &result, sizeof(I32));
}

template <typename T>
SI void cmpeq_fn(T* dst, T* src) {
    static_assert(sizeof(T) == sizeof(I32));
    I32 result = cond_to_mask(*dst == *src);
    memcpy(dst, &result, sizeof(I32));
}

template <typename T>
SI void cmpne_fn(T* dst, T* src) {
    static_assert(sizeof(T) == sizeof(I32));
    I32 result = cond_to_mask(*dst != *src);
    memcpy(dst, &result, sizeof(I32));
}

SI void atan2_fn(F* dst, F* src) {
    *dst = atan2_(*dst, *src);
}

SI void mod_fn(F* dst, F* src) {
    *dst = *dst - *src * floor_(*dst / *src);
}
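// Matches GLSL mod(): mod(x, y) = x - y * floor(x / y), so the result takes y's sign.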
#define DECLARE_N_WAY_BINARY_FLOAT(name)                                \
    STAGE_TAIL(name##_n_floats, SkRasterPipeline_BinaryOpCtx* packed) { \
        apply_adjacent_binary_packed<F, &name##_fn>(packed, base);      \
    }

#define DECLARE_BINARY_FLOAT(name)                                                              \
    STAGE_TAIL(name##_float, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 4); } \
    DECLARE_N_WAY_BINARY_FLOAT(name)

#define DECLARE_N_WAY_BINARY_INT(name)                                \
    STAGE_TAIL(name##_n_ints, SkRasterPipeline_BinaryOpCtx* packed) { \
        apply_adjacent_binary_packed<I32, &name##_fn>(packed, base);  \
    }

#define DECLARE_BINARY_INT(name)                                                                  \
    STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 4); } \
    DECLARE_N_WAY_BINARY_INT(name)

#define DECLARE_N_WAY_BINARY_UINT(name)                                \
    STAGE_TAIL(name##_n_uints, SkRasterPipeline_BinaryOpCtx* packed) { \
        apply_adjacent_binary_packed<U32, &name##_fn>(packed, base);   \
    }

#define DECLARE_BINARY_UINT(name)                                                                  \
    STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 1); }    \
    STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 2); } \
    STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 3); } \
    STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 4); } \
    DECLARE_N_WAY_BINARY_UINT(name)
#define DECLARE_IMM_BINARY_FLOAT(name)                                   \
    STAGE_TAIL(name##_imm_float, SkRasterPipeline_ConstantCtx* packed) { \
        apply_binary_immediate<1, F, float, &name##_fn>(packed, base);   \
    }

#define DECLARE_IMM_BINARY_INT(name)                                       \
    STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) {     \
        apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
    }

#define DECLARE_MULTI_IMM_BINARY_INT(name)                                 \
    STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) {     \
        apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
    }                                                                      \
    STAGE_TAIL(name##_imm_2_ints, SkRasterPipeline_ConstantCtx* packed) {  \
        apply_binary_immediate<2, I32, int32_t, &name##_fn>(packed, base); \
    }                                                                      \
    STAGE_TAIL(name##_imm_3_ints, SkRasterPipeline_ConstantCtx* packed) {  \
        apply_binary_immediate<3, I32, int32_t, &name##_fn>(packed, base); \
    }                                                                      \
    STAGE_TAIL(name##_imm_4_ints, SkRasterPipeline_ConstantCtx* packed) {  \
        apply_binary_immediate<4, I32, int32_t, &name##_fn>(packed, base); \
    }

#define DECLARE_IMM_BINARY_UINT(name)                                       \
    STAGE_TAIL(name##_imm_uint, SkRasterPipeline_ConstantCtx* packed) {     \
        apply_binary_immediate<1, U32, uint32_t, &name##_fn>(packed, base); \
    }

#undef DECLARE_MULTI_IMM_BINARY_INT
#undef DECLARE_IMM_BINARY_FLOAT
#undef DECLARE_IMM_BINARY_INT
#undef DECLARE_IMM_BINARY_UINT
#undef DECLARE_BINARY_FLOAT
#undef DECLARE_BINARY_INT
#undef DECLARE_BINARY_UINT
#undef DECLARE_N_WAY_BINARY_FLOAT
#undef DECLARE_N_WAY_BINARY_INT
#undef DECLARE_N_WAY_BINARY_UINT
    dst[0] = mad(dst[0],  dst[2],
                 dst[1] * dst[3]);

    dst[0] = mad(dst[0],  dst[3],
             mad(dst[1],  dst[4],
                 dst[2] * dst[5]));

    dst[0] = mad(dst[0],  dst[4],
             mad(dst[1],  dst[5],
             mad(dst[2],  dst[6],
                 dst[3] * dst[7])));
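    // Slot layout for dot_N: one operand occupies dst[0..N), the other dst[N..2N), and
    // the scalar result lands in dst[0].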
    int outColumns = ctx.rightColumns,
        outRows    = ctx.leftRows;

    SkASSERT(ctx.leftColumns == ctx.rightRows);

#if !defined(JUMPER_IS_SCALAR)
    __builtin_assume(outColumns >= 1);
    __builtin_assume(outRows    >= 1);
#endif

    F* resultMtx = (F*)(base + ctx.dst);
    F* leftMtx   = &resultMtx[ctx.rightColumns * ctx.leftRows];
    F* rightMtx  = &leftMtx[N * ctx.leftRows];

    for (int c = 0; c < outColumns; ++c) {
        for (int r = 0; r < outRows; ++r) {
            F* leftRow     = &leftMtx [r];
            F* rightColumn = &rightMtx[c * N];

            F element = *leftRow * *rightColumn;
            for (int idx = 1; idx < N; ++idx) {
                leftRow += outRows;
                rightColumn += 1;
                element = mad(*leftRow, *rightColumn, element);
            }
            *resultMtx++ = element;
        }
    }

    matrix_multiply<2>(packed, base);

    matrix_multiply<3>(packed, base);

    matrix_multiply<4>(packed, base);
    F* incident = dst + 0;
    F* normal   = dst + 4;
    F eta = dst[8];

    F dotNI = mad(normal[0],  incident[0],
              mad(normal[1],  incident[1],
              mad(normal[2],  incident[2],
                  normal[3] * incident[3])));

    F k = 1.0 - eta * eta * (1.0 - dotNI * dotNI);
    F sqrt_k = sqrt_(max(k, 0));

    for (int idx = 0; idx < 4; ++idx) {
        dst[idx] = if_then_else(k >= 0,
                                eta * incident[idx] - (eta * dotNI + sqrt_k) * normal[idx],
                                F0);
    }
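    // This mirrors GLSL refract(): k = 1 - eta^2 * (1 - dot(N,I)^2), and the result is
    // eta*I - (eta*dot(N,I) + sqrt(k)) * N, or zero when k < 0 (total internal reflection).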
template <typename T, void (*ApplyFn)(T*, T*, T*)>
SI void apply_adjacent_ternary(T* dst, T* src0, T* src1) {
    int count = src0 - dst;
#if !defined(JUMPER_IS_SCALAR)
    __builtin_assume(count >= 1);
#endif
    for (int index = 0; index < count; ++index) {
        ApplyFn(dst, src0, src1);
        dst += 1;
        src0 += 1;
        src1 += 1;
    }
}
template <typename T, void (*ApplyFn)(T*, T*, T*)>
SI void apply_adjacent_ternary_packed(SkRasterPipeline_TernaryOpCtx* packed, std::byte* base) {
    auto ctx = SkRPCtxUtils::Unpack(packed);
    std::byte* dst  = base + ctx.dst;
    std::byte* src0 = dst + ctx.delta;
    std::byte* src1 = src0 + ctx.delta;
    apply_adjacent_ternary<T, ApplyFn>((T*)dst, (T*)src0, (T*)src1);
}
SI void smoothstep_fn(F* edge0, F* edge1, F* x) {
    F t = clamp_01_((*x - *edge0) / (*edge1 - *edge0));
    *edge0 = t * t * (3.0 - 2.0 * t);
}
#define DECLARE_N_WAY_TERNARY_FLOAT(name)                                \
    STAGE_TAIL(name##_n_floats, SkRasterPipeline_TernaryOpCtx* packed) { \
        apply_adjacent_ternary_packed<F, &name##_fn>(packed, base);      \
    }

#define DECLARE_TERNARY_FLOAT(name)                                                           \
    STAGE_TAIL(name##_float, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+1, p+2); }    \
    STAGE_TAIL(name##_2_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+2, p+4); } \
    STAGE_TAIL(name##_3_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+3, p+6); } \
    STAGE_TAIL(name##_4_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+4, p+8); } \
    DECLARE_N_WAY_TERNARY_FLOAT(name)

#define DECLARE_TERNARY_INT(name)                                                               \
    STAGE_TAIL(name##_int, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+1, p+2); }    \
    STAGE_TAIL(name##_2_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+2, p+4); } \
    STAGE_TAIL(name##_3_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+3, p+6); } \
    STAGE_TAIL(name##_4_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+4, p+8); } \
    STAGE_TAIL(name##_n_ints, SkRasterPipeline_TernaryOpCtx* packed) {                          \
        apply_adjacent_ternary_packed<I32, &name##_fn>(packed, base);                           \
    }

#undef DECLARE_N_WAY_TERNARY_FLOAT
#undef DECLARE_TERNARY_FLOAT
#undef DECLARE_TERNARY_INT
    const float c4 = -2.26661229133605957031f;
    const float c3 =  2.89795351028442382812f;
    const float c2 =  0.21345567703247070312f;
    const float c1 =  0.15489584207534790039f;
    const float c0 =  0.00030726194381713867f;
    F fx = fract(cx + 0.5f),
      fy = fract(cy + 0.5f);

    for (float py = -0.5f; py <= +0.5f; py += 1.0f)
    for (float px = -0.5f; px <= +0.5f; px += 1.0f) {

        const uint32_t* ptr;

        F sx = (px > 0) ? fx : 1.0f - fx,
          sy = (py > 0) ? fy : 1.0f - fy,

    F fx = fract(cx + 0.5f),
      fy = fract(cy + 0.5f);

    F sample_y = cy - 1.5f;
    for (int yy = 0; yy <= 3; ++yy) {
        F sample_x = cx - 1.5f;
        for (int xx = 0; xx <= 3; ++xx) {
            F scale = scalex[xx] * scaley[yy];

            const uint32_t* ptr;
    auto ir = r, ig = g, ib = b, ia = a;
    F* o[] = {&r, &g, &b, &a};
    char swiz[4];
    memcpy(swiz, &ctx, sizeof(swiz));

    for (int i = 0; i < 4; ++i) {
        switch (swiz[i]) {
            case 'r': *o[i] = ir; break;
            case 'g': *o[i] = ig; break;
            case 'b': *o[i] = ib; break;
            case 'a': *o[i] = ia; break;
            case '0': *o[i] = F0; break;
            case '1': *o[i] = F1; break;
            default:              break;
        }
    }
#if defined(JUMPER_IS_SCALAR) || defined(SK_ENABLE_OPTIMIZE_SIZE) || \
    defined(SK_BUILD_FOR_GOOGLE3) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
    // The lowp pipeline is unavailable in this configuration; stub out every lowp op.
    #define M(st) static void (*st)(void) = nullptr;
        SK_RASTER_PIPELINE_OPS_LOWP(M)
    #undef M
    static void (*just_return)(void) = nullptr;

    static void start_pipeline(size_t, size_t, size_t, size_t, SkRasterPipelineStage*,
                               SkSpan<SkRasterPipeline_MemoryCtxPatch>,
                               uint8_t* tailPointer) {}
#if defined(JUMPER_IS_SKX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_LASX)
    template <typename T> using V = Vec<16, T>;
#else
    template <typename T> using V = Vec<8, T>;
#endif

static constexpr size_t N = sizeof(U16) / sizeof(uint16_t);
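// In lowp a pass covers 16 pixels on AVX-512/AVX2/LASX and 8 pixels elsewhere; N is
// simply the number of 16-bit lanes in U16, i.e. the pixels-per-pass stride.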
#if defined(__clang__)
SI constexpr U16 U16_(uint16_t x) { return x; }
SI constexpr I32 I32_( int32_t x) { return x; }
SI constexpr U32 U32_(uint32_t x) { return x; }
SI constexpr F   F_  (float    x) { return x; }
#else
SI constexpr U16 U16_(uint16_t x) { return x + U16(); }
SI constexpr I32 I32_( int32_t x) { return x + I32(); }
SI constexpr U32 U32_(uint32_t x) { return x + U32(); }
SI constexpr F   F_  (float    x) { return x - F  (); }
#endif

static constexpr U16 U16_0   = U16_(0),
                     U16_255 = U16_(255);
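// GCC has no implicit scalar-to-vector conversion, so `x + U16()` (or `x - F()`) splats
// the scalar across every lane while remaining constexpr-friendly.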
#if JUMPER_NARROW_STAGES
    struct Params {
        size_t dx, dy;
        U16 dr,dg,db,da;
    };
    using Stage = void (ABI*)(Params*, SkRasterPipelineStage* program, U16 r, U16 g, U16 b, U16 a);
#else
    using Stage = void (ABI*)(SkRasterPipelineStage* program,
                              size_t dx, size_t dy,
                              U16  r, U16  g, U16  b, U16  a,
                              U16 dr, U16 dg, U16 db, U16 da);
#endif

static void start_pipeline(size_t x0, size_t y0,
                           size_t xlimit, size_t ylimit,
                           SkRasterPipelineStage* program,
                           SkSpan<SkRasterPipeline_MemoryCtxPatch> memoryCtxPatches,
                           uint8_t* tailPointer) {
    uint8_t unreferencedTail;
    if (!tailPointer) {
        tailPointer = &unreferencedTail;
    }
    for (size_t dy = y0; dy < ylimit; dy++) {
    #if JUMPER_NARROW_STAGES
        Params params = { x0,dy, U16_0,U16_0,U16_0,U16_0 };
        if (size_t tail = xlimit - params.dx) {
            *tailPointer = tail;
        }
        *tailPointer = 0xFF;
    #else
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(program, dx,dy, U16_0,U16_0,U16_0,U16_0, U16_0,U16_0,U16_0,U16_0);
        }
        if (size_t tail = xlimit - dx) {
            *tailPointer = tail;
            start(program, dx,dy, U16_0,U16_0,U16_0,U16_0, U16_0,U16_0,U16_0,U16_0);
        }
        *tailPointer = 0xFF;
    #endif
    }
}
#if JUMPER_NARROW_STAGES
    #define STAGE_GG(name, ARG)                                                  \
        SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y);                 \
        static void ABI name(Params* params, SkRasterPipelineStage* program,     \
                             U16 r, U16 g, U16 b, U16 a) {                       \
            auto x = join<F>(r,g),                                               \
                 y = join<F>(b,a);                                               \
            name##_k(Ctx{program}, params->dx,params->dy, x,y);                  \
            split(x, &r,&g);                                                     \
            split(y, &b,&a);                                                     \
            auto fn = (Stage)(++program)->fn;                                    \
            fn(params, program, r,g,b,a);                                        \
        }                                                                        \
        SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y)

    #define STAGE_GP(name, ARG)                                                  \
        SI void name##_k(ARG, size_t dx, size_t dy, F x, F y,                    \
                         U16&  r, U16&  g, U16&  b, U16&  a,                     \
                         U16& dr, U16& dg, U16& db, U16& da);                    \
        static void ABI name(Params* params, SkRasterPipelineStage* program,     \
                             U16 r, U16 g, U16 b, U16 a) {                       \
            auto x = join<F>(r,g),                                               \
                 y = join<F>(b,a);                                               \
            name##_k(Ctx{program}, params->dx,params->dy, x,y, r,g,b,a,          \
                     params->dr,params->dg,params->db,params->da);               \
            auto fn = (Stage)(++program)->fn;                                    \
            fn(params, program, r,g,b,a);                                        \
        }                                                                        \
        SI void name##_k(ARG, size_t dx, size_t dy, F x, F y,                    \
                         U16&  r, U16&  g, U16&  b, U16&  a,                     \
                         U16& dr, U16& dg, U16& db, U16& da)

    #define STAGE_PP(name, ARG)                                                  \
        SI void name##_k(ARG, size_t dx, size_t dy,                              \
                         U16&  r, U16&  g, U16&  b, U16&  a,                     \
                         U16& dr, U16& dg, U16& db, U16& da);                    \
        static void ABI name(Params* params, SkRasterPipelineStage* program,     \
                             U16 r, U16 g, U16 b, U16 a) {                       \
            name##_k(Ctx{program}, params->dx,params->dy, r,g,b,a,               \
                     params->dr,params->dg,params->db,params->da);               \
            auto fn = (Stage)(++program)->fn;                                    \
            fn(params, program, r,g,b,a);                                        \
        }                                                                        \
        SI void name##_k(ARG, size_t dx, size_t dy,                              \
                         U16&  r, U16&  g, U16&  b, U16&  a,                     \
                         U16& dr, U16& dg, U16& db, U16& da)
#else
    #define STAGE_GG(name, ARG)                                                  \
        SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y);                 \
        static void ABI name(SkRasterPipelineStage* program,                     \
                             size_t dx, size_t dy,                               \
                             U16  r, U16  g, U16  b, U16  a,                     \
                             U16 dr, U16 dg, U16 db, U16 da) {                   \
            auto x = join<F>(r,g),                                               \
                 y = join<F>(b,a);                                               \
            name##_k(Ctx{program}, dx,dy, x,y);                                  \
            split(x, &r,&g);                                                     \
            split(y, &b,&a);                                                     \
            auto fn = (Stage)(++program)->fn;                                    \
            fn(program, dx,dy, r,g,b,a, dr,dg,db,da);                            \
        }                                                                        \
        SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y)

    #define STAGE_GP(name, ARG)                                                  \
        SI void name##_k(ARG, size_t dx, size_t dy, F x, F y,                    \
                         U16&  r, U16&  g, U16&  b, U16&  a,                     \
                         U16& dr, U16& dg, U16& db, U16& da);                    \
        static void ABI name(SkRasterPipelineStage* program,                     \
                             size_t dx, size_t dy,                               \
                             U16  r, U16  g, U16  b, U16  a,                     \
                             U16 dr, U16 dg, U16 db, U16 da) {                   \
            auto x = join<F>(r,g),                                               \
                 y = join<F>(b,a);                                               \
            name##_k(Ctx{program}, dx,dy, x,y, r,g,b,a, dr,dg,db,da);            \
            auto fn = (Stage)(++program)->fn;                                    \
            fn(program, dx,dy, r,g,b,a, dr,dg,db,da);                            \
        }                                                                        \
        SI void name##_k(ARG, size_t dx, size_t dy, F x, F y,                    \
                         U16&  r, U16&  g, U16&  b, U16&  a,                     \
                         U16& dr, U16& dg, U16& db, U16& da)

    #define STAGE_PP(name, ARG)                                                  \
        SI void name##_k(ARG, size_t dx, size_t dy,                              \
                         U16&  r, U16&  g, U16&  b, U16&  a,                     \
                         U16& dr, U16& dg, U16& db, U16& da);                    \
        static void ABI name(SkRasterPipelineStage* program,                     \
                             size_t dx, size_t dy,                               \
                             U16  r, U16  g, U16  b, U16  a,                     \
                             U16 dr, U16 dg, U16 db, U16 da) {                   \
            name##_k(Ctx{program}, dx,dy, r,g,b,a, dr,dg,db,da);                 \
            auto fn = (Stage)(++program)->fn;                                    \
            fn(program, dx,dy, r,g,b,a, dr,dg,db,da);                            \
        }                                                                        \
        SI void name##_k(ARG, size_t dx, size_t dy,                              \
                         U16&  r, U16&  g, U16&  b, U16&  a,                     \
                         U16& dr, U16& dg, U16& db, U16& da)
#endif
#if defined(JUMPER_IS_NEON)
    return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);

#if defined(JUMPER_IS_NEON)

    return (v+(v/256))/256;
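// (v + v/256)/256 approximates v/255 via the series 1/255 = 1/256 + 1/256^2 + ...,
// keeping the whole computation in cheap power-of-two divisions.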
    return (t & sk_bit_cast<U16>(c)) | (e & sk_bit_cast<U16>(~c));

    return (t & sk_bit_cast<U32>(c)) | (e & sk_bit_cast<U32>(~c));
SI U16 from_float(float f) { return U16_(f * 255.0f + 0.5f); }

template <typename D, typename S>
SI D cast(S src) {
    return __builtin_convertvector(src, D);
}
template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
    static_assert(2*sizeof(D) == sizeof(S), "");
    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}

template <typename D, typename S>
SI D join(S lo, S hi) {
    static_assert(sizeof(D) == 2*sizeof(S), "");
    D v;
    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
    return v;
}
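// lowp frequently treats one logical vector as two half-width hardware vectors;
// split()/join() convert between the two views with memcpy, which compilers reliably
// fold into plain register moves.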
    return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) );

    return (t & c) | (e & ~c);
#if defined(JUMPER_IS_SKX)
    F e = _mm512_rcp14_ps(x);
    return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
#elif defined(JUMPER_IS_HSW)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
#elif defined(JUMPER_IS_NEON)
    float32x4_t lo,hi;
    split(x, &lo,&hi);
    return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
#elif defined(JUMPER_IS_LASX)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(__lasx_xvfrecip_s(lo), __lasx_xvfrecip_s(hi));
#elif defined(JUMPER_IS_LSX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(__lsx_vfrecip_s(lo), __lsx_vfrecip_s(hi));
#endif
SI F sqrt_(F x) {
#if defined(JUMPER_IS_SKX)
    return _mm512_sqrt_ps(x);
#elif defined(JUMPER_IS_HSW)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
#elif defined(SK_CPU_ARM64)
    float32x4_t lo,hi;
    split(x, &lo,&hi);
    return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
#elif defined(JUMPER_IS_NEON)
    auto sqrt = [](float32x4_t v) {
        auto est = vrsqrteq_f32(v);  // Estimate and two Newton-Raphson refinement steps.
        est *= vrsqrtsq_f32(v,est*est);
        est *= vrsqrtsq_f32(v,est*est);
        return v*est;                // sqrt(v) == v*rsqrt(v).
    };
    float32x4_t lo,hi;
    split(x, &lo,&hi);
    return join<F>(sqrt(lo), sqrt(hi));
#elif defined(JUMPER_IS_LASX)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(__lasx_xvfsqrt_s(lo), __lasx_xvfsqrt_s(hi));
#elif defined(JUMPER_IS_LSX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(__lsx_vfsqrt_s(lo), __lsx_vfsqrt_s(hi));
#else
    return F{
        sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
        sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
    };
#endif
}
SI F floor_(F x) {
#if defined(SK_CPU_ARM64)
    float32x4_t lo,hi;
    split(x, &lo,&hi);
    return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
#elif defined(JUMPER_IS_SKX)
    return _mm512_floor_ps(x);
#elif defined(JUMPER_IS_HSW)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
#elif defined(JUMPER_IS_LASX)
    __m256 lo,hi;
    split(x, &lo,&hi);
    return join<F>(__lasx_xvfrintrm_s(lo), __lasx_xvfrintrm_s(hi));
#elif defined(JUMPER_IS_LSX)
    __m128 lo,hi;
    split(x, &lo,&hi);
    return join<F>(__lsx_vfrintrm_s(lo), __lsx_vfrintrm_s(hi));
#else
    F roundtrip = cast<F>(cast<I32>(x));
    return roundtrip - if_then_else(roundtrip > x, F_(1), F_(0));
#endif
}
SI I16 scaled_mult(I16 a, I16 b) {
#if defined(JUMPER_IS_SKX)
    return (I16)_mm256_mulhrs_epi16((__m256i)a, (__m256i)b);
#elif defined(JUMPER_IS_HSW)
    return (I16)_mm256_mulhrs_epi16((__m256i)a, (__m256i)b);
#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
    return (I16)_mm_mulhrs_epi16((__m128i)a, (__m128i)b);
#elif defined(SK_CPU_ARM64)
    return vqrdmulhq_s16(a, b);
#elif defined(JUMPER_IS_NEON)
    return vqrdmulhq_s16(a, b);
#elif defined(JUMPER_IS_LASX)
    I16 res = __lasx_xvmuh_h(a, b);
    return __lasx_xvslli_h(res, 1);
#elif defined(JUMPER_IS_LSX)
    I16 res = __lsx_vmuh_h(a, b);
    return __lsx_vslli_h(res, 1);
#else
    const I32 roundingTerm = I32_(1 << 14);
    return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15);
#endif
}
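// scaled_mult is a Q15 fixed-point multiply, i.e. round(a*b / 32768): the fallback's
// (a*b + 2^14) >> 15 matches the semantics of _mm_mulhrs_epi16 and vqrdmulhq_s16.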
SI U16 constrained_add(I16 a, U16 b) {
    #if defined(SK_DEBUG)
    for (size_t i = 0; i < N; i++) {
        int ia = a[i], ib = b[i];
        SkASSERT(-ib <= ia && ia <= 65535 - ib);
    }
    #endif
    return b + sk_bit_cast<U16>(a);
}
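// The debug assert documents the contract: `a` may be negative, but a+b must already
// lie in [0, 65535], so the bit-cast add relies only on well-defined unsigned wraparound.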
SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); }
STAGE_GG(seed_shader, NoCtx) {
    static constexpr float iota[] = {
        0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
        8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
    };
    x = cast<F>(I32_(dx)) + sk_unaligned_load<F>(iota);
    y = cast<F>(I32_(dy)) + 0.5f;
}
STAGE_GG(matrix_translate, const float* m) {
    x += m[0];
    y += m[1];
}
STAGE_GG(matrix_scale_translate, const float* m) {
    x = mad(x,m[0], m[2]);
    y = mad(y,m[1], m[3]);
}
STAGE_GG(matrix_2x3, const float* m) {
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5]));
    x = X;
    y = Y;
}
STAGE_GG(matrix_perspective, const float* m) {
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5])),
         Z = rcp_precise(mad(x,m[6], mad(y,m[7], m[8])));
    x = X * Z;
    y = Y * Z;
}
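// matrix_2x3 applies an affine map, x' = m0*x + m1*y + m2 and y' = m3*x + m4*y + m5;
// matrix_perspective additionally divides through by w = m6*x + m7*y + m8.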
    r = U16_(c->rgba[0]);
    g = U16_(c->rgba[1]);
    b = U16_(c->rgba[2]);
    a = U16_(c->rgba[3]);
STAGE_PP(black_color, NoCtx) { r = g = b = U16_0;   a = U16_255; }
STAGE_PP(white_color, NoCtx) { r = g = b = U16_255; a = U16_255; }
STAGE_PP(set_rgb, const float rgb[3]) {
    r = from_float(rgb[0]);
    g = from_float(rgb[1]);
    b = from_float(rgb[2]);
}
STAGE_PP(clamp_01, NoCtx) {

STAGE_PP(clamp_gamut, NoCtx) {

STAGE_PP(premul, NoCtx) {
    r = div255_accurate(r * a);
    g = div255_accurate(g * a);
    b = div255_accurate(b * a);
}
STAGE_PP(premul_dst, NoCtx) {
    dr = div255_accurate(dr * da);
    dg = div255_accurate(dg * da);
    db = div255_accurate(db * da);
}

STAGE_PP(force_opaque    , NoCtx) {  a = U16_255; }
STAGE_PP(force_opaque_dst, NoCtx) { da = U16_255; }
STAGE_PP(swap_rb, NoCtx) {

STAGE_PP(swap_rb_dst, NoCtx) {

STAGE_PP(move_src_dst, NoCtx) {

STAGE_PP(move_dst_src, NoCtx) {

STAGE_PP(swap_src_dst, NoCtx) {
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, NoCtx) {                              \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = name##_channel(a,da,a,da);                   \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
#if defined(SK_USE_INACCURATE_DIV255_IN_BLEND)

    BLEND_MODE(dstin)    { return div255_accurate( d*sa ); }

    BLEND_MODE(modulate) { return div255_accurate( s*d ); }
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, NoCtx) {                              \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = a + div255( da*inv(a) );                     \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
template <typename T>

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
    // Exclusive -> inclusive.
    const F w = F_(sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1)),
            h = F_(sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1));

    const F z = F_(std::numeric_limits<float>::min());
template <typename T>

template <typename V, typename T>
SI V load(const T* ptr) {
    V v;
    memcpy(&v, ptr, sizeof(v));
    return v;
}

template <typename V, typename T>
SI void store(T* ptr, V v) {
    memcpy(ptr, &v, sizeof(v));
}
#if defined(JUMPER_IS_SKX)
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
                  ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
                  ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
    }

    template <>
    F gather(const float* ptr, U32 ix) {
        return _mm512_i32gather_ps((__m512i)ix, ptr, 4);
    }

    template <>
    U32 gather(const uint32_t* ptr, U32 ix) {
        return (U32)_mm512_i32gather_epi32((__m512i)ix, ptr, 4);
    }
#elif defined(JUMPER_IS_HSW)
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
                  ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
                  ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
    }

    template <>
    F gather(const float* ptr, U32 ix) {
        __m256i lo, hi;
        split(ix, &lo, &hi);

        return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
                       _mm256_i32gather_ps(ptr, hi, 4));
    }

    template <>
    U32 gather(const uint32_t* ptr, U32 ix) {
        __m256i lo, hi;
        split(ix, &lo, &hi);

        return join<U32>(_mm256_i32gather_epi32((const int*)ptr, lo, 4),
                         _mm256_i32gather_epi32((const int*)ptr, hi, 4));
    }
#elif defined(JUMPER_IS_LASX)
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
                  ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
                  ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
    }
#else
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
    }
#endif
SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if defined(JUMPER_IS_SKX)
    rgba = (U32)_mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), (__m512i)rgba);

    auto cast_U16 = [](U32 v) -> U16 {
        return (U16)_mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
                                        _mm512_extracti64x4_epi64((__m512i)v, 1));
    };
#elif defined(JUMPER_IS_HSW)
    // Swap the middle 128-bit lanes so packus keeps the values in order.
    __m256i _01, _23;
    split(rgba, &_01, &_23);
    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02, _13;
        split(v, &_02,&_13);
        return (U16)_mm256_packus_epi32(_02,_13);
    };
#elif defined(JUMPER_IS_LASX)
    __m256i _01, _23;
    split(rgba, &_01, &_23);
    __m256i _02 = __lasx_xvpermi_q(_01, _23, 0x02),
            _13 = __lasx_xvpermi_q(_01, _23, 0x13);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02, _13;
        split(v, &_02,&_13);
        __m256i tmp0 = __lasx_xvsat_wu(_02, 15);
        __m256i tmp1 = __lasx_xvsat_wu(_13, 15);
        return __lasx_xvpickev_h(tmp1, tmp0);
    };
#else
    auto cast_U16 = [](U32 v) -> U16 {
        return cast<U16>(v);
    };
#endif
    *r = cast_U16(rgba & 65535) & 255;
    *g = cast_U16(rgba & 65535) >> 8;
    *b = cast_U16(rgba >> 16) & 255;
    *a = cast_U16(rgba >> 16) >> 8;
}
SI void load_8888_(const uint32_t* ptr, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(JUMPER_IS_NEON)
    uint8x8x4_t rgba = vld4_u8((const uint8_t*)(ptr));
    *r = cast<U16>(rgba.val[0]);
    *g = cast<U16>(rgba.val[1]);
    *b = cast<U16>(rgba.val[2]);
    *a = cast<U16>(rgba.val[3]);
#else
    from_8888(load<U32>(ptr), r,g,b,a);
#endif
}

SI void store_8888_(uint32_t* ptr, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(JUMPER_IS_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    vst4_u8((uint8_t*)(ptr), rgba);
#else
    store(ptr, cast<U32>(r | (g<<8)) <<  0
             | cast<U32>(b | (a<<8)) << 16);
#endif
}
    load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), &r,&g,&b,&a);

    load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), &dr,&dg,&db,&da);

    store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), r,g,b,a);

    const uint32_t* ptr;
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // Scale up from 5,6,5 bits to 8 bits.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}

SI void load_565_(const uint16_t* ptr, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr), r,g,b);
}

SI void store_565_(uint16_t* ptr, U16 r, U16 g, U16 b) {
    // Scale down from 8 bits to 5,6,5 bits.
    U16 R = (r *  9 + 36) / 74,
        G = (g * 21 + 42) / 85,
        B = (b *  9 + 36) / 74;
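    // (R << 3) | (R >> 2) replicates the top bits while widening, so 31 -> 255 exactly.
    // Packing down, (r*9 + 36)/74 approximates r*31/255 (9/74 ~ 31/255), while the green
    // ratio 21/85 equals 63/255 exactly.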
    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &r,&g,&b);

    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &dr,&dg,&db);

    store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), r,g,b);

    const uint16_t* ptr;
    from_565(gather<U16>(ptr, ix), &r, &g, &b);

    U16 R = (rgba >> 12) & 15,
        G = (rgba >>  8) & 15,
        B = (rgba >>  4) & 15,
        A = (rgba >>  0) & 15;
    U16 R = (r + 8) / 17,
        G = (g + 8) / 17,
        B = (b + 8) / 17,
        A = (a + 8) / 17;

    load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &r,&g,&b,&a);

    load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &dr,&dg,&db,&da);

    store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), r,g,b,a);

    const uint16_t* ptr;
SI void load_88_(const uint16_t* ptr, U16* r, U16* g) {
#if 1 && defined(JUMPER_IS_NEON)
    uint8x8x2_t rg = vld2_u8((const uint8_t*)(ptr));
    *r = cast<U16>(rg.val[0]);
    *g = cast<U16>(rg.val[1]);
#else
    from_88(load<U16>(ptr), r, g);
#endif
}

SI void store_88_(uint16_t* ptr, U16 r, U16 g) {
#if 1 && defined(JUMPER_IS_NEON)
    uint8x8x2_t rg = {{
        cast<U8>(r),
        cast<U8>(g),
    }};
    vst2_u8((uint8_t*)(ptr), rg);
#else
    store(ptr, cast<U16>(r | (g<<8)) << 0);
#endif
}

    load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), &r, &g);

    load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), &dr, &dg);

    store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), r, g);

    const uint16_t* ptr;
    from_88(gather<U16>(ptr, ix), &r, &g);
SI U16 load_8(const uint8_t* ptr) {
    return cast<U16>(load<U8>(ptr));
}
SI void store_8(uint8_t* ptr, U16 v) {
    store(ptr, cast<U8>(v));
}

    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));

    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));

    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), a);

    a = cast<U16>(gather<U8>(ptr, ix));

    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), r);
STAGE_PP(alpha_to_gray, NoCtx) {

STAGE_PP(alpha_to_gray_dst, NoCtx) {

STAGE_PP(alpha_to_red, NoCtx) {

STAGE_PP(alpha_to_red_dst, NoCtx) {

STAGE_PP(bt709_luminance_or_luma_to_alpha, NoCtx) {
    a = (r*54 + g*183 + b*19)/256;
}
STAGE_PP(bt709_luminance_or_luma_to_rgb, NoCtx) {
    r = g = b = (r*54 + g*183 + b*19)/256;
}
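// 54, 183, and 19 are /256-scaled roundings of the BT.709 luma weights (0.2126,
// 0.7152, 0.0722); they sum to 256, so full white maps to full intensity.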
STAGE_PP(load_src, const uint16_t* ptr) {
    r = sk_unaligned_load<U16>(ptr + 0*N);
    g = sk_unaligned_load<U16>(ptr + 1*N);
    b = sk_unaligned_load<U16>(ptr + 2*N);
    a = sk_unaligned_load<U16>(ptr + 3*N);
}
STAGE_PP(store_src, uint16_t* ptr) {

STAGE_PP(store_src_a, uint16_t* ptr) {

STAGE_PP(load_dst, const uint16_t* ptr) {
    dr = sk_unaligned_load<U16>(ptr + 0*N);
    dg = sk_unaligned_load<U16>(ptr + 1*N);
    db = sk_unaligned_load<U16>(ptr + 2*N);
    da = sk_unaligned_load<U16>(ptr + 3*N);
}
STAGE_PP(store_dst, uint16_t* ptr) {
STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);

STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);

STAGE_PP(scale_native, const uint16_t scales[]) {
    auto c = sk_unaligned_load<U16>(scales);

STAGE_PP(lerp_native, const uint16_t scales[]) {
    auto c = sk_unaligned_load<U16>(scales);
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));

    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));

    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &cr,&cg,&cb);

    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &cr,&cg,&cb);

    U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy)),
        add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy));
STAGE_GG(mirror_x_1, NoCtx) {
    auto two = [](F x){ return x+x; };

SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
    auto mask = sk_unaligned_load<U16>(ctx->mask);

    auto round_color = [](F x) { return cast<U16>(x * 255.0f + 0.5f); };

    *r = round_color(min(max(0, R), 1));
    *g = round_color(min(max(0, G), 1));
    *b = round_color(min(max(0, B), 1));
    *a = round_color(A);
    F fr, fg, fb, fa, br, bg, bb, ba;
#if defined(JUMPER_IS_HSW)
    __m256i lo, hi;
    split(idx, &lo, &hi);

    fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
    br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
    fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
    bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
    fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
    bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
    fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
    ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
#elif defined(JUMPER_IS_LASX)
    __m256i lo, hi;
    split(idx, &lo, &hi);

    fr = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), lo),
                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), hi));
    br = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), lo),
                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), hi));
    fg = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), lo),
                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), hi));
    bg = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), lo),
                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), hi));
    fb = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), lo),
                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), hi));
    bb = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), lo),
                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), hi));
    fa = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), lo),
                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), hi));
    ba = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), lo),
                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), hi));
#elif defined(JUMPER_IS_LSX)
    __m128i lo, hi;
    split(idx, &lo, &hi);
    __m128i zero = __lsx_vldi(0);
    fr = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[0], 0)),
                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[0], 0)));
    br = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[0], 0)),
                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[0], 0)));
    fg = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[1], 0)),
                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[1], 0)));
    bg = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[1], 0)),
                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[1], 0)));
    fb = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[2], 0)),
                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[2], 0)));
    bb = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[2], 0)),
                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[2], 0)));
    fa = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[3], 0)),
                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[3], 0)));
    ba = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[3], 0)),
                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[3], 0)));
#else
    fr = gather<F>(c->fs[0], idx);
    fg = gather<F>(c->fs[1], idx);
    fb = gather<F>(c->fs[2], idx);
    fa = gather<F>(c->fs[3], idx);
    br = gather<F>(c->bs[0], idx);
    bg = gather<F>(c->bs[1], idx);
    bb = gather<F>(c->bs[2], idx);
    ba = gather<F>(c->bs[3], idx);
#endif
    round_F_to_U16(mad(t, fr, br),
                   mad(t, fg, bg),
                   mad(t, fb, bb),
                   mad(t, fa, ba),
                   &r,&g,&b,&a);

    for (size_t i = 1; i < c->stopCount; i++) {
    round_F_to_U16(mad(t, c->f[0], c->b[0]),
                   mad(t, c->f[1], c->b[1]),
                   mad(t, c->f[2], c->b[2]),
                   mad(t, c->f[3], c->b[3]),
                   &r,&g,&b,&a);
    I32 qx = cast<I32>(floor_(65536.0f * x + 0.5f)) - 32768,
        qy = cast<I32>(floor_(65536.0f * y + 0.5f)) - 32768;

    I16 tx = cast<I16>(qx ^ 0x8000),
        ty = cast<I16>(qy ^ 0x8000);
    const uint32_t* ptr;

    U16 leftR, leftG, leftB, leftA;
    from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);

    U16 rightR, rightG, rightB, rightA;
    from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);

    U16 topR = lerpX(leftR, rightR),
        topG = lerpX(leftG, rightG),
        topB = lerpX(leftB, rightB),
        topA = lerpX(leftA, rightA);

    from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);

    from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);

    U16 bottomR = lerpX(leftR, rightR),
        bottomG = lerpX(leftG, rightG),
        bottomB = lerpX(leftB, rightB),
        bottomA = lerpX(leftA, rightA);

    auto lerpY = [&](U16 top, U16 bottom) -> U16 {
        U16 middle = bottom + top;

    r = lerpY(topR, bottomR);
    g = lerpY(topG, bottomG);
    b = lerpY(topB, bottomB);
    a = lerpY(topA, bottomA);
STAGE_GG(xy_to_unit_angle, NoCtx) {
    F xabs = abs_(x),
      yabs = abs_(y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    F phi = slope
             * (0.15912117063999176025390625f     + s
             * (-5.185396969318389892578125e-2f   + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));
STAGE_GG(xy_to_radius, NoCtx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888_(ptr, &dr,&dg,&db,&da);

    store_8888_(ptr, r,g,b,a);
STAGE_PP(swizzle, void* ctx) {
    auto ir = r, ig = g, ib = b, ia = a;
    U16* o[] = {&r, &g, &b, &a};
    char swiz[4];
    memcpy(swiz, &ctx, sizeof(swiz));

    for (int i = 0; i < 4; ++i) {
        switch (swiz[i]) {
            case 'r': *o[i] = ir;      break;
            case 'g': *o[i] = ig;      break;
            case 'b': *o[i] = ib;      break;
            case 'a': *o[i] = ia;      break;
            case '0': *o[i] = U16_0;   break;
            case '1': *o[i] = U16_255; break;
            default:                   break;
        }
    }
}
namespace lowp { static constexpr size_t lowp_N = N; }

constexpr size_t raster_pipeline_lowp_stride() { return lowp::lowp_N; }
constexpr size_t raster_pipeline_highp_stride() { return N; }