8#ifndef SkRasterPipeline_opts_DEFINED
9#define SkRasterPipeline_opts_DEFINED
26#if defined(__clang__) || defined(__GNUC__)
27 #define SI __attribute__((always_inline)) static inline
29 #define SI static inline
33 #define SK_UNROLL _Pragma("unroll")
39 template <
int N,
typename T>
using Vec =
T __attribute__((ext_vector_type(
N)));
40#elif defined(__GNUC__)
43 template <
int N,
typename T>
struct VecHelper {
49template <
typename Dst,
typename Src>
51 static_assert(
sizeof(Dst) >
sizeof(Src));
55 memcpy(&
dst, &
src,
sizeof(Src));
70#if defined(JUMPER_IS_SCALAR) || defined(JUMPER_IS_NEON) || defined(JUMPER_IS_HSW) || \
71 defined(JUMPER_IS_SKX) || defined(JUMPER_IS_AVX) || defined(JUMPER_IS_SSE41) || \
72 defined(JUMPER_IS_SSE2)
74#elif !defined(__clang__) && !defined(__GNUC__)
75 #define JUMPER_IS_SCALAR
76#elif defined(SK_ARM_HAS_NEON)
77 #define JUMPER_IS_NEON
78#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
80#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
82#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
84#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
85 #define JUMPER_IS_SSE41
86#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
87 #define JUMPER_IS_SSE2
88#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
89 #define JUMPER_IS_LASX
90#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
93 #define JUMPER_IS_SCALAR
97#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
99 #if defined(__apple_build_version__) && __clang_major__ < 9
100 #define JUMPER_IS_SCALAR
101 #elif __clang_major__ < 5
102 #define JUMPER_IS_SCALAR
105 #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
106 #undef JUMPER_IS_NEON
110#if defined(JUMPER_IS_SCALAR)
112#elif defined(JUMPER_IS_NEON)
113 #include <arm_neon.h>
114#elif defined(JUMPER_IS_LASX)
115 #include <lasxintrin.h>
116 #include <lsxintrin.h>
117#elif defined(JUMPER_IS_LSX)
118 #include <lsxintrin.h>
120 #include <immintrin.h>
132#if defined(JUMPER_IS_SCALAR)
171 template <
typename T>
212#elif defined(JUMPER_IS_NEON)
213 template <
typename T>
using V = Vec<4, T>;
229 SI F abs_ (
F v) {
return vabsq_f32(v); }
241 #if defined(SK_CPU_ARM64)
242 SI bool any(
I32 c) {
return vmaxvq_u32((
U32)c) != 0; }
243 SI bool all(
I32 c) {
return vminvq_u32((
U32)c) != 0; }
248 SI F ceil_(
F v) {
return vrndpq_f32(v); }
249 SI F sqrt_(
F v) {
return vsqrtq_f32(v); }
254 SI bool any(
I32 c) {
return c[0] | c[1] | c[2] | c[3]; }
255 SI bool all(
I32 c) {
return c[0] & c[1] & c[2] & c[3]; }
261 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
266 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
271 auto e = vrsqrteq_f32(v);
272 e *= vrsqrtsq_f32(v,
e*
e);
273 e *= vrsqrtsq_f32(v,
e*
e);
278 return vcvtq_s32_f32(v + 0.5f);
282 return vcvtq_u32_f32(v + 0.5f);
286 return vcvtq_u32_f32(
mad(v,
scale,
F() + 0.5f));
290 template <
typename T>
292 return V<T>{
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]]};
297 dst[ix[0]] = after[0];
298 dst[ix[1]] = after[1];
299 dst[ix[2]] = after[2];
300 dst[ix[3]] = after[3];
303 uint16x4x2_t rg = vld2_u16(ptr);
308 vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
311 uint16x4x4_t
rgba = vld4_u16(ptr);
319 vst4_u16(ptr, (uint16x4x4_t{{r,g,
b,
a}}));
322 float32x4x4_t
rgba = vld4q_f32(ptr);
329 vst4q_f32(ptr, (float32x4x4_t{{r,g,
b,
a}}));
332#elif defined(JUMPER_IS_SKX)
333 template <
typename T>
using V = Vec<16, T>;
349 SI F abs_ (
F v) {
return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
351 SI F floor_(
F v) {
return _mm512_floor_ps(v); }
352 SI F ceil_(
F v) {
return _mm512_ceil_ps(v); }
355 SI F sqrt_ (
F v) {
return _mm512_sqrt_ps (v); }
358 return _mm512_fnmadd_ps(v,
e, _mm512_set1_ps(2.0f)) *
e;
364 __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
365 _mm512_extracti64x4_epi64((__m512i)v, 1));
366 return (
U16)_mm256_permutex_epi64(rst, 216);
369 __m256i rst = _mm256_packus_epi16((__m256i)v, (__m256i)v);
370 return (
U8)_mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
373 __m512i mask = _mm512_set1_epi32(0x80000000);
374 __m512i aa = _mm512_and_si512((__m512i)c, mask);
375 return _mm512_mask_blend_ps(_mm512_test_epi32_mask(aa, aa),
e,t);
378 __m512i mask = _mm512_set1_epi32(0x80000000);
379 __m512i aa = _mm512_and_si512((__m512i)c, mask);
380 return (
I32)_mm512_mask_blend_epi32(_mm512_test_epi32_mask(aa, aa),(__m512i)
e,(__m512i)t);
383 __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
387 __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
388 return mask32 == 0xffff;
390 template <
typename T>
392 return V<T>{
p[ix[ 0]],
p[ix[ 1]],
p[ix[ 2]],
p[ix[ 3]],
393 p[ix[ 4]],
p[ix[ 5]],
p[ix[ 6]],
p[ix[ 7]],
394 p[ix[ 8]],
p[ix[ 9]],
p[ix[10]],
p[ix[11]],
395 p[ix[12]],
p[ix[13]],
p[ix[14]],
p[ix[15]] };
397 SI F gather(
const float*
p,
U32 ix) {
return _mm512_i32gather_ps((__m512i)ix,
p, 4); }
399 return (
U32)_mm512_i32gather_epi32((__m512i)ix,
p, 4); }
402 _mm512_i32gather_epi64(_mm512_castsi512_si256((__m512i)ix),
p, 8),
403 _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)ix, 1),
p, 8),
405 return sk_bit_cast<U64>(
parts);
407 template <
typename V,
typename S>
411 dst[ix[0]] = after[0];
412 dst[ix[1]] = after[1];
413 dst[ix[2]] = after[2];
414 dst[ix[3]] = after[3];
415 dst[ix[4]] = after[4];
416 dst[ix[5]] = after[5];
417 dst[ix[6]] = after[6];
418 dst[ix[7]] = after[7];
419 dst[ix[8]] = after[8];
420 dst[ix[9]] = after[9];
421 dst[ix[10]] = after[10];
422 dst[ix[11]] = after[11];
423 dst[ix[12]] = after[12];
424 dst[ix[13]] = after[13];
425 dst[ix[14]] = after[14];
426 dst[ix[15]] = after[15];
430 __m256i _01234567 = _mm256_loadu_si256(((
const __m256i*)ptr) + 0);
431 __m256i _89abcdef = _mm256_loadu_si256(((
const __m256i*)ptr) + 1);
433 *r = (
U16)_mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_mm256_slli_epi32
434 (_01234567, 16), 16), _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
435 *g = (
U16)_mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_01234567, 16),
436 _mm256_srai_epi32(_89abcdef, 16)), 216);
439 __m256i _01234567 = _mm256_unpacklo_epi16((__m256i)r, (__m256i)g);
440 __m256i _89abcdef = _mm256_unpackhi_epi16((__m256i)r, (__m256i)g);
441 __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
443 __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
444 _01234567 = _mm512_castsi512_si256(aa);
445 _89abcdef = _mm512_extracti64x4_epi64(aa, 1);
447 _mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
448 _mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
452 __m256i _0123 = _mm256_loadu_si256((
const __m256i*)ptr),
453 _4567 = _mm256_loadu_si256(((
const __m256i*)ptr) + 1),
454 _89ab = _mm256_loadu_si256(((
const __m256i*)ptr) + 2),
455 _cdef = _mm256_loadu_si256(((
const __m256i*)ptr) + 3);
457 auto a0 = _mm256_unpacklo_epi16(_0123, _4567),
458 a1 = _mm256_unpackhi_epi16(_0123, _4567),
459 b0 = _mm256_unpacklo_epi16(a0, a1),
460 b1 = _mm256_unpackhi_epi16(a0, a1),
461 a2 = _mm256_unpacklo_epi16(_89ab, _cdef),
462 a3 = _mm256_unpackhi_epi16(_89ab, _cdef),
463 b2 = _mm256_unpacklo_epi16(a2, a3),
464 b3 = _mm256_unpackhi_epi16(a2, a3),
465 rr = _mm256_unpacklo_epi64(b0, b2),
466 gg = _mm256_unpackhi_epi64(b0, b2),
467 bb = _mm256_unpacklo_epi64(b1, b3),
468 aa = _mm256_unpackhi_epi64(b1, b3);
470 *r = (
U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), rr);
471 *g = (
U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), gg);
472 *
b = (
U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), bb);
473 *
a = (
U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), aa);
476 auto rg012389ab = _mm256_unpacklo_epi16((__m256i)r, (__m256i)g),
477 rg4567cdef = _mm256_unpackhi_epi16((__m256i)r, (__m256i)g),
478 ba012389ab = _mm256_unpacklo_epi16((__m256i)
b, (__m256i)
a),
479 ba4567cdef = _mm256_unpackhi_epi16((__m256i)
b, (__m256i)
a);
481 auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
482 _23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
483 _45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
484 _67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);
486 auto _ab23 = _mm256_permutex_epi64(_23ab, 78),
487 _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0),
488 _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78),
489 _ef67 = _mm256_permutex_epi64(_67ef, 78),
490 _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0),
491 _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);
493 _mm256_storeu_si256((__m256i*)ptr, _0123);
494 _mm256_storeu_si256((__m256i*)ptr + 1, _4567);
495 _mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
496 _mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
500 F _048c, _159d, _26ae, _37bf;
502 _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr) );
503 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
504 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
505 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
506 _159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+4) );
507 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
508 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
509 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
510 _26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+8) );
511 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
512 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
513 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
514 _37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12) );
515 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
516 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
517 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);
519 F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
520 ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
521 rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
522 ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);
524 *r = (
F)_mm512_unpacklo_ps(rg02468acf, rg13579bde);
525 *g = (
F)_mm512_unpackhi_ps(rg02468acf, rg13579bde);
526 *
b = (
F)_mm512_unpacklo_ps(ba02468acf, ba13579bde);
527 *
a = (
F)_mm512_unpackhi_ps(ba02468acf, ba13579bde);
531 F rg014589cd = _mm512_unpacklo_ps(r, g),
532 rg2367abef = _mm512_unpackhi_ps(r, g),
533 ba014589cd = _mm512_unpacklo_ps(
b,
a),
534 ba2367abef = _mm512_unpackhi_ps(
b,
a);
536 F _048c = (
F)_mm512_unpacklo_pd((__m512d)rg014589cd, (__m512d)ba014589cd),
537 _26ae = (
F)_mm512_unpacklo_pd((__m512d)rg2367abef, (__m512d)ba2367abef),
538 _159d = (
F)_mm512_unpackhi_pd((__m512d)rg014589cd, (__m512d)ba014589cd),
539 _37bf = (
F)_mm512_unpackhi_pd((__m512d)rg2367abef, (__m512d)ba2367abef);
541 F _ae26 = (
F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_26ae),
542 _bf37 = (
F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_37bf),
543 _8c04 = (
F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_048c),
544 _9d15 = (
F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_159d);
546 __m512i index = _mm512_setr_epi32(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11);
547 F _0426 = (
F)_mm512_permutex2var_pd((__m512d)_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
549 _1537 = (
F)_mm512_permutex2var_pd((__m512d)_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
551 _5173 = _mm512_permutexvar_ps(index, _1537),
552 _0123 = (
F)_mm512_permutex2var_pd((__m512d)_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
555 F _5476 = (
F)_mm512_permutex2var_pd((__m512d)_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
557 _4567 = _mm512_permutexvar_ps(index, _5476),
558 _8cae = (
F)_mm512_permutex2var_pd((__m512d)_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
560 _9dbf = (
F)_mm512_permutex2var_pd((__m512d)_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
562 _d9fb = _mm512_permutexvar_ps(index, _9dbf),
563 _89ab = (
F)_mm512_permutex2var_pd((__m512d)_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
565 _dcfe = (
F)_mm512_permutex2var_pd((__m512d)_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
567 _cdef = _mm512_permutexvar_ps(index, _dcfe);
569 _mm512_storeu_ps(ptr+0, _0123);
570 _mm512_storeu_ps(ptr+16, _4567);
571 _mm512_storeu_ps(ptr+32, _89ab);
572 _mm512_storeu_ps(ptr+48, _cdef);
575#elif defined(JUMPER_IS_HSW)
577 template <
typename T>
using V = Vec<8, T>;
595 SI F abs_ (
F v) {
return _mm256_and_ps(v, 0-v); }
597 SI F floor_(
F v) {
return _mm256_floor_ps(v); }
598 SI F ceil_(
F v) {
return _mm256_ceil_ps(v); }
601 SI F sqrt_ (
F v) {
return _mm256_sqrt_ps (v); }
604 return _mm256_fnmadd_ps(v,
e, _mm256_set1_ps(2.0f)) *
e;
611 return (
U16)_mm_packus_epi32(_mm256_extractf128_si256((__m256i)v, 0),
612 _mm256_extractf128_si256((__m256i)v, 1));
615 auto r = _mm_packus_epi16((__m128i)v,(__m128i)v);
616 return sk_unaligned_load<U8>(&r);
621 return (
I32)_mm256_blendv_ps((__m256)
e, (__m256)t, (__m256)c);
625 SI bool any(
I32 c) {
return !_mm256_testz_si256((__m256i)c, _mm256_set1_epi32(-1)); }
626 SI bool all(
I32 c) {
return _mm256_testc_si256((__m256i)c, _mm256_set1_epi32(-1)); }
628 template <
typename T>
630 return V<T>{
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]],
631 p[ix[4]],
p[ix[5]],
p[ix[6]],
p[ix[7]], };
633 SI F gather(
const float*
p,
U32 ix) {
return _mm256_i32gather_ps(
p, (__m256i)ix, 4); }
635 return (
U32)_mm256_i32gather_epi32((
const int*)
p, (__m256i)ix, 4);
639 _mm256_i32gather_epi64(
640 (
const long long int*)
p, _mm256_extracti128_si256((__m256i)ix, 0), 8),
641 _mm256_i32gather_epi64(
642 (
const long long int*)
p, _mm256_extracti128_si256((__m256i)ix, 1), 8),
644 return sk_bit_cast<U64>(
parts);
649 dst[ix[0]] = after[0];
650 dst[ix[1]] = after[1];
651 dst[ix[2]] = after[2];
652 dst[ix[3]] = after[3];
653 dst[ix[4]] = after[4];
654 dst[ix[5]] = after[5];
655 dst[ix[6]] = after[6];
656 dst[ix[7]] = after[7];
660 __m128i _0123 = _mm_loadu_si128(((
const __m128i*)ptr) + 0),
661 _4567 = _mm_loadu_si128(((
const __m128i*)ptr) + 1);
662 *r = (
U16)_mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
663 _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
664 *g = (
U16)_mm_packs_epi32(_mm_srai_epi32(_0123, 16),
665 _mm_srai_epi32(_4567, 16));
668 auto _0123 = _mm_unpacklo_epi16((__m128i)r, (__m128i)g),
669 _4567 = _mm_unpackhi_epi16((__m128i)r, (__m128i)g);
670 _mm_storeu_si128((__m128i*)ptr + 0, _0123);
671 _mm_storeu_si128((__m128i*)ptr + 1, _4567);
675 __m128i _01 = _mm_loadu_si128(((
const __m128i*)ptr) + 0),
676 _23 = _mm_loadu_si128(((
const __m128i*)ptr) + 1),
677 _45 = _mm_loadu_si128(((
const __m128i*)ptr) + 2),
678 _67 = _mm_loadu_si128(((
const __m128i*)ptr) + 3);
680 auto _02 = _mm_unpacklo_epi16(_01, _23),
681 _13 = _mm_unpackhi_epi16(_01, _23),
682 _46 = _mm_unpacklo_epi16(_45, _67),
683 _57 = _mm_unpackhi_epi16(_45, _67);
685 auto rg0123 = _mm_unpacklo_epi16(_02, _13),
686 ba0123 = _mm_unpackhi_epi16(_02, _13),
687 rg4567 = _mm_unpacklo_epi16(_46, _57),
688 ba4567 = _mm_unpackhi_epi16(_46, _57);
690 *r = (
U16)_mm_unpacklo_epi64(rg0123, rg4567);
691 *g = (
U16)_mm_unpackhi_epi64(rg0123, rg4567);
692 *
b = (
U16)_mm_unpacklo_epi64(ba0123, ba4567);
693 *
a = (
U16)_mm_unpackhi_epi64(ba0123, ba4567);
696 auto rg0123 = _mm_unpacklo_epi16((__m128i)r, (__m128i)g),
697 rg4567 = _mm_unpackhi_epi16((__m128i)r, (__m128i)g),
698 ba0123 = _mm_unpacklo_epi16((__m128i)
b, (__m128i)
a),
699 ba4567 = _mm_unpackhi_epi16((__m128i)
b, (__m128i)
a);
701 auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
702 _23 = _mm_unpackhi_epi32(rg0123, ba0123),
703 _45 = _mm_unpacklo_epi32(rg4567, ba4567),
704 _67 = _mm_unpackhi_epi32(rg4567, ba4567);
706 _mm_storeu_si128((__m128i*)ptr + 0, _01);
707 _mm_storeu_si128((__m128i*)ptr + 1, _23);
708 _mm_storeu_si128((__m128i*)ptr + 2, _45);
709 _mm_storeu_si128((__m128i*)ptr + 3, _67);
713 F _04 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 0)),
714 _15 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 4)),
715 _26 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 8)),
716 _37 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+12));
717 _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
718 _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
719 _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
720 _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
722 F rg0145 = _mm256_unpacklo_ps(_04,_15),
723 ba0145 = _mm256_unpackhi_ps(_04,_15),
724 rg2367 = _mm256_unpacklo_ps(_26,_37),
725 ba2367 = _mm256_unpackhi_ps(_26,_37);
727 *r = (
F)_mm256_unpacklo_pd((__m256d)rg0145, (__m256d)rg2367);
728 *g = (
F)_mm256_unpackhi_pd((__m256d)rg0145, (__m256d)rg2367);
729 *
b = (
F)_mm256_unpacklo_pd((__m256d)ba0145, (__m256d)ba2367);
730 *
a = (
F)_mm256_unpackhi_pd((__m256d)ba0145, (__m256d)ba2367);
733 F rg0145 = _mm256_unpacklo_ps(r, g),
734 rg2367 = _mm256_unpackhi_ps(r, g),
735 ba0145 = _mm256_unpacklo_ps(
b,
a),
736 ba2367 = _mm256_unpackhi_ps(
b,
a);
738 F _04 = (
F)_mm256_unpacklo_pd((__m256d)rg0145, (__m256d)ba0145),
739 _15 = (
F)_mm256_unpackhi_pd((__m256d)rg0145, (__m256d)ba0145),
740 _26 = (
F)_mm256_unpacklo_pd((__m256d)rg2367, (__m256d)ba2367),
741 _37 = (
F)_mm256_unpackhi_pd((__m256d)rg2367, (__m256d)ba2367);
743 F _01 = _mm256_permute2f128_ps(_04, _15, 32),
744 _23 = _mm256_permute2f128_ps(_26, _37, 32),
745 _45 = _mm256_permute2f128_ps(_04, _15, 49),
746 _67 = _mm256_permute2f128_ps(_26, _37, 49);
747 _mm256_storeu_ps(ptr+ 0, _01);
748 _mm256_storeu_ps(ptr+ 8, _23);
749 _mm256_storeu_ps(ptr+16, _45);
750 _mm256_storeu_ps(ptr+24, _67);
753#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
754 template <
typename T>
using V = Vec<4, T>;
763 return _mm_or_ps(_mm_and_ps((__m128)c, t), _mm_andnot_ps((__m128)c,
e));
766 return (
I32)_mm_or_ps(_mm_and_ps((__m128)c, (__m128)t),
767 _mm_andnot_ps((__m128)c, (__m128)
e));
772#if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
781 return sk_bit_cast<U32>(
if_then_else(
a <
b, sk_bit_cast<I32>(
a), sk_bit_cast<I32>(
b)));
784 return sk_bit_cast<U32>(
if_then_else(
a >
b, sk_bit_cast<I32>(
a), sk_bit_cast<I32>(
b)));
790 SI F abs_(
F v) {
return _mm_and_ps(v, 0-v); }
791#if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
799 SI F sqrt_(
F v) {
return _mm_sqrt_ps (v); }
806 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
807 auto p = _mm_packus_epi32((__m128i)v,(__m128i)v);
810 auto p = _mm_srai_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
811 p = _mm_packs_epi32(
p,
p);
813 return sk_unaligned_load<U16>(&
p);
816 auto r = widen_cast<__m128i>(v);
817 r = _mm_packus_epi16(r,r);
818 return sk_unaligned_load<U8>(&r);
822 SI bool any(
I32 c) {
return _mm_movemask_ps(sk_bit_cast<F>(c)) != 0b0000; }
823 SI bool all(
I32 c) {
return _mm_movemask_ps(sk_bit_cast<F>(c)) == 0b1111; }
826 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
827 return _mm_floor_ps(v);
829 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
835 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
836 return _mm_ceil_ps(v);
838 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
843 template <
typename T>
845 return V<T>{
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]]};
850 dst[ix[0]] = after[0];
851 dst[ix[1]] = after[1];
852 dst[ix[2]] = after[2];
853 dst[ix[3]] = after[3];
856 __m128i _01 = _mm_loadu_si128(((
const __m128i*)ptr) + 0);
857 auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8);
858 auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8);
860 auto R = _mm_shuffle_epi32(rg, 0x88);
861 auto G = _mm_shuffle_epi32(rg, 0xDD);
862 *r = sk_unaligned_load<U16>(&
R);
863 *g = sk_unaligned_load<U16>(&
G);
866 __m128i rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
867 _mm_storeu_si128((__m128i*)ptr + 0, rg);
871 __m128i _01 = _mm_loadu_si128(((
const __m128i*)ptr) + 0),
872 _23 = _mm_loadu_si128(((
const __m128i*)ptr) + 1);
874 auto _02 = _mm_unpacklo_epi16(_01, _23),
875 _13 = _mm_unpackhi_epi16(_01, _23);
877 auto rg = _mm_unpacklo_epi16(_02, _13),
878 ba = _mm_unpackhi_epi16(_02, _13);
880 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
881 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
882 *
b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
883 *
a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
887 auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
888 ba = _mm_unpacklo_epi16(widen_cast<__m128i>(
b), widen_cast<__m128i>(
a));
890 _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
891 _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
895 F _0 = _mm_loadu_ps(ptr + 0),
896 _1 = _mm_loadu_ps(ptr + 4),
897 _2 = _mm_loadu_ps(ptr + 8),
898 _3 = _mm_loadu_ps(ptr +12);
899 _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
907 _MM_TRANSPOSE4_PS(r,g,
b,
a);
908 _mm_storeu_ps(ptr + 0, r);
909 _mm_storeu_ps(ptr + 4, g);
910 _mm_storeu_ps(ptr + 8,
b);
911 _mm_storeu_ps(ptr +12,
a);
914#elif defined(JUMPER_IS_LASX)
916 template <
typename T>
using V = Vec<8, T>;
924 SI __m128i emulate_lasx_d_xr2vr_l(__m256i
a) {
926 v2i64 al = {tmp[0], tmp[1]};
930 SI __m128i emulate_lasx_d_xr2vr_h(__m256i
a) {
932 v2i64 ah = {tmp[2], tmp[3]};
937 return sk_bit_cast<Vec<8,float>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(
e),
938 sk_bit_cast<__m256i>(t),
939 sk_bit_cast<__m256i>(c)));
943 return sk_bit_cast<Vec<8,int32_t>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(
e),
944 sk_bit_cast<__m256i>(t),
945 sk_bit_cast<__m256i>(c)));
962 SI F sqrt_(
F v) {
return __lasx_xvfsqrt_s(v); }
966 return __lasx_xvftintrz_w_s(v + t);
971 return __lasx_xvftintrz_w_s(v + t);
976 return __lasx_xvftintrz_w_s(
mad(v,
scale, t));
980 return __lsx_vpickev_h(__lsx_vsat_wu(emulate_lasx_d_xr2vr_h(v), 15),
981 __lsx_vsat_wu(emulate_lasx_d_xr2vr_l(v), 15));
985 __m128i tmp = __lsx_vsat_hu(v, 7);
986 auto r = __lsx_vpickev_b(tmp, tmp);
987 return sk_unaligned_load<U8>(&r);
991 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c));
992 return (retv[0] | retv[4]) != 0b0000;
996 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c));
997 return (retv[0] & retv[4]) == 0b1111;
1001 return __lasx_xvfrintrm_s(v);
1005 return __lasx_xvfrintrp_s(v);
1008 template <
typename T>
1010 return {
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]],
1011 p[ix[4]],
p[ix[5]],
p[ix[6]],
p[ix[7]], };
1014 template <
typename V,
typename S>
1018 dst[ix[0]] = after[0];
1019 dst[ix[1]] = after[1];
1020 dst[ix[2]] = after[2];
1021 dst[ix[3]] = after[3];
1022 dst[ix[4]] = after[4];
1023 dst[ix[5]] = after[5];
1024 dst[ix[6]] = after[6];
1025 dst[ix[7]] = after[7];
1029 U16 _0123 = __lsx_vld(ptr, 0),
1030 _4567 = __lsx_vld(ptr, 16);
1031 *r = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_4567, 16), 16), 15),
1032 __lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_0123, 16), 16), 15));
1033 *g = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(_4567, 16), 15),
1034 __lsx_vsat_w(__lsx_vsrai_w(_0123, 16), 15));
1037 auto _0123 = __lsx_vilvl_h(g, r),
1038 _4567 = __lsx_vilvh_h(g, r);
1039 __lsx_vst(_0123, ptr, 0);
1040 __lsx_vst(_4567, ptr, 16);
1044 __m128i _01 = __lsx_vld(ptr, 0),
1045 _23 = __lsx_vld(ptr, 16),
1046 _45 = __lsx_vld(ptr, 32),
1047 _67 = __lsx_vld(ptr, 48);
1049 auto _02 = __lsx_vilvl_h(_23, _01),
1050 _13 = __lsx_vilvh_h(_23, _01),
1051 _46 = __lsx_vilvl_h(_67, _45),
1052 _57 = __lsx_vilvh_h(_67, _45);
1054 auto rg0123 = __lsx_vilvl_h(_13, _02),
1055 ba0123 = __lsx_vilvh_h(_13, _02),
1056 rg4567 = __lsx_vilvl_h(_57, _46),
1057 ba4567 = __lsx_vilvh_h(_57, _46);
1059 *r = __lsx_vilvl_d(rg4567, rg0123);
1060 *g = __lsx_vilvh_d(rg4567, rg0123);
1061 *
b = __lsx_vilvl_d(ba4567, ba0123);
1062 *
a = __lsx_vilvh_d(ba4567, ba0123);
1066 auto rg0123 = __lsx_vilvl_h(g, r),
1067 rg4567 = __lsx_vilvh_h(g, r),
1068 ba0123 = __lsx_vilvl_h(
a,
b),
1069 ba4567 = __lsx_vilvh_h(
a,
b);
1071 auto _01 =__lsx_vilvl_w(ba0123, rg0123),
1072 _23 =__lsx_vilvh_w(ba0123, rg0123),
1073 _45 =__lsx_vilvl_w(ba4567, rg4567),
1074 _67 =__lsx_vilvh_w(ba4567, rg4567);
1076 __lsx_vst(_01, ptr, 0);
1077 __lsx_vst(_23, ptr, 16);
1078 __lsx_vst(_45, ptr, 32);
1079 __lsx_vst(_67, ptr, 48);
1083 F _04 = (
F)__lasx_xvpermi_q(__lasx_xvld(ptr, 0), __lasx_xvld(ptr, 64), 0x02);
1084 F _15 = (
F)__lasx_xvpermi_q(__lasx_xvld(ptr, 16), __lasx_xvld(ptr, 80), 0x02);
1085 F _26 = (
F)__lasx_xvpermi_q(__lasx_xvld(ptr, 32), __lasx_xvld(ptr, 96), 0x02);
1086 F _37 = (
F)__lasx_xvpermi_q(__lasx_xvld(ptr, 48), __lasx_xvld(ptr, 112), 0x02);
1088 F rg0145 = (
F)__lasx_xvilvl_w((__m256i)_15, (__m256i)_04),
1089 ba0145 = (
F)__lasx_xvilvh_w((__m256i)_15, (__m256i)_04),
1090 rg2367 = (
F)__lasx_xvilvl_w((__m256i)_37, (__m256i)_26),
1091 ba2367 = (
F)__lasx_xvilvh_w((__m256i)_37, (__m256i)_26);
1093 *r = (
F)__lasx_xvilvl_d((__m256i)rg2367, (__m256i)rg0145);
1094 *g = (
F)__lasx_xvilvh_d((__m256i)rg2367, (__m256i)rg0145);
1095 *
b = (
F)__lasx_xvilvl_d((__m256i)ba2367, (__m256i)ba0145);
1096 *
a = (
F)__lasx_xvilvh_d((__m256i)ba2367, (__m256i)ba0145);
1099 F rg0145 = (
F)__lasx_xvilvl_w((__m256i)g, (__m256i)r),
1100 rg2367 = (
F)__lasx_xvilvh_w((__m256i)g, (__m256i)r),
1101 ba0145 = (
F)__lasx_xvilvl_w((__m256i)
a, (__m256i)
b),
1102 ba2367 = (
F)__lasx_xvilvh_w((__m256i)
a, (__m256i)
b);
1104 F _04 = (
F)__lasx_xvilvl_d((__m256i)ba0145, (__m256i)rg0145),
1105 _15 = (
F)__lasx_xvilvh_d((__m256i)ba0145, (__m256i)rg0145),
1106 _26 = (
F)__lasx_xvilvl_d((__m256i)ba2367, (__m256i)rg2367),
1107 _37 = (
F)__lasx_xvilvh_d((__m256i)ba2367, (__m256i)rg2367);
1109 F _01 = (
F)__lasx_xvpermi_q((__m256i)_04, (__m256i)_15, 0x02),
1110 _23 = (
F)__lasx_xvpermi_q((__m256i)_26, (__m256i)_37, 0x02),
1111 _45 = (
F)__lasx_xvpermi_q((__m256i)_04, (__m256i)_15, 0x13),
1112 _67 = (
F)__lasx_xvpermi_q((__m256i)_26, (__m256i)_37, 0x13);
1113 __lasx_xvst(_01, ptr, 0);
1114 __lasx_xvst(_23, ptr, 32);
1115 __lasx_xvst(_45, ptr, 64);
1116 __lasx_xvst(_67, ptr, 96);
1119#elif defined(JUMPER_IS_LSX)
1120 template <
typename T>
using V = Vec<4, T>;
1128 #define _LSX_TRANSPOSE4_S(row0, row1, row2, row3) \
1130 __m128 __t0 = (__m128)__lsx_vilvl_w ((__m128i)row1, (__m128i)row0); \
1131 __m128 __t1 = (__m128)__lsx_vilvl_w ((__m128i)row3, (__m128i)row2); \
1132 __m128 __t2 = (__m128)__lsx_vilvh_w ((__m128i)row1, (__m128i)row0); \
1133 __m128 __t3 = (__m128)__lsx_vilvh_w ((__m128i)row3, (__m128i)row2); \
1134 (row0) = (__m128)__lsx_vilvl_d ((__m128i)__t1, (__m128i)__t0); \
1135 (row1) = (__m128)__lsx_vilvh_d ((__m128i)__t1, (__m128i)__t0); \
1136 (row2) = (__m128)__lsx_vilvl_d ((__m128i)__t3, (__m128i)__t2); \
1137 (row3) = (__m128)__lsx_vilvh_d ((__m128i)__t3, (__m128i)__t2); \
1141 return sk_bit_cast<Vec<4,float>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(
e),
1142 sk_bit_cast<__m128i>(t),
1143 sk_bit_cast<__m128i>(c)));
1147 return sk_bit_cast<Vec<4,int32_t>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(
e),
1148 sk_bit_cast<__m128i>(t),
1149 sk_bit_cast<__m128i>(c)));
1166 SI F sqrt_(
F v) {
return __lsx_vfsqrt_s (v); }
1170 return __lsx_vftintrz_w_s(v + t); }
1174 return __lsx_vftintrz_w_s(v + t); }
1178 return __lsx_vftintrz_w_s(
mad(v,
scale, t)); }
1181 __m128i tmp = __lsx_vsat_wu(v, 15);
1182 auto p = __lsx_vpickev_h(tmp, tmp);
1183 return sk_unaligned_load<U16>(&
p);
1187 auto r = widen_cast<__m128i>(v);
1188 __m128i tmp = __lsx_vsat_hu(r, 7);
1189 r = __lsx_vpickev_b(tmp, tmp);
1190 return sk_unaligned_load<U8>(&r);
1194 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c));
1195 return retv[0] != 0b0000;
1199 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c));
1200 return retv[0] == 0b1111;
1204 return __lsx_vfrintrm_s(v);
1208 return __lsx_vfrintrp_s(v);
1211 template <
typename T>
1213 return {
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]]};
1216 template <
typename V,
typename S>
1220 dst[ix[0]] = after[0];
1221 dst[ix[1]] = after[1];
1222 dst[ix[2]] = after[2];
1223 dst[ix[3]] = after[3];
1227 __m128i _01 = __lsx_vld(ptr, 0);
1228 auto rg = __lsx_vshuf4i_h(_01, 0xD8);
1230 auto R = __lsx_vshuf4i_w(rg, 0x88);
1231 auto G = __lsx_vshuf4i_w(rg, 0xDD);
1232 *r = sk_unaligned_load<U16>(&
R);
1233 *g = sk_unaligned_load<U16>(&
G);
1237 U32 rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r));
1238 __lsx_vst(rg, ptr, 0);
1242 __m128i _01 = __lsx_vld(ptr, 0),
1243 _23 = __lsx_vld(ptr, 16);
1245 auto _02 = __lsx_vilvl_h(_23, _01),
1246 _13 = __lsx_vilvh_h(_23, _01);
1248 auto rg = __lsx_vilvl_h(_13, _02),
1249 ba = __lsx_vilvh_h(_13, _02);
1251 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
1252 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
1253 *
b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
1254 *
a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
1258 auto rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r)),
1259 ba = __lsx_vilvl_h(widen_cast<__m128i>(
a), widen_cast<__m128i>(
b));
1261 __lsx_vst(__lsx_vilvl_w(ba, rg), ptr, 0);
1262 __lsx_vst(__lsx_vilvh_w(ba, rg), ptr, 16);
1266 F _0 = (
F)__lsx_vld(ptr, 0),
1267 _1 = (
F)__lsx_vld(ptr, 16),
1268 _2 = (
F)__lsx_vld(ptr, 32),
1269 _3 = (
F)__lsx_vld(ptr, 48);
1270 _LSX_TRANSPOSE4_S(_0,_1,_2,_3);
1278 _LSX_TRANSPOSE4_S(r,g,
b,
a);
1279 __lsx_vst(r, ptr, 0);
1280 __lsx_vst(g, ptr, 16);
1281 __lsx_vst(
b, ptr, 32);
1282 __lsx_vst(
a, ptr, 48);
1298#if defined(__clang__) || defined(JUMPER_IS_SCALAR)
1303SI constexpr F F_(
float x) {
return x -
F(); }
1312#if !defined(JUMPER_IS_SCALAR)
1336#if defined(JUMPER_IS_SCALAR)
1344 SI F cast64(
U64 v) {
return __builtin_convertvector( v,
F); }
1350#if !defined(JUMPER_IS_SCALAR)
1361 F e =
cast(sk_bit_cast<U32>(
x)) * (1.0f / (1<<23));
1364 F m = sk_bit_cast<F>((sk_bit_cast<U32>(
x) & 0x007fffff) | 0x3f000000);
1366 return nmad(
m, 1.498030302f,
e - 124.225514990f) - 1.725879990f / (0.3520887068f +
m);
1370 const float ln2 = 0.69314718f;
1375 constexpr float kInfinityBits = 0x7f800000;
1378 F approx =
nmad(
f, 1.490129070f,
x + 121.274057500f);
1379 approx += 27.728023300f / (4.84252568f -
f);
1380 approx *= 1.0f * (1<<23);
1381 approx =
min(
max(approx,
F0),
F_(kInfinityBits));
1383 return sk_bit_cast<F>(
round(approx));
1387 const float log2_e = 1.4426950408889634074f;
1395#if !defined(JUMPER_IS_SCALAR)
1400#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64)
1401 return vcvt_f32_f16((float16x4_t)
h);
1403#elif defined(JUMPER_IS_SKX)
1404 return _mm512_cvtph_ps((__m256i)
h);
1406#elif defined(JUMPER_IS_HSW)
1407 return _mm256_cvtph_ps((__m128i)
h);
1416 auto denorm = (
I32)em < 0x0400;
1418 , sk_bit_cast<F>( (
s<<16) + (em<<13) + ((127-15)<<23) ));
1423#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64)
1424 return (
U16)vcvt_f16_f32(
f);
1426#elif defined(JUMPER_IS_SKX)
1427 return (
U16)_mm512_cvtps_ph(
f, _MM_FROUND_CUR_DIRECTION);
1429#elif defined(JUMPER_IS_HSW)
1430 return (
U16)_mm256_cvtps_ph(
f, _MM_FROUND_CUR_DIRECTION);
1434 U32 sem = sk_bit_cast<U32>(
f),
1435 s = sem & 0x80000000,
1439 auto denorm = (
I32)em < 0x38800000;
1441 , (
I32)((
s>>16) + (em>>13) - ((127-15)<<10))));
1446 size_t dx,
size_t dy,
size_t tail) {
1450 const ptrdiff_t
offset = patch.info.bytesPerPixel * (dy * ctx->
stride +
dx);
1451 if (patch.info.load) {
1452 void* ctxData = SkTAddOffset<void>(ctx->
pixels,
offset);
1453 memcpy(patch.scratch, ctxData, patch.info.bytesPerPixel *
tail);
1457 void* scratchFakeBase = SkTAddOffset<void>(patch.scratch, -
offset);
1458 patch.backup = ctx->
pixels;
1459 ctx->
pixels = scratchFakeBase;
1464 size_t dx,
size_t dy,
size_t tail) {
1469 ctx->
pixels = patch.backup;
1470 patch.backup =
nullptr;
1472 const ptrdiff_t
offset = patch.info.bytesPerPixel * (dy * ctx->
stride +
dx);
1473 if (patch.info.store) {
1474 void* ctxData = SkTAddOffset<void>(ctx->
pixels,
offset);
1475 memcpy(ctxData, patch.scratch, patch.info.bytesPerPixel *
tail);
1480#if defined(JUMPER_IS_SCALAR) || defined(JUMPER_IS_SSE2)
1491static constexpr size_t N =
sizeof(
F) /
sizeof(
float);
1497#if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
1500 #define ABI __attribute__((pcs("aapcs-vfp")))
1501 #define JUMPER_NARROW_STAGES 1
1502#elif defined(_MSC_VER)
1505 #define ABI __vectorcall
1506 #define JUMPER_NARROW_STAGES 1
1507#elif defined(__x86_64__) || defined(SK_CPU_ARM64) || defined(SK_CPU_LOONGARCH)
1510 #define JUMPER_NARROW_STAGES 0
1515 #define JUMPER_NARROW_STAGES 1
1518#if JUMPER_NARROW_STAGES
1531 size_t xlimit,
size_t ylimit,
1534 uint8_t* tailPointer) {
1535 uint8_t unreferencedTail;
1537 tailPointer = &unreferencedTail;
1540 const size_t x0 =
dx;
1541 std::byte*
const base =
nullptr;
1542 for (; dy < ylimit; dy++) {
1543 #if JUMPER_NARROW_STAGES
1545 while (
params.dx +
N <= xlimit) {
1550 *tailPointer =
tail;
1554 *tailPointer = 0xFF;
1558 while (
dx +
N <= xlimit) {
1559 start(program,
dx,dy,
base,
F0,
F0,
F0,
F0,
F0,
F0,
F0,
F0);
1562 if (
size_t tail = xlimit -
dx) {
1563 *tailPointer =
tail;
1565 start(program,
dx,dy,
base,
F0,
F0,
F0,
F0,
F0,
F0,
F0,
F0);
1567 *tailPointer = 0xFF;
1574 #define JUMPER_MUSTTAIL [[clang::musttail]]
1576 #define JUMPER_MUSTTAIL
1579#if JUMPER_NARROW_STAGES
1580 #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL) \
1581 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1582 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1583 static void ABI name(Params* params, SkRasterPipelineStage* program, \
1584 F r, F g, F b, F a) { \
1585 OFFSET name##_k(Ctx{program}, params->dx,params->dy,params->base, \
1586 r,g,b,a, params->dr, params->dg, params->db, params->da); \
1588 auto fn = (Stage)program->fn; \
1589 MUSTTAIL return fn(params, program, r,g,b,a); \
1591 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1592 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1594 #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL) \
1595 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1596 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1597 static void ABI name(SkRasterPipelineStage* program, size_t dx, size_t dy, \
1598 std::byte* base, F r, F g, F b, F a, F dr, F dg, F db, F da) { \
1599 OFFSET name##_k(Ctx{program}, dx,dy,base, r,g,b,a, dr,dg,db,da); \
1601 auto fn = (Stage)program->fn; \
1602 MUSTTAIL return fn(program, dx,dy,base, r,g,b,a, dr,dg,db,da); \
1604 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1605 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1610#define STAGE(name, arg) \
1611 DECLARE_STAGE(name, arg, void, ++program, , )
1616#define STAGE_TAIL(name, arg) \
1617 DECLARE_STAGE(name, arg, void, ++program, , JUMPER_MUSTTAIL)
1620#define STAGE_BRANCH(name, arg) \
1621 DECLARE_STAGE(name, arg, int, , program +=, JUMPER_MUSTTAIL)
1625#if JUMPER_NARROW_STAGES
1657#if JUMPER_NARROW_STAGES
1664 ctx->
stage =
nullptr;
1666 program = ctx->
stage;
1669 r = sk_unaligned_load<F>(ctx->
r );
1670 g = sk_unaligned_load<F>(ctx->
g );
1671 b = sk_unaligned_load<F>(ctx->
b );
1672 a = sk_unaligned_load<F>(ctx->
a );
1673 params->dr = sk_unaligned_load<F>(ctx->
dr);
1674 params->dg = sk_unaligned_load<F>(ctx->
dg);
1675 params->db = sk_unaligned_load<F>(ctx->
db);
1676 params->da = sk_unaligned_load<F>(ctx->
da);
1693 ctx->
stage = program;
1697 size_t dx,
size_t dy, std::byte*
base,
1703 ctx->
stage =
nullptr;
1704 next(program,
dx, dy,
base, r, g,
b,
a,
dr,
dg,
db,
da);
1705 program = ctx->
stage;
1708 r = sk_unaligned_load<F>(ctx->
r );
1709 g = sk_unaligned_load<F>(ctx->
g );
1710 b = sk_unaligned_load<F>(ctx->
b );
1711 a = sk_unaligned_load<F>(ctx->
a );
1712 dr = sk_unaligned_load<F>(ctx->
dr);
1713 dg = sk_unaligned_load<F>(ctx->
dg);
1714 db = sk_unaligned_load<F>(ctx->
db);
1715 da = sk_unaligned_load<F>(ctx->
da);
1721 size_t dx,
size_t dy, std::byte*
base,
1733 ctx->
stage = program;
1740template <
typename V,
typename T>
1742 return sk_unaligned_load<V>(
src);
1745template <
typename V,
typename T>
1758 *r =
cast(wide & (31<<11)) * (1.0f / (31<<11));
1759 *g =
cast(wide & (63<< 5)) * (1.0f / (63<< 5));
1760 *
b =
cast(wide & (31<< 0)) * (1.0f / (31<< 0));
1764 *r =
cast(wide & (15<<12)) * (1.0f / (15<<12));
1765 *g =
cast(wide & (15<< 8)) * (1.0f / (15<< 8));
1766 *
b =
cast(wide & (15<< 4)) * (1.0f / (15<< 4));
1767 *
a =
cast(wide & (15<< 0)) * (1.0f / (15<< 0));
1770 *r =
cast((_8888 ) & 0xff) * (1/255.0f);
1771 *g =
cast((_8888 >> 8) & 0xff) * (1/255.0f);
1772 *
b =
cast((_8888 >> 16) & 0xff) * (1/255.0f);
1773 *
a =
cast((_8888 >> 24) ) * (1/255.0f);
1777 *r =
cast((wide ) & 0xff) * (1/255.0f);
1778 *g =
cast((wide >> 8) & 0xff) * (1/255.0f);
1781 *r =
cast((
rgba ) & 0x3ff) * (1/1023.0f);
1782 *g =
cast((
rgba >> 10) & 0x3ff) * (1/1023.0f);
1783 *
b =
cast((
rgba >> 20) & 0x3ff) * (1/1023.0f);
1787 static constexpr float min = -0.752941f;
1788 static constexpr float max = 1.25098f;
1789 static constexpr float range =
max -
min;
1790 *r =
cast((
rgba ) & 0x3ff) * (1/1023.0f) * range +
min;
1791 *g =
cast((
rgba >> 10) & 0x3ff) * (1/1023.0f) * range +
min;
1792 *
b =
cast((
rgba >> 20) & 0x3ff) * (1/1023.0f) * range +
min;
1796 *r = (
cast64((_10x6 >> 6) & 0x3ff) - 384.f) / 510.f;
1797 *g = (
cast64((_10x6 >> 22) & 0x3ff) - 384.f) / 510.f;
1798 *
b = (
cast64((_10x6 >> 38) & 0x3ff) - 384.f) / 510.f;
1799 *
a = (
cast64((_10x6 >> 54) & 0x3ff) - 384.f) / 510.f;
1802 *r =
cast64((_10x6 >> 6) & 0x3ff) * (1/1023.0f);
1803 *g =
cast64((_10x6 >> 22) & 0x3ff) * (1/1023.0f);
1804 *
b =
cast64((_10x6 >> 38) & 0x3ff) * (1/1023.0f);
1805 *
a =
cast64((_10x6 >> 54) & 0x3ff) * (1/1023.0f);
1808 *r =
cast((_1616 ) & 0xffff) * (1/65535.0f);
1809 *g =
cast((_1616 >> 16) & 0xffff) * (1/65535.0f);
1812 *r =
cast64((_16161616 ) & 0xffff) * (1/65535.0f);
1813 *g =
cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f);
1814 *
b =
cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f);
1815 *
a =
cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f);
1819template <
typename T>
1826 F inclusive = sk_bit_cast<F>(sk_bit_cast<U32>(limit) - 1);
1827 return min(
max(0.0f, v), inclusive);
1833 inclusiveL = sk_bit_cast<F>( sk_bit_cast<U32>(
F_(limit)) - 1 );
1834 return min(
max(inclusiveZ, v), inclusiveL);
1843 constexpr float A = 6.28230858f;
1844 constexpr float B = -41.1693687f;
1845 constexpr float C = 74.4388885f;
1851 constexpr float one_over_pi2 = 1 / (2 *
SK_FloatPI);
1852 x =
mad(
x, -one_over_pi2, 0.25f);
1858 constexpr float one_over_pi2 = 1 / (2 *
SK_FloatPI);
1891 I32 neg = (
x < 0.0f);
1895 I32 use_quotient = (
x > (Pi/8));
1899 const float c4 = 62 / 2835.0f;
1900 const float c3 = 17 / 315.0f;
1901 const float c2 = 2 / 15.0f;
1902 const float c1 = 1 / 3.0f;
1903 const float c0 = 1.0f;
1905 x *=
mad(x2,
mad(x2,
mad(x2,
mad(x2, c4, c3), c2), c1), c0);
1921 const float c4 = 0.14130025741326729f;
1922 const float c3 = -0.34312835980675116f;
1923 const float c2 = -0.016172900528248768f;
1924 const float c1 = 1.0037696976200385f;
1925 const float c0 = -0.00014758242182738969f;
1931 I32 neg = (
x < 0.0f);
1933 I32 flip = (
x > 1.0f);
1945 I32 neg = (
x < 0.0f);
1947 const float c3 = -0.0187293f;
1948 const float c2 = 0.0742610f;
1949 const float c1 = -0.2121144f;
1950 const float c0 = 1.5707288f;
1971 I32 neg = (arg < 0.0f);
1986template <
typename T>
2011#if defined(JUMPER_IS_SCALAR)
2021#if defined(JUMPER_IS_SCALAR)
2034 static constexpr float iota[] = {
2035 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
2036 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
2043 r =
cast(
U32_(
dx)) + sk_unaligned_load<F>(iota);
2051 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
2054 U32 X =
U32_(
dx) + sk_unaligned_load<U32>(iota),
2065 U32 M = (
Y & 1) << 5 | (
X & 1) << 4
2066 | (
Y & 2) << 2 | (
X & 2) << 1
2067 | (
Y & 4) >> 1 | (
X & 4) >> 2;
2072 F dither =
mad(
cast(
M), 2/128.0f, -63/128.0f);
2074 r =
mad(dither, *rate, r);
2075 g =
mad(dither, *rate, g);
2076 b =
mad(dither, *rate,
b);
2116 r = sk_unaligned_load<F>(ptr + 0*
N);
2117 g = sk_unaligned_load<F>(ptr + 1*
N);
2118 b = sk_unaligned_load<F>(ptr + 2*
N);
2119 a = sk_unaligned_load<F>(ptr + 3*
N);
2136 r = sk_unaligned_load<F>(ptr + 0*
N);
2137 g = sk_unaligned_load<F>(ptr + 1*
N);
2146 dr = sk_unaligned_load<F>(ptr + 0*
N);
2147 dg = sk_unaligned_load<F>(ptr + 1*
N);
2148 db = sk_unaligned_load<F>(ptr + 2*
N);
2149 da = sk_unaligned_load<F>(ptr + 3*
N);
2161#define BLEND_MODE(name) \
2162 SI F name##_channel(F s, F d, F sa, F da); \
2163 STAGE(name, NoCtx) { \
2164 r = name##_channel(r,dr,a,da); \
2165 g = name##_channel(g,dg,a,da); \
2166 b = name##_channel(b,db,a,da); \
2167 a = name##_channel(a,da,a,da); \
2169 SI F name##_channel(F s, F d, F sa, F da)
2192#define BLEND_MODE(name) \
2193 SI F name##_channel(F s, F d, F sa, F da); \
2194 STAGE(name, NoCtx) { \
2195 r = name##_channel(r,dr,a,da); \
2196 g = name##_channel(g,dg,a,da); \
2197 b = name##_channel(b,db,a,da); \
2198 a = mad(da, inv(a), a); \
2200 SI F name##_channel(F s, F d, F sa, F da)
2235 F darkSrc =
d*(sa + (s2 - sa)*(1.0f -
m)),
2236 darkDst = (m4*m4 + m4)*(
m - 1.0f) + 7.0f*
m,
2266 F diff = l -
lum(*r, *g, *
b);
2280 l =
lum(*r, *g, *
b),
2281 mn_scale = ( l) *
rcp_fast(l - mn),
2286 *r =
clip_channel(*r, l, clip_low, clip_high, mn_scale, mx_scale);
2287 *g =
clip_channel(*g, l, clip_low, clip_high, mn_scale, mx_scale);
2288 *
b =
clip_channel(*
b, l, clip_low, clip_high, mn_scale, mx_scale);
2347 auto ptr = ptr_at_xy<uint32_t>(ctx,
dx,dy);
2349 U32 dst = load<U32>(ptr);
2393STAGE(unbounded_set_rgb,
const float* rgb) {
2440 float inf = sk_bit_cast<float>(0x7f800000);
2447 float inf = sk_bit_cast<float>(0x7f800000);
2466 (r-g)*d_rcp + 4.0f)));
2468 F l = (mx + mn) * 0.5f;
2482 c = (1.0f -
abs_(2.0f * l - 1)) *
s;
2484 auto hue_to_rgb = [&](
F hue) {
2486 return (q - 0.5f) * c + l;
2489 r = hue_to_rgb(
h + 0.0f/3.0f);
2490 g = hue_to_rgb(
h + 2.0f/3.0f);
2491 b = hue_to_rgb(
h + 1.0f/3.0f);
2497 constexpr float k = 24389 / 27.0f;
2498 constexpr float e = 216 / 24389.0f;
2501 f[1] = (r + 16) * (1 / 116.0f);
2502 f[0] = (g * (1 / 500.0f)) +
f[1];
2503 f[2] =
f[1] - (
b * (1 / 200.0f));
2505 F f_cubed[3] = {
f[0]*
f[0]*
f[0],
f[1]*
f[1]*
f[1],
f[2]*
f[2]*
f[2] };
2508 if_then_else(f_cubed[0] >
e, f_cubed[0], (116 *
f[0] - 16) * (1 / k)),
2510 if_then_else(f_cubed[2] >
e, f_cubed[2], (116 *
f[2] - 16) * (1 / k))
2513 constexpr float D50[3] = { 0.3457f / 0.3585f, 1.0f, (1.0f - 0.3457f - 0.3585f) / 0.3585f };
2520 F l_ = r + 0.3963377774f * g + 0.2158037573f *
b,
2521 m_ = r - 0.1055613458f * g - 0.0638541728f *
b,
2522 s_ = r - 0.0894841775f * g - 1.2914855480f *
b;
2528 r = +4.0767416621f * l - 3.3077115913f *
m + 0.2309699292f *
s;
2529 g = -1.2684380046f * l + 2.6097574011f *
m - 0.3413193965f *
s;
2530 b = -0.0041960863f * l - 0.7034186147f *
m + 1.7076147010f *
s;
2544 r = +4.0767416621f * l - 3.3077115913f *
m + 0.2309699292f *
s;
2545 g = -1.2684380046f * l + 2.6097574011f *
m - 0.3413193965f *
s;
2546 b = -0.0041960863f * l - 0.7034186147f *
m + 1.7076147010f *
s;
2560 g =
C *
cos_(hueRadians);
2561 b =
C *
sin_(hueRadians);
2577 mod_(0 +
h * (1 / 30.0f), 12),
2578 mod_(8 +
h * (1 / 30.0f), 12),
2579 mod_(4 +
h * (1 / 30.0f), 12)
2583 l -
a *
max(-1.0f,
min(
min(k[0] - 3.0f, 9.0f - k[0]), 1.0f)),
2584 l -
a *
max(-1.0f,
min(
min(k[1] - 3.0f, 9.0f - k[1]), 1.0f)),
2585 l -
a *
max(-1.0f,
min(
min(k[2] - 3.0f, 9.0f - k[2]), 1.0f))
2600 F gray = g / (g +
b);
2603 rgb.
r = rgb.
r * (1 - g -
b) + g;
2604 rgb.
g = rgb.
g * (1 - g -
b) + g;
2605 rgb.
b = rgb.
b * (1 - g -
b) + g;
2607 auto isGray = (g +
b) >= 1;
2627 auto ptr = ptr_at_xy<const uint8_t>(ctx,
dx,dy);
2629 auto scales = load<U8>(ptr);
2638 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx,dy);
2641 from_565(load<U16>(ptr), &cr, &cg, &cb);
2652 return mad(to-from, t, from);
2661STAGE(scale_native,
const float scales[]) {
2662 auto c = sk_unaligned_load<F>(scales);
2668STAGE(lerp_native,
const float scales[]) {
2669 auto c = sk_unaligned_load<F>(scales);
2676 auto ptr = ptr_at_xy<const uint8_t>(ctx,
dx,dy);
2678 auto scales = load<U8>(ptr);
2687 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx,dy);
2690 from_565(load<U16>(ptr), &cr, &cg, &cb);
2701 auto mptr = ptr_at_xy<const uint8_t>(&ctx->
mul,
dx,dy),
2702 aptr = ptr_at_xy<const uint8_t>(&ctx->
add,
dx,dy);
2707 r =
mad(r, mul, add);
2708 g =
mad(g, mul, add);
2709 b =
mad(
b, mul, add);
2722 return sk_bit_cast<F>(
bits ^ *
sign);
2726 return sk_bit_cast<F>(
sign | sk_bit_cast<U32>(
x));
2730 auto fn = [&](
F v) {
2744 auto fn = [&](
F v) {
2755 auto fn = [&](
F v) {
2771 auto fn = [&](
F v) {
2775 const float R = ctx->
a,
G = ctx->
b,
2776 a = ctx->
c,
b = ctx->
d, c = ctx->
e,
2790 auto fn = [&](
F v) {
2794 const float R = ctx->
a,
G = ctx->
b,
2795 a = ctx->
c,
b = ctx->
d, c = ctx->
e,
2810 auto ptr = ptr_at_xy<const uint8_t>(ctx,
dx,dy);
2816 auto ptr = ptr_at_xy<const uint8_t>(ctx,
dx,dy);
2828 auto ptr = ptr_at_xy<uint8_t>(ctx,
dx,dy);
2834 auto ptr = ptr_at_xy<uint8_t>(ctx,
dx,dy);
2841 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx,dy);
2847 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx,dy);
2853 const uint16_t* ptr;
2859 auto ptr = ptr_at_xy<uint16_t>(ctx,
dx,dy);
2868 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx,dy);
2872 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx,dy);
2876 const uint16_t* ptr;
2881 auto ptr = ptr_at_xy<uint16_t>(ctx,
dx,dy);
2890 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx,dy);
2894 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx,dy);
2898 const uint32_t* ptr;
2903 auto ptr = ptr_at_xy<uint32_t>(ctx,
dx,dy);
2913 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx, dy);
2914 from_88(load<U16>(ptr), &r, &g);
2919 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx, dy);
2925 const uint16_t* ptr;
2932 auto ptr = ptr_at_xy<uint16_t>(ctx,
dx, dy);
2938 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx,dy);
2943 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx, dy);
2948 const uint16_t* ptr;
2954 auto ptr = ptr_at_xy<uint16_t>(ctx,
dx,dy);
2961 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx, dy);
2967 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx, dy);
2973 const uint32_t* ptr;
2980 auto ptr = ptr_at_xy<uint32_t>(ctx,
dx,dy);
2988 auto ptr = ptr_at_xy<const uint64_t>(ctx,
dx, dy);
2992 auto ptr = ptr_at_xy<const uint64_t>(ctx,
dx, dy);
2996 const uint64_t* ptr;
3001 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*
dx,4*dy);
3012 auto ptr = ptr_at_xy<const uint64_t>(ctx,
dx, dy);
3016 auto ptr = ptr_at_xy<const uint64_t>(ctx,
dx, dy);
3020 const uint64_t* ptr;
3025 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*
dx,4*dy);
3037 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx,dy);
3041 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx,dy);
3045 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx,dy);
3049 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx,dy);
3053 const uint32_t* ptr;
3058 const uint32_t* ptr;
3063 const uint64_t* ptr;
3068 auto ptr = ptr_at_xy<const uint64_t>(ctx,
dx, dy);
3072 auto ptr = ptr_at_xy<const uint64_t>(ctx,
dx, dy);
3076 static constexpr float min = -0.752941f;
3077 static constexpr float max = 1.25098f;
3078 static constexpr float range =
max -
min;
3079 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*
dx,4*dy);
3089 auto ptr = ptr_at_xy<uint32_t>(ctx,
dx,dy);
3098 auto ptr = ptr_at_xy<uint32_t>(ctx,
dx,dy);
3099 static constexpr float min = -0.752941f;
3100 static constexpr float max = 1.25098f;
3101 static constexpr float range =
max -
min;
3110 auto ptr = ptr_at_xy<const uint64_t>(ctx,
dx,dy);
3113 load4((
const uint16_t*)ptr, &
R,&
G,&
B,&
A);
3120 auto ptr = ptr_at_xy<const uint64_t>(ctx,
dx,dy);
3123 load4((
const uint16_t*)ptr, &
R,&
G,&
B,&
A);
3130 const uint64_t* ptr;
3132 auto px =
gather(ptr, ix);
3135 load4((
const uint16_t*)&px, &
R,&
G,&
B,&
A);
3142 auto ptr = ptr_at_xy<uint64_t>(ctx,
dx,dy);
3150 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx,dy);
3152 U16 A = load<U16>((
const uint16_t*)ptr);
3159 auto ptr = ptr_at_xy<const uint16_t>(ctx,
dx, dy);
3161 U16 A = load<U16>((
const uint16_t*)ptr);
3166 const uint16_t* ptr;
3172 auto ptr = ptr_at_xy<uint16_t>(ctx,
dx,dy);
3177 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx, dy);
3180 load2((
const uint16_t*)ptr, &
R, &
G);
3187 auto ptr = ptr_at_xy<const uint32_t>(ctx,
dx, dy);
3190 load2((
const uint16_t*)ptr, &
R, &
G);
3197 const uint32_t* ptr;
3199 auto px =
gather(ptr, ix);
3202 load2((
const uint16_t*)&px, &
R, &
G);
3209 auto ptr = ptr_at_xy<uint32_t>(ctx,
dx, dy);
3215 auto ptr = ptr_at_xy<const float>(ctx, 4*
dx,4*dy);
3219 auto ptr = ptr_at_xy<const float>(ctx, 4*
dx,4*dy);
3225 r =
gather(ptr, 4*ix + 0);
3226 g =
gather(ptr, 4*ix + 1);
3231 auto ptr = ptr_at_xy<float>(ctx, 4*
dx,4*dy);
3239 auto limit = ctx->
scale;
3243 auto u = v -
floor_(v*invLimit*0.5f)*2*limit;
3248 auto m = u - 2*
s*(u - limit);
3253 return sk_bit_cast<F>(sk_bit_cast<U32>(
m) + ctx->
mirrorBiasDir*biasInUlps);
3281 auto cond = ((0 < r) & (r <
w)) | (r ==
e);
3287 auto cond = ((0 < g) & (g <
h)) | (g ==
e);
3295 auto cond = (((0 < r) & (r <
w)) | (r == ex))
3296 & (((0 < g) & (g <
h)) | (g == ey));
3300 auto mask = sk_unaligned_load<U32>(ctx->
mask);
3301 r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask);
3302 g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask);
3303 b = sk_bit_cast<F>(sk_bit_cast<U32>(
b) & mask);
3304 a = sk_bit_cast<F>(sk_bit_cast<U32>(
a) & mask);
3325 a = r*0.2126f + g*0.7152f +
b*0.0722f;
3329 r = g =
b = r*0.2126f + g*0.7152f +
b*0.0722f;
3336STAGE(matrix_scale_translate,
const float*
m) {
3337 r =
mad(r,
m[0],
m[2]);
3338 g =
mad(g,
m[1],
m[3]);
3391 F* r,
F* g,
F*
b,
F*
a) {
3392 F fr, br, fg, bg, fb, bb, fa, ba;
3393#if defined(JUMPER_IS_HSW)
3395 fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
fs[0]), (__m256i)idx);
3396 br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
bs[0]), (__m256i)idx);
3397 fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
fs[1]), (__m256i)idx);
3398 bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
bs[1]), (__m256i)idx);
3399 fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
fs[2]), (__m256i)idx);
3400 bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
bs[2]), (__m256i)idx);
3401 fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
fs[3]), (__m256i)idx);
3402 ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->
bs[3]), (__m256i)idx);
3404#elif defined(JUMPER_IS_LASX)
3406 fr = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
fs[0], 0), idx);
3407 br = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
bs[0], 0), idx);
3408 fg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
fs[1], 0), idx);
3409 bg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
bs[1], 0), idx);
3410 fb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
fs[2], 0), idx);
3411 bb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
bs[2], 0), idx);
3412 fa = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
fs[3], 0), idx);
3413 ba = (__m256)__lasx_xvperm_w(__lasx_xvld(c->
bs[3], 0), idx);
3415#elif defined(JUMPER_IS_LSX)
3417 __m128i zero = __lsx_vldi(0);
3418 fr = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
fs[0], 0));
3419 br = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
bs[0], 0));
3420 fg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
fs[1], 0));
3421 bg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
bs[1], 0));
3422 fb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
fs[2], 0));
3423 bb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
bs[2], 0));
3424 fa = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
fs[3], 0));
3425 ba = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->
bs[3], 0));
3439 *r =
mad(t, fr, br);
3440 *g =
mad(t, fg, bg);
3441 *
b =
mad(t, fb, bb);
3442 *
a =
mad(t, fa, ba);
3465 r =
mad(t, c->
f[0], c->
b[0]);
3466 g =
mad(t, c->
f[1], c->
b[1]);
3467 b =
mad(t, c->
f[2], c->
b[2]);
3468 a =
mad(t, c->
f[3], c->
b[3]);
3477 F slope =
min(xabs, yabs)/
max(xabs, yabs);
3478 F s = slope * slope;
3485 * (0.15912117063999176025390625f +
s
3486 * (-5.185396969318389892578125e-2f +
s
3487 * (2.476101927459239959716796875e-2f +
s
3488 * (-7.0547382347285747528076171875e-3f))));
3508 F x = r,
y = g, &t = r;
3513 F x = r,
y = g, &t = r;
3518 F x = r,
y = g, &t = r;
3523 F x = r,
y = g, &t = r;
3528 F x = r,
y = g, &t = r;
3556STAGE(apply_vector_mask,
const uint32_t* ctx) {
3557 const U32 mask = sk_unaligned_load<U32>(ctx);
3558 r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask);
3559 g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask);
3560 b = sk_bit_cast<F>(sk_bit_cast<U32>(
b) & mask);
3561 a = sk_bit_cast<F>(sk_bit_cast<U32>(
a) & mask);
3569 fy =
fract(*g + 0.5f);
3582 * sk_unaligned_load<F>(c->
scaley);
3594template <
int kScale>
3596 *
x = sk_unaligned_load<F>(ctx->
x) + (
kScale * 0.5f);
3597 F fx = sk_unaligned_load<F>(ctx->
fx);
3600 if (
kScale == -1) { scalex = 1.0f - fx; }
3601 if (
kScale == +1) { scalex = fx; }
3604template <
int kScale>
3606 *
y = sk_unaligned_load<F>(ctx->
y) + (
kScale * 0.5f);
3607 F fy = sk_unaligned_load<F>(ctx->
fy);
3610 if (
kScale == -1) { scaley = 1.0f - fy; }
3611 if (
kScale == +1) { scaley = fy; }
3637template <
int kScale>
3639 *
x = sk_unaligned_load<F>(ctx->
x) + (
kScale * 0.5f);
3642 if (
kScale == -3) { scalex = sk_unaligned_load<F>(ctx->
wx[0]); }
3643 if (
kScale == -1) { scalex = sk_unaligned_load<F>(ctx->
wx[1]); }
3644 if (
kScale == +1) { scalex = sk_unaligned_load<F>(ctx->
wx[2]); }
3645 if (
kScale == +3) { scalex = sk_unaligned_load<F>(ctx->
wx[3]); }
3648template <
int kScale>
3650 *
y = sk_unaligned_load<F>(ctx->
y) + (
kScale * 0.5f);
3653 if (
kScale == -3) { scaley = sk_unaligned_load<F>(ctx->
wy[0]); }
3654 if (
kScale == -1) { scaley = sk_unaligned_load<F>(ctx->
wy[1]); }
3655 if (
kScale == +1) { scaley = sk_unaligned_load<F>(ctx->
wy[2]); }
3656 if (
kScale == +3) { scaley = sk_unaligned_load<F>(ctx->
wy[3]); }
3665 F fx = sk_unaligned_load<F>(ctx->
fx);
3671 F fy = sk_unaligned_load<F>(ctx->
fy);
3693#ifdef SK_CPU_BENDIAN
3694 U32 sampleLo = sample >> 16;
3695 U32 sampleHi = sample & 0xFFFF;
3697 U32 sampleLo = sample & 0xFFFF;
3698 U32 sampleHi = sample >> 16;
3702 F vecX =
mad(
cast(sampleLo), 2.0f / 65535.0f, -1.0f);
3703 F vecY =
mad(
cast(sampleHi), 2.0f / 65535.0f, -1.0f);
3718 for (
int octave = 0; octave < ctx->
numOctaves; ++octave) {
3720 F floorValX =
floor_(noiseVecX);
3721 F floorValY =
floor_(noiseVecY);
3722 F ceilValX = floorValX + 1.0f;
3723 F ceilValY = floorValY + 1.0f;
3724 F fractValX = noiseVecX - floorValX;
3725 F fractValY = noiseVecY - floorValY;
3729 floorValX -= sk_bit_cast<F>(
cond_to_mask(floorValX >= stitchDataX) &
3730 sk_bit_cast<I32>(stitchDataX));
3731 floorValY -= sk_bit_cast<F>(
cond_to_mask(floorValY >= stitchDataY) &
3732 sk_bit_cast<I32>(stitchDataY));
3733 ceilValX -= sk_bit_cast<F>(
cond_to_mask(ceilValX >= stitchDataX) &
3734 sk_bit_cast<I32>(stitchDataX));
3735 ceilValY -= sk_bit_cast<F>(
cond_to_mask(ceilValY >= stitchDataY) &
3736 sk_bit_cast<I32>(stitchDataY));
3741 latticeLookup = (
U32)(
iround(ceilValX)) & 0xFF;
3744 U32 b00 = (
U32)(
iround(latticeIdxX + floorValY)) & 0xFF;
3745 U32 b10 = (
U32)(
iround(latticeIdxY + floorValY)) & 0xFF;
3746 U32 b01 = (
U32)(
iround(latticeIdxX + ceilValY)) & 0xFF;
3747 U32 b11 = (
U32)(
iround(latticeIdxY + ceilValY)) & 0xFF;
3751 F smoothX = fractValX * fractValX * (3.0f - 2.0f * fractValX);
3752 F smoothY = fractValY * fractValY * (3.0f - 2.0f * fractValY);
3755 const uint32_t* channelNoiseData =
reinterpret_cast<const uint32_t*
>(ctx->
noiseData);
3756 for (
int channel = 0; channel < 4; ++channel) {
3757 U32 sample00 =
gather(channelNoiseData, b00);
3758 U32 sample10 =
gather(channelNoiseData, b10);
3759 U32 sample01 =
gather(channelNoiseData, b01);
3760 U32 sample11 =
gather(channelNoiseData, b11);
3761 channelNoiseData += 256;
3765 F A =
lerp(u, v, smoothX);
3769 F B =
lerp(u, v, smoothX);
3790 stitchDataX *= 2.0f;
3791 stitchDataY *= 2.0f;
3797 r =
mad(r, 0.5f, 0.5f);
3798 g =
mad(g, 0.5f, 0.5f);
3799 b =
mad(
b, 0.5f, 0.5f);
3800 a =
mad(
a, 0.5f, 0.5f);
3820 r = sk_unaligned_load<F>(ctx->
x) * ctx->
scaleX;
3821 g = sk_unaligned_load<F>(ctx->
y) * ctx->
scaleY;
3847#define execution_mask() sk_bit_cast<I32>(a)
3848#define update_execution_mask() a = sk_bit_cast<F>(sk_bit_cast<I32>(r) & \
3849 sk_bit_cast<I32>(g) & \
3850 sk_bit_cast<I32>(b))
3853 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
3857 r = g =
b =
a = sk_bit_cast<F>(mask);
3863 static constexpr float iota[] = {
3864 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
3865 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
3877 F temp[4] = {r, g,
b,
a};
3889 r = sk_unaligned_load<F>(ctx);
3899 r = sk_bit_cast<F>(ptr[0] & ptr[1]);
3905 r = sk_bit_cast<F>(ptr[0] & ~ptr[1]);
3910 g = sk_unaligned_load<F>(ctx);
3927 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) | ptr[0]);
3934 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) & ptr[0]);
3951 I32* actualValue = (
I32*)(
base + ctx.offset);
3955 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) | caseMatches);
3959 I32* defaultMask = actualValue + 1;
3960 *defaultMask &= ~caseMatches;
3964 b = sk_unaligned_load<F>(ctx);
3980 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
4034 if (
any(*traceMask)) {
4043 for (
size_t lane = 0; lane <
N; ++lane) {
4050 indirectOffset = std::min<uint32_t>(indirectOffset, ctx->
indirectLimit);
4051 data += indirectOffset;
4052 slotIdx += indirectOffset;
4054 while (numSlots--) {
4066 const int*
src = ctx->
src;
4071 const int*
src = ctx->
src;
4077 const int*
src = ctx->
src;
4084 const int*
src = ctx->
src;
4117template <int NumSlots>
4122    memcpy(dst, src, sizeof(F) * NumSlots);
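// The memcpy above copies NumSlots contiguous vector-width slots in one shot. A scalar
// sketch of the same idea, assuming each slot holds one value per lane (names illustrative):
//
//     void copy_slots(float* dst, const float* src, int numSlots, int lanes) {
//         memcpy(dst, src, sizeof(float) * numSlots * lanes);
//     }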
4126    copy_n_slots_unmasked_fn<1>(packed, base);
4129    copy_n_slots_unmasked_fn<2>(packed, base);
4132    copy_n_slots_unmasked_fn<3>(packed, base);
4135    copy_n_slots_unmasked_fn<4>(packed, base);
4138template <int NumSlots>
4143    float* src = (float*)(base + ctx.src);
4145    SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
4150    SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
4156    copy_n_immutable_unmasked_fn<1>(packed, base);
4159    copy_n_immutable_unmasked_fn<2>(packed, base);
4162    copy_n_immutable_unmasked_fn<3>(packed, base);
4165    copy_n_immutable_unmasked_fn<4>(packed, base);
4168template <int NumSlots>
4193template <int LoopCount, typename OffsetType>
4206        case 16: dst[15] = scratch[15]; [[fallthrough]];
4207        case 15: dst[14] = scratch[14]; [[fallthrough]];
4208        case 14: dst[13] = scratch[13]; [[fallthrough]];
4209        case 13: dst[12] = scratch[12]; [[fallthrough]];
4210        case 12: dst[11] = scratch[11]; [[fallthrough]];
4211        case 11: dst[10] = scratch[10]; [[fallthrough]];
4212        case 10: dst[ 9] = scratch[ 9]; [[fallthrough]];
4213        case  9: dst[ 8] = scratch[ 8]; [[fallthrough]];
4214        case  8: dst[ 7] = scratch[ 7]; [[fallthrough]];
4215        case  7: dst[ 6] = scratch[ 6]; [[fallthrough]];
4216        case  6: dst[ 5] = scratch[ 5]; [[fallthrough]];
4217        case  5: dst[ 4] = scratch[ 4]; [[fallthrough]];
4218        case  4: dst[ 3] = scratch[ 3]; [[fallthrough]];
4219        case  3: dst[ 2] = scratch[ 2]; [[fallthrough]];
4220        case  2: dst[ 1] = scratch[ 1]; [[fallthrough]];
4221        case  1: dst[ 0] = scratch[ 0];
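            // The descending cases above form a jump-into-the-middle copy: entering at
            // `case n` copies exactly n slots, with each case falling through to the next.
            // Behaviorally it matches a simple bounded loop (sketch, illustrative only):
            //
            //     for (int i = numSlots; i-- > 0;) { dst[i] = scratch[i]; }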
4228    shuffle_fn<N>(base + ctx.dst, ctx.offsets, N);
4232    small_swizzle_fn<1>(packed, base);
4235    small_swizzle_fn<2>(packed, base);
4238    small_swizzle_fn<3>(packed, base);
4241    small_swizzle_fn<4>(packed, base);
4247template <int NumSlots>
4249    std::byte* dstB = (std::byte*)dst;
4280 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4282 offsets += sk_unaligned_load<U32>(iota);
4285    const int* src = ctx->src;
4292    } while (dst != end);
4301    const int* src = ctx->src;
4320 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4322 offsets += sk_unaligned_load<U32>(iota);
4345 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4347 offsets += sk_unaligned_load<U32>(iota);
4352    std::byte* dstB = (std::byte*)ctx->dst;
4353    const uint16_t* swizzle = ctx->offsets;
4356    int* dst = (int*)(dstB + *swizzle);
4366template <typename T, void (*ApplyFn)(T*)>
4374#if defined(JUMPER_IS_SCALAR)
4375template <typename T>
4377    *dst = sk_bit_cast<T>((F)*dst);
4386template <typename T>
4388    *dst = sk_bit_cast<T>(__builtin_convertvector(*dst, F));
4391    *dst = sk_bit_cast<F>(__builtin_convertvector(*dst, I32));
4394    *dst = sk_bit_cast<F>(__builtin_convertvector(*dst, U32));
4414#define DECLARE_UNARY_FLOAT(name) \
4415 STAGE_TAIL(name##_float, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 1); } \
4416 STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 2); } \
4417 STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 3); } \
4418 STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 4); }
4420#define DECLARE_UNARY_INT(name) \
4421 STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 1); } \
4422 STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 2); } \
4423 STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 3); } \
4424 STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 4); }
4426#define DECLARE_UNARY_UINT(name) \
4427 STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 1); } \
4428 STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 2); } \
4429 STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 3); } \
4430 STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 4); }
4440#undef DECLARE_UNARY_FLOAT
4441#undef DECLARE_UNARY_INT
4442#undef DECLARE_UNARY_UINT
4459      a10 = dst[2], a11 = dst[3];
4460    F det = nmad(a01, a10, a00 * a11),
4462 dst[0] = invdet * a11;
4463 dst[1] = -invdet * a01;
4464 dst[2] = -invdet * a10;
4465 dst[3] = invdet * a00;
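    // This is the standard closed form for a 2x2 inverse: with M = [[a00, a01], [a10, a11]],
    // inverse(M) = (1/det) * [[ a11, -a01], [-a10, a00]] where det = a00*a11 - a01*a10.
    // Note that nmad(x, y, z) evaluates z - x*y here, so the det line above is exactly that
    // determinant and invdet is its reciprocal.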
4470      a10 = dst[3], a11 = dst[4], a12 = dst[5],
4471      a20 = dst[6], a21 = dst[7], a22 = dst[8];
4472    F b01 = nmad(a12, a21, a22 * a11),
4473      b11 = nmad(a22, a10, a12 * a20),
4474      b21 = nmad(a11, a20, a21 * a10);
4475    F det = mad(a00, b01, mad(a01, b11, a02 * b21)),
4477 dst[0] = invdet * b01;
4478    dst[1] = invdet * nmad(a22, a01, a02 * a21);
4479    dst[2] = invdet * nmad(a02, a11, a12 * a01);
4480 dst[3] = invdet * b11;
4481    dst[4] = invdet * nmad(a02, a20, a22 * a00);
4482    dst[5] = invdet * nmad(a12, a00, a02 * a10);
4483 dst[6] = invdet * b21;
4484    dst[7] = invdet * nmad(a21, a00, a01 * a20);
4485    dst[8] = invdet * nmad(a01, a10, a11 * a00);
4490      a10 = dst[4],  a11 = dst[5],  a12 = dst[6],  a13 = dst[7],
4491      a20 = dst[8],  a21 = dst[9],  a22 = dst[10], a23 = dst[11],
4492      a30 = dst[12], a31 = dst[13], a32 = dst[14], a33 = dst[15];
4493    F b00 = nmad(a01, a10, a00 * a11),
4494      b01 = nmad(a02, a10, a00 * a12),
4495      b02 = nmad(a03, a10, a00 * a13),
4496      b03 = nmad(a02, a11, a01 * a12),
4497      b04 = nmad(a03, a11, a01 * a13),
4498      b05 = nmad(a03, a12, a02 * a13),
4499      b06 = nmad(a21, a30, a20 * a31),
4500      b07 = nmad(a22, a30, a20 * a32),
4501      b08 = nmad(a23, a30, a20 * a33),
4502      b09 = nmad(a22, a31, a21 * a32),
4503      b10 = nmad(a23, a31, a21 * a33),
4504      b11 = nmad(a23, a32, a22 * a33),
4505      det = mad(b00, b11, b05 * b06) + mad(b02, b09, b03 * b08) - mad(b01, b10, b04 * b07),
4519    dst[0]  = mad(a13, b09, nmad(a12, b10, a11*b11));
4521    dst[2]  = mad(a33, b03, nmad(a32, b04, a31*b05));
4524    dst[5]  = mad(a03, b07, nmad(a02, b08, a00*b11));
4526    dst[7]  = mad(a23, b01, nmad(a22, b02, a20*b05));
4527    dst[8]  = mad(a13, b06, nmad(a11, b08, a10*b10));
4529    dst[10] = mad(a33, b00, nmad(a31, b02, a30*b04));
4530    dst[11] = nmad(a23, b00, nmad(a20, b04, a21*b02));
4531    dst[12] = nmad(a12, b06, nmad(a10, b09, a11*b07));
4532    dst[13] = mad(a02, b06, nmad(a01, b07, a00*b09));
4533    dst[14] = nmad(a32, b00, nmad(a30, b03, a31*b01));
4534    dst[15] = mad(a22, b00, nmad(a21, b01, a20*b03));
4538template <typename T, void (*ApplyFn)(T*, T*)>
4548template <typename T, void (*ApplyFn)(T*, T*)>
4551    std::byte* dst = base + ctx.dst;
4552    std::byte* src = base + ctx.src;
4553    apply_adjacent_binary<T, ApplyFn>((T*)dst, (T*)src);
4556template <int N, typename V, typename S, void (*ApplyFn)(V*, V*)>
4560    S scalar = sk_bit_cast<S>(ctx.value);
4561    V src = scalar - V();
4562    SK_UNROLL for (int index = 0; index < N; ++index) {
4568template <typename T>
4573template <typename T>
4578template <typename T>
4583template <typename T>
4586    if constexpr (!std::is_same_v<T, F>) {
4605template <typename T>
4610template <typename T>
4615template <typename T>
4617    static_assert(sizeof(T) == sizeof(I32));
4622template <typename T>
4624    static_assert(sizeof(T) == sizeof(I32));
4629template <typename T>
4631    static_assert(sizeof(T) == sizeof(I32));
4636template <typename T>
4638    static_assert(sizeof(T) == sizeof(I32));
4655#define DECLARE_N_WAY_BINARY_FLOAT(name) \
4656 STAGE_TAIL(name##_n_floats, SkRasterPipeline_BinaryOpCtx* packed) { \
4657 apply_adjacent_binary_packed<F, &name##_fn>(packed, base); \
4660#define DECLARE_BINARY_FLOAT(name) \
4661 STAGE_TAIL(name##_float, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 1); } \
4662 STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 2); } \
4663 STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 3); } \
4664 STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 4); } \
4665 DECLARE_N_WAY_BINARY_FLOAT(name)
4667#define DECLARE_N_WAY_BINARY_INT(name) \
4668 STAGE_TAIL(name##_n_ints, SkRasterPipeline_BinaryOpCtx* packed) { \
4669 apply_adjacent_binary_packed<I32, &name##_fn>(packed, base); \
4672#define DECLARE_BINARY_INT(name) \
4673 STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 1); } \
4674 STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 2); } \
4675 STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 3); } \
4676 STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 4); } \
4677 DECLARE_N_WAY_BINARY_INT(name)
4679#define DECLARE_N_WAY_BINARY_UINT(name) \
4680 STAGE_TAIL(name##_n_uints, SkRasterPipeline_BinaryOpCtx* packed) { \
4681 apply_adjacent_binary_packed<U32, &name##_fn>(packed, base); \
4684#define DECLARE_BINARY_UINT(name) \
4685 STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 1); } \
4686 STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 2); } \
4687 STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 3); } \
4688 STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 4); } \
4689 DECLARE_N_WAY_BINARY_UINT(name)
4714#define DECLARE_IMM_BINARY_FLOAT(name) \
4715 STAGE_TAIL(name##_imm_float, SkRasterPipeline_ConstantCtx* packed) { \
4716 apply_binary_immediate<1, F, float, &name##_fn>(packed, base); \
4718#define DECLARE_IMM_BINARY_INT(name) \
4719 STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) { \
4720 apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
4722#define DECLARE_MULTI_IMM_BINARY_INT(name) \
4723 STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) { \
4724 apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
4726 STAGE_TAIL(name##_imm_2_ints, SkRasterPipeline_ConstantCtx* packed) { \
4727 apply_binary_immediate<2, I32, int32_t, &name##_fn>(packed, base); \
4729 STAGE_TAIL(name##_imm_3_ints, SkRasterPipeline_ConstantCtx* packed) { \
4730 apply_binary_immediate<3, I32, int32_t, &name##_fn>(packed, base); \
4732 STAGE_TAIL(name##_imm_4_ints, SkRasterPipeline_ConstantCtx* packed) { \
4733 apply_binary_immediate<4, I32, int32_t, &name##_fn>(packed, base); \
4735#define DECLARE_IMM_BINARY_UINT(name) \
4736 STAGE_TAIL(name##_imm_uint, SkRasterPipeline_ConstantCtx* packed) { \
4737 apply_binary_immediate<1, U32, uint32_t, &name##_fn>(packed, base); \
4751#undef DECLARE_MULTI_IMM_BINARY_INT
4752#undef DECLARE_IMM_BINARY_FLOAT
4753#undef DECLARE_IMM_BINARY_INT
4754#undef DECLARE_IMM_BINARY_UINT
4755#undef DECLARE_BINARY_FLOAT
4756#undef DECLARE_BINARY_INT
4757#undef DECLARE_BINARY_UINT
4758#undef DECLARE_N_WAY_BINARY_FLOAT
4759#undef DECLARE_N_WAY_BINARY_INT
4760#undef DECLARE_N_WAY_BINARY_UINT
4788 int outColumns = ctx.rightColumns,
4789 outRows = ctx.leftRows;
4796 SkASSERT(ctx.leftColumns == ctx.rightRows);
4799#if !defined(JUMPER_IS_SCALAR)
4808    F* resultMtx = (F*)(base + ctx.dst);
4809    F* leftMtx   = &resultMtx[ctx.rightColumns * ctx.leftRows];
4810    F* rightMtx  = &leftMtx[N * ctx.leftRows];
4813    for (int c = 0; c < outColumns; ++c) {
4814        for (int r = 0; r < outRows; ++r) {
4816            F* leftRow     = &leftMtx [r];
4817            F* rightColumn = &rightMtx[c * N];
4819            F element = *leftRow * *rightColumn;
4820            for (int idx = 1; idx < N; ++idx) {
4823                element = mad(*leftRow, *rightColumn, element);
4826 *resultMtx++ = element;
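            // Equivalent scalar formulation of the column-major product computed above
            // (sketch, illustrative only): each output element is a dot product,
            //
            //     result[c][r] = sum over k of left[k][r] * right[c][k];
            //
            // with the mad() in the inner loop accumulating one term of that sum per pass.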
4832    matrix_multiply<2>(packed, base);
4836    matrix_multiply<3>(packed, base);
4840    matrix_multiply<4>(packed, base);
4847    F *incident = dst + 0;
4848    F *normal   = dst + 4;
4851    F dotNI = mad(normal[0],  incident[0],
4852              mad(normal[1], incident[1],
4853              mad(normal[2], incident[2],
4854                  normal[3] * incident[3])));
4856 F k = 1.0 - eta * eta * (1.0 - dotNI * dotNI);
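    // This mirrors the GLSL refract() definition: with I the incident vector, N the normal,
    // and eta the ratio of indices of refraction,
    //     k = 1 - eta^2 * (1 - dot(N, I)^2)
    //     result = (k < 0) ? 0 : eta*I - (eta*dot(N, I) + sqrt(k)) * N
    // The loop below evaluates the second line component by component.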
4859    for (int idx = 0; idx < 4; ++idx) {
4861        eta * incident[idx] - (eta * dotNI + sqrt_k) * normal[idx],
4867template <typename T, void (*ApplyFn)(T*, T*, T*)>
4870#if !defined(JUMPER_IS_SCALAR)
4874    for (int index = 0; index < count; ++index) {
4875        ApplyFn(dst, src0, src1);
4882template <typename T, void (*ApplyFn)(T*, T*, T*)>
4885    std::byte* dst  = base + ctx.dst;
4886    std::byte* src0 = dst + ctx.delta;
4887    std::byte* src1 = src0 + ctx.delta;
4888    apply_adjacent_ternary<T, ApplyFn>((T*)dst, (T*)src0, (T*)src1);
4902    F t = clamp_01_((*x - *edge0) / (*edge1 - *edge0));
4903 *edge0 = t * t * (3.0 - 2.0 * t);
4906#define DECLARE_N_WAY_TERNARY_FLOAT(name) \
4907 STAGE_TAIL(name##_n_floats, SkRasterPipeline_TernaryOpCtx* packed) { \
4908 apply_adjacent_ternary_packed<F, &name##_fn>(packed, base); \
4911#define DECLARE_TERNARY_FLOAT(name) \
4912 STAGE_TAIL(name##_float, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+1, p+2); } \
4913 STAGE_TAIL(name##_2_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+2, p+4); } \
4914 STAGE_TAIL(name##_3_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+3, p+6); } \
4915 STAGE_TAIL(name##_4_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+4, p+8); } \
4916 DECLARE_N_WAY_TERNARY_FLOAT(name)
4918#define DECLARE_TERNARY_INT(name) \
4919 STAGE_TAIL(name##_int, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+1, p+2); } \
4920 STAGE_TAIL(name##_2_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+2, p+4); } \
4921 STAGE_TAIL(name##_3_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+3, p+6); } \
4922 STAGE_TAIL(name##_4_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+4, p+8); } \
4923 STAGE_TAIL(name##_n_ints, SkRasterPipeline_TernaryOpCtx* packed) { \
4924 apply_adjacent_ternary_packed<I32, &name##_fn>(packed, base); \
4931#undef DECLARE_N_WAY_TERNARY_FLOAT
4932#undef DECLARE_TERNARY_FLOAT
4933#undef DECLARE_TERNARY_INT
4940 const float c4 = -2.26661229133605957031f;
4941 const float c3 = 2.89795351028442382812f;
4942 const float c2 = 0.21345567703247070312f;
4943 const float c1 = 0.15489584207534790039f;
4944 const float c0 = 0.00030726194381713867f;
4960    fy = fract(cy + 0.5f);
4965    for (float py = -0.5f; py <= +0.5f; py += 1.0f)
4966    for (float px = -0.5f; px <= +0.5f; px += 1.0f) {
4972 const uint32_t* ptr;
4982 F sx = (px > 0) ? fx : 1.0f - fx,
4983 sy = (py > 0) ? fy : 1.0f - fy,
5002    fy = fract(cy + 0.5f);
5017    F sample_y = cy - 1.5f;
5018    for (int yy = 0; yy <= 3; ++yy) {
5019        F sample_x = cx - 1.5f;
5020        for (int xx = 0; xx <= 3; ++xx) {
5021 F scale = scalex[xx] * scaley[yy];
5024 const uint32_t* ptr;
5044    auto ir = r, ig = g, ib = b, ia = a;
5045    F* o[] = {&r, &g, &b, &a};
5047    memcpy(swiz, &ctx, sizeof(swiz));
5049    for (int i = 0; i < 4; ++i) {
5051        case 'r': *o[i] = ir; break;
5052        case 'g': *o[i] = ig; break;
5053        case 'b': *o[i] = ib; break;
5054        case 'a': *o[i] = ia; break;
5055        case '0': *o[i] = F0; break;
5056        case '1': *o[i] = F1; break;
5063#if defined(JUMPER_IS_SCALAR) || defined(SK_ENABLE_OPTIMIZE_SIZE) || \
5064 defined(SK_BUILD_FOR_GOOGLE3) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
5072 #define M(st) static void (*st)(void) = nullptr;
5079 uint8_t* tailPointer) {}
5083#if defined(JUMPER_IS_SKX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_LASX)
5084    template <typename T> using V = Vec<16, T>;
5086    template <typename T> using V = Vec<8, T>;
5098static constexpr size_t N = sizeof(U16) / sizeof(uint16_t);
5101#if defined(__clang__)
5102SI constexpr U16 U16_(uint16_t x) { return x; }
5105SI constexpr F   F_  (float x)    { return x; }
5107SI constexpr U16 U16_(uint16_t x) { return x + U16(); }
5110SI constexpr F   F_  (float x)    { return x - F(); }
5113static constexpr U16 U16_0 = U16_(0),
5114 U16_255 = U16_(255);
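// A note on the two sets of helpers above: with clang's ext_vector_type a scalar converts to
// a vector implicitly, so U16_/F_ can simply return x. The other path instead writes
// `x + U16()` / `x - F()`, which combines the scalar with a zero-initialized vector and so
// broadcasts it across every lane -- e.g. U16_(255) yields an all-255 vector on either path.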
5119#if JUMPER_NARROW_STAGES
5127                           size_t dx, size_t dy,
5133                           size_t xlimit, size_t ylimit,
5136                           uint8_t* tailPointer) {
5137    uint8_t unreferencedTail;
5139        tailPointer = &unreferencedTail;
5142    for (size_t dy = y0; dy < ylimit; dy++) {
5143    #if JUMPER_NARROW_STAGES
5144        Params params = { x0,dy, U16_0,U16_0,U16_0,U16_0 };
5149            *tailPointer = tail;
5153            *tailPointer = 0xFF;
5157        for (; dx + N <= xlimit; dx += N) {
5158            start(program, dx,dy, U16_0,U16_0,U16_0,U16_0, U16_0,U16_0,U16_0,U16_0);
5160        if (size_t tail = xlimit - dx) {
5161            *tailPointer = tail;
5163            start(program, dx,dy, U16_0,U16_0,U16_0,U16_0, U16_0,U16_0,U16_0,U16_0);
5165 *tailPointer = 0xFF;
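// Shape of the loop above: each row is processed in full N-pixel strides, then one partial
// run covers the remaining `xlimit - dx` pixels, with *tailPointer telling interested stages
// how many lanes are live (0xFF appears to be the "no partial tail" sentinel). A scalar
// sketch of the same row walk, under those assumptions (run_stages is illustrative only):
//
//     for (size_t x = x0; x < xlimit; x += N) {
//         size_t lanes = std::min(N, xlimit - x);   // N on full strides, the tail otherwise
//         run_stages(x, dy, lanes);
//     }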
5171#if JUMPER_NARROW_STAGES
5188#if JUMPER_NARROW_STAGES
5189 #define STAGE_GG(name, ARG) \
5190 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y); \
5191 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5192 U16 r, U16 g, U16 b, U16 a) { \
5193 auto x = join<F>(r,g), \
5195 name##_k(Ctx{program}, params->dx,params->dy, x,y); \
5198 auto fn = (Stage)(++program)->fn; \
5199 fn(params, program, r,g,b,a); \
5201 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y)
5203 #define STAGE_GP(name, ARG) \
5204 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5205 U16& r, U16& g, U16& b, U16& a, \
5206 U16& dr, U16& dg, U16& db, U16& da); \
5207 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5208 U16 r, U16 g, U16 b, U16 a) { \
5209 auto x = join<F>(r,g), \
5211 name##_k(Ctx{program}, params->dx,params->dy, x,y, r,g,b,a, \
5212 params->dr,params->dg,params->db,params->da); \
5213 auto fn = (Stage)(++program)->fn; \
5214 fn(params, program, r,g,b,a); \
5216 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5217 U16& r, U16& g, U16& b, U16& a, \
5218 U16& dr, U16& dg, U16& db, U16& da)
5220 #define STAGE_PP(name, ARG) \
5221 SI void name##_k(ARG, size_t dx, size_t dy, \
5222 U16& r, U16& g, U16& b, U16& a, \
5223 U16& dr, U16& dg, U16& db, U16& da); \
5224 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5225 U16 r, U16 g, U16 b, U16 a) { \
5226 name##_k(Ctx{program}, params->dx,params->dy, r,g,b,a, \
5227 params->dr,params->dg,params->db,params->da); \
5228 auto fn = (Stage)(++program)->fn; \
5229 fn(params, program, r,g,b,a); \
5231 SI void name##_k(ARG, size_t dx, size_t dy, \
5232 U16& r, U16& g, U16& b, U16& a, \
5233 U16& dr, U16& dg, U16& db, U16& da)
5235 #define STAGE_GG(name, ARG) \
5236 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y); \
5237 static void ABI name(SkRasterPipelineStage* program, \
5238 size_t dx, size_t dy, \
5239 U16 r, U16 g, U16 b, U16 a, \
5240 U16 dr, U16 dg, U16 db, U16 da) { \
5241 auto x = join<F>(r,g), \
5243 name##_k(Ctx{program}, dx,dy, x,y); \
5246 auto fn = (Stage)(++program)->fn; \
5247 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5249 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y)
5251 #define STAGE_GP(name, ARG) \
5252 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5253 U16& r, U16& g, U16& b, U16& a, \
5254 U16& dr, U16& dg, U16& db, U16& da); \
5255 static void ABI name(SkRasterPipelineStage* program, \
5256 size_t dx, size_t dy, \
5257 U16 r, U16 g, U16 b, U16 a, \
5258 U16 dr, U16 dg, U16 db, U16 da) { \
5259 auto x = join<F>(r,g), \
5261 name##_k(Ctx{program}, dx,dy, x,y, r,g,b,a, dr,dg,db,da); \
5262 auto fn = (Stage)(++program)->fn; \
5263 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5265 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5266 U16& r, U16& g, U16& b, U16& a, \
5267 U16& dr, U16& dg, U16& db, U16& da)
5269 #define STAGE_PP(name, ARG) \
5270 SI void name##_k(ARG, size_t dx, size_t dy, \
5271 U16& r, U16& g, U16& b, U16& a, \
5272 U16& dr, U16& dg, U16& db, U16& da); \
5273 static void ABI name(SkRasterPipelineStage* program, \
5274 size_t dx, size_t dy, \
5275 U16 r, U16 g, U16 b, U16 a, \
5276 U16 dr, U16 dg, U16 db, U16 da) { \
5277 name##_k(Ctx{program}, dx,dy, r,g,b,a, dr,dg,db,da); \
5278 auto fn = (Stage)(++program)->fn; \
5279 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5281 SI void name##_k(ARG, size_t dx, size_t dy, \
5282 U16& r, U16& g, U16& b, U16& a, \
5283 U16& dr, U16& dg, U16& db, U16& da)
5307#if defined(JUMPER_IS_NEON)
5310 return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
5321#if defined(JUMPER_IS_NEON)
5327 return (v+(v/256))/256;
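// Both paths compute (or approximate) round(v / 255) for 16-bit products of 8-bit values.
// The scalar fallback (v + v/256)/256 is the cheap approximation; an exact rounding division
// by 255 can be written as (sketch, illustrative only):
//
//     uint16_t div255_round(uint32_t v) { return (v + 128 + ((v + 128) >> 8)) >> 8; }
//
// which is what the NEON vrsra/vrshr pair further up evaluates in vector form.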
5334    return (t & sk_bit_cast<U16>(c)) | (e & sk_bit_cast<U16>(~c));
5337    return (t & sk_bit_cast<U32>(c)) | (e & sk_bit_cast<U32>(~c));
5348SI U16 from_float(float f) { return U16_(f * 255.0f + 0.5f); }
5352template <typename D, typename S>
5354    return __builtin_convertvector(src, D);
5357template <typename D, typename S>
5358SI void split(S v, D* lo, D* hi) {
5359    static_assert(2*sizeof(D) == sizeof(S), "");
5360    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
5361    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
5363template <typename D, typename S>
5365    static_assert(sizeof(D) == 2*sizeof(S), "");
5367    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
5368    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
5373    return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) );
5387    return (t & c) | (e & ~c);
5417#if defined(JUMPER_IS_SKX)
5418    F e = _mm512_rcp14_ps(x);
5419    return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
5420#elif defined(JUMPER_IS_HSW)
5424#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
5428#elif defined(JUMPER_IS_NEON)
5432#elif defined(JUMPER_IS_LASX)
5435 return join<F>(__lasx_xvfrecip_s(lo), __lasx_xvfrecip_s(hi));
5436#elif defined(JUMPER_IS_LSX)
5439 return join<F>(__lsx_vfrecip_s(lo), __lsx_vfrecip_s(hi));
5445#if defined(JUMPER_IS_SKX)
5446    return _mm512_sqrt_ps(x);
5447#elif defined(JUMPER_IS_HSW)
5450 return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
5451#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
5454 return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
5455#elif defined(SK_CPU_ARM64)
5458 return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
5459#elif defined(JUMPER_IS_NEON)
5460 auto sqrt = [](float32x4_t v) {
5461 auto est = vrsqrteq_f32(v);
5462 est *= vrsqrtsq_f32(v,est*est);
5463 est *= vrsqrtsq_f32(v,est*est);
5468    return join<F>(sqrt(lo), sqrt(hi));
5469#elif defined(JUMPER_IS_LASX)
5472 return join<F>(__lasx_xvfsqrt_s(lo), __lasx_xvfsqrt_s(hi));
5473#elif defined(JUMPER_IS_LSX)
5476 return join<F>(__lsx_vfsqrt_s(lo), __lsx_vfsqrt_s(hi));
5479    sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
5480    sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
5486#if defined(SK_CPU_ARM64)
5489 return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
5490#elif defined(JUMPER_IS_SKX)
5491    return _mm512_floor_ps(x);
5492#elif defined(JUMPER_IS_HSW)
5495 return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
5496#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
5499 return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
5500#elif defined(JUMPER_IS_LASX)
5503 return join<F>(__lasx_xvfrintrm_s(lo), __lasx_xvfrintrm_s(hi));
5504#elif defined(JUMPER_IS_LSX)
5507 return join<F>(__lsx_vfrintrm_s(lo), __lsx_vfrintrm_s(hi));
5509    F roundtrip = cast<F>(cast<I32>(x));
5520#if defined(JUMPER_IS_SKX)
5521    return (I16)_mm256_mulhrs_epi16((__m256i)a, (__m256i)b);
5522#elif defined(JUMPER_IS_HSW)
5523    return (I16)_mm256_mulhrs_epi16((__m256i)a, (__m256i)b);
5524#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
5525    return (I16)_mm_mulhrs_epi16((__m128i)a, (__m128i)b);
5526#elif defined(SK_CPU_ARM64)
5527    return vqrdmulhq_s16(a, b);
5528#elif defined(JUMPER_IS_NEON)
5529    return vqrdmulhq_s16(a, b);
5530#elif defined(JUMPER_IS_LASX)
5531    I16 res = __lasx_xvmuh_h(a, b);
5532 return __lasx_xvslli_h(res, 1);
5533#elif defined(JUMPER_IS_LSX)
5534    I16 res = __lsx_vmuh_h(a, b);
5535 return __lsx_vslli_h(res, 1);
5537    const I32 roundingTerm = I32_(1 << 14);
5538    return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15);
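// Every path above performs a Q15 fixed-point multiply with rounding: conceptually
// (a * b + 2^14) >> 15, which keeps the product of two 16-bit fixed-point values in the same
// range. The scalar fallback on the last two lines is the direct form of that expression;
// the SSE/NEON/LoongArch intrinsics are the vector equivalents (up to saturation behavior at
// the extreme inputs).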
5546 #if defined(SK_DEBUG)
5547    for (size_t i = 0; i < N; i++) {
5554        SkASSERT(-ib <= ia && ia <= 65535 - ib);
5557    return b + sk_bit_cast<U16>(a);
5561SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); }
5565STAGE_GG(seed_shader, NoCtx) {
5566    static constexpr float iota[] = {
5567        0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
5568        8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
5572    x = cast<F>(I32_(dx)) + sk_unaligned_load<F>(iota);
5573    y = cast<F>(I32_(dy)) + 0.5f;
5576STAGE_GG(matrix_translate, const float* m) {
5580STAGE_GG(matrix_scale_translate, const float* m) {
5584STAGE_GG(matrix_2x3, const float* m) {
5590STAGE_GG(matrix_perspective, const float* m) {
5600    r = U16_(c->rgba[0]);
5601    g = U16_(c->rgba[1]);
5602    b = U16_(c->rgba[2]);
5603    a = U16_(c->rgba[3]);
5611STAGE_PP(black_color, NoCtx) { r = g = b = U16_0;   a = U16_255; }
5612STAGE_PP(white_color, NoCtx) { r = g = b = U16_255; a = U16_255; }
5614STAGE_PP(set_rgb, const float rgb[3]) {
5615    r = from_float(rgb[0]);
5616    g = from_float(rgb[1]);
5617    b = from_float(rgb[2]);
5621STAGE_PP(clamp_01, NoCtx) {
5628STAGE_PP(clamp_gamut, NoCtx) {
5636    r = div255_accurate(r * a);
5637    g = div255_accurate(g * a);
5638    b = div255_accurate(b * a);
5640STAGE_PP(premul_dst, NoCtx) {
5641    dr = div255_accurate(dr * da);
5642    dg = div255_accurate(dg * da);
5643    db = div255_accurate(db * da);
5646STAGE_PP(force_opaque    , NoCtx) {  a = U16_255; }
5647STAGE_PP(force_opaque_dst, NoCtx) { da = U16_255; }
5649STAGE_PP(swap_rb, NoCtx) {
5654STAGE_PP(swap_rb_dst, NoCtx) {
5660STAGE_PP(move_src_dst, NoCtx) {
5667STAGE_PP(move_dst_src, NoCtx) {
5674STAGE_PP(swap_src_dst, NoCtx) {
5684#define BLEND_MODE(name) \
5685 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
5686 STAGE_PP(name, NoCtx) { \
5687 r = name##_channel(r,dr,a,da); \
5688 g = name##_channel(g,dg,a,da); \
5689 b = name##_channel(b,db,a,da); \
5690 a = name##_channel(a,da,a,da); \
5692 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
5694#if defined(SK_USE_INACCURATE_DIV255_IN_BLEND)
5714    BLEND_MODE(dstin)    { return div255_accurate( d*sa ); }
5719    BLEND_MODE(modulate) { return div255_accurate( s*d ); }
5728#define BLEND_MODE(name) \
5729 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
5730 STAGE_PP(name, NoCtx) { \
5731 r = name##_channel(r,dr,a,da); \
5732 g = name##_channel(g,dg,a,da); \
5733 b = name##_channel(b,db,a,da); \
5734 a = a + div255( da*inv(a) ); \
5736 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
5755template <typename T>
5760template <typename T>
5763    const F w = F_(sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1)),
5764          h = F_(sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1));
5778template <typename T>
5793template <typename V, typename T>
5796    memcpy(&v, ptr, sizeof(v));
5799template <typename V, typename T>
5801    memcpy(ptr, &v, sizeof(v));
5804#if defined(JUMPER_IS_SKX)
5805    template <typename V, typename T>
5807 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5808 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5809 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5810 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5815 return _mm512_i32gather_ps((__m512i)ix, ptr, 4);
5820    return (U32)_mm512_i32gather_epi32((__m512i)ix, ptr, 4);
5823#elif defined(JUMPER_IS_HSW)
5824    template <typename V, typename T>
5826 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5827 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5828 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5829 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5835 split(ix, &lo, &hi);
5837 return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
5838 _mm256_i32gather_ps(ptr, hi, 4));
5844 split(ix, &lo, &hi);
5846    return join<U32>(_mm256_i32gather_epi32((const int*)ptr, lo, 4),
5847                     _mm256_i32gather_epi32((const int*)ptr, hi, 4));
5849#elif defined(JUMPER_IS_LASX)
5850    template <typename V, typename T>
5852 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5853 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5854 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5855 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5858    template <typename V, typename T>
5860 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5861 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
5869#if defined(JUMPER_IS_SKX)
5870    rgba = (U32)_mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), (__m512i)rgba);
5871    auto cast_U16 = [](U32 v) -> U16 {
5872        return (U16)_mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
5873                                        _mm512_extracti64x4_epi64((__m512i)v, 1));
5875#elif defined(JUMPER_IS_HSW)
5878    split(rgba, &_01, &_23);
5879 __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
5880 _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
5881 rgba = join<U32>(_02, _13);
5883    auto cast_U16 = [](U32 v) -> U16 {
5885        split(v, &_02,&_13);
5886        return (U16)_mm256_packus_epi32(_02,_13);
5888#elif defined(JUMPER_IS_LASX)
5890    split(rgba, &_01, &_23);
5891 __m256i _02 = __lasx_xvpermi_q(_01, _23, 0x02),
5892 _13 = __lasx_xvpermi_q(_01, _23, 0x13);
5893 rgba = join<U32>(_02, _13);
5895    auto cast_U16 = [](U32 v) -> U16 {
5897 split(v, &_02,&_13);
5898 __m256i tmp0 = __lasx_xvsat_wu(_02, 15);
5899 __m256i tmp1 = __lasx_xvsat_wu(_13, 15);
5900 return __lasx_xvpickev_h(tmp1, tmp0);
5903    auto cast_U16 = [](U32 v) -> U16 {
5904        return cast<U16>(v);
5907    *r = cast_U16(rgba & 65535) & 255;
5908    *g = cast_U16(rgba & 65535) >> 8;
5909    *b = cast_U16(rgba >> 16) & 255;
5910    *a = cast_U16(rgba >> 16) >> 8;
5914#if 1 && defined(JUMPER_IS_NEON)
5915    uint8x8x4_t rgba = vld4_u8((const uint8_t*)(ptr));
5916    *r = cast<U16>(rgba.val[0]);
5917    *g = cast<U16>(rgba.val[1]);
5918    *b = cast<U16>(rgba.val[2]);
5919    *a = cast<U16>(rgba.val[3]);
5930#if 1 && defined(JUMPER_IS_NEON)
5931    uint8x8x4_t rgba = {{
5937    vst4_u8((uint8_t*)(ptr), rgba);
5939    store(ptr, cast<U32>(r | (g<<8)) <<  0
5940             | cast<U32>(b | (a<<8)) << 16);
5945    load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), &r,&g,&b,&a);
5948    load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), &dr,&dg,&db,&da);
5951    store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), r,g,b,a);
5954 const uint32_t* ptr;
5963 U16 R = (rgb >> 11) & 31,
5964 G = (rgb >> 5) & 63,
5965 B = (rgb >> 0) & 31;
5968    *r = (R << 3) | (R >> 2);
5969    *g = (G << 2) | (G >> 4);
5970    *b = (B << 3) | (B >> 2);
5972SI void load_565_(const uint16_t* ptr, U16* r, U16* g, U16* b) {
5975SI void store_565_(uint16_t* ptr, U16 r, U16 g, U16 b) {
5983    U16 R = (r * 9 + 36) / 74,
5984        G = (g * 21 + 42) / 85,
5985        B = (b * 9 + 36) / 74;
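// Round-trip notes for the 565 path: the expansion above widens a 5-bit channel to 8 bits by
// bit replication, (x << 3) | (x >> 2), a close integer approximation of x * 255/31 (and
// (x << 2) | (x >> 4) likewise for the 6-bit green). Packing goes the other way with rounded
// fixed-point multiplies: (r * 9 + 36) / 74 approximates r * 31/255 with rounding, and
// (g * 21 + 42) / 85 approximates g * 63/255 -- e.g. r = 255 gives (2295 + 36) / 74 = 31.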
5993    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &r,&g,&b);
5997    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &dr,&dg,&db);
6001    store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), r,g,b);
6004 const uint16_t* ptr;
6006    from_565(gather<U16>(ptr, ix), &r, &g, &b);
6013        G = (rgba >>  8) & 15,
6014        B = (rgba >>  4) & 15,
6015        A = (rgba >>  0) & 15;
6033    U16 R = (r + 8) / 17,
6045    load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &r,&g,&b,&a);
6048    load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &dr,&dg,&db,&da);
6051    store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), r,g,b,a);
6054 const uint16_t* ptr;
6064SI void load_88_(const uint16_t* ptr, U16* r, U16* g) {
6065#if 1 && defined(JUMPER_IS_NEON)
6066    uint8x8x2_t rg = vld2_u8((const uint8_t*)(ptr));
6067 *r = cast<U16>(rg.val[0]);
6068 *g = cast<U16>(rg.val[1]);
6074SI void store_88_(uint16_t* ptr, U16 r, U16 g) {
6078#if 1 && defined(JUMPER_IS_NEON)
6083 vst2_u8((uint8_t*)(ptr), rg);
6085 store(ptr, cast<U16>(r | (g<<8)) << 0);
6090    load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), &r, &g);
6095    load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), &dr, &dg);
6100    store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), r, g);
6103 const uint16_t* ptr;
6105 from_88(gather<U16>(ptr, ix), &r, &g);
6112SI U16 load_8(const uint8_t* ptr) {
6113 return cast<U16>(load<U8>(ptr));
6115SI void store_8(uint8_t* ptr, U16 v) {
6117 store(ptr, cast<U8>(v));
6122    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6126    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6129    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), a);
6135    a = cast<U16>(gather<U8>(ptr, ix));
6138    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), r);
6141STAGE_PP(alpha_to_gray, NoCtx) {
6145STAGE_PP(alpha_to_gray_dst, NoCtx) {
6149STAGE_PP(alpha_to_red, NoCtx) {
6153STAGE_PP(alpha_to_red_dst, NoCtx) {
6158STAGE_PP(bt709_luminance_or_luma_to_alpha, NoCtx) {
6159    a = (r*54 + g*183 + b*19)/256;
6162STAGE_PP(bt709_luminance_or_luma_to_rgb, NoCtx) {
6163    r = g = b = (r*54 + g*183 + b*19)/256;
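// The integer weights above are 8-bit fixed-point approximations of the BT.709 luma
// coefficients: 54/256 = 0.211, 183/256 = 0.715, 19/256 = 0.074, versus the nominal
// 0.2126, 0.7152, 0.0722. So luma = (r*54 + g*183 + b*19) / 256 stays within a fraction of
// a percent of the exact weighting while using only 16-bit integer math.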
6168STAGE_PP(load_src, const uint16_t* ptr) {
6169    r = sk_unaligned_load<U16>(ptr + 0*N);
6170    g = sk_unaligned_load<U16>(ptr + 1*N);
6171    b = sk_unaligned_load<U16>(ptr + 2*N);
6172    a = sk_unaligned_load<U16>(ptr + 3*N);
6174STAGE_PP(store_src, uint16_t* ptr) {
6180STAGE_PP(store_src_a, uint16_t* ptr) {
6183STAGE_PP(load_dst, const uint16_t* ptr) {
6184    dr = sk_unaligned_load<U16>(ptr + 0*N);
6185    dg = sk_unaligned_load<U16>(ptr + 1*N);
6186    db = sk_unaligned_load<U16>(ptr + 2*N);
6187    da = sk_unaligned_load<U16>(ptr + 3*N);
6189STAGE_PP(store_dst, uint16_t* ptr) {
6198STAGE_PP(scale_1_float, const float* f) {
6199    U16 c = from_float(*f);
6205STAGE_PP(lerp_1_float, const float* f) {
6206    U16 c = from_float(*f);
6212STAGE_PP(scale_native, const uint16_t scales[]) {
6213    auto c = sk_unaligned_load<U16>(scales);
6220STAGE_PP(lerp_native, const uint16_t scales[]) {
6221    auto c = sk_unaligned_load<U16>(scales);
6229    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6236    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6250    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &cr,&cg,&cb);
6260    load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &cr,&cg,&cb);
6270    U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy)),
6271        add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy));
6287STAGE_GG(mirror_x_1, NoCtx) {
6288    auto two = [](F x){ return x+x; };
6292SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
6312    auto mask = sk_unaligned_load<U16>(ctx->mask);
6320    auto round_color = [](F x) { return cast<U16>(x * 255.0f + 0.5f); };
6322    *r = round_color(min(max(0, R), 1));
6323    *g = round_color(min(max(0, G), 1));
6324    *b = round_color(min(max(0, B), 1));
6325    *a = round_color(A);
6331 F fr, fg, fb, fa, br, bg, bb, ba;
6332#if defined(JUMPER_IS_HSW)
6335 split(idx, &lo, &hi);
6337    fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
6338                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
6339    br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
6340                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
6341    fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
6342                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
6343    bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
6344                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
6345    fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
6346                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
6347    bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
6348                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
6349    fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
6350                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
6351    ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
6352                 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
6354#elif defined(JUMPER_IS_LASX)
6357 split(idx, &lo, &hi);
6359    fr = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), lo),
6360                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), hi));
6361    br = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), lo),
6362                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), hi));
6363    fg = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), lo),
6364                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), hi));
6365    bg = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), lo),
6366                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), hi));
6367    fb = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), lo),
6368                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), hi));
6369    bb = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), lo),
6370                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), hi));
6371    fa = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), lo),
6372                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), hi));
6373    ba = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), lo),
6374                 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), hi));
6376#elif defined(JUMPER_IS_LSX)
6379 split(idx, &lo, &hi);
6380 __m128i zero = __lsx_vldi(0);
6381    fr = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[0], 0)),
6382                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[0], 0)));
6383    br = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[0], 0)),
6384                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[0], 0)));
6385    fg = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[1], 0)),
6386                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[1], 0)));
6387    bg = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[1], 0)),
6388                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[1], 0)));
6389    fb = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[2], 0)),
6390                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[2], 0)));
6391    bb = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[2], 0)),
6392                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[2], 0)));
6393    fa = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[3], 0)),
6394                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[3], 0)));
6395    ba = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[3], 0)),
6396                 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[3], 0)));
6400    fr = gather<F>(c->fs[0], idx);
6401    fg = gather<F>(c->fs[1], idx);
6402    fb = gather<F>(c->fs[2], idx);
6403    fa = gather<F>(c->fs[3], idx);
6404    br = gather<F>(c->bs[0], idx);
6405    bg = gather<F>(c->bs[1], idx);
6406    bb = gather<F>(c->bs[2], idx);
6407    ba = gather<F>(c->bs[3], idx);
6409    round_F_to_U16(mad(t, fr, br),
6436    round_F_to_U16(mad(t, c->f[0], c->b[0]),
6437                   mad(t, c->f[1], c->b[1]),
6438                   mad(t, c->f[2], c->b[2]),
6439                   mad(t, c->f[3], c->b[3]),
6446    I32 qx = cast<I32>(floor_(65536.0f * x + 0.5f)) - 32768,
6447        qy = cast<I32>(floor_(65536.0f * y + 0.5f)) - 32768;
6463 I16 tx = cast<I16>(qx ^ 0x8000),
6464 ty = cast<I16>(qy ^ 0x8000);
6497 const uint32_t* ptr;
6499 U16 leftR, leftG, leftB, leftA;
6500 from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
6503 U16 rightR, rightG, rightB, rightA;
6504 from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
6506 U16 topR = lerpX(leftR, rightR),
6507 topG = lerpX(leftG, rightG),
6508 topB = lerpX(leftB, rightB),
6509 topA = lerpX(leftA, rightA);
6512 from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
6515 from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
6517 U16 bottomR = lerpX(leftR, rightR),
6518 bottomG = lerpX(leftG, rightG),
6519 bottomB = lerpX(leftB, rightB),
6520 bottomA = lerpX(leftA, rightA);
6524    auto lerpY = [&](U16 top, U16 bottom) -> U16 {
6526 U16 middle = bottom + top;
6533 r = lerpY(topR, bottomR);
6534 g = lerpY(topG, bottomG);
6535 b = lerpY(topB, bottomB);
6536 a = lerpY(topA, bottomA);
6539STAGE_GG(xy_to_unit_angle, NoCtx) {
6543    F slope = min(xabs, yabs)/max(xabs, yabs);
6544    F s = slope * slope;
6551    *  (0.15912117063999176025390625f     + s
6552    * (-5.185396969318389892578125e-2f    + s
6553    * ( 2.476101927459239959716796875e-2f + s
6554    * (-7.0547382347285747528076171875e-3f))));
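    // The polynomial above approximates atan(slope) / (2*pi) for slope in [0, 1]; note the
    // leading coefficient 0.15912... is very nearly 1/(2*pi) = 0.159155. That maps the first
    // octant directly onto the [0, 1/8] unit-angle range, and the stage can reconstruct the
    // remaining octants from the x/y comparison and the signs of x and y.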
6562STAGE_GG(xy_to_radius, NoCtx) {
6569    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
6576    store_8888_(ptr, r,g,b,a);
6581STAGE_PP(swizzle, void* ctx) {
6582    auto ir = r, ig = g, ib = b, ia = a;
6583    U16* o[] = {&r, &g, &b, &a};
6585    memcpy(swiz, &ctx, sizeof(swiz));
6587    for (int i = 0; i < 4; ++i) {
6589        case 'r': *o[i] = ir;      break;
6590        case 'g': *o[i] = ig;      break;
6591        case 'b': *o[i] = ib;      break;
6592        case 'a': *o[i] = ia;      break;
6593        case '0': *o[i] = U16_0;   break;
6594        case '1': *o[i] = U16_255; break;
6604namespace lowp { static constexpr size_t lowp_N = N; }