21#if defined(__GNUC__) && !defined(__clang__)
23 static constexpr F F0 =
F() + 0.0f,
27 static constexpr F F0 = 0.0f,
36#if !defined(USING_AVX) && N == 8 && defined(__AVX__)
39#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
40 #define USING_AVX_F16C
42#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
45#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
51#if N > 1 && defined(__ARM_NEON)
55 #if defined(__clang__)
58 #define USING_NEON_F16C
60 #elif defined(__GNUC__)
63 #if defined(__ARM_FP16_FORMAT_IEEE)
64 #define USING_NEON_F16C
71#if defined(USING_NEON) && defined(__clang__)
72 #pragma clang diagnostic ignored "-Wvector-conversion"
78#if defined(__SSE__) && defined(__GNUC__)
79 #if !defined(__has_warning)
80 #pragma GCC diagnostic ignored "-Wpsabi"
81 #elif __has_warning("-Wpsabi")
82 #pragma GCC diagnostic ignored "-Wpsabi"
91#if defined(__clang__) || defined(__GNUC__)
92 #define SI static inline __attribute__((always_inline))
94 #define SI static inline
97template <
typename T,
typename P>
100 memcpy(&val, ptr,
sizeof(val));
103template <
typename T,
typename P>
105 memcpy(ptr, &val,
sizeof(val));
110template <
typename D,
typename S>
114#elif defined(__clang__)
115 return __builtin_convertvector(v,
D);
118 for (
int i = 0;
i <
N;
i++) {
125template <
typename D,
typename S>
127 static_assert(
sizeof(
D) ==
sizeof(v),
"");
 // Branchless-looking select for the scalar build: a plain ternary suffices
 // here because `cond` is an ordinary bool, not a per-lane mask.
 // NOTE(review): the vector variant (visible below as a bit_pun-based
 // bitwise blend) presumably replaces this when N > 1 — confirm against
 // the surrounding #if structure, which is cut off in this chunk.
 143 #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
145 template <
typename C,
typename T>
147 return bit_pun<T>( ( cond & bit_pun<C>(t)) |
148 (~cond & bit_pun<C>(
e)) );
154#if defined(USING_NEON_F16C)
155 return vcvt_f32_f16((float16x4_t)half);
156#elif defined(USING_AVX512F)
157 return (
F)_mm512_cvtph_ps((__m256i)half);
158#elif defined(USING_AVX_F16C)
160 return __builtin_ia32_vcvtph2ps256((
I16)half);
162 U32 wide = cast<U32>(half);
164 U32 s = wide & 0x8000,
168 F norm = bit_pun<F>( (
s<<16) + (em<<13) + ((127-15)<<23) );
175#if defined(__clang__)
181#if defined(USING_NEON_F16C)
182 return (
U16)vcvt_f16_f32(
f);
183#elif defined(USING_AVX512F)
184 return (
U16)_mm512_cvtps_ph((__m512 )
f, _MM_FROUND_CUR_DIRECTION );
185#elif defined(USING_AVX_F16C)
186 return (
U16)__builtin_ia32_vcvtps2ph256(
f, 0x04);
189 U32 sem = bit_pun<U32>(
f),
190 s = sem & 0x80000000,
195 , (
s>>16) + (em>>13) - ((127-15)<<10)));
200#if defined(USING_NEON)
202 return (
U16)vrev16_u8((uint8x8_t) v);
207 return (
rgba & 0x00ff00ff00ff00ff) << 8
208 | (
rgba & 0xff00ff00ff00ff00) >> 8;
211#if defined(USING_NEON)
212 SI F min_(
F x,
F y) {
return (
F)vminq_f32((float32x4_t)
x, (float32x4_t)
y); }
213 SI F max_(
F x,
F y) {
return (
F)vmaxq_f32((float32x4_t)
x, (float32x4_t)
y); }
222#elif defined(__aarch64__)
223 return vrndmq_f32(
x);
224#elif defined(USING_AVX512F)
229 return _mm512_mask_floor_ps(
x, (__mmask16)-1,
x);
230#elif defined(USING_AVX)
231 return __builtin_ia32_roundps256(
x, 0x01);
232#elif defined(__SSE4_1__)
233 return _mm_floor_ps(
x);
236 F roundtrip = cast<F>(cast<I32>(
x));
249 F e = cast<F>(
bits) * (1.0f / (1<<23));
252 F m = bit_pun<F>( (
bits & 0x007fffff) | 0x3f000000 );
254 return e - 124.225514990f
256 - 1.725879990f/(0.3520887068f +
m);
260 const float ln2 = 0.69314718f;
267 F fbits = (1.0f * (1<<23)) * (
x + 121.274057500f
269 + 27.728023300f/(4.84252568f -
fract));
272 return bit_pun<F>(
bits);
281 const float log2_e = 1.4426950408889634074f;
292 return bit_pun<F>(
sign | bit_pun<U32>(
x));
325 return bit_pun<F>(
sign | bit_pun<U32>(v));
329 const float R = tf->
a,
G = tf->
b,
330 a = tf->
c,
b = tf->
d, c = tf->
e,
339 return K*bit_pun<F>(
sign | bit_pun<U32>(v));
343 const float R = tf->
a,
G = tf->
b,
344 a = tf->
c,
b = tf->
d, c = tf->
e,
354 return bit_pun<F>(
sign | bit_pun<U32>(v));
359template <
typename T,
typename P>
364 return T{
p[ 0],
p[ 3],
p[ 6],
p[ 9]};
366 return T{
p[ 0],
p[ 3],
p[ 6],
p[ 9],
p[12],
p[15],
p[18],
p[21]};
368 return T{
p[ 0],
p[ 3],
p[ 6],
p[ 9],
p[12],
p[15],
p[18],
p[21],
369 p[24],
p[27],
p[30],
p[33],
p[36],
p[39],
p[42],
p[45]};
373template <
typename T,
typename P>
378 return T{
p[ 0],
p[ 4],
p[ 8],
p[12]};
380 return T{
p[ 0],
p[ 4],
p[ 8],
p[12],
p[16],
p[20],
p[24],
p[28]};
382 return T{
p[ 0],
p[ 4],
p[ 8],
p[12],
p[16],
p[20],
p[24],
p[28],
383 p[32],
p[36],
p[40],
p[44],
p[48],
p[52],
p[56],
p[60]};
387template <
typename T,
typename P>
392 p[ 0] = v[ 0];
p[ 3] = v[ 1];
p[ 6] = v[ 2];
p[ 9] = v[ 3];
394 p[ 0] = v[ 0];
p[ 3] = v[ 1];
p[ 6] = v[ 2];
p[ 9] = v[ 3];
395 p[12] = v[ 4];
p[15] = v[ 5];
p[18] = v[ 6];
p[21] = v[ 7];
397 p[ 0] = v[ 0];
p[ 3] = v[ 1];
p[ 6] = v[ 2];
p[ 9] = v[ 3];
398 p[12] = v[ 4];
p[15] = v[ 5];
p[18] = v[ 6];
p[21] = v[ 7];
399 p[24] = v[ 8];
p[27] = v[ 9];
p[30] = v[10];
p[33] = v[11];
400 p[36] = v[12];
p[39] = v[13];
p[42] = v[14];
p[45] = v[15];
404template <
typename T,
typename P>
409 p[ 0] = v[ 0];
p[ 4] = v[ 1];
p[ 8] = v[ 2];
p[12] = v[ 3];
411 p[ 0] = v[ 0];
p[ 4] = v[ 1];
p[ 8] = v[ 2];
p[12] = v[ 3];
412 p[16] = v[ 4];
p[20] = v[ 5];
p[24] = v[ 6];
p[28] = v[ 7];
414 p[ 0] = v[ 0];
p[ 4] = v[ 1];
p[ 8] = v[ 2];
p[12] = v[ 3];
415 p[16] = v[ 4];
p[20] = v[ 5];
p[24] = v[ 6];
p[28] = v[ 7];
416 p[32] = v[ 8];
p[36] = v[ 9];
p[40] = v[10];
p[44] = v[11];
417 p[48] = v[12];
p[52] = v[13];
p[56] = v[14];
p[60] = v[15];
426 U8 v = {
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]] };
428 U8 v = {
p[ix[0]],
p[ix[1]],
p[ix[2]],
p[ix[3]],
429 p[ix[4]],
p[ix[5]],
p[ix[6]],
p[ix[7]] };
431 U8 v = {
p[ix[ 0]],
p[ix[ 1]],
p[ix[ 2]],
p[ix[ 3]],
432 p[ix[ 4]],
p[ix[ 5]],
p[ix[ 6]],
p[ix[ 7]],
433 p[ix[ 8]],
p[ix[ 9]],
p[ix[10]],
p[ix[11]],
434 p[ix[12]],
p[ix[13]],
p[ix[14]],
p[ix[15]] };
441 auto load_16 = [
p](
int i) {
442 return load<uint16_t>(
p + 2*
i);
447 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
449 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
450 load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
452 U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
453 load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
454 load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
455 load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
462 auto load_32 = [
p](
int i) {
463 return load<uint32_t>(
p + 4*
i);
468 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
470 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
471 load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
473 U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
474 load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
475 load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
476 load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
488 auto load_24_32 = [
p](
int i) {
489 return load<uint32_t>(
p + 3*
i);
494 U32 v = load_24_32(ix);
496 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
497#elif N == 8 && !defined(USING_AVX2)
498 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
499 load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
504 const int* p4 = bit_pun<const int*>(
p);
505 I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
506 mask = {-1,-1,-1,-1, -1,-1,-1,-1};
507 #if defined(__clang__)
508 U32 v = (
U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
509 #elif defined(__GNUC__)
510 U32 v = (
U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
516 const int* p4 = bit_pun<const int*>(
p);
517 U32 v = (
U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
530 auto load_48_64 = [
p](
int i) {
531 return load<uint64_t>(
p + 6*
i);
538 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
540 #elif N == 8 && !defined(USING_AVX2)
542 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
543 load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
552 const long long int* p8 = bit_pun<const long long int*>(
p);
554 Half_I64 zero = { 0, 0, 0, 0},
555 mask = {-1,-1,-1,-1};
558 Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
559 ix_hi = { ix[4], ix[5], ix[6], ix[7] };
561 #if defined(__clang__)
562 Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
563 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
564 #elif defined(__GNUC__)
565 Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
566 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
568 store((
char*)v + 0, lo);
569 store((
char*)v + 32, hi);
572 const long long int* p8 = bit_pun<const long long int*>(
p);
573 __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
574 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
575 store((
char*)v + 0, lo);
576 store((
char*)v + 64, hi);
584 return cast<F>(v) * (1/255.0f);
591 hi = (v << 8) & 0xffff;
592 return cast<F>(lo|hi) * (1/65535.0f);
601 return bit_pun<F>( bit_pun<U32>(v) - 1 );
609 I32 lo = cast<I32>( ix ),
611 F t = ix - cast<F>(lo);
631 *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
632 *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
633 *
b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
640 *r = cast<F>((
rgba >> 0) & 0xff) * (1/255.0f);
641 *g = cast<F>((
rgba >> 8) & 0xff) * (1/255.0f);
642 *
b = cast<F>((
rgba >> 16) & 0xff) * (1/255.0f);
643 *
a = cast<F>((
rgba >> 24) & 0xff) * (1/255.0f);
658 *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
659 *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
660 *
b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
672static void clut(uint32_t input_channels, uint32_t output_channels,
673 const uint8_t grid_points[4],
const uint8_t* grid_8,
const uint8_t* grid_16,
676 const int dim = (
int)input_channels;
677 assert (0 < dim && dim <= 4);
678 assert (output_channels == 3 ||
679 output_channels == 4);
687 for (
int i = dim-1, stride = 1;
i >= 0;
i--) {
692 I32 lo = cast<I32>(
x ),
695 index[
i+0] = lo * stride;
696 index[
i+4] = hi * stride;
697 stride *= grid_points[
i];
700 F t =
x - cast<F>(lo);
706 if (output_channels == 4) {
712 for (
int combo = 0; combo < (1<<dim); combo++) {
720 I32 ix = index [0 + (combo&1)*4];
721 F w = weight[0 + (combo&1)*4];
724 case 3: ix += index [3 + (combo&8)/2];
725 w *= weight[3 + (combo&8)/2];
729 case 2: ix += index [2 + (combo&4)*1];
730 w *= weight[2 + (combo&4)*1];
734 case 1: ix += index [1 + (combo&2)*2];
735 w *= weight[1 + (combo&2)*2];
739 if (output_channels == 3) {
769 template <
typename T>
operator T*() {
return (
const T*)
fArg; }
772#define STAGE_PARAMS(MAYBE_REF) SKCMS_MAYBE_UNUSED const char* src, \
773 SKCMS_MAYBE_UNUSED char* dst, \
774 SKCMS_MAYBE_UNUSED F MAYBE_REF r, \
775 SKCMS_MAYBE_UNUSED F MAYBE_REF g, \
776 SKCMS_MAYBE_UNUSED F MAYBE_REF b, \
777 SKCMS_MAYBE_UNUSED F MAYBE_REF a, \
778 SKCMS_MAYBE_UNUSED int i
780#if SKCMS_HAS_MUSTTAIL
793 #define DECLARE_STAGE(name, arg, CALL_NEXT) \
794 SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
796 SI void Exec_##name(StageList list, const void** ctx, STAGE_PARAMS()) { \
797 Exec_##name##_k(Ctx{*ctx}, src, dst, r, g, b, a, i); \
802 SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
804 #define STAGE(name, arg) \
805 DECLARE_STAGE(name, arg, [[clang::musttail]] return (*list.fn)(list, ctx, src, dst, \
808 #define FINAL_STAGE(name, arg) \
809 DECLARE_STAGE(name, arg, )
813 #define DECLARE_STAGE(name, arg) \
814 SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
816 SI void Exec_##name(const void* ctx, STAGE_PARAMS(&)) { \
817 Exec_##name##_k(Ctx{ctx}, src, dst, r, g, b, a, i); \
820 SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
822 #define STAGE(name, arg) DECLARE_STAGE(name, arg)
823 #define FINAL_STAGE(name, arg) DECLARE_STAGE(name, arg)
836 U16 abgr = load<U16>(
src + 2*
i);
838 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
839 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
840 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
841 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
845 U16 rgb = load<U16>(
src + 2*
i);
847 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
848 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
849 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
853 const uint8_t* rgb = (
const uint8_t*)(
src + 3*
i);
854#if defined(USING_NEON)
858 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
859 v = vld3_lane_u8(rgb+0, v, 0);
860 v = vld3_lane_u8(rgb+3, v, 2);
861 v = vld3_lane_u8(rgb+6, v, 4);
862 v = vld3_lane_u8(rgb+9, v, 6);
867 r = cast<F>((
U16)v.val[0]) * (1/255.0f);
868 g = cast<F>((
U16)v.val[1]) * (1/255.0f);
869 b = cast<F>((
U16)v.val[2]) * (1/255.0f);
871 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
872 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
873 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
880 r = cast<F>((
rgba >> 0) & 0xff) * (1/255.0f);
881 g = cast<F>((
rgba >> 8) & 0xff) * (1/255.0f);
882 b = cast<F>((
rgba >> 16) & 0xff) * (1/255.0f);
883 a = cast<F>((
rgba >> 24) & 0xff) * (1/255.0f);
889 r = cast<F>((
rgba >> 0) & 0x3ff) * (1/1023.0f);
890 g = cast<F>((
rgba >> 10) & 0x3ff) * (1/1023.0f);
891 b = cast<F>((
rgba >> 20) & 0x3ff) * (1/1023.0f);
892 a = cast<F>((
rgba >> 30) & 0x3 ) * (1/ 3.0f);
896 static constexpr float min = -0.752941f;
897 static constexpr float max = 1.25098f;
898 static constexpr float range =
max -
min;
900 r = cast<F>((
rgba >> 0) & 0x3ff) * (1/1023.0f) * range +
min;
901 g = cast<F>((
rgba >> 10) & 0x3ff) * (1/1023.0f) * range +
min;
902 b = cast<F>((
rgba >> 20) & 0x3ff) * (1/1023.0f) * range +
min;
906 static constexpr float min = -0.752941f;
907 static constexpr float max = 1.25098f;
908 static constexpr float range =
max -
min;
910 r = cast<F>((
rgba >> (0+6)) & 0x3ff) * (1/1023.0f) * range +
min;
911 g = cast<F>((
rgba >> (16+6)) & 0x3ff) * (1/1023.0f) * range +
min;
912 b = cast<F>((
rgba >> (32+6)) & 0x3ff) * (1/1023.0f) * range +
min;
913 a = cast<F>((
rgba >> (48+6)) & 0x3ff) * (1/1023.0f) * range +
min;
917 uintptr_t ptr = (uintptr_t)(
src + 6*
i);
918 assert( (ptr & 1) == 0 );
919 const uint16_t* rgb = (
const uint16_t*)ptr;
920#if defined(USING_NEON)
921 uint16x4x3_t v = vld3_u16(rgb);
922 r = cast<F>((
U16)v.val[0]) * (1/65535.0f);
923 g = cast<F>((
U16)v.val[1]) * (1/65535.0f);
924 b = cast<F>((
U16)v.val[2]) * (1/65535.0f);
926 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
927 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
928 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
933 uintptr_t ptr = (uintptr_t)(
src + 8*
i);
934 assert( (ptr & 1) == 0 );
935 const uint16_t*
rgba = (
const uint16_t*)ptr;
936#if defined(USING_NEON)
937 uint16x4x4_t v = vld4_u16(
rgba);
938 r = cast<F>((
U16)v.val[0]) * (1/65535.0f);
939 g = cast<F>((
U16)v.val[1]) * (1/65535.0f);
940 b = cast<F>((
U16)v.val[2]) * (1/65535.0f);
941 a = cast<F>((
U16)v.val[3]) * (1/65535.0f);
945 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
946 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
947 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
948 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
953 uintptr_t ptr = (uintptr_t)(
src + 6*
i);
954 assert( (ptr & 1) == 0 );
955 const uint16_t* rgb = (
const uint16_t*)ptr;
956#if defined(USING_NEON)
957 uint16x4x3_t v = vld3_u16(rgb);
958 r = cast<F>(swap_endian_16((
U16)v.val[0])) * (1/65535.0f);
959 g = cast<F>(swap_endian_16((
U16)v.val[1])) * (1/65535.0f);
960 b = cast<F>(swap_endian_16((
U16)v.val[2])) * (1/65535.0f);
962 U32 R = load_3<U32>(rgb+0),
963 G = load_3<U32>(rgb+1),
964 B = load_3<U32>(rgb+2);
966 r = cast<F>((
R & 0x00ff)<<8 | (
R & 0xff00)>>8) * (1/65535.0f);
967 g = cast<F>((
G & 0x00ff)<<8 | (
G & 0xff00)>>8) * (1/65535.0f);
968 b = cast<F>((
B & 0x00ff)<<8 | (
B & 0xff00)>>8) * (1/65535.0f);
973 uintptr_t ptr = (uintptr_t)(
src + 8*
i);
974 assert( (ptr & 1) == 0 );
975 const uint16_t*
rgba = (
const uint16_t*)ptr;
976#if defined(USING_NEON)
977 uint16x4x4_t v = vld4_u16(
rgba);
978 r = cast<F>(swap_endian_16((
U16)v.val[0])) * (1/65535.0f);
979 g = cast<F>(swap_endian_16((
U16)v.val[1])) * (1/65535.0f);
980 b = cast<F>(swap_endian_16((
U16)v.val[2])) * (1/65535.0f);
981 a = cast<F>(swap_endian_16((
U16)v.val[3])) * (1/65535.0f);
985 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
986 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
987 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
988 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
993 uintptr_t ptr = (uintptr_t)(
src + 6*
i);
994 assert( (ptr & 1) == 0 );
995 const uint16_t* rgb = (
const uint16_t*)ptr;
996#if defined(USING_NEON)
997 uint16x4x3_t v = vld3_u16(rgb);
1002 U16 R = load_3<U16>(rgb+0),
1003 G = load_3<U16>(rgb+1),
1004 B = load_3<U16>(rgb+2);
1012 uintptr_t ptr = (uintptr_t)(
src + 8*
i);
1013 assert( (ptr & 1) == 0 );
1014 const uint16_t*
rgba = (
const uint16_t*)ptr;
1015#if defined(USING_NEON)
1016 uint16x4x4_t v = vld4_u16(
rgba);
1023 U16 R = cast<U16>((px >> 0) & 0xffff),
1024 G = cast<U16>((px >> 16) & 0xffff),
1025 B = cast<U16>((px >> 32) & 0xffff),
1026 A = cast<U16>((px >> 48) & 0xffff);
1035 uintptr_t ptr = (uintptr_t)(
src + 12*
i);
1036 assert( (ptr & 3) == 0 );
1037 const float* rgb = (
const float*)ptr;
1038#if defined(USING_NEON)
1039 float32x4x3_t v = vld3q_f32(rgb);
1044 r = load_3<F>(rgb+0);
1045 g = load_3<F>(rgb+1);
1046 b = load_3<F>(rgb+2);
1051 uintptr_t ptr = (uintptr_t)(
src + 16*
i);
1052 assert( (ptr & 3) == 0 );
1053 const float*
rgba = (
const float*)ptr;
1054#if defined(USING_NEON)
1055 float32x4x4_t v = vld4q_f32(
rgba);
1061 r = load_4<F>(
rgba+0);
1062 g = load_4<F>(
rgba+1);
1063 b = load_4<F>(
rgba+2);
1064 a = load_4<F>(
rgba+3);
1106 const float*
m = &
matrix->vals[0][0];
1108 F R =
m[0]*r +
m[1]*g +
m[2]*
b,
1109 G =
m[3]*r +
m[4]*g +
m[5]*
b,
1110 B =
m[6]*r +
m[7]*g +
m[8]*
b;
1118 const float*
m = &
matrix->vals[0][0];
1120 F R =
m[0]*r +
m[1]*g +
m[ 2]*
b +
m[ 3],
1121 G =
m[4]*r +
m[5]*g +
m[ 6]*
b +
m[ 7],
1122 B =
m[8]*r +
m[9]*g +
m[10]*
b +
m[11];
1132 A = g * 255.0f - 128.0f,
1133 B =
b * 255.0f - 128.0f;
1136 F Y = (
L + 16.0f) * (1/116.0f),
1137 X =
Y +
A*(1/500.0f),
1138 Z =
Y -
B*(1/200.0f);
1152 F X = r * (1/0.9642f),
1154 Z =
b * (1/0.8249f);
1160 F L =
Y*116.0f - 16.0f,
1165 g = (
A + 128.0f) * (1/255.0f);
1166 b = (
B + 128.0f) * (1/255.0f);
1254 store<U16>(
dst + 2*
i, cast<U16>(
to_fixed(r * 15) << 12)
1261 store<U16>(
dst + 2*
i, cast<U16>(
to_fixed(r * 31) << 0 )
1262 | cast<U16>(
to_fixed(g * 63) << 5 )
1267 uint8_t* rgb = (uint8_t*)
dst + 3*
i;
1268#if defined(USING_NEON)
1276 uint8x8x3_t v = {{ (uint8x8_t)
R, (uint8x8_t)
G, (uint8x8_t)
B }};
1277 vst3_lane_u8(rgb+0, v, 0);
1278 vst3_lane_u8(rgb+3, v, 2);
1279 vst3_lane_u8(rgb+6, v, 4);
1280 vst3_lane_u8(rgb+9, v, 6);
1290 | cast<U32>(
to_fixed(g * 255)) << 8
1296 static constexpr float min = -0.752941f;
1297 static constexpr float max = 1.25098f;
1298 static constexpr float range =
max -
min;
1300 | cast<U32>(
to_fixed(((g -
min) / range) * 1023)) << 10
1301 | cast<U32>(
to_fixed(((
b -
min) / range) * 1023)) << 20);
1306 | cast<U32>(
to_fixed(g * 1023)) << 10
1312 uintptr_t ptr = (uintptr_t)(
dst + 6*
i);
1313 assert( (ptr & 1) == 0 );
1314 uint16_t* rgb = (uint16_t*)ptr;
1315#if defined(USING_NEON)
1331 uintptr_t ptr = (uintptr_t)(
dst + 8*
i);
1332 assert( (ptr & 1) == 0 );
1333 uint16_t*
rgba = (uint16_t*)ptr;
1334#if defined(USING_NEON)
1344 | cast<U64>(
to_fixed(g * 65535)) << 16
1352 uintptr_t ptr = (uintptr_t)(
dst + 6*
i);
1353 assert( (ptr & 1) == 0 );
1354 uint16_t* rgb = (uint16_t*)ptr;
1355#if defined(USING_NEON)
1357 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(r))),
1358 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(g))),
1359 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(
b))),
1366 store_3(rgb+0, cast<U16>((
R & 0x00ff) << 8 | (
R & 0xff00) >> 8) );
1367 store_3(rgb+1, cast<U16>((
G & 0x00ff) << 8 | (
G & 0xff00) >> 8) );
1368 store_3(rgb+2, cast<U16>((
B & 0x00ff) << 8 | (
B & 0xff00) >> 8) );
1374 uintptr_t ptr = (uintptr_t)(
dst + 8*
i);
1375 assert( (ptr & 1) == 0 );
1376 uint16_t*
rgba = (uint16_t*)ptr;
1377#if defined(USING_NEON)
1379 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(r))),
1380 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(g))),
1381 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(
b))),
1382 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(
a))),
1387 | cast<U64>(
to_fixed(g * 65535)) << 16
1395 uintptr_t ptr = (uintptr_t)(
dst + 6*
i);
1396 assert( (ptr & 1) == 0 );
1397 uint16_t* rgb = (uint16_t*)ptr;
1402#if defined(USING_NEON)
1417 uintptr_t ptr = (uintptr_t)(
dst + 8*
i);
1418 assert( (ptr & 1) == 0 );
1419 uint16_t*
rgba = (uint16_t*)ptr;
1425#if defined(USING_NEON)
1435 | cast<U64>(
G) << 16
1436 | cast<U64>(
B) << 32
1437 | cast<U64>(
A) << 48);
1442 uintptr_t ptr = (uintptr_t)(
dst + 12*
i);
1443 assert( (ptr & 3) == 0 );
1444 float* rgb = (
float*)ptr;
1445#if defined(USING_NEON)
1446 float32x4x3_t v = {{
1460 uintptr_t ptr = (uintptr_t)(
dst + 16*
i);
1461 assert( (ptr & 3) == 0 );
1462 float*
rgba = (
float*)ptr;
1463#if defined(USING_NEON)
1464 float32x4x4_t v = {{
1479#if SKCMS_HAS_MUSTTAIL
1488 const char*
src,
char*
dst,
int i) {
1492#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
1495#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
1506 const char*
src,
char*
dst,
int n,
1507 const size_t src_bpp,
const size_t dst_bpp) {
1508#if SKCMS_HAS_MUSTTAIL
1513 static constexpr StageFn kStageFns[] = {
1514#define M(name) &Exec_##name,
1520 for (ptrdiff_t index = 0; index < programSize; ++index) {
1521 stages[index] = kStageFns[(
int)program[index]];
1525 const Op* stages = program;
1535 char tmp[4*4*
N] = {0};
1537 memcpy(tmp, (
const char*)
src + (
size_t)
i*src_bpp, (
size_t)n*src_bpp);
1539 memcpy((
char*)
dst + (
size_t)
i*dst_bpp, tmp, (
size_t)n*dst_bpp);
static const uint32_t rgba[kNumPixels]
static unsigned clamp(SkFixed fx, int max)
static int sign(SkScalar x)
static uint32_t premul(uint32_t color)
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE auto & d
static float max(float r, float g, float b)
static float min(float r, float g, float b)
__attribute__((visibility("default"))) int RunBenchmarks(int argc
T __attribute__((ext_vector_type(N))) V
unsigned useCenter Optional< SkMatrix > matrix
SIN Vec< N, float > fract(const Vec< N, float > &x)
static float floorf_(float x)
#define SKCMS_FALLTHROUGH
#define SKCMS_MAYBE_UNUSED