21#if defined(__GNUC__) && !defined(__clang__)
23 static constexpr F F0 =
F() + 0.0f,
27 static constexpr F F0 = 0.0f,
36#if !defined(USING_AVX) && N == 8 && defined(__AVX__)
39#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
40 #define USING_AVX_F16C
42#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
45#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
51#if N > 1 && defined(__ARM_NEON)
55 #if defined(__clang__)
58 #define USING_NEON_F16C
60 #elif defined(__GNUC__)
63 #if defined(__ARM_FP16_FORMAT_IEEE)
64 #define USING_NEON_F16C
71#if defined(USING_NEON) && defined(__clang__)
72 #pragma clang diagnostic ignored "-Wvector-conversion"
78#if defined(__SSE__) && defined(__GNUC__)
79 #if !defined(__has_warning)
80 #pragma GCC diagnostic ignored "-Wpsabi"
81 #elif __has_warning("-Wpsabi")
82 #pragma GCC diagnostic ignored "-Wpsabi"
91#if defined(__clang__) || defined(__GNUC__)
92 #define SI static inline __attribute__((always_inline))
94 #define SI static inline
97template <
typename T,
typename P>
100 memcpy(&val, ptr,
sizeof(val));
103template <
typename T,
typename P>
105 memcpy(ptr, &val,
sizeof(val));
110template <
typename D,
typename S>
114#elif defined(__clang__)
115 return __builtin_convertvector(v,
D);
118 for (
int i = 0; i <
N; i++) {
125template <
typename D,
typename S>
127 static_assert(
sizeof(
D) ==
sizeof(v),
"");
// Scalar select: plain ternary. (A vector bit-select variant appears in the
// fragment just below for lane types — presumably the N>1 branch; confirm
// against the full file.)  Fix: removed the extraction artifact "143 " that
// had been fused onto the #define, which broke compilation.
#define if_then_else(cond, t, e) ((cond) ? (t) : (e))
145 template <
typename C,
typename T>
147 return bit_pun<T>( ( cond & bit_pun<C>(t)) |
148 (~cond & bit_pun<C>(e)) );
154#if defined(USING_NEON_F16C)
155 return vcvt_f32_f16((float16x4_t)half);
156#elif defined(USING_AVX512F)
157 return (
F)_mm512_cvtph_ps((__m256i)half);
158#elif defined(USING_AVX_F16C)
160 return __builtin_ia32_vcvtph2ps256((
I16)half);
162 U32 wide = cast<U32>(half);
164 U32 s = wide & 0x8000,
168 F norm = bit_pun<F>( (
s<<16) + (em<<13) + ((127-15)<<23) );
175#if defined(__clang__)
181#if defined(USING_NEON_F16C)
182 return (
U16)vcvt_f16_f32(f);
183#elif defined(USING_AVX512F)
184 return (
U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
185#elif defined(USING_AVX_F16C)
186 return (
U16)__builtin_ia32_vcvtps2ph256(f, 0x04);
189 U32 sem = bit_pun<U32>(f),
190 s = sem & 0x80000000,
195 , (
s>>16) + (em>>13) - ((127-15)<<10)));
200#if defined(USING_NEON)
202 return (
U16)vrev16_u8((uint8x8_t) v);
207 return (
rgba & 0x00ff00ff00ff00ff) << 8
208 | (
rgba & 0xff00ff00ff00ff00) >> 8;
211#if defined(USING_NEON)
212 SI F min_(
F x,
F y) {
return (
F)vminq_f32((float32x4_t)
x, (float32x4_t)
y); }
213 SI F max_(
F x,
F y) {
return (
F)vmaxq_f32((float32x4_t)
x, (float32x4_t)
y); }
222#elif defined(__aarch64__)
223 return vrndmq_f32(
x);
224#elif defined(USING_AVX512F)
229 return _mm512_mask_floor_ps(
x, (__mmask16)-1,
x);
230#elif defined(USING_AVX)
231 return __builtin_ia32_roundps256(
x, 0x01);
232#elif defined(__SSE4_1__)
233 return _mm_floor_ps(
x);
236 F roundtrip = cast<F>(cast<I32>(
x));
247 I32 bits = bit_pun<I32>(
x);
249 F e = cast<F>(bits) * (1.0f / (1<<23));
252 F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
254 return e - 124.225514990f
256 - 1.725879990f/(0.3520887068f + m);
260 const float ln2 = 0.69314718f;
267 F fbits = (1.0f * (1<<23)) * (
x + 121.274057500f
269 + 27.728023300f/(4.84252568f - fract));
272 return bit_pun<F>(bits);
281 const float log2_e = 1.4426950408889634074f;
286 U32 bits = bit_pun<U32>(
x);
287 *
sign = bits & 0x80000000;
288 return bit_pun<F>(bits ^ *
sign);
292 return bit_pun<F>(
sign | bit_pun<U32>(
x));
317 U32 bits = bit_pun<U32>(
x),
318 sign = bits & 0x80000000;
319 x = bit_pun<F>(bits ^
sign);
325 return bit_pun<F>(
sign | bit_pun<U32>(v));
329 const float R = tf->
a,
G = tf->
b,
330 a = tf->
c,
b = tf->
d, c = tf->
e,
332 U32 bits = bit_pun<U32>(
x),
333 sign = bits & 0x80000000;
334 x = bit_pun<F>(bits ^
sign);
339 return K*bit_pun<F>(
sign | bit_pun<U32>(v));
343 const float R = tf->
a,
G = tf->
b,
344 a = tf->
c,
b = tf->
d, c = tf->
e,
346 U32 bits = bit_pun<U32>(
x),
347 sign = bits & 0x80000000;
348 x = bit_pun<F>(bits ^
sign);
354 return bit_pun<F>(
sign | bit_pun<U32>(v));
359template <
typename T,
typename P>
364 return T{p[ 0],p[ 3],p[ 6],p[ 9]};
366 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
368 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
369 p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
373template <
typename T,
typename P>
378 return T{p[ 0],p[ 4],p[ 8],p[12]};
380 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
382 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
383 p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
387template <
typename T,
typename P>
392 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
394 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
395 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
397 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
398 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
399 p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
400 p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
404template <
typename T,
typename P>
409 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
411 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
412 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
414 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
415 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
416 p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
417 p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
426 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
428 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
429 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
431 U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
432 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
433 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
434 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
441 auto load_16 = [p](
int i) {
442 return load<uint16_t>(p + 2*i);
447 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
449 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
450 load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
452 U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
453 load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
454 load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
455 load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
462 auto load_32 = [p](
int i) {
463 return load<uint32_t>(p + 4*i);
468 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
470 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
471 load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
473 U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
474 load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
475 load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
476 load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
488 auto load_24_32 = [p](
int i) {
489 return load<uint32_t>(p + 3*i);
494 U32 v = load_24_32(ix);
496 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
497#elif N == 8 && !defined(USING_AVX2)
498 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
499 load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
504 const int* p4 = bit_pun<const int*>(p);
505 I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
506 mask = {-1,-1,-1,-1, -1,-1,-1,-1};
507 #if defined(__clang__)
508 U32 v = (
U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
509 #elif defined(__GNUC__)
510 U32 v = (
U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
516 const int* p4 = bit_pun<const int*>(p);
517 U32 v = (
U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
530 auto load_48_64 = [p](
int i) {
531 return load<uint64_t>(p + 6*i);
538 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
540 #elif N == 8 && !defined(USING_AVX2)
542 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
543 load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
552 const long long int* p8 = bit_pun<const long long int*>(p);
554 Half_I64 zero = { 0, 0, 0, 0},
555 mask = {-1,-1,-1,-1};
558 Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
559 ix_hi = { ix[4], ix[5], ix[6], ix[7] };
561 #if defined(__clang__)
562 Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
563 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
564 #elif defined(__GNUC__)
565 Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
566 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
568 store((
char*)v + 0, lo);
569 store((
char*)v + 32, hi);
572 const long long int* p8 = bit_pun<const long long int*>(p);
573 __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
574 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
575 store((
char*)v + 0, lo);
576 store((
char*)v + 64, hi);
584 return cast<F>(v) * (1/255.0f);
591 hi = (v << 8) & 0xffff;
592 return cast<F>(lo|hi) * (1/65535.0f);
601 return bit_pun<F>( bit_pun<U32>(v) - 1 );
609 I32 lo = cast<I32>( ix ),
611 F t = ix - cast<F>(lo);
631 *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
632 *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
633 *
b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
640 *r = cast<F>((
rgba >> 0) & 0xff) * (1/255.0f);
641 *g = cast<F>((
rgba >> 8) & 0xff) * (1/255.0f);
642 *
b = cast<F>((
rgba >> 16) & 0xff) * (1/255.0f);
643 *
a = cast<F>((
rgba >> 24) & 0xff) * (1/255.0f);
658 *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
659 *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
660 *
b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
672static void clut(uint32_t input_channels, uint32_t output_channels,
673 const uint8_t grid_points[4],
const uint8_t* grid_8,
const uint8_t* grid_16,
676 const int dim = (
int)input_channels;
677 assert (0 < dim && dim <= 4);
678 assert (output_channels == 3 ||
679 output_channels == 4);
686 const F inputs[] = { *r,*g,*
b,*
a };
687 for (
int i = dim-1, stride = 1; i >= 0; i--) {
689 F x = inputs[i] * (float)(grid_points[i] - 1);
692 I32 lo = cast<I32>(
x ),
695 index[i+0] = lo * stride;
696 index[i+4] = hi * stride;
697 stride *= grid_points[i];
700 F t =
x - cast<F>(lo);
706 if (output_channels == 4) {
712 for (
int combo = 0; combo < (1<<dim); combo++) {
720 I32 ix = index [0 + (combo&1)*4];
721 F w = weight[0 + (combo&1)*4];
724 case 3: ix += index [3 + (combo&8)/2];
725 w *= weight[3 + (combo&8)/2];
729 case 2: ix += index [2 + (combo&4)*1];
730 w *= weight[2 + (combo&4)*1];
734 case 1: ix += index [1 + (combo&2)*2];
735 w *= weight[1 + (combo&2)*2];
739 if (output_channels == 3) {
769 template <
typename T>
operator T*() {
return (
const T*)
fArg; }
// Parameter list shared by every pipeline-stage function: source/destination
// pointers, the four color channels, and the pixel index.  MAYBE_REF lets the
// same list declare the channels by value (used as STAGE_PARAMS()) or by
// reference (used as STAGE_PARAMS(&)) — both usages appear later in this file.
// Fix: removed the extraction artifact "772" fused onto the #define, which
// broke the macro definition.
#define STAGE_PARAMS(MAYBE_REF) SKCMS_MAYBE_UNUSED const char* src, \
                                SKCMS_MAYBE_UNUSED char* dst,       \
                                SKCMS_MAYBE_UNUSED F MAYBE_REF r,   \
                                SKCMS_MAYBE_UNUSED F MAYBE_REF g,   \
                                SKCMS_MAYBE_UNUSED F MAYBE_REF b,   \
                                SKCMS_MAYBE_UNUSED F MAYBE_REF a,   \
                                SKCMS_MAYBE_UNUSED int i
780#if SKCMS_HAS_MUSTTAIL
788 using StageFn = void (*)(StageList stages,
const void** ctx,
STAGE_PARAMS());
793 #define DECLARE_STAGE(name, arg, CALL_NEXT) \
794 SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
796 SI void Exec_##name(StageList list, const void** ctx, STAGE_PARAMS()) { \
797 Exec_##name##_k(Ctx{*ctx}, src, dst, r, g, b, a, i); \
802 SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
804 #define STAGE(name, arg) \
805 DECLARE_STAGE(name, arg, [[clang::musttail]] return (*list.fn)(list, ctx, src, dst, \
808 #define FINAL_STAGE(name, arg) \
809 DECLARE_STAGE(name, arg, )
813 #define DECLARE_STAGE(name, arg) \
814 SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
816 SI void Exec_##name(const void* ctx, STAGE_PARAMS(&)) { \
817 Exec_##name##_k(Ctx{ctx}, src, dst, r, g, b, a, i); \
820 SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
// In this (non-musttail) branch, STAGE and FINAL_STAGE are identical aliases
// of DECLARE_STAGE — the musttail branch above distinguishes them by whether
// a tail call to the next stage is emitted.
// Fix: removed the fused original line numbers ("822 "/"823 ") that broke
// both #define lines.
#define STAGE(name, arg)       DECLARE_STAGE(name, arg)
#define FINAL_STAGE(name, arg) DECLARE_STAGE(name, arg)
836 U16 abgr = load<U16>(src + 2*i);
838 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
839 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
840 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
841 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
845 U16 rgb = load<U16>(src + 2*i);
847 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
848 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
849 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
853 const uint8_t* rgb = (
const uint8_t*)(src + 3*i);
854#if defined(USING_NEON)
858 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
859 v = vld3_lane_u8(rgb+0, v, 0);
860 v = vld3_lane_u8(rgb+3, v, 2);
861 v = vld3_lane_u8(rgb+6, v, 4);
862 v = vld3_lane_u8(rgb+9, v, 6);
867 r = cast<F>((
U16)v.val[0]) * (1/255.0f);
868 g = cast<F>((
U16)v.val[1]) * (1/255.0f);
869 b = cast<F>((
U16)v.val[2]) * (1/255.0f);
871 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
872 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
873 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
878 U32 rgba = load<U32>(src + 4*i);
880 r = cast<F>((
rgba >> 0) & 0xff) * (1/255.0f);
881 g = cast<F>((
rgba >> 8) & 0xff) * (1/255.0f);
882 b = cast<F>((
rgba >> 16) & 0xff) * (1/255.0f);
883 a = cast<F>((
rgba >> 24) & 0xff) * (1/255.0f);
887 U32 rgba = load<U32>(src + 4*i);
889 r = cast<F>((
rgba >> 0) & 0x3ff) * (1/1023.0f);
890 g = cast<F>((
rgba >> 10) & 0x3ff) * (1/1023.0f);
891 b = cast<F>((
rgba >> 20) & 0x3ff) * (1/1023.0f);
892 a = cast<F>((
rgba >> 30) & 0x3 ) * (1/ 3.0f);
896 static constexpr float min = -0.752941f;
897 static constexpr float max = 1.25098f;
898 static constexpr float range =
max -
min;
899 U32 rgba = load<U32>(src + 4*i);
900 r = cast<F>((
rgba >> 0) & 0x3ff) * (1/1023.0f) * range +
min;
901 g = cast<F>((
rgba >> 10) & 0x3ff) * (1/1023.0f) * range +
min;
902 b = cast<F>((
rgba >> 20) & 0x3ff) * (1/1023.0f) * range +
min;
906 uintptr_t ptr = (uintptr_t)(src + 6*i);
907 assert( (ptr & 1) == 0 );
908 const uint16_t* rgb = (
const uint16_t*)ptr;
909#if defined(USING_NEON)
910 uint16x4x3_t v = vld3_u16(rgb);
911 r = cast<F>((
U16)v.val[0]) * (1/65535.0f);
912 g = cast<F>((
U16)v.val[1]) * (1/65535.0f);
913 b = cast<F>((
U16)v.val[2]) * (1/65535.0f);
915 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
916 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
917 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
922 uintptr_t ptr = (uintptr_t)(src + 8*i);
923 assert( (ptr & 1) == 0 );
924 const uint16_t*
rgba = (
const uint16_t*)ptr;
925#if defined(USING_NEON)
926 uint16x4x4_t v = vld4_u16(
rgba);
927 r = cast<F>((
U16)v.val[0]) * (1/65535.0f);
928 g = cast<F>((
U16)v.val[1]) * (1/65535.0f);
929 b = cast<F>((
U16)v.val[2]) * (1/65535.0f);
930 a = cast<F>((
U16)v.val[3]) * (1/65535.0f);
934 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
935 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
936 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
937 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
942 uintptr_t ptr = (uintptr_t)(src + 6*i);
943 assert( (ptr & 1) == 0 );
944 const uint16_t* rgb = (
const uint16_t*)ptr;
945#if defined(USING_NEON)
946 uint16x4x3_t v = vld3_u16(rgb);
947 r = cast<F>(swap_endian_16((
U16)v.val[0])) * (1/65535.0f);
948 g = cast<F>(swap_endian_16((
U16)v.val[1])) * (1/65535.0f);
949 b = cast<F>(swap_endian_16((
U16)v.val[2])) * (1/65535.0f);
951 U32 R = load_3<U32>(rgb+0),
952 G = load_3<U32>(rgb+1),
953 B = load_3<U32>(rgb+2);
955 r = cast<F>((
R & 0x00ff)<<8 | (
R & 0xff00)>>8) * (1/65535.0f);
956 g = cast<F>((
G & 0x00ff)<<8 | (
G & 0xff00)>>8) * (1/65535.0f);
957 b = cast<F>((
B & 0x00ff)<<8 | (
B & 0xff00)>>8) * (1/65535.0f);
962 uintptr_t ptr = (uintptr_t)(src + 8*i);
963 assert( (ptr & 1) == 0 );
964 const uint16_t*
rgba = (
const uint16_t*)ptr;
965#if defined(USING_NEON)
966 uint16x4x4_t v = vld4_u16(
rgba);
967 r = cast<F>(swap_endian_16((
U16)v.val[0])) * (1/65535.0f);
968 g = cast<F>(swap_endian_16((
U16)v.val[1])) * (1/65535.0f);
969 b = cast<F>(swap_endian_16((
U16)v.val[2])) * (1/65535.0f);
970 a = cast<F>(swap_endian_16((
U16)v.val[3])) * (1/65535.0f);
974 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
975 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
976 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
977 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
982 uintptr_t ptr = (uintptr_t)(src + 6*i);
983 assert( (ptr & 1) == 0 );
984 const uint16_t* rgb = (
const uint16_t*)ptr;
985#if defined(USING_NEON)
986 uint16x4x3_t v = vld3_u16(rgb);
991 U16 R = load_3<U16>(rgb+0),
992 G = load_3<U16>(rgb+1),
993 B = load_3<U16>(rgb+2);
1001 uintptr_t ptr = (uintptr_t)(src + 8*i);
1002 assert( (ptr & 1) == 0 );
1003 const uint16_t*
rgba = (
const uint16_t*)ptr;
1004#if defined(USING_NEON)
1005 uint16x4x4_t v = vld4_u16(
rgba);
1012 U16 R = cast<U16>((px >> 0) & 0xffff),
1013 G = cast<U16>((px >> 16) & 0xffff),
1014 B = cast<U16>((px >> 32) & 0xffff),
1015 A = cast<U16>((px >> 48) & 0xffff);
1024 uintptr_t ptr = (uintptr_t)(src + 12*i);
1025 assert( (ptr & 3) == 0 );
1026 const float* rgb = (
const float*)ptr;
1027#if defined(USING_NEON)
1028 float32x4x3_t v = vld3q_f32(rgb);
1033 r = load_3<F>(rgb+0);
1034 g = load_3<F>(rgb+1);
1035 b = load_3<F>(rgb+2);
1040 uintptr_t ptr = (uintptr_t)(src + 16*i);
1041 assert( (ptr & 3) == 0 );
1042 const float*
rgba = (
const float*)ptr;
1043#if defined(USING_NEON)
1044 float32x4x4_t v = vld4q_f32(
rgba);
1050 r = load_4<F>(
rgba+0);
1051 g = load_4<F>(
rgba+1);
1052 b = load_4<F>(
rgba+2);
1053 a = load_4<F>(
rgba+3);
1095 const float* m = &matrix->vals[0][0];
1097 F R = m[0]*r + m[1]*g + m[2]*
b,
1098 G = m[3]*r + m[4]*g + m[5]*
b,
1099 B = m[6]*r + m[7]*g + m[8]*
b;
1107 const float* m = &matrix->vals[0][0];
1109 F R = m[0]*r + m[1]*g + m[ 2]*
b + m[ 3],
1110 G = m[4]*r + m[5]*g + m[ 6]*
b + m[ 7],
1111 B = m[8]*r + m[9]*g + m[10]*
b + m[11];
1121 A = g * 255.0f - 128.0f,
1122 B =
b * 255.0f - 128.0f;
1125 F Y = (L + 16.0f) * (1/116.0f),
1126 X =
Y +
A*(1/500.0f),
1127 Z =
Y -
B*(1/200.0f);
1141 F X = r * (1/0.9642f),
1143 Z =
b * (1/0.8249f);
1149 F L =
Y*116.0f - 16.0f,
1154 g = (
A + 128.0f) * (1/255.0f);
1155 b = (
B + 128.0f) * (1/255.0f);
1243 store<U16>(dst + 2*i, cast<U16>(
to_fixed(r * 15) << 12)
1250 store<U16>(dst + 2*i, cast<U16>(
to_fixed(r * 31) << 0 )
1251 | cast<U16>(
to_fixed(g * 63) << 5 )
1256 uint8_t* rgb = (uint8_t*)dst + 3*i;
1257#if defined(USING_NEON)
1265 uint8x8x3_t v = {{ (uint8x8_t)
R, (uint8x8_t)
G, (uint8x8_t)
B }};
1266 vst3_lane_u8(rgb+0, v, 0);
1267 vst3_lane_u8(rgb+3, v, 2);
1268 vst3_lane_u8(rgb+6, v, 4);
1269 vst3_lane_u8(rgb+9, v, 6);
1279 | cast<U32>(
to_fixed(g * 255)) << 8
1285 static constexpr float min = -0.752941f;
1286 static constexpr float max = 1.25098f;
1287 static constexpr float range =
max -
min;
1289 | cast<U32>(
to_fixed(((g -
min) / range) * 1023)) << 10
1290 | cast<U32>(
to_fixed(((
b -
min) / range) * 1023)) << 20);
1295 | cast<U32>(
to_fixed(g * 1023)) << 10
1301 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1302 assert( (ptr & 1) == 0 );
1303 uint16_t* rgb = (uint16_t*)ptr;
1304#if defined(USING_NEON)
1320 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1321 assert( (ptr & 1) == 0 );
1322 uint16_t*
rgba = (uint16_t*)ptr;
1323#if defined(USING_NEON)
1333 | cast<U64>(
to_fixed(g * 65535)) << 16
1341 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1342 assert( (ptr & 1) == 0 );
1343 uint16_t* rgb = (uint16_t*)ptr;
1344#if defined(USING_NEON)
1346 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(r))),
1347 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(g))),
1348 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(
b))),
1355 store_3(rgb+0, cast<U16>((
R & 0x00ff) << 8 | (
R & 0xff00) >> 8) );
1356 store_3(rgb+1, cast<U16>((
G & 0x00ff) << 8 | (
G & 0xff00) >> 8) );
1357 store_3(rgb+2, cast<U16>((
B & 0x00ff) << 8 | (
B & 0xff00) >> 8) );
1363 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1364 assert( (ptr & 1) == 0 );
1365 uint16_t*
rgba = (uint16_t*)ptr;
1366#if defined(USING_NEON)
1368 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(r))),
1369 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(g))),
1370 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(
b))),
1371 (uint16x4_t)swap_endian_16(cast<U16>(
U16_from_F(
a))),
1376 | cast<U64>(
to_fixed(g * 65535)) << 16
1384 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1385 assert( (ptr & 1) == 0 );
1386 uint16_t* rgb = (uint16_t*)ptr;
1391#if defined(USING_NEON)
1406 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1407 assert( (ptr & 1) == 0 );
1408 uint16_t*
rgba = (uint16_t*)ptr;
1414#if defined(USING_NEON)
1424 | cast<U64>(
G) << 16
1425 | cast<U64>(
B) << 32
1426 | cast<U64>(
A) << 48);
1431 uintptr_t ptr = (uintptr_t)(dst + 12*i);
1432 assert( (ptr & 3) == 0 );
1433 float* rgb = (
float*)ptr;
1434#if defined(USING_NEON)
1435 float32x4x3_t v = {{
1449 uintptr_t ptr = (uintptr_t)(dst + 16*i);
1450 assert( (ptr & 3) == 0 );
1451 float*
rgba = (
float*)ptr;
1452#if defined(USING_NEON)
1453 float32x4x4_t v = {{
1468#if SKCMS_HAS_MUSTTAIL
1470 SI void exec_stages(StageFn* stages,
const void** contexts,
const char* src,
char* dst,
int i) {
1477 const char* src,
char* dst,
int i) {
1481#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
1484#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
1495 const char* src,
char* dst,
int n,
1496 const size_t src_bpp,
const size_t dst_bpp) {
1497#if SKCMS_HAS_MUSTTAIL
1502 static constexpr StageFn kStageFns[] = {
1503#define M(name) &Exec_##name,
1509 for (ptrdiff_t index = 0; index < programSize; ++index) {
1510 stages[index] = kStageFns[(
int)program[index]];
1514 const Op* stages = program;
1524 char tmp[4*4*
N] = {0};
1526 memcpy(tmp, (
const char*)src + (
size_t)i*src_bpp, (
size_t)n*src_bpp);
1528 memcpy((
char*)dst + (
size_t)i*dst_bpp, tmp, (
size_t)n*dst_bpp);
static const uint32_t rgba[kNumPixels]
static unsigned clamp(SkFixed fx, int max)
static int sign(SkScalar x)
static uint32_t premul(uint32_t color)
Type::kYUV Type::kRGBA() int(0.7 *637)
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE auto & d
static float max(float r, float g, float b)
static float min(float r, float g, float b)
__attribute__((visibility("default"))) int RunBenchmarks(int argc
T __attribute__((ext_vector_type(N))) V
static float floorf_(float x)
// Both macros are defined empty here, so the annotations compile away.
// NOTE(review): presumably real versions (e.g. [[fallthrough]] /
// [[maybe_unused]]) are selected elsewhere when the compiler supports
// them — TODO confirm against the full file.
#define SKCMS_FALLTHROUGH
#define SKCMS_MAYBE_UNUSED