SkRasterPipeline_opts.h
1/*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef SkRasterPipeline_opts_DEFINED
9#define SkRasterPipeline_opts_DEFINED
10
15#include "modules/skcms/skcms.h"
16#include "src/base/SkUtils.h" // unaligned_{load,store}
21
22#include <cstdint>
23#include <type_traits>
24
25// Every function in this file should be marked static and inline using SI.
26#if defined(__clang__) || defined(__GNUC__)
27 #define SI __attribute__((always_inline)) static inline
28#else
29 #define SI static inline
30#endif
31
32#if defined(__clang__)
33 #define SK_UNROLL _Pragma("unroll")
34#else
35 #define SK_UNROLL
36#endif
37
38#if defined(__clang__)
39 template <int N, typename T> using Vec = T __attribute__((ext_vector_type(N)));
40#elif defined(__GNUC__)
41 // Unfortunately, GCC does not allow us to omit the struct. This will not compile:
42 // template <int N, typename T> using Vec = T __attribute__((vector_size(N*sizeof(T))));
43 template <int N, typename T> struct VecHelper {
44 typedef T __attribute__((vector_size(N * sizeof(T)))) V;
45 };
46 template <int N, typename T> using Vec = typename VecHelper<N, T>::V;
47#endif
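// [Editor's note: illustrative sketch, not part of the original file] With either definition
// above, Vec<N,T> behaves like a native SIMD value: arithmetic is element-wise and individual
// lanes are indexable. For example (hypothetical usage):
//     Vec<4, float> v = {1.0f, 2.0f, 3.0f, 4.0f};
//     v = v * 2.0f + 1.0f;    // per-lane multiply-add -> {3, 5, 7, 9}
//     float lane1 = v[1];     // 5.0f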
48
49template <typename Dst, typename Src>
50SI Dst widen_cast(const Src& src) {
51 static_assert(sizeof(Dst) > sizeof(Src));
54 Dst dst;
55 memcpy(&dst, &src, sizeof(Src));
56 return dst;
57}
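// [Editor's note: illustrative, not part of the original file] widen_cast() copies a smaller
// value into the low bytes of a larger one and leaves the upper bytes unspecified. The SSE
// paths below use it, for example, to place a 64-bit U16 vector into a 128-bit register:
//     __m128i r = widen_cast<__m128i>(v);    // v: Vec<4, uint16_t>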
58
59struct Ctx {
60 SkRasterPipelineStage* fStage;
61
62 template <typename T>
63 operator T*() {
64 return (T*)fStage->ctx;
65 }
66};
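// [Editor's note: hypothetical usage sketch, not part of the original file] Ctx adapts the
// type-erased void* stored in SkRasterPipelineStage::ctx back to whatever pointer type a
// stage expects, via the implicit operator T*():
//     const SkRasterPipeline_MemoryCtx* c = Ctx{stage};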
67
68using NoCtx = const void*;
69
70#if defined(JUMPER_IS_SCALAR) || defined(JUMPER_IS_NEON) || defined(JUMPER_IS_HSW) || \
71 defined(JUMPER_IS_SKX) || defined(JUMPER_IS_AVX) || defined(JUMPER_IS_SSE41) || \
72 defined(JUMPER_IS_SSE2)
73 // Honor the existing setting
74#elif !defined(__clang__) && !defined(__GNUC__)
75 #define JUMPER_IS_SCALAR
76#elif defined(SK_ARM_HAS_NEON)
77 #define JUMPER_IS_NEON
78#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
79 #define JUMPER_IS_SKX
80#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
81 #define JUMPER_IS_HSW
82#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
83 #define JUMPER_IS_AVX
84#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
85 #define JUMPER_IS_SSE41
86#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
87 #define JUMPER_IS_SSE2
88#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
89 #define JUMPER_IS_LASX
90#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
91 #define JUMPER_IS_LSX
92#else
93 #define JUMPER_IS_SCALAR
94#endif
95
96// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
97#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
98 // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
99 #if defined(__apple_build_version__) && __clang_major__ < 9
100 #define JUMPER_IS_SCALAR
101 #elif __clang_major__ < 5
102 #define JUMPER_IS_SCALAR
103 #endif
104
105 #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
106 #undef JUMPER_IS_NEON
107 #endif
108#endif
109
110#if defined(JUMPER_IS_SCALAR)
111 #include <math.h>
112#elif defined(JUMPER_IS_NEON)
113 #include <arm_neon.h>
114#elif defined(JUMPER_IS_LASX)
115 #include <lasxintrin.h>
116 #include <lsxintrin.h>
117#elif defined(JUMPER_IS_LSX)
118 #include <lsxintrin.h>
119#else
120 #include <immintrin.h>
121#endif
122
123// Notes:
124// * rcp_fast and rcp_precise both produce a reciprocal, but rcp_fast is an estimate with at least
125// 12 bits of precision while rcp_precise should be accurate to full float precision. For ARM rcp_precise
126// requires 2 Newton-Raphson refinement steps because its estimate has 8 bit precision, and for
127// Intel this requires one additional step because its estimate has 12 bit precision.
128//
129// * Don't call rcp_approx or rsqrt_approx directly; only use rcp_fast and rsqrt.
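// [Editor's note, not part of the original file] Each Newton-Raphson refinement of a
// reciprocal estimate e ~= 1/v has the form
//     e' = e * (2 - v*e)
// and roughly doubles the number of correct bits; this is exactly the shape used by the
// rcp_precise() implementations below.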
130
131namespace SK_OPTS_NS {
132#if defined(JUMPER_IS_SCALAR)
133 // This path should lead to portable scalar code.
134 using F = float ;
135 using I32 = int32_t;
136 using U64 = uint64_t;
137 using U32 = uint32_t;
138 using U16 = uint16_t;
139 using U8 = uint8_t ;
140
141 SI F min(F a, F b) { return fminf(a,b); }
142 SI I32 min(I32 a, I32 b) { return a < b ? a : b; }
143 SI U32 min(U32 a, U32 b) { return a < b ? a : b; }
144 SI F max(F a, F b) { return fmaxf(a,b); }
145 SI I32 max(I32 a, I32 b) { return a > b ? a : b; }
146 SI U32 max(U32 a, U32 b) { return a > b ? a : b; }
147
148 SI F mad(F f, F m, F a) { return a+f*m; }
149 SI F nmad(F f, F m, F a) { return a-f*m; }
150 SI F abs_ (F v) { return fabsf(v); }
151 SI I32 abs_ (I32 v) { return v < 0 ? -v : v; }
152 SI F floor_(F v) { return floorf(v); }
153 SI F ceil_(F v) { return ceilf(v); }
154 SI F rcp_approx(F v) { return 1.0f / v; } // use rcp_fast instead
155 SI F rsqrt_approx(F v) { return 1.0f / sqrtf(v); }
156 SI F sqrt_ (F v) { return sqrtf(v); }
157 SI F rcp_precise (F v) { return 1.0f / v; }
158
159 SI I32 iround(F v) { return (I32)(v + 0.5f); }
160 SI U32 round(F v) { return (U32)(v + 0.5f); }
161 SI U32 round(F v, F scale) { return (U32)(v*scale + 0.5f); }
162 SI U16 pack(U32 v) { return (U16)v; }
163 SI U8 pack(U16 v) { return (U8)v; }
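    // [Editor's note: hypothetical example, not part of the original file] round() and the two
    // pack() overloads chain to narrow a float in [0,1] down to an 8-bit channel value:
    //     U8 byte = pack(pack(round(v, 255.0f)));    // v = 0.5f -> 128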
164
165 SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
166 SI I32 if_then_else(I32 c, I32 t, I32 e) { return c ? t : e; }
167
168 SI bool any(I32 c) { return c != 0; }
169 SI bool all(I32 c) { return c != 0; }
170
171 template <typename T>
172 SI T gather(const T* p, U32 ix) { return p[ix]; }
173
174 SI void scatter_masked(I32 src, int* dst, U32 ix, I32 mask) {
175 dst[ix] = mask ? src : dst[ix];
176 }
177
178 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
179 *r = ptr[0];
180 *g = ptr[1];
181 }
182 SI void store2(uint16_t* ptr, U16 r, U16 g) {
183 ptr[0] = r;
184 ptr[1] = g;
185 }
186 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
187 *r = ptr[0];
188 *g = ptr[1];
189 *b = ptr[2];
190 *a = ptr[3];
191 }
192 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
193 ptr[0] = r;
194 ptr[1] = g;
195 ptr[2] = b;
196 ptr[3] = a;
197 }
198
199 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
200 *r = ptr[0];
201 *g = ptr[1];
202 *b = ptr[2];
203 *a = ptr[3];
204 }
205 SI void store4(float* ptr, F r, F g, F b, F a) {
206 ptr[0] = r;
207 ptr[1] = g;
208 ptr[2] = b;
209 ptr[3] = a;
210 }
211
212#elif defined(JUMPER_IS_NEON)
213 template <typename T> using V = Vec<4, T>;
214 using F = V<float >;
215 using I32 = V< int32_t>;
216 using U64 = V<uint64_t>;
217 using U32 = V<uint32_t>;
218 using U16 = V<uint16_t>;
219 using U8 = V<uint8_t >;
220
221 // We polyfill a few routines that Clang doesn't build into ext_vector_types.
222 SI F min(F a, F b) { return vminq_f32(a,b); }
223 SI I32 min(I32 a, I32 b) { return vminq_s32(a,b); }
224 SI U32 min(U32 a, U32 b) { return vminq_u32(a,b); }
225 SI F max(F a, F b) { return vmaxq_f32(a,b); }
226 SI I32 max(I32 a, I32 b) { return vmaxq_s32(a,b); }
227 SI U32 max(U32 a, U32 b) { return vmaxq_u32(a,b); }
228
229 SI F abs_ (F v) { return vabsq_f32(v); }
230 SI I32 abs_ (I32 v) { return vabsq_s32(v); }
231 SI F rcp_approx(F v) { auto e = vrecpeq_f32(v); return vrecpsq_f32 (v,e ) * e; }
232 SI F rcp_precise(F v) { auto e = rcp_approx(v); return vrecpsq_f32 (v,e ) * e; }
233 SI F rsqrt_approx(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
234
235 SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
236 SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
237
238 SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
239 SI I32 if_then_else(I32 c, I32 t, I32 e) { return vbslq_s32((U32)c,t,e); }
240
241 #if defined(SK_CPU_ARM64)
242 SI bool any(I32 c) { return vmaxvq_u32((U32)c) != 0; }
243 SI bool all(I32 c) { return vminvq_u32((U32)c) != 0; }
244
245 SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
246 SI F nmad(F f, F m, F a) { return vfmsq_f32(a,f,m); }
247 SI F floor_(F v) { return vrndmq_f32(v); }
248 SI F ceil_(F v) { return vrndpq_f32(v); }
249 SI F sqrt_(F v) { return vsqrtq_f32(v); }
250 SI I32 iround(F v) { return vcvtnq_s32_f32(v); }
251 SI U32 round(F v) { return vcvtnq_u32_f32(v); }
252 SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
253 #else
254 SI bool any(I32 c) { return c[0] | c[1] | c[2] | c[3]; }
255 SI bool all(I32 c) { return c[0] & c[1] & c[2] & c[3]; }
256
257 SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
258 SI F nmad(F f, F m, F a) { return vmlsq_f32(a,f,m); }
259
260 SI F floor_(F v) {
261 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
262 return roundtrip - if_then_else(roundtrip > v, F() + 1, F());
263 }
264
265 SI F ceil_(F v) {
266 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
267 return roundtrip + if_then_else(roundtrip < v, F() + 1, F());
268 }
269
270 SI F sqrt_(F v) {
271 auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v).
272 e *= vrsqrtsq_f32(v,e*e);
273 e *= vrsqrtsq_f32(v,e*e);
274 return v*e; // sqrt(v) == v*rsqrt(v).
275 }
276
277 SI I32 iround(F v) {
278 return vcvtq_s32_f32(v + 0.5f);
279 }
280
281 SI U32 round(F v) {
282 return vcvtq_u32_f32(v + 0.5f);
283 }
284
285 SI U32 round(F v, F scale) {
286 return vcvtq_u32_f32(mad(v, scale, F() + 0.5f));
287 }
288 #endif
289
290 template <typename T>
291 SI V<T> gather(const T* p, U32 ix) {
292 return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
293 }
294 SI void scatter_masked(I32 src, int* dst, U32 ix, I32 mask) {
295 I32 before = gather(dst, ix);
296 I32 after = if_then_else(mask, src, before);
297 dst[ix[0]] = after[0];
298 dst[ix[1]] = after[1];
299 dst[ix[2]] = after[2];
300 dst[ix[3]] = after[3];
301 }
302 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
303 uint16x4x2_t rg = vld2_u16(ptr);
304 *r = rg.val[0];
305 *g = rg.val[1];
306 }
307 SI void store2(uint16_t* ptr, U16 r, U16 g) {
308 vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
309 }
310 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
311 uint16x4x4_t rgba = vld4_u16(ptr);
312 *r = rgba.val[0];
313 *g = rgba.val[1];
314 *b = rgba.val[2];
315 *a = rgba.val[3];
316 }
317
318 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
319 vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
320 }
321 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
322 float32x4x4_t rgba = vld4q_f32(ptr);
323 *r = rgba.val[0];
324 *g = rgba.val[1];
325 *b = rgba.val[2];
326 *a = rgba.val[3];
327 }
328 SI void store4(float* ptr, F r, F g, F b, F a) {
329 vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
330 }
331
332#elif defined(JUMPER_IS_SKX)
333 template <typename T> using V = Vec<16, T>;
334 using F = V<float >;
335 using I32 = V< int32_t>;
336 using U64 = V<uint64_t>;
337 using U32 = V<uint32_t>;
338 using U16 = V<uint16_t>;
339 using U8 = V<uint8_t >;
340
341 SI F mad(F f, F m, F a) { return _mm512_fmadd_ps(f, m, a); }
342 SI F nmad(F f, F m, F a) { return _mm512_fnmadd_ps(f, m, a); }
343 SI F min(F a, F b) { return _mm512_min_ps(a,b); }
344 SI I32 min(I32 a, I32 b) { return (I32)_mm512_min_epi32((__m512i)a,(__m512i)b); }
345 SI U32 min(U32 a, U32 b) { return (U32)_mm512_min_epu32((__m512i)a,(__m512i)b); }
346 SI F max(F a, F b) { return _mm512_max_ps(a,b); }
347 SI I32 max(I32 a, I32 b) { return (I32)_mm512_max_epi32((__m512i)a,(__m512i)b); }
348 SI U32 max(U32 a, U32 b) { return (U32)_mm512_max_epu32((__m512i)a,(__m512i)b); }
349 SI F abs_ (F v) { return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
350 SI I32 abs_ (I32 v) { return (I32)_mm512_abs_epi32((__m512i)v); }
351 SI F floor_(F v) { return _mm512_floor_ps(v); }
352 SI F ceil_(F v) { return _mm512_ceil_ps(v); }
353 SI F rcp_approx(F v) { return _mm512_rcp14_ps (v); }
354 SI F rsqrt_approx (F v) { return _mm512_rsqrt14_ps(v); }
355 SI F sqrt_ (F v) { return _mm512_sqrt_ps (v); }
356 SI F rcp_precise (F v) {
357 F e = rcp_approx(v);
358 return _mm512_fnmadd_ps(v, e, _mm512_set1_ps(2.0f)) * e;
359 }
360 SI I32 iround(F v) { return (I32)_mm512_cvtps_epi32(v); }
361 SI U32 round(F v) { return (U32)_mm512_cvtps_epi32(v); }
362 SI U32 round(F v, F scale) { return (U32)_mm512_cvtps_epi32(v*scale); }
363 SI U16 pack(U32 v) {
364 __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
365 _mm512_extracti64x4_epi64((__m512i)v, 1));
366 return (U16)_mm256_permutex_epi64(rst, 216);
367 }
368 SI U8 pack(U16 v) {
369 __m256i rst = _mm256_packus_epi16((__m256i)v, (__m256i)v);
370 return (U8)_mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
371 }
372 SI F if_then_else(I32 c, F t, F e) {
373 __m512i mask = _mm512_set1_epi32(0x80000000);
374 __m512i aa = _mm512_and_si512((__m512i)c, mask);
375 return _mm512_mask_blend_ps(_mm512_test_epi32_mask(aa, aa),e,t);
376 }
377 SI I32 if_then_else(I32 c, I32 t, I32 e) {
378 __m512i mask = _mm512_set1_epi32(0x80000000);
379 __m512i aa = _mm512_and_si512((__m512i)c, mask);
380 return (I32)_mm512_mask_blend_epi32(_mm512_test_epi32_mask(aa, aa),(__m512i)e,(__m512i)t);
381 }
382 SI bool any(I32 c) {
383 __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
384 return mask32 != 0;
385 }
386 SI bool all(I32 c) {
387 __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
388 return mask32 == 0xffff;
389 }
390 template <typename T>
391 SI V<T> gather(const T* p, U32 ix) {
392 return V<T>{ p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
393 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
394 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
395 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
396 }
397 SI F gather(const float* p, U32 ix) { return _mm512_i32gather_ps((__m512i)ix, p, 4); }
398 SI U32 gather(const uint32_t* p, U32 ix) {
399 return (U32)_mm512_i32gather_epi32((__m512i)ix, p, 4); }
400 SI U64 gather(const uint64_t* p, U32 ix) {
401 __m512i parts[] = {
402 _mm512_i32gather_epi64(_mm512_castsi512_si256((__m512i)ix), p, 8),
403 _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)ix, 1), p, 8),
404 };
405 return sk_bit_cast<U64>(parts);
406 }
407 template <typename V, typename S>
408 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
409 V before = gather(dst, ix);
410 V after = if_then_else(mask, src, before);
411 dst[ix[0]] = after[0];
412 dst[ix[1]] = after[1];
413 dst[ix[2]] = after[2];
414 dst[ix[3]] = after[3];
415 dst[ix[4]] = after[4];
416 dst[ix[5]] = after[5];
417 dst[ix[6]] = after[6];
418 dst[ix[7]] = after[7];
419 dst[ix[8]] = after[8];
420 dst[ix[9]] = after[9];
421 dst[ix[10]] = after[10];
422 dst[ix[11]] = after[11];
423 dst[ix[12]] = after[12];
424 dst[ix[13]] = after[13];
425 dst[ix[14]] = after[14];
426 dst[ix[15]] = after[15];
427 }
428
429 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
430 __m256i _01234567 = _mm256_loadu_si256(((const __m256i*)ptr) + 0);
431 __m256i _89abcdef = _mm256_loadu_si256(((const __m256i*)ptr) + 1);
432
433 *r = (U16)_mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_mm256_slli_epi32
434 (_01234567, 16), 16), _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
435 *g = (U16)_mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_01234567, 16),
436 _mm256_srai_epi32(_89abcdef, 16)), 216);
437 }
438 SI void store2(uint16_t* ptr, U16 r, U16 g) {
439 __m256i _01234567 = _mm256_unpacklo_epi16((__m256i)r, (__m256i)g);
440 __m256i _89abcdef = _mm256_unpackhi_epi16((__m256i)r, (__m256i)g);
441 __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
442 _89abcdef, 1);
443 __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
444 _01234567 = _mm512_castsi512_si256(aa);
445 _89abcdef = _mm512_extracti64x4_epi64(aa, 1);
446
447 _mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
448 _mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
449 }
450
451 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
452 __m256i _0123 = _mm256_loadu_si256((const __m256i*)ptr),
453 _4567 = _mm256_loadu_si256(((const __m256i*)ptr) + 1),
454 _89ab = _mm256_loadu_si256(((const __m256i*)ptr) + 2),
455 _cdef = _mm256_loadu_si256(((const __m256i*)ptr) + 3);
456
457 auto a0 = _mm256_unpacklo_epi16(_0123, _4567),
458 a1 = _mm256_unpackhi_epi16(_0123, _4567),
459 b0 = _mm256_unpacklo_epi16(a0, a1),
460 b1 = _mm256_unpackhi_epi16(a0, a1),
461 a2 = _mm256_unpacklo_epi16(_89ab, _cdef),
462 a3 = _mm256_unpackhi_epi16(_89ab, _cdef),
463 b2 = _mm256_unpacklo_epi16(a2, a3),
464 b3 = _mm256_unpackhi_epi16(a2, a3),
465 rr = _mm256_unpacklo_epi64(b0, b2),
466 gg = _mm256_unpackhi_epi64(b0, b2),
467 bb = _mm256_unpacklo_epi64(b1, b3),
468 aa = _mm256_unpackhi_epi64(b1, b3);
469
470 *r = (U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), rr);
471 *g = (U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), gg);
472 *b = (U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), bb);
473 *a = (U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), aa);
474 }
475 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
476 auto rg012389ab = _mm256_unpacklo_epi16((__m256i)r, (__m256i)g),
477 rg4567cdef = _mm256_unpackhi_epi16((__m256i)r, (__m256i)g),
478 ba012389ab = _mm256_unpacklo_epi16((__m256i)b, (__m256i)a),
479 ba4567cdef = _mm256_unpackhi_epi16((__m256i)b, (__m256i)a);
480
481 auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
482 _23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
483 _45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
484 _67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);
485
486 auto _ab23 = _mm256_permutex_epi64(_23ab, 78),
487 _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0),
488 _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78),
489 _ef67 = _mm256_permutex_epi64(_67ef, 78),
490 _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0),
491 _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);
492
493 _mm256_storeu_si256((__m256i*)ptr, _0123);
494 _mm256_storeu_si256((__m256i*)ptr + 1, _4567);
495 _mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
496 _mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
497 }
498
499 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
500 F _048c, _159d, _26ae, _37bf;
501
502 _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr) );
503 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
504 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
505 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
506 _159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+4) );
507 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
508 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
509 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
510 _26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+8) );
511 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
512 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
513 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
514 _37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12) );
515 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
516 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
517 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);
518
519 F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
520 ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
521 rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
522 ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);
523
524 *r = (F)_mm512_unpacklo_ps(rg02468acf, rg13579bde);
525 *g = (F)_mm512_unpackhi_ps(rg02468acf, rg13579bde);
526 *b = (F)_mm512_unpacklo_ps(ba02468acf, ba13579bde);
527 *a = (F)_mm512_unpackhi_ps(ba02468acf, ba13579bde);
528 }
529
530 SI void store4(float* ptr, F r, F g, F b, F a) {
531 F rg014589cd = _mm512_unpacklo_ps(r, g),
532 rg2367abef = _mm512_unpackhi_ps(r, g),
533 ba014589cd = _mm512_unpacklo_ps(b, a),
534 ba2367abef = _mm512_unpackhi_ps(b, a);
535
536 F _048c = (F)_mm512_unpacklo_pd((__m512d)rg014589cd, (__m512d)ba014589cd),
537 _26ae = (F)_mm512_unpacklo_pd((__m512d)rg2367abef, (__m512d)ba2367abef),
538 _159d = (F)_mm512_unpackhi_pd((__m512d)rg014589cd, (__m512d)ba014589cd),
539 _37bf = (F)_mm512_unpackhi_pd((__m512d)rg2367abef, (__m512d)ba2367abef);
540
541 F _ae26 = (F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_26ae),
542 _bf37 = (F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_37bf),
543 _8c04 = (F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_048c),
544 _9d15 = (F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_159d);
545
546 __m512i index = _mm512_setr_epi32(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11);
547 F _0426 = (F)_mm512_permutex2var_pd((__m512d)_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
548 (__m512d)_ae26),
549 _1537 = (F)_mm512_permutex2var_pd((__m512d)_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
550 (__m512d)_bf37),
551 _5173 = _mm512_permutexvar_ps(index, _1537),
552 _0123 = (F)_mm512_permutex2var_pd((__m512d)_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
553 (__m512d)_5173);
554
555 F _5476 = (F)_mm512_permutex2var_pd((__m512d)_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
556 (__m512d)_0426),
557 _4567 = _mm512_permutexvar_ps(index, _5476),
558 _8cae = (F)_mm512_permutex2var_pd((__m512d)_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
559 (__m512d)_26ae),
560 _9dbf = (F)_mm512_permutex2var_pd((__m512d)_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
561 (__m512d)_37bf),
562 _d9fb = _mm512_permutexvar_ps(index, _9dbf),
563 _89ab = (F)_mm512_permutex2var_pd((__m512d)_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
564 (__m512d)_d9fb),
565 _dcfe = (F)_mm512_permutex2var_pd((__m512d)_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
566 (__m512d)_8cae),
567 _cdef = _mm512_permutexvar_ps(index, _dcfe);
568
569 _mm512_storeu_ps(ptr+0, _0123);
570 _mm512_storeu_ps(ptr+16, _4567);
571 _mm512_storeu_ps(ptr+32, _89ab);
572 _mm512_storeu_ps(ptr+48, _cdef);
573 }
574
575#elif defined(JUMPER_IS_HSW)
576 // These are __m256 and __m256i, but friendlier and strongly-typed.
577 template <typename T> using V = Vec<8, T>;
578 using F = V<float >;
579 using I32 = V< int32_t>;
580 using U64 = V<uint64_t>;
581 using U32 = V<uint32_t>;
582 using U16 = V<uint16_t>;
583 using U8 = V<uint8_t >;
584
585 SI F mad(F f, F m, F a) { return _mm256_fmadd_ps(f, m, a); }
586 SI F nmad(F f, F m, F a) { return _mm256_fnmadd_ps(f, m, a); }
587
588 SI F min(F a, F b) { return _mm256_min_ps(a,b); }
589 SI I32 min(I32 a, I32 b) { return (I32)_mm256_min_epi32((__m256i)a,(__m256i)b); }
590 SI U32 min(U32 a, U32 b) { return (U32)_mm256_min_epu32((__m256i)a,(__m256i)b); }
591 SI F max(F a, F b) { return _mm256_max_ps(a,b); }
592 SI I32 max(I32 a, I32 b) { return (I32)_mm256_max_epi32((__m256i)a,(__m256i)b); }
593 SI U32 max(U32 a, U32 b) { return (U32)_mm256_max_epu32((__m256i)a,(__m256i)b); }
594
595 SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
596 SI I32 abs_ (I32 v) { return (I32)_mm256_abs_epi32((__m256i)v); }
597 SI F floor_(F v) { return _mm256_floor_ps(v); }
598 SI F ceil_(F v) { return _mm256_ceil_ps(v); }
599 SI F rcp_approx(F v) { return _mm256_rcp_ps (v); } // use rcp_fast instead
600 SI F rsqrt_approx(F v) { return _mm256_rsqrt_ps(v); }
601 SI F sqrt_ (F v) { return _mm256_sqrt_ps (v); }
602 SI F rcp_precise (F v) {
603 F e = rcp_approx(v);
604 return _mm256_fnmadd_ps(v, e, _mm256_set1_ps(2.0f)) * e;
605 }
606
607 SI I32 iround(F v) { return (I32)_mm256_cvtps_epi32(v); }
608 SI U32 round(F v) { return (U32)_mm256_cvtps_epi32(v); }
609 SI U32 round(F v, F scale) { return (U32)_mm256_cvtps_epi32(v*scale); }
610 SI U16 pack(U32 v) {
611 return (U16)_mm_packus_epi32(_mm256_extractf128_si256((__m256i)v, 0),
612 _mm256_extractf128_si256((__m256i)v, 1));
613 }
614 SI U8 pack(U16 v) {
615 auto r = _mm_packus_epi16((__m128i)v,(__m128i)v);
616 return sk_unaligned_load<U8>(&r);
617 }
618
619 SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e, t, (__m256)c); }
620 SI I32 if_then_else(I32 c, I32 t, I32 e) {
621 return (I32)_mm256_blendv_ps((__m256)e, (__m256)t, (__m256)c);
622 }
623
624 // NOTE: This version of 'all' only works with mask values (true == all bits set)
625 SI bool any(I32 c) { return !_mm256_testz_si256((__m256i)c, _mm256_set1_epi32(-1)); }
626 SI bool all(I32 c) { return _mm256_testc_si256((__m256i)c, _mm256_set1_epi32(-1)); }
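    // [Editor's note, not part of the original file] "Mask values" means each lane is either
    // all zeros (false) or all ones (true), which is what vector comparisons produce, e.g.:
    //     I32 m = (I32)(a < b);      // each lane: 0 or -1
    //     if (all(m)) { /* every lane passed */ }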
627
628 template <typename T>
629 SI V<T> gather(const T* p, U32 ix) {
630 return V<T>{ p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
631 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
632 }
633 SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, (__m256i)ix, 4); }
634 SI U32 gather(const uint32_t* p, U32 ix) {
635 return (U32)_mm256_i32gather_epi32((const int*)p, (__m256i)ix, 4);
636 }
637 SI U64 gather(const uint64_t* p, U32 ix) {
638 __m256i parts[] = {
639 _mm256_i32gather_epi64(
640 (const long long int*)p, _mm256_extracti128_si256((__m256i)ix, 0), 8),
641 _mm256_i32gather_epi64(
642 (const long long int*)p, _mm256_extracti128_si256((__m256i)ix, 1), 8),
643 };
644 return sk_bit_cast<U64>(parts);
645 }
646 SI void scatter_masked(I32 src, int* dst, U32 ix, I32 mask) {
647 I32 before = gather(dst, ix);
648 I32 after = if_then_else(mask, src, before);
649 dst[ix[0]] = after[0];
650 dst[ix[1]] = after[1];
651 dst[ix[2]] = after[2];
652 dst[ix[3]] = after[3];
653 dst[ix[4]] = after[4];
654 dst[ix[5]] = after[5];
655 dst[ix[6]] = after[6];
656 dst[ix[7]] = after[7];
657 }
658
659 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
660 __m128i _0123 = _mm_loadu_si128(((const __m128i*)ptr) + 0),
661 _4567 = _mm_loadu_si128(((const __m128i*)ptr) + 1);
662 *r = (U16)_mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
663 _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
664 *g = (U16)_mm_packs_epi32(_mm_srai_epi32(_0123, 16),
665 _mm_srai_epi32(_4567, 16));
666 }
667 SI void store2(uint16_t* ptr, U16 r, U16 g) {
668 auto _0123 = _mm_unpacklo_epi16((__m128i)r, (__m128i)g),
669 _4567 = _mm_unpackhi_epi16((__m128i)r, (__m128i)g);
670 _mm_storeu_si128((__m128i*)ptr + 0, _0123);
671 _mm_storeu_si128((__m128i*)ptr + 1, _4567);
672 }
673
674 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
675 __m128i _01 = _mm_loadu_si128(((const __m128i*)ptr) + 0),
676 _23 = _mm_loadu_si128(((const __m128i*)ptr) + 1),
677 _45 = _mm_loadu_si128(((const __m128i*)ptr) + 2),
678 _67 = _mm_loadu_si128(((const __m128i*)ptr) + 3);
679
680 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
681 _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
682 _46 = _mm_unpacklo_epi16(_45, _67),
683 _57 = _mm_unpackhi_epi16(_45, _67);
684
685 auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
686 ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
687 rg4567 = _mm_unpacklo_epi16(_46, _57),
688 ba4567 = _mm_unpackhi_epi16(_46, _57);
689
690 *r = (U16)_mm_unpacklo_epi64(rg0123, rg4567);
691 *g = (U16)_mm_unpackhi_epi64(rg0123, rg4567);
692 *b = (U16)_mm_unpacklo_epi64(ba0123, ba4567);
693 *a = (U16)_mm_unpackhi_epi64(ba0123, ba4567);
694 }
695 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
696 auto rg0123 = _mm_unpacklo_epi16((__m128i)r, (__m128i)g), // r0 g0 r1 g1 r2 g2 r3 g3
697 rg4567 = _mm_unpackhi_epi16((__m128i)r, (__m128i)g), // r4 g4 r5 g5 r6 g6 r7 g7
698 ba0123 = _mm_unpacklo_epi16((__m128i)b, (__m128i)a),
699 ba4567 = _mm_unpackhi_epi16((__m128i)b, (__m128i)a);
700
701 auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
702 _23 = _mm_unpackhi_epi32(rg0123, ba0123),
703 _45 = _mm_unpacklo_epi32(rg4567, ba4567),
704 _67 = _mm_unpackhi_epi32(rg4567, ba4567);
705
706 _mm_storeu_si128((__m128i*)ptr + 0, _01);
707 _mm_storeu_si128((__m128i*)ptr + 1, _23);
708 _mm_storeu_si128((__m128i*)ptr + 2, _45);
709 _mm_storeu_si128((__m128i*)ptr + 3, _67);
710 }
711
712 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
713 F _04 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 0)),
714 _15 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 4)),
715 _26 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 8)),
716 _37 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+12));
717 _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
718 _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
719 _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
720 _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
721
722 F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5
723 ba0145 = _mm256_unpackhi_ps(_04,_15),
724 rg2367 = _mm256_unpacklo_ps(_26,_37),
725 ba2367 = _mm256_unpackhi_ps(_26,_37);
726
727 *r = (F)_mm256_unpacklo_pd((__m256d)rg0145, (__m256d)rg2367);
728 *g = (F)_mm256_unpackhi_pd((__m256d)rg0145, (__m256d)rg2367);
729 *b = (F)_mm256_unpacklo_pd((__m256d)ba0145, (__m256d)ba2367);
730 *a = (F)_mm256_unpackhi_pd((__m256d)ba0145, (__m256d)ba2367);
731 }
732 SI void store4(float* ptr, F r, F g, F b, F a) {
733 F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
734 rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
735 ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
736 ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
737
738 F _04 = (F)_mm256_unpacklo_pd((__m256d)rg0145, (__m256d)ba0145),// r0 g0 b0 a0 | r4 g4 b4 a4
739 _15 = (F)_mm256_unpackhi_pd((__m256d)rg0145, (__m256d)ba0145),// r1 ... | r5 ...
740 _26 = (F)_mm256_unpacklo_pd((__m256d)rg2367, (__m256d)ba2367),// r2 ... | r6 ...
741 _37 = (F)_mm256_unpackhi_pd((__m256d)rg2367, (__m256d)ba2367);// r3 ... | r7 ...
742
743 F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
744 _23 = _mm256_permute2f128_ps(_26, _37, 32),
745 _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
746 _67 = _mm256_permute2f128_ps(_26, _37, 49);
747 _mm256_storeu_ps(ptr+ 0, _01);
748 _mm256_storeu_ps(ptr+ 8, _23);
749 _mm256_storeu_ps(ptr+16, _45);
750 _mm256_storeu_ps(ptr+24, _67);
751 }
752
753#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
754 template <typename T> using V = Vec<4, T>;
755 using F = V<float >;
756 using I32 = V< int32_t>;
757 using U64 = V<uint64_t>;
758 using U32 = V<uint32_t>;
759 using U16 = V<uint16_t>;
760 using U8 = V<uint8_t >;
761
762 SI F if_then_else(I32 c, F t, F e) {
763 return _mm_or_ps(_mm_and_ps((__m128)c, t), _mm_andnot_ps((__m128)c, e));
764 }
765 SI I32 if_then_else(I32 c, I32 t, I32 e) {
766 return (I32)_mm_or_ps(_mm_and_ps((__m128)c, (__m128)t),
767 _mm_andnot_ps((__m128)c, (__m128)e));
768 }
769
770 SI F min(F a, F b) { return _mm_min_ps(a,b); }
771 SI F max(F a, F b) { return _mm_max_ps(a,b); }
772#if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
773 SI I32 min(I32 a, I32 b) { return (I32)_mm_min_epi32((__m128i)a,(__m128i)b); }
774 SI U32 min(U32 a, U32 b) { return (U32)_mm_min_epu32((__m128i)a,(__m128i)b); }
775 SI I32 max(I32 a, I32 b) { return (I32)_mm_max_epi32((__m128i)a,(__m128i)b); }
776 SI U32 max(U32 a, U32 b) { return (U32)_mm_max_epu32((__m128i)a,(__m128i)b); }
777#else
778 SI I32 min(I32 a, I32 b) { return if_then_else(a < b, a, b); }
779 SI I32 max(I32 a, I32 b) { return if_then_else(a > b, a, b); }
780 SI U32 min(U32 a, U32 b) {
781 return sk_bit_cast<U32>(if_then_else(a < b, sk_bit_cast<I32>(a), sk_bit_cast<I32>(b)));
782 }
783 SI U32 max(U32 a, U32 b) {
784 return sk_bit_cast<U32>(if_then_else(a > b, sk_bit_cast<I32>(a), sk_bit_cast<I32>(b)));
785 }
786#endif
787
788 SI F mad(F f, F m, F a) { return a+f*m; }
789 SI F nmad(F f, F m, F a) { return a-f*m; }
790 SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
791#if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
792 SI I32 abs_(I32 v) { return (I32)_mm_abs_epi32((__m128i)v); }
793#else
794 SI I32 abs_(I32 v) { return max(v, -v); }
795#endif
796 SI F rcp_approx(F v) { return _mm_rcp_ps (v); } // use rcp_fast instead
797 SI F rcp_precise (F v) { F e = rcp_approx(v); return e * (2.0f - v * e); }
798 SI F rsqrt_approx(F v) { return _mm_rsqrt_ps(v); }
799 SI F sqrt_(F v) { return _mm_sqrt_ps (v); }
800
801 SI I32 iround(F v) { return (I32)_mm_cvtps_epi32(v); }
802 SI U32 round(F v) { return (U32)_mm_cvtps_epi32(v); }
803 SI U32 round(F v, F scale) { return (U32)_mm_cvtps_epi32(v*scale); }
804
805 SI U16 pack(U32 v) {
806 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
807 auto p = _mm_packus_epi32((__m128i)v,(__m128i)v);
808 #else
809 // Sign extend so that _mm_packs_epi32() does the pack we want.
810 auto p = _mm_srai_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
811 p = _mm_packs_epi32(p,p);
812 #endif
813 return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
814 }
815 SI U8 pack(U16 v) {
816 auto r = widen_cast<__m128i>(v);
817 r = _mm_packus_epi16(r,r);
818 return sk_unaligned_load<U8>(&r);
819 }
820
821 // NOTE: This only checks the top bit of each lane, and is incorrect with non-mask values.
822 SI bool any(I32 c) { return _mm_movemask_ps(sk_bit_cast<F>(c)) != 0b0000; }
823 SI bool all(I32 c) { return _mm_movemask_ps(sk_bit_cast<F>(c)) == 0b1111; }
824
825 SI F floor_(F v) {
826 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
827 return _mm_floor_ps(v);
828 #else
829 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
830 return roundtrip - if_then_else(roundtrip > v, F() + 1, F() + 0);
831 #endif
832 }
833
834 SI F ceil_(F v) {
835 #if defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
836 return _mm_ceil_ps(v);
837 #else
838 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
839 return roundtrip + if_then_else(roundtrip < v, F() + 1, F() + 0);
840 #endif
841 }
842
843 template <typename T>
844 SI V<T> gather(const T* p, U32 ix) {
845 return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
846 }
847 SI void scatter_masked(I32 src, int* dst, U32 ix, I32 mask) {
848 I32 before = gather(dst, ix);
849 I32 after = if_then_else(mask, src, before);
850 dst[ix[0]] = after[0];
851 dst[ix[1]] = after[1];
852 dst[ix[2]] = after[2];
853 dst[ix[3]] = after[3];
854 }
855 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
856 __m128i _01 = _mm_loadu_si128(((const __m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3
857 auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3
858 auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3
859
860 auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3
861 auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3
862 *r = sk_unaligned_load<U16>(&R);
863 *g = sk_unaligned_load<U16>(&G);
864 }
865 SI void store2(uint16_t* ptr, U16 r, U16 g) {
866 __m128i rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
867 _mm_storeu_si128((__m128i*)ptr + 0, rg);
868 }
869
870 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
871 __m128i _01 = _mm_loadu_si128(((const __m128i*)ptr) + 0), // r0 g0 b0 a0 r1 g1 b1 a1
872 _23 = _mm_loadu_si128(((const __m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
873
874 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
875 _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
876
877 auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
878 ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
879
880 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
881 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
882 *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
883 *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
884 }
885
886 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
887 auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
888 ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
889
890 _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
891 _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
892 }
893
894 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
895 F _0 = _mm_loadu_ps(ptr + 0),
896 _1 = _mm_loadu_ps(ptr + 4),
897 _2 = _mm_loadu_ps(ptr + 8),
898 _3 = _mm_loadu_ps(ptr +12);
899 _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
900 *r = _0;
901 *g = _1;
902 *b = _2;
903 *a = _3;
904 }
905
906 SI void store4(float* ptr, F r, F g, F b, F a) {
907 _MM_TRANSPOSE4_PS(r,g,b,a);
908 _mm_storeu_ps(ptr + 0, r);
909 _mm_storeu_ps(ptr + 4, g);
910 _mm_storeu_ps(ptr + 8, b);
911 _mm_storeu_ps(ptr +12, a);
912 }
913
914#elif defined(JUMPER_IS_LASX)
915 // These are __m256 and __m256i, but friendlier and strongly-typed.
916 template <typename T> using V = Vec<8, T>;
917 using F = V<float >;
918 using I32 = V<int32_t>;
919 using U64 = V<uint64_t>;
920 using U32 = V<uint32_t>;
921 using U16 = V<uint16_t>;
922 using U8 = V<uint8_t >;
923
924 SI __m128i emulate_lasx_d_xr2vr_l(__m256i a) {
925 v4i64 tmp = a;
926 v2i64 al = {tmp[0], tmp[1]};
927 return (__m128i)al;
928 }
929
930 SI __m128i emulate_lasx_d_xr2vr_h(__m256i a) {
931 v4i64 tmp = a;
932 v2i64 ah = {tmp[2], tmp[3]};
933 return (__m128i)ah;
934 }
935
936 SI F if_then_else(I32 c, F t, F e) {
937 return sk_bit_cast<Vec<8,float>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
938 sk_bit_cast<__m256i>(t),
939 sk_bit_cast<__m256i>(c)));
940 }
941
942 SI I32 if_then_else(I32 c, I32 t, I32 e) {
943 return sk_bit_cast<Vec<8,int32_t>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
944 sk_bit_cast<__m256i>(t),
945 sk_bit_cast<__m256i>(c)));
946 }
947
948 SI F min(F a, F b) { return __lasx_xvfmin_s(a,b); }
949 SI F max(F a, F b) { return __lasx_xvfmax_s(a,b); }
950 SI I32 min(I32 a, I32 b) { return __lasx_xvmin_w(a,b); }
951 SI U32 min(U32 a, U32 b) { return __lasx_xvmin_wu(a,b); }
952 SI I32 max(I32 a, I32 b) { return __lasx_xvmax_w(a,b); }
953 SI U32 max(U32 a, U32 b) { return __lasx_xvmax_wu(a,b); }
954
955 SI F mad(F f, F m, F a) { return __lasx_xvfmadd_s(f, m, a); }
956 SI F nmad(F f, F m, F a) { return __lasx_xvfmadd_s(-f, m, a); }
957 SI F abs_ (F v) { return (F)__lasx_xvand_v((I32)v, (I32)(0-v)); }
958 SI I32 abs_(I32 v) { return max(v, -v); }
959 SI F rcp_approx(F v) { return __lasx_xvfrecip_s(v); }
960 SI F rcp_precise (F v) { F e = rcp_approx(v); return e * nmad(v, e, 2.0f); }
961 SI F rsqrt_approx (F v) { return __lasx_xvfrsqrt_s(v); }
962 SI F sqrt_(F v) { return __lasx_xvfsqrt_s(v); }
963
964 SI U32 iround(F v) {
965 F t = F(0.5);
966 return __lasx_xvftintrz_w_s(v + t);
967 }
968
969 SI U32 round(F v) {
970 F t = F(0.5);
971 return __lasx_xvftintrz_w_s(v + t);
972 }
973
974 SI U32 round(F v, F scale) {
975 F t = F(0.5);
976 return __lasx_xvftintrz_w_s(mad(v, scale, t));
977 }
978
979 SI U16 pack(U32 v) {
980 return __lsx_vpickev_h(__lsx_vsat_wu(emulate_lasx_d_xr2vr_h(v), 15),
981 __lsx_vsat_wu(emulate_lasx_d_xr2vr_l(v), 15));
982 }
983
984 SI U8 pack(U16 v) {
985 __m128i tmp = __lsx_vsat_hu(v, 7);
986 auto r = __lsx_vpickev_b(tmp, tmp);
987 return sk_unaligned_load<U8>(&r);
988 }
989
990 SI bool any(I32 c){
991 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c));
992 return (retv[0] | retv[4]) != 0b0000;
993 }
994
995 SI bool all(I32 c){
996 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c));
997 return (retv[0] & retv[4]) == 0b1111;
998 }
999
1000 SI F floor_(F v) {
1001 return __lasx_xvfrintrm_s(v);
1002 }
1003
1004 SI F ceil_(F v) {
1005 return __lasx_xvfrintrp_s(v);
1006 }
1007
1008 template <typename T>
1009 SI V<T> gather(const T* p, U32 ix) {
1010 return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
1011 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
1012 }
1013
1014 template <typename V, typename S>
1015 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
1016 V before = gather(dst, ix);
1017 V after = if_then_else(mask, src, before);
1018 dst[ix[0]] = after[0];
1019 dst[ix[1]] = after[1];
1020 dst[ix[2]] = after[2];
1021 dst[ix[3]] = after[3];
1022 dst[ix[4]] = after[4];
1023 dst[ix[5]] = after[5];
1024 dst[ix[6]] = after[6];
1025 dst[ix[7]] = after[7];
1026 }
1027
1028 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
1029 U16 _0123 = __lsx_vld(ptr, 0),
1030 _4567 = __lsx_vld(ptr, 16);
1031 *r = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_4567, 16), 16), 15),
1032 __lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_0123, 16), 16), 15));
1033 *g = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(_4567, 16), 15),
1034 __lsx_vsat_w(__lsx_vsrai_w(_0123, 16), 15));
1035 }
1036 SI void store2(uint16_t* ptr, U16 r, U16 g) {
1037 auto _0123 = __lsx_vilvl_h(g, r),
1038 _4567 = __lsx_vilvh_h(g, r);
1039 __lsx_vst(_0123, ptr, 0);
1040 __lsx_vst(_4567, ptr, 16);
1041 }
1042
1043 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
1044 __m128i _01 = __lsx_vld(ptr, 0),
1045 _23 = __lsx_vld(ptr, 16),
1046 _45 = __lsx_vld(ptr, 32),
1047 _67 = __lsx_vld(ptr, 48);
1048
1049 auto _02 = __lsx_vilvl_h(_23, _01), // r0 r2 g0 g2 b0 b2 a0 a2
1050 _13 = __lsx_vilvh_h(_23, _01), // r1 r3 g1 g3 b1 b3 a1 a3
1051 _46 = __lsx_vilvl_h(_67, _45),
1052 _57 = __lsx_vilvh_h(_67, _45);
1053
1054 auto rg0123 = __lsx_vilvl_h(_13, _02), // r0 r1 r2 r3 g0 g1 g2 g3
1055 ba0123 = __lsx_vilvh_h(_13, _02), // b0 b1 b2 b3 a0 a1 a2 a3
1056 rg4567 = __lsx_vilvl_h(_57, _46),
1057 ba4567 = __lsx_vilvh_h(_57, _46);
1058
1059 *r = __lsx_vilvl_d(rg4567, rg0123);
1060 *g = __lsx_vilvh_d(rg4567, rg0123);
1061 *b = __lsx_vilvl_d(ba4567, ba0123);
1062 *a = __lsx_vilvh_d(ba4567, ba0123);
1063 }
1064
1065 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
1066 auto rg0123 = __lsx_vilvl_h(g, r), // r0 g0 r1 g1 r2 g2 r3 g3
1067 rg4567 = __lsx_vilvh_h(g, r), // r4 g4 r5 g5 r6 g6 r7 g7
1068 ba0123 = __lsx_vilvl_h(a, b),
1069 ba4567 = __lsx_vilvh_h(a, b);
1070
1071 auto _01 =__lsx_vilvl_w(ba0123, rg0123),
1072 _23 =__lsx_vilvh_w(ba0123, rg0123),
1073 _45 =__lsx_vilvl_w(ba4567, rg4567),
1074 _67 =__lsx_vilvh_w(ba4567, rg4567);
1075
1076 __lsx_vst(_01, ptr, 0);
1077 __lsx_vst(_23, ptr, 16);
1078 __lsx_vst(_45, ptr, 32);
1079 __lsx_vst(_67, ptr, 48);
1080 }
1081
1082 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
1083 F _04 = (F)__lasx_xvpermi_q(__lasx_xvld(ptr, 0), __lasx_xvld(ptr, 64), 0x02);
1084 F _15 = (F)__lasx_xvpermi_q(__lasx_xvld(ptr, 16), __lasx_xvld(ptr, 80), 0x02);
1085 F _26 = (F)__lasx_xvpermi_q(__lasx_xvld(ptr, 32), __lasx_xvld(ptr, 96), 0x02);
1086 F _37 = (F)__lasx_xvpermi_q(__lasx_xvld(ptr, 48), __lasx_xvld(ptr, 112), 0x02);
1087
1088 F rg0145 = (F)__lasx_xvilvl_w((__m256i)_15, (__m256i)_04), // r0 r1 g0 g1 | r4 r5 g4 g5
1089 ba0145 = (F)__lasx_xvilvh_w((__m256i)_15, (__m256i)_04),
1090 rg2367 = (F)__lasx_xvilvl_w((__m256i)_37, (__m256i)_26),
1091 ba2367 = (F)__lasx_xvilvh_w((__m256i)_37, (__m256i)_26);
1092
1093 *r = (F)__lasx_xvilvl_d((__m256i)rg2367, (__m256i)rg0145);
1094 *g = (F)__lasx_xvilvh_d((__m256i)rg2367, (__m256i)rg0145);
1095 *b = (F)__lasx_xvilvl_d((__m256i)ba2367, (__m256i)ba0145);
1096 *a = (F)__lasx_xvilvh_d((__m256i)ba2367, (__m256i)ba0145);
1097 }
1098 SI void store4(float* ptr, F r, F g, F b, F a) {
1099 F rg0145 = (F)__lasx_xvilvl_w((__m256i)g, (__m256i)r), // r0 g0 r1 g1 | r4 g4 r5 g5
1100 rg2367 = (F)__lasx_xvilvh_w((__m256i)g, (__m256i)r), // r2 ... | r6 ...
1101 ba0145 = (F)__lasx_xvilvl_w((__m256i)a, (__m256i)b), // b0 a0 b1 a1 | b4 a4 b5 a5
1102 ba2367 = (F)__lasx_xvilvh_w((__m256i)a, (__m256i)b); // b2 ... | b6 ...
1103
1104 F _04 = (F)__lasx_xvilvl_d((__m256i)ba0145, (__m256i)rg0145), // r0 g0 b0 a0 | r4 g4 b4 a4
1105 _15 = (F)__lasx_xvilvh_d((__m256i)ba0145, (__m256i)rg0145), // r1 ... | r5 ...
1106 _26 = (F)__lasx_xvilvl_d((__m256i)ba2367, (__m256i)rg2367), // r2 ... | r6 ...
1107 _37 = (F)__lasx_xvilvh_d((__m256i)ba2367, (__m256i)rg2367); // r3 ... | r7 ...
1108
1109 F _01 = (F)__lasx_xvpermi_q((__m256i)_04, (__m256i)_15, 0x02),
1110 _23 = (F)__lasx_xvpermi_q((__m256i)_26, (__m256i)_37, 0x02),
1111 _45 = (F)__lasx_xvpermi_q((__m256i)_04, (__m256i)_15, 0x13),
1112 _67 = (F)__lasx_xvpermi_q((__m256i)_26, (__m256i)_37, 0x13);
1113 __lasx_xvst(_01, ptr, 0);
1114 __lasx_xvst(_23, ptr, 32);
1115 __lasx_xvst(_45, ptr, 64);
1116 __lasx_xvst(_67, ptr, 96);
1117 }
1118
1119#elif defined(JUMPER_IS_LSX)
1120 template <typename T> using V = Vec<4, T>;
1121 using F = V<float >;
1122 using I32 = V<int32_t >;
1123 using U64 = V<uint64_t>;
1124 using U32 = V<uint32_t>;
1125 using U16 = V<uint16_t>;
1126 using U8 = V<uint8_t >;
1127
1128 #define _LSX_TRANSPOSE4_S(row0, row1, row2, row3) \
1129 do { \
1130 __m128 __t0 = (__m128)__lsx_vilvl_w ((__m128i)row1, (__m128i)row0); \
1131 __m128 __t1 = (__m128)__lsx_vilvl_w ((__m128i)row3, (__m128i)row2); \
1132 __m128 __t2 = (__m128)__lsx_vilvh_w ((__m128i)row1, (__m128i)row0); \
1133 __m128 __t3 = (__m128)__lsx_vilvh_w ((__m128i)row3, (__m128i)row2); \
1134 (row0) = (__m128)__lsx_vilvl_d ((__m128i)__t1, (__m128i)__t0); \
1135 (row1) = (__m128)__lsx_vilvh_d ((__m128i)__t1, (__m128i)__t0); \
1136 (row2) = (__m128)__lsx_vilvl_d ((__m128i)__t3, (__m128i)__t2); \
1137 (row3) = (__m128)__lsx_vilvh_d ((__m128i)__t3, (__m128i)__t2); \
1138 } while (0)
1139
1140 SI F if_then_else(I32 c, F t, F e) {
1141 return sk_bit_cast<Vec<4,float>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
1142 sk_bit_cast<__m128i>(t),
1143 sk_bit_cast<__m128i>(c)));
1144 }
1145
1146 SI I32 if_then_else(I32 c, I32 t, I32 e) {
1147 return sk_bit_cast<Vec<4,int32_t>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
1148 sk_bit_cast<__m128i>(t),
1149 sk_bit_cast<__m128i>(c)));
1150 }
1151
1152 SI F min(F a, F b) { return __lsx_vfmin_s(a,b); }
1153 SI F max(F a, F b) { return __lsx_vfmax_s(a,b); }
1154 SI I32 min(I32 a, I32 b) { return __lsx_vmin_w(a,b); }
1155 SI U32 min(U32 a, U32 b) { return __lsx_vmin_wu(a,b); }
1156 SI I32 max(I32 a, I32 b) { return __lsx_vmax_w(a,b); }
1157 SI U32 max(U32 a, U32 b) { return __lsx_vmax_wu(a,b); }
1158
1159 SI F mad(F f, F m, F a) { return __lsx_vfmadd_s(f, m, a); }
1160 SI F nmad(F f, F m, F a) { return __lsx_vfmadd_s(-f, m, a); }
1161 SI F abs_(F v) { return (F)__lsx_vand_v((I32)v, (I32)(0-v)); }
1162 SI I32 abs_(I32 v) { return max(v, -v); }
1163 SI F rcp_approx (F v) { return __lsx_vfrecip_s(v); }
1164 SI F rcp_precise (F v) { F e = rcp_approx(v); return e * nmad(v, e, 2.0f); }
1165 SI F rsqrt_approx (F v) { return __lsx_vfrsqrt_s(v); }
1166 SI F sqrt_(F v) { return __lsx_vfsqrt_s (v); }
1167
1168 SI U32 iround(F v) {
1169 F t = F(0.5);
1170 return __lsx_vftintrz_w_s(v + t); }
1171
1172 SI U32 round(F v) {
1173 F t = F(0.5);
1174 return __lsx_vftintrz_w_s(v + t); }
1175
1176 SI U32 round(F v, F scale) {
1177 F t = F(0.5);
1178 return __lsx_vftintrz_w_s(mad(v, scale, t)); }
1179
1180 SI U16 pack(U32 v) {
1181 __m128i tmp = __lsx_vsat_wu(v, 15);
1182 auto p = __lsx_vpickev_h(tmp, tmp);
1183 return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
1184 }
1185
1186 SI U8 pack(U16 v) {
1187 auto r = widen_cast<__m128i>(v);
1188 __m128i tmp = __lsx_vsat_hu(r, 7);
1189 r = __lsx_vpickev_b(tmp, tmp);
1190 return sk_unaligned_load<U8>(&r);
1191 }
1192
1193 SI bool any(I32 c){
1194 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c));
1195 return retv[0] != 0b0000;
1196 }
1197
1198 SI bool all(I32 c){
1199 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c));
1200 return retv[0] == 0b1111;
1201 }
1202
1203 SI F floor_(F v) {
1204 return __lsx_vfrintrm_s(v);
1205 }
1206
1207 SI F ceil_(F v) {
1208 return __lsx_vfrintrp_s(v);
1209 }
1210
1211 template <typename T>
1212 SI V<T> gather(const T* p, U32 ix) {
1213 return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
1214 }
1215
1216 template <typename V, typename S>
1217 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
1218 V before = gather(dst, ix);
1219 V after = if_then_else(mask, src, before);
1220 dst[ix[0]] = after[0];
1221 dst[ix[1]] = after[1];
1222 dst[ix[2]] = after[2];
1223 dst[ix[3]] = after[3];
1224 }
1225
1226 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
1227 __m128i _01 = __lsx_vld(ptr, 0); // r0 g0 r1 g1 r2 g2 r3 g3
1228 auto rg = __lsx_vshuf4i_h(_01, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3
1229
1230 auto R = __lsx_vshuf4i_w(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3
1231 auto G = __lsx_vshuf4i_w(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3
1232 *r = sk_unaligned_load<U16>(&R);
1233 *g = sk_unaligned_load<U16>(&G);
1234 }
1235
1236 SI void store2(uint16_t* ptr, U16 r, U16 g) {
1237 U32 rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r));
1238 __lsx_vst(rg, ptr, 0);
1239 }
1240
1241 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
1242 __m128i _01 = __lsx_vld(ptr, 0), // r0 g0 b0 a0 r1 g1 b1 a1
1243 _23 = __lsx_vld(ptr, 16); // r2 g2 b2 a2 r3 g3 b3 a3
1244
1245 auto _02 = __lsx_vilvl_h(_23, _01), // r0 r2 g0 g2 b0 b2 a0 a2
1246 _13 = __lsx_vilvh_h(_23, _01); // r1 r3 g1 g3 b1 b3 a1 a3
1247
1248 auto rg = __lsx_vilvl_h(_13, _02), // r0 r1 r2 r3 g0 g1 g2 g3
1249 ba = __lsx_vilvh_h(_13, _02); // b0 b1 b2 b3 a0 a1 a2 a3
1250
1251 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
1252 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
1253 *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
1254 *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
1255 }
1256
1257 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
1258 auto rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r)),
1259 ba = __lsx_vilvl_h(widen_cast<__m128i>(a), widen_cast<__m128i>(b));
1260
1261 __lsx_vst(__lsx_vilvl_w(ba, rg), ptr, 0);
1262 __lsx_vst(__lsx_vilvh_w(ba, rg), ptr, 16);
1263 }
1264
1265 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
1266 F _0 = (F)__lsx_vld(ptr, 0),
1267 _1 = (F)__lsx_vld(ptr, 16),
1268 _2 = (F)__lsx_vld(ptr, 32),
1269 _3 = (F)__lsx_vld(ptr, 48);
1270 _LSX_TRANSPOSE4_S(_0,_1,_2,_3);
1271 *r = _0;
1272 *g = _1;
1273 *b = _2;
1274 *a = _3;
1275 }
1276
1277 SI void store4(float* ptr, F r, F g, F b, F a) {
1278 _LSX_TRANSPOSE4_S(r,g,b,a);
1279 __lsx_vst(r, ptr, 0);
1280 __lsx_vst(g, ptr, 16);
1281 __lsx_vst(b, ptr, 32);
1282 __lsx_vst(a, ptr, 48);
1283 }
1284
1285#endif
1286
1287// Helpers to do scalar -> vector promotion on GCC (clang does this automatically)
1288// We need to subtract (not add) zero to keep float conversion zero-cost. See:
1289// https://stackoverflow.com/q/48255293
1290//
1291// The GCC implementation should be usable everywhere, but Mac clang (only) complains that the
1292// expressions make these functions not constexpr.
1293//
1294// Further: We can't use the subtract-zero version in scalar mode. There, the subtraction will
1295// really happen (at least at low optimization levels), which can alter the bit pattern of NaNs.
1296// Because F_() is used when copying uniforms (even integer uniforms), this can corrupt values.
1297// The vector subtraction of zero doesn't appear to ever alter NaN bit patterns.
1298#if defined(__clang__) || defined(JUMPER_IS_SCALAR)
1299SI constexpr F F_(float x) { return x; }
1300SI constexpr I32 I32_(int32_t x) { return x; }
1301SI constexpr U32 U32_(uint32_t x) { return x; }
1302#else
1303SI constexpr F F_(float x) { return x - F(); }
1304SI constexpr I32 I32_(int32_t x) { return x + I32(); }
1305SI constexpr U32 U32_(uint32_t x) { return x + U32(); }
1306#endif
1307
1308// Extremely helpful literals:
1309static constexpr F F0 = F_(0.0f),
1310 F1 = F_(1.0f);
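// [Editor's note: hypothetical usage, not part of the original file] F_(), I32_() and U32_()
// splat a scalar across all lanes, so uniforms mix freely with vector values:
//     F half = F_(0.5f);       // {0.5, 0.5, ...}
//     F t    = min(x, F1);     // clamp x against the F1 literal defined above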
1311
1312#if !defined(JUMPER_IS_SCALAR)
1313 SI F min(F a, float b) { return min(a, F_(b)); }
1314 SI F min(float a, F b) { return min(F_(a), b); }
1315 SI F max(F a, float b) { return max(a, F_(b)); }
1316 SI F max(float a, F b) { return max(F_(a), b); }
1317
1318 SI F mad(F f, F m, float a) { return mad(f, m, F_(a)); }
1319 SI F mad(F f, float m, F a) { return mad(f, F_(m), a); }
1320 SI F mad(F f, float m, float a) { return mad(f, F_(m), F_(a)); }
1321 SI F mad(float f, F m, F a) { return mad(F_(f), m, a); }
1322 SI F mad(float f, F m, float a) { return mad(F_(f), m, F_(a)); }
1323 SI F mad(float f, float m, F a) { return mad(F_(f), F_(m), a); }
1324
1325 SI F nmad(F f, F m, float a) { return nmad(f, m, F_(a)); }
1326 SI F nmad(F f, float m, F a) { return nmad(f, F_(m), a); }
1327 SI F nmad(F f, float m, float a) { return nmad(f, F_(m), F_(a)); }
1328 SI F nmad(float f, F m, F a) { return nmad(F_(f), m, a); }
1329 SI F nmad(float f, F m, float a) { return nmad(F_(f), m, F_(a)); }
1330 SI F nmad(float f, float m, F a) { return nmad(F_(f), F_(m), a); }
1331#endif
1332
1333// We need to be careful with casts.
1334// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
1335// These named casts and bit_cast() are always what they seem to be.
1336#if defined(JUMPER_IS_SCALAR)
1337 SI F cast (U32 v) { return (F)v; }
1338 SI F cast64(U64 v) { return (F)v; }
1339 SI U32 trunc_(F v) { return (U32)v; }
1340 SI U32 expand(U16 v) { return (U32)v; }
1341 SI U32 expand(U8 v) { return (U32)v; }
1342#else
1343 SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
1344 SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
1345 SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }
1346 SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
1347 SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
1348#endif
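// [Editor's note: hypothetical example, not part of the original file] A typical use of
// expand() + cast() is turning 8-bit channel data into floats in [0,1], the shape that
// byte-loading code generally takes:
//     F unit = cast(expand(byte)) * (1 / 255.0f);    // byte: a U8 vector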
1349
1350#if !defined(JUMPER_IS_SCALAR)
1351SI F if_then_else(I32 c, F t, float e) { return if_then_else(c, t , F_(e)); }
1352SI F if_then_else(I32 c, float t, F e) { return if_then_else(c, F_(t), e ); }
1353SI F if_then_else(I32 c, float t, float e) { return if_then_else(c, F_(t), F_(e)); }
1354#endif
1355
1356SI F fract(F v) { return v - floor_(v); }
1357
1358// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html
1359SI F approx_log2(F x) {
1360 // e - 127 is a fair approximation of log2(x) in its own right...
1361 F e = cast(sk_bit_cast<U32>(x)) * (1.0f / (1<<23));
1362
1363 // ... but using the mantissa to refine its error is _much_ better.
1364 F m = sk_bit_cast<F>((sk_bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
1365
1366 return nmad(m, 1.498030302f, e - 124.225514990f) - 1.725879990f / (0.3520887068f + m);
1367}
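// [Editor's note, not part of the original file] Sanity check: for an exact power of two the
// mantissa correction cancels almost perfectly, e.g. approx_log2(F_(8.0f)) evaluates to very
// nearly 3.0f.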
1368
1369SI F approx_log(F x) {
1370 const float ln2 = 0.69314718f;
1371 return ln2 * approx_log2(x);
1372}
1373
1374SI F approx_pow2(F x) {
1375 constexpr float kInfinityBits = 0x7f800000;
1376
1377 F f = fract(x);
1378 F approx = nmad(f, 1.490129070f, x + 121.274057500f);
1379 approx += 27.728023300f / (4.84252568f - f);
1380 approx *= 1.0f * (1<<23);
1381 approx = min(max(approx, F0), F_(kInfinityBits)); // guard against underflow/overflow
1382
1383 return sk_bit_cast<F>(round(approx));
1384}
1385
1386SI F approx_exp(F x) {
1387 const float log2_e = 1.4426950408889634074f;
1388 return approx_pow2(log2_e * x);
1389}
1390
1391SI F approx_powf(F x, F y) {
1392 return if_then_else((x == 0)|(x == 1), x
1393 , approx_pow2(approx_log2(x) * y));
1394}
1395#if !defined(JUMPER_IS_SCALAR)
1396SI F approx_powf(F x, float y) { return approx_powf(x, F_(y)); }
1397#endif
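// [Editor's note: hypothetical example, not part of the original file] approx_powf() is the
// building block for gamma-style transfer curves, e.g. a rough linear -> gamma-2.2 encode:
//     F encoded = approx_powf(linear, 1 / 2.2f);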
1398
1399SI F from_half(U16 h) {
1400#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64)
1401 return vcvt_f32_f16((float16x4_t)h);
1402
1403#elif defined(JUMPER_IS_SKX)
1404 return _mm512_cvtph_ps((__m256i)h);
1405
1406#elif defined(JUMPER_IS_HSW)
1407 return _mm256_cvtph_ps((__m128i)h);
1408
1409#else
1410 // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
1411 U32 sem = expand(h),
1412 s = sem & 0x8000,
1413 em = sem ^ s;
1414
1415 // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
1416 auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here.
1417 return if_then_else(denorm, F0
1418 , sk_bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
1419#endif
1420}
1421
1422SI U16 to_half(F f) {
1423#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64)
1424 return (U16)vcvt_f16_f32(f);
1425
1426#elif defined(JUMPER_IS_SKX)
1427 return (U16)_mm512_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1428
1429#elif defined(JUMPER_IS_HSW)
1430 return (U16)_mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1431
1432#else
1433 // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
1434 U32 sem = sk_bit_cast<U32>(f),
1435 s = sem & 0x80000000,
1436 em = sem ^ s;
1437
1438 // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
1439 auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here.
1440 return pack((U32)if_then_else(denorm, I32_(0)
1441 , (I32)((s>>16) + (em>>13) - ((127-15)<<10))));
1442#endif
1443}
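// [Editor's note, not part of the original file] from_half() and to_half() round-trip exactly
// for normal values representable in half precision, e.g.:
//     U16 h = to_half(F_(0.25f));
//     F f = from_half(h);       // 0.25f again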
1444
1445static void patch_memory_contexts(SkSpan<SkRasterPipeline_MemoryCtxPatch> memoryCtxPatches,
1446 size_t dx, size_t dy, size_t tail) {
1447 for (SkRasterPipeline_MemoryCtxPatch& patch : memoryCtxPatches) {
1448 SkRasterPipeline_MemoryCtx* ctx = patch.info.context;
1449
1450 const ptrdiff_t offset = patch.info.bytesPerPixel * (dy * ctx->stride + dx);
1451 if (patch.info.load) {
1452 void* ctxData = SkTAddOffset<void>(ctx->pixels, offset);
1453 memcpy(patch.scratch, ctxData, patch.info.bytesPerPixel * tail);
1454 }
1455
1456 SkASSERT(patch.backup == nullptr);
1457 void* scratchFakeBase = SkTAddOffset<void>(patch.scratch, -offset);
1458 patch.backup = ctx->pixels;
1459 ctx->pixels = scratchFakeBase;
1460 }
1461}
1462
1463static void restore_memory_contexts(SkSpan<SkRasterPipeline_MemoryCtxPatch> memoryCtxPatches,
1464 size_t dx, size_t dy, size_t tail) {
1465 for (SkRasterPipeline_MemoryCtxPatch& patch : memoryCtxPatches) {
1466 SkRasterPipeline_MemoryCtx* ctx = patch.info.context;
1467
1468 SkASSERT(patch.backup != nullptr);
1469 ctx->pixels = patch.backup;
1470 patch.backup = nullptr;
1471
1472 const ptrdiff_t offset = patch.info.bytesPerPixel * (dy * ctx->stride + dx);
1473 if (patch.info.store) {
1474 void* ctxData = SkTAddOffset<void>(ctx->pixels, offset);
1475 memcpy(ctxData, patch.scratch, patch.info.bytesPerPixel * tail);
1476 }
1477 }
1478}
1479
1480#if defined(JUMPER_IS_SCALAR) || defined(JUMPER_IS_SSE2)
1481 // In scalar and SSE2 mode, we always use precise math so we can have more predictable results.
1482 // Chrome will use the SSE2 implementation when --disable-skia-runtime-opts is set. (b/40042946)
1483 SI F rcp_fast(F v) { return rcp_precise(v); }
1484 SI F rsqrt(F v) { return rcp_precise(sqrt_(v)); }
1485#else
1486 SI F rcp_fast(F v) { return rcp_approx(v); }
1487 SI F rsqrt(F v) { return rsqrt_approx(v); }
1488#endif
1489
1490// Our fundamental vector depth is our pixel stride.
1491static constexpr size_t N = sizeof(F) / sizeof(float);
1492
1493// We're finally going to get to what a Stage function looks like!
1494
1495// Any custom ABI to use for all (non-externally-facing) stage functions?
1496// Also decide here whether to use narrow (compromise) or wide (ideal) stages.
1497#if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
1498 // This lets us pass vectors more efficiently on 32-bit ARM.
1499 // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
1500 #define ABI __attribute__((pcs("aapcs-vfp")))
1501 #define JUMPER_NARROW_STAGES 1
1502#elif defined(_MSC_VER)
1503 // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
1504 // instead of {b,a} on the stack. Narrow stages work best for __vectorcall.
1505 #define ABI __vectorcall
1506 #define JUMPER_NARROW_STAGES 1
1507#elif defined(__x86_64__) || defined(SK_CPU_ARM64) || defined(SK_CPU_LOONGARCH)
1508 // These platforms are ideal for wider stages, and their default ABI is ideal.
1509 #define ABI
1510 #define JUMPER_NARROW_STAGES 0
1511#else
1512 // 32-bit or unknown... shunt them down the narrow path.
1513 // Odds are these have few registers and are better off there.
1514 #define ABI
1515 #define JUMPER_NARROW_STAGES 1
1516#endif
1517
1518#if JUMPER_NARROW_STAGES
1519 struct Params {
1520 size_t dx, dy;
1521 std::byte* base;
1522        F dr,dg,db,da;
1523    };
1524 using Stage = void(ABI*)(Params*, SkRasterPipelineStage* program, F r, F g, F b, F a);
1525#else
1526 using Stage = void(ABI*)(SkRasterPipelineStage* program, size_t dx, size_t dy,
1527 std::byte* base, F,F,F,F, F,F,F,F);
1528#endif
1529
1530static void start_pipeline(size_t dx, size_t dy,
1531 size_t xlimit, size_t ylimit,
1532 SkRasterPipelineStage* program,
1533                           SkSpan<SkRasterPipeline_MemoryCtxPatch> memoryCtxPatches,
1534                           uint8_t* tailPointer) {
1535 uint8_t unreferencedTail;
1536 if (!tailPointer) {
1537 tailPointer = &unreferencedTail;
1538 }
1539 auto start = (Stage)program->fn;
1540 const size_t x0 = dx;
1541 std::byte* const base = nullptr;
1542 for (; dy < ylimit; dy++) {
1543 #if JUMPER_NARROW_STAGES
1544 Params params = { x0,dy,base, F0,F0,F0,F0 };
1545 while (params.dx + N <= xlimit) {
1546 start(&params,program, F0,F0,F0,F0);
1547 params.dx += N;
1548 }
1549 if (size_t tail = xlimit - params.dx) {
1550 *tailPointer = tail;
1551 patch_memory_contexts(memoryCtxPatches, params.dx, dy, tail);
1552 start(&params,program, F0,F0,F0,F0);
1553 restore_memory_contexts(memoryCtxPatches, params.dx, dy, tail);
1554 *tailPointer = 0xFF;
1555 }
1556 #else
1557 dx = x0;
1558 while (dx + N <= xlimit) {
1559 start(program,dx,dy,base, F0,F0,F0,F0, F0,F0,F0,F0);
1560 dx += N;
1561 }
1562 if (size_t tail = xlimit - dx) {
1563 *tailPointer = tail;
1564 patch_memory_contexts(memoryCtxPatches, dx, dy, tail);
1565 start(program,dx,dy,base, F0,F0,F0,F0, F0,F0,F0,F0);
1566 restore_memory_contexts(memoryCtxPatches, dx, dy, tail);
1567 *tailPointer = 0xFF;
1568 }
1569 #endif
1570 }
1571}
1572
1573#if SK_HAS_MUSTTAIL
1574 #define JUMPER_MUSTTAIL [[clang::musttail]]
1575#else
1576 #define JUMPER_MUSTTAIL
1577#endif
1578
1579#if JUMPER_NARROW_STAGES
1580 #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL) \
1581 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1582 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1583 static void ABI name(Params* params, SkRasterPipelineStage* program, \
1584 F r, F g, F b, F a) { \
1585 OFFSET name##_k(Ctx{program}, params->dx,params->dy,params->base, \
1586 r,g,b,a, params->dr, params->dg, params->db, params->da); \
1587 INC; \
1588 auto fn = (Stage)program->fn; \
1589 MUSTTAIL return fn(params, program, r,g,b,a); \
1590 } \
1591 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1592 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1593#else
1594 #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL) \
1595 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1596 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1597 static void ABI name(SkRasterPipelineStage* program, size_t dx, size_t dy, \
1598 std::byte* base, F r, F g, F b, F a, F dr, F dg, F db, F da) { \
1599 OFFSET name##_k(Ctx{program}, dx,dy,base, r,g,b,a, dr,dg,db,da); \
1600 INC; \
1601 auto fn = (Stage)program->fn; \
1602 MUSTTAIL return fn(program, dx,dy,base, r,g,b,a, dr,dg,db,da); \
1603 } \
1604 SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base, \
1605 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1606#endif
1607
1608// A typical stage returns void, always increments the program counter by 1, and lets the optimizer
1609// decide whether or not tail-calling is appropriate.
1610#define STAGE(name, arg) \
1611 DECLARE_STAGE(name, arg, void, ++program, /*no offset*/, /*no musttail*/)
1612
1613// A tail stage returns void, always increments the program counter by 1, and uses tail-calling.
1614// Tail-calling is necessary in SkSL-generated programs, which can be thousands of ops long, and
1615// could overflow the stack (particularly in debug).
1616#define STAGE_TAIL(name, arg) \
1617 DECLARE_STAGE(name, arg, void, ++program, /*no offset*/, JUMPER_MUSTTAIL)
1618
1619// A branch stage returns an integer, which is added directly to the program counter, and tailcalls.
1620#define STAGE_BRANCH(name, arg) \
1621 DECLARE_STAGE(name, arg, int, /*no increment*/, program +=, JUMPER_MUSTTAIL)
1622
1623// just_return() is a simple no-op stage that only exists to end the chain,
1624// returning back up to start_pipeline(), and from there to the caller.
1625#if JUMPER_NARROW_STAGES
1626    static void ABI just_return(Params*, SkRasterPipelineStage*, F,F,F,F) {}
1627#else
1628 static void ABI just_return(SkRasterPipelineStage*, size_t,size_t, std::byte*,
1629 F,F,F,F, F,F,F,F) {}
1630#endif
1631
1632// Note that in release builds, most stages consume no stack (thanks to tail call optimization).
1633// However: certain builds (especially with non-clang compilers) may fail to optimize tail
1634// calls, resulting in actual stack frames being generated.
1635//
1636// stack_checkpoint() and stack_rewind() are special stages that can be used to manage stack growth.
1637// If a pipeline contains a stack_checkpoint, followed by any number of stack_rewind (at any point),
1638// the C++ stack will be reset to the state it was at when the stack_checkpoint was initially hit.
1639//
1640// All instances of stack_rewind (as well as the one instance of stack_checkpoint near the start of
1641// a pipeline) share a single context (of type SkRasterPipeline_RewindCtx). That context holds the
1642// full state of the mutable registers that are normally passed to the next stage in the program.
1643//
1644// stack_rewind is the only stage other than just_return that actually returns (rather than jumping
1645// to the next stage in the program). Before it does so, it stashes all of the registers in the
1646// context. This includes the updated `program` pointer. Unlike stages that tail call exactly once,
1647// stack_checkpoint calls the next stage in the program repeatedly, as long as the `program` in the
1648// context is overwritten (i.e., as long as a stack_rewind was the reason the pipeline returned,
1649// rather than a just_return).
1650//
1651// Normally, just_return is the only stage that returns, and no other stage does anything after a
1652// subsequent (called) stage returns, so the stack just unwinds all the way to start_pipeline.
1653// With stack_checkpoint on the stack, any stack_rewind stages will return all the way up to the
1654// stack_checkpoint. That grabs the values that would have been passed to the next stage (from the
1655// context), and continues the linear execution of stages, but has reclaimed all of the stack frames
1656// pushed before the stack_rewind before doing so.
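// For example (an illustrative layout, not a literal program): a long SkSL-generated pipeline
// might look like [stack_checkpoint, op, op, ..., stack_rewind, op, ..., just_return]. Each time
// a stack_rewind returns, control pops straight back into stack_checkpoint's loop, which reloads
// the registers from the shared context and resumes at the stashed `program` pointer.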
1657#if JUMPER_NARROW_STAGES
1658    static void ABI stack_checkpoint(Params* params, SkRasterPipelineStage* program,
1659 F r, F g, F b, F a) {
1660 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1661 while (program) {
1662 auto next = (Stage)(++program)->fn;
1663
1664 ctx->stage = nullptr;
1665 next(params, program, r, g, b, a);
1666 program = ctx->stage;
1667
1668 if (program) {
1669 r = sk_unaligned_load<F>(ctx->r );
1670 g = sk_unaligned_load<F>(ctx->g );
1671 b = sk_unaligned_load<F>(ctx->b );
1672 a = sk_unaligned_load<F>(ctx->a );
1673 params->dr = sk_unaligned_load<F>(ctx->dr);
1674 params->dg = sk_unaligned_load<F>(ctx->dg);
1675 params->db = sk_unaligned_load<F>(ctx->db);
1676 params->da = sk_unaligned_load<F>(ctx->da);
1677 params->base = ctx->base;
1678 }
1679 }
1680    }
1681    static void ABI stack_rewind(Params* params, SkRasterPipelineStage* program,
1682 F r, F g, F b, F a) {
1683 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1684 sk_unaligned_store(ctx->r , r );
1685 sk_unaligned_store(ctx->g , g );
1686 sk_unaligned_store(ctx->b , b );
1687 sk_unaligned_store(ctx->a , a );
1688 sk_unaligned_store(ctx->dr, params->dr);
1689 sk_unaligned_store(ctx->dg, params->dg);
1690 sk_unaligned_store(ctx->db, params->db);
1691 sk_unaligned_store(ctx->da, params->da);
1692 ctx->base = params->base;
1693 ctx->stage = program;
1694 }
1695#else
1696 static void ABI stack_checkpoint(SkRasterPipelineStage* program,
1697 size_t dx, size_t dy, std::byte* base,
1698 F r, F g, F b, F a, F dr, F dg, F db, F da) {
1699 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1700 while (program) {
1701 auto next = (Stage)(++program)->fn;
1702
1703 ctx->stage = nullptr;
1704 next(program, dx, dy, base, r, g, b, a, dr, dg, db, da);
1705 program = ctx->stage;
1706
1707 if (program) {
1708 r = sk_unaligned_load<F>(ctx->r );
1709 g = sk_unaligned_load<F>(ctx->g );
1710 b = sk_unaligned_load<F>(ctx->b );
1711 a = sk_unaligned_load<F>(ctx->a );
1712 dr = sk_unaligned_load<F>(ctx->dr);
1713 dg = sk_unaligned_load<F>(ctx->dg);
1714 db = sk_unaligned_load<F>(ctx->db);
1715 da = sk_unaligned_load<F>(ctx->da);
1716 base = ctx->base;
1717 }
1718 }
1719 }
1720 static void ABI stack_rewind(SkRasterPipelineStage* program,
1721 size_t dx, size_t dy, std::byte* base,
1722 F r, F g, F b, F a, F dr, F dg, F db, F da) {
1723 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1724 sk_unaligned_store(ctx->r , r );
1725 sk_unaligned_store(ctx->g , g );
1726 sk_unaligned_store(ctx->b , b );
1727 sk_unaligned_store(ctx->a , a );
1728 sk_unaligned_store(ctx->dr, dr);
1729 sk_unaligned_store(ctx->dg, dg);
1730 sk_unaligned_store(ctx->db, db);
1731 sk_unaligned_store(ctx->da, da);
1732 ctx->base = base;
1733 ctx->stage = program;
1734 }
1735#endif
1736
1737
1738// We could start defining normal Stages now. But first, some helper functions.
1739
1740template <typename V, typename T>
1741SI V load(const T* src) {
1742 return sk_unaligned_load<V>(src);
1743}
1744
1745template <typename V, typename T>
1746SI void store(T* dst, V v) {
1747    sk_unaligned_store(dst, v);
1748}
1749
1750SI F from_byte(U8 b) {
1751 return cast(expand(b)) * (1/255.0f);
1752}
1753SI F from_short(U16 s) {
1754 return cast(expand(s)) * (1/65535.0f);
1755}
1756SI void from_565(U16 _565, F* r, F* g, F* b) {
1757 U32 wide = expand(_565);
1758 *r = cast(wide & (31<<11)) * (1.0f / (31<<11));
1759 *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5));
1760 *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0));
1761}
1762SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
1763 U32 wide = expand(_4444);
1764 *r = cast(wide & (15<<12)) * (1.0f / (15<<12));
1765 *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8));
1766 *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4));
1767 *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0));
1768}
1769SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
1770 *r = cast((_8888 ) & 0xff) * (1/255.0f);
1771 *g = cast((_8888 >> 8) & 0xff) * (1/255.0f);
1772 *b = cast((_8888 >> 16) & 0xff) * (1/255.0f);
1773 *a = cast((_8888 >> 24) ) * (1/255.0f);
1774}
1775SI void from_88(U16 _88, F* r, F* g) {
1776 U32 wide = expand(_88);
1777 *r = cast((wide ) & 0xff) * (1/255.0f);
1778 *g = cast((wide >> 8) & 0xff) * (1/255.0f);
1779}
1780SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) {
1781 *r = cast((rgba ) & 0x3ff) * (1/1023.0f);
1782 *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f);
1783 *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f);
1784 *a = cast((rgba >> 30) ) * (1/ 3.0f);
1785}
1786SI void from_1010102_xr(U32 rgba, F* r, F* g, F* b, F* a) {
1787 static constexpr float min = -0.752941f;
1788 static constexpr float max = 1.25098f;
1789 static constexpr float range = max - min;
1790 *r = cast((rgba ) & 0x3ff) * (1/1023.0f) * range + min;
1791 *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
1792 *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
1793 *a = cast((rgba >> 30) ) * (1/ 3.0f);
1794}
1795SI void from_10101010_xr(U64 _10x6, F* r, F* g, F* b, F* a) {
1796 *r = (cast64((_10x6 >> 6) & 0x3ff) - 384.f) / 510.f;
1797 *g = (cast64((_10x6 >> 22) & 0x3ff) - 384.f) / 510.f;
1798 *b = (cast64((_10x6 >> 38) & 0x3ff) - 384.f) / 510.f;
1799 *a = (cast64((_10x6 >> 54) & 0x3ff) - 384.f) / 510.f;
1800}
1801SI void from_10x6(U64 _10x6, F* r, F* g, F* b, F* a) {
1802 *r = cast64((_10x6 >> 6) & 0x3ff) * (1/1023.0f);
1803 *g = cast64((_10x6 >> 22) & 0x3ff) * (1/1023.0f);
1804 *b = cast64((_10x6 >> 38) & 0x3ff) * (1/1023.0f);
1805 *a = cast64((_10x6 >> 54) & 0x3ff) * (1/1023.0f);
1806}
1807SI void from_1616(U32 _1616, F* r, F* g) {
1808 *r = cast((_1616 ) & 0xffff) * (1/65535.0f);
1809 *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f);
1810}
1811SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) {
1812 *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f);
1813 *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f);
1814 *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f);
1815 *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f);
1816}
1817
1818// Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory.
1819template <typename T>
1820SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
1821 return (T*)ctx->pixels + dy*ctx->stride + dx;
1822}
1823
1824// clamp v to [0,limit).
1825SI F clamp(F v, F limit) {
1826 F inclusive = sk_bit_cast<F>(sk_bit_cast<U32>(limit) - 1); // Exclusive -> inclusive.
1827 return min(max(0.0f, v), inclusive);
1828}
1829
1830// clamp to (0,limit).
1831SI F clamp_ex(F v, float limit) {
1832 const F inclusiveZ = F_(std::numeric_limits<float>::min()),
1833 inclusiveL = sk_bit_cast<F>( sk_bit_cast<U32>(F_(limit)) - 1 );
1834 return min(max(inclusiveZ, v), inclusiveL);
1835}
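// A scalar sketch of the exclusive->inclusive trick (illustration only, assuming IEEE-754 floats):
// for a positive, finite float, subtracting 1 from its bit pattern gives the next representable
// value below it, so clamping to [0, limit - 1 ulp] behaves like clamping to the half-open [0, limit).
SI float clamp_exclusive_illustration(float v, float limit) {
    uint32_t bits;
    memcpy(&bits, &limit, sizeof(bits));
    bits -= 1;                                     // one ULP below `limit`
    float inclusive;
    memcpy(&inclusive, &bits, sizeof(inclusive));
    if (v < 0.0f)      { return 0.0f; }
    if (v > inclusive) { return inclusive; }
    return v;
}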
1836
1837// Polynomial approximation of degree 5 for sin(x * 2 * pi) in the range [-1/4, 1/4]
1838// Adapted from https://github.com/google/swiftshader/blob/master/docs/Sin-Cos-Optimization.pdf
1839SI F sin5q_(F x) {
1840    // A * x + B * x^3 + C * x^5
1841 // Exact at x = 0, 1/12, 1/6, 1/4, and their negatives,
1842 // which correspond to x * 2 * pi = 0, pi/6, pi/3, pi/2
1843 constexpr float A = 6.28230858f;
1844 constexpr float B = -41.1693687f;
1845 constexpr float C = 74.4388885f;
1846 F x2 = x * x;
1847 return x * mad(mad(x2, C, B), x2, A);
1848}
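// Worked check: at x = 1/4 (an angle of pi/2), the polynomial gives
// 0.25 * (A + B/16 + C/256) = 0.25 * (6.28230858 - 2.57308554 + 0.29077691) ≈ 1.0 = sin(pi/2).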
1849
1850SI F sin_(F x) {
1851 constexpr float one_over_pi2 = 1 / (2 * SK_FloatPI);
1852 x = mad(x, -one_over_pi2, 0.25f);
1853 x = 0.25f - abs_(x - floor_(x + 0.5f));
1854 return sin5q_(x);
1855}
1856
1857SI F cos_(F x) {
1858 constexpr float one_over_pi2 = 1 / (2 * SK_FloatPI);
1859 x *= one_over_pi2;
1860 x = 0.25f - abs_(x - floor_(x + 0.5f));
1861 return sin5q_(x);
1862}
1863
1864/* "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
1865 https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf
1866
1867 approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9
1868
1869 Some simplifications:
1870 1. tan(x) is periodic, -PI/2 < x < PI/2
1871 2. tan(x) is odd, so tan(-x) = -tan(x)
1872 3. Our polynomial approximation is best near zero, so we use the following identity
1873 tan(x) + tan(y)
1874 tan(x + y) = -----------------
1875 1 - tan(x)*tan(y)
1876 tan(PI/4) = 1
1877
1878 So for x > PI/8, we do the following refactor:
1879 x' = x - PI/4
1880
1881 1 + tan(x')
1882 tan(x) = ------------
1883 1 - tan(x')
1884 */
1885SI F tan_(F x) {
1886 constexpr float Pi = SK_FloatPI;
1887 // periodic between -pi/2 ... pi/2
1888 // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
1889 x = mad(fract(mad(x, 1/Pi, 0.5f)), Pi, -Pi/2);
1890
1891 I32 neg = (x < 0.0f);
1892 x = if_then_else(neg, -x, x);
1893
1894 // minimize total error by shifting if x > pi/8
1895 I32 use_quotient = (x > (Pi/8));
1896 x = if_then_else(use_quotient, x - (Pi/4), x);
1897
1898 // 9th order poly = 4th order(x^2) * x
1899 const float c4 = 62 / 2835.0f;
1900 const float c3 = 17 / 315.0f;
1901 const float c2 = 2 / 15.0f;
1902 const float c1 = 1 / 3.0f;
1903 const float c0 = 1.0f;
1904 F x2 = x * x;
1905 x *= mad(x2, mad(x2, mad(x2, mad(x2, c4, c3), c2), c1), c0);
1906 x = if_then_else(use_quotient, (1+x)/(1-x), x);
1907 x = if_then_else(neg, -x, x);
1908 return x;
1909}
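// Quick check of the quotient refactor: at x = pi/4 the range reduction leaves x unchanged,
// the shift gives x' = 0, the polynomial evaluates to 0, and (1 + 0) / (1 - 0) = 1 = tan(pi/4).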
1910
1911/* Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
1912 with 129 values of x,atan(x) for x:[0...1]
1913 This only works for 0 <= x <= 1
1914 */
1915SI F approx_atan_unit(F x) {
1916 // y = 0.14130025741326729 x⁴
1917 // - 0.34312835980675116 x³
1918 // - 0.016172900528248768 x²
1919 // + 1.00376969762003850 x
1920 // - 0.00014758242182738969
1921 const float c4 = 0.14130025741326729f;
1922 const float c3 = -0.34312835980675116f;
1923 const float c2 = -0.016172900528248768f;
1924 const float c1 = 1.0037696976200385f;
1925 const float c0 = -0.00014758242182738969f;
1926 return mad(x, mad(x, mad(x, mad(x, c4, c3), c2), c1), c0);
1927}
1928
1929// Use identity atan(x) = pi/2 - atan(1/x) for x > 1
1930SI F atan_(F x) {
1931 I32 neg = (x < 0.0f);
1932 x = if_then_else(neg, -x, x);
1933 I32 flip = (x > 1.0f);
1934 x = if_then_else(flip, 1/x, x);
1935 x = approx_atan_unit(x);
1936 x = if_then_else(flip, SK_FloatPI/2 - x, x);
1937 x = if_then_else(neg, -x, x);
1938 return x;
1939}
1940
1941// Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun:
1942// https://books.google.com/books/content?id=ZboM5tOFWtsC&pg=PA81&img=1&zoom=3&hl=en&bul=1&sig=ACfU3U2M75tG_iGVOS92eQspr14LTq02Nw&ci=0%2C15%2C999%2C1279&edge=0
1943// http://screen/8YGJxUGFQ49bVX6
1944SI F asin_(F x) {
1945 I32 neg = (x < 0.0f);
1946 x = if_then_else(neg, -x, x);
1947 const float c3 = -0.0187293f;
1948 const float c2 = 0.0742610f;
1949 const float c1 = -0.2121144f;
1950 const float c0 = 1.5707288f;
1951 F poly = mad(x, mad(x, mad(x, c3, c2), c1), c0);
1952 x = nmad(sqrt_(1 - x), poly, SK_FloatPI/2);
1953 x = if_then_else(neg, -x, x);
1954 return x;
1955}
1956
1957SI F acos_(F x) {
1958 return SK_FloatPI/2 - asin_(x);
1959}
1960
1961/* Use identity atan(x) = pi/2 - atan(1/x) for x > 1
1962 By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
1963 which avoids a 2nd divide instruction if we had instead called atan().
1964 */
1965SI F atan2_(F y0, F x0) {
1966 I32 flip = (abs_(y0) > abs_(x0));
1967 F y = if_then_else(flip, x0, y0);
1968 F x = if_then_else(flip, y0, x0);
1969 F arg = y/x;
1970
1971 I32 neg = (arg < 0.0f);
1972 arg = if_then_else(neg, -arg, arg);
1973
1974 F r = approx_atan_unit(arg);
1975 r = if_then_else(flip, SK_FloatPI/2 - r, r);
1976 r = if_then_else(neg, -r, r);
1977
1978 // handle quadrant distinctions
1979 r = if_then_else((y0 >= 0) & (x0 < 0), r + SK_FloatPI, r);
1980 r = if_then_else((y0 < 0) & (x0 <= 0), r - SK_FloatPI, r);
1981 // Note: we don't try to handle 0,0 or infinities
1982 return r;
1983}
1984
1985// Used by gather_ stages to calculate the base pointer and a vector of indices to load.
1986template <typename T>
1987SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
1988 // We use exclusive clamp so that our min value is > 0 because ULP subtraction using U32 would
1989 // produce a NaN if applied to +0.f.
1990 x = clamp_ex(x, ctx->width );
1991 y = clamp_ex(y, ctx->height);
1992 x = sk_bit_cast<F>(sk_bit_cast<U32>(x) - (uint32_t)ctx->roundDownAtInteger);
1993 y = sk_bit_cast<F>(sk_bit_cast<U32>(y) - (uint32_t)ctx->roundDownAtInteger);
1994 *ptr = (const T*)ctx->pixels;
1995 return trunc_(y)*ctx->stride + trunc_(x);
1996}
1997
1998// We often have a nominally [0,1] float value we need to scale and convert to an integer,
1999// whether for a table lookup or to pack back down into bytes for storage.
2000//
2001// In practice, especially when dealing with interesting color spaces, that notionally
2002// [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp.
2003//
2004// You can adjust the expected input to [0,bias] by tweaking that parameter.
2005SI U32 to_unorm(F v, float scale, float bias = 1.0f) {
2006 // Any time we use round() we probably want to use to_unorm().
2007 return round(min(max(0.0f, v), bias), F_(scale));
2008}
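// For example, to_unorm(v, 255) maps a clamped v of 1.0f to 255 for 8-bit storage, while
// srcover_rgba_8888 below passes scale=1 with bias=255 because its values are already 255-biased.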
2009
2010SI I32 cond_to_mask(I32 cond) {
2011#if defined(JUMPER_IS_SCALAR)
2012 // In scalar mode, conditions are bools (0 or 1), but we want to store and operate on masks
2013 // (eg, using bitwise operations to select values).
2014 return if_then_else(cond, I32(~0), I32(0));
2015#else
2016 // In SIMD mode, our various instruction sets already represent conditions as masks.
2017 return cond;
2018#endif
2019}
2020
2021#if defined(JUMPER_IS_SCALAR)
2022// In scalar mode, `data` only contains a single lane.
2023SI uint32_t select_lane(uint32_t data, int /*lane*/) { return data; }
2024SI int32_t select_lane( int32_t data, int /*lane*/) { return data; }
2025#else
2026// In SIMD mode, `data` contains a vector of lanes.
2027SI uint32_t select_lane(U32 data, int lane) { return data[lane]; }
2028SI int32_t select_lane(I32 data, int lane) { return data[lane]; }
2029#endif
2030
2031// Now finally, normal Stages!
2032
2033STAGE(seed_shader, NoCtx) {
2034 static constexpr float iota[] = {
2035 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
2036 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
2037 };
2038 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
2039
2040 // It's important for speed to explicitly cast(dx) and cast(dy),
2041 // which has the effect of splatting them to vectors before converting to floats.
2042 // On Intel this breaks a data dependency on previous loop iterations' registers.
2043 r = cast(U32_(dx)) + sk_unaligned_load<F>(iota);
2044 g = cast(U32_(dy)) + 0.5f;
2045 b = F1; // This is w=1 for matrix multiplies by the device coords.
2046 a = F0;
2047}
2048
2049STAGE(dither, const float* rate) {
2050 // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
2051 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
2052 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
2053
2054 U32 X = U32_(dx) + sk_unaligned_load<U32>(iota),
2055 Y = U32_(dy);
2056
2057 // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
2058 // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].
2059
2060 // We only need X and X^Y from here on, so it's easier to just think of that as "Y".
2061 Y ^= X;
2062
2063 // We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
2064 // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda.
2065 U32 M = (Y & 1) << 5 | (X & 1) << 4
2066 | (Y & 2) << 2 | (X & 2) << 1
2067 | (Y & 4) >> 1 | (X & 4) >> 2;
2068
2069 // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
2070 // We want to make sure our dither is less than 0.5 in either direction to keep exact values
2071 // like 0 and 1 unchanged after rounding.
2072 F dither = mad(cast(M), 2/128.0f, -63/128.0f);
2073
2074 r = mad(dither, *rate, r);
2075 g = mad(dither, *rate, g);
2076 b = mad(dither, *rate, b);
2077
2078 r = max(0.0f, min(r, a));
2079 g = max(0.0f, min(g, a));
2080 b = max(0.0f, min(b, a));
2081}
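// Worked example of the bit mix, for lane 0 of pixel (dx,dy) = (1,2): X = 1 and Y becomes
// 2 ^ 1 = 3 after the XOR, so M = (1<<5) | (1<<4) | (2<<2) = 56 and the dither offset is
// (2*56 - 63) / 128 = 49/128 ≈ +0.383 before scaling by *rate.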
2082
2083// load 4 floats from memory, and splat them into r,g,b,a
2084STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
2085 r = F_(c->r);
2086 g = F_(c->g);
2087 b = F_(c->b);
2088 a = F_(c->a);
2089}
2090STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
2091 r = F_(c->r);
2092 g = F_(c->g);
2093 b = F_(c->b);
2094 a = F_(c->a);
2095}
2096// load 4 floats from memory, and splat them into dr,dg,db,da
2097STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
2098 dr = F_(c->r);
2099 dg = F_(c->g);
2100 db = F_(c->b);
2101 da = F_(c->a);
2102}
2103
2104// splats opaque-black into r,g,b,a
2105STAGE(black_color, NoCtx) {
2106 r = g = b = F0;
2107 a = F1;
2108}
2109
2110STAGE(white_color, NoCtx) {
2111 r = g = b = a = F1;
2112}
2113
2114// load registers r,g,b,a from context (mirrors store_src)
2115STAGE(load_src, const float* ptr) {
2116 r = sk_unaligned_load<F>(ptr + 0*N);
2117 g = sk_unaligned_load<F>(ptr + 1*N);
2118 b = sk_unaligned_load<F>(ptr + 2*N);
2119 a = sk_unaligned_load<F>(ptr + 3*N);
2120}
2121
2122// store registers r,g,b,a into context (mirrors load_src)
2123STAGE(store_src, float* ptr) {
2124 sk_unaligned_store(ptr + 0*N, r);
2125 sk_unaligned_store(ptr + 1*N, g);
2126 sk_unaligned_store(ptr + 2*N, b);
2127 sk_unaligned_store(ptr + 3*N, a);
2128}
2129// store registers r,g into context
2130STAGE(store_src_rg, float* ptr) {
2131 sk_unaligned_store(ptr + 0*N, r);
2132 sk_unaligned_store(ptr + 1*N, g);
2133}
2134// load registers r,g from context
2135STAGE(load_src_rg, float* ptr) {
2136 r = sk_unaligned_load<F>(ptr + 0*N);
2137 g = sk_unaligned_load<F>(ptr + 1*N);
2138}
2139// store register a into context
2140STAGE(store_src_a, float* ptr) {
2141 sk_unaligned_store(ptr, a);
2142}
2143
2144// load registers dr,dg,db,da from context (mirrors store_dst)
2145STAGE(load_dst, const float* ptr) {
2146 dr = sk_unaligned_load<F>(ptr + 0*N);
2147 dg = sk_unaligned_load<F>(ptr + 1*N);
2148 db = sk_unaligned_load<F>(ptr + 2*N);
2149 da = sk_unaligned_load<F>(ptr + 3*N);
2150}
2151
2152// store registers dr,dg,db,da into context (mirrors load_dst)
2153STAGE(store_dst, float* ptr) {
2154 sk_unaligned_store(ptr + 0*N, dr);
2155 sk_unaligned_store(ptr + 1*N, dg);
2156 sk_unaligned_store(ptr + 2*N, db);
2157 sk_unaligned_store(ptr + 3*N, da);
2158}
2159
2160// Most blend modes apply the same logic to each channel.
2161#define BLEND_MODE(name) \
2162 SI F name##_channel(F s, F d, F sa, F da); \
2163 STAGE(name, NoCtx) { \
2164 r = name##_channel(r,dr,a,da); \
2165 g = name##_channel(g,dg,a,da); \
2166 b = name##_channel(b,db,a,da); \
2167 a = name##_channel(a,da,a,da); \
2168 } \
2169 SI F name##_channel(F s, F d, F sa, F da)
2170
2171SI F inv(F x) { return 1.0f - x; }
2172SI F two(F x) { return x + x; }
2173
2174BLEND_MODE(clear) { return F0; }
2175BLEND_MODE(srcatop) { return mad(s, da, d*inv(sa)); }
2176BLEND_MODE(dstatop) { return mad(d, sa, s*inv(da)); }
2177BLEND_MODE(srcin) { return s * da; }
2178BLEND_MODE(dstin) { return d * sa; }
2179BLEND_MODE(srcout) { return s * inv(da); }
2180BLEND_MODE(dstout) { return d * inv(sa); }
2181BLEND_MODE(srcover) { return mad(d, inv(sa), s); }
2182BLEND_MODE(dstover) { return mad(s, inv(da), d); }
2183
2184BLEND_MODE(modulate) { return s*d; }
2185BLEND_MODE(multiply) { return mad(s, d, mad(s, inv(da), d*inv(sa))); }
2186BLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa.
2187BLEND_MODE(screen) { return nmad(s, d, s + d); }
2188BLEND_MODE(xor_) { return mad(s, inv(da), d*inv(sa)); }
2189#undef BLEND_MODE
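// Sanity check against Porter-Duff: srcover above expands to s + d*(1 - sa), the standard
// premultiplied source-over equation, applied uniformly to r, g, b, and a.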
2190
2191// Most other blend modes apply the same logic to colors, and srcover to alpha.
2192#define BLEND_MODE(name) \
2193 SI F name##_channel(F s, F d, F sa, F da); \
2194 STAGE(name, NoCtx) { \
2195 r = name##_channel(r,dr,a,da); \
2196 g = name##_channel(g,dg,a,da); \
2197 b = name##_channel(b,db,a,da); \
2198 a = mad(da, inv(a), a); \
2199 } \
2200 SI F name##_channel(F s, F d, F sa, F da)
2201
2202BLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; }
2203BLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; }
2204BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
2205BLEND_MODE(exclusion) { return s + d - two(s*d); }
2206
2207BLEND_MODE(colorburn) {
2208 return if_then_else(d == da, d + s*inv(da),
2209 if_then_else(s == 0, /* s + */ d*inv(sa),
2210 sa*(da - min(da, (da-d)*sa*rcp_fast(s))) + s*inv(da) + d*inv(sa)));
2211}
2212BLEND_MODE(colordodge) {
2213 return if_then_else(d == 0, /* d + */ s*inv(da),
2214 if_then_else(s == sa, s + d*inv(sa),
2215 sa*min(da, (d*sa)*rcp_fast(sa - s)) + s*inv(da) + d*inv(sa)));
2216}
2217BLEND_MODE(hardlight) {
2218 return s*inv(da) + d*inv(sa)
2219 + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
2220}
2221BLEND_MODE(overlay) {
2222 return s*inv(da) + d*inv(sa)
2223 + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
2224}
2225
2226BLEND_MODE(softlight) {
2227 F m = if_then_else(da > 0, d / da, 0.0f),
2228 s2 = two(s),
2229 m4 = two(two(m));
2230
2231 // The logic forks three ways:
2232 // 1. dark src?
2233 // 2. light src, dark dst?
2234 // 3. light src, light dst?
2235 F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1.
2236 darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2.
2237 liteDst = sqrt_(m) - m,
2238 liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3?
2239 return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)?
2240}
2241#undef BLEND_MODE
2242
2243// We're basing our implementation of non-separable blend modes on
2244// https://www.w3.org/TR/compositing-1/#blendingnonseparable.
2245// and
2246// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
2247// They're equivalent, but ES' math has been better simplified.
2248//
2249// Anything extra we add beyond that is to make the math work with premul inputs.
2250
2251SI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); }
2252SI F lum(F r, F g, F b) { return mad(r, 0.30f, mad(g, 0.59f, b*0.11f)); }
2253
2254SI void set_sat(F* r, F* g, F* b, F s) {
2255 F mn = min(*r, min(*g,*b)),
2256 mx = max(*r, max(*g,*b)),
2257 sat = mx - mn;
2258
2259 // Map min channel to 0, max channel to s, and scale the middle proportionally.
2260 s = if_then_else(sat == 0.0f, 0.0f, s * rcp_fast(sat));
2261 *r = (*r - mn) * s;
2262 *g = (*g - mn) * s;
2263 *b = (*b - mn) * s;
2264}
2265SI void set_lum(F* r, F* g, F* b, F l) {
2266 F diff = l - lum(*r, *g, *b);
2267 *r += diff;
2268 *g += diff;
2269 *b += diff;
2270}
2271SI F clip_channel(F c, F l, I32 clip_low, I32 clip_high, F mn_scale, F mx_scale) {
2272 c = if_then_else(clip_low, mad(mn_scale, c - l, l), c);
2273 c = if_then_else(clip_high, mad(mx_scale, c - l, l), c);
2274 c = max(c, 0.0f); // Sometimes without this we may dip just a little negative.
2275 return c;
2276}
2277SI void clip_color(F* r, F* g, F* b, F a) {
2278 F mn = min(*r, min(*g, *b)),
2279 mx = max(*r, max(*g, *b)),
2280 l = lum(*r, *g, *b),
2281 mn_scale = ( l) * rcp_fast(l - mn),
2282 mx_scale = (a - l) * rcp_fast(mx - l);
2283 I32 clip_low = cond_to_mask(mn < 0 && l != mn),
2284 clip_high = cond_to_mask(mx > a && l != mx);
2285
2286 *r = clip_channel(*r, l, clip_low, clip_high, mn_scale, mx_scale);
2287 *g = clip_channel(*g, l, clip_low, clip_high, mn_scale, mx_scale);
2288 *b = clip_channel(*b, l, clip_low, clip_high, mn_scale, mx_scale);
2289}
2290
2291STAGE(hue, NoCtx) {
2292 F R = r*a,
2293 G = g*a,
2294 B = b*a;
2295
2296 set_sat(&R, &G, &B, sat(dr,dg,db)*a);
2297 set_lum(&R, &G, &B, lum(dr,dg,db)*a);
2298 clip_color(&R,&G,&B, a*da);
2299
2300 r = mad(r, inv(da), mad(dr, inv(a), R));
2301 g = mad(g, inv(da), mad(dg, inv(a), G));
2302 b = mad(b, inv(da), mad(db, inv(a), B));
2303 a = a + nmad(a, da, da);
2304}
2305STAGE(saturation, NoCtx) {
2306 F R = dr*a,
2307 G = dg*a,
2308 B = db*a;
2309
2310 set_sat(&R, &G, &B, sat( r, g, b)*da);
2311 set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.)
2312 clip_color(&R,&G,&B, a*da);
2313
2314 r = mad(r, inv(da), mad(dr, inv(a), R));
2315 g = mad(g, inv(da), mad(dg, inv(a), G));
2316 b = mad(b, inv(da), mad(db, inv(a), B));
2317 a = a + nmad(a, da, da);
2318}
2319STAGE(color, NoCtx) {
2320 F R = r*da,
2321 G = g*da,
2322 B = b*da;
2323
2324 set_lum(&R, &G, &B, lum(dr,dg,db)*a);
2325 clip_color(&R,&G,&B, a*da);
2326
2327 r = mad(r, inv(da), mad(dr, inv(a), R));
2328 g = mad(g, inv(da), mad(dg, inv(a), G));
2329 b = mad(b, inv(da), mad(db, inv(a), B));
2330 a = a + nmad(a, da, da);
2331}
2332STAGE(luminosity, NoCtx) {
2333 F R = dr*a,
2334 G = dg*a,
2335 B = db*a;
2336
2337 set_lum(&R, &G, &B, lum(r,g,b)*da);
2338 clip_color(&R,&G,&B, a*da);
2339
2340 r = mad(r, inv(da), mad(dr, inv(a), R));
2341 g = mad(g, inv(da), mad(dg, inv(a), G));
2342 b = mad(b, inv(da), mad(db, inv(a), B));
2343 a = a + nmad(a, da, da);
2344}
2345
2346STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2347 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2348
2349 U32 dst = load<U32>(ptr);
2350 dr = cast((dst ) & 0xff);
2351 dg = cast((dst >> 8) & 0xff);
2352 db = cast((dst >> 16) & 0xff);
2353 da = cast((dst >> 24) );
2354 // {dr,dg,db,da} are in [0,255]
2355 // { r, g, b, a} are in [0, 1] (but may be out of gamut)
2356
2357 r = mad(dr, inv(a), r*255.0f);
2358 g = mad(dg, inv(a), g*255.0f);
2359 b = mad(db, inv(a), b*255.0f);
2360 a = mad(da, inv(a), a*255.0f);
2361 // { r, g, b, a} are now in [0,255] (but may be out of gamut)
2362
2363 // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased.
2364 dst = to_unorm(r, 1, 255)
2365 | to_unorm(g, 1, 255) << 8
2366 | to_unorm(b, 1, 255) << 16
2367 | to_unorm(a, 1, 255) << 24;
2368 store(ptr, dst);
2369}
2370
2371SI F clamp_01_(F v) { return min(max(0.0f, v), 1.0f); }
2372
2373STAGE(clamp_01, NoCtx) {
2374 r = clamp_01_(r);
2375 g = clamp_01_(g);
2376 b = clamp_01_(b);
2377 a = clamp_01_(a);
2378}
2379
2380STAGE(clamp_gamut, NoCtx) {
2381 a = min(max(a, 0.0f), 1.0f);
2382 r = min(max(r, 0.0f), a);
2383 g = min(max(g, 0.0f), a);
2384 b = min(max(b, 0.0f), a);
2385}
2386
2387STAGE(set_rgb, const float* rgb) {
2388 r = F_(rgb[0]);
2389 g = F_(rgb[1]);
2390 b = F_(rgb[2]);
2391}
2392
2393STAGE(unbounded_set_rgb, const float* rgb) {
2394 r = F_(rgb[0]);
2395 g = F_(rgb[1]);
2396 b = F_(rgb[2]);
2397}
2398
2399STAGE(swap_rb, NoCtx) {
2400 auto tmp = r;
2401 r = b;
2402 b = tmp;
2403}
2404STAGE(swap_rb_dst, NoCtx) {
2405 auto tmp = dr;
2406 dr = db;
2407 db = tmp;
2408}
2409
2410STAGE(move_src_dst, NoCtx) {
2411 dr = r;
2412 dg = g;
2413 db = b;
2414 da = a;
2415}
2416STAGE(move_dst_src, NoCtx) {
2417 r = dr;
2418 g = dg;
2419 b = db;
2420 a = da;
2421}
2422STAGE(swap_src_dst, NoCtx) {
2423 std::swap(r, dr);
2424 std::swap(g, dg);
2425 std::swap(b, db);
2426 std::swap(a, da);
2427}
2428
2429STAGE(premul, NoCtx) {
2430 r = r * a;
2431 g = g * a;
2432 b = b * a;
2433}
2434STAGE(premul_dst, NoCtx) {
2435 dr = dr * da;
2436 dg = dg * da;
2437 db = db * da;
2438}
2439STAGE(unpremul, NoCtx) {
2440 float inf = sk_bit_cast<float>(0x7f800000);
2441 auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0.0f);
2442 r *= scale;
2443 g *= scale;
2444 b *= scale;
2445}
2446STAGE(unpremul_polar, NoCtx) {
2447 float inf = sk_bit_cast<float>(0x7f800000);
2448 auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0.0f);
2449 g *= scale;
2450 b *= scale;
2451}
2452
2453STAGE(force_opaque , NoCtx) { a = F1; }
2454STAGE(force_opaque_dst, NoCtx) { da = F1; }
2455
2456STAGE(rgb_to_hsl, NoCtx) {
2457 F mx = max(r, max(g,b)),
2458 mn = min(r, min(g,b)),
2459 d = mx - mn,
2460 d_rcp = 1.0f / d;
2461
2462 F h = (1/6.0f) *
2463 if_then_else(mx == mn, 0.0f,
2464 if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0.0f),
2465 if_then_else(mx == g, (b-r)*d_rcp + 2.0f,
2466 (r-g)*d_rcp + 4.0f)));
2467
2468 F l = (mx + mn) * 0.5f;
2469 F s = if_then_else(mx == mn, 0.0f,
2470 d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn));
2471
2472 r = h;
2473 g = s;
2474 b = l;
2475}
2476STAGE(hsl_to_rgb, NoCtx) {
2477 // See GrRGBToHSLFilterEffect.fp
2478
2479 F h = r,
2480 s = g,
2481 l = b,
2482 c = (1.0f - abs_(2.0f * l - 1)) * s;
2483
2484 auto hue_to_rgb = [&](F hue) {
2485 F q = clamp_01_(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f);
2486 return (q - 0.5f) * c + l;
2487 };
2488
2489 r = hue_to_rgb(h + 0.0f/3.0f);
2490 g = hue_to_rgb(h + 2.0f/3.0f);
2491 b = hue_to_rgb(h + 1.0f/3.0f);
2492}
2493
2494// Color conversion functions used in gradient interpolation, based on
2495// https://www.w3.org/TR/css-color-4/#color-conversion-code
2496STAGE(css_lab_to_xyz, NoCtx) {
2497 constexpr float k = 24389 / 27.0f;
2498 constexpr float e = 216 / 24389.0f;
2499
2500 F f[3];
2501 f[1] = (r + 16) * (1 / 116.0f);
2502 f[0] = (g * (1 / 500.0f)) + f[1];
2503 f[2] = f[1] - (b * (1 / 200.0f));
2504
2505 F f_cubed[3] = { f[0]*f[0]*f[0], f[1]*f[1]*f[1], f[2]*f[2]*f[2] };
2506
2507 F xyz[3] = {
2508 if_then_else(f_cubed[0] > e, f_cubed[0], (116 * f[0] - 16) * (1 / k)),
2509 if_then_else(r > k * e, f_cubed[1], r * (1 / k)),
2510 if_then_else(f_cubed[2] > e, f_cubed[2], (116 * f[2] - 16) * (1 / k))
2511 };
2512
2513 constexpr float D50[3] = { 0.3457f / 0.3585f, 1.0f, (1.0f - 0.3457f - 0.3585f) / 0.3585f };
2514 r = xyz[0]*D50[0];
2515 g = xyz[1]*D50[1];
2516 b = xyz[2]*D50[2];
2517}
2518
2519STAGE(css_oklab_to_linear_srgb, NoCtx) {
2520 F l_ = r + 0.3963377774f * g + 0.2158037573f * b,
2521 m_ = r - 0.1055613458f * g - 0.0638541728f * b,
2522 s_ = r - 0.0894841775f * g - 1.2914855480f * b;
2523
2524 F l = l_*l_*l_,
2525 m = m_*m_*m_,
2526 s = s_*s_*s_;
2527
2528 r = +4.0767416621f * l - 3.3077115913f * m + 0.2309699292f * s;
2529 g = -1.2684380046f * l + 2.6097574011f * m - 0.3413193965f * s;
2530 b = -0.0041960863f * l - 0.7034186147f * m + 1.7076147010f * s;
2531}
2532
2533STAGE(css_oklab_gamut_map_to_linear_srgb, NoCtx) {
2534 // TODO(https://crbug.com/1508329): Add support for gamut mapping.
2535 // Return a greyscale value, so that accidental use is obvious.
2536 F l_ = r,
2537 m_ = r,
2538 s_ = r;
2539
2540 F l = l_*l_*l_,
2541 m = m_*m_*m_,
2542 s = s_*s_*s_;
2543
2544 r = +4.0767416621f * l - 3.3077115913f * m + 0.2309699292f * s;
2545 g = -1.2684380046f * l + 2.6097574011f * m - 0.3413193965f * s;
2546 b = -0.0041960863f * l - 0.7034186147f * m + 1.7076147010f * s;
2547}
2548
2549// Skia stores all polar colors with hue in the first component, so this "LCH -> Lab" transform
2550// actually takes "HCL". This is also used to do the same polar transform for OkHCL to OkLAB.
2551// See similar comments & logic in SkGradientBaseShader.cpp.
2552STAGE(css_hcl_to_lab, NoCtx) {
2553 F H = r,
2554 C = g,
2555 L = b;
2556
2557 F hueRadians = H * (SK_FloatPI / 180);
2558
2559 r = L;
2560 g = C * cos_(hueRadians);
2561 b = C * sin_(hueRadians);
2562}
2563
2564SI F mod_(F x, float y) {
2565 return nmad(y, floor_(x * (1 / y)), x);
2566}
2567
2568struct RGB { F r, g, b; };
2569
2570SI RGB css_hsl_to_srgb_(F h, F s, F l) {
2571 h = mod_(h, 360);
2572
2573 s *= 0.01f;
2574 l *= 0.01f;
2575
2576 F k[3] = {
2577 mod_(0 + h * (1 / 30.0f), 12),
2578 mod_(8 + h * (1 / 30.0f), 12),
2579 mod_(4 + h * (1 / 30.0f), 12)
2580 };
2581 F a = s * min(l, 1 - l);
2582 return {
2583 l - a * max(-1.0f, min(min(k[0] - 3.0f, 9.0f - k[0]), 1.0f)),
2584 l - a * max(-1.0f, min(min(k[1] - 3.0f, 9.0f - k[1]), 1.0f)),
2585 l - a * max(-1.0f, min(min(k[2] - 3.0f, 9.0f - k[2]), 1.0f))
2586 };
2587}
2588
2589STAGE(css_hsl_to_srgb, NoCtx) {
2590 RGB rgb = css_hsl_to_srgb_(r, g, b);
2591 r = rgb.r;
2592 g = rgb.g;
2593 b = rgb.b;
2594}
2595
2596STAGE(css_hwb_to_srgb, NoCtx) {
2597 g *= 0.01f;
2598 b *= 0.01f;
2599
2600 F gray = g / (g + b);
2601
2602 RGB rgb = css_hsl_to_srgb_(r, F_(100.0f), F_(50.0f));
2603 rgb.r = rgb.r * (1 - g - b) + g;
2604 rgb.g = rgb.g * (1 - g - b) + g;
2605 rgb.b = rgb.b * (1 - g - b) + g;
2606
2607 auto isGray = (g + b) >= 1;
2608
2609 r = if_then_else(isGray, gray, rgb.r);
2610 g = if_then_else(isGray, gray, rgb.g);
2611 b = if_then_else(isGray, gray, rgb.b);
2612}
2613
2614// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
2615SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) {
2616 return if_then_else(a < da, min(cr, min(cg,cb))
2617 , max(cr, max(cg,cb)));
2618}
2619
2620STAGE(scale_1_float, const float* c) {
2621 r = r * *c;
2622 g = g * *c;
2623 b = b * *c;
2624 a = a * *c;
2625}
2626STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
2627 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2628
2629 auto scales = load<U8>(ptr);
2630 auto c = from_byte(scales);
2631
2632 r = r * c;
2633 g = g * c;
2634 b = b * c;
2635 a = a * c;
2636}
2637STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
2638 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2639
2640 F cr,cg,cb;
2641 from_565(load<U16>(ptr), &cr, &cg, &cb);
2642
2643 F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
2644
2645 r = r * cr;
2646 g = g * cg;
2647 b = b * cb;
2648 a = a * ca;
2649}
2650
2651SI F lerp(F from, F to, F t) {
2652 return mad(to-from, t, from);
2653}
2654
2655STAGE(lerp_1_float, const float* c) {
2656 r = lerp(dr, r, F_(*c));
2657 g = lerp(dg, g, F_(*c));
2658 b = lerp(db, b, F_(*c));
2659 a = lerp(da, a, F_(*c));
2660}
2661STAGE(scale_native, const float scales[]) {
2662 auto c = sk_unaligned_load<F>(scales);
2663 r = r * c;
2664 g = g * c;
2665 b = b * c;
2666 a = a * c;
2667}
2668STAGE(lerp_native, const float scales[]) {
2669 auto c = sk_unaligned_load<F>(scales);
2670 r = lerp(dr, r, c);
2671 g = lerp(dg, g, c);
2672 b = lerp(db, b, c);
2673 a = lerp(da, a, c);
2674}
2675STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
2676 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2677
2678 auto scales = load<U8>(ptr);
2679 auto c = from_byte(scales);
2680
2681 r = lerp(dr, r, c);
2682 g = lerp(dg, g, c);
2683 b = lerp(db, b, c);
2684 a = lerp(da, a, c);
2685}
2686STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
2687 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2688
2689 F cr,cg,cb;
2690 from_565(load<U16>(ptr), &cr, &cg, &cb);
2691
2692 F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
2693
2694 r = lerp(dr, r, cr);
2695 g = lerp(dg, g, cg);
2696 b = lerp(db, b, cb);
2697 a = lerp(da, a, ca);
2698}
2699
2700STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
2701 auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy),
2702 aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy);
2703
2704 F mul = from_byte(load<U8>(mptr)),
2705 add = from_byte(load<U8>(aptr));
2706
2707 r = mad(r, mul, add);
2708 g = mad(g, mul, add);
2709 b = mad(b, mul, add);
2710}
2711
2712STAGE(byte_tables, const SkRasterPipeline_TablesCtx* tables) {
2713 r = from_byte(gather(tables->r, to_unorm(r, 255)));
2714 g = from_byte(gather(tables->g, to_unorm(g, 255)));
2715 b = from_byte(gather(tables->b, to_unorm(b, 255)));
2716 a = from_byte(gather(tables->a, to_unorm(a, 255)));
2717}
2718
2719SI F strip_sign(F x, U32* sign) {
2720 U32 bits = sk_bit_cast<U32>(x);
2721 *sign = bits & 0x80000000;
2722 return sk_bit_cast<F>(bits ^ *sign);
2723}
2724
2725SI F apply_sign(F x, U32 sign) {
2726 return sk_bit_cast<F>(sign | sk_bit_cast<U32>(x));
2727}
2728
2729STAGE(parametric, const skcms_TransferFunction* ctx) {
2730 auto fn = [&](F v) {
2731 U32 sign;
2732 v = strip_sign(v, &sign);
2733
2734 F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f)
2735 , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e);
2736 return apply_sign(r, sign);
2737 };
2738 r = fn(r);
2739 g = fn(g);
2740 b = fn(b);
2741}
2742
2743STAGE(gamma_, const float* G) {
2744 auto fn = [&](F v) {
2745 U32 sign;
2746 v = strip_sign(v, &sign);
2747 return apply_sign(approx_powf(v, *G), sign);
2748 };
2749 r = fn(r);
2750 g = fn(g);
2751 b = fn(b);
2752}
2753
2754STAGE(PQish, const skcms_TransferFunction* ctx) {
2755 auto fn = [&](F v) {
2756 U32 sign;
2757 v = strip_sign(v, &sign);
2758
2759 F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0.0f)
2760 / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)),
2761 ctx->f);
2762
2763 return apply_sign(r, sign);
2764 };
2765 r = fn(r);
2766 g = fn(g);
2767 b = fn(b);
2768}
2769
2770STAGE(HLGish, const skcms_TransferFunction* ctx) {
2771 auto fn = [&](F v) {
2772 U32 sign;
2773 v = strip_sign(v, &sign);
2774
2775 const float R = ctx->a, G = ctx->b,
2776 a = ctx->c, b = ctx->d, c = ctx->e,
2777 K = ctx->f + 1.0f;
2778
2779 F r = if_then_else(v*R <= 1, approx_powf(v*R, G)
2780 , approx_exp((v-c)*a) + b);
2781
2782 return K * apply_sign(r, sign);
2783 };
2784 r = fn(r);
2785 g = fn(g);
2786 b = fn(b);
2787}
2788
2789STAGE(HLGinvish, const skcms_TransferFunction* ctx) {
2790 auto fn = [&](F v) {
2791 U32 sign;
2792 v = strip_sign(v, &sign);
2793
2794 const float R = ctx->a, G = ctx->b,
2795 a = ctx->c, b = ctx->d, c = ctx->e,
2796 K = ctx->f + 1.0f;
2797
2798 v /= K;
2799 F r = if_then_else(v <= 1, R * approx_powf(v, G)
2800 , a * approx_log(v - b) + c);
2801
2802 return apply_sign(r, sign);
2803 };
2804 r = fn(r);
2805 g = fn(g);
2806 b = fn(b);
2807}
2808
2809STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
2810 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2811
2812 r = g = b = F0;
2813 a = from_byte(load<U8>(ptr));
2814}
2815STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2816 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2817
2818 dr = dg = db = F0;
2819 da = from_byte(load<U8>(ptr));
2820}
2821STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
2822 const uint8_t* ptr;
2823 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2824 r = g = b = F0;
2825 a = from_byte(gather(ptr, ix));
2826}
2827STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
2828 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
2829
2830 U8 packed = pack(pack(to_unorm(a, 255)));
2831 store(ptr, packed);
2832}
2833STAGE(store_r8, const SkRasterPipeline_MemoryCtx* ctx) {
2834 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
2835
2836 U8 packed = pack(pack(to_unorm(r, 255)));
2837 store(ptr, packed);
2838}
2839
2840STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
2841 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2842
2843 from_565(load<U16>(ptr), &r,&g,&b);
2844 a = F1;
2845}
2846STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2847 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2848
2849 from_565(load<U16>(ptr), &dr,&dg,&db);
2850 da = F1;
2851}
2852STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
2853 const uint16_t* ptr;
2854 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2855 from_565(gather(ptr, ix), &r,&g,&b);
2856 a = F1;
2857}
2858STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
2859 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2860
2861 U16 px = pack( to_unorm(r, 31) << 11
2862 | to_unorm(g, 63) << 5
2863 | to_unorm(b, 31) );
2864 store(ptr, px);
2865}
2866
2867STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
2868 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2869 from_4444(load<U16>(ptr), &r,&g,&b,&a);
2870}
2871STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2872 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2873 from_4444(load<U16>(ptr), &dr,&dg,&db,&da);
2874}
2875STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
2876 const uint16_t* ptr;
2877 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2878 from_4444(gather(ptr, ix), &r,&g,&b,&a);
2879}
2880STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
2881 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2882 U16 px = pack( to_unorm(r, 15) << 12
2883 | to_unorm(g, 15) << 8
2884 | to_unorm(b, 15) << 4
2885 | to_unorm(a, 15) );
2886 store(ptr, px);
2887}
2888
2889STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2890 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2891 from_8888(load<U32>(ptr), &r,&g,&b,&a);
2892}
2893STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2894 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2895 from_8888(load<U32>(ptr), &dr,&dg,&db,&da);
2896}
2897STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
2898 const uint32_t* ptr;
2899 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2900 from_8888(gather(ptr, ix), &r,&g,&b,&a);
2901}
2902STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2903 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2904
2905 U32 px = to_unorm(r, 255)
2906 | to_unorm(g, 255) << 8
2907 | to_unorm(b, 255) << 16
2908 | to_unorm(a, 255) << 24;
2909 store(ptr, px);
2910}
2911
2912STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
2913 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2914 from_88(load<U16>(ptr), &r, &g);
2915 b = F0;
2916 a = F1;
2917}
2918STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2919 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2920 from_88(load<U16>(ptr), &dr, &dg);
2921 db = F0;
2922 da = F1;
2923}
2924STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
2925 const uint16_t* ptr;
2926 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2927 from_88(gather(ptr, ix), &r, &g);
2928 b = F0;
2929 a = F1;
2930}
2931STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
2932 auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy);
2933 U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) << 8 );
2934 store(ptr, px);
2935}
2936
2937STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2938 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2939 r = g = b = F0;
2940 a = from_short(load<U16>(ptr));
2941}
2942STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2943 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2944 dr = dg = db = F0;
2945 da = from_short(load<U16>(ptr));
2946}
2947STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) {
2948 const uint16_t* ptr;
2949 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2950 r = g = b = F0;
2951 a = from_short(gather(ptr, ix));
2952}
2953STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2954 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2955
2956 U16 px = pack(to_unorm(a, 65535));
2957 store(ptr, px);
2958}
2959
2960STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2961 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2962 b = F0;
2963 a = F1;
2964 from_1616(load<U32>(ptr), &r,&g);
2965}
2966STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2967 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2968 from_1616(load<U32>(ptr), &dr, &dg);
2969 db = F0;
2970 da = F1;
2971}
2972STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) {
2973 const uint32_t* ptr;
2974 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2975 from_1616(gather(ptr, ix), &r, &g);
2976 b = F0;
2977 a = F1;
2978}
2979STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2980 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2981
2982 U32 px = to_unorm(r, 65535)
2983 | to_unorm(g, 65535) << 16;
2984 store(ptr, px);
2985}
2986
2987STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2988 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2989 from_16161616(load<U64>(ptr), &r,&g, &b, &a);
2990}
2991STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2992 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2993 from_16161616(load<U64>(ptr), &dr, &dg, &db, &da);
2994}
2995STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) {
2996 const uint64_t* ptr;
2997 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2998 from_16161616(gather(ptr, ix), &r, &g, &b, &a);
2999}
3000STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
3001 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3002
3003 U16 R = pack(to_unorm(r, 65535)),
3004 G = pack(to_unorm(g, 65535)),
3005 B = pack(to_unorm(b, 65535)),
3006 A = pack(to_unorm(a, 65535));
3007
3008 store4(ptr, R,G,B,A);
3009}
3010
3011STAGE(load_10x6, const SkRasterPipeline_MemoryCtx* ctx) {
3012 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3013 from_10x6(load<U64>(ptr), &r,&g, &b, &a);
3014}
3015STAGE(load_10x6_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3016 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3017 from_10x6(load<U64>(ptr), &dr, &dg, &db, &da);
3018}
3019STAGE(gather_10x6, const SkRasterPipeline_GatherCtx* ctx) {
3020 const uint64_t* ptr;
3021 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3022 from_10x6(gather(ptr, ix), &r, &g, &b, &a);
3023}
3024STAGE(store_10x6, const SkRasterPipeline_MemoryCtx* ctx) {
3025 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3026
3027 U16 R = pack(to_unorm(r, 1023)) << 6,
3028 G = pack(to_unorm(g, 1023)) << 6,
3029 B = pack(to_unorm(b, 1023)) << 6,
3030 A = pack(to_unorm(a, 1023)) << 6;
3031
3032 store4(ptr, R,G,B,A);
3033}
3034
3035
3036STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
3037 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3038 from_1010102(load<U32>(ptr), &r,&g,&b,&a);
3039}
3040STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3041 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3042 from_1010102(load<U32>(ptr), &dr,&dg,&db,&da);
3043}
3044STAGE(load_1010102_xr, const SkRasterPipeline_MemoryCtx* ctx) {
3045 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3046 from_1010102_xr(load<U32>(ptr), &r,&g,&b,&a);
3047}
3048STAGE(load_1010102_xr_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3049 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3050 from_1010102_xr(load<U32>(ptr), &dr,&dg,&db,&da);
3051}
3052STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) {
3053 const uint32_t* ptr;
3054 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
3055 from_1010102(gather(ptr, ix), &r,&g,&b,&a);
3056}
3057STAGE(gather_1010102_xr, const SkRasterPipeline_GatherCtx* ctx) {
3058 const uint32_t* ptr;
3059 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3060 from_1010102_xr(gather(ptr, ix), &r,&g,&b,&a);
3061}
3062STAGE(gather_10101010_xr, const SkRasterPipeline_GatherCtx* ctx) {
3063 const uint64_t* ptr;
3064 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3065 from_10101010_xr(gather(ptr, ix), &r, &g, &b, &a);
3066}
3067STAGE(load_10101010_xr, const SkRasterPipeline_MemoryCtx* ctx) {
3068 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3069 from_10101010_xr(load<U64>(ptr), &r,&g, &b, &a);
3070}
3071STAGE(load_10101010_xr_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3072 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3073 from_10101010_xr(load<U64>(ptr), &dr, &dg, &db, &da);
3074}
3075STAGE(store_10101010_xr, const SkRasterPipeline_MemoryCtx* ctx) {
3076 static constexpr float min = -0.752941f;
3077 static constexpr float max = 1.25098f;
3078 static constexpr float range = max - min;
3079 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3080
3081 U16 R = pack(to_unorm((r - min) / range, 1023)) << 6,
3082 G = pack(to_unorm((g - min) / range, 1023)) << 6,
3083 B = pack(to_unorm((b - min) / range, 1023)) << 6,
3084 A = pack(to_unorm((a - min) / range, 1023)) << 6;
3085
3086 store4(ptr, R,G,B,A);
3087}
3088STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
3089 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
3090
3091 U32 px = to_unorm(r, 1023)
3092 | to_unorm(g, 1023) << 10
3093 | to_unorm(b, 1023) << 20
3094 | to_unorm(a, 3) << 30;
3095 store(ptr, px);
3096}
3097STAGE(store_1010102_xr, const SkRasterPipeline_MemoryCtx* ctx) {
3098 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
3099 static constexpr float min = -0.752941f;
3100 static constexpr float max = 1.25098f;
3101 static constexpr float range = max - min;
3102 U32 px = to_unorm((r - min) / range, 1023)
3103 | to_unorm((g - min) / range, 1023) << 10
3104 | to_unorm((b - min) / range, 1023) << 20
3105 | to_unorm(a, 3) << 30;
3106 store(ptr, px);
3107}
3108
3109STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) {
3110 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
3111
3112 U16 R,G,B,A;
3113 load4((const uint16_t*)ptr, &R,&G,&B,&A);
3114 r = from_half(R);
3115 g = from_half(G);
3116 b = from_half(B);
3117 a = from_half(A);
3118}
3119STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3120 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
3121
3122 U16 R,G,B,A;
3123 load4((const uint16_t*)ptr, &R,&G,&B,&A);
3124 dr = from_half(R);
3125 dg = from_half(G);
3126 db = from_half(B);
3127 da = from_half(A);
3128}
3129STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) {
3130 const uint64_t* ptr;
3131 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
3132 auto px = gather(ptr, ix);
3133
3134 U16 R,G,B,A;
3135 load4((const uint16_t*)&px, &R,&G,&B,&A);
3136 r = from_half(R);
3137 g = from_half(G);
3138 b = from_half(B);
3139 a = from_half(A);
3140}
3141STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) {
3142 auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
3143 store4((uint16_t*)ptr, to_half(r)
3144 , to_half(g)
3145 , to_half(b)
3146 , to_half(a));
3147}
3148
3149STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) {
3150 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
3151
3152 U16 A = load<U16>((const uint16_t*)ptr);
3153 r = F0;
3154 g = F0;
3155 b = F0;
3156 a = from_half(A);
3157}
3158STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3159 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
3160
3161 U16 A = load<U16>((const uint16_t*)ptr);
3162 dr = dg = db = F0;
3163 da = from_half(A);
3164}
3165STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) {
3166 const uint16_t* ptr;
3167 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3168 r = g = b = F0;
3169 a = from_half(gather(ptr, ix));
3170}
3171STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) {
3172 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
3173 store(ptr, to_half(a));
3174}
3175
3176STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
3177 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
3178
3179 U16 R,G;
3180 load2((const uint16_t*)ptr, &R, &G);
3181 r = from_half(R);
3182 g = from_half(G);
3183 b = F0;
3184 a = F1;
3185}
3186STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3187 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
3188
3189 U16 R,G;
3190 load2((const uint16_t*)ptr, &R, &G);
3191 dr = from_half(R);
3192 dg = from_half(G);
3193 db = F0;
3194 da = F1;
3195}
3196STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) {
3197 const uint32_t* ptr;
3198 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3199 auto px = gather(ptr, ix);
3200
3201 U16 R,G;
3202 load2((const uint16_t*)&px, &R, &G);
3203 r = from_half(R);
3204 g = from_half(G);
3205 b = F0;
3206 a = F1;
3207}
3208STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
3209 auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy);
3210 store2((uint16_t*)ptr, to_half(r)
3211 , to_half(g));
3212}
3213
3214STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) {
3215 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
3216 load4(ptr, &r,&g,&b,&a);
3217}
3218STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3219 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
3220 load4(ptr, &dr,&dg,&db,&da);
3221}
3222STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) {
3223 const float* ptr;
3224 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
3225 r = gather(ptr, 4*ix + 0);
3226 g = gather(ptr, 4*ix + 1);
3227 b = gather(ptr, 4*ix + 2);
3228 a = gather(ptr, 4*ix + 3);
3229}
3230STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) {
3231 auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy);
3232 store4(ptr, r,g,b,a);
3233}
3234
3235SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) {
3236 return v - floor_(v*ctx->invScale)*ctx->scale;
3237}
3238SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) {
3239 auto limit = ctx->scale;
3240 auto invLimit = ctx->invScale;
3241
3242 // This is "repeat" over the range 0..2*limit
3243 auto u = v - floor_(v*invLimit*0.5f)*2*limit;
3244 // s will be 0 when moving forward (e.g. [0, limit)) and 1 when moving backward (e.g.
3245 // [limit, 2*limit)).
3246 auto s = floor_(u*invLimit);
3247 // This is the mirror result.
3248 auto m = u - 2*s*(u - limit);
3249 // Apply a bias to m if moving backwards so that we snap consistently at exact integer coords in
3250 // the logical infinite image. This is tested by mirror_tile GM. Note that all values
3251 // that have a non-zero bias applied are > 0.
3252 auto biasInUlps = trunc_(s);
3253 return sk_bit_cast<F>(sk_bit_cast<U32>(m) + ctx->mirrorBiasDir*biasInUlps);
3254}
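// A worked example: with limit = 10 (so invLimit = 0.1) and v = 12, u = 12 and s = 1,
// giving m = 12 - 2*1*(12 - 10) = 8; i.e. 12 reflects back to 8. Because s != 0 there,
// m is also nudged by one ulp in the mirrorBiasDir direction, per the bias note above.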
3255// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
3256// The gather stages will hard clamp the output of these stages to [0,limit)...
3257// we just need to do the basic repeat or mirroring.
3258STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); }
3259STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); }
3260STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); }
3261STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); }
3262
3263STAGE( clamp_x_1, NoCtx) { r = clamp_01_(r); }
3264STAGE(repeat_x_1, NoCtx) { r = clamp_01_(r - floor_(r)); }
3265STAGE(mirror_x_1, NoCtx) { r = clamp_01_(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); }
3266
3267STAGE(clamp_x_and_y, const SkRasterPipeline_CoordClampCtx* ctx) {
3268 r = min(ctx->max_x, max(ctx->min_x, r));
3269 g = min(ctx->max_y, max(ctx->min_y, g));
3270}
3271
3272// Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain:
3273// mask == 0x00000000 if the coordinate(s) are out of bounds
3274// mask == 0xFFFFFFFF if the coordinate(s) are in bounds
3275// After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0
3276// if either of the coordinates were out of bounds.
3277
3278STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
3279 auto w = ctx->limit_x;
3280 auto e = ctx->inclusiveEdge_x;
3281 auto cond = ((0 < r) & (r < w)) | (r == e);
3282 sk_unaligned_store(ctx->mask, cond_to_mask(cond));
3283}
3284STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
3285 auto h = ctx->limit_y;
3286 auto e = ctx->inclusiveEdge_y;
3287 auto cond = ((0 < g) & (g < h)) | (g == e);
3288 sk_unaligned_store(ctx->mask, cond_to_mask(cond));
3289}
3290STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
3291 auto w = ctx->limit_x;
3292 auto h = ctx->limit_y;
3293 auto ex = ctx->inclusiveEdge_x;
3294 auto ey = ctx->inclusiveEdge_y;
3295 auto cond = (((0 < r) & (r < w)) | (r == ex))
3296 & (((0 < g) & (g < h)) | (g == ey));
3297 sk_unaligned_store(ctx->mask, cond_to_mask(cond));
3298}
3299STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
3300 auto mask = sk_unaligned_load<U32>(ctx->mask);
3301 r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask);
3302 g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask);
3303 b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask);
3304 a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask);
3305}
3306
3307STAGE(alpha_to_gray, NoCtx) {
3308 r = g = b = a;
3309 a = F1;
3310}
3311STAGE(alpha_to_gray_dst, NoCtx) {
3312 dr = dg = db = da;
3313 da = F1;
3314}
3315STAGE(alpha_to_red, NoCtx) {
3316 r = a;
3317 a = F1;
3318}
3319STAGE(alpha_to_red_dst, NoCtx) {
3320 dr = da;
3321 da = F1;
3322}
3323
3324STAGE(bt709_luminance_or_luma_to_alpha, NoCtx) {
3325 a = r*0.2126f + g*0.7152f + b*0.0722f;
3326 r = g = b = F0;
3327}
3328STAGE(bt709_luminance_or_luma_to_rgb, NoCtx) {
3329 r = g = b = r*0.2126f + g*0.7152f + b*0.0722f;
3330}
3331
3332STAGE(matrix_translate, const float* m) {
3333 r += m[0];
3334 g += m[1];
3335}
3336STAGE(matrix_scale_translate, const float* m) {
3337 r = mad(r,m[0], m[2]);
3338 g = mad(g,m[1], m[3]);
3339}
3340STAGE(matrix_2x3, const float* m) {
3341 auto R = mad(r,m[0], mad(g,m[1], m[2])),
3342 G = mad(r,m[3], mad(g,m[4], m[5]));
3343 r = R;
3344 g = G;
3345}
3346STAGE(matrix_3x3, const float* m) {
3347 auto R = mad(r,m[0], mad(g,m[3], b*m[6])),
3348 G = mad(r,m[1], mad(g,m[4], b*m[7])),
3349 B = mad(r,m[2], mad(g,m[5], b*m[8]));
3350 r = R;
3351 g = G;
3352 b = B;
3353}
3354STAGE(matrix_3x4, const float* m) {
3355 auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
3356 G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
3357 B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
3358 r = R;
3359 g = G;
3360 b = B;
3361}
3362STAGE(matrix_4x5, const float* m) {
3363 auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))),
3364 G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))),
3365 B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))),
3366 A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19]))));
3367 r = R;
3368 g = G;
3369 b = B;
3370 a = A;
3371}
3372STAGE(matrix_4x3, const float* m) {
3373 auto X = r,
3374 Y = g;
3375
3376 r = mad(X, m[0], mad(Y, m[4], m[ 8]));
3377 g = mad(X, m[1], mad(Y, m[5], m[ 9]));
3378 b = mad(X, m[2], mad(Y, m[6], m[10]));
3379 a = mad(X, m[3], mad(Y, m[7], m[11]));
3380}
3381STAGE(matrix_perspective, const float* m) {
3382 // N.B. Unlike the other matrix_ stages, this matrix is row-major.
3383 auto R = mad(r,m[0], mad(g,m[1], m[2])),
3384 G = mad(r,m[3], mad(g,m[4], m[5])),
3385 Z = mad(r,m[6], mad(g,m[7], m[8]));
3386 r = R * rcp_precise(Z);
3387 g = G * rcp_precise(Z);
3388}
3389
3390SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
3391 F* r, F* g, F* b, F* a) {
3392 F fr, br, fg, bg, fb, bb, fa, ba;
3393#if defined(JUMPER_IS_HSW)
3394 if (c->stopCount <=8) {
3395 fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), (__m256i)idx);
3396 br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), (__m256i)idx);
3397 fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), (__m256i)idx);
3398 bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), (__m256i)idx);
3399 fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), (__m256i)idx);
3400 bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), (__m256i)idx);
3401 fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), (__m256i)idx);
3402 ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), (__m256i)idx);
3403 } else
3404#elif defined(JUMPER_IS_LASX)
3405 if (c->stopCount <= 8) {
3406 fr = (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), idx);
3407 br = (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), idx);
3408 fg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), idx);
3409 bg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), idx);
3410 fb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), idx);
3411 bb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), idx);
3412 fa = (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), idx);
3413 ba = (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), idx);
3414 } else
3415#elif defined(JUMPER_IS_LSX)
3416 if (c->stopCount <= 4) {
3417 __m128i zero = __lsx_vldi(0);
3418 fr = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->fs[0], 0));
3419 br = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->bs[0], 0));
3420 fg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->fs[1], 0));
3421 bg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->bs[1], 0));
3422 fb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->fs[2], 0));
3423 bb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->bs[2], 0));
3424 fa = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->fs[3], 0));
3425 ba = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->bs[3], 0));
3426 } else
3427#endif
3428 {
3429 fr = gather(c->fs[0], idx);
3430 br = gather(c->bs[0], idx);
3431 fg = gather(c->fs[1], idx);
3432 bg = gather(c->bs[1], idx);
3433 fb = gather(c->fs[2], idx);
3434 bb = gather(c->bs[2], idx);
3435 fa = gather(c->fs[3], idx);
3436 ba = gather(c->bs[3], idx);
3437 }
3438
3439 *r = mad(t, fr, br);
3440 *g = mad(t, fg, bg);
3441 *b = mad(t, fb, bb);
3442 *a = mad(t, fa, ba);
3443}
3444
3445STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
3446 auto t = r;
3447 auto idx = trunc_(t * static_cast<float>(c->stopCount-1));
3448 gradient_lookup(c, idx, t, &r, &g, &b, &a);
3449}
3450
3451STAGE(gradient, const SkRasterPipeline_GradientCtx* c) {
3452 auto t = r;
3453 U32 idx = U32_(0);
3454
3455 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
3456 for (size_t i = 1; i < c->stopCount; i++) {
3457 idx += (U32)if_then_else(t >= c->ts[i], I32_(1), I32_(0));
3458 }
3459
3460 gradient_lookup(c, idx, t, &r, &g, &b, &a);
3461}
3462
3463STAGE(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
3464 auto t = r;
3465 r = mad(t, c->f[0], c->b[0]);
3466 g = mad(t, c->f[1], c->b[1]);
3467 b = mad(t, c->f[2], c->b[2]);
3468 a = mad(t, c->f[3], c->b[3]);
3469}
3470
3471STAGE(xy_to_unit_angle, NoCtx) {
3472 F X = r,
3473 Y = g;
3474 F xabs = abs_(X),
3475 yabs = abs_(Y);
3476
3477 F slope = min(xabs, yabs)/max(xabs, yabs);
3478 F s = slope * slope;
3479
3480 // Use a 7th degree polynomial to approximate atan.
3481 // The coefficients were generated using sollya.gforge.inria.fr; the following command
3482 // produces a float-optimized polynomial.
3483 // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
3484 F phi = slope
3485 * (0.15912117063999176025390625f + s
3486 * (-5.185396969318389892578125e-2f + s
3487 * (2.476101927459239959716796875e-2f + s
3488 * (-7.0547382347285747528076171875e-3f))));
3489
3490 phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
3491 phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi);
3492 phi = if_then_else(Y < 0.0f , 1.0f - phi , phi);
3493 phi = if_then_else(phi != phi , 0.0f , phi); // Check for NaN.
3494 r = phi;
3495}
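// Sanity check: at (X,Y) = (1,1) the slope is 1, the polynomial sums to roughly 0.125, and no
// quadrant adjustment applies -- 45 degrees expressed as a fraction of a full turn.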
3496
3497STAGE(xy_to_radius, NoCtx) {
3498 F X2 = r * r,
3499 Y2 = g * g;
3500 r = sqrt_(X2 + Y2);
3501}
3502
3503// Please see https://skia.org/dev/design/conical for how our 2pt conical shader works.
3504
3505STAGE(negate_x, NoCtx) { r = -r; }
3506
3507STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) {
3508 F x = r, y = g, &t = r;
3509 t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0
3510}
3511
3512STAGE(xy_to_2pt_conical_focal_on_circle, NoCtx) {
3513 F x = r, y = g, &t = r;
3514 t = x + y*y / x; // (x^2 + y^2) / x
3515}
3516
3517STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) {
3518 F x = r, y = g, &t = r;
3519 t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
3520}
3521
3522STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) {
3523 F x = r, y = g, &t = r;
3524 t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
3525}
3526
3527STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) {
3528 F x = r, y = g, &t = r;
3529 t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
3530}
3531
3532STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) {
3533 F& t = r;
3534 t = t + ctx->fP1; // ctx->fP1 = f
3535}
3536
3537STAGE(alter_2pt_conical_unswap, NoCtx) {
3538 F& t = r;
3539 t = 1 - t;
3540}
3541
3542STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) {
3543 F& t = r;
3544 auto is_degenerate = (t != t); // NaN
3545 t = if_then_else(is_degenerate, F0, t);
3547}
3548
3549STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
3550 F& t = r;
3551 auto is_degenerate = (t <= 0) | (t != t);
3552 t = if_then_else(is_degenerate, F0, t);
3554}
3555
3556STAGE(apply_vector_mask, const uint32_t* ctx) {
3557 const U32 mask = sk_unaligned_load<U32>(ctx);
3558 r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask);
3559 g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask);
3560 b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask);
3561 a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask);
3562}
3563
3564SI void save_xy(F* r, F* g, SkRasterPipeline_SamplerCtx* c) {
3565 // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
3566 // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
3567 // surrounding (x,y) at (0.5,0.5) off-center.
3568 F fx = fract(*r + 0.5f),
3569 fy = fract(*g + 0.5f);
3570
3571 // Samplers will need to load x and fx, or y and fy.
3572 sk_unaligned_store(c->x, *r);
3573 sk_unaligned_store(c->y, *g);
3574 sk_unaligned_store(c->fx, fx);
3575 sk_unaligned_store(c->fy, fy);
3576}
3577
3578STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
3579 // Bilinear and bicubic filters are both separable, so we produce independent contributions
3580 // from x and y, multiplying them together here to get each pixel's total scale factor.
3581 auto scale = sk_unaligned_load<F>(c->scalex)
3582 * sk_unaligned_load<F>(c->scaley);
3583 dr = mad(scale, r, dr);
3584 dg = mad(scale, g, dg);
3585 db = mad(scale, b, db);
3586 da = mad(scale, a, da);
3587}
3588
3589// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
3590// are combined in direct proportion to their area overlapping that logical query pixel.
3591// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
3592// The y-axis is symmetric.
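// For example, with fx = 0.25 the -0.5 sample gets weight 0.75 and the +0.5 sample gets 0.25,
// which is exactly what bilinear_x<-1> and bilinear_x<+1> below write into scalex.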
3593
3594template <int kScale>
3595SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
3596 *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
3597 F fx = sk_unaligned_load<F>(ctx->fx);
3598
3599 F scalex;
3600 if (kScale == -1) { scalex = 1.0f - fx; }
3601 if (kScale == +1) { scalex = fx; }
3602 sk_unaligned_store(ctx->scalex, scalex);
3603}
3604template <int kScale>
3605SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
3606 *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
3607 F fy = sk_unaligned_load<F>(ctx->fy);
3608
3609 F scaley;
3610 if (kScale == -1) { scaley = 1.0f - fy; }
3611 if (kScale == +1) { scaley = fy; }
3612 sk_unaligned_store(ctx->scaley, scaley);
3613}
3614
3615STAGE(bilinear_setup, SkRasterPipeline_SamplerCtx* ctx) {
3616 save_xy(&r, &g, ctx);
3617 // Init for accumulate
3618 dr = dg = db = da = F0;
3619}
3620
3621STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); }
3622STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); }
3623STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); }
3624STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); }
3625
3626
3627// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
3628// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
3629//
3630// This helper computes the total weight along one axis (our bicubic filter is separable), given one
3631// column of the sampling matrix, and a fractional pixel offset. See SkCubicResampler for details.
3632
3633SI F bicubic_wts(F t, float A, float B, float C, float D) {
3634 return mad(t, mad(t, mad(t, D, C), B), A);
3635}
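// Equivalently, each weight is the cubic A + B*t + C*t^2 + D*t^3, evaluated from one column of
// the sampling matrix that bicubic_setup below reads out of ctx->weights.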
3636
3637template <int kScale>
3638SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
3639 *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
3640
3641 F scalex;
3642 if (kScale == -3) { scalex = sk_unaligned_load<F>(ctx->wx[0]); }
3643 if (kScale == -1) { scalex = sk_unaligned_load<F>(ctx->wx[1]); }
3644 if (kScale == +1) { scalex = sk_unaligned_load<F>(ctx->wx[2]); }
3645 if (kScale == +3) { scalex = sk_unaligned_load<F>(ctx->wx[3]); }
3646 sk_unaligned_store(ctx->scalex, scalex);
3647}
3648template <int kScale>
3649SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
3650 *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
3651
3652 F scaley;
3653 if (kScale == -3) { scaley = sk_unaligned_load<F>(ctx->wy[0]); }
3654 if (kScale == -1) { scaley = sk_unaligned_load<F>(ctx->wy[1]); }
3655 if (kScale == +1) { scaley = sk_unaligned_load<F>(ctx->wy[2]); }
3656 if (kScale == +3) { scaley = sk_unaligned_load<F>(ctx->wy[3]); }
3657 sk_unaligned_store(ctx->scaley, scaley);
3658}
3659
3660STAGE(bicubic_setup, SkRasterPipeline_SamplerCtx* ctx) {
3661 save_xy(&r, &g, ctx);
3662
3663 const float* w = ctx->weights;
3664
3665 F fx = sk_unaligned_load<F>(ctx->fx);
3666 sk_unaligned_store(ctx->wx[0], bicubic_wts(fx, w[0], w[4], w[ 8], w[12]));
3667 sk_unaligned_store(ctx->wx[1], bicubic_wts(fx, w[1], w[5], w[ 9], w[13]));
3668 sk_unaligned_store(ctx->wx[2], bicubic_wts(fx, w[2], w[6], w[10], w[14]));
3669 sk_unaligned_store(ctx->wx[3], bicubic_wts(fx, w[3], w[7], w[11], w[15]));
3670
3671 F fy = sk_unaligned_load<F>(ctx->fy);
3672 sk_unaligned_store(ctx->wy[0], bicubic_wts(fy, w[0], w[4], w[ 8], w[12]));
3673 sk_unaligned_store(ctx->wy[1], bicubic_wts(fy, w[1], w[5], w[ 9], w[13]));
3674 sk_unaligned_store(ctx->wy[2], bicubic_wts(fy, w[2], w[6], w[10], w[14]));
3675 sk_unaligned_store(ctx->wy[3], bicubic_wts(fy, w[3], w[7], w[11], w[15]));
3676
3677 // Init for accumulate
3678 dr = dg = db = da = F0;
3679}
3680
3681STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); }
3682STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); }
3683STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); }
3684STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); }
3685
3686STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); }
3687STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); }
3688STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); }
3689STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); }
3690
3691SI F compute_perlin_vector(U32 sample, F x, F y) {
3692 // We're relying on the packing of uint16s within a uint32, which will vary based on endianness.
3693#ifdef SK_CPU_BENDIAN
3694 U32 sampleLo = sample >> 16;
3695 U32 sampleHi = sample & 0xFFFF;
3696#else
3697 U32 sampleLo = sample & 0xFFFF;
3698 U32 sampleHi = sample >> 16;
3699#endif
3700
3701 // Convert 32-bit sample value into two floats in the [-1..1] range.
3702 F vecX = mad(cast(sampleLo), 2.0f / 65535.0f, -1.0f);
3703 F vecY = mad(cast(sampleHi), 2.0f / 65535.0f, -1.0f);
3704
3705 // Return the dot of the sample and the passed-in vector.
3706 return mad(vecX, x,
3707 vecY * y);
3708}
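// For example (little-endian), sample = 0x8000FFFF splits into sampleLo = 0xFFFF and
// sampleHi = 0x8000, so vecX = +1.0 and vecY is nearly 0; the dot is then roughly x*1 + y*0.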
3709
3710STAGE(perlin_noise, SkRasterPipeline_PerlinNoiseCtx* ctx) {
3711 F noiseVecX = (r + 0.5) * ctx->baseFrequencyX;
3712 F noiseVecY = (g + 0.5) * ctx->baseFrequencyY;
3713 r = g = b = a = F0;
3714 F stitchDataX = F_(ctx->stitchDataInX);
3715 F stitchDataY = F_(ctx->stitchDataInY);
3716 F ratio = F1;
3717
3718 for (int octave = 0; octave < ctx->numOctaves; ++octave) {
3719 // Calculate noise coordinates. (Roughly $noise_helper in Graphite)
3720 F floorValX = floor_(noiseVecX);
3721 F floorValY = floor_(noiseVecY);
3722 F ceilValX = floorValX + 1.0f;
3723 F ceilValY = floorValY + 1.0f;
3724 F fractValX = noiseVecX - floorValX;
3725 F fractValY = noiseVecY - floorValY;
3726
3727 if (ctx->stitching) {
3728 // If we are stitching, wrap the coordinates to the stitch position.
3729 floorValX -= sk_bit_cast<F>(cond_to_mask(floorValX >= stitchDataX) &
3730 sk_bit_cast<I32>(stitchDataX));
3731 floorValY -= sk_bit_cast<F>(cond_to_mask(floorValY >= stitchDataY) &
3732 sk_bit_cast<I32>(stitchDataY));
3733 ceilValX -= sk_bit_cast<F>(cond_to_mask(ceilValX >= stitchDataX) &
3734 sk_bit_cast<I32>(stitchDataX));
3735 ceilValY -= sk_bit_cast<F>(cond_to_mask(ceilValY >= stitchDataY) &
3736 sk_bit_cast<I32>(stitchDataY));
3737 }
3738
3739 U32 latticeLookup = (U32)(iround(floorValX)) & 0xFF;
3740 F latticeIdxX = cast(expand(gather(ctx->latticeSelector, latticeLookup)));
3741 latticeLookup = (U32)(iround(ceilValX)) & 0xFF;
3742 F latticeIdxY = cast(expand(gather(ctx->latticeSelector, latticeLookup)));
3743
3744 U32 b00 = (U32)(iround(latticeIdxX + floorValY)) & 0xFF;
3745 U32 b10 = (U32)(iround(latticeIdxY + floorValY)) & 0xFF;
3746 U32 b01 = (U32)(iround(latticeIdxX + ceilValY)) & 0xFF;
3747 U32 b11 = (U32)(iround(latticeIdxY + ceilValY)) & 0xFF;
3748
3749 // Calculate noise colors. (Roughly $noise_function in Graphite)
3750 // Apply Hermite interpolation to the fractional value.
3751 F smoothX = fractValX * fractValX * (3.0f - 2.0f * fractValX);
3752 F smoothY = fractValY * fractValY * (3.0f - 2.0f * fractValY);
3753
3754 F color[4];
3755 const uint32_t* channelNoiseData = reinterpret_cast<const uint32_t*>(ctx->noiseData);
3756 for (int channel = 0; channel < 4; ++channel) {
3757 U32 sample00 = gather(channelNoiseData, b00);
3758 U32 sample10 = gather(channelNoiseData, b10);
3759 U32 sample01 = gather(channelNoiseData, b01);
3760 U32 sample11 = gather(channelNoiseData, b11);
3761 channelNoiseData += 256;
3762
3763 F u = compute_perlin_vector(sample00, fractValX, fractValY);
3764 F v = compute_perlin_vector(sample10, fractValX - 1.0f, fractValY);
3765 F A = lerp(u, v, smoothX);
3766
3767 u = compute_perlin_vector(sample01, fractValX, fractValY - 1.0f);
3768 v = compute_perlin_vector(sample11, fractValX - 1.0f, fractValY - 1.0f);
3769 F B = lerp(u, v, smoothX);
3770
3771 color[channel] = lerp(A, B, smoothY);
3772 }
3773
3774 if (ctx->noiseType == SkPerlinNoiseShaderType::kTurbulence) {
3775 // For kTurbulence the result is: abs(noise[-1,1])
3776 color[0] = abs_(color[0]);
3777 color[1] = abs_(color[1]);
3778 color[2] = abs_(color[2]);
3779 color[3] = abs_(color[3]);
3780 }
3781
3782 r = mad(color[0], ratio, r);
3783 g = mad(color[1], ratio, g);
3784 b = mad(color[2], ratio, b);
3785 a = mad(color[3], ratio, a);
3786
3787 // Scale inputs for the next round.
3788 noiseVecX *= 2.0f;
3789 noiseVecY *= 2.0f;
3790 stitchDataX *= 2.0f;
3791 stitchDataY *= 2.0f;
3792 ratio *= 0.5f;
3793 }
3794
3795 if (ctx->noiseType == SkPerlinNoiseShaderType::kFractalNoise) {
3796 // For kFractalNoise the result is: noise[-1,1] * 0.5 + 0.5
3797 r = mad(r, 0.5f, 0.5f);
3798 g = mad(g, 0.5f, 0.5f);
3799 b = mad(b, 0.5f, 0.5f);
3800 a = mad(a, 0.5f, 0.5f);
3801 }
3802
3803 r = clamp_01_(r) * a;
3804 g = clamp_01_(g) * a;
3805 b = clamp_01_(b) * a;
3806 a = clamp_01_(a);
3807}
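// Each octave doubles the noise frequency and halves its weight (ratio), so with numOctaves = 3
// the contributions are scaled by 1, 0.5, and 0.25 before the final clamp and premultiply.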
3808
3809STAGE(mipmap_linear_init, SkRasterPipeline_MipmapCtx* ctx) {
3810 sk_unaligned_store(ctx->x, r);
3811 sk_unaligned_store(ctx->y, g);
3812}
3813
3814STAGE(mipmap_linear_update, SkRasterPipeline_MipmapCtx* ctx) {
3815 sk_unaligned_store(ctx->r, r);
3816 sk_unaligned_store(ctx->g, g);
3817 sk_unaligned_store(ctx->b, b);
3818 sk_unaligned_store(ctx->a, a);
3819
3820 r = sk_unaligned_load<F>(ctx->x) * ctx->scaleX;
3821 g = sk_unaligned_load<F>(ctx->y) * ctx->scaleY;
3822}
3823
3824STAGE(mipmap_linear_finish, SkRasterPipeline_MipmapCtx* ctx) {
3825 r = lerp(sk_unaligned_load<F>(ctx->r), r, F_(ctx->lowerWeight));
3826 g = lerp(sk_unaligned_load<F>(ctx->g), g, F_(ctx->lowerWeight));
3827 b = lerp(sk_unaligned_load<F>(ctx->b), b, F_(ctx->lowerWeight));
3828 a = lerp(sk_unaligned_load<F>(ctx->a), a, F_(ctx->lowerWeight));
3829}
3830
3831STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
3832 store4(c->rgba, r,g,b,a);
3833 c->fn(c, N);
3834 load4(c->read_from, &r,&g,&b,&a);
3835}
3836
3837STAGE_TAIL(set_base_pointer, std::byte* p) {
3838 base = p;
3839}
3840
3841// All control flow stages used by SkSL maintain some state in the common registers:
3842// r: condition mask
3843// g: loop mask
3844// b: return mask
3845// a: execution mask (intersection of all three masks)
3846// After updating r/g/b, you must invoke update_execution_mask().
3847#define execution_mask() sk_bit_cast<I32>(a)
3848#define update_execution_mask() a = sk_bit_cast<F>(sk_bit_cast<I32>(r) & \
3849 sk_bit_cast<I32>(g) & \
3850 sk_bit_cast<I32>(b))
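// For example, load_condition_mask below writes r and then calls update_execution_mask(), so
// that a once again holds the intersection of the condition, loop, and return masks.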
3851
3852STAGE_TAIL(init_lane_masks, SkRasterPipeline_InitLaneMasksCtx* ctx) {
3853 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
3854 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
3855
3856 I32 mask = cond_to_mask(sk_unaligned_load<U32>(iota) < *ctx->tail);
3857 r = g = b = a = sk_bit_cast<F>(mask);
3858}
3859
3860STAGE_TAIL(store_device_xy01, F* dst) {
3861 // This is very similar to `seed_shader + store_src`, but b/a are backwards.
3862 // (sk_FragCoord actually puts w=1 in the w slot.)
3863 static constexpr float iota[] = {
3864 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
3865 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
3866 };
3867 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
3868
3869 dst[0] = cast(U32_(dx)) + sk_unaligned_load<F>(iota);
3870 dst[1] = cast(U32_(dy)) + 0.5f;
3871 dst[2] = F0;
3872 dst[3] = F1;
3873}
3874
3875STAGE_TAIL(exchange_src, F* rgba) {
3876 // Swaps r,g,b,a registers with the values at `rgba`.
3877 F temp[4] = {r, g, b, a};
3878 r = rgba[0];
3879 rgba[0] = temp[0];
3880 g = rgba[1];
3881 rgba[1] = temp[1];
3882 b = rgba[2];
3883 rgba[2] = temp[2];
3884 a = rgba[3];
3885 rgba[3] = temp[3];
3886}
3887
3888STAGE_TAIL(load_condition_mask, F* ctx) {
3889 r = sk_unaligned_load<F>(ctx);
3890 update_execution_mask();
3891}
3892
3893STAGE_TAIL(store_condition_mask, F* ctx) {
3894 sk_unaligned_store(ctx, r);
3895}
3896
3897STAGE_TAIL(merge_condition_mask, I32* ptr) {
3898 // Set the condition-mask to the intersection of two adjacent masks at the pointer.
3899 r = sk_bit_cast<F>(ptr[0] & ptr[1]);
3900 update_execution_mask();
3901}
3902
3903STAGE_TAIL(merge_inv_condition_mask, I32* ptr) {
3904 // Set the condition-mask to the intersection of the first mask and the inverse of the second.
3905 r = sk_bit_cast<F>(ptr[0] & ~ptr[1]);
3906 update_execution_mask();
3907}
3908
3909STAGE_TAIL(load_loop_mask, F* ctx) {
3910 g = sk_unaligned_load<F>(ctx);
3911 update_execution_mask();
3912}
3913
3914STAGE_TAIL(store_loop_mask, F* ctx) {
3915 sk_unaligned_store(ctx, g);
3916}
3917
3918STAGE_TAIL(mask_off_loop_mask, NoCtx) {
3919 // We encountered a break statement. If a lane was active, it should be masked off now, and stay
3920 // masked-off until the termination of the loop.
3921 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) & ~execution_mask());
3922 update_execution_mask();
3923}
3924
3925STAGE_TAIL(reenable_loop_mask, I32* ptr) {
3926 // Set the loop-mask to the union of the current loop-mask with the mask at the pointer.
3927 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) | ptr[0]);
3928 update_execution_mask();
3929}
3930
3931STAGE_TAIL(merge_loop_mask, I32* ptr) {
3932 // Set the loop-mask to the intersection of the current loop-mask with the mask at the pointer.
3933 // (Note: this behavior subtly differs from merge_condition_mask!)
3934 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) & ptr[0]);
3935 update_execution_mask();
3936}
3937
3938STAGE_TAIL(continue_op, I32* continueMask) {
3939 // Set any currently-executing lanes in the continue-mask to true.
3940 *continueMask |= execution_mask();
3941
3942 // Disable any currently-executing lanes from the loop mask. (Just like `mask_off_loop_mask`.)
3943 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) & ~execution_mask());
3944 update_execution_mask();
3945}
3946
3947STAGE_TAIL(case_op, SkRasterPipeline_CaseOpCtx* packed) {
3948 auto ctx = SkRPCtxUtils::Unpack(packed);
3949
3950 // Check each lane to see if the case value matches the expectation.
3951 I32* actualValue = (I32*)(base + ctx.offset);
3952 I32 caseMatches = cond_to_mask(*actualValue == ctx.expectedValue);
3953
3954 // In lanes where we found a match, enable the loop mask...
3955 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) | caseMatches);
3956 update_execution_mask();
3957
3958 // ... and clear the default-case mask.
3959 I32* defaultMask = actualValue + 1;
3960 *defaultMask &= ~caseMatches;
3961}
3962
3963STAGE_TAIL(load_return_mask, F* ctx) {
3964 b = sk_unaligned_load<F>(ctx);
3965 update_execution_mask();
3966}
3967
3968STAGE_TAIL(store_return_mask, F* ctx) {
3969 sk_unaligned_store(ctx, b);
3970}
3971
3972STAGE_TAIL(mask_off_return_mask, NoCtx) {
3973 // We encountered a return statement. If a lane was active, it should be masked off now, and
3974 // stay masked-off until the end of the function.
3975 b = sk_bit_cast<F>(sk_bit_cast<I32>(b) & ~execution_mask());
3976 update_execution_mask();
3977}
3978
3979STAGE_BRANCH(branch_if_all_lanes_active, SkRasterPipeline_BranchIfAllLanesActiveCtx* ctx) {
3980 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
3981 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
3982
3983 I32 tailLanes = cond_to_mask(*ctx->tail <= sk_unaligned_load<U32>(iota));
3984 return all(execution_mask() | tailLanes) ? ctx->offset : 1;
3985}
3986
3987STAGE_BRANCH(branch_if_any_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
3988 return any(execution_mask()) ? ctx->offset : 1;
3989}
3990
3991STAGE_BRANCH(branch_if_no_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
3992 return any(execution_mask()) ? 1 : ctx->offset;
3993}
3994
3995STAGE_BRANCH(jump, SkRasterPipeline_BranchCtx* ctx) {
3996 return ctx->offset;
3997}
3998
3999STAGE_BRANCH(branch_if_no_active_lanes_eq, SkRasterPipeline_BranchIfEqualCtx* ctx) {
4000 // Compare each lane against the expected value...
4001 I32 match = cond_to_mask(*(const I32*)ctx->ptr == ctx->value);
4002 // ... but mask off lanes that aren't executing.
4003 match &= execution_mask();
4004 // If any lanes matched, don't take the branch.
4005 return any(match) ? 1 : ctx->offset;
4006}
4007
4009 const I32* traceMask = (const I32*)ctx->traceMask;
4010 if (any(execution_mask() & *traceMask)) {
4011 ctx->traceHook->line(ctx->lineNumber);
4012 }
4013}
4014
4016 const I32* traceMask = (const I32*)ctx->traceMask;
4017 if (any(execution_mask() & *traceMask)) {
4018 ctx->traceHook->enter(ctx->funcIdx);
4019 }
4020}
4021
4023 const I32* traceMask = (const I32*)ctx->traceMask;
4024 if (any(execution_mask() & *traceMask)) {
4025 ctx->traceHook->exit(ctx->funcIdx);
4026 }
4027}
4028
4030 // Note that trace_scope intentionally does not incorporate the execution mask. Otherwise, the
4031 // scopes would become unbalanced if the execution mask changed in the middle of a block. The
4032 // caller is responsible for providing a combined trace- and execution-mask.
4033 const I32* traceMask = (const I32*)ctx->traceMask;
4034 if (any(*traceMask)) {
4035 ctx->traceHook->scope(ctx->delta);
4036 }
4037}
4038
4040 const I32* traceMask = (const I32*)ctx->traceMask;
4041 I32 mask = execution_mask() & *traceMask;
4042 if (any(mask)) {
4043 for (size_t lane = 0; lane < N; ++lane) {
4044 if (select_lane(mask, lane)) {
4045 const I32* data = (const I32*)ctx->data;
4046 int slotIdx = ctx->slotIdx, numSlots = ctx->numSlots;
4047 if (ctx->indirectOffset) {
4048 // If this was an indirect store, apply the indirect-offset to the data pointer.
4049 uint32_t indirectOffset = select_lane(*(const U32*)ctx->indirectOffset, lane);
4050 indirectOffset = std::min<uint32_t>(indirectOffset, ctx->indirectLimit);
4051 data += indirectOffset;
4052 slotIdx += indirectOffset;
4053 }
4054 while (numSlots--) {
4055 ctx->traceHook->var(slotIdx, select_lane(*data, lane));
4056 ++slotIdx;
4057 ++data;
4058 }
4059 break;
4060 }
4061 }
4062 }
4063}
4064
4066 const int* src = ctx->src;
4067 I32* dst = (I32*)ctx->dst;
4068 dst[0] = I32_(src[0]);
4069}
4071 const int* src = ctx->src;
4072 I32* dst = (I32*)ctx->dst;
4073 dst[0] = I32_(src[0]);
4074 dst[1] = I32_(src[1]);
4075}
4077 const int* src = ctx->src;
4078 I32* dst = (I32*)ctx->dst;
4079 dst[0] = I32_(src[0]);
4080 dst[1] = I32_(src[1]);
4081 dst[2] = I32_(src[2]);
4082}
4084 const int* src = ctx->src;
4085 I32* dst = (I32*)ctx->dst;
4086 dst[0] = I32_(src[0]);
4087 dst[1] = I32_(src[1]);
4088 dst[2] = I32_(src[2]);
4089 dst[3] = I32_(src[3]);
4090}
4091
4092STAGE_TAIL(splat_constant, SkRasterPipeline_ConstantCtx* packed) {
4093 auto ctx = SkRPCtxUtils::Unpack(packed);
4094 I32* dst = (I32*)(base + ctx.dst);
4095 I32 value = I32_(ctx.value);
4096 dst[0] = value;
4097}
4098STAGE_TAIL(splat_2_constants, SkRasterPipeline_ConstantCtx* packed) {
4099 auto ctx = SkRPCtxUtils::Unpack(packed);
4100 I32* dst = (I32*)(base + ctx.dst);
4101 I32 value = I32_(ctx.value);
4102 dst[0] = dst[1] = value;
4103}
4104STAGE_TAIL(splat_3_constants, SkRasterPipeline_ConstantCtx* packed) {
4105 auto ctx = SkRPCtxUtils::Unpack(packed);
4106 I32* dst = (I32*)(base + ctx.dst);
4107 I32 value = I32_(ctx.value);
4108 dst[0] = dst[1] = dst[2] = value;
4109}
4110STAGE_TAIL(splat_4_constants, SkRasterPipeline_ConstantCtx* packed) {
4111 auto ctx = SkRPCtxUtils::Unpack(packed);
4112 I32* dst = (I32*)(base + ctx.dst);
4113 I32 value = I32_(ctx.value);
4114 dst[0] = dst[1] = dst[2] = dst[3] = value;
4115}
4116
4117template <int NumSlots>
4118SI void copy_n_slots_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
4119 auto ctx = SkRPCtxUtils::Unpack(packed);
4120 F* dst = (F*)(base + ctx.dst);
4121 F* src = (F*)(base + ctx.src);
4122 memcpy(dst, src, sizeof(F) * NumSlots);
4123}
4124
4125STAGE_TAIL(copy_slot_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4126 copy_n_slots_unmasked_fn<1>(packed, base);
4127}
4128STAGE_TAIL(copy_2_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4129 copy_n_slots_unmasked_fn<2>(packed, base);
4130}
4131STAGE_TAIL(copy_3_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4132 copy_n_slots_unmasked_fn<3>(packed, base);
4133}
4134STAGE_TAIL(copy_4_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4135 copy_n_slots_unmasked_fn<4>(packed, base);
4136}
4137
4138template <int NumSlots>
4139SI void copy_n_immutable_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
4140 auto ctx = SkRPCtxUtils::Unpack(packed);
4141
4142 // Load the scalar values.
4143 float* src = (float*)(base + ctx.src);
4144 float values[NumSlots];
4145 SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
4146 values[index] = src[index];
4147 }
4148 // Broadcast the scalars into the destination.
4149 F* dst = (F*)(base + ctx.dst);
4150 SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
4151 dst[index] = F_(values[index]);
4152 }
4153}
4154
4155STAGE_TAIL(copy_immutable_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4156 copy_n_immutable_unmasked_fn<1>(packed, base);
4157}
4158STAGE_TAIL(copy_2_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4159 copy_n_immutable_unmasked_fn<2>(packed, base);
4160}
4161STAGE_TAIL(copy_3_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4162 copy_n_immutable_unmasked_fn<3>(packed, base);
4163}
4164STAGE_TAIL(copy_4_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4165 copy_n_immutable_unmasked_fn<4>(packed, base);
4166}
4167
4168template <int NumSlots>
4169SI void copy_n_slots_masked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base, I32 mask) {
4170 auto ctx = SkRPCtxUtils::Unpack(packed);
4171 I32* dst = (I32*)(base + ctx.dst);
4172 I32* src = (I32*)(base + ctx.src);
4173 SK_UNROLL for (int count = 0; count < NumSlots; ++count) {
4174 *dst = if_then_else(mask, *src, *dst);
4175 dst += 1;
4176 src += 1;
4177 }
4178}
4179
4180STAGE_TAIL(copy_slot_masked, SkRasterPipeline_BinaryOpCtx* packed) {
4181 copy_n_slots_masked_fn<1>(packed, base, execution_mask());
4182}
4183STAGE_TAIL(copy_2_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
4184 copy_n_slots_masked_fn<2>(packed, base, execution_mask());
4185}
4186STAGE_TAIL(copy_3_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
4187 copy_n_slots_masked_fn<3>(packed, base, execution_mask());
4188}
4189STAGE_TAIL(copy_4_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
4190 copy_n_slots_masked_fn<4>(packed, base, execution_mask());
4191}
4192
4193template <int LoopCount, typename OffsetType>
4194SI void shuffle_fn(std::byte* ptr, OffsetType* offsets, int numSlots) {
4195 F scratch[16];
4196 SK_UNROLL for (int count = 0; count < LoopCount; ++count) {
4197 scratch[count] = *(F*)(ptr + offsets[count]);
4198 }
4199 // Surprisingly, this switch generates significantly better code than a memcpy (on x86-64) when
4200 // the number of slots is unknown at compile time, and generates roughly identical code when the
4201 // number of slots is hardcoded. Using a switch allows `scratch` to live in ymm0-ymm15 instead
4202 // of being written out to the stack and then read back in. Also, the intrinsic memcpy assumes
4203 // that `numSlots` could be arbitrarily large, and so it emits more code than we need.
4204 F* dst = (F*)ptr;
4205 switch (numSlots) {
4206 case 16: dst[15] = scratch[15]; [[fallthrough]];
4207 case 15: dst[14] = scratch[14]; [[fallthrough]];
4208 case 14: dst[13] = scratch[13]; [[fallthrough]];
4209 case 13: dst[12] = scratch[12]; [[fallthrough]];
4210 case 12: dst[11] = scratch[11]; [[fallthrough]];
4211 case 11: dst[10] = scratch[10]; [[fallthrough]];
4212 case 10: dst[ 9] = scratch[ 9]; [[fallthrough]];
4213 case 9: dst[ 8] = scratch[ 8]; [[fallthrough]];
4214 case 8: dst[ 7] = scratch[ 7]; [[fallthrough]];
4215 case 7: dst[ 6] = scratch[ 6]; [[fallthrough]];
4216 case 6: dst[ 5] = scratch[ 5]; [[fallthrough]];
4217 case 5: dst[ 4] = scratch[ 4]; [[fallthrough]];
4218 case 4: dst[ 3] = scratch[ 3]; [[fallthrough]];
4219 case 3: dst[ 2] = scratch[ 2]; [[fallthrough]];
4220 case 2: dst[ 1] = scratch[ 1]; [[fallthrough]];
4221 case 1: dst[ 0] = scratch[ 0];
4222 }
4223}
4224
4225template <int N>
4227 auto ctx = SkRPCtxUtils::Unpack(packed);
4228 shuffle_fn<N>(base + ctx.dst, ctx.offsets, N);
4229}
4230
4232 small_swizzle_fn<1>(packed, base);
4233}
4235 small_swizzle_fn<2>(packed, base);
4236}
4238 small_swizzle_fn<3>(packed, base);
4239}
4241 small_swizzle_fn<4>(packed, base);
4242}
4244 shuffle_fn<16>((std::byte*)ctx->ptr, ctx->offsets, ctx->count);
4245}
4246
4247template <int NumSlots>
4248SI void swizzle_copy_masked_fn(I32* dst, const I32* src, uint16_t* offsets, I32 mask) {
4249 std::byte* dstB = (std::byte*)dst;
4250 SK_UNROLL for (int count = 0; count < NumSlots; ++count) {
4251 I32* dstS = (I32*)(dstB + *offsets);
4252 *dstS = if_then_else(mask, *src, *dstS);
4253 offsets += 1;
4254 src += 1;
4255 }
4256}
4257
4258STAGE_TAIL(swizzle_copy_slot_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
4259 swizzle_copy_masked_fn<1>((I32*)ctx->dst, (const I32*)ctx->src, ctx->offsets, execution_mask());
4260}
4261STAGE_TAIL(swizzle_copy_2_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
4262 swizzle_copy_masked_fn<2>((I32*)ctx->dst, (const I32*)ctx->src, ctx->offsets, execution_mask());
4263}
4264STAGE_TAIL(swizzle_copy_3_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
4265 swizzle_copy_masked_fn<3>((I32*)ctx->dst, (const I32*)ctx->src, ctx->offsets, execution_mask());
4266}
4267STAGE_TAIL(swizzle_copy_4_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
4268 swizzle_copy_masked_fn<4>((I32*)ctx->dst, (const I32*)ctx->src, ctx->offsets, execution_mask());
4269}
4270
4271STAGE_TAIL(copy_from_indirect_unmasked, SkRasterPipeline_CopyIndirectCtx* ctx) {
4272 // Clamp the indirect offsets to stay within the limit.
4273 U32 offsets = *(const U32*)ctx->indirectOffset;
4275
4276 // Scale up the offsets to account for the N lanes per value.
4277 offsets *= N;
4278
4279 // Adjust the offsets forward so that they fetch from the correct lane.
4280 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4281 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
4282 offsets += sk_unaligned_load<U32>(iota);
4283
4284 // Use gather to perform indirect lookups; write the results into `dst`.
4285 const int* src = ctx->src;
4286 I32* dst = (I32*)ctx->dst;
4287 I32* end = dst + ctx->slots;
4288 do {
4289 *dst = gather(src, offsets);
4290 dst += 1;
4291 src += N;
4292 } while (dst != end);
4293}
4294
4295STAGE_TAIL(copy_from_indirect_uniform_unmasked, SkRasterPipeline_CopyIndirectCtx* ctx) {
4296 // Clamp the indirect offsets to stay within the limit.
4297 U32 offsets = *(const U32*)ctx->indirectOffset;
4299
4300 // Use gather to perform indirect lookups; write the results into `dst`.
4301 const int* src = ctx->src;
4302 I32* dst = (I32*)ctx->dst;
4303 I32* end = dst + ctx->slots;
4304 do {
4305 *dst = gather(src, offsets);
4306 dst += 1;
4307 src += 1;
4308 } while (dst != end);
4309}
4310
4311STAGE_TAIL(copy_to_indirect_masked, SkRasterPipeline_CopyIndirectCtx* ctx) {
4312 // Clamp the indirect offsets to stay within the limit.
4313 U32 offsets = *(const U32*)ctx->indirectOffset;
4315
4316 // Scale up the offsets to account for the N lanes per value.
4317 offsets *= N;
4318
4319 // Adjust the offsets forward so that they store into the correct lane.
4320 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4321 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
4322 offsets += sk_unaligned_load<U32>(iota);
4323
4324 // Perform indirect, masked writes into `dst`.
4325 const I32* src = (const I32*)ctx->src;
4326 const I32* end = src + ctx->slots;
4327 int* dst = ctx->dst;
4328 I32 mask = execution_mask();
4329 do {
4330 scatter_masked(*src, dst, offsets, mask);
4331 dst += N;
4332 src += 1;
4333 } while (src != end);
4334}
4335
4336STAGE_TAIL(swizzle_copy_to_indirect_masked, SkRasterPipeline_SwizzleCopyIndirectCtx* ctx) {
4337 // Clamp the indirect offsets to stay within the limit.
4338 U32 offsets = *(const U32*)ctx->indirectOffset;
4340
4341 // Scale up the offsets to account for the N lanes per value.
4342 offsets *= N;
4343
4344 // Adjust the offsets forward so that they store into the correct lane.
4345 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4346 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
4347 offsets += sk_unaligned_load<U32>(iota);
4348
4349 // Perform indirect, masked, swizzled writes into `dst`.
4350 const I32* src = (const I32*)ctx->src;
4351 const I32* end = src + ctx->slots;
4352 std::byte* dstB = (std::byte*)ctx->dst;
4353 const uint16_t* swizzle = ctx->offsets;
4354 I32 mask = execution_mask();
4355 do {
4356 int* dst = (int*)(dstB + *swizzle);
4357 scatter_masked(*src, dst, offsets, mask);
4358 swizzle += 1;
4359 src += 1;
4360 } while (src != end);
4361}
4362
4363// Unary operations take a single input, and overwrite it with their output.
4364// Unlike binary or ternary operations, we provide variations of 1-4 slots, but don't provide
4365// an arbitrary-width "n-slot" variation; the Builder can chain together longer sequences manually.
4366template <typename T, void (*ApplyFn)(T*)>
4367SI void apply_adjacent_unary(T* dst, T* end) {
4368 do {
4369 ApplyFn(dst);
4370 dst += 1;
4371 } while (dst != end);
4372}
4373
4374#if defined(JUMPER_IS_SCALAR)
4375template <typename T>
4376SI void cast_to_float_from_fn(T* dst) {
4377 *dst = sk_bit_cast<T>((F)*dst);
4378}
4379SI void cast_to_int_from_fn(F* dst) {
4380 *dst = sk_bit_cast<F>((I32)*dst);
4381}
4382SI void cast_to_uint_from_fn(F* dst) {
4383 *dst = sk_bit_cast<F>((U32)*dst);
4384}
4385#else
4386template <typename T>
4387SI void cast_to_float_from_fn(T* dst) {
4388 *dst = sk_bit_cast<T>(__builtin_convertvector(*dst, F));
4389}
4390SI void cast_to_int_from_fn(F* dst) {
4391 *dst = sk_bit_cast<F>(__builtin_convertvector(*dst, I32));
4392}
4393SI void cast_to_uint_from_fn(F* dst) {
4394 *dst = sk_bit_cast<F>(__builtin_convertvector(*dst, U32));
4395}
4396#endif
4397
4398SI void abs_fn(F* dst) {
4399 *dst = abs_(*dst);
4400}
4401
4402SI void floor_fn(F* dst) {
4403 *dst = floor_(*dst);
4404}
4405
4406SI void ceil_fn(F* dst) {
4407 *dst = ceil_(*dst);
4408}
4409
4410SI void invsqrt_fn(F* dst) {
4411 *dst = rsqrt(*dst);
4412}
4413
4414#define DECLARE_UNARY_FLOAT(name) \
4415 STAGE_TAIL(name##_float, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 1); } \
4416 STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 2); } \
4417 STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 3); } \
4418 STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 4); }
4419
4420#define DECLARE_UNARY_INT(name) \
4421 STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 1); } \
4422 STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 2); } \
4423 STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 3); } \
4424 STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 4); }
4425
4426#define DECLARE_UNARY_UINT(name) \
4427 STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 1); } \
4428 STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 2); } \
4429 STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 3); } \
4430 STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 4); }
4431
4432DECLARE_UNARY_INT(cast_to_float_from) DECLARE_UNARY_UINT(cast_to_float_from)
4433DECLARE_UNARY_FLOAT(cast_to_int_from)
4434DECLARE_UNARY_FLOAT(cast_to_uint_from)
4437DECLARE_UNARY_FLOAT(invsqrt)
4439
4440#undef DECLARE_UNARY_FLOAT
4441#undef DECLARE_UNARY_INT
4442#undef DECLARE_UNARY_UINT
4443
4444// For complex unary ops, we only provide a 1-slot version to reduce code bloat.
4445STAGE_TAIL(sin_float, F* dst) { *dst = sin_(*dst); }
4446STAGE_TAIL(cos_float, F* dst) { *dst = cos_(*dst); }
4447STAGE_TAIL(tan_float, F* dst) { *dst = tan_(*dst); }
4448STAGE_TAIL(asin_float, F* dst) { *dst = asin_(*dst); }
4449STAGE_TAIL(acos_float, F* dst) { *dst = acos_(*dst); }
4450STAGE_TAIL(atan_float, F* dst) { *dst = atan_(*dst); }
4451STAGE_TAIL(sqrt_float, F* dst) { *dst = sqrt_(*dst); }
4452STAGE_TAIL(exp_float, F* dst) { *dst = approx_exp(*dst); }
4453STAGE_TAIL(exp2_float, F* dst) { *dst = approx_pow2(*dst); }
4454STAGE_TAIL(log_float, F* dst) { *dst = approx_log(*dst); }
4455STAGE_TAIL(log2_float, F* dst) { *dst = approx_log2(*dst); }
4456
4457STAGE_TAIL(inverse_mat2, F* dst) {
4458 F a00 = dst[0], a01 = dst[1],
4459 a10 = dst[2], a11 = dst[3];
4460 F det = nmad(a01, a10, a00 * a11),
4461 invdet = rcp_precise(det);
4462 dst[0] = invdet * a11;
4463 dst[1] = -invdet * a01;
4464 dst[2] = -invdet * a10;
4465 dst[3] = invdet * a00;
4466}
4467
4468STAGE_TAIL(inverse_mat3, F* dst) {
4469 F a00 = dst[0], a01 = dst[1], a02 = dst[2],
4470 a10 = dst[3], a11 = dst[4], a12 = dst[5],
4471 a20 = dst[6], a21 = dst[7], a22 = dst[8];
4472 F b01 = nmad(a12, a21, a22 * a11),
4473 b11 = nmad(a22, a10, a12 * a20),
4474 b21 = nmad(a11, a20, a21 * a10);
4475 F det = mad(a00, b01, mad(a01, b11, a02 * b21)),
4476 invdet = rcp_precise(det);
4477 dst[0] = invdet * b01;
4478 dst[1] = invdet * nmad(a22, a01, a02 * a21);
4479 dst[2] = invdet * nmad(a02, a11, a12 * a01);
4480 dst[3] = invdet * b11;
4481 dst[4] = invdet * nmad(a02, a20, a22 * a00);
4482 dst[5] = invdet * nmad(a12, a00, a02 * a10);
4483 dst[6] = invdet * b21;
4484 dst[7] = invdet * nmad(a21, a00, a01 * a20);
4485 dst[8] = invdet * nmad(a01, a10, a11 * a00);
4486}
4487
4488STAGE_TAIL(inverse_mat4, F* dst) {
4489 F a00 = dst[0], a01 = dst[1], a02 = dst[2], a03 = dst[3],
4490 a10 = dst[4], a11 = dst[5], a12 = dst[6], a13 = dst[7],
4491 a20 = dst[8], a21 = dst[9], a22 = dst[10], a23 = dst[11],
4492 a30 = dst[12], a31 = dst[13], a32 = dst[14], a33 = dst[15];
4493 F b00 = nmad(a01, a10, a00 * a11),
4494 b01 = nmad(a02, a10, a00 * a12),
4495 b02 = nmad(a03, a10, a00 * a13),
4496 b03 = nmad(a02, a11, a01 * a12),
4497 b04 = nmad(a03, a11, a01 * a13),
4498 b05 = nmad(a03, a12, a02 * a13),
4499 b06 = nmad(a21, a30, a20 * a31),
4500 b07 = nmad(a22, a30, a20 * a32),
4501 b08 = nmad(a23, a30, a20 * a33),
4502 b09 = nmad(a22, a31, a21 * a32),
4503 b10 = nmad(a23, a31, a21 * a33),
4504 b11 = nmad(a23, a32, a22 * a33),
4505 det = mad(b00, b11, b05 * b06) + mad(b02, b09, b03 * b08) - mad(b01, b10, b04 * b07),
4506 invdet = rcp_precise(det);
4507 b00 *= invdet;
4508 b01 *= invdet;
4509 b02 *= invdet;
4510 b03 *= invdet;
4511 b04 *= invdet;
4512 b05 *= invdet;
4513 b06 *= invdet;
4514 b07 *= invdet;
4515 b08 *= invdet;
4516 b09 *= invdet;
4517 b10 *= invdet;
4518 b11 *= invdet;
4519 dst[0] = mad(a13, b09, nmad(a12, b10, a11*b11));
4520 dst[1] = nmad(a03, b09, nmad(a01, b11, a02*b10));
4521 dst[2] = mad(a33, b03, nmad(a32, b04, a31*b05));
4522 dst[3] = nmad(a23, b03, nmad(a21, b05, a22*b04));
4523 dst[4] = nmad(a13, b07, nmad(a10, b11, a12*b08));
4524 dst[5] = mad(a03, b07, nmad(a02, b08, a00*b11));
4525 dst[6] = nmad(a33, b01, nmad(a30, b05, a32*b02));
4526 dst[7] = mad(a23, b01, nmad(a22, b02, a20*b05));
4527 dst[8] = mad(a13, b06, nmad(a11, b08, a10*b10));
4528 dst[9] = nmad(a03, b06, nmad(a00, b10, a01*b08));
4529 dst[10] = mad(a33, b00, nmad(a31, b02, a30*b04));
4530 dst[11] = nmad(a23, b00, nmad(a20, b04, a21*b02));
4531 dst[12] = nmad(a12, b06, nmad(a10, b09, a11*b07));
4532 dst[13] = mad(a02, b06, nmad(a01, b07, a00*b09));
4533 dst[14] = nmad(a32, b00, nmad(a30, b03, a31*b01));
4534 dst[15] = mad(a22, b00, nmad(a21, b01, a20*b03));
4535}
4536
4537// Binary operations take two adjacent inputs, and write their output in the first position.
4538template <typename T, void (*ApplyFn)(T*, T*)>
4539SI void apply_adjacent_binary(T* dst, T* src) {
4540 T* end = src;
4541 do {
4542 ApplyFn(dst, src);
4543 dst += 1;
4544 src += 1;
4545 } while (dst != end);
4546}
4547
4548template <typename T, void (*ApplyFn)(T*, T*)>
4549SI void apply_adjacent_binary_packed(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
4550 auto ctx = SkRPCtxUtils::Unpack(packed);
4551 std::byte* dst = base + ctx.dst;
4552 std::byte* src = base + ctx.src;
4553 apply_adjacent_binary<T, ApplyFn>((T*)dst, (T*)src);
4554}
4555
4556template <int N, typename V, typename S, void (*ApplyFn)(V*, V*)>
4557SI void apply_binary_immediate(SkRasterPipeline_ConstantCtx* packed, std::byte* base) {
4558 auto ctx = SkRPCtxUtils::Unpack(packed);
4559 V* dst = (V*)(base + ctx.dst); // get a pointer to the destination
4560 S scalar = sk_bit_cast<S>(ctx.value); // bit-pun the constant value as desired
4561 V src = scalar - V(); // broadcast the constant value into a vector
4562 SK_UNROLL for (int index = 0; index < N; ++index) {
4563 ApplyFn(dst, &src); // perform the operation
4564 dst += 1;
4565 }
4566}
4567
4568template <typename T>
4569SI void add_fn(T* dst, T* src) {
4570 *dst += *src;
4571}
4572
4573template <typename T>
4574SI void sub_fn(T* dst, T* src) {
4575 *dst -= *src;
4576}
4577
4578template <typename T>
4579SI void mul_fn(T* dst, T* src) {
4580 *dst *= *src;
4581}
4582
4583template <typename T>
4584SI void div_fn(T* dst, T* src) {
4585 T divisor = *src;
4586 if constexpr (!std::is_same_v<T, F>) {
4587 // We will crash if we integer-divide against zero. Convert 0 to ~0 to avoid this.
4588 divisor |= (T)cond_to_mask(divisor == 0);
4589 }
4590 *dst /= divisor;
4591}
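// With that adjustment, an integer lane dividing by zero instead divides by ~0 (-1 for I32),
// producing a harmless result rather than a fault; integer division by zero is undefined for the
// shader anyway, so any consistent value is acceptable.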
4592
4593SI void bitwise_and_fn(I32* dst, I32* src) {
4594 *dst &= *src;
4595}
4596
4597SI void bitwise_or_fn(I32* dst, I32* src) {
4598 *dst |= *src;
4599}
4600
4601SI void bitwise_xor_fn(I32* dst, I32* src) {
4602 *dst ^= *src;
4603}
4604
4605template <typename T>
4606SI void max_fn(T* dst, T* src) {
4607 *dst = max(*dst, *src);
4608}
4609
4610template <typename T>
4611SI void min_fn(T* dst, T* src) {
4612 *dst = min(*dst, *src);
4613}
4614
4615template <typename T>
4616SI void cmplt_fn(T* dst, T* src) {
4617 static_assert(sizeof(T) == sizeof(I32));
4618 I32 result = cond_to_mask(*dst < *src);
4619 memcpy(dst, &result, sizeof(I32));
4620}
4621
4622template <typename T>
4623SI void cmple_fn(T* dst, T* src) {
4624 static_assert(sizeof(T) == sizeof(I32));
4625 I32 result = cond_to_mask(*dst <= *src);
4626 memcpy(dst, &result, sizeof(I32));
4627}
4628
4629template <typename T>
4630SI void cmpeq_fn(T* dst, T* src) {
4631 static_assert(sizeof(T) == sizeof(I32));
4632 I32 result = cond_to_mask(*dst == *src);
4633 memcpy(dst, &result, sizeof(I32));
4634}
4635
4636template <typename T>
4637SI void cmpne_fn(T* dst, T* src) {
4638 static_assert(sizeof(T) == sizeof(I32));
4639 I32 result = cond_to_mask(*dst != *src);
4640 memcpy(dst, &result, sizeof(I32));
4641}
4642
4643SI void atan2_fn(F* dst, F* src) {
4644 *dst = atan2_(*dst, *src);
4645}
4646
4647SI void pow_fn(F* dst, F* src) {
4648 *dst = approx_powf(*dst, *src);
4649}
4650
4651SI void mod_fn(F* dst, F* src) {
4652 *dst = nmad(*src, floor_(*dst / *src), *dst);
4653}
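// e.g. with *dst = 5.5 and *src = 2, this computes 5.5 - 2*floor(2.75) = 1.5, matching GLSL mod().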
4654
4655#define DECLARE_N_WAY_BINARY_FLOAT(name) \
4656 STAGE_TAIL(name##_n_floats, SkRasterPipeline_BinaryOpCtx* packed) { \
4657 apply_adjacent_binary_packed<F, &name##_fn>(packed, base); \
4658 }
4659
4660#define DECLARE_BINARY_FLOAT(name) \
4661 STAGE_TAIL(name##_float, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 1); } \
4662 STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 2); } \
4663 STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 3); } \
4664 STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 4); } \
4665 DECLARE_N_WAY_BINARY_FLOAT(name)
4666
4667#define DECLARE_N_WAY_BINARY_INT(name) \
4668 STAGE_TAIL(name##_n_ints, SkRasterPipeline_BinaryOpCtx* packed) { \
4669 apply_adjacent_binary_packed<I32, &name##_fn>(packed, base); \
4670 }
4671
4672#define DECLARE_BINARY_INT(name) \
4673 STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 1); } \
4674 STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 2); } \
4675 STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 3); } \
4676 STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 4); } \
4677 DECLARE_N_WAY_BINARY_INT(name)
4678
4679#define DECLARE_N_WAY_BINARY_UINT(name) \
4680 STAGE_TAIL(name##_n_uints, SkRasterPipeline_BinaryOpCtx* packed) { \
4681 apply_adjacent_binary_packed<U32, &name##_fn>(packed, base); \
4682 }
4683
4684#define DECLARE_BINARY_UINT(name) \
4685 STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 1); } \
4686 STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 2); } \
4687 STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 3); } \
4688 STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 4); } \
4689 DECLARE_N_WAY_BINARY_UINT(name)
4690
4691// Many ops reuse the int stages when performing uint arithmetic, since they're equivalent on a
4692// two's-complement machine. (Even multiplication is equivalent in the lower 32 bits.)
4697 DECLARE_BINARY_INT(bitwise_and)
4698 DECLARE_BINARY_INT(bitwise_or)
4699 DECLARE_BINARY_INT(bitwise_xor)
4707
4708// Sufficiently complex ops only provide an N-way version, to avoid code bloat from the dedicated
4709// 1-4 slot versions.
4712
4713// Some ops have an optimized version when the right-side is an immediate value.
4714#define DECLARE_IMM_BINARY_FLOAT(name) \
4715 STAGE_TAIL(name##_imm_float, SkRasterPipeline_ConstantCtx* packed) { \
4716 apply_binary_immediate<1, F, float, &name##_fn>(packed, base); \
4717 }
4718#define DECLARE_IMM_BINARY_INT(name) \
4719 STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) { \
4720 apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
4721 }
4722#define DECLARE_MULTI_IMM_BINARY_INT(name) \
4723 STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) { \
4724 apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
4725 } \
4726 STAGE_TAIL(name##_imm_2_ints, SkRasterPipeline_ConstantCtx* packed) { \
4727 apply_binary_immediate<2, I32, int32_t, &name##_fn>(packed, base); \
4728 } \
4729 STAGE_TAIL(name##_imm_3_ints, SkRasterPipeline_ConstantCtx* packed) { \
4730 apply_binary_immediate<3, I32, int32_t, &name##_fn>(packed, base); \
4731 } \
4732 STAGE_TAIL(name##_imm_4_ints, SkRasterPipeline_ConstantCtx* packed) { \
4733 apply_binary_immediate<4, I32, int32_t, &name##_fn>(packed, base); \
4734 }
4735#define DECLARE_IMM_BINARY_UINT(name) \
4736 STAGE_TAIL(name##_imm_uint, SkRasterPipeline_ConstantCtx* packed) { \
4737 apply_binary_immediate<1, U32, uint32_t, &name##_fn>(packed, base); \
4738 }
4739
4742 DECLARE_MULTI_IMM_BINARY_INT(bitwise_and)
4745 DECLARE_IMM_BINARY_INT(bitwise_xor)
4750
4751#undef DECLARE_MULTI_IMM_BINARY_INT
4752#undef DECLARE_IMM_BINARY_FLOAT
4753#undef DECLARE_IMM_BINARY_INT
4754#undef DECLARE_IMM_BINARY_UINT
4755#undef DECLARE_BINARY_FLOAT
4756#undef DECLARE_BINARY_INT
4757#undef DECLARE_BINARY_UINT
4758#undef DECLARE_N_WAY_BINARY_FLOAT
4759#undef DECLARE_N_WAY_BINARY_INT
4760#undef DECLARE_N_WAY_BINARY_UINT
4761
4762// Dots can be represented with multiply and add ops, but they are so foundational that it's worth
4763// having dedicated ops.
4764STAGE_TAIL(dot_2_floats, F* dst) {
4765 dst[0] = mad(dst[0], dst[2],
4766 dst[1] * dst[3]);
4767}
4768
4769STAGE_TAIL(dot_3_floats, F* dst) {
4770 dst[0] = mad(dst[0], dst[3],
4771 mad(dst[1], dst[4],
4772 dst[2] * dst[5]));
4773}
4774
4775STAGE_TAIL(dot_4_floats, F* dst) {
4776 dst[0] = mad(dst[0], dst[4],
4777 mad(dst[1], dst[5],
4778 mad(dst[2], dst[6],
4779 dst[3] * dst[7])));
4780}
4781
4782// MxM, VxM and MxV multiplication all use matrix_multiply. Vectors are treated like a matrix with a
4783// single column or row.
4784template <int N>
4785SI void matrix_multiply(SkRasterPipeline_MatrixMultiplyCtx* packed, std::byte* base) {
4786 auto ctx = SkRPCtxUtils::Unpack(packed);
4787
4788 int outColumns = ctx.rightColumns,
4789 outRows = ctx.leftRows;
4790
4791 SkASSERT(outColumns >= 1);
4792 SkASSERT(outRows >= 1);
4793 SkASSERT(outColumns <= 4);
4794 SkASSERT(outRows <= 4);
4795
4796 SkASSERT(ctx.leftColumns == ctx.rightRows);
4797 SkASSERT(N == ctx.leftColumns); // N should match the result width
4798
4799#if !defined(JUMPER_IS_SCALAR)
4800 // This prevents Clang from generating early-out checks for zero-sized matrices.
4801 SK_ASSUME(outColumns >= 1);
4802 SK_ASSUME(outRows >= 1);
4803 SK_ASSUME(outColumns <= 4);
4804 SK_ASSUME(outRows <= 4);
4805#endif
4806
4807 // Get pointers to the adjacent left- and right-matrices.
4808 F* resultMtx = (F*)(base + ctx.dst);
4809 F* leftMtx = &resultMtx[ctx.rightColumns * ctx.leftRows];
4810 F* rightMtx = &leftMtx[N * ctx.leftRows];
4811
4812 // Emit each matrix element.
4813 for (int c = 0; c < outColumns; ++c) {
4814 for (int r = 0; r < outRows; ++r) {
4815 // Dot a vector from leftMtx[*][r] with rightMtx[c][*].
4816 F* leftRow = &leftMtx [r];
4817 F* rightColumn = &rightMtx[c * N];
4818
4819 F element = *leftRow * *rightColumn;
4820 for (int idx = 1; idx < N; ++idx) {
4821 leftRow += outRows;
4822 rightColumn += 1;
4823 element = mad(*leftRow, *rightColumn, element);
4824 }
4825
4826 *resultMtx++ = element;
4827 }
4828 }
4829}
4830
4832 matrix_multiply<2>(packed, base);
4833}
4834
4836 matrix_multiply<3>(packed, base);
4837}
4838
4840 matrix_multiply<4>(packed, base);
4841}
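// For illustration only: a scalar, column-major reference of the traversal above, with plain
// floats in place of F lanes. The helper name is hypothetical; it mirrors matrix_multiply<N>'s
// loop structure but is not part of the pipeline.
static inline void matrix_multiply_reference(const float* left,   // leftRows x n, column-major
                                             const float* right,  // n x rightColumns, column-major
                                             float* result,       // leftRows x rightColumns
                                             int leftRows, int n, int rightColumns) {
    for (int c = 0; c < rightColumns; ++c) {
        for (int r = 0; r < leftRows; ++r) {
            float element = 0.0f;
            for (int idx = 0; idx < n; ++idx) {
                // left(r, idx) lives at column idx * leftRows + r; right(idx, c) at c * n + idx.
                element += left[idx * leftRows + r] * right[c * n + idx];
            }
            *result++ = element;  // results are emitted column by column, as in the stage above
        }
    }
}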
4842
4843// Refract always operates on 4-wide incident and normal vectors; for narrower inputs, the code
4844// generator fills in the input columns with zero, and discards the extra output columns.
4845STAGE_TAIL(refract_4_floats, F* dst) {
4846 // Algorithm adapted from https://registry.khronos.org/OpenGL-Refpages/gl4/html/refract.xhtml
4847 F *incident = dst + 0;
4848 F *normal = dst + 4;
4849 F eta = dst[8];
4850
4851 F dotNI = mad(normal[0], incident[0],
4852 mad(normal[1], incident[1],
4853 mad(normal[2], incident[2],
4854 normal[3] * incident[3])));
4855
4856 F k = 1.0 - eta * eta * (1.0 - dotNI * dotNI);
4857 F sqrt_k = sqrt_(k);
4858
4859 for (int idx = 0; idx < 4; ++idx) {
4860 dst[idx] = if_then_else(k >= 0,
4861 eta * incident[idx] - (eta * dotNI + sqrt_k) * normal[idx],
4862 0.0);
4863 }
4864}
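// For illustration only: the scalar form of the GLSL refract() formula implemented above, for a
// single lane (assumes sqrtf from <math.h>). The helper name is hypothetical; the stage applies
// the same math to all four components across every SIMD lane at once.
static inline void refract_reference(const float incident[4], const float normal[4], float eta,
                                     float out[4]) {
    float dotNI = normal[0] * incident[0] + normal[1] * incident[1] +
                  normal[2] * incident[2] + normal[3] * incident[3];
    float k = 1.0f - eta * eta * (1.0f - dotNI * dotNI);
    for (int i = 0; i < 4; ++i) {
        // k < 0 indicates total internal reflection, which produces a zero vector.
        out[i] = (k >= 0.0f) ? eta * incident[i] - (eta * dotNI + sqrtf(k)) * normal[i] : 0.0f;
    }
}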
4865
4866// Ternary operations work like binary ops (see immediately above) but take two source inputs.
4867template <typename T, void (*ApplyFn)(T*, T*, T*)>
4868SI void apply_adjacent_ternary(T* dst, T* src0, T* src1) {
4869 int count = src0 - dst;
4870#if !defined(JUMPER_IS_SCALAR)
4871 SK_ASSUME(count >= 1);
4872#endif
4873
4874 for (int index = 0; index < count; ++index) {
4875 ApplyFn(dst, src0, src1);
4876 dst += 1;
4877 src0 += 1;
4878 src1 += 1;
4879 }
4880}
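// Worked example of the pointer arithmetic above: the `_2_floats` flavors declared below call
// apply_adjacent_ternary(p, p+2, p+4), so count = src0 - dst = 2 (a pointer difference measured
// in units of T). The loop then applies ApplyFn to (p[0], p[2], p[4]) and (p[1], p[3], p[5]):
// the destination slots followed by two equally sized groups of source slots.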
4881
4882template <typename T, void (*ApplyFn)(T*, T*, T*)>
4884 auto ctx = SkRPCtxUtils::Unpack(packed);
4885 std::byte* dst = base + ctx.dst;
4886 std::byte* src0 = dst + ctx.delta;
4887 std::byte* src1 = src0 + ctx.delta;
4888 apply_adjacent_ternary<T, ApplyFn>((T*)dst, (T*)src0, (T*)src1);
4889}
4890
4891SI void mix_fn(F* a, F* x, F* y) {
4892 // We reorder the arguments here to match lerp's GLSL-style order (interpolation point last).
4893 *a = lerp(*x, *y, *a);
4894}
4895
4896SI void mix_fn(I32* a, I32* x, I32* y) {
4897 // We reorder the arguments here to match if_then_else's expected order (y before x).
4898 *a = if_then_else(*a, *y, *x);
4899}
4900
4901SI void smoothstep_fn(F* edge0, F* edge1, F* x) {
4902 F t = clamp_01_((*x - *edge0) / (*edge1 - *edge0));
4903 *edge0 = t * t * (3.0 - 2.0 * t);
4904}
4905
4906#define DECLARE_N_WAY_TERNARY_FLOAT(name) \
4907 STAGE_TAIL(name##_n_floats, SkRasterPipeline_TernaryOpCtx* packed) { \
4908 apply_adjacent_ternary_packed<F, &name##_fn>(packed, base); \
4909 }
4910
4911#define DECLARE_TERNARY_FLOAT(name) \
4912 STAGE_TAIL(name##_float, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+1, p+2); } \
4913 STAGE_TAIL(name##_2_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+2, p+4); } \
4914 STAGE_TAIL(name##_3_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+3, p+6); } \
4915 STAGE_TAIL(name##_4_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+4, p+8); } \
4916 DECLARE_N_WAY_TERNARY_FLOAT(name)
4917
4918#define DECLARE_TERNARY_INT(name) \
4919 STAGE_TAIL(name##_int, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+1, p+2); } \
4920 STAGE_TAIL(name##_2_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+2, p+4); } \
4921 STAGE_TAIL(name##_3_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+3, p+6); } \
4922 STAGE_TAIL(name##_4_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+4, p+8); } \
4923 STAGE_TAIL(name##_n_ints, SkRasterPipeline_TernaryOpCtx* packed) { \
4924 apply_adjacent_ternary_packed<I32, &name##_fn>(packed, base); \
4925 }
4926
4930
4931#undef DECLARE_N_WAY_TERNARY_FLOAT
4932#undef DECLARE_TERNARY_FLOAT
4933#undef DECLARE_TERNARY_INT
4934
4935STAGE(gauss_a_to_rgba, NoCtx) {
4936 // x = 1 - x;
4937 // exp(-x * x * 4) - 0.018f;
4938 // ... now approximate with quartic
4939 //
4940 const float c4 = -2.26661229133605957031f;
4941 const float c3 = 2.89795351028442382812f;
4942 const float c2 = 0.21345567703247070312f;
4943 const float c1 = 0.15489584207534790039f;
4944 const float c0 = 0.00030726194381713867f;
4945 a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0);
4946 r = a;
4947 g = a;
4948 b = a;
4949}
4950
4951// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
4952STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
4953 // (cx,cy) are the center of our sample.
4954 F cx = r,
4955 cy = g;
4956
4957 // All sample points are at the same fractional offset (fx,fy).
4958 // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
4959 F fx = fract(cx + 0.5f),
4960 fy = fract(cy + 0.5f);
4961
4962 // We'll accumulate the color of all four samples into {r,g,b,a} directly.
4963 r = g = b = a = F0;
4964
4965 for (float py = -0.5f; py <= +0.5f; py += 1.0f)
4966 for (float px = -0.5f; px <= +0.5f; px += 1.0f) {
4967 // (x,y) are the coordinates of this sample point.
4968 F x = cx + px,
4969 y = cy + py;
4970
4971 // ix_and_ptr() will clamp to the image's bounds for us.
4972 const uint32_t* ptr;
4973 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
4974
4975 F sr,sg,sb,sa;
4976 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
4977
4978 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
4979 // are combined in direct proportion to their area overlapping that logical query pixel.
4980 // At positive offsets, the x-axis contribution to that rectangle is fx,
4981 // or (1-fx) at negative x. Same deal for y.
4982 F sx = (px > 0) ? fx : 1.0f - fx,
4983 sy = (py > 0) ? fy : 1.0f - fy,
4984 area = sx * sy;
4985
4986 r += sr * area;
4987 g += sg * area;
4988 b += sb * area;
4989 a += sa * area;
4990 }
4991}
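// Worked example of the area weights above (fx and fy chosen for illustration): with fx = 0.25
// and fy = 0.75, the four samples receive
//   (-0.5,-0.5): (1-fx)*(1-fy) = 0.75*0.25 = 0.1875
//   (+0.5,-0.5):    fx *(1-fy) = 0.25*0.25 = 0.0625
//   (-0.5,+0.5): (1-fx)*  fy   = 0.75*0.75 = 0.5625
//   (+0.5,+0.5):    fx *  fy   = 0.25*0.75 = 0.1875
// which sum to 1.0, so the accumulated color is a properly normalized weighted average.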
4992
4993// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
4994STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
4995 // (cx,cy) are the center of our sample.
4996 F cx = r,
4997 cy = g;
4998
4999 // All sample points are at the same fractional offset (fx,fy).
5000 // Here they're the 16 points of a 4x4 grid surrounding (x,y), at offsets from -1.5 to +1.5.
5001 F fx = fract(cx + 0.5f),
5002 fy = fract(cy + 0.5f);
5003
5004 // We'll accumulate the color of all sixteen samples into {r,g,b,a} directly.
5005 r = g = b = a = F0;
5006
5007 const float* w = ctx->weights;
5008 const F scaley[4] = {bicubic_wts(fy, w[0], w[4], w[ 8], w[12]),
5009 bicubic_wts(fy, w[1], w[5], w[ 9], w[13]),
5010 bicubic_wts(fy, w[2], w[6], w[10], w[14]),
5011 bicubic_wts(fy, w[3], w[7], w[11], w[15])};
5012 const F scalex[4] = {bicubic_wts(fx, w[0], w[4], w[ 8], w[12]),
5013 bicubic_wts(fx, w[1], w[5], w[ 9], w[13]),
5014 bicubic_wts(fx, w[2], w[6], w[10], w[14]),
5015 bicubic_wts(fx, w[3], w[7], w[11], w[15])};
5016
5017 F sample_y = cy - 1.5f;
5018 for (int yy = 0; yy <= 3; ++yy) {
5019 F sample_x = cx - 1.5f;
5020 for (int xx = 0; xx <= 3; ++xx) {
5021 F scale = scalex[xx] * scaley[yy];
5022
5023 // ix_and_ptr() will clamp to the image's bounds for us.
5024 const uint32_t* ptr;
5025 U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y);
5026
5027 F sr,sg,sb,sa;
5028 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
5029
5030 r = mad(scale, sr, r);
5031 g = mad(scale, sg, g);
5032 b = mad(scale, sb, b);
5033 a = mad(scale, sa, a);
5034
5035 sample_x += 1;
5036 }
5037 sample_y += 1;
5038 }
5039}
5040
5041// ~~~~~~ skgpu::Swizzle stage ~~~~~~ //
5042
5043STAGE(swizzle, void* ctx) {
5044 auto ir = r, ig = g, ib = b, ia = a;
5045 F* o[] = {&r, &g, &b, &a};
5046 char swiz[4];
5047 memcpy(swiz, &ctx, sizeof(swiz));
5048
5049 for (int i = 0; i < 4; ++i) {
5050 switch (swiz[i]) {
5051 case 'r': *o[i] = ir; break;
5052 case 'g': *o[i] = ig; break;
5053 case 'b': *o[i] = ib; break;
5054 case 'a': *o[i] = ia; break;
5055 case '0': *o[i] = F0; break;
5056 case '1': *o[i] = F1; break;
5057 default: break;
5058 }
5059 }
5060}
5061
5062namespace lowp {
5063#if defined(JUMPER_IS_SCALAR) || defined(SK_ENABLE_OPTIMIZE_SIZE) || \
5064 defined(SK_BUILD_FOR_GOOGLE3) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
5065 // We don't bother generating the lowp stages if we are:
5066 // - ... in scalar mode (MSVC, old clang, etc...)
5067 // - ... trying to save code size
5068 // - ... building for Google3. (No justification for this, but changing it would be painful).
5069 // - ... explicitly disabling it. This is currently just used by Flutter.
5070 //
5071 // Having nullptr for every stage will cause SkRasterPipeline to always use the highp stages.
5072 #define M(st) static void (*st)(void) = nullptr;
5074 #undef M
5075 static void (*just_return)(void) = nullptr;
5076
5077 static void start_pipeline(size_t,size_t,size_t,size_t, SkRasterPipelineStage*,
5079 uint8_t* tailPointer) {}
5080
5081#else // We are compiling vector code with Clang... let's make some lowp stages!
5082
5083#if defined(JUMPER_IS_SKX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_LASX)
5084 template <typename T> using V = Vec<16, T>;
5085#else
5086 template <typename T> using V = Vec<8, T>;
5087#endif
5088
5089using U8 = V<uint8_t >;
5090using U16 = V<uint16_t>;
5091using I16 = V< int16_t>;
5092using I32 = V< int32_t>;
5093using U32 = V<uint32_t>;
5094using I64 = V< int64_t>;
5095using U64 = V<uint64_t>;
5096using F = V<float >;
5097
5098static constexpr size_t N = sizeof(U16) / sizeof(uint16_t);
5099
5100// Promotion helpers (for GCC)
5101#if defined(__clang__)
5102SI constexpr U16 U16_(uint16_t x) { return x; }
5103SI constexpr I32 I32_( int32_t x) { return x; }
5104SI constexpr U32 U32_(uint32_t x) { return x; }
5105SI constexpr F F_ (float x) { return x; }
5106#else
5107SI constexpr U16 U16_(uint16_t x) { return x + U16(); }
5108SI constexpr I32 I32_( int32_t x) { return x + I32(); }
5109SI constexpr U32 U32_(uint32_t x) { return x + U32(); }
5110SI constexpr F F_ (float x) { return x - F (); }
5111#endif
5112
5113static constexpr U16 U16_0 = U16_(0),
5114 U16_255 = U16_(255);
5115
5116// Once again, some platforms benefit from a restricted Stage calling convention,
5117// but others can pass tons and tons of registers and we're happy to exploit that.
5118// It's exactly the same decision and implementation strategy as the F stages above.
5119#if JUMPER_NARROW_STAGES
5120 struct Params {
5121 size_t dx, dy;
5122 U16 dr,dg,db,da;
5123 };
5124 using Stage = void (ABI*)(Params*, SkRasterPipelineStage* program, U16 r, U16 g, U16 b, U16 a);
5125#else
5126 using Stage = void (ABI*)(SkRasterPipelineStage* program,
5127 size_t dx, size_t dy,
5128 U16 r, U16 g, U16 b, U16 a,
5129 U16 dr, U16 dg, U16 db, U16 da);
5130#endif
5131
5132static void start_pipeline(size_t x0, size_t y0,
5133 size_t xlimit, size_t ylimit,
5134 SkRasterPipelineStage* program,
5136 uint8_t* tailPointer) {
5137 uint8_t unreferencedTail;
5138 if (!tailPointer) {
5139 tailPointer = &unreferencedTail;
5140 }
5141 auto start = (Stage)program->fn;
5142 for (size_t dy = y0; dy < ylimit; dy++) {
5143 #if JUMPER_NARROW_STAGES
5144 Params params = { x0,dy, U16_0,U16_0,U16_0,U16_0 };
5145 for (; params.dx + N <= xlimit; params.dx += N) {
5146 start(&params, program, U16_0,U16_0,U16_0,U16_0);
5147 }
5148 if (size_t tail = xlimit - params.dx) {
5149 *tailPointer = tail;
5150 patch_memory_contexts(memoryCtxPatches, params.dx, dy, tail);
5151 start(&params, program, U16_0,U16_0,U16_0,U16_0);
5152 restore_memory_contexts(memoryCtxPatches, params.dx, dy, tail);
5153 *tailPointer = 0xFF;
5154 }
5155 #else
5156 size_t dx = x0;
5157 for (; dx + N <= xlimit; dx += N) {
5158 start(program, dx,dy, U16_0,U16_0,U16_0,U16_0, U16_0,U16_0,U16_0,U16_0);
5159 }
5160 if (size_t tail = xlimit - dx) {
5161 *tailPointer = tail;
5162 patch_memory_contexts(memoryCtxPatches, dx, dy, tail);
5163 start(program, dx,dy, U16_0,U16_0,U16_0,U16_0, U16_0,U16_0,U16_0,U16_0);
5164 restore_memory_contexts(memoryCtxPatches, dx, dy, tail);
5165 *tailPointer = 0xFF;
5166 }
5167 #endif
5168 }
5169}
5170
5171#if JUMPER_NARROW_STAGES
5172 static void ABI just_return(Params*, SkRasterPipelineStage*, U16,U16,U16,U16) {}
5173#else
5174 static void ABI just_return(SkRasterPipelineStage*, size_t,size_t,
5175 U16,U16,U16,U16, U16,U16,U16,U16) {}
5176#endif
5177
5178// All stages use the same function call ABI to chain into each other, but there are three types:
5179// GG: geometry in, geometry out -- think, a matrix
5180// GP: geometry in, pixels out. -- think, a memory gather
5181// PP: pixels in, pixels out. -- think, a blend mode
5182//
5183// (Some stages ignore their inputs or produce no logical output. That's perfectly fine.)
5184//
5185// These three STAGE_ macros let you define each type of stage,
5186// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
5187
5188#if JUMPER_NARROW_STAGES
5189 #define STAGE_GG(name, ARG) \
5190 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y); \
5191 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5192 U16 r, U16 g, U16 b, U16 a) { \
5193 auto x = join<F>(r,g), \
5194 y = join<F>(b,a); \
5195 name##_k(Ctx{program}, params->dx,params->dy, x,y); \
5196 split(x, &r,&g); \
5197 split(y, &b,&a); \
5198 auto fn = (Stage)(++program)->fn; \
5199 fn(params, program, r,g,b,a); \
5200 } \
5201 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y)
5202
5203 #define STAGE_GP(name, ARG) \
5204 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5205 U16& r, U16& g, U16& b, U16& a, \
5206 U16& dr, U16& dg, U16& db, U16& da); \
5207 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5208 U16 r, U16 g, U16 b, U16 a) { \
5209 auto x = join<F>(r,g), \
5210 y = join<F>(b,a); \
5211 name##_k(Ctx{program}, params->dx,params->dy, x,y, r,g,b,a, \
5212 params->dr,params->dg,params->db,params->da); \
5213 auto fn = (Stage)(++program)->fn; \
5214 fn(params, program, r,g,b,a); \
5215 } \
5216 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5217 U16& r, U16& g, U16& b, U16& a, \
5218 U16& dr, U16& dg, U16& db, U16& da)
5219
5220 #define STAGE_PP(name, ARG) \
5221 SI void name##_k(ARG, size_t dx, size_t dy, \
5222 U16& r, U16& g, U16& b, U16& a, \
5223 U16& dr, U16& dg, U16& db, U16& da); \
5224 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5225 U16 r, U16 g, U16 b, U16 a) { \
5226 name##_k(Ctx{program}, params->dx,params->dy, r,g,b,a, \
5227 params->dr,params->dg,params->db,params->da); \
5228 auto fn = (Stage)(++program)->fn; \
5229 fn(params, program, r,g,b,a); \
5230 } \
5231 SI void name##_k(ARG, size_t dx, size_t dy, \
5232 U16& r, U16& g, U16& b, U16& a, \
5233 U16& dr, U16& dg, U16& db, U16& da)
5234#else
5235 #define STAGE_GG(name, ARG) \
5236 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y); \
5237 static void ABI name(SkRasterPipelineStage* program, \
5238 size_t dx, size_t dy, \
5239 U16 r, U16 g, U16 b, U16 a, \
5240 U16 dr, U16 dg, U16 db, U16 da) { \
5241 auto x = join<F>(r,g), \
5242 y = join<F>(b,a); \
5243 name##_k(Ctx{program}, dx,dy, x,y); \
5244 split(x, &r,&g); \
5245 split(y, &b,&a); \
5246 auto fn = (Stage)(++program)->fn; \
5247 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5248 } \
5249 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y)
5250
5251 #define STAGE_GP(name, ARG) \
5252 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5253 U16& r, U16& g, U16& b, U16& a, \
5254 U16& dr, U16& dg, U16& db, U16& da); \
5255 static void ABI name(SkRasterPipelineStage* program, \
5256 size_t dx, size_t dy, \
5257 U16 r, U16 g, U16 b, U16 a, \
5258 U16 dr, U16 dg, U16 db, U16 da) { \
5259 auto x = join<F>(r,g), \
5260 y = join<F>(b,a); \
5261 name##_k(Ctx{program}, dx,dy, x,y, r,g,b,a, dr,dg,db,da); \
5262 auto fn = (Stage)(++program)->fn; \
5263 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5264 } \
5265 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5266 U16& r, U16& g, U16& b, U16& a, \
5267 U16& dr, U16& dg, U16& db, U16& da)
5268
5269 #define STAGE_PP(name, ARG) \
5270 SI void name##_k(ARG, size_t dx, size_t dy, \
5271 U16& r, U16& g, U16& b, U16& a, \
5272 U16& dr, U16& dg, U16& db, U16& da); \
5273 static void ABI name(SkRasterPipelineStage* program, \
5274 size_t dx, size_t dy, \
5275 U16 r, U16 g, U16 b, U16 a, \
5276 U16 dr, U16 dg, U16 db, U16 da) { \
5277 name##_k(Ctx{program}, dx,dy, r,g,b,a, dr,dg,db,da); \
5278 auto fn = (Stage)(++program)->fn; \
5279 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5280 } \
5281 SI void name##_k(ARG, size_t dx, size_t dy, \
5282 U16& r, U16& g, U16& b, U16& a, \
5283 U16& dr, U16& dg, U16& db, U16& da)
5284#endif
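// For illustration only: a hypothetical PP stage written against the macros above (it is not one
// of the pipeline's real stages, and is sketched as a comment because the helpers it uses, such
// as div255, are defined further below). STAGE_PP supplies the chaining boilerplate (unpacking
// the context, calling the _k body, then tail-calling the next stage), so the body only touches
// the pixel registers it cares about:
//
//     STAGE_PP(example_half_alpha, NoCtx) {   // hypothetical stage name
//         a = div255(a * U16_(128));          // scale alpha by 128/255
//     }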
5285
5286// ~~~~~~ Commonly used helper functions ~~~~~~ //
5287
5288/**
5289 * Helpers to do properly rounded division (by 255). The ideal answer we want to compute is slow,
5290 * thanks to a division by a non-power of two:
5291 * [1] (v + 127) / 255
5292 *
5293 * There is a two-step process that computes the correct answer for all inputs:
5294 * [2] (v + 128 + ((v + 128) >> 8)) >> 8
5295 *
5296 * There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time:
5297 * [3] (v + 255) >> 8;
5298 *
5299 * We offer two different implementations here, depending on the requirements of the calling stage.
5300 */
5301
5302/**
5303 * div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast
5304 * as [3]), and uses [3] elsewhere.
5305 */
5306SI U16 div255(U16 v) {
5307#if defined(JUMPER_IS_NEON)
5308 // With NEON we can compute [2] just as fast as [3], so let's be correct.
5309 // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up:
5310 return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
5311#else
5312 // Otherwise, use [3], which is never wrong by more than 1:
5313 return (v+255)/256;
5314#endif
5315}
5316
5317/**
5318 * div255_accurate guarantees the right answer on all platforms, at the expense of performance.
5319 */
5320SI U16 div255_accurate(U16 v) {
5321#if defined(JUMPER_IS_NEON)
5322 // Our NEON implementation of div255 is already correct for all inputs:
5323 return div255(v);
5324#else
5325 // This is [2] (the same formulation as NEON), but written without the benefit of intrinsics:
5326 v += 128;
5327 return (v+(v/256))/256;
5328#endif
5329}
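// For illustration only: a scalar, exhaustive check of the formulas documented above. The helper
// name is hypothetical and not part of the pipeline; since the product of two 8-bit values never
// exceeds 255*255, the whole input range can be verified directly.
static inline bool check_div255_formulas() {
    for (uint32_t v = 0; v <= 255u * 255u; ++v) {
        uint32_t exact  = (v + 127) / 255;                    // formula [1]
        uint32_t two    = (v + 128 + ((v + 128) >> 8)) >> 8;  // formula [2]: always exact
        uint32_t approx = (v + 255) >> 8;                     // formula [3]: off by at most 1
        uint32_t diff   = approx > exact ? approx - exact : exact - approx;
        if (two != exact || diff > 1) {
            return false;
        }
    }
    return true;
}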
5330
5331SI U16 inv(U16 v) { return 255-v; }
5332
5333SI U16 if_then_else(I16 c, U16 t, U16 e) {
5334 return (t & sk_bit_cast<U16>(c)) | (e & sk_bit_cast<U16>(~c));
5335}
5336SI U32 if_then_else(I32 c, U32 t, U32 e) {
5337 return (t & sk_bit_cast<U32>(c)) | (e & sk_bit_cast<U32>(~c));
5338}
5339
5340SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
5341SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
5342
5343SI U16 max(U16 a, uint16_t b) { return max( a , U16_(b)); }
5344SI U16 max(uint16_t a, U16 b) { return max(U16_(a), b ); }
5345SI U16 min(U16 a, uint16_t b) { return min( a , U16_(b)); }
5346SI U16 min(uint16_t a, U16 b) { return min(U16_(a), b ); }
5347
5348SI U16 from_float(float f) { return U16_(f * 255.0f + 0.5f); }
5349
5350SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
5351
5352template <typename D, typename S>
5353SI D cast(S src) {
5354 return __builtin_convertvector(src, D);
5355}
5356
5357template <typename D, typename S>
5358SI void split(S v, D* lo, D* hi) {
5359 static_assert(2*sizeof(D) == sizeof(S), "");
5360 memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
5361 memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
5362}
5363template <typename D, typename S>
5364SI D join(S lo, S hi) {
5365 static_assert(sizeof(D) == 2*sizeof(S), "");
5366 D v;
5367 memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
5368 memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
5369 return v;
5370}
5371
5372SI F if_then_else(I32 c, F t, F e) {
5373 return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) );
5374}
5375SI F if_then_else(I32 c, F t, float e) { return if_then_else(c, t , F_(e)); }
5376SI F if_then_else(I32 c, float t, F e) { return if_then_else(c, F_(t), e ); }
5377
5378SI F max(F x, F y) { return if_then_else(x < y, y, x); }
5379SI F min(F x, F y) { return if_then_else(x < y, x, y); }
5380
5381SI F max(F a, float b) { return max( a , F_(b)); }
5382SI F max(float a, F b) { return max(F_(a), b ); }
5383SI F min(F a, float b) { return min( a , F_(b)); }
5384SI F min(float a, F b) { return min(F_(a), b ); }
5385
5386SI I32 if_then_else(I32 c, I32 t, I32 e) {
5387 return (t & c) | (e & ~c);
5388}
5389SI I32 max(I32 x, I32 y) { return if_then_else(x < y, y, x); }
5390SI I32 min(I32 x, I32 y) { return if_then_else(x < y, x, y); }
5391
5392SI I32 max(I32 a, int32_t b) { return max( a , I32_(b)); }
5393SI I32 max(int32_t a, I32 b) { return max(I32_(a), b ); }
5394SI I32 min(I32 a, int32_t b) { return min( a , I32_(b)); }
5395SI I32 min(int32_t a, I32 b) { return min(I32_(a), b ); }
5396
5397SI F mad(F f, F m, F a) { return a+f*m; }
5398SI F mad(F f, F m, float a) { return mad( f , m , F_(a)); }
5399SI F mad(F f, float m, F a) { return mad( f , F_(m), a ); }
5400SI F mad(F f, float m, float a) { return mad( f , F_(m), F_(a)); }
5401SI F mad(float f, F m, F a) { return mad(F_(f), m , a ); }
5402SI F mad(float f, F m, float a) { return mad(F_(f), m , F_(a)); }
5403SI F mad(float f, float m, F a) { return mad(F_(f), F_(m), a ); }
5404
5405SI F nmad(F f, F m, F a) { return a-f*m; }
5406SI F nmad(F f, F m, float a) { return nmad( f , m , F_(a)); }
5407SI F nmad(F f, float m, F a) { return nmad( f , F_(m), a ); }
5408SI F nmad(F f, float m, float a) { return nmad( f , F_(m), F_(a)); }
5409SI F nmad(float f, F m, F a) { return nmad(F_(f), m , a ); }
5410SI F nmad(float f, F m, float a) { return nmad(F_(f), m , F_(a)); }
5411SI F nmad(float f, float m, F a) { return nmad(F_(f), F_(m), a ); }
5412
5413SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
5414
5415// Use approximate instructions and one Newton-Raphson step to calculate 1/x.
5416SI F rcp_precise(F x) {
5417#if defined(JUMPER_IS_SKX)
5418 F e = _mm512_rcp14_ps(x);
5419 return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
5420#elif defined(JUMPER_IS_HSW)
5421 __m256 lo,hi;
5422 split(x, &lo,&hi);
5423 return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
5424#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
5425 __m128 lo,hi;
5426 split(x, &lo,&hi);
5427 return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
5428#elif defined(JUMPER_IS_NEON)
5429 float32x4_t lo,hi;
5430 split(x, &lo,&hi);
5431 return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
5432#elif defined(JUMPER_IS_LASX)
5433 __m256 lo,hi;
5434 split(x, &lo,&hi);
5435 return join<F>(__lasx_xvfrecip_s(lo), __lasx_xvfrecip_s(hi));
5436#elif defined(JUMPER_IS_LSX)
5437 __m128 lo,hi;
5438 split(x, &lo,&hi);
5439 return join<F>(__lsx_vfrecip_s(lo), __lsx_vfrecip_s(hi));
5440#else
5441 return 1.0f / x;
5442#endif
5443}
5444SI F sqrt_(F x) {
5445#if defined(JUMPER_IS_SKX)
5446 return _mm512_sqrt_ps(x);
5447#elif defined(JUMPER_IS_HSW)
5448 __m256 lo,hi;
5449 split(x, &lo,&hi);
5450 return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
5451#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
5452 __m128 lo,hi;
5453 split(x, &lo,&hi);
5454 return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
5455#elif defined(SK_CPU_ARM64)
5456 float32x4_t lo,hi;
5457 split(x, &lo,&hi);
5458 return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
5459#elif defined(JUMPER_IS_NEON)
5460 auto sqrt = [](float32x4_t v) {
5461 auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
5462 est *= vrsqrtsq_f32(v,est*est);
5463 est *= vrsqrtsq_f32(v,est*est);
5464 return v*est; // sqrt(v) == v*rsqrt(v).
5465 };
5466 float32x4_t lo,hi;
5467 split(x, &lo,&hi);
5468 return join<F>(sqrt(lo), sqrt(hi));
5469#elif defined(JUMPER_IS_LASX)
5470 __m256 lo,hi;
5471 split(x, &lo,&hi);
5472 return join<F>(__lasx_xvfsqrt_s(lo), __lasx_xvfsqrt_s(hi));
5473#elif defined(JUMPER_IS_LSX)
5474 __m128 lo,hi;
5475 split(x, &lo,&hi);
5476 return join<F>(__lsx_vfsqrt_s(lo), __lsx_vfsqrt_s(hi));
5477#else
5478 return F{
5479 sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
5480 sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
5481 };
5482#endif
5483}
5484
5485SI F floor_(F x) {
5486#if defined(SK_CPU_ARM64)
5487 float32x4_t lo,hi;
5488 split(x, &lo,&hi);
5489 return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
5490#elif defined(JUMPER_IS_SKX)
5491 return _mm512_floor_ps(x);
5492#elif defined(JUMPER_IS_HSW)
5493 __m256 lo,hi;
5494 split(x, &lo,&hi);
5495 return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
5496#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
5497 __m128 lo,hi;
5498 split(x, &lo,&hi);
5499 return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
5500#elif defined(JUMPER_IS_LASX)
5501 __m256 lo,hi;
5502 split(x, &lo,&hi);
5503 return join<F>(__lasx_xvfrintrm_s(lo), __lasx_xvfrintrm_s(hi));
5504#elif defined(JUMPER_IS_LSX)
5505 __m128 lo,hi;
5506 split(x, &lo,&hi);
5507 return join<F>(__lsx_vfrintrm_s(lo), __lsx_vfrintrm_s(hi));
5508#else
5509 F roundtrip = cast<F>(cast<I32>(x));
5510 return roundtrip - if_then_else(roundtrip > x, F_(1), F_(0));
5511#endif
5512}
5513
5514// scaled_mult interprets a and b as Q15 fixed-point numbers on [-1, 1). Functionally
5515// this multiply is:
5516// (2 * a * b + (1 << 15)) >> 16
5517// The result is a number on [-1, 1).
5518// Note: on neon this is a saturating multiply while the others are not.
5519SI I16 scaled_mult(I16 a, I16 b) {
5520#if defined(JUMPER_IS_SKX)
5521 return (I16)_mm256_mulhrs_epi16((__m256i)a, (__m256i)b);
5522#elif defined(JUMPER_IS_HSW)
5523 return (I16)_mm256_mulhrs_epi16((__m256i)a, (__m256i)b);
5524#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
5525 return (I16)_mm_mulhrs_epi16((__m128i)a, (__m128i)b);
5526#elif defined(SK_CPU_ARM64)
5527 return vqrdmulhq_s16(a, b);
5528#elif defined(JUMPER_IS_NEON)
5529 return vqrdmulhq_s16(a, b);
5530#elif defined(JUMPER_IS_LASX)
5531 I16 res = __lasx_xvmuh_h(a, b);
5532 return __lasx_xvslli_h(res, 1);
5533#elif defined(JUMPER_IS_LSX)
5534 I16 res = __lsx_vmuh_h(a, b);
5535 return __lsx_vslli_h(res, 1);
5536#else
5537 const I32 roundingTerm = I32_(1 << 14);
5538 return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15);
5539#endif
5540}
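// Worked Q15 example of the multiply above: 0.5 is represented as 0x4000 (16384). With
// a = b = 16384, the fallback formula gives (16384*16384 + (1 << 14)) >> 15 = 8192 = 0x2000,
// which is 0.25 in Q15, exactly 0.5 * 0.5. The intrinsic paths compute the same rounded
// product, with NEON additionally saturating at the ends of the range.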
5541
5542// This sum is to support lerp where the result will always be a positive number. In general,
5543// a sum like this would require an additional bit, but because we know the range of the result
5544// we know that the extra bit will always be zero.
5546 #if defined(SK_DEBUG)
5547 for (size_t i = 0; i < N; i++) {
5548 // Ensure that a + b is on the interval [0, UINT16_MAX]
5549 int ia = a[i],
5550 ib = b[i];
5551 // Use 65535 here because fuchsia's compiler evaluates UINT16_MAX - ib, which is
5552 // 65536U - ib, as a uint32_t instead of an int32_t. This was forcing ia to be
5553 // interpreted as a uint32_t.
5554 SkASSERT(-ib <= ia && ia <= 65535 - ib);
5555 }
5556 #endif
5557 return b + sk_bit_cast<U16>(a);
5558}
5559
5560SI F fract(F x) { return x - floor_(x); }
5561SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); }
5562
5563// ~~~~~~ Basic / misc. stages ~~~~~~ //
5564
5565STAGE_GG(seed_shader, NoCtx) {
5566 static constexpr float iota[] = {
5567 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
5568 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
5569 };
5570 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride);
5571
5572 x = cast<F>(I32_(dx)) + sk_unaligned_load<F>(iota);
5573 y = cast<F>(I32_(dy)) + 0.5f;
5574}
5575
5576STAGE_GG(matrix_translate, const float* m) {
5577 x += m[0];
5578 y += m[1];
5579}
5580STAGE_GG(matrix_scale_translate, const float* m) {
5581 x = mad(x,m[0], m[2]);
5582 y = mad(y,m[1], m[3]);
5583}
5584STAGE_GG(matrix_2x3, const float* m) {
5585 auto X = mad(x,m[0], mad(y,m[1], m[2])),
5586 Y = mad(x,m[3], mad(y,m[4], m[5]));
5587 x = X;
5588 y = Y;
5589}
5590STAGE_GG(matrix_perspective, const float* m) {
5591 // N.B. Unlike the other matrix_ stages, this matrix is row-major.
5592 auto X = mad(x,m[0], mad(y,m[1], m[2])),
5593 Y = mad(x,m[3], mad(y,m[4], m[5])),
5594 Z = mad(x,m[6], mad(y,m[7], m[8]));
5595 x = X * rcp_precise(Z);
5596 y = Y * rcp_precise(Z);
5597}
5598
5599STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
5600 r = U16_(c->rgba[0]);
5601 g = U16_(c->rgba[1]);
5602 b = U16_(c->rgba[2]);
5603 a = U16_(c->rgba[3]);
5604}
5605STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
5606 dr = U16_(c->rgba[0]);
5607 dg = U16_(c->rgba[1]);
5608 db = U16_(c->rgba[2]);
5609 da = U16_(c->rgba[3]);
5610}
5611STAGE_PP(black_color, NoCtx) { r = g = b = U16_0; a = U16_255; }
5612STAGE_PP(white_color, NoCtx) { r = g = b = U16_255; a = U16_255; }
5613
5614STAGE_PP(set_rgb, const float rgb[3]) {
5615 r = from_float(rgb[0]);
5616 g = from_float(rgb[1]);
5617 b = from_float(rgb[2]);
5618}
5619
5620// No need to clamp against 0 here (values are unsigned)
5621STAGE_PP(clamp_01, NoCtx) {
5622 r = min(r, 255);
5623 g = min(g, 255);
5624 b = min(b, 255);
5625 a = min(a, 255);
5626}
5627
5628STAGE_PP(clamp_gamut, NoCtx) {
5629 a = min(a, 255);
5630 r = min(r, a);
5631 g = min(g, a);
5632 b = min(b, a);
5633}
5634
5635STAGE_PP(premul, NoCtx) {
5636 r = div255_accurate(r * a);
5637 g = div255_accurate(g * a);
5638 b = div255_accurate(b * a);
5639}
5640STAGE_PP(premul_dst, NoCtx) {
5641 dr = div255_accurate(dr * da);
5642 dg = div255_accurate(dg * da);
5643 db = div255_accurate(db * da);
5644}
5645
5646STAGE_PP(force_opaque , NoCtx) { a = U16_255; }
5647STAGE_PP(force_opaque_dst, NoCtx) { da = U16_255; }
5648
5649STAGE_PP(swap_rb, NoCtx) {
5650 auto tmp = r;
5651 r = b;
5652 b = tmp;
5653}
5654STAGE_PP(swap_rb_dst, NoCtx) {
5655 auto tmp = dr;
5656 dr = db;
5657 db = tmp;
5658}
5659
5660STAGE_PP(move_src_dst, NoCtx) {
5661 dr = r;
5662 dg = g;
5663 db = b;
5664 da = a;
5665}
5666
5667STAGE_PP(move_dst_src, NoCtx) {
5668 r = dr;
5669 g = dg;
5670 b = db;
5671 a = da;
5672}
5673
5674STAGE_PP(swap_src_dst, NoCtx) {
5675 std::swap(r, dr);
5676 std::swap(g, dg);
5677 std::swap(b, db);
5678 std::swap(a, da);
5679}
5680
5681// ~~~~~~ Blend modes ~~~~~~ //
5682
5683// The same logic applied to all 4 channels.
5684#define BLEND_MODE(name) \
5685 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
5686 STAGE_PP(name, NoCtx) { \
5687 r = name##_channel(r,dr,a,da); \
5688 g = name##_channel(g,dg,a,da); \
5689 b = name##_channel(b,db,a,da); \
5690 a = name##_channel(a,da,a,da); \
5691 } \
5692 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
5693
5694#if defined(SK_USE_INACCURATE_DIV255_IN_BLEND)
5695 BLEND_MODE(clear) { return U16_0; }
5696 BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
5697 BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
5698 BLEND_MODE(srcin) { return div255( s*da ); }
5699 BLEND_MODE(dstin) { return div255( d*sa ); }
5700 BLEND_MODE(srcout) { return div255( s*inv(da) ); }
5701 BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
5702 BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
5703 BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
5704 BLEND_MODE(modulate) { return div255( s*d ); }
5705 BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
5706 BLEND_MODE(plus_) { return min(s+d, 255); }
5707 BLEND_MODE(screen) { return s + d - div255( s*d ); }
5708 BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
5709#else
5710 BLEND_MODE(clear) { return U16_0; }
5711 BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
5712 BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
5713 BLEND_MODE(srcin) { return div255_accurate( s*da ); }
5714 BLEND_MODE(dstin) { return div255_accurate( d*sa ); }
5715 BLEND_MODE(srcout) { return div255_accurate( s*inv(da) ); }
5716 BLEND_MODE(dstout) { return div255_accurate( d*inv(sa) ); }
5717 BLEND_MODE(srcover) { return s + div255_accurate( d*inv(sa) ); }
5718 BLEND_MODE(dstover) { return d + div255_accurate( s*inv(da) ); }
5719 BLEND_MODE(modulate) { return div255_accurate( s*d ); }
5720 BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
5721 BLEND_MODE(plus_) { return min(s+d, 255); }
5722 BLEND_MODE(screen) { return s + d - div255_accurate( s*d ); }
5723 BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
5724#endif
5725#undef BLEND_MODE
5726
5727// The same logic applied to color, and srcover for alpha.
5728#define BLEND_MODE(name) \
5729 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
5730 STAGE_PP(name, NoCtx) { \
5731 r = name##_channel(r,dr,a,da); \
5732 g = name##_channel(g,dg,a,da); \
5733 b = name##_channel(b,db,a,da); \
5734 a = a + div255( da*inv(a) ); \
5735 } \
5736 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
5737
5738 BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); }
5739 BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); }
5740 BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
5741 BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); }
5742
5743 BLEND_MODE(hardlight) {
5744 return div255( s*inv(da) + d*inv(sa) +
5745 if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
5746 }
5747 BLEND_MODE(overlay) {
5748 return div255( s*inv(da) + d*inv(sa) +
5749 if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
5750 }
5751#undef BLEND_MODE
5752
5753// ~~~~~~ Helpers for interacting with memory ~~~~~~ //
5754
5755template <typename T>
5756SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
5757 return (T*)ctx->pixels + dy*ctx->stride + dx;
5758}
5759
5760template <typename T>
5761SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
5762 // Exclusive -> inclusive.
5763 const F w = F_(sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1)),
5764 h = F_(sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1));
5765
5767
5768 x = min(max(z, x), w);
5769 y = min(max(z, y), h);
5770
5771 x = sk_bit_cast<F>(sk_bit_cast<U32>(x) - (uint32_t)ctx->roundDownAtInteger);
5772 y = sk_bit_cast<F>(sk_bit_cast<U32>(y) - (uint32_t)ctx->roundDownAtInteger);
5773
5774 *ptr = (const T*)ctx->pixels;
5775 return trunc_(y)*ctx->stride + trunc_(x);
5776}
5777
5778template <typename T>
5779SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, I32 x, I32 y) {
5780 // This flag doesn't make sense when the coords are integers.
5781 SkASSERT(ctx->roundDownAtInteger == 0);
5782 // Exclusive -> inclusive.
5783 const I32 w = I32_( ctx->width - 1),
5784 h = I32_(ctx->height - 1);
5785
5786 U32 ax = cast<U32>(min(max(0, x), w)),
5787 ay = cast<U32>(min(max(0, y), h));
5788
5789 *ptr = (const T*)ctx->pixels;
5790 return ay * ctx->stride + ax;
5791}
5792
5793template <typename V, typename T>
5794SI V load(const T* ptr) {
5795 V v;
5796 memcpy(&v, ptr, sizeof(v));
5797 return v;
5798}
5799template <typename V, typename T>
5800SI void store(T* ptr, V v) {
5801 memcpy(ptr, &v, sizeof(v));
5802}
5803
5804#if defined(JUMPER_IS_SKX)
5805 template <typename V, typename T>
5806 SI V gather(const T* ptr, U32 ix) {
5807 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5808 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5809 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5810 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5811 }
5812
5813 template<>
5814 F gather(const float* ptr, U32 ix) {
5815 return _mm512_i32gather_ps((__m512i)ix, ptr, 4);
5816 }
5817
5818 template<>
5819 U32 gather(const uint32_t* ptr, U32 ix) {
5820 return (U32)_mm512_i32gather_epi32((__m512i)ix, ptr, 4);
5821 }
5822
5823#elif defined(JUMPER_IS_HSW)
5824 template <typename V, typename T>
5825 SI V gather(const T* ptr, U32 ix) {
5826 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5827 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5828 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5829 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5830 }
5831
5832 template<>
5833 F gather(const float* ptr, U32 ix) {
5834 __m256i lo, hi;
5835 split(ix, &lo, &hi);
5836
5837 return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
5838 _mm256_i32gather_ps(ptr, hi, 4));
5839 }
5840
5841 template<>
5842 U32 gather(const uint32_t* ptr, U32 ix) {
5843 __m256i lo, hi;
5844 split(ix, &lo, &hi);
5845
5846 return join<U32>(_mm256_i32gather_epi32((const int*)ptr, lo, 4),
5847 _mm256_i32gather_epi32((const int*)ptr, hi, 4));
5848 }
5849#elif defined(JUMPER_IS_LASX)
5850 template <typename V, typename T>
5851 SI V gather(const T* ptr, U32 ix) {
5852 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5853 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5854 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5855 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5856 }
5857#else
5858 template <typename V, typename T>
5859 SI V gather(const T* ptr, U32 ix) {
5860 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5861 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
5862 }
5863#endif
5864
5865
5866// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
5867
5868SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
5869#if defined(JUMPER_IS_SKX)
5870 rgba = (U32)_mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), (__m512i)rgba);
5871 auto cast_U16 = [](U32 v) -> U16 {
5872 return (U16)_mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
5873 _mm512_extracti64x4_epi64((__m512i)v, 1));
5874 };
5875#elif defined(JUMPER_IS_HSW)
5876 // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
5877 __m256i _01,_23;
5878 split(rgba, &_01, &_23);
5879 __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
5880 _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
5881 rgba = join<U32>(_02, _13);
5882
5883 auto cast_U16 = [](U32 v) -> U16 {
5884 __m256i _02,_13;
5885 split(v, &_02,&_13);
5886 return (U16)_mm256_packus_epi32(_02,_13);
5887 };
5888#elif defined(JUMPER_IS_LASX)
5889 __m256i _01, _23;
5890 split(rgba, &_01, &_23);
5891 __m256i _02 = __lasx_xvpermi_q(_01, _23, 0x02),
5892 _13 = __lasx_xvpermi_q(_01, _23, 0x13);
5893 rgba = join<U32>(_02, _13);
5894
5895 auto cast_U16 = [](U32 v) -> U16 {
5896 __m256i _02,_13;
5897 split(v, &_02,&_13);
5898 __m256i tmp0 = __lasx_xvsat_wu(_02, 15);
5899 __m256i tmp1 = __lasx_xvsat_wu(_13, 15);
5900 return __lasx_xvpickev_h(tmp1, tmp0);
5901 };
5902#else
5903 auto cast_U16 = [](U32 v) -> U16 {
5904 return cast<U16>(v);
5905 };
5906#endif
5907 *r = cast_U16(rgba & 65535) & 255;
5908 *g = cast_U16(rgba & 65535) >> 8;
5909 *b = cast_U16(rgba >> 16) & 255;
5910 *a = cast_U16(rgba >> 16) >> 8;
5911}
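// Worked example of the unpacking above: for a packed pixel rgba = 0x80FF4020, the low half
// 0x4020 yields r = 0x20 and g = 0x40, and the high half 0x80FF yields b = 0xFF and a = 0x80;
// i.e. channels come out in R, G, B, A byte order from least to most significant.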
5912
5913SI void load_8888_(const uint32_t* ptr, U16* r, U16* g, U16* b, U16* a) {
5914#if 1 && defined(JUMPER_IS_NEON)
5915 uint8x8x4_t rgba = vld4_u8((const uint8_t*)(ptr));
5916 *r = cast<U16>(rgba.val[0]);
5917 *g = cast<U16>(rgba.val[1]);
5918 *b = cast<U16>(rgba.val[2]);
5919 *a = cast<U16>(rgba.val[3]);
5920#else
5921 from_8888(load<U32>(ptr), r,g,b,a);
5922#endif
5923}
5924SI void store_8888_(uint32_t* ptr, U16 r, U16 g, U16 b, U16 a) {
5925 r = min(r, 255);
5926 g = min(g, 255);
5927 b = min(b, 255);
5928 a = min(a, 255);
5929
5930#if 1 && defined(JUMPER_IS_NEON)
5931 uint8x8x4_t rgba = {{
5932 cast<U8>(r),
5933 cast<U8>(g),
5934 cast<U8>(b),
5935 cast<U8>(a),
5936 }};
5937 vst4_u8((uint8_t*)(ptr), rgba);
5938#else
5939 store(ptr, cast<U32>(r | (g<<8)) << 0
5940 | cast<U32>(b | (a<<8)) << 16);
5941#endif
5942}
5943
5944STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
5945 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), &r,&g,&b,&a);
5946}
5947STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
5948 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), &dr,&dg,&db,&da);
5949}
5950STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
5951 store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), r,g,b,a);
5952}
5953STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
5954 const uint32_t* ptr;
5955 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
5956 from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
5957}
5958
5959// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
5960
5961SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
5962 // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
5963 U16 R = (rgb >> 11) & 31,
5964 G = (rgb >> 5) & 63,
5965 B = (rgb >> 0) & 31;
5966
5967 // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
5968 *r = (R << 3) | (R >> 2);
5969 *g = (G << 2) | (G >> 4);
5970 *b = (B << 3) | (B >> 2);
5971}
5972SI void load_565_(const uint16_t* ptr, U16* r, U16* g, U16* b) {
5973 from_565(load<U16>(ptr), r,g,b);
5974}
5975SI void store_565_(uint16_t* ptr, U16 r, U16 g, U16 b) {
5976 r = min(r, 255);
5977 g = min(g, 255);
5978 b = min(b, 255);
5979
5980 // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f.
5981 // (Don't feel like you need to find some fundamental truth in these...
5982 // they were brute-force searched.)
5983 U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half.
5984 G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly.
5985 B = (b * 9 + 36) / 74;
5986 // Pack them back into 15|rrrrr gggggg bbbbb|0.
5987 store(ptr, R << 11
5988 | G << 5
5989 | B << 0);
5990}
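// Worked checks of the constants above: from_565's bit replication hits the endpoints exactly
// (R = 31 gives (31 << 3) | (31 >> 2) = 248 | 7 = 255), and the store rounding matches
// x*(31/255.0f) + 0.5f at full scale: r = 255 gives (255*9 + 36)/74 = 2331/74 = 31, and
// g = 255 gives (255*21 + 42)/85 = 5397/85 = 63.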
5991
5992STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
5993 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &r,&g,&b);
5994 a = U16_255;
5995}
5996STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
5997 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &dr,&dg,&db);
5998 da = U16_255;
5999}
6000STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
6001 store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), r,g,b);
6002}
6003STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
6004 const uint16_t* ptr;
6005 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
6006 from_565(gather<U16>(ptr, ix), &r, &g, &b);
6007 a = U16_255;
6008}
6009
6010SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
6011 // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
6012 U16 R = (rgba >> 12) & 15,
6013 G = (rgba >> 8) & 15,
6014 B = (rgba >> 4) & 15,
6015 A = (rgba >> 0) & 15;
6016
6017 // Scale [0,15] to [0,255].
6018 *r = (R << 4) | R;
6019 *g = (G << 4) | G;
6020 *b = (B << 4) | B;
6021 *a = (A << 4) | A;
6022}
6023SI void load_4444_(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
6024 from_4444(load<U16>(ptr), r,g,b,a);
6025}
6026SI void store_4444_(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
6027 r = min(r, 255);
6028 g = min(g, 255);
6029 b = min(b, 255);
6030 a = min(a, 255);
6031
6032 // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f).
6033 U16 R = (r + 8) / 17,
6034 G = (g + 8) / 17,
6035 B = (b + 8) / 17,
6036 A = (a + 8) / 17;
6037 // Pack them back into 15|rrrr gggg bbbb aaaa|0.
6038 store(ptr, R << 12
6039 | G << 8
6040 | B << 4
6041 | A << 0);
6042}
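// Worked check of the rounding above: r = 128 gives (128 + 8)/17 = 8, matching
// trunc(128*(15/255.0f) + 0.5f) = trunc(8.03) = 8, and r = 255 gives 263/17 = 15, the 4-bit maximum.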
6043
6044STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
6045 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &r,&g,&b,&a);
6046}
6047STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
6048 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &dr,&dg,&db,&da);
6049}
6050STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
6051 store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), r,g,b,a);
6052}
6053STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
6054 const uint16_t* ptr;
6055 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
6056 from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
6057}
6058
6059SI void from_88(U16 rg, U16* r, U16* g) {
6060 *r = (rg & 0xFF);
6061 *g = (rg >> 8);
6062}
6063
6064SI void load_88_(const uint16_t* ptr, U16* r, U16* g) {
6065#if 1 && defined(JUMPER_IS_NEON)
6066 uint8x8x2_t rg = vld2_u8((const uint8_t*)(ptr));
6067 *r = cast<U16>(rg.val[0]);
6068 *g = cast<U16>(rg.val[1]);
6069#else
6070 from_88(load<U16>(ptr), r,g);
6071#endif
6072}
6073
6074SI void store_88_(uint16_t* ptr, U16 r, U16 g) {
6075 r = min(r, 255);
6076 g = min(g, 255);
6077
6078#if 1 && defined(JUMPER_IS_NEON)
6079 uint8x8x2_t rg = {{
6080 cast<U8>(r),
6081 cast<U8>(g),
6082 }};
6083 vst2_u8((uint8_t*)(ptr), rg);
6084#else
6085 store(ptr, cast<U16>(r | (g<<8)) << 0);
6086#endif
6087}
6088
6089STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
6090 load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), &r, &g);
6091 b = U16_0;
6092 a = U16_255;
6093}
6094STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
6095 load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), &dr, &dg);
6096 db = U16_0;
6097 da = U16_255;
6098}
6099STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
6100 store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), r, g);
6101}
6102STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
6103 const uint16_t* ptr;
6104 U32 ix = ix_and_ptr(&ptr, ctx, x, y);
6105 from_88(gather<U16>(ptr, ix), &r, &g);
6106 b = U16_0;
6107 a = U16_255;
6108}
6109
6110// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
6111
6112SI U16 load_8(const uint8_t* ptr) {
6113 return cast<U16>(load<U8>(ptr));
6114}
6115SI void store_8(uint8_t* ptr, U16 v) {
6116 v = min(v, 255);
6117 store(ptr, cast<U8>(v));
6118}
6119
6120STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
6121 r = g = b = U16_0;
6122 a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6123}
6124STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
6125 dr = dg = db = U16_0;
6126 da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6127}
6128STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
6129 store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), a);
6130}
6131STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
6132 const uint8_t* ptr;
6133 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
6134 r = g = b = U16_0;
6135 a = cast<U16>(gather<U8>(ptr, ix));
6136}
6137STAGE_PP(store_r8, const SkRasterPipeline_MemoryCtx* ctx) {
6138 store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), r);
6139}
6140
6141STAGE_PP(alpha_to_gray, NoCtx) {
6142 r = g = b = a;
6143 a = U16_255;
6144}
6145STAGE_PP(alpha_to_gray_dst, NoCtx) {
6146 dr = dg = db = da;
6147 da = U16_255;
6148}
6149STAGE_PP(alpha_to_red, NoCtx) {
6150 r = a;
6151 a = U16_255;
6152}
6153STAGE_PP(alpha_to_red_dst, NoCtx) {
6154 dr = da;
6155 da = U16_255;
6156}
6157
6158STAGE_PP(bt709_luminance_or_luma_to_alpha, NoCtx) {
6159 a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator.
6160 r = g = b = U16_0;
6161}
6162STAGE_PP(bt709_luminance_or_luma_to_rgb, NoCtx) {
6163 r = g = b =(r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator.
6164}
6165
6166// ~~~~~~ Coverage scales / lerps ~~~~~~ //
6167
6168STAGE_PP(load_src, const uint16_t* ptr) {
6169 r = sk_unaligned_load<U16>(ptr + 0*N);
6170 g = sk_unaligned_load<U16>(ptr + 1*N);
6171 b = sk_unaligned_load<U16>(ptr + 2*N);
6172 a = sk_unaligned_load<U16>(ptr + 3*N);
6173}
6174STAGE_PP(store_src, uint16_t* ptr) {
6175 sk_unaligned_store(ptr + 0*N, r);
6176 sk_unaligned_store(ptr + 1*N, g);
6177 sk_unaligned_store(ptr + 2*N, b);
6178 sk_unaligned_store(ptr + 3*N, a);
6179}
6180STAGE_PP(store_src_a, uint16_t* ptr) {
6181 sk_unaligned_store(ptr, a);
6182}
6183STAGE_PP(load_dst, const uint16_t* ptr) {
6184 dr = sk_unaligned_load<U16>(ptr + 0*N);
6185 dg = sk_unaligned_load<U16>(ptr + 1*N);
6186 db = sk_unaligned_load<U16>(ptr + 2*N);
6187 da = sk_unaligned_load<U16>(ptr + 3*N);
6188}
6189STAGE_PP(store_dst, uint16_t* ptr) {
6190 sk_unaligned_store(ptr + 0*N, dr);
6191 sk_unaligned_store(ptr + 1*N, dg);
6192 sk_unaligned_store(ptr + 2*N, db);
6193 sk_unaligned_store(ptr + 3*N, da);
6194}
6195
6196// ~~~~~~ Coverage scales / lerps ~~~~~~ //
6197
6198STAGE_PP(scale_1_float, const float* f) {
6199 U16 c = from_float(*f);
6200 r = div255( r * c );
6201 g = div255( g * c );
6202 b = div255( b * c );
6203 a = div255( a * c );
6204}
6205STAGE_PP(lerp_1_float, const float* f) {
6206 U16 c = from_float(*f);
6207 r = lerp(dr, r, c);
6208 g = lerp(dg, g, c);
6209 b = lerp(db, b, c);
6210 a = lerp(da, a, c);
6211}
6212STAGE_PP(scale_native, const uint16_t scales[]) {
6213 auto c = sk_unaligned_load<U16>(scales);
6214 r = div255( r * c );
6215 g = div255( g * c );
6216 b = div255( b * c );
6217 a = div255( a * c );
6218}
6219
6220STAGE_PP(lerp_native, const uint16_t scales[]) {
6221 auto c = sk_unaligned_load<U16>(scales);
6222 r = lerp(dr, r, c);
6223 g = lerp(dg, g, c);
6224 b = lerp(db, b, c);
6225 a = lerp(da, a, c);
6226}
6227
6228STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
6229 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6230 r = div255( r * c );
6231 g = div255( g * c );
6232 b = div255( b * c );
6233 a = div255( a * c );
6234}
6235STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
6236 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6237 r = lerp(dr, r, c);
6238 g = lerp(dg, g, c);
6239 b = lerp(db, b, c);
6240 a = lerp(da, a, c);
6241}
6242
6243// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
6245 return if_then_else(a < da, min(cr, min(cg,cb))
6246 , max(cr, max(cg,cb)));
6247}
6248STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
6249 U16 cr,cg,cb;
6250 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &cr,&cg,&cb);
6251 U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
6252
6253 r = div255( r * cr );
6254 g = div255( g * cg );
6255 b = div255( b * cb );
6256 a = div255( a * ca );
6257}
6258STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
6259 U16 cr,cg,cb;
6260 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &cr,&cg,&cb);
6261 U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
6262
6263 r = lerp(dr, r, cr);
6264 g = lerp(dg, g, cg);
6265 b = lerp(db, b, cb);
6266 a = lerp(da, a, ca);
6267}
6268
6269STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
6270 U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy)),
6271 add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy));
6272
6273 r = min(div255(r*mul) + add, a);
6274 g = min(div255(g*mul) + add, a);
6275 b = min(div255(b*mul) + add, a);
6276}
6277
6278
6279// ~~~~~~ Gradient stages ~~~~~~ //
6280
6281// Clamp x to [0,1], both sides inclusive (think, gradients).
6282// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
6283SI F clamp_01_(F v) { return min(max(0, v), 1); }
6284
6285STAGE_GG(clamp_x_1 , NoCtx) { x = clamp_01_(x); }
6286STAGE_GG(repeat_x_1, NoCtx) { x = clamp_01_(x - floor_(x)); }
6287STAGE_GG(mirror_x_1, NoCtx) {
6288 auto two = [](F x){ return x+x; };
6289 x = clamp_01_(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
6290}
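// Worked example of the mirroring above: for x = 2.3, (x-1) = 1.3 and floor(1.3 * 0.5) = 0, so
// the expression is |1.3 - 0 - 1| = 0.3, the same result as reflecting 2.3 back into [0,1]
// under mirror tiling (0 -> 0, 1 -> 1, 2 -> 0, ...).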
6291
6292SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
6293
6294STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
6295 auto w = ctx->limit_x;
6296 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
6297}
6298STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
6299 auto h = ctx->limit_y;
6300 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
6301}
6302STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
6303 auto w = ctx->limit_x;
6304 auto h = ctx->limit_y;
6305 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
6306}
6307STAGE_GG(clamp_x_and_y, SkRasterPipeline_CoordClampCtx* ctx) {
6308 x = min(ctx->max_x, max(ctx->min_x, x));
6309 y = min(ctx->max_y, max(ctx->min_y, y));
6310}
6311STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
6312 auto mask = sk_unaligned_load<U16>(ctx->mask);
6313 r = r & mask;
6314 g = g & mask;
6315 b = b & mask;
6316 a = a & mask;
6317}
6318
6319SI void round_F_to_U16(F R, F G, F B, F A, U16* r, U16* g, U16* b, U16* a) {
6320 auto round_color = [](F x) { return cast<U16>(x * 255.0f + 0.5f); };
6321
6322 *r = round_color(min(max(0, R), 1));
6323 *g = round_color(min(max(0, G), 1));
6324 *b = round_color(min(max(0, B), 1));
6325 *a = round_color(A); // we assume alpha is already in [0,1].
6326}
6327
6329 U16* r, U16* g, U16* b, U16* a) {
6330
6331 F fr, fg, fb, fa, br, bg, bb, ba;
6332#if defined(JUMPER_IS_HSW)
6333 if (c->stopCount <=8) {
6334 __m256i lo, hi;
6335 split(idx, &lo, &hi);
6336
6337 fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
6338 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
6339 br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
6340 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
6341 fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
6342 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
6343 bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
6344 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
6345 fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
6346 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
6347 bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
6348 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
6349 fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
6350 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
6351 ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
6352 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
6353 } else
6354#elif defined(JUMPER_IS_LASX)
6355 if (c->stopCount <= 8) {
6356 __m256i lo, hi;
6357 split(idx, &lo, &hi);
6358
6359 fr = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), lo),
6360 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), hi));
6361 br = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), lo),
6362 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), hi));
6363 fg = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), lo),
6364 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), hi));
6365 bg = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), lo),
6366 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), hi));
6367 fb = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), lo),
6368 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), hi));
6369 bb = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), lo),
6370 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), hi));
6371 fa = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), lo),
6372 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), hi));
6373 ba = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), lo),
6374 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), hi));
6375 } else
6376#elif defined(JUMPER_IS_LSX)
6377 if (c->stopCount <= 4) {
6378 __m128i lo, hi;
6379 split(idx, &lo, &hi);
6380 __m128i zero = __lsx_vldi(0);
6381 fr = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[0], 0)),
6382 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[0], 0)));
6383 br = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[0], 0)),
6384 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[0], 0)));
6385 fg = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[1], 0)),
6386 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[1], 0)));
6387 bg = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[1], 0)),
6388 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[1], 0)));
6389 fb = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[2], 0)),
6390 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[2], 0)));
6391 bb = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[2], 0)),
6392 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[2], 0)));
6393 fa = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[3], 0)),
6394 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[3], 0)));
6395 ba = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[3], 0)),
6396 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[3], 0)));
6397 } else
6398#endif
6399 {
6400 fr = gather<F>(c->fs[0], idx);
6401 fg = gather<F>(c->fs[1], idx);
6402 fb = gather<F>(c->fs[2], idx);
6403 fa = gather<F>(c->fs[3], idx);
6404 br = gather<F>(c->bs[0], idx);
6405 bg = gather<F>(c->bs[1], idx);
6406 bb = gather<F>(c->bs[2], idx);
6407 ba = gather<F>(c->bs[3], idx);
6408 }
6409 round_F_to_U16(mad(t, fr, br),
6410 mad(t, fg, bg),
6411 mad(t, fb, bb),
6412 mad(t, fa, ba),
6413 r,g,b,a);
6414}
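// In scalar terms, gradient_lookup evaluates a per-segment scale/bias form of the stop-to-stop
// lerp for each channel, e.g. for red: r = c->fs[0][idx] * t + c->bs[0][idx] (the mad above).
// The vectorized branches only speed up the table lookup itself: when every stop fits in one
// register (at most 8 stops on the AVX2/LASX paths above, at most 4 on LSX) a register permute
// replaces the general gather used by the fallback path.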
6415
6416STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) {
6417 auto t = x;
6418 U32 idx = U32_(0);
6419
6420 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
6421 for (size_t i = 1; i < c->stopCount; i++) {
6422 idx += if_then_else(t >= c->ts[i], U32_(1), U32_(0));
6423 }
6424
6425 gradient_lookup(c, idx, t, &r, &g, &b, &a);
6426}
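// For example (illustrative), with stops ts = {0, 0.25, 0.5, 1.0} and t = 0.6 the loop adds 1
// for ts[1] and ts[2] (0.6 >= 0.25 and 0.6 >= 0.5) but not for ts[3], so idx = 2 and
// gradient_lookup interpolates within the segment that begins at the third stop.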
6427
6428STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
6429 auto t = x;
6430 auto idx = trunc_(t * static_cast<float>(c->stopCount-1));
6431 gradient_lookup(c, idx, t, &r, &g, &b, &a);
6432}
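// Illustrative: with 5 evenly spaced stops and t = 0.6, idx = trunc_(0.6 * 4) = 2, selecting
// the same segment directly without any per-stop comparisons.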
6433
6434STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
6435 auto t = x;
6436 round_F_to_U16(mad(t, c->f[0], c->b[0]),
6437 mad(t, c->f[1], c->b[1]),
6438 mad(t, c->f[2], c->b[2]),
6439 mad(t, c->f[3], c->b[3]),
6440 &r,&g,&b,&a);
6441}
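// Here the context is presumably laid out as one scale/bias pair per channel, i.e. f = c1 - c0
// and b = c0, so mad(t, f, b) = c0 + t*(c1 - c0): the ordinary two-color lerp done with a
// single fused multiply-add per channel and no table lookup at all.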
6442
6443STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
6444 // Quantize the sample point and transform it into lerp coordinates, converting it to a
6445 // 16.16 fixed-point number.
6446 I32 qx = cast<I32>(floor_(65536.0f * x + 0.5f)) - 32768,
6447 qy = cast<I32>(floor_(65536.0f * y + 0.5f)) - 32768;
6448
6449 // Calculate screen coordinates sx & sy by flooring qx and qy.
6450 I32 sx = qx >> 16,
6451 sy = qy >> 16;
6452
6453 // We are going to perform a change of parameters for qx on [0, 1) to tx on [-1, 1).
6454 // This will put tx in Q15 format for use with q_mult.
6455 // Calculate tx and ty on the interval [-1, 1). Given that {qx} and {qy} are on the interval
6456 // [0, 1), where {v} is fract(v), we can transform to tx in the following manner (ty follows
6457 // the same math):
6458 // tx = 2 * {qx} - 1, so
6459 // {qx} = (tx + 1) / 2.
6460 // Calculate {qx} - 1 and {qy} - 1 where the {} operation is handled by the cast, and the - 1
6461 // is handled by the ^ 0x8000, dividing by 2 is deferred and handled in lerpX and lerpY in
6462 // order to use the full 16-bit resolution.
6463 I16 tx = cast<I16>(qx ^ 0x8000),
6464 ty = cast<I16>(qy ^ 0x8000);
6465
6466 // Substituting the {qx} by the equation for tx from above into the lerp equation where v is
6467 // the lerped value:
6468 // v = {qx}*(R - L) + L,
6469 // v = 1/2*(tx + 1)*(R - L) + L
6470 // 2 * v = (tx + 1)*(R - L) + 2*L
6471 // = tx*R - tx*L + R - L + 2*L
6472 // = tx*(R - L) + (R + L).
6473 // Since R and L are on [0, 255] we need them on the interval [0, 1/2] to get them into form
6474 // for Q15_mult. If L and R were in 16.16 format, this would be done by dividing by 2^9. In
6475 // code, we can multiply by 2^7 to get the value directly.
6476 // 2 * v = tx*(R - L) + (R + L)
6477 // 2^-9 * 2 * v = tx*(R - L)*2^-9 + (R + L)*2^-9
6478 // 2^-8 * v = 2^-9 * (tx*(R - L) + (R + L))
6479 // v = 1/2 * (tx*(R - L) + (R + L))
6480 auto lerpX = [&](U16 left, U16 right) -> U16 {
6481 I16 width = (I16)(right - left) << 7;
6482 U16 middle = (right + left) << 7;
6483 // The constrained_add is the most subtle part of lerp. The first term is on the interval
6484 // [-1, 1), and the second term is on the interval [0, 1), because both terms are too high
6485 // by a factor of 2, which is handled below. (Both R and L are on [0, 1/2), but the sum
6486 // R + L is on the interval [0, 1).) In general, the sum below could overflow, but because
6487 // we know the sum produces an output on the interval [0, 1) we know that the extra bit
6488 // that would be needed will always be 0. So we need to be careful to treat this sum as an
6489 // unsigned positive number in the divide by 2 below.
6490 // Add +1 for rounding.
6491 U16 v2 = constrained_add(scaled_mult(tx, width), middle) + 1;
6492 // Divide by 2 to calculate v and at the same time bring the intermediate value onto the
6493 // interval [0, 1/2] to set up for the lerpY.
6494 return v2 >> 1;
6495 };
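    // A worked example (illustrative, treating scaled_mult as a rounding Q15 multiply): take
    // left = 100, right = 200 and {qx} = 0.75, so tx = 2*0.75 - 1 = 0.5, i.e. 16384 in Q15.
    //   width  = (200 - 100) << 7 = 12800
    //   middle = (200 + 100) << 7 = 38400
    //   scaled_mult(tx, width) ~ (16384 * 12800) >> 15 = 6400
    //   v2 = 6400 + 38400 + 1 = 44801, and v2 >> 1 = 22400 = 175 << 7
    // That is the expected lerp 0.75*(200 - 100) + 100 = 175 kept in <<7 fixed-point form;
    // lerpY's final >> 8 brings the result back onto [0, 255].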
6496
6497 const uint32_t* ptr;
6498 U32 ix = ix_and_ptr(&ptr, ctx, sx, sy);
6499 U16 leftR, leftG, leftB, leftA;
6500 from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
6501
6502 ix = ix_and_ptr(&ptr, ctx, sx+1, sy);
6503 U16 rightR, rightG, rightB, rightA;
6504 from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
6505
6506 U16 topR = lerpX(leftR, rightR),
6507 topG = lerpX(leftG, rightG),
6508 topB = lerpX(leftB, rightB),
6509 topA = lerpX(leftA, rightA);
6510
6511 ix = ix_and_ptr(&ptr, ctx, sx, sy+1);
6512 from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
6513
6514 ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1);
6515 from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
6516
6517 U16 bottomR = lerpX(leftR, rightR),
6518 bottomG = lerpX(leftG, rightG),
6519 bottomB = lerpX(leftB, rightB),
6520 bottomA = lerpX(leftA, rightA);
6521
6522 // lerpY plays the same mathematical tricks as lerpX, but the final divide is by 256 resulting
6523 // in a value on [0, 255].
6524 auto lerpY = [&](U16 top, U16 bottom) -> U16 {
6525 I16 width = (I16)bottom - (I16)top;
6526 U16 middle = bottom + top;
6527 // Add + 0x80 for rounding.
6528 U16 blend = constrained_add(scaled_mult(ty, width), middle) + 0x80;
6529
6530 return blend >> 8;
6531 };
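    // Continuing the example above (illustrative): with top = bottom = 22400 the ty term is 0,
    // middle = 44800, blend = 44800 + 0x80 = 44928, and blend >> 8 = 175, back on [0, 255].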
6532
6533 r = lerpY(topR, bottomR);
6534 g = lerpY(topG, bottomG);
6535 b = lerpY(topB, bottomB);
6536 a = lerpY(topA, bottomA);
6537}
6538
6539STAGE_GG(xy_to_unit_angle, NoCtx) {
6540 F xabs = abs_(x),
6541 yabs = abs_(y);
6542
6543 F slope = min(xabs, yabs)/max(xabs, yabs);
6544 F s = slope * slope;
6545
6546 // Use a 7th degree polynomial to approximate atan.
6547 // This was generated using sollya.gforge.inria.fr.
6548 // A float optimized polynomial was generated using the following command.
6549 // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
6550 F phi = slope
6551 * (0.15912117063999176025390625f + s
6552 * (-5.185396969318389892578125e-2f + s
6553 * (2.476101927459239959716796875e-2f + s
6554 * (-7.0547382347285747528076171875e-3f))));
6555
6556 phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
6557 phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi);
6558 phi = if_then_else(y < 0.0f , 1.0f - phi , phi);
6559 phi = if_then_else(phi != phi , 0 , phi); // Check for NaN.
6560 x = phi;
6561}
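// Sanity check for xy_to_unit_angle (illustrative): for x = y = 1, slope = 1 and the polynomial
// sums to roughly 0.159121 - 0.051854 + 0.024761 - 0.007055 = 0.124973, about 1/8 of a turn,
// which matches the expected unit angle for 45 degrees.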
6562STAGE_GG(xy_to_radius, NoCtx) {
6563 x = sqrt_(x*x + y*y);
6564}
6565
6566// ~~~~~~ Compound stages ~~~~~~ //
6567
6568STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
6569 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
6570
6571 load_8888_(ptr, &dr,&dg,&db,&da);
6572 r = r + div255( dr*inv(a) );
6573 g = g + div255( dg*inv(a) );
6574 b = b + div255( db*inv(a) );
6575 a = a + div255( da*inv(a) );
6576 store_8888_(ptr, r,g,b,a);
6577}
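// This compound stage fuses loading the destination, srcover blending, and storing the result
// into one pass. Assuming the lowp inv(v) is the complement 255 - v, r + div255(dr*inv(a)) is
// the fixed-point form of src + dst*(1 - srcAlpha), avoiding separate load/blend/store stages.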
6578
6579// ~~~~~~ skgpu::Swizzle stage ~~~~~~ //
6580
6581STAGE_PP(swizzle, void* ctx) {
6582 auto ir = r, ig = g, ib = b, ia = a;
6583 U16* o[] = {&r, &g, &b, &a};
6584 char swiz[4];
6585 memcpy(swiz, &ctx, sizeof(swiz));
6586
6587 for (int i = 0; i < 4; ++i) {
6588 switch (swiz[i]) {
6589 case 'r': *o[i] = ir; break;
6590 case 'g': *o[i] = ig; break;
6591 case 'b': *o[i] = ib; break;
6592 case 'a': *o[i] = ia; break;
6593 case '0': *o[i] = U16_0; break;
6594 case '1': *o[i] = U16_255; break;
6595 default: break;
6596 }
6597 }
6598}
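// The four swizzle characters are packed directly into the bytes of the context pointer itself,
// hence the memcpy from &ctx rather than from ctx. For example, a hypothetical "rrra" swizzle
// would copy the incoming red channel into r, g, and b while leaving alpha untouched.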
6599
6600#endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages
6601} // namespace lowp
6602
6603/* This gives us SK_OPTS::lowp::N if lowp::N has been set, or SK_OPTS::N if it hasn't. */
6604namespace lowp { static constexpr size_t lowp_N = N; }
6605
6606/** Allow outside code to access the Raster Pipeline pixel stride. */
6607constexpr size_t raster_pipeline_lowp_stride() { return lowp::lowp_N; }
6608constexpr size_t raster_pipeline_highp_stride() { return N; }
6609
6610} // namespace SK_OPTS_NS
6611
6612#undef SI
6613
6614#endif//SkRasterPipeline_opts_DEFINED