Flutter Engine
The Flutter Engine
SkBitmapProcState_opts.h
Go to the documentation of this file.
1/*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef SkBitmapProcState_opts_DEFINED
9#define SkBitmapProcState_opts_DEFINED
10
11#include "src/base/SkMSAN.h"
12#include "src/base/SkVx.h"
14
15// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
16//
17// Only S32_alpha_D32_filter_DX exploits instructions beyond
18// our common baseline SSE2/NEON instruction sets, so that's
19// all that lives here.
20//
21// The rest are scattershot at the moment but I want to get them
22// all migrated to be normal code inside SkBitmapProcState.cpp.
23
24#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
25 #include <immintrin.h>
26#elif defined(SK_ARM_HAS_NEON)
27 #include <arm_neon.h>
28#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
29 #include <lasxintrin.h>
30#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
31 #include <lsxintrin.h>
32#endif
33
34namespace SK_OPTS_NS {
35
36// This same basic packing scheme is used throughout the file.
// Splits one packed sample descriptor into its three fields.
//
// Bit layout of `packed` (most significant first):
//   [31..18]  v0 - first integer coordinate  (x0 or y0), 14 bits
//   [17..14]  w  - lerp weight applied to v1, 4 bits; v0 is weighted by 16-w
//   [13.. 0]  v1 - second integer coordinate (x1 or y1), 14 bits
//
// The results are written through v0, v1 and w, in that order.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    constexpr int kCoordBits  = 14;
    constexpr int kWeightBits = 4;

    const auto first  = packed >> (kCoordBits + kWeightBits);
    const auto second = packed & ((1 << kCoordBits) - 1);
    const auto weight = (packed >> kCoordBits) & ((1 << kWeightBits) - 1);

    *v0 = first;
    *v1 = second;
    *w  = weight;
}
43
44#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
45
46 /*not static*/ inline
48 const uint32_t* xy, int count, uint32_t* colors) {
49 SkASSERT(count > 0 && colors != nullptr);
50 SkASSERT(s.fBilerp);
51 SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
52 SkASSERT(s.fAlphaScale <= 256);
53
54 // interpolate_in_x() is the crux of the SSSE3 implementation,
55 // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
56 auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
57 uint32_t B0, uint32_t B1,
58 __m128i interlaced_x_weights) {
59 // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
60 //
61 // It takes two arguments interlaced byte-wise:
62 // - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
63 // - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...]
64 // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
65 //
66 // That's why we go to all this trouble to make interlaced_x_weights,
67 // and here we're about to interlace A0 with A1 and B0 with B1 to match.
68 //
69 // Our interlaced_x_weights are all in [0,16], and so we need not worry about
70 // the signedness of that input nor about the signedness of the output.
71
72 __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
73 interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
74
75 return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
76 interlaced_x_weights);
77 };
78
79 // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
80 // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
81 auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
82 uint32_t A2, uint32_t A3,
83 uint32_t B0, uint32_t B1,
84 uint32_t B2, uint32_t B3,
85 __m128i interlaced_x_weights,
86 int wy) {
87 // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
88 __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
89 bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
90
91 // Interpolate in Y. As in the SSE2 code, we calculate top*(16-wy) + bot*wy
92 // as 16*top + (bot-top)*wy to save a multiply.
93 __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
94 _mm_mullo_epi16(_mm_sub_epi16(bot, top),
95 _mm_set1_epi16(wy)));
96
97 // Scale down by total max weight 16x16 = 256.
98 px = _mm_srli_epi16(px, 8);
99
100 // Scale by alpha if needed.
101 if (s.fAlphaScale < 256) {
102 px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
103 }
104 return px;
105 };
106
107 // We're in _DX mode here, so we're only varying in X.
108 // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
109 // All the other entries in xy will be pairs of X coordinates and the X weight.
110 int y0, y1, wy;
111 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
112
113 auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
114 row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
115
116 while (count >= 4) {
117 // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
118 int x0[4],
119 x1[4];
120 __m128i wx;
121
122 // decode_packed_coordinates_and_weight(), 4x.
123 __m128i packed = _mm_loadu_si128((const __m128i*)xy);
124 _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
125 _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
126 wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf)); // [0,15]
127
128 // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
129 // and sixteen minus that as wl for pixels on the left at x0.
130 __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
131 wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
132
133 // We need to interlace wl and wr for _mm_maddubs_epi16().
134 __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
135 interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);
136
137 enum { A,B,C,D };
138
139 // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
140 // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
141 __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
142 row1[x0[A]], row1[x1[A]],
143 row0[x0[B]], row0[x1[B]],
144 row1[x0[B]], row1[x1[B]],
145 interlaced_x_weights_AB, wy);
146
147 // Once more with the other half of the x-weights for two more pixels C,D.
148 __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
149 row1[x0[C]], row1[x1[C]],
150 row0[x0[D]], row0[x1[D]],
151 row1[x0[D]], row1[x1[D]],
152 interlaced_x_weights_CD, wy);
153
154 // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
155 _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
156 xy += 4;
157 colors += 4;
158 count -= 4;
159 }
160
161 while (count --> 0) {
162 // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
163 int x0, x1, wx;
164 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
165
166 // As above, splat out wx four times as wr, and sixteen minus that as wl.
167 __m128i wr = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine.
168 wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
169
170 __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
171
172 __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
173 row1[x0], row1[x1],
174 0, 0,
175 0, 0,
176 interlaced_x_weights, wy);
177
178 *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
179 }
180 }
181
182
183#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
184
185 /*not static*/ inline
187 const uint32_t* xy, int count, uint32_t* colors) {
188 SkASSERT(count > 0 && colors != nullptr);
189 SkASSERT(s.fBilerp);
190 SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
191 SkASSERT(s.fAlphaScale <= 256);
192
193 int y0, y1, wy;
194 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
195
196 auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
197 row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
198
199 // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
200 // and another in the upper 4 16-bit lanes to line up with 16 - wy.
201 const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here.
202 _mm_set1_epi16(16-wy)); // Top pixel goes here.
203
204 while (count --> 0) {
205 int x0, x1, wx;
206 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
207
208 // Load the 4 pixels we're interpolating, in this grid:
209 // | tl tr |
210 // | bl br |
211 const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
212 bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
213
214 // We want to calculate a sum of 4 pixels weighted in two directions:
215 //
216 // sum = tl * (16-wy) * (16-wx)
217 // + bl * ( wy) * (16-wx)
218 // + tr * (16-wy) * ( wx)
219 // + br * ( wy) * ( wx)
220 //
221 // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
222 //
223 // We've already prepared allY as a vector containing [wy, 16-wy] as a way
224 // to apply those y-direction weights. So we'll start on the x-direction
225 // first, grouping into left and right halves, lined up with allY:
226 //
227 // L = [bl, tl]
228 // R = [br, tr]
229 //
230 // sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
231 //
232 // Rewriting that one more step, we can replace a multiply with a shift:
233 //
234 // sum = horizontalSum( allY * (16*L + (R-L)*wx) )
235 //
236 // That's how we'll actually do this math.
237
238 __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
239 R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());
240
241 __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
242 _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
243
244 __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
245
246 // sum = horizontalSum( ... )
247 __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
248
249 // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
250 sum = _mm_srli_epi16(sum, 8);
251
252 if (s.fAlphaScale < 256) {
253 // Scale by alpha, which is in [0,256].
254 sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
255 sum = _mm_srli_epi16(sum, 8);
256 }
257
258 // Pack back into 8-bit values and store.
259 *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
260 }
261 }
262
263#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
264 /*not static*/ inline
266 const uint32_t* xy, int count, uint32_t* colors) {
267 SkASSERT(count > 0 && colors != nullptr);
268 SkASSERT(s.fBilerp);
269 SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
270 SkASSERT(s.fAlphaScale <= 256);
271
272 int y0, y1, wy;
273 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
274
275 auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
276 row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
277
278 // We'll put one pixel in the low 16 16-bit lanes to line up with wy,
279 // and another in the upper 16 16-bit lanes to line up with 16 - wy.
280 __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));
281
282 while (count --> 0) {
283 int x0, x1, wx;
284 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
285
286 // Load the 4 pixels we're interpolating, in this grid:
287 // | tl tr |
288 // | bl br |
289
290 const __m256i zeros = __lasx_xvldi(0);
291 const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
292 tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
293 bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
294 br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);
295
296 // We want to calculate a sum of 8 pixels weighted in two directions:
297 //
298 // sum = tl * (16-wy) * (16-wx)
299 // + bl * ( wy) * (16-wx)
300 // + tr * (16-wy) * ( wx)
301 // + br * ( wy) * ( wx)
302 //
303 // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
304 //
305 // We've already prepared allY as a vector containing [wy, 16-wy] as a way
306 // to apply those y-direction weights. So we'll start on the x-direction
307 // first, grouping into left and right halves, lined up with allY:
308 //
309 // L = [bl, tl]
310 // R = [br, tr]
311 //
312 // sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
313 //
314 // Rewriting that one more step, we can replace a multiply with a shift:
315 //
316 // sum = horizontalSum( allY * (16*L + (R-L)*wx) )
317 //
318 // That's how we'll actually do this math.
319
320 __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
321 R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));
322
323 __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
324 __lasx_xvmul_h(__lasx_xvsub_h(R,L),
325 __lasx_xvreplgr2vr_h(wx)));
326
327 __m256i sum_in_x = __lasx_xvmul_h(inner, allY);
328
329 // sum = horizontalSum( ... )
330 __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));
331
332 // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
333 sum = __lasx_xvsrli_h(sum, 8);
334
335 if (s.fAlphaScale < 256) {
336 // Scale by alpha, which is in [0,256].
337 sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
338 sum = __lasx_xvsrli_h(sum, 8);
339 }
340
341 // Pack back into 8-bit values and store.
342 *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
343 __lasx_xvsat_hu(sum, 8)), 0);
344 }
345 }
346
347#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
348
349 /*not static*/ inline
351 const uint32_t* xy, int count, uint32_t* colors) {
352 SkASSERT(count > 0 && colors != nullptr);
353 SkASSERT(s.fBilerp);
354 SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
355 SkASSERT(s.fAlphaScale <= 256);
356
357 int y0, y1, wy;
358 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
359
360 auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
361 row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
362
363 // We'll put one pixel in the low 8 16-bit lanes to line up with wy,
364 // and another in the upper 8 16-bit lanes to line up with 16 - wy.
365 __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));
366
367 while (count --> 0) {
368 int x0, x1, wx;
369 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
370
371 // Load the 4 pixels we're interpolating, in this grid:
372 // | tl tr |
373 // | bl br |
374 const __m128i zeros = __lsx_vldi(0);
375 const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
376 tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
377 bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
378 br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);
379
380 // We want to calculate a sum of 8 pixels weighted in two directions:
381 //
382 // sum = tl * (16-wy) * (16-wx)
383 // + bl * ( wy) * (16-wx)
384 // + tr * (16-wy) * ( wx)
385 // + br * ( wy) * ( wx)
386 //
387 // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
388 //
389 // We've already prepared allY as a vector containing [wy, 16-wy] as a way
390 // to apply those y-direction weights. So we'll start on the x-direction
391 // first, grouping into left and right halves, lined up with allY:
392 //
393 // L = [bl, tl]
394 // R = [br, tr]
395 //
396 // sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
397 //
398 // Rewriting that one more step, we can replace a multiply with a shift:
399 //
400 // sum = horizontalSum( allY * (16*L + (R-L)*wx) )
401 //
402 // That's how we'll actually do this math.
403
404
405 __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
406 R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));
407
408 __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
409 __lsx_vmul_h(__lsx_vsub_h(R,L),
410 __lsx_vreplgr2vr_h(wx)));
411
412 __m128i sum_in_x = __lsx_vmul_h(inner, allY);
413
414 // sum = horizontalSum( ... )
415 __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));
416
417 // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
418 sum = __lsx_vsrli_h(sum, 8);
419
420 if (s.fAlphaScale < 256) {
421 // Scale by alpha, which is in [0,256].
422 sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
423 sum = __lsx_vsrli_h(sum, 8);
424 }
425
426 // Pack back into 8-bit values and store.
427 *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
428 __lsx_vsat_hu(sum, 8)), 0);
429 }
430 }
431
432#else
433
434 // The NEON code only actually differs from the portable code in the
435 // filtering step after we've loaded all four pixels we want to bilerp.
436
437 #if defined(SK_ARM_HAS_NEON)
438 static void filter_and_scale_by_alpha(unsigned x, unsigned y,
439 SkPMColor a00, SkPMColor a01,
440 SkPMColor a10, SkPMColor a11,
441 SkPMColor *dst,
442 uint16_t scale) {
443 uint8x8_t vy, vconst16_8, v16_y, vres;
444 uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
445 uint32x2_t va0, va1;
446 uint16x8_t tmp1, tmp2;
447
448 vy = vdup_n_u8(y); // duplicate y into vy
449 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8
450 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y
451
452 va0 = vdup_n_u32(a00); // duplicate a00
453 va1 = vdup_n_u32(a10); // duplicate a10
454 va0 = vset_lane_u32(a01, va0, 1); // set top to a01
455 va1 = vset_lane_u32(a11, va1, 1); // set top to a11
456
457 tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
458 tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y
459
460 vx = vdup_n_u16(x); // duplicate x into vx
461 vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16
462 v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
463
464 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x
465 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x
466 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
467 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
468
469 if (scale < 256) {
470 vscale = vdup_n_u16(scale); // duplicate scale
471 tmp = vshr_n_u16(tmp, 8); // shift down result by 8
472 tmp = vmul_u16(tmp, vscale); // multiply result by scale
473 }
474
475 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
476 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result
477 }
478 #else
479 static void filter_and_scale_by_alpha(unsigned x, unsigned y,
480 SkPMColor a00, SkPMColor a01,
481 SkPMColor a10, SkPMColor a11,
482 SkPMColor* dstColor,
483 unsigned alphaScale) {
484 SkASSERT((unsigned)x <= 0xF);
485 SkASSERT((unsigned)y <= 0xF);
486 SkASSERT(alphaScale <= 256);
487
488 int xy = x * y;
489 const uint32_t mask = 0xFF00FF;
490
491 int scale = 256 - 16*y - 16*x + xy;
492 uint32_t lo = (a00 & mask) * scale;
493 uint32_t hi = ((a00 >> 8) & mask) * scale;
494
495 scale = 16*x - xy;
496 lo += (a01 & mask) * scale;
497 hi += ((a01 >> 8) & mask) * scale;
498
499 scale = 16*y - xy;
500 lo += (a10 & mask) * scale;
501 hi += ((a10 >> 8) & mask) * scale;
502
503 lo += (a11 & mask) * xy;
504 hi += ((a11 >> 8) & mask) * xy;
505
506 if (alphaScale < 256) {
507 lo = ((lo >> 8) & mask) * alphaScale;
508 hi = ((hi >> 8) & mask) * alphaScale;
509 }
510
511 *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
512 }
513 #endif
514
515
516 /*not static*/ inline
518 const uint32_t* xy, int count, SkPMColor* colors) {
519 SkASSERT(count > 0 && colors != nullptr);
520 SkASSERT(s.fBilerp);
521 SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
522 SkASSERT(s.fAlphaScale <= 256);
523
524 int y0, y1, wy;
525 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
526
527 auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
528 row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
529
530 while (count --> 0) {
531 int x0, x1, wx;
532 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
533
534 filter_and_scale_by_alpha(wx, wy,
535 row0[x0], row0[x1],
536 row1[x0], row1[x1],
537 colors++,
538 s.fAlphaScale);
539 }
540 }
541
542#endif
543
544#if defined(SK_ARM_HAS_NEON)
545 /*not static*/ inline
547 const uint32_t* xy, int count, SkPMColor* colors) {
548 SkASSERT(count > 0 && colors != nullptr);
549 SkASSERT(s.fBilerp);
550 SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
551 SkASSERT(s.fAlphaScale <= 256);
552
553 auto src = (const char*)s.fPixmap.addr();
554 size_t rb = s.fPixmap.rowBytes();
555
556 while (count --> 0) {
557 int y0, y1, wy,
558 x0, x1, wx;
559 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
560 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
561
562 auto row0 = (const uint32_t*)(src + y0*rb),
563 row1 = (const uint32_t*)(src + y1*rb);
564
565 filter_and_scale_by_alpha(wx, wy,
566 row0[x0], row0[x1],
567 row1[x0], row1[x1],
568 colors++,
569 s.fAlphaScale);
570 }
571 }
572#else
573 // It's not yet clear whether it's worthwhile specializing for other architectures.
574 constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
575 const uint32_t*, int, SkPMColor*) = nullptr;
576#endif
577
578} // namespace SK_OPTS_NS
579
580namespace sktests {
581 template <typename U32, typename Out>
582 void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
583 SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w);
584 }
585}
586
587#endif
int count
Definition: FontMgrTest.cpp:50
#define SkASSERT(cond)
Definition: SkAssert.h:116
uint32_t SkPMColor
Definition: SkColor.h:205
static void B2(DFData *curr, int width)
static void B1(DFData *curr, int width)
V< uint32_t > U32
Definition: Transform_inl.h:17
#define C(TEST_CATEGORY)
Definition: colrv1.cpp:248
struct MyStruct s
#define R(r)
#define B
double y
double x
static constexpr void(* S32_alpha_D32_filter_DXDY)(const SkBitmapProcState &, const uint32_t *, int, SkPMColor *)
static void decode_packed_coordinates_and_weight(U32 packed, Out *v0, Out *v1, Out *w)
void S32_alpha_D32_filter_DX(const SkBitmapProcState &s, const uint32_t *xy, int count, uint32_t *colors)
PODArray< SkColor > colors
Definition: SkRecords.h:276
void decode_packed_coordinates_and_weight(U32 packed, Out *v0, Out *v1, Out *w)
SkScalar w
const Scalar scale