#ifndef SkBitmapProcState_opts_DEFINED
#define SkBitmapProcState_opts_DEFINED

#include "src/core/SkBitmapProcState.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    #include <lasxintrin.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>
#endif

namespace SK_OPTS_NS {
// This same basic packing scheme is used throughout the file:
// 14 bits of first coordinate, 4 bits of filter weight, 14 bits of second coordinate.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    *v0 = (packed >> 18);       // First coordinate, e.g. x0 or y0.
    *v1 = (packed & 0x3fff);    // Second coordinate, e.g. x1 or y1.
    *w  = (packed >> 14) & 0xf; // Lerp weight for v1; the weight for v0 is 16-w.
}
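// For example (illustrative values, not from any real call site):
//     packed = (2 << 18) | (5 << 14) | 3  ==  0x94003
// decodes to v0 = 2, v1 = 3, w = 5, i.e. weights 16-5 = 11 for v0 and 5 for v1.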
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
        // interpolate_in_x() is the crux of this implementation, interpolating
        // in x for up to two output pixels (A and B) using _mm_maddubs_epi16().
        auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
                                   uint32_t B0, uint32_t B1,
                                   __m128i interlaced_x_weights) {
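            // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
            //
            // It takes two arguments interlaced byte-wise:
            //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ... ]
            //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ... ]
            // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
            //
            // The first argument is treated as unsigned bytes and the second as signed,
            // which is why the pixels go first and the [0,16] weights second.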
            // Interlace A0 with A1 and B0 with B1 byte-wise to match the
            // interlaced x-weights, then let maddubs do the multiply-and-add.
            __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                    interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

            return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                     interlaced_x_weights);
        };
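        // interpolate_in_x_and_y() routes {A0..A3} to output pixel A and
        // {B0..B3} to output pixel B: each quad is a 2x2 neighborhood, filtered
        // first in x, then lerped in y, leaving each channel in a 16-bit lane.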
        auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
                                          uint32_t A2, uint32_t A3,
                                          uint32_t B0, uint32_t B1,
                                          uint32_t B2, uint32_t B3,
                                          __m128i interlaced_x_weights,
                                          int wy) {
            __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                    bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
            __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
                                       _mm_mullo_epi16(_mm_sub_epi16(bot, top),
                                                       _mm_set1_epi16(wy)));

            // Scale down by the total max weight, 16*16 = 256.
            px = _mm_srli_epi16(px, 8);
            // Scale by alpha if needed.
            if (s.fAlphaScale < 256) {
                px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
            }
            return px;
        };
        // We're in _DX mode: y is fixed for the whole run, so the first packed
        // entry holds y0, y1, and wy; every remaining entry packs x0, x1, and wx.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
        while (count >= 4) {
            // Load four packed x-entries at once and decode them all:
            // this is decode_packed_coordinates_and_weight(), done 4x in SIMD.
            int x0[4],
                x1[4];
            __m128i wx;

            __m128i packed = _mm_loadu_si128((const __m128i*)xy);
            _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
            _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
            wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));   // Weights in [0,15].
            // Splat each x-weight out four times (once per channel) as wr, and
            // sixteen minus that as the weight for the left pixels, wl.
            __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
            // Interlace wl and wr in the byte order _mm_maddubs_epi16() expects.
            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

            enum { A,B,C,D };
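            // Each of the four output pixels A,B,C,D is bilerped from its own
            // 2x2 neighborhood: row0[x0..x1] on top, row1[x0..x1] below.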
            __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
                                                row1[x0[A]], row1[x1[A]],
                                                row0[x0[B]], row0[x1[B]],
                                                row1[x0[B]], row1[x1[B]],
                                                interlaced_x_weights_AB, wy);
            __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
                                                row1[x0[C]], row1[x1[C]],
                                                row0[x0[D]], row0[x1[D]],
                                                row1[x0[D]], row1[x1[D]],
                                                interlaced_x_weights_CD, wy);
            // Pack back down to 8-bit lanes and write out all four pixels.
            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));

            xy     += 4;
            colors += 4;
            count  -= 4;
        }
        while (count --> 0) {
            // Exactly the same flow as the 4-at-a-time loop, one pixel at a time.
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            __m128i wr = _mm_set1_epi8(wx),   // Splats wx out 16 times; only the low lanes matter.
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

            __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                               row1[x0], row1[x1],
                                               0, 0,   // The B pixel is unused here,
                                               0, 0,   // so feed it zeros.
                                               interlaced_x_weights, wy);
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
        }
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        // Put the y-weight wy in the low four 16-bit lanes and 16-wy in the high
        // four, so one multiply weights the bottom and top rows at once.
        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
                                                _mm_set1_epi16(16-wy));  // Top pixel goes here.
        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            // Load the four pixels we're interpolating, in this grid:
            //    | tl  tr |
            //    | bl  br |
            const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
                          bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
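            // We want a sum of the four pixels weighted in both directions:
            //
            //     sum = tl * (16-wy) * (16-wx)
            //         + bl * (   wy) * (16-wx)
            //         + tr * (16-wy) * (   wx)
            //         + br * (   wy) * (   wx)
            //
            // The weights sum to 16*16 = 256, so we'll divide by 256 at the end.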
            // Group the left column (bl under tl) as L and the right column as R,
            // unpacked from 8-bit to 16-bit lanes.
            __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
                    R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());

            // Lerp in x as 16*L + (R-L)*wx, saving a multiply over the
            // equivalent L*(16-wx) + R*wx.
            __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
                                          _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
            // Weight by y, then add the two 64-bit halves to finish the sum.
            __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
            __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));

            // Divide by the total weight 256 to get back to [0,255].
            sum = _mm_srli_epi16(sum, 8);
            if (s.fAlphaScale < 256) {
                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
                sum = _mm_srli_epi16(sum, 8);
            }
            // Pack back down to 8-bit lanes and store one pixel.
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
        }
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        // wy in the low lanes, 16-wy in the high lanes, mirroring the SSE2 allY.
        __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));
        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            const __m256i zeros = __lasx_xvldi(0);
            const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
                          tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
                          bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
                          br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);
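            // Same math as the SSE2 path above: bilerp the 2x2 grid with weights
            // 16-wx / wx in x and 16-wy / wy in y, for a total weight of 256.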
            __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
                    R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));

            __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
                                           __lasx_xvmul_h(__lasx_xvsub_h(R, L),
                                                          __lasx_xvreplgr2vr_h(wx)));
            __m256i sum_in_x = __lasx_xvmul_h(inner, allY);

            // Add the two 8-byte halves, then divide by the total weight 256.
            __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));
            sum = __lasx_xvsrli_h(sum, 8);
            if (s.fAlphaScale < 256) {
                sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
                sum = __lasx_xvsrli_h(sum, 8);
            }
            // Saturate to 8-bit lanes, pack, and store one pixel.
            *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
                                                               __lasx_xvsat_hu(sum, 8)), 0);
        }
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));
        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
            const __m128i zeros = __lsx_vldi(0);
            const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
                          tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
                          bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
                          br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);
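            // The 128-bit LSX flavor of the same bilerp as the LASX path above.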
            __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
                    R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));

            __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
                                         __lsx_vmul_h(__lsx_vsub_h(R, L),
                                                      __lsx_vreplgr2vr_h(wx)));
            __m128i sum_in_x = __lsx_vmul_h(inner, allY);

            // Add the two 8-byte halves, then divide by the total weight 256.
            __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));
            sum = __lsx_vsrli_h(sum, 8);
            if (s.fAlphaScale < 256) {
                sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
                sum = __lsx_vsrli_h(sum, 8);
            }
            // Saturate to 8-bit lanes, pack, and store one pixel.
            *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
                                                           __lsx_vsat_hu(sum, 8)), 0);
        }
    }
#else

    // The NEON code only really differs from the portable fallback in the
    // filtering step, after the four pixels to bilerp have been loaded.
    #if defined(SK_ARM_HAS_NEON)
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor* dst,
                                              uint16_t scale) {
            uint8x8_t vy, vconst16_8, v16_y, vres;
            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
            uint32x2_t va0, va1;
            uint16x8_t tmp1, tmp2;
            vy = vdup_n_u8(y);                // duplicate y into vy
            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
            va0 = vdup_n_u32(a00);            // duplicate a00
            va1 = vdup_n_u32(a10);            // duplicate a10
            va0 = vset_lane_u32(a01, va0, 1); // set top lane to a01
            va1 = vset_lane_u32(a11, va1, 1); // set top lane to a11
            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
            vx = vdup_n_u16(x);                // duplicate x into vx
            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x * (16-y)
            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x * y
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) * (16-y)
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) * y
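            // tmp now holds one pixel, channel by channel in 16-bit lanes,
            // scaled up by the total weight 16*16 = 256.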
            if (scale < 256) {
                vscale = vdup_n_u16(scale);   // duplicate the alpha scale
                tmp = vshr_n_u16(tmp, 8);     // shift down by 8 (divide by 256)
                tmp = vmul_u16(tmp, vscale);  // scale the result by alpha
            }
            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down by 8 and narrow
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);                   // store one pixel
        }
    #else
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor* dstColor,
                                              unsigned alphaScale) {
            SkASSERT(alphaScale <= 256);

            // Work on channels 0 and 2 in lo and channels 1 and 3 in hi, leaving
            // 8 bits of headroom per channel for the 8-bit total weight.
            int xy = x * y;
            const uint32_t mask = 0xFF00FF;

            int scale = 256 - 16*y - 16*x + xy;   // (16-x) * (16-y)
            uint32_t lo = (a00 & mask) * scale;
            uint32_t hi = ((a00 >> 8) & mask) * scale;
            scale = 16*x - xy;                    // x * (16-y)
            lo += (a01 & mask) * scale;
            hi += ((a01 >> 8) & mask) * scale;
            scale = 16*y - xy;                    // (16-x) * y
            lo += (a10 & mask) * scale;
            hi += ((a10 >> 8) & mask) * scale;
            lo += (a11 & mask) * xy;
            hi += ((a11 >> 8) & mask) * xy;
            if (alphaScale < 256) {
                lo = ((lo >> 8) & mask) * alphaScale;
                hi = ((hi >> 8) & mask) * alphaScale;
            }

            *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
        }
    #endif
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
        while (count --> 0) {
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }

#endif
#if defined(SK_ARM_HAS_NEON)
    /*not static*/ inline
    void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
                                   const uint32_t* xy, int count, SkPMColor* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
        SkASSERT(s.fAlphaScale <= 256);
        auto src = (const char*)s.fPixmap.addr();
        size_t rb = s.fPixmap.rowBytes();
        while (count --> 0) {
            // In _DXDY mode each output pixel carries its own packed y-entry and x-entry.
            int y0, y1, wy,
                x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            auto row0 = (const uint32_t*)(src + y0*rb),
                 row1 = (const uint32_t*)(src + y1*rb);
            filter_and_scale_by_alpha(wx, wy,
                                      row0[x0], row0[x1],
                                      row1[x0], row1[x1],
                                      colors++,
                                      s.fAlphaScale);
        }
    }
#else
    // It's not yet clear that a specialized _DXDY path is worthwhile on other
    // platforms, so leave this null and let callers fall back on portable code.
    constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
                                                       const uint32_t*, int, SkPMColor*)
        = nullptr;
#endif

}  // namespace SK_OPTS_NS
namespace sktests {
    // Expose the coordinate decoder to unit tests.
    template <typename U32, typename Out>
    void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
        SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w);
    }
}  // namespace sktests

#endif  // SkBitmapProcState_opts_DEFINED