#include "include/private/SkColorData.h"
#include "src/base/SkMSAN.h"
#include "src/base/SkVx.h"
#include <immintrin.h>
#include <lasxintrin.h>
#include <lsxintrin.h>

Namespaces
namespace	SK_OPTS_NS

Functions
static __m256i	SkPMSrcOver_AVX2 (const __m256i &src, const __m256i &dst)

static __m128i	SkPMSrcOver_SSE2 (const __m128i &src, const __m128i &dst)

static __m256i	SkPMSrcOver_LASX (const __m256i &src, const __m256i &dst)

static __m128i	SkPMSrcOver_LSX (const __m128i &src, const __m128i &dst)

void	SK_OPTS_NS::blit_row_s32a_opaque (SkPMColor *dst, const SkPMColor *src, int len, U8CPU alpha)

void	SK_OPTS_NS::blit_row_color32 (SkPMColor *dst, int count, SkPMColor color)

Function Documentation

◆ SkPMSrcOver_AVX2()

static __m256i SkPMSrcOver_AVX2	(	const __m256i &	src,
		const __m256i &	dst
	)

inline static

Definition at line 25 of file SkBlitRow_opts.h.

                                                                                   {
        // Premultiplied src-over is, per channel,
        //     out = s + d*(1 - srcA)
        // which for unorm8 bytes is exactly
        //     out = s + (d*(255 - srcA) + 127) / 255
        // and is approximated here (to within one bit) by
        //     out = s + ((d*(256 - srcA)) >> 8)
        //
        // The multiply is the costly step, so it is done in 16-bit lanes:
        // one multiply covers dst red and blue, a second covers green and
        // alpha (those pairs sit 16 bits apart inside a pixel).  Only the
        // src alpha has to be unpacked; the remaining src channels are used
        // untouched by the final byte-wise add.

        // Shuffle control that copies each pixel's alpha (byte 3 of the pixel)
        // into the low byte of both 16-bit halves of that pixel.
        const int Z = -1;   // negative index: the shuffle writes a zero byte here.
        const __m256i alphaToHalves =
                _mm256_setr_epi8(3,Z,3,Z, 7,Z,7,Z, 11,Z,11,Z, 15,Z,15,Z,
                                 3,Z,3,Z, 7,Z,7,Z, 11,Z,11,Z, 15,Z,15,Z);
        const __m256i srcAlpha16 = _mm256_shuffle_epi8(src, alphaToHalves);

        // (256 - srcA) in every 16-bit lane.
        const __m256i invAlpha16 = _mm256_sub_epi16(_mm256_set1_epi16(256), srcAlpha16);

        // Selects bytes 0 and 2 of each pixel (red and blue).
        const __m256i rbMask = _mm256_set1_epi32(0x00ff00ff);

        // Red/blue: isolate, scale, then shift the product down so the result
        // lands in the low byte of each 16-bit lane.
        const __m256i dstRB    = _mm256_and_si256(rbMask, dst);
        const __m256i scaledRB = _mm256_srli_epi16(_mm256_mullo_epi16(dstRB, invAlpha16), 8);

        // Green/alpha: bring them down to the low byte, scale, and keep only
        // the high byte of each product (which is already in its final position).
        const __m256i dstGA    = _mm256_srli_epi16(dst, 8);
        const __m256i scaledGA = _mm256_andnot_si256(rbMask,
                                                     _mm256_mullo_epi16(dstGA, invAlpha16));

        // Recombine the scaled dst channels and add src with a saturating byte add.
        return _mm256_adds_epu8(src, _mm256_or_si256(scaledRB, scaledGA));
    }

◆ SkPMSrcOver_LASX()

static __m256i SkPMSrcOver_LASX	(	const __m256i &	src,
		const __m256i &	dst
	)

inline static

Definition at line 122 of file SkBlitRow_opts.h.

                                                                                   {
        // Computes src + ((dst * (256 - srcA)) >> 8) for every byte of eight
        // 32-bit pixels, using 16-bit multiplies on two channels at a time.

        const __m256i k256   = __lasx_xvreplgr2vr_w(256);
        const __m256i rbMask = __lasx_xvreplgr2vr_w(0x00ff00ff);  // bytes 0 and 2 of each pixel

        // (256 - srcA) per pixel, where srcA is the top byte of each 32-bit lane,
        // then replicated into both 16-bit halves of the pixel.
        const __m256i invAlpha   = __lasx_xvsub_w(k256, __lasx_xvsrli_w(src, 24));
        const __m256i invAlpha16 = __lasx_xvor_v(__lasx_xvslli_w(invAlpha, 16), invAlpha);

        // Bytes 0 and 2: mask, scale, and shift the product down into the low
        // byte of each 16-bit lane.
        const __m256i dstRB    = __lasx_xvand_v(rbMask, dst);
        const __m256i scaledRB = __lasx_xvsrli_h(__lasx_xvmul_h(dstRB, invAlpha16), 8);

        // Bytes 1 and 3: shift down, scale, and keep only the high byte of each
        // 16-bit product by clearing the bits covered by rbMask.
        const __m256i dstGA    = __lasx_xvsrli_h(dst, 8);
        const __m256i scaledGA = __lasx_xvandn_v(rbMask, __lasx_xvmul_h(dstGA, invAlpha16));

        // Merge the two halves and add src with an unsigned saturating byte add.
        return __lasx_xvsadd_bu(src, __lasx_xvor_v(scaledRB, scaledGA));
    }

◆ SkPMSrcOver_LSX()

static __m128i SkPMSrcOver_LSX	(	const __m128i &	src,
		const __m128i &	dst
	)

inline static

Definition at line 143 of file SkBlitRow_opts.h.

                                                                                  {
        // Computes src + ((dst * (256 - srcA)) >> 8) for every byte of four
        // 32-bit pixels, using 16-bit multiplies on two channels at a time.

        const __m128i k256   = __lsx_vreplgr2vr_w(256);
        const __m128i rbMask = __lsx_vreplgr2vr_w(0x00ff00ff);  // bytes 0 and 2 of each pixel

        // (256 - srcA) per pixel, where srcA is the top byte of each 32-bit lane,
        // then replicated into both 16-bit halves of the pixel.
        const __m128i invAlpha   = __lsx_vsub_w(k256, __lsx_vsrli_w(src, 24));
        const __m128i invAlpha16 = __lsx_vor_v(__lsx_vslli_w(invAlpha, 16), invAlpha);

        // Bytes 0 and 2: mask, scale, and shift the product down into the low
        // byte of each 16-bit lane.
        const __m128i dstRB    = __lsx_vand_v(rbMask, dst);
        const __m128i scaledRB = __lsx_vsrli_h(__lsx_vmul_h(dstRB, invAlpha16), 8);

        // Bytes 1 and 3: shift down, scale, and keep only the high byte of each
        // 16-bit product by clearing the bits covered by rbMask.
        const __m128i dstGA    = __lsx_vsrli_h(dst, 8);
        const __m128i scaledGA = __lsx_vandn_v(rbMask, __lsx_vmul_h(dstGA, invAlpha16));

        // Merge the two halves and add src with an unsigned saturating byte add.
        return __lsx_vsadd_bu(src, __lsx_vor_v(scaledRB, scaledGA));
    }

◆ SkPMSrcOver_SSE2()

static __m128i SkPMSrcOver_SSE2	(	const __m128i &	src,
		const __m128i &	dst
	)

inline static

Definition at line 74 of file SkBlitRow_opts.h.

                                                                                   {
        // Computes src + ((dst * (256 - srcA)) >> 8) for every byte of four
        // 32-bit pixels, using 16-bit multiplies on two channels at a time.

        const __m128i rbMask = _mm_set1_epi32(0x00ff00ff);  // bytes 0 and 2 of each pixel

        // (256 - srcA) per pixel, where srcA is the top byte of each 32-bit lane,
        // then replicated into both 16-bit halves of the pixel.
        const __m128i srcAlpha   = _mm_srli_epi32(src, 24);
        const __m128i invAlpha   = _mm_sub_epi32(_mm_set1_epi32(256), srcAlpha);
        const __m128i invAlpha16 = _mm_or_si128(_mm_slli_epi32(invAlpha, 16), invAlpha);

        // Bytes 0 and 2: mask, scale, and shift the product down into the low
        // byte of each 16-bit lane.
        const __m128i dstRB    = _mm_and_si128(rbMask, dst);
        const __m128i scaledRB = _mm_srli_epi16(_mm_mullo_epi16(dstRB, invAlpha16), 8);

        // Bytes 1 and 3: shift down, scale, and keep only the high byte of each
        // 16-bit product by clearing the bits covered by rbMask.
        const __m128i dstGA    = _mm_srli_epi16(dst, 8);
        const __m128i scaledGA = _mm_andnot_si128(rbMask, _mm_mullo_epi16(dstGA, invAlpha16));

        // Merge the two halves and add src with an unsigned saturating byte add.
        return _mm_adds_epu8(src, _mm_or_si128(scaledRB, scaledGA));
    }

Namespaces

Functions

Function Documentation

◆ SkPMSrcOver_AVX2()

◆ SkPMSrcOver_LASX()

◆ SkPMSrcOver_LSX()

◆ SkPMSrcOver_SSE2()