Flutter Engine
The Flutter Engine
Namespaces | Functions
SkBlitRow_opts.h File Reference
#include "include/private/SkColorData.h"
#include "src/base/SkMSAN.h"
#include "src/base/SkVx.h"
#include <immintrin.h>
#include <lasxintrin.h>
#include <lsxintrin.h>

Go to the source code of this file.

Namespaces

namespace  SK_OPTS_NS
 

Functions

static __m256i SkPMSrcOver_AVX2 (const __m256i &src, const __m256i &dst)
 
static __m128i SkPMSrcOver_SSE2 (const __m128i &src, const __m128i &dst)
 
static __m256i SkPMSrcOver_LASX (const __m256i &src, const __m256i &dst)
 
static __m128i SkPMSrcOver_LSX (const __m128i &src, const __m128i &dst)
 
void SK_OPTS_NS::blit_row_s32a_opaque (SkPMColor *dst, const SkPMColor *src, int len, U8CPU alpha)
 
void SK_OPTS_NS::blit_row_color32 (SkPMColor *dst, int count, SkPMColor color)
 

Function Documentation

◆ SkPMSrcOver_AVX2()

static __m256i SkPMSrcOver_AVX2 ( const __m256i &  src,
const __m256i &  dst 
)
inline static

Definition at line 25 of file SkBlitRow_opts.h.

25 {
26 // Abstractly srcover is
27 // b = s + d*(1-srcA)
28 //
29 // In terms of unorm8 bytes, that works out to
30 // b = s + (d*(255-srcA) + 127) / 255
31 //
32 // But we approximate that to within a bit with
33 // b = s + (d*(255-srcA) + d) / 256
34 // a.k.a.
35 // b = s + ((d*(256-srcA)) >> 8)
36
37 // The bottleneck of this math is the multiply, and we want to do it as
38 // narrowly as possible, here getting inputs into 16-bit lanes and
39 // using 16-bit multiplies. We can do twice as many multiplies at once
40 // as using naive 32-bit multiplies, and on top of that, the 16-bit multiplies
41 // are themselves a couple cycles quicker. Win-win.
42
43 // We'll get everything in 16-bit lanes for two multiplies, one
44 // handling dst red and blue, the other green and alpha. (They're
45 // conveniently 16-bits apart, you see.) We don't need the individual
46 // src channels beyond alpha until the very end when we do the "s + "
47 // add, and we don't even need to unpack them; the adds cannot overflow.
48
49 // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel.
50 const int _ = -1; // fills a literal 0 byte.
51 __m256i srcA_x2 = _mm256_shuffle_epi8(src,
52 _mm256_setr_epi8(3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_,
53 3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_));
54 __m256i scale_x2 = _mm256_sub_epi16(_mm256_set1_epi16(256),
55 srcA_x2);
56
57 // Scale red and blue, leaving results in the low byte of each 16-bit lane.
58 __m256i rb = _mm256_and_si256(_mm256_set1_epi32(0x00ff00ff), dst);
59 rb = _mm256_mullo_epi16(rb, scale_x2);
60 rb = _mm256_srli_epi16 (rb, 8);
61
62 // Scale green and alpha, leaving results in the high byte, masking off the low bits.
63 __m256i ga = _mm256_srli_epi16(dst, 8);
64 ga = _mm256_mullo_epi16(ga, scale_x2);
65 ga = _mm256_andnot_si256(_mm256_set1_epi32(0x00ff00ff), ga);
66
67 return _mm256_adds_epu8(src, _mm256_or_si256(rb, ga));
68 }
dst
Definition: cp.py:12

◆ SkPMSrcOver_LASX()

static __m256i SkPMSrcOver_LASX ( const __m256i &  src,
const __m256i &  dst 
)
inline static

Definition at line 122 of file SkBlitRow_opts.h.

122 {
123 __m256i val = __lasx_xvreplgr2vr_w(256);
124 __m256i scale = __lasx_xvsub_w(val, __lasx_xvsrli_w(src, 24));
125 __m256i scale_x2 = __lasx_xvor_v(__lasx_xvslli_w(scale, 16), scale);
126
127 val = __lasx_xvreplgr2vr_w(0x00ff00ff);
128 __m256i rb = __lasx_xvand_v(val, dst);
129 rb = __lasx_xvmul_h(rb, scale_x2);
130 rb = __lasx_xvsrli_h(rb, 8);
131
132 __m256i ga = __lasx_xvsrli_h(dst, 8);
133 ga = __lasx_xvmul_h(ga, scale_x2);
134 ga = __lasx_xvandn_v(val, ga);
135
136 return __lasx_xvsadd_bu(src, __lasx_xvor_v(rb, ga));
137 }
const Scalar scale

◆ SkPMSrcOver_LSX()

static __m128i SkPMSrcOver_LSX ( const __m128i &  src,
const __m128i &  dst 
)
inline static

Definition at line 143 of file SkBlitRow_opts.h.

143 {
144 __m128i val = __lsx_vreplgr2vr_w(256);
145 __m128i scale = __lsx_vsub_w(val, __lsx_vsrli_w(src, 24));
146 __m128i scale_x2 = __lsx_vor_v(__lsx_vslli_w(scale, 16), scale);
147
148 val = __lsx_vreplgr2vr_w(0x00ff00ff);
149 __m128i rb = __lsx_vand_v(val, dst);
150 rb = __lsx_vmul_h(rb, scale_x2);
151 rb = __lsx_vsrli_h(rb, 8);
152
153 __m128i ga = __lsx_vsrli_h(dst, 8);
154 ga = __lsx_vmul_h(ga, scale_x2);
155 ga = __lsx_vandn_v(val, ga);
156
157 return __lsx_vsadd_bu(src, __lsx_vor_v(rb, ga));
158 }

◆ SkPMSrcOver_SSE2()

static __m128i SkPMSrcOver_SSE2 ( const __m128i &  src,
const __m128i &  dst 
)
inline static

Definition at line 74 of file SkBlitRow_opts.h.

74 {
75 __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256),
76 _mm_srli_epi32(src, 24));
77 __m128i scale_x2 = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);
78
79 __m128i rb = _mm_and_si128(_mm_set1_epi32(0x00ff00ff), dst);
80 rb = _mm_mullo_epi16(rb, scale_x2);
81 rb = _mm_srli_epi16(rb, 8);
82
83 __m128i ga = _mm_srli_epi16(dst, 8);
84 ga = _mm_mullo_epi16(ga, scale_x2);
85 ga = _mm_andnot_si128(_mm_set1_epi32(0x00ff00ff), ga);
86
87 return _mm_adds_epu8(src, _mm_or_si128(rb, ga));
88 }