Diffstat (limited to 'third_party/webrender/swgl/src/blend.h')
-rw-r--r-- | third_party/webrender/swgl/src/blend.h | 864 |
1 files changed, 0 insertions, 864 deletions
diff --git a/third_party/webrender/swgl/src/blend.h b/third_party/webrender/swgl/src/blend.h
deleted file mode 100644
index 8bc1c93994e..00000000000
--- a/third_party/webrender/swgl/src/blend.h
+++ /dev/null
@@ -1,864 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) {
-#if USE_SSE2
-  return _mm_packs_epi32(a, b);
-#elif USE_NEON
-  return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
-#else
-  return CONVERT(combine(a, b), HalfRGBA8);
-#endif
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v,
-                                                 float scale = 255.0f) {
-  ivec4 i = round_pixel(v, scale);
-  HalfRGBA8 xz = packRGBA8(i.z, i.x);
-  HalfRGBA8 yw = packRGBA8(i.y, i.w);
-  HalfRGBA8 xyzwl = zipLow(xz, yw);
-  HalfRGBA8 xyzwh = zipHigh(xz, yw);
-  HalfRGBA8 lo = zip2Low(xyzwl, xyzwh);
-  HalfRGBA8 hi = zip2High(xyzwl, xyzwh);
-  return combine(lo, hi);
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha,
-                                                 float scale = 255.0f) {
-  I32 i = round_pixel(alpha, scale);
-  HalfRGBA8 c = packRGBA8(i, i);
-  c = zipLow(c, c);
-  return zip(c, c);
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha,
-                                                 float scale = 255.0f) {
-  I32 i = round_pixel(alpha, scale);
-  return repeat2(packRGBA8(i, i));
-}
-
-UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v,
-                                                        float scale = 255.0f) {
-  I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale);
-  return repeat2(packRGBA8(i, i));
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() {
-  return pack_pixels_RGBA8(fragment_shader->gl_FragColor);
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v,
-                                                 float scale = 255.0f) {
-  ivec4 i = round_pixel(bit_cast<vec4>(v), scale);
-  return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w));
-}
-
-static ALWAYS_INLINE WideR8 packR8(I32 a) {
-#if USE_SSE2
-  return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a)));
-#elif USE_NEON
-  return vqmovun_s32(a);
-#else
-  return CONVERT(a, WideR8);
-#endif
-}
-
-static ALWAYS_INLINE WideR8 pack_pixels_R8(Float c, float scale = 255.0f) {
-  return packR8(round_pixel(c, scale));
-}
-
-static ALWAYS_INLINE WideR8 pack_pixels_R8() {
-  return pack_pixels_R8(fragment_shader->gl_FragColor.x);
-}
-
-// Load a partial span > 0 and < 4 pixels.
-template <typename V, typename P>
-static ALWAYS_INLINE V partial_load_span(const P* src, int span) {
-  return bit_cast<V>(
-      (span >= 2
-           ? combine(unaligned_load<V2<P>>(src),
-                     V2<P>{span > 2 ? unaligned_load<P>(src + 2) : P(0), 0})
-           : V4<P>{unaligned_load<P>(src), 0, 0, 0}));
-}
-
-// Store a partial span > 0 and < 4 pixels.
-template <typename V, typename P>
-static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) {
-  auto pixels = bit_cast<V4<P>>(src);
-  if (span >= 2) {
-    unaligned_store(dst, lowHalf(pixels));
-    if (span > 2) {
-      unaligned_store(dst + 2, pixels.z);
-    }
-  } else {
-    unaligned_store(dst, pixels.x);
-  }
-}
-
-// Dispatcher that chooses when to load a full or partial span
-template <typename V, typename P>
-static ALWAYS_INLINE V load_span(const P* src, int span) {
-  if (span >= 4) {
-    return unaligned_load<V, P>(src);
-  } else {
-    return partial_load_span<V, P>(src, span);
-  }
-}
-
-// Dispatcher that chooses when to store a full or partial span
-template <typename V, typename P>
-static ALWAYS_INLINE void store_span(P* dst, V src, int span) {
-  if (span >= 4) {
-    unaligned_store<V, P>(dst, src);
-  } else {
-    partial_store_span<V, P>(dst, src, span);
-  }
-}
-
-template <typename T>
-static ALWAYS_INLINE T muldiv256(T x, T y) {
-  return (x * y) >> 8;
-}
-
-// (x*y + x) >> 8, cheap approximation of (x*y) / 255
-template <typename T>
-static ALWAYS_INLINE T muldiv255(T x, T y) {
-  return (x * y + x) >> 8;
-}
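As a quick sanity check of the (x*y + x) >> 8 approximation used above, a standalone scalar sketch (not part of the deleted file; the helper name and the exhaustive loop are illustrative). It verifies that the shift-based form stays within one count of the exact x*y/255 for all 8-bit inputs:

#include <cassert>
#include <cmath>
#include <cstdio>

// Scalar model of the muldiv255 trick: (x*y + x) >> 8 == floor(x*(y+1)/256),
// which stays within one count of the exact x*y/255 for all 8-bit inputs.
static int muldiv255_scalar(int x, int y) { return (x * y + x) >> 8; }

int main() {
  for (int x = 0; x <= 255; ++x) {
    for (int y = 0; y <= 255; ++y) {
      double exact = x * y / 255.0;
      assert(std::fabs(muldiv255_scalar(x, y) - exact) < 1.0);
    }
  }
  // e.g. 255*255: (65025 + 255) >> 8 == 255, matching the exact result.
  printf("muldiv255(255, 255) = %d\n", muldiv255_scalar(255, 255));
  return 0;
}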
-
-template <typename V>
-static ALWAYS_INLINE WideRGBA8 pack_span(uint32_t*, const V& v,
-                                         float scale = 255.0f) {
-  return pack_pixels_RGBA8(v, scale);
-}
-
-template <typename C>
-static ALWAYS_INLINE WideR8 pack_span(uint8_t*, C c, float scale = 255.0f) {
-  return pack_pixels_R8(c, scale);
-}
-
-// Helper functions to apply a color modulus when available.
-struct NoColor {};
-
-template <typename P>
-static ALWAYS_INLINE P applyColor(P src, NoColor) {
-  return src;
-}
-
-struct InvertColor {};
-
-template <typename P>
-static ALWAYS_INLINE P applyColor(P src, InvertColor) {
-  return 255 - src;
-}
-
-template <typename P>
-static ALWAYS_INLINE P applyColor(P src, P color) {
-  return muldiv255(color, src);
-}
-
-static ALWAYS_INLINE WideRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) {
-  return applyColor(unpack(src), color);
-}
-
-template <typename P, typename C>
-static ALWAYS_INLINE auto packColor(P* buf, C color) {
-  return pack_span(buf, color, 255.0f);
-}
-
-template <typename P>
-static ALWAYS_INLINE NoColor packColor(UNUSED P* buf, NoColor noColor) {
-  return noColor;
-}
-
-template <typename P>
-static ALWAYS_INLINE InvertColor packColor(UNUSED P* buf,
-                                           InvertColor invertColor) {
-  return invertColor;
-}
-
-// Single argument variation that takes an explicit destination buffer type.
-template <typename P, typename C>
-static ALWAYS_INLINE auto packColor(C color) {
-  // Just pass in a typed null pointer, as the pack routines never use the
-  // pointer's value, just its type.
-  return packColor((P*)0, color);
-}
-
-// Byte-wise addition for when x or y is a signed 8-bit value stored in the
-// low byte of a larger type T only with zeroed-out high bits, where T is
-// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used
-// upon signed operands, using up all the precision in a 16 bit integer, and
-// potentially losing the sign bit in the last >> 8 shift. Due to the
-// properties of two's complement arithmetic, even though we've discarded the
-// sign bit, we can still represent a negative number under addition (without
-// requiring any extra sign bits), just that any negative number will behave
-// like a large unsigned number under addition, generating a single carry bit
-// on overflow that we need to discard. Thus, just doing a byte-wise add will
-// overflow without the troublesome carry, giving us only the remaining 8 low
-// bits we actually need while keeping the high bits at zero.
-template <typename T>
-static ALWAYS_INLINE T addlow(T x, T y) {
-  typedef VectorType<uint8_t, sizeof(T)> bytes;
-  return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y));
-}
-
-// Replace color components of each pixel with the pixel's alpha values.
-template <typename T>
-static ALWAYS_INLINE T alphas(T c) {
-  return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);
-}
-
-// Replace the alpha values of the first vector with alpha values from the
-// second, while leaving the color components unmodified.
-template <typename T>
-static ALWAYS_INLINE T set_alphas(T c, T a) {
-  return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31);
-}
-
-// Miscellaneous helper functions for working with packed RGBA8 data.
-static ALWAYS_INLINE HalfRGBA8 if_then_else(V8<int16_t> c, HalfRGBA8 t,
-                                            HalfRGBA8 e) {
-  return bit_cast<HalfRGBA8>((c & t) | (~c & e));
-}
-
-template <typename T, typename C, int N>
-static ALWAYS_INLINE VectorType<T, N> if_then_else(VectorType<C, N> c,
-                                                   VectorType<T, N> t,
-                                                   VectorType<T, N> e) {
-  return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)),
-                 if_then_else(highHalf(c), highHalf(t), highHalf(e)));
-}
-
-static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) {
-#if USE_SSE2
-  return bit_cast<HalfRGBA8>(
-      _mm_min_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
-#elif USE_NEON
-  return vminq_u16(x, y);
-#else
-  return if_then_else(x < y, x, y);
-#endif
-}
-
-template <typename T, int N>
-static ALWAYS_INLINE VectorType<T, N> min(VectorType<T, N> x,
-                                          VectorType<T, N> y) {
-  return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y)));
-}
-
-static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) {
-#if USE_SSE2
-  return bit_cast<HalfRGBA8>(
-      _mm_max_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
-#elif USE_NEON
-  return vmaxq_u16(x, y);
-#else
-  return if_then_else(x > y, x, y);
-#endif
-}
-
-template <typename T, int N>
-static ALWAYS_INLINE VectorType<T, N> max(VectorType<T, N> x,
-                                          VectorType<T, N> y) {
-  return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y)));
-}
-
-template <typename T, int N>
-static ALWAYS_INLINE VectorType<T, N> recip(VectorType<T, N> v) {
-  return combine(recip(lowHalf(v)), recip(highHalf(v)));
-}
-
-// Helper to get the reciprocal if the value is non-zero, or otherwise default
-// to the supplied fallback value.
-template <typename V>
-static ALWAYS_INLINE V recip_or(V v, float f) {
-  return if_then_else(v != V(0.0f), recip(v), V(f));
-}
-
-template <typename T, int N>
-static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) {
-  return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v)));
-}
-
-// Extract the alpha components so that we can cheaply calculate the reciprocal
-// on a single SIMD register. Then multiply the duplicated alpha reciprocal with
-// the pixel data. 0 alpha is treated as transparent black.
-static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) {
-  Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f);
-  return v * a.xxxxyyyyzzzzwwww;
-}
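The unpremultiply above divides each premultiplied channel by its alpha via a guarded reciprocal, treating zero alpha as transparent black. A minimal scalar sketch of the same idea (not part of the deleted file; names are illustrative):

#include <array>
#include <cstdio>

// Divide a premultiplied BGRA pixel (0..255 scale) by its alpha. Zero alpha
// falls back to a zero reciprocal, mirroring recip_or(..., 0.0f) above, so the
// whole pixel becomes transparent black. Note all four lanes are multiplied,
// so the alpha lane ends up as 1 (or 0).
static std::array<float, 4> unpremultiply_scalar(std::array<float, 4> px) {
  float inv = px[3] != 0.0f ? 1.0f / px[3] : 0.0f;
  return {px[0] * inv, px[1] * inv, px[2] * inv, px[3] * inv};
}

int main() {
  auto c = unpremultiply_scalar({32.0f, 64.0f, 96.0f, 128.0f});
  printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);  // prints 0.25 0.5 0.75 1
  return 0;
}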
-
-// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to
-// RGBA to unpack.
-static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) {
-  return bit_cast<vec4>(
-      SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15));
-}
-
-// The following lum/sat functions mostly follow the KHR_blend_equation_advanced
-// specification but are rearranged to work on premultiplied data.
-static ALWAYS_INLINE Float lumv3(vec3 v) {
-  return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f;
-}
-
-static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); }
-
-static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); }
-
-static inline vec3 clip_color(vec3 v, Float lum, Float alpha) {
-  Float mincol = max(-minv3(v), lum);
-  Float maxcol = max(maxv3(v), alpha - lum);
-  return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f));
-}
-
-static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) {
-  return clip_color(base - lumv3(base), lumv3(ref), alpha);
-}
-
-static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) {
-  vec3 diff = base - minv3(base);
-  Float sbase = maxv3(diff);
-  Float ssat = maxv3(sref) - minv3(sref);
-  // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale
-  // to black, as per specification.
-  return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha);
-}
-
-// Flags that reflect the current blend-stage clipping to be applied.
-enum SWGLClipFlag {
-  SWGL_CLIP_FLAG_MASK = 1 << 0,
-  SWGL_CLIP_FLAG_AA = 1 << 1,
-  SWGL_CLIP_FLAG_BLEND_OVERRIDE = 1 << 2,
-};
-static int swgl_ClipFlags = 0;
-static BlendKey swgl_BlendOverride = BLEND_KEY_NONE;
-static WideRGBA8 swgl_BlendColorRGBA8 = {0};
-static WideRGBA8 swgl_BlendAlphaRGBA8 = {0};
-
-// A pointer into the color buffer for the start of the span.
-static void* swgl_SpanBuf = nullptr;
-// A pointer into the clip mask for the start of the span.
-static uint8_t* swgl_ClipMaskBuf = nullptr;
-
-static ALWAYS_INLINE WideR8 expand_mask(UNUSED uint8_t* buf, WideR8 mask) {
-  return mask;
-}
-static ALWAYS_INLINE WideRGBA8 expand_mask(UNUSED uint32_t* buf, WideR8 mask) {
-  WideRG8 maskRG = zip(mask, mask);
-  return zip(maskRG, maskRG);
-}
-
-// Loads a chunk of clip masks. The current pointer into the color buffer is
-// used to reconstruct the relative position within the span. From there, the
-// pointer into the clip mask can be generated from the start of the clip mask
-// span.
-template <typename P>
-static ALWAYS_INLINE uint8_t* get_clip_mask(P* buf) {
-  return &swgl_ClipMaskBuf[buf - (P*)swgl_SpanBuf];
-}
-
-template <typename P>
-static ALWAYS_INLINE auto load_clip_mask(P* buf, int span)
-    -> decltype(expand_mask(buf, 0)) {
-  return expand_mask(buf,
-                     unpack(load_span<PackedR8>(get_clip_mask(buf), span)));
-}
-
-// Temporarily removes masking from the blend stage, assuming the caller will
-// handle it.
-static ALWAYS_INLINE void override_clip_mask() {
-  blend_key = BlendKey(blend_key - MASK_BLEND_KEY_NONE);
-}
-
-// Restores masking to the blend stage, assuming it was previously overridden.
-static ALWAYS_INLINE void restore_clip_mask() {
-  blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key);
-}
-
-// A pointer to the start of the opaque destination region of the span for AA.
-static const uint8_t* swgl_OpaqueStart = nullptr;
-// The size, in bytes, of the opaque region.
-static uint32_t swgl_OpaqueSize = 0;
-// AA coverage distance offsets for the left and right edges.
-static Float swgl_LeftAADist = 0.0f;
-static Float swgl_RightAADist = 0.0f;
-// AA coverage slope values used for accumulating coverage for each step.
-static Float swgl_AASlope = 0.0f;
-
-// Get the amount of pixels we need to process before the start of the opaque
-// region.
-template <typename P>
-static ALWAYS_INLINE int get_aa_opaque_start(P* buf) {
-  return max(int((P*)swgl_OpaqueStart - buf), 0);
-}
-
-// Assuming we are already in the opaque part of the span, return the remaining
-// size of the opaque part.
-template <typename P>
-static ALWAYS_INLINE int get_aa_opaque_size(P* buf) {
-  return max(int((P*)&swgl_OpaqueStart[swgl_OpaqueSize] - buf), 0);
-}
-
-// Temporarily removes anti-aliasing from the blend stage, assuming the caller
-// will handle it.
-static ALWAYS_INLINE void override_aa() {
-  blend_key = BlendKey(blend_key - AA_BLEND_KEY_NONE);
-}
-
-// Restores anti-aliasing to the blend stage, assuming it was previously
-// overridden.
-static ALWAYS_INLINE void restore_aa() {
-  blend_key = BlendKey(AA_BLEND_KEY_NONE + blend_key);
-}
-
-static PREFER_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst,
-                                            WideRGBA8 src, int span = 4) {
-  WideRGBA8 dst = unpack(pdst);
-  const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0,      0xFFFF, 0xFFFF,
-                              0xFFFF, 0,      0xFFFF, 0xFFFF, 0xFFFF, 0,
-                              0xFFFF, 0xFFFF, 0xFFFF, 0};
-  const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF,
-                                0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF};
-  const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255,
-                                  0, 0, 0, 255, 0, 0, 0, 255};
-
-// clang-format off
-  // Computes AA for the given pixel based on the offset of the pixel within
-  // the destination row. Given the initial coverage offsets for the left and
-  // right edges, the offset is scaled by the slope and accumulated to find the
-  // minimum coverage value for the pixel. A final weight is generated that
-  // can be used to scale the source pixel.
-#define DO_AA(format, body)                                   \
-  do {                                                        \
-    int offset = int((const uint8_t*)buf - swgl_OpaqueStart); \
-    if (uint32_t(offset) >= swgl_OpaqueSize) {                \
-      Float delta = swgl_AASlope * float(offset);             \
-      Float dist = clamp(min(swgl_LeftAADist + delta.x,       \
-                             swgl_RightAADist + delta.y),     \
-                         0.0f, 256.0f);                       \
-      auto aa = pack_pixels_##format(dist, 1.0f);             \
-      body;                                                   \
-    }                                                         \
-  } while (0)
-
-  // Each blend case is preceded by the MASK_ variant. The MASK_ case first
-  // loads the mask values and multiplies the source value by them. After, it
-  // falls through to the normal blending case using the masked source. The
-  // AA_ variations may further precede the blend cases, in which case the
-  // source value is further modified before use.
-#define BLEND_CASE_KEY(key)                          \
-  case AA_##key:                                     \
-    DO_AA(RGBA8, src = muldiv256(src, aa));          \
-    goto key;                                        \
-  case AA_MASK_##key:                                \
-    DO_AA(RGBA8, src = muldiv256(src, aa));          \
-    FALLTHROUGH;                                     \
-  case MASK_##key:                                   \
-    src = muldiv255(src, load_clip_mask(buf, span)); \
-    FALLTHROUGH;                                     \
-  case key: key
-
-#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))
-
-  switch (blend_key) {
-    BLEND_CASE(GL_ONE, GL_ZERO):
-      return src;
-    BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE,
-               GL_ONE_MINUS_SRC_ALPHA):
-      // dst + src.a*(src.rgb1 - dst)
-      // use addlow for signed overflow
-      return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst));
-    BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
-      return src + dst - muldiv255(dst, alphas(src));
-    BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR):
-      return dst - muldiv255(dst, src);
-    BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE):
-      return dst - (muldiv255(dst, src) & RGB_MASK);
-    BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA):
-      return dst - muldiv255(dst, alphas(src));
-    BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
-      return muldiv255(src, dst);
-    BLEND_CASE(GL_ONE, GL_ONE):
-      return src + dst;
-    BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
-      return src + dst - (muldiv255(dst, src) & ALPHA_MASK);
-    BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE):
-      // src*(1-dst.a) + dst*1 = src - src*dst.a + dst
-      return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK);
-    BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR):
-      // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst)
-      // use addlow for signed overflow
-      return addlow(dst, muldiv255(src, repeat2(ctx->blendcolor) - dst));
-
-    // We must explicitly handle the masked/anti-aliased secondary blend case.
-    // The secondary color as well as the source must be multiplied by the
-    // weights.
-    case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
-      WideRGBA8 secondary = applyColor(
-          dst, packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
-      return src + dst - secondary;
-    }
-    case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
-      WideRGBA8 secondary = applyColor(
-          dst, packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
-      WideRGBA8 mask = load_clip_mask(buf, span);
-      return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
-    }
-    case AA_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
-      WideRGBA8 secondary = applyColor(
-          dst, packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
-      DO_AA(RGBA8, {
-        src = muldiv256(src, aa);
-        secondary = muldiv256(secondary, aa);
-      });
-      return src + dst - secondary;
-    }
-    case AA_MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
-      WideRGBA8 secondary = applyColor(
-          dst, packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
-      WideRGBA8 mask = load_clip_mask(buf, span);
-      DO_AA(RGBA8, mask = muldiv256(mask, aa));
-      return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
-    }
-
-    BLEND_CASE(GL_MIN):
-      return min(src, dst);
-    BLEND_CASE(GL_MAX):
-      return max(src, dst);
-
-    // The KHR_blend_equation_advanced spec describes the blend equations such
-    // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to
-    // the result:
-    //    Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
-    //    Ar = As*Ad + As*(1-Ad) + Ad*(1-As)
-    // However, working with unpremultiplied values requires expensive math to
-    // unpremultiply and premultiply again during blending. We can use the fact
-    // that premultiplied value P = C*A and simplify the equations such that no
-    // unpremultiplied colors are necessary, allowing us to stay with integer
-    // math that avoids floating-point conversions in the common case. Some of
-    // the blend modes require division or sqrt, in which case we do convert
-    // to (possibly transposed/unpacked) floating-point to implement the mode.
-    // However, most common modes can still use cheaper premultiplied integer
-    // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified
-    // to:
-    //    Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
-    //    .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As
-    //    Ar = As*Ad + As - As*Ad + Ad - Ad*As
-    //    .. Ar = As + Ad - As*Ad
-    // Note that the alpha equation is the same for all blend equations, such
-    // that so long as the implementation results in As + Ad - As*Ad, we can
-    // avoid using separate instructions to compute the alpha result, which is
-    // dependent on the math used to implement each blend mode. The exact
-    // reductions used to get the final math for every blend mode are too
-    // involved to show here in comments, but mostly follow from replacing
-    // Cs*As and Cd*Ad with Ps and Pd while factoring out as many common terms
-    // as possible.
-
-    BLEND_CASE(GL_MULTIPLY_KHR): {
-      WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK),
-                                 alphas(dst) - (dst & RGB_MASK));
-      return src + dst + (diff & RGB_MASK) - alphas(diff);
-    }
-    BLEND_CASE(GL_SCREEN_KHR):
-      return src + dst - muldiv255(src, dst);
-    BLEND_CASE(GL_OVERLAY_KHR): {
-      WideRGBA8 srcA = alphas(src);
-      WideRGBA8 dstA = alphas(dst);
-      WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
-      return src + dst +
-             if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff),
-                          -diff);
-    }
-    BLEND_CASE(GL_DARKEN_KHR):
-      return src + dst -
-             max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
-    BLEND_CASE(GL_LIGHTEN_KHR):
-      return src + dst -
-             min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
-
-    BLEND_CASE(GL_COLORDODGE_KHR): {
-      // Color-dodge and color-burn require division, so we convert to FP math
-      // here, but avoid transposing to a vec4.
-      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
-      WideRGBA32F srcA = alphas(srcF);
-      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
-      WideRGBA32F dstA = alphas(dstF);
-      return pack_pixels_RGBA8(
-          srcA * set_alphas(
-                     min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)),
-                     dstF) +
-              srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
-          1.0f / 255.0f);
-    }
-    BLEND_CASE(GL_COLORBURN_KHR): {
-      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
-      WideRGBA32F srcA = alphas(srcF);
-      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
-      WideRGBA32F dstA = alphas(dstF);
-      return pack_pixels_RGBA8(
-          srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA *
-                                                  recip_or(srcF, 255.0f))),
-                            dstF) +
-              srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
-          1.0f / 255.0f);
-    }
-    BLEND_CASE(GL_HARDLIGHT_KHR): {
-      WideRGBA8 srcA = alphas(src);
-      WideRGBA8 dstA = alphas(dst);
-      WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
-      return src + dst +
-             if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff),
-                          -diff);
-    }
-
-    BLEND_CASE(GL_SOFTLIGHT_KHR): {
-      // Soft-light requires an unpremultiply that can't be factored out as
-      // well as a sqrt, so we convert to FP math here, but avoid transposing
-      // to a vec4.
-      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
-      WideRGBA32F srcA = alphas(srcF);
-      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
-      WideRGBA32F dstA = alphas(dstF);
-      WideRGBA32F dstU = unpremultiply(dstF);
-      WideRGBA32F scale = srcF + srcF - srcA;
-      return pack_pixels_RGBA8(
-          dstF * (255.0f +
-                  set_alphas(
-                      scale *
-                          if_then_else(scale < 0.0f, 1.0f - dstU,
-                                       min((16.0f * dstU - 12.0f) * dstU + 3.0f,
-                                           inversesqrt(dstU) - 1.0f)),
-                      WideRGBA32F(0.0f))) +
-              srcF * (255.0f - dstA),
-          1.0f / 255.0f);
-    }
-    BLEND_CASE(GL_DIFFERENCE_KHR): {
-      WideRGBA8 diff =
-          min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst)));
-      return src + dst - diff - (diff & RGB_MASK);
-    }
-    BLEND_CASE(GL_EXCLUSION_KHR): {
-      WideRGBA8 diff = muldiv255(src, dst);
-      return src + dst - diff - (diff & RGB_MASK);
-    }
-
-    // The HSL blend modes are non-separable and require complicated use of
-    // division. It is advantageous to convert to FP and transpose to vec4
-    // math to more easily manipulate the individual color components.
-#define DO_HSL(rgb)                                                            \
-  do {                                                                         \
-    vec4 srcV = unpack(CONVERT(src, PackedRGBA32F));                           \
-    vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F));                           \
-    Float srcA = srcV.w * (1.0f / 255.0f);                                     \
-    Float dstA = dstV.w * (1.0f / 255.0f);                                     \
-    Float srcDstA = srcV.w * dstA;                                             \
-    vec3 srcC = vec3(srcV) * dstA;                                             \
-    vec3 dstC = vec3(dstV) * srcA;                                             \
-    return pack_pixels_RGBA8(vec4(rgb + vec3(srcV) - srcC + vec3(dstV) - dstC, \
-                                  srcV.w + dstV.w - srcDstA),                  \
-                             1.0f);                                            \
-  } while (0)
-
-    BLEND_CASE(GL_HSL_HUE_KHR):
-      DO_HSL(set_lum_sat(srcC, dstC, dstC, srcDstA));
-    BLEND_CASE(GL_HSL_SATURATION_KHR):
-      DO_HSL(set_lum_sat(dstC, srcC, dstC, srcDstA));
-    BLEND_CASE(GL_HSL_COLOR_KHR):
-      DO_HSL(set_lum(srcC, dstC, srcDstA));
-    BLEND_CASE(GL_HSL_LUMINOSITY_KHR):
-      DO_HSL(set_lum(dstC, srcC, srcDstA));
-
-    // SWGL-specific extended blend modes.
-    BLEND_CASE(SWGL_BLEND_DROP_SHADOW): {
-      // Premultiplied alpha over blend, but with source color set to source
-      // alpha modulated with a constant color.
-      WideRGBA8 color = applyColor(alphas(src), swgl_BlendColorRGBA8);
-      return color + dst - muldiv255(dst, alphas(color));
-    }
-
-    BLEND_CASE(SWGL_BLEND_SUBPIXEL_TEXT):
-      // Premultiplied alpha over blend, but treats the source as a subpixel
-      // mask modulated with a constant color.
-      return applyColor(src, swgl_BlendColorRGBA8) + dst -
-             muldiv255(dst, applyColor(src, swgl_BlendAlphaRGBA8));
-
-    default:
-      UNREACHABLE;
-      // return src;
-  }
-
-#undef BLEND_CASE
-#undef BLEND_CASE_KEY
-  // clang-format on
-}
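The premultiplied rewrite described in the comment block above (Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As, Ar = As + Ad - As*Ad) can be sanity-checked against the unpremultiplied KHR form with a small standalone program; this sketch works in floats on [0,1] and is illustrative, not part of the deleted file:

#include <cassert>
#include <cmath>
#include <cstdio>

// Scalar check of the premultiplied rewrite of the KHR "multiply" mode.
// Cs/Cd/As/Ad are unpremultiplied inputs; Ps/Pd are the premultiplied forms
// actually stored in the framebuffer.
int main() {
  const float samples[] = {0.0f, 0.25f, 0.5f, 0.75f, 1.0f};
  for (float Cs : samples)
    for (float Cd : samples)
      for (float As : samples)
        for (float Ad : samples) {
          // Spec form: f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As).
          float Cr =
              Cs * Cd * As * Ad + Cs * As * (1 - Ad) + Cd * Ad * (1 - As);
          float Ar = As * Ad + As * (1 - Ad) + Ad * (1 - As);
          // Premultiplied form used by the integer path: no unpremultiply.
          float Ps = Cs * As, Pd = Cd * Ad;
          float Pr = Ps * Pd + Ps - Ps * Ad + Pd - Pd * As;
          assert(std::fabs(Pr - Cr) < 1e-6f);
          assert(std::fabs((As + Ad - As * Ad) - Ar) < 1e-6f);
        }
  puts("premultiplied multiply mode matches the spec form");
  return 0;
}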
-
-static PREFER_INLINE WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src,
-                                         int span = 4) {
-// clang-format off
-#define BLEND_CASE_KEY(key)                          \
-  case AA_##key:                                     \
-    DO_AA(R8, src = muldiv256(src, aa));             \
-    goto key;                                        \
-  case AA_MASK_##key:                                \
-    DO_AA(R8, src = muldiv256(src, aa));             \
-    FALLTHROUGH;                                     \
-  case MASK_##key:                                   \
-    src = muldiv255(src, load_clip_mask(buf, span)); \
-    FALLTHROUGH;                                     \
-  case key: key
-
-#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))
-
-  switch (blend_key) {
-    BLEND_CASE(GL_ONE, GL_ZERO):
-      return src;
-    BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
-      return muldiv255(src, dst);
-    BLEND_CASE(GL_ONE, GL_ONE):
-      return src + dst;
-    default:
-      UNREACHABLE;
-      // return src;
-  }
-
-#undef BLEND_CASE
-#undef BLEND_CASE_KEY
-  // clang-format on
-}
-
-static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r) {
-  unaligned_store(buf, pack(r));
-}
-
-static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r, int len) {
-  partial_store_span(buf, pack(r), len);
-}
-
-static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r) {
-  return blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r);
-}
-
-static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r, int len) {
-  return blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, len), r, len);
-}
-
-static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r) {
-  unaligned_store(buf, r);
-}
-
-static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r, int len) {
-  partial_store_span(buf, r, len);
-}
-
-static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r) {
-  return pack(blend_span(buf, unpack(r)));
-}
-
-static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r,
-                                            int len) {
-  return pack(blend_span(buf, unpack(r), len));
-}
-
-static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r) {
-  unaligned_store(buf, pack(r));
-}
-
-static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r, int len) {
-  partial_store_span(buf, pack(r), len);
-}
-
-static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r) {
-  return blend_pixels(buf, unpack(unaligned_load<PackedR8>(buf)), r);
-}
-
-static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) {
-  return blend_pixels(buf, unpack(partial_load_span<PackedR8>(buf, len)), r,
-                      len);
-}
-
-static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) {
-  unaligned_store(buf, r);
-}
-
-static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r, int len) {
-  partial_store_span(buf, r, len);
-}
-
-static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) {
-  return pack(blend_span(buf, unpack(r)));
-}
-
-static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) {
-  return pack(blend_span(buf, unpack(r), len));
-}
-
-template <bool BLEND, typename P, typename R>
-static ALWAYS_INLINE void commit_blend_span(P* buf, R r) {
-  if (BLEND) {
-    commit_span(buf, blend_span(buf, r));
-  } else {
-    commit_span(buf, r);
-  }
-}
-
-template <bool BLEND, typename P, typename R>
-static ALWAYS_INLINE void commit_blend_span(P* buf, R r, int len) {
-  if (BLEND) {
-    commit_span(buf, blend_span(buf, r, len), len);
-  } else {
-    commit_span(buf, r, len);
-  }
-}
-
-template <typename P, typename R>
-static ALWAYS_INLINE void commit_blend_solid_span(P* buf, R r, int len) {
-  for (P* end = &buf[len & ~3]; buf < end; buf += 4) {
-    commit_span(buf, blend_span(buf, r));
-  }
-  len &= 3;
-  if (len > 0) {
-    partial_store_span(buf, pack(blend_span(buf, r, len)), len);
-  }
-}
-
-template <bool BLEND>
-static void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) {
-  commit_blend_solid_span(buf, r, len);
-}
-
-template <>
-ALWAYS_INLINE void commit_solid_span<false>(uint32_t* buf, WideRGBA8 r,
-                                            int len) {
-  fill_n(buf, len, bit_cast<U32>(pack(r)).x);
-}
-
-template <bool BLEND>
-static void commit_solid_span(uint8_t* buf, WideR8 r, int len) {
-  commit_blend_solid_span(buf, r, len);
-}
-
-template <>
-ALWAYS_INLINE void commit_solid_span<false>(uint8_t* buf, WideR8 r, int len) {
-  PackedR8 p = pack(r);
-  if (uintptr_t(buf) & 3) {
-    int align = 4 - (uintptr_t(buf) & 3);
-    align = min(align, len);
-    partial_store_span(buf, p, align);
-    buf += align;
-    len -= align;
-  }
-  fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(p));
-  buf += len & ~3;
-  len &= 3;
-  if (len > 0) {
-    partial_store_span(buf, p, len);
-  }
-}
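The uint8_t specialization above stores a few leading pixels until the destination is 4-byte aligned, fills whole 32-bit words with the replicated packed value, and then handles the remaining tail. A standalone scalar sketch of that head/body/tail pattern (not part of the deleted file; the function and test values are illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Fill `len` bytes at `buf` with `value`: head bytes until 4-byte alignment,
// then aligned 32-bit word stores of the replicated byte, then the tail.
static void fill_solid_u8(std::uint8_t* buf, std::uint8_t value, int len) {
  std::uint32_t word = value * 0x01010101u;  // replicate the byte into a word
  while ((reinterpret_cast<std::uintptr_t>(buf) & 3) && len > 0) {
    *buf++ = value;  // head: reach 4-byte alignment
    --len;
  }
  for (; len >= 4; buf += 4, len -= 4) {
    std::memcpy(buf, &word, 4);  // body: whole-word stores
  }
  while (len-- > 0) {
    *buf++ = value;  // tail: remaining 0-3 bytes
  }
}

int main() {
  std::vector<std::uint8_t> dst(13, 0);
  fill_solid_u8(dst.data() + 1, 0x7f, 11);
  for (int i = 1; i < 12; ++i) assert(dst[i] == 0x7f);
  assert(dst[0] == 0 && dst[12] == 0);  // bytes outside the span untouched
  return 0;
}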